Commit 3b059c2f authored by zihanl

add file names and update evaluation scripts

parent 178d4f9c
+2 −2
@@ -11,9 +11,9 @@ Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework

### Stage-1: Prompting for Knowledge Generation
1. The script [`tasks/knwl_dialo/scripts/prompt_knwl_gen.sh`](./scripts/prompt_knwl_gen.sh) provides an example of how to perform the first-stage prompting for knowledge generation.
2. The F1/KF1 score can be evaluated through [`tasks/knwl_dialo/scripts/eval_generation.sh`](./scripts/eval_generation.sh). Other automatic metrics (i.e., BLEU, METEOR, and ROUGE-L) follow [nlg-eval](https://github.com/Maluuba/nlg-eval).
2. We provide the script [`tasks/knwl_dialo/scripts/eval_knwl_generation.sh`](./scripts/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of knowledge generation; a usage sketch follows this list.
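A minimal sketch of how the Stage-1 scripts might be invoked (an assumption for illustration: run from the repository root with `bash`, after filling in the path placeholders inside each script):

```bash
# Stage-1: prompt the LM for knowledge, then score the generations.
bash tasks/knwl_dialo/scripts/prompt_knwl_gen.sh         # knowledge generation
bash tasks/knwl_dialo/scripts/eval_knwl_generation.sh    # F1, BLEU, METEOR, ROUGE-L
```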

### Stage-2: Prompting for Response Generation
1. The script [`tasks/knwl_dialo/scripts/prep_resp_gen.sh`](./scripts/prep_resp_gen.sh) helps prepare the input file for response generation (based on the previously generated knowledge file).
2. The script [`tasks/knwl_dialo/scripts/prompt_resp_gen.sh`](./scripts/prompt_resp_gen.sh) provides an example of how to perform the second-stage prompting for response generation.
3. The automatic evaluations are the same as mentioned above for the knowledge generation.
3. We provide the script [`tasks/knwl_dialo/scripts/eval_resp_generation.sh`](./scripts/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation; a usage sketch follows this list.
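A minimal sketch of the Stage-2 pipeline under the same assumptions (placeholders filled in, run from the repository root):

```bash
# Stage-2: prepare the response-generation input, prompt for responses, then score them.
bash tasks/knwl_dialo/scripts/prep_resp_gen.sh           # merge generated knowledge into the test input
bash tasks/knwl_dialo/scripts/prompt_resp_gen.sh         # response generation
bash tasks/knwl_dialo/scripts/eval_resp_generation.sh    # F1, KF1, BLEU, METEOR, ROUGE-L
```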
+43 −0
#!/bin/bash

# This script is used to evaluate the F1 or KF1 scores.
#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"
                  
OUTPUT_PATH=<PATH_OF_THE_OUTPUT_GENERATION>
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH>
MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION>    # e.g., /testseen_knowledge_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>    # e.g., /testseen_knowledge_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
@@ -21,5 +24,20 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${OUTPUT_PATH} \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}
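# Note: the KNWL-DIALO-EVAL-F1 task above presumably computes a unigram F1 score
# between each line of the guess file (generated knowledge) and the corresponding
# line of the answer file (reference knowledge).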


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate these metrics, please set up the environment based on
# the nlg-eval GitHub repository, and run the corresponding evaluation commands.
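
# A sketch of that setup (assumption: based on the installation steps in the
# nlg-eval README; check the repository for the authoritative commands):
#   pip install git+https://github.com/Maluuba/nlg-eval.git@master
#   nlg-eval --setup    # downloads the data files required by the metrics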

nlg-eval \
    --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
+64 −0
#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################
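# Here, F1 compares the generated responses (guess file) against the
# ground-truth responses (answer file).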

WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"
                  
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>    # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>    # e.g., /testseen_response_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


##########################
# Evaluate the KF1 scores.
##########################
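# KF1 (knowledge F1) compares the generated responses against the ground-truth
# knowledge (rather than the reference responses), so the answer file below is
# the knowledge reference file.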
                  
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>    # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>    # e.g., /testseen_knowledge_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate these metrics, please set up the environment based on
# the nlg-eval GitHub repository, and run the corresponding evaluation commands.

nlg-eval \
    --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
+6 −3
@@ -4,9 +4,12 @@

DIR=`pwd`

TEST_FILE=<PATH_OF_THE_PROCESSED_TEST_DATA>
KNOWLEDGE_FILE=<PATH_OF_THE_GENERATED_KNOWLEDGE_DATA>
PROCESSED_FILE=<PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA>    # e.g., /testseen_processed.txt
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA>    # e.g., /testseen_knowledge_generations.txt
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION>    # e.g., /testseen_processed_with_generated_knowledge.txt
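# (Presumably, prepare_input pairs each processed test example with its generated
#  knowledge to build the input file for response generation, as described in the README.)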

python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func prepare_input \
+8 −6
@@ -12,12 +12,14 @@ DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<PATH_OF_THE_LANGUAGE_MODEL>
INPUT_PATH=<PATH_OF_THE_INPUT_TEST_DATA_FILE>
PROMPT_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION_PROMPTS>
VOCAB_PATH=<PATH_OF_THE_VOCAB_FILE>
MERGE_PATH=<PATH_OF_THE_MERGE_FILE>
OUTPUT_PATH=<PATH_OF_THE_OUTPUT_GENERATION_FILE>
CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL>    # e.g., /357m
VOCAB_PATH=<PATH_OF_VOCAB_FILE>    # e.g., /gpt2-vocab.json
MERGE_PATH=<PATH_OF_MERGE_FILE>    # e.g., /gpt2-merges.txt
INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE>    # e.g., /testseen_processed.txt
PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS>    # e.g., /testseen_knowledge_prompts.json
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE>    # e.g., /testseen_knowledge_generations.txt
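# (The example values above correspond to the standard Megatron GPT-2 BPE setup:
#  a 357M GPT checkpoint with gpt2-vocab.json and gpt2-merges.txt.)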

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \