Commit 3b059c2f authored by zihanl

add file names and update evaluation scripts

parent 178d4f9c
+2 −2
@@ -11,9 +11,9 @@ Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework

### Stage-1: Prompting for Knowledge Generation
1. The script [`tasks/knwl_dialo/scripts/prompt_knwl_gen.sh`](./scripts/prompt_knwl_gen.sh) provides an example of how to perform the first-stage prompting for knowledge generation.
2. The F1/KF1 score can be evaluated through [`tasks/knwl_dialo/scripts/eval_generation.sh`](./scripts/eval_generation.sh). Other automatic metrics (i.e., BLEU, METEOR, and ROUGE-L) follow [nlg-eval](https://github.com/Maluuba/nlg-eval).
2. We provide the script [`tasks/knwl_dialo/scripts/eval_knwl_generation.sh`](./scripts/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of knowledge generation; a usage sketch follows this list.
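A minimal sketch of how the Stage-1 scripts might be invoked (an assumption for illustration: run from the repository root with `bash`, after filling in the path placeholders inside each script):

```bash
# Stage-1: prompt the LM for knowledge, then score the generations.
bash tasks/knwl_dialo/scripts/prompt_knwl_gen.sh         # knowledge generation
bash tasks/knwl_dialo/scripts/eval_knwl_generation.sh    # F1, BLEU, METEOR, ROUGE-L
```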

### Stage-2: Prompting for Response Generation
1. The script [`tasks/knwl_dialo/scripts/prep_resp_gen.sh`](./scripts/prep_resp_gen.sh) helps prepare the input file for response generation (based on the previously generated knowledge file).
2. The script [`tasks/knwl_dialo/scripts/prompt_resp_gen.sh`](./scripts/prompt_resp_gen.sh) provides an example of how to perform the second-stage prompting for response generation.
3. The automatic evaluations are the same as mentioned above for the knowledge generation.
3. We provide the script [`tasks/knwl_dialo/scripts/eval_resp_generation.sh`](./scripts/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation; a usage sketch follows this list.
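A minimal sketch of the Stage-2 pipeline under the same assumptions (placeholders filled in, run from the repository root):

```bash
# Stage-2: prepare the response-generation input, prompt for responses, then score them.
bash tasks/knwl_dialo/scripts/prep_resp_gen.sh           # merge generated knowledge into the test input
bash tasks/knwl_dialo/scripts/prompt_resp_gen.sh         # response generation
bash tasks/knwl_dialo/scripts/eval_resp_generation.sh    # F1, KF1, BLEU, METEOR, ROUGE-L
```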
+43 −0
#!/bin/bash

# This script is used to evaluate the F1 or KF1 scores.
#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"
                  
OUTPUT_PATH=<PATH_OF_THE_OUTPUT_GENERATION>
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH>
MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION>    # e.g., /testseen_knowledge_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>    # e.g., /testseen_knowledge_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
@@ -21,5 +24,20 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${OUTPUT_PATH} \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}
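# Note: the KNWL-DIALO-EVAL-F1 task above presumably computes a unigram F1 score
# between each line of the guess file (generated knowledge) and the corresponding
# line of the answer file (reference knowledge).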


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate these metrics, please set up the environment based on
# the nlg-eval GitHub repository, and run the corresponding evaluation commands.
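
# A sketch of that setup (assumption: based on the installation steps in the
# nlg-eval README; check the repository for the authoritative commands):
#   pip install git+https://github.com/Maluuba/nlg-eval.git@master
#   nlg-eval --setup    # downloads the data files required by the metrics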

nlg-eval \
    --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
+64 −0
#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################
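# Here, F1 compares the generated responses (guess file) against the
# ground-truth responses (answer file).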

WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"
                  
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>    # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>    # e.g., /testseen_response_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


##########################
# Evaluate the KF1 scores.
##########################
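# KF1 (knowledge F1) compares the generated responses against the ground-truth
# knowledge (rather than the reference responses), so the answer file below is
# the knowledge reference file.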
                  
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>    # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>    # e.g., /testseen_knowledge_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate these metrics, please set up the environment based on
# the nlg-eval GitHub repository, and run the corresponding evaluation commands.

nlg-eval \
    --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
+6 −3
@@ -4,9 +4,12 @@

DIR=`pwd`

TEST_FILE=<PATH_OF_THE_PROCESSED_TEST_DATA>
KNOWLEDGE_FILE=<PATH_OF_THE_GENERATED_KNOWLEDGE_DATA>
PROCESSED_FILE=<PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA>    # e.g., /testseen_processed.txt
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA>    # e.g., /testseen_knowledge_generations.txt
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION>    # e.g., /testseen_processed_with_generated_knowledge.txt
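# (Presumably, prepare_input pairs each processed test example with its generated
#  knowledge to build the input file for response generation, as described in the README.)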

python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func prepare_input \
+8 −6
@@ -12,12 +12,14 @@ DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<PATH_OF_THE_LANGUAGE_MODEL>
INPUT_PATH=<PATH_OF_THE_INPUT_TEST_DATA_FILE>
PROMPT_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION_PROMPTS>
VOCAB_PATH=<PATH_OF_THE_VOCAB_FILE>
MERGE_PATH=<PATH_OF_THE_MERGE_FILE>
OUTPUT_PATH=<PATH_OF_THE_OUTPUT_GENERATION_FILE>
CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL>    # e.g., /357m
VOCAB_PATH=<PATH_OF_VOCAB_FILE>    # e.g., /gpt2-vocab.json
MERGE_PATH=<PATH_OF_MERGE_FILE>    # e.g., /gpt2-merges.txt
INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE>    # e.g., /testseen_processed.txt
PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS>    # e.g., /testseen_knowledge_prompts.json
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE>    # e.g., /testseen_knowledge_generations.txt
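# (The example values above correspond to the standard Megatron GPT-2 BPE setup:
#  a 357M GPT checkpoint with gpt2-vocab.json and gpt2-merges.txt.)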

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \