Commit 5833553a authored by Jared Casper's avatar Jared Casper
Browse files

Merge branch 'multistage-prompting/main-multistage' into 'main'

Multistage prompting/main multistage

See merge request ADLR/megatron-lm!371
parents 945ece94 b8707ee2
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line

# Multi-Stage Prompting for Knowledgeable Dialogue Generation

This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, covering data preparation as well as the knowledge-generation and response-generation stages. More details are available in the [`msdp task directory`](../../tasks/msdp).
+83 −0
Original line number Diff line number Diff line
#!/bin/bash

# Data preparation for our framework: preprocessing the WoW and WoI datasets.
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/

DIR=$(pwd)
# Before running the preprocessing, please download the Wizard of Wikipedia
# and Wizard of the Internet datasets, then point the two variables below
# at the folders that contain them.
WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>

# We provide examples for processing the raw data from Wizard of Wikipedia.
# Processing the train dataset (train.json)
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/train.json" \
        --processed_file "${WOW_DATA_FOLDER}/train_processed.txt"

# Processing the test seen dataset (test_random_split.json)
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/test_random_split.json" \
        --processed_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
        --knwl_ref_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt" \
        --resp_ref_file "${WOW_DATA_FOLDER}/output_testseen_response_reference.txt"

# Processing the test unseen dataset (test_topic_split.json)
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/test_topic_split.json" \
        --processed_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
        --knwl_ref_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt" \
        --resp_ref_file "${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt"


# We provide the following script to process the raw data from Wizard of the Internet.
# Processing the test dataset (test.jsonl)
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_woi_dataset \
        --raw_file "${WOI_DATA_FOLDER}/test.jsonl" \
        --processed_file "${WOI_DATA_FOLDER}/test_processed.txt" \
        --knwl_ref_file "${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt" \
        --resp_ref_file "${WOI_DATA_FOLDER}/output_test_response_reference.txt"


# Get the knowledge generation prompts for each test dataset in WoW and WoI.
MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL>
# WoW test seen
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json" \
        --data_type wow_seen

# WoW test unseen
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json" \
        --data_type wow_unseen

# WoI
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOI_DATA_FOLDER}/test_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json" \
        --data_type woi


# Get the response generation prompts (can be applied to all the test datasets).
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_resp_gen_prompts \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --processed_file "${WOW_DATA_FOLDER}/output_response_prompts.txt"
+43 −0
Original line number Diff line number Diff line
#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

# NOTE: the example paths were previously written on continuation lines as
# bare "(e.g., ...)" text, which the shell would try to execute as a command
# in a subshell; they belong in comments instead.
MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION>  # e.g., /testseen_knowledge_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>  # e.g., /testseen_knowledge_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task MSDP-EVAL-F1 \
        --guess-file "${MODEL_GEN_PATH}" \
        --answer-file "${GROUND_TRUTH_PATH}"


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate on these metrics, please set up the environment based on
# the nlg-eval github, and run the corresponding evaluation commands.

nlg-eval \
    --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
+64 −0
Original line number Diff line number Diff line
#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

# NOTE: the example paths were previously written on continuation lines as
# bare "(e.g., ...)" text, which the shell would try to execute as a command
# in a subshell; they belong in comments instead.
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>  # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>  # e.g., /testseen_response_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task MSDP-EVAL-F1 \
        --guess-file "${MODEL_GEN_PATH}" \
        --answer-file "${GROUND_TRUTH_PATH}"


##########################
# Evaluate the KF1 scores.
##########################

# KF1 reuses the same F1 task, but scores the generated responses against the
# ground-truth knowledge instead of the ground-truth responses.
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>  # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>  # e.g., /testseen_knowledge_reference.txt

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task MSDP-EVAL-F1 \
        --guess-file "${MODEL_GEN_PATH}" \
        --answer-file "${GROUND_TRUTH_PATH}"


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate on these metrics, please set up the environment based on
# the nlg-eval github, and run the corresponding evaluation commands.

nlg-eval \
    --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
+18 −0
Original line number Diff line number Diff line
#!/bin/bash

# Preparing the input file for the response generation (second-stage prompting)

DIR=`pwd`

TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
        (e.g., /testseen_processed.txt)
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
        (e.g., /testseen_knowledge_generations.txt)
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
        (e.g., /testseen_processed_with_generated_knowledge.txt)

python ${DIR}/tasks/msdp/preprocessing.py \
        --func prepare_input \
        --test_file ${TEST_FILE} \
        --knowledge_gen_file ${KNOWLEDGE_FILE} \
        --processed_file ${PROCESSED_FILE}
Loading