Commit e269e200 authored by Jared Casper

Merge branch 'sc21' into 'main'

scripts for sc21

See merge request ADLR/megatron-lm!298
parents 6a680986 de7dc40f
examples/sc21/CONFIG.sh

+57 −0
#!/bin/bash


# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>


# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>


# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
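# For example (hypothetical path), if all data and tokenizer files live under
# a single directory on a shared filesystem:
#   export DOCKER_MOUNT_DIR=/shared/gpt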


# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
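# For example (hypothetical paths, located under the DOCKER_MOUNT_DIR chosen above):
#   MEGATRON_DATA=/shared/gpt/my-gpt_text_document
#   BPE_VOCAB_FILE=/shared/gpt/gpt2-vocab.json
#   BPE_MERGE_FILE=/shared/gpt/gpt2-merges.txt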


# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here. 
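# For example, activation checkpointing (used by several of the run scripts)
# can be enabled with:
#   MEGATRON_EXTRA_PARAMS="--checkpoint-activations "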
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --micro-batch-size ${MBS} \
    --global-batch-size ${GBS} \
    --num-layers ${NLS} \
    --hidden-size ${HS} \
    --num-attention-heads ${NAH} \
    --DDP-impl ${DDP} \
    --data-path ${MEGATRON_DATA} \
    --vocab-file ${BPE_VOCAB_FILE} \
    --merge-file ${BPE_MERGE_FILE} \
    --log-interval 5 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --train-iters 500 \
    --lr-decay-iters 320 \
    --lr 0.0001 \
    --min-lr 0.00001 \
    --lr-decay-style cosine \
    --lr-warmup-fraction 0.01 \
    --split 969,30,1 \
    --eval-iters 100 \
    --eval-interval 1000 \
    --clip-grad 1.0 \
    --fp16 \
    --loss-scale 8192 "

examples/sc21/README.md

+45 −0
# Reproducing Figures in SC21 Paper


This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.


## Setup

All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.
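
As a rough sketch, a filled-in `CONFIG.sh` might begin like this (the partition,
account, and paths below are hypothetical placeholders, not defaults):

```bash
# Hypothetical cluster-specific values; replace them with your own.
export SLURM_PARTITION=batch
export SLURM_ACCOUNT=my_account
export MEGATRON_CODE_DIR=/home/user/megatron-lm
export DOCKER_MOUNT_DIR=/shared/gpt
```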



## Scripts

Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):

* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
different degrees of pipeline and tensor model parallelism on a model with
162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
different degrees of data and pipeline model parallelism on a model with
5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
different degrees of data and tensor model parallelism on a model with
5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
the scatter-gather communication optimization.
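
Each `run_*.sh` script sets its sweep variables at the top, sources `CONFIG.sh`,
and then submits the job via `SBATCH.sh`. As a minimal usage sketch (assuming the
script is launched from this directory and `CONFIG.sh` has been filled in):

```bash
# Pick the case to run by editing the variables at the top of the script,
# then submit it; sbatch reports the job id on success.
bash run_figure_11.sh
```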
examples/sc21/SBATCH.sh

+13 −0
#!/bin/bash
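# Submit SRUN.sh as a Slurm batch job; MEGATRON_CODE_DIR, MEGATRON_PARAMS and
# DOCKER_MOUNT_DIR (assembled in CONFIG.sh) are forwarded to the job
# environment via --export.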


sbatch -p ${SLURM_PARTITION} \
       -A ${SLURM_ACCOUNT} \
       --job-name=${JOB_NAME} \
       --nodes=${NNODES} \
       --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh

exit 0


examples/sc21/SRUN.sh

0 → 100755
+18 −0
#!/bin/bash
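# Runs pretrain_gpt.py with ${MEGATRON_PARAMS} inside the NGC PyTorch container
# (pyxis --container-image/--container-mounts); one log file per job is written
# under ./logs.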

#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8


THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs


CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"


srun -l \
     --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
     --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
     --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
examples/sc21/run_figure_11.sh

+46 −0
#!/bin/bash

# ================================
# Choose the case to run.
# ================================

# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1

# Batch size (global batch size) options = [8, 128].
GBS=8





# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
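# Weak scaling: the layer count and the node count grow together with the
# pipeline-parallel size, so each pipeline stage holds the same number of layers.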


# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--checkpoint-activations "


# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}


# Import the configs.
. `pwd`/CONFIG.sh


# Submit the job.
. `pwd`/SBATCH.sh


exit 0

