From a5a551b79186208afc154b8917d3098c9ec8af76 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Fri, 16 Feb 2024 01:26:41 -0500
Subject: [PATCH 1/2] Fix checkpoint restart

---
 affinity_pred_pl/finetune_pl.py | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/affinity_pred_pl/finetune_pl.py b/affinity_pred_pl/finetune_pl.py
index c84d985..f3dede3 100644
--- a/affinity_pred_pl/finetune_pl.py
+++ b/affinity_pred_pl/finetune_pl.py
@@ -22,7 +22,6 @@ from dataclasses import dataclass, field
 from enum import Enum
 
 from transformers.trainer_utils import is_main_process
-from transformers.trainer_utils import get_last_checkpoint
 
 import datasets
 from torch.utils.data import random_split
@@ -243,34 +242,25 @@ def main():
     # seed the weight initialization
     torch.manual_seed(args.seed)
 
-    if os.path.isdir(args.output_dir) and not args.overwrite_output_dir:
-        last_checkpoint = get_last_checkpoint(args.output_dir)
-        if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0:
-            raise ValueError(
-                f"Output directory ({args.output_dir}) already exists and is not empty. "
-                "Use --overwrite_output_dir to overcome."
-            )
-        elif last_checkpoint is not None and args.resume_from_checkpoint is None:
-            logger.info(
-                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-            )
-
     logger.info("Training/evaluation parameters %s", args)
 
     model = AffinityCLIP(seq_model_name=seq_model_name,smiles_model_name=smiles_model_directory)
 
-    if os.path.isdir(args.output_dir):
-        print(f"[WARNING] Output directory already exists! output_dir = {output_dir}")
-
     strategy = pl.strategies.DeepSpeedStrategy(logging_level=logging.INFO)
-    trainer = pl.Trainer(max_epochs=100,accelerator="gpu",strategy=strategy,default_root_dir=args.output_dir,val_check_interval=250)
+    pl_logger = pl.loggers.TensorBoardLogger(save_dir=args.output_dir, version=0)
+    trainer = pl.Trainer(max_epochs=100,
+                         accelerator="gpu",
+                         strategy=strategy,
+                         default_root_dir=args.output_dir,
+                         callbacks=[pl.callbacks.ModelCheckpoint(save_last='link')],
+                         logger=pl_logger,
+                         val_check_interval=250)
     trainer.strategy.config["zero_force_ds_cpu_optimizer"] = False
 
     all_metrics = {}
 
     logger.info("*** Train ***")
 
     train_dataset = AffinityDataModule(dataset=args.dataset,num_workers=4,train_batch_size=1,val_batch_size=1)
-    train_result = trainer.fit(model,train_dataset)
+    train_result = trainer.fit(model, train_dataset, ckpt_path='last')
     return all_metrics
-- 
GitLab

From 4bbeac6379210f4a1ef8dcf6587608c2805b8aa6 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Fri, 16 Feb 2024 01:26:59 -0500
Subject: [PATCH 2/2] update job script

---
 train/run_finetune_pl.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/train/run_finetune_pl.sh b/train/run_finetune_pl.sh
index 00f4a08..a17ef2d 100644
--- a/train/run_finetune_pl.sh
+++ b/train/run_finetune_pl.sh
@@ -3,6 +3,7 @@
 #BSUB -W 60
 #BSUB -J wciscc
 #BSUB -o wciscc.o%J
+#BSUB -e wciscc.e%J
 #BSUB -P trn022
 
 module load open-ce/1.5.2-py39-0
@@ -13,6 +14,8 @@ module unload hsi
 module unload xalt
 module list
 
+cat $LSB_DJOB_RANKFILE
+export PYTHONUNBUFFERED=1
 
 echo "Starting directory: `pwd`"
 
@@ -25,16 +28,15 @@ source env_finetune_wciscc.sh
 env > saved_env_${LSB_JOBID}.txt
 echo "Done! Environment ready."
 
+set -x
+
 OMP_NUM_THREADS=1
-LAMBDA=2.5e-5
-ENSEMBLE_ID=1
 NNODES=4
 NGPUS=$(( 6 * $NNODES ))
 
 DATASET=/gpfs/wolf/trn022/proj-shared/wciscc2024/dataset//pdbbind_canonical.parquet
 
-echo "jsrun --smpiargs none -r 1 -g 6 -a 6 -c 42 python ../affinity_pred_pl/finetune_pl.py --dataset=$DATASET --output_dir=./results_bert_clip_pl_g${NGPUS}_${LAMBDA}_${ENSEMBLE_ID} --model_type=bert"
-jsrun --smpiargs none -r 1 -g 6 -a 6 -c 42 python ../affinity_pred_pl/finetune_pl.py --dataset=$DATASET --output_dir=./results_bert_clip_pl_g${NGPUS}_${LAMBDA}_${ENSEMBLE_ID} --model_type=bert
+jsrun -r 1 -g 6 -a 6 -c 42 python ../affinity_pred_pl/finetune_pl.py --dataset=$DATASET --output_dir=./results_bert_clip_pl_g${NGPUS} --model_type=bert
 
 echo "Finished job!"
-- 
GitLab
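
For reference, a minimal sketch of the resume pattern that PATCH 1/2 wires up, assuming PyTorch Lightning >= 2.1 (required for save_last='link'); MyModel and MyDataModule are hypothetical placeholders, not names from the patches:

import pytorch_lightning as pl

# save_last='link' (Lightning >= 2.1) maintains a last.ckpt symlink to the
# newest checkpoint instead of writing a second full copy of the weights.
ckpt_cb = pl.callbacks.ModelCheckpoint(save_last='link')

# Pinning version=0 sends every (re)submission to the same
# lightning_logs/version_0 directory, so a restarted job looks for
# checkpoints where the previous run left them rather than opening a
# fresh version_1, version_2, ... each time.
tb_logger = pl.loggers.TensorBoardLogger(save_dir='./results', version=0)

trainer = pl.Trainer(default_root_dir='./results',
                     callbacks=[ckpt_cb],
                     logger=tb_logger)

# ckpt_path='last' resumes from last.ckpt if one exists and otherwise
# trains from scratch (with a warning), so the same job script covers
# both the first submission and every restart:
# trainer.fit(MyModel(), MyDataModule(), ckpt_path='last')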