From a5a551b79186208afc154b8917d3098c9ec8af76 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Fri, 16 Feb 2024 01:26:41 -0500
Subject: [PATCH 1/2] Fix checkpoint restart
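
Drop the Hugging Face Trainer checkpoint-detection block and rely on
Lightning's native resume logic instead: a ModelCheckpoint callback with
save_last='link' maintains a stable 'last' checkpoint, ckpt_path='last'
makes trainer.fit() resume from it, and pinning the TensorBoard logger
to version=0 keeps every restart in the same log directory.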

---
 affinity_pred_pl/finetune_pl.py | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/affinity_pred_pl/finetune_pl.py b/affinity_pred_pl/finetune_pl.py
index c84d985..f3dede3 100644
--- a/affinity_pred_pl/finetune_pl.py
+++ b/affinity_pred_pl/finetune_pl.py
@@ -22,7 +22,6 @@ from dataclasses import dataclass, field
 from enum import Enum
 
 from transformers.trainer_utils import is_main_process
-from transformers.trainer_utils import get_last_checkpoint
 
 import datasets
 from torch.utils.data import random_split
@@ -243,34 +242,25 @@ def main():
     # seed the weight initialization
     torch.manual_seed(args.seed)
 
-    if os.path.isdir(args.output_dir) and not args.overwrite_output_dir:
-        last_checkpoint = get_last_checkpoint(args.output_dir)
-        if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0:
-            raise ValueError(
-                f"Output directory ({args.output_dir}) already exists and is not empty. "
-                "Use --overwrite_output_dir to overcome."
-            )
-        elif last_checkpoint is not None and args.resume_from_checkpoint is None:
-            logger.info(
-                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-            )
-
     logger.info("Training/evaluation parameters %s", args)
 
     model = AffinityCLIP(seq_model_name=seq_model_name,smiles_model_name=smiles_model_directory)
 
-    if os.path.isdir(args.output_dir):
-        print(f"[WARNING] Output directory already exists! output_dir = {output_dir}")
-
     strategy = pl.strategies.DeepSpeedStrategy(logging_level=logging.INFO)
-    trainer = pl.Trainer(max_epochs=100,accelerator="gpu",strategy=strategy,default_root_dir=args.output_dir,val_check_interval=250) 
+    pl_logger = pl.loggers.TensorBoardLogger(save_dir=args.output_dir, version=0)  # pin version so restarts log into the same directory
+    trainer = pl.Trainer(max_epochs=100,
+                         accelerator="gpu",
+                         strategy=strategy,
+                         default_root_dir=args.output_dir,
+                         callbacks=[pl.callbacks.ModelCheckpoint(save_last='link')],  # keep a stable 'last' checkpoint to resume from
+                         logger=pl_logger,
+                         val_check_interval=250)
     trainer.strategy.config["zero_force_ds_cpu_optimizer"] = False
 
     all_metrics = {}
     logger.info("*** Train ***")
     train_dataset = AffinityDataModule(dataset=args.dataset,num_workers=4,train_batch_size=1,val_batch_size=1)
-    train_result = trainer.fit(model,train_dataset)
+    trainer.fit(model, train_dataset, ckpt_path='last')  # resume from the 'last' checkpoint if one exists; fit() returns None
 
     return all_metrics
 
-- 
GitLab
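
For reference, a minimal self-contained sketch of the resume pattern the
patch adopts. The toy model, data, and './results' path are illustrative
stand-ins, and save_last='link' / ckpt_path='last' assume a PyTorch
Lightning 2.x release that supports them:

    # ModelCheckpoint(save_last='link') keeps a stable 'last.ckpt' pointer;
    # ckpt_path='last' resumes from it on the next run.
    import torch
    import pytorch_lightning as pl
    from torch.utils.data import DataLoader, TensorDataset

    class ToyModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(4, 1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return torch.nn.functional.mse_loss(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=1e-2)

    def toy_loader():
        return DataLoader(TensorDataset(torch.randn(64, 4), torch.randn(64, 1)),
                          batch_size=8)

    if __name__ == '__main__':
        # version=0 pins the log directory, so the previous run's 'last'
        # checkpoint is found again on restart.
        logger = pl.loggers.TensorBoardLogger(save_dir='./results', version=0)
        trainer = pl.Trainer(max_epochs=2,
                             default_root_dir='./results',
                             logger=logger,
                             callbacks=[pl.callbacks.ModelCheckpoint(save_last='link')])
        # Resumes from last.ckpt when it exists; otherwise warns and starts fresh.
        trainer.fit(ToyModel(), train_dataloaders=toy_loader(), ckpt_path='last')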


From 4bbeac6379210f4a1ef8dcf6587608c2805b8aa6 Mon Sep 17 00:00:00 2001
From: Jens Glaser <glaserj@ornl.gov>
Date: Fri, 16 Feb 2024 01:26:59 -0500
Subject: [PATCH 2/2] Update job script
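
Capture stderr in its own file, print the rank file, unbuffer Python
output, and trace commands with set -x instead of echoing the jsrun
line (which also drops --smpiargs none). Remove LAMBDA/ENSEMBLE_ID from
the output directory name so a resubmitted job reuses the same directory
and resumes from its last checkpoint.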

---
 train/run_finetune_pl.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/train/run_finetune_pl.sh b/train/run_finetune_pl.sh
index 00f4a08..a17ef2d 100644
--- a/train/run_finetune_pl.sh
+++ b/train/run_finetune_pl.sh
@@ -3,6 +3,7 @@
 #BSUB -W 60
 #BSUB -J wciscc
 #BSUB -o wciscc.o%J
+#BSUB -e wciscc.e%J
 #BSUB -P trn022
 
 module load open-ce/1.5.2-py39-0
@@ -13,6 +14,8 @@ module unload hsi
 module unload xalt
 module list
 
+cat "$LSB_DJOB_RANKFILE"  # record the node/slot layout in the job log
+export PYTHONUNBUFFERED=1  # flush Python output straight into the job log
 
 echo "Starting directory: `pwd`"
 
@@ -25,16 +28,15 @@ source env_finetune_wciscc.sh
 env > saved_env_${LSB_JOBID}.txt
 echo "Done! Environment ready."
 
+set -x
+
 OMP_NUM_THREADS=1 
-LAMBDA=2.5e-5 
-ENSEMBLE_ID=1
 NNODES=4
 NGPUS=$(( 6 * $NNODES )) 
 
 DATASET=/gpfs/wolf/trn022/proj-shared/wciscc2024/dataset//pdbbind_canonical.parquet
 
-echo "jsrun --smpiargs none -r 1 -g 6 -a 6 -c 42 python ../affinity_pred_pl/finetune_pl.py --dataset=$DATASET --output_dir=./results_bert_clip_pl_g${NGPUS}_${LAMBDA}_${ENSEMBLE_ID} --model_type=bert"
-jsrun --smpiargs none -r 1 -g 6 -a 6 -c 42 python ../affinity_pred_pl/finetune_pl.py --dataset=$DATASET --output_dir=./results_bert_clip_pl_g${NGPUS}_${LAMBDA}_${ENSEMBLE_ID} --model_type=bert
+jsrun -r 1 -g 6 -a 6 -c 42 python ../affinity_pred_pl/finetune_pl.py --dataset=$DATASET --output_dir=./results_bert_clip_pl_g${NGPUS} --model_type=bert
 
 echo "Finished job!"
 
-- 
GitLab
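
Taken together, the two patches make restarts idempotent: a resubmitted
job writes into the same results_bert_clip_pl_g${NGPUS} directory and
trainer.fit(..., ckpt_path='last') continues from the previous run's
checkpoint. A hypothetical restart chain under those assumptions:

    # Each 60-minute job (#BSUB -W 60) resumes from the checkpoint left
    # behind by the previous one; -K blocks until the job finishes, and
    # reading the script from stdin lets LSF parse the #BSUB directives.
    for run in 1 2 3; do
        bsub -K < run_finetune_pl.sh
    done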