diff --git a/scripts/train_ntrace.py b/scripts/train_ntrace.py index aa30d53407cbbfa0550f66ab0ba838f00880d1dd..1ead7e112f53e302b18286e41346992c143a9a02 100644 --- a/scripts/train_ntrace.py +++ b/scripts/train_ntrace.py @@ -14,8 +14,8 @@ if __name__ == "__main__": train_params = { "cuda_id": 0, - "n_epochs": 50, # seems like 50 is the point where training and validation loss diverge - "n_training": 1_500_000, + "n_epochs": 100, # extended past 50, where train/val loss previously diverged; monitor for overfitting + "n_training": 2_000_000, "error": 0.07, "batch_size": 180, "learning_rate": 0.005, @@ -23,13 +23,14 @@ if __name__ == "__main__": "optimizer": "Adam", "loss": "composite", "cache_dir": "data", - "experiment_name": "Train_nTRACE", - "run_name": "d2048_h32_l6_cmloss", + "experiment_name": "nTrace_cu_film_6layer", + "run_name": "d2048_h32_l6", + "datadir": "expdata", } train( train_params=train_params, model_params=model_params, load_pretrained=False, - auto_advance=False, + auto_advance=True, ) diff --git a/src/tgreft/analysis/evaluate.py b/src/tgreft/analysis/evaluate.py index 2f16459c0b0f84ea8317a665b38fbbe6d045c8ff..c332fcd091e4e88d7cbde90f5748db4c31acb595 100644 --- a/src/tgreft/analysis/evaluate.py +++ b/src/tgreft/analysis/evaluate.py @@ -17,7 +17,7 @@ from tgreft.analysis.utils import interpolate_data, load_csv, get_rcurve_from_cs class RealDataEvaluator: """Evaluate the model performance against real data. - The real data evalutor will use the real experiment data and reference fitting results + The real data evaluator will use the real experiment data and reference fitting results by domain scientist to check the model performance, recording the data in the dataframe. 
Parameters @@ -49,7 +49,7 @@ class RealDataEvaluator: rcurve_cache[search_key] = rcurve return rcurve_cache - def evaluate(self, model: torch.nn.Module, device: torch.device, system: str = "cu") -> float: + def evaluate(self, model: torch.nn.Module, device: torch.device, system: str = "cu", epoch: int=0) -> float: """Evaluate the model performance against real data, and return the evaluation score. Parameters @@ -60,6 +60,8 @@ class RealDataEvaluator: The device to run the model. system : str The system to be evaluated, either "cu" or "mo". + epoch : int + The epoch of the model. Returns ------- @@ -79,12 +81,12 @@ class RealDataEvaluator: diff = df_pred.drop(columns=["Label", "IPTS", "Run"]) - df_ref.drop(columns=["Label", "IPTS", "Run"]) # set nan to zero -> nan means any parameter will be fine diff = diff.fillna(0) - # compute the loss + # compute the average loss loss = np.sqrt(np.mean(diff**2)) # save the predicted data to disk as csv files - df_pred.to_csv(f"predicted_{system}.csv", index=False) + df_pred.to_csv(f"predicted_{system}_epoch{epoch:03d}.csv", index=False) # return the average loss - return loss + return loss / len(df_ref) def _eval(self, model: torch.nn.Module, device: torch.device, df: pd.DataFrame) -> pd.DataFrame: """Evaluate the model performance against the given dataframe. 
@@ -135,7 +137,7 @@ class RealDataEvaluator: with torch.no_grad(): r_curve = torch.tensor(r_curve, device=device, dtype=torch.float32) r_curve[r_curve <= 0] = 1 # avoid log(0) or log(negative) - pred = model(r_curve.unsqueeze(0)).squeeze(0).numpy() + pred = model(r_curve.unsqueeze(0)).cpu().squeeze(0).numpy() return pred diff --git a/src/tgreft/nn/loss.py b/src/tgreft/nn/loss.py index cea8e707e7b223a6b20f7779f77e4b916bc75d75..b36326d7e6f0513aaf0ec0884173f1914381b5e2 100644 --- a/src/tgreft/nn/loss.py +++ b/src/tgreft/nn/loss.py @@ -1,20 +1,11 @@ #!/usr/bin/env python """Extended loss functions for tgreft.""" -import logging import torch import torch.nn as nn import numpy as np from multiprocessing import Pool from tgreft.utils.data.data_loader import param_to_rcurve -logger = logging.getLogger("LOSS") -logger.setLevel(logging.DEBUG) -# create a file handler -handler = logging.FileHandler("loss.log") -logger.addHandler(handler) -formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s") -logger.handlers[0].setFormatter(formatter) - def parallel_param_to_rcurve(params): with Pool() as p: @@ -60,7 +51,6 @@ class CompositeLoss(nn.Module): true_rcurve = torch.log(true_rcurve) loss_rcurve = self.rcurve_loss(pred_rcurve, true_rcurve) # - calculate the combined loss - logger.debug(f"loss_param: {loss_param};\tloss_rcurve: {loss_rcurve}") loss = self.lambda_param * loss_param + self.lambda_curve * loss_rcurve return loss diff --git a/src/tgreft/train/train_ntrace.py b/src/tgreft/train/train_ntrace.py index a415fdeb027b666efe655bc0b40c177c6292386c..e8a7fff592c9d0fd8e10e7719c6fc855f39d0f26 100644 --- a/src/tgreft/train/train_ntrace.py +++ b/src/tgreft/train/train_ntrace.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """Train function for the transformer model.""" import os +import shutil import torch import mlflow import logging @@ -16,6 +17,7 @@ from tgreft.train.generic import ( get_loss, ) from tgreft.utils.data.data_loader import get_dataset +from 
tgreft.analysis.evaluate import RealDataEvaluator logger = logging.getLogger("mTRACE_trainer") @@ -64,8 +66,9 @@ def train( optimizer = train_params["optimizer"] loss = train_params["loss"] cache_dir = train_params["cache_dir"] - experiment_name = train_params.get("experiment_name", "Train_REFL_GPT") + experiment_name = train_params.get("experiment_name", "Train_REFL") run_name = train_params.get("run_name", None) + datadir = train_params.get("datadir", "data") # parse model parameters d_model = model_params["d_model"] @@ -98,6 +101,9 @@ def train( shuffle=False, ) + # prepare real data evaluator + real_data_evaluator = RealDataEvaluator(datadir=datadir) + # prepare model logger.info("Preparing model...") device = torch.device(f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu") @@ -110,10 +116,10 @@ def train( to_log=to_log, ).to(device) # check if need to load pretrained model - model_name = "model_gpt.pt" + model_name = "model_nTRACE.pt" if load_pretrained and os.path.exists(model_name): logger.info("Loading pretrained model...") - model.load_state_dict(torch.load("model_gpt.pt")) + model.load_state_dict(torch.load(model_name)) # calculate the number of parameters in the model n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) # log the number of parameters @@ -138,6 +144,9 @@ def train( # start training with mlflow logging best_loss = float("inf") + best_loss_epoch = 0 + best_real_data_loss = float("inf") + best_real_data_loss_epoch = 0 logger.info("Start training...") mlflow.set_experiment(experiment_name) with mlflow.start_run(run_name=run_name): @@ -170,6 +179,20 @@ def train( ) logger.info(f"Testing loss: {test_loss}") mlflow.log_metric("test_loss", test_loss, step=epoch) + + # evaluate on real data + system="cu" + real_data_loss = real_data_evaluator.evaluate( + model=model, + device=device, + system=system, + epoch=epoch, + ) + logger.info(f"Real data loss: {real_data_loss}") + artifact_name = 
f"predicted_{system}_epoch{epoch:03d}.csv" + mlflow.log_metric("real_data_loss", real_data_loss, step=epoch) + mlflow.log_artifact(artifact_name, "real_data_pred") + # visualize every 5 epochs if epoch % 5 == 0: logger.info("Visualizing model...") @@ -180,13 +203,29 @@ def train( device=device, ) logger.info("Done visualizing model.") - # save model if loss is better - if test_loss < best_loss: - best_loss = test_loss - logger.info("Saving model...") - torch.save(model.state_dict(), model_name) - mlflow.log_artifact(model_name, "models") - logger.info("Done saving model.") + + # save model if loss is better + if test_loss < best_loss: + best_loss = test_loss + best_loss_epoch = epoch + logger.info("Saving model...") + torch.save(model.state_dict(), model_name) + mlflow.log_artifact(model_name, "models") + logger.info("Done saving model.") + mlflow.log_metric("best_loss", best_loss, step=epoch) + mlflow.log_metric("best_loss_epoch", best_loss_epoch, step=epoch) + + # save model if real data loss is better + if real_data_loss < best_real_data_loss: + best_real_data_loss = real_data_loss + best_real_data_loss_epoch = epoch + logger.info("Saving model...") + model_name_real = "model_nTRACE_real.pt" + torch.save(model.state_dict(), model_name_real) + mlflow.log_artifact(model_name_real, "models") + logger.info("Done saving model.") + mlflow.log_metric("best_real_data_loss", best_real_data_loss, step=epoch) + mlflow.log_metric("best_real_data_loss_epoch", best_real_data_loss_epoch, step=epoch) # if loss becomes nan, break the loop if np.isnan(test_loss): @@ -196,7 +235,7 @@ def train( logger.info("Training complete, saving model...") model_name_final = "model_gpt_final.pt" torch.save(model.state_dict(), model_name_final) - mlflow.log_artifact(model_name_final, "models") + mlflow.log_artifact(model_name_final) logger.info("Done saving model.") # get final loss diff --git a/src/tgreft/utils/data/data_loader.py b/src/tgreft/utils/data/data_loader.py index 
600c32429e1ad4bb15d07af1df86b3c924c072ab..b80df4bb3274e4b942bc93b94f3b7d2f6df15cef 100644 --- a/src/tgreft/utils/data/data_loader.py +++ b/src/tgreft/utils/data/data_loader.py @@ -83,33 +83,87 @@ def generate_data( """ # generate the reference parameters # NOTE: the n-Trace model is more expressive with relaxed bounds + + # cu_film, 6 layer parameters_ref = np.column_stack( [ - np.random.uniform(-1.0, 7.0, n_dataset), # electolyte_sld, including air and H2O + np.random.uniform(5.0, 7.0, n_dataset), # electolyte_sld, np.random.uniform(5, 120, n_dataset), # electolyte_roughness, np.random.uniform(-5.0, 6.5, n_dataset), # sei_sld, np.random.uniform(10, 500, n_dataset), # sei_thickness, np.random.uniform(1, 80, n_dataset), # sei_roughness, - np.random.uniform(-2, 7, n_dataset), # bulk_3_sld, - np.random.uniform(10, 200, n_dataset), # bulk_3_thickness, - np.random.uniform(1, 55, n_dataset), # bulk_3_roughness, - - np.random.uniform(2, 7, n_dataset), # bulk_2_sld, - np.random.uniform(20, 700, n_dataset), # bulk_2_thickness (cu_thickness), - np.random.uniform(1, 55, n_dataset), # bulk_2_roughness (cu_roughness), - - np.random.uniform(-3.5, 7, n_dataset), # bulk_1_sld, - np.random.uniform(10, 200, n_dataset), # bulk_1_thickness, - np.random.uniform(1, 55, n_dataset), # bulk_1_roughness, - + np.random.uniform(-2, 6, n_dataset), # material_sld, + np.random.uniform(10, 200, n_dataset), # material_thickness, + np.random.uniform(1, 35, n_dataset), # material_roughness, + + np.random.uniform(6, 7, n_dataset), # cu_sld, + np.random.uniform(20, 700, n_dataset), # cu_thickness, + np.random.uniform(1, 35, n_dataset), # cu_roughness, + + np.random.uniform(-3.5, 0, n_dataset), # ti_sld, + np.random.uniform(10, 100, n_dataset), # ti_thickness, + np.random.uniform(1, 35, n_dataset), # ti_roughness, + np.random.uniform(1, 4.2, n_dataset), # oxide_sld, np.random.uniform(5, 50, n_dataset), # oxide_thickness, np.random.uniform(1, 10, n_dataset), # oxide_roughness, ] ) + # mo_film, 5 layer + 
# parameters_ref = np.column_stack( + # [ + # np.random.uniform(-1.0, 7.0, n_dataset), # electolyte_sld, I changed the lower bound to -1 to include air and H2O + # np.random.uniform(5, 120, n_dataset), # electolyte_roughness, + + # np.random.uniform(-5.0, 6.5, n_dataset), # sei_sld, + # np.random.uniform(10, 500, n_dataset), # sei_thickness, + # np.random.uniform(1, 80, n_dataset), # sei_roughness, + + # np.random.uniform(-2, 7, n_dataset), # bulk_3_sld, + # np.random.uniform(10, 200, n_dataset), # material_thickness, + # np.random.uniform(1, 55, n_dataset), # material_roughness, + + # np.random.uniform(2, 6, n_dataset), # bulk_2_sld, + # np.random.uniform(20, 700, n_dataset), # cu_thickness, + # np.random.uniform(1, 55, n_dataset), # cu_roughness, + + # np.random.uniform(1, 4.2, n_dataset), # oxide_sld, + # np.random.uniform(5, 50, n_dataset), # oxide_thickness, + # np.random.uniform(1, 10, n_dataset), # oxide_roughness, + # ] + # ) + + # extended 6 layer model + # parameters_ref = np.column_stack( + # [ + # np.random.uniform(-1.0, 7.0, n_dataset), # electolyte_sld, including air and H2O + # np.random.uniform(5, 120, n_dataset), # electolyte_roughness, + + # np.random.uniform(-5.0, 6.5, n_dataset), # sei_sld, + # np.random.uniform(10, 500, n_dataset), # sei_thickness, + # np.random.uniform(1, 80, n_dataset), # sei_roughness, + + # np.random.uniform(-2, 7, n_dataset), # bulk_3_sld, + # np.random.uniform(10, 200, n_dataset), # bulk_3_thickness, + # np.random.uniform(1, 55, n_dataset), # bulk_3_roughness, + + # np.random.uniform(2, 7, n_dataset), # bulk_2_sld, + # np.random.uniform(20, 700, n_dataset), # bulk_2_thickness (cu_thickness), + # np.random.uniform(1, 55, n_dataset), # bulk_2_roughness (cu_roughness), + + # np.random.uniform(-3.5, 7, n_dataset), # bulk_1_sld, + # np.random.uniform(10, 200, n_dataset), # bulk_1_thickness, + # np.random.uniform(1, 55, n_dataset), # bulk_1_roughness, + + # np.random.uniform(1, 4.2, n_dataset), # oxide_sld, + # 
np.random.uniform(5, 50, n_dataset), # oxide_thickness, + # np.random.uniform(1, 10, n_dataset), # oxide_roughness, + # ] + # ) + # generate the reference rcurves r_curves = np.apply_along_axis(param_to_rcurve, 1, parameters_ref) logger.debug(f"r_curves.shape: {r_curves.shape}")