Commit 05c8e55a authored by Zhang, Chen
Browse files

Merge branch 'cu_film_run' into 'main'

update training code for cu film system

See merge request !7
parents 5400180a 544cd364
Loading
Loading
Loading
Loading
+6 −5
Original line number Diff line number Diff line
@@ -14,8 +14,8 @@ if __name__ == "__main__":

    train_params = {
        "cuda_id": 0,
        "n_epochs": 50,  # seems like 50 is the point where training and validation loss diverge
        "n_training": 1_500_000,
        "n_epochs": 100,  # seems like 50 is the point where training and validation loss diverge
        "n_training": 2_000_000,
        "error": 0.07,
        "batch_size": 180,
        "learning_rate": 0.005,
@@ -23,13 +23,14 @@ if __name__ == "__main__":
        "optimizer": "Adam",
        "loss": "composite",
        "cache_dir": "data",
        "experiment_name": "Train_nTRACE",
        "run_name": "d2048_h32_l6_cmloss",
        "experiment_name": "nTrace_cu_film_6layer",
        "run_name": "d2048_h32_l6",
        "datadir": "expdata",
    }

    train(
        train_params=train_params,
        model_params=model_params,
        load_pretrained=False,
        auto_advance=False,
        auto_advance=True,
    )
+8 −6
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ from tgreft.analysis.utils import interpolate_data, load_csv, get_rcurve_from_cs
class RealDataEvaluator:
    """Evaluate the model performance against real data.

    The real data evalutor will use the real experiment data and reference fitting results
    The real data evaluator will use the real experiment data and reference fitting results
    by domain scientist to check the model performance, recording the data in the dataframe.

    Parameters
@@ -49,7 +49,7 @@ class RealDataEvaluator:
                rcurve_cache[search_key] = rcurve
        return rcurve_cache

    def evaluate(self, model: torch.nn.Module, device: torch.device, system: str = "cu") -> float:
    def evaluate(self, model: torch.nn.Module, device: torch.device, system: str = "cu", epoch: int=0) -> float:
        """Evaluate the model performance against real data, and return the evaluation score.

        Parameters
@@ -60,6 +60,8 @@ class RealDataEvaluator:
            The device to run the model.
        system : str
            The system to be evaluated, either "cu" or "mo".
        epoch : int
            The epoch of the model.

        Returns
        -------
@@ -79,12 +81,12 @@ class RealDataEvaluator:
        diff = df_pred.drop(columns=["Label", "IPTS", "Run"]) - df_ref.drop(columns=["Label", "IPTS", "Run"])
        # set nan to zero -> nan means any parameter will be fine
        diff = diff.fillna(0)
        # compute the loss
        # compute the average loss
        loss = np.sqrt(np.mean(diff**2))
        # save the predicted data to disk as csv files
        df_pred.to_csv(f"predicted_{system}.csv", index=False)
        df_pred.to_csv(f"predicted_{system}_epoch{epoch:03d}.csv", index=False)
        # return the average loss
        return loss
        return loss / len(df_ref)

    def _eval(self, model: torch.nn.Module, device: torch.device, df: pd.DataFrame) -> pd.DataFrame:
        """Evaluate the model performance against the given dataframe.
@@ -135,7 +137,7 @@ class RealDataEvaluator:
        with torch.no_grad():
            r_curve = torch.tensor(r_curve, device=device, dtype=torch.float32)
            r_curve[r_curve <= 0] = 1  # avoid log(0) or log(negative)
            pred = model(r_curve.unsqueeze(0)).squeeze(0).numpy()
            pred = model(r_curve.unsqueeze(0)).cpu().squeeze(0).numpy()
        return pred


+0 −10
Original line number Diff line number Diff line
#!/usr/bin/env python
"""Extended loss functions for tgreft."""
import logging
import torch
import torch.nn as nn
import numpy as np
from multiprocessing import Pool
from tgreft.utils.data.data_loader import param_to_rcurve

logger = logging.getLogger("LOSS")
logger.setLevel(logging.DEBUG)
# create a file handler
handler = logging.FileHandler("loss.log")
logger.addHandler(handler)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
logger.handlers[0].setFormatter(formatter)


def parallel_param_to_rcurve(params):
    with Pool() as p:
@@ -60,7 +51,6 @@ class CompositeLoss(nn.Module):
        true_rcurve = torch.log(true_rcurve)
        loss_rcurve = self.rcurve_loss(pred_rcurve, true_rcurve)
        # - calculate the combined loss
        logger.debug(f"loss_param: {loss_param};\tloss_rcurve: {loss_rcurve}")
        loss = self.lambda_param * loss_param + self.lambda_curve * loss_rcurve
        return loss

+50 −11
Original line number Diff line number Diff line
#!/usr/bin/env python3
"""Train function for the transformer model."""
import os
import shutil
import torch
import mlflow
import logging
@@ -16,6 +17,7 @@ from tgreft.train.generic import (
    get_loss,
)
from tgreft.utils.data.data_loader import get_dataset
from tgreft.analysis.evaluate import RealDataEvaluator


logger = logging.getLogger("mTRACE_trainer")
@@ -64,8 +66,9 @@ def train(
    optimizer = train_params["optimizer"]
    loss = train_params["loss"]
    cache_dir = train_params["cache_dir"]
    experiment_name = train_params.get("experiment_name", "Train_REFL_GPT")
    experiment_name = train_params.get("experiment_name", "Train_REFL")
    run_name = train_params.get("run_name", None)
    datadir = train_params.get("datadir", "data")

    # parse model parameters
    d_model = model_params["d_model"]
@@ -98,6 +101,9 @@ def train(
        shuffle=False,
    )

    # prepare real data evaluator
    real_data_evaluator = RealDataEvaluator(datadir=datadir)

    # prepare model
    logger.info("Preparing model...")
    device = torch.device(f"cuda:{cuda_id}" if torch.cuda.is_available() else "cpu")
@@ -110,10 +116,10 @@ def train(
        to_log=to_log,
    ).to(device)
    # check if need to load pretrained model
    model_name = "model_gpt.pt"
    model_name = "model_nTRACE.pt"
    if load_pretrained and os.path.exists(model_name):
        logger.info("Loading pretrained model...")
        model.load_state_dict(torch.load("model_gpt.pt"))
        model.load_state_dict(torch.load(model_name))
    # calculate the number of parameters in the model
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # log the number of parameters
@@ -138,6 +144,9 @@ def train(

    # start training with mlflow logging
    best_loss = float("inf")
    best_loss_epoch = 0
    best_real_data_loss = float("inf")
    best_real_data_loss_epoch = 0
    logger.info("Start training...")
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=run_name):
@@ -170,6 +179,20 @@ def train(
            )
            logger.info(f"Testing loss: {test_loss}")
            mlflow.log_metric("test_loss", test_loss, step=epoch)

            # evaluate on real data
            system="cu"
            real_data_loss = real_data_evaluator.evaluate(
                model=model,
                device=device,
                system=system,
                epoch=epoch,
            )
            logger.info(f"Real data loss: {real_data_loss}")
            artifact_name = f"predicted_{system}_epoch{epoch:03d}.csv"
            mlflow.log_metric("real_data_loss", real_data_loss, step=epoch)
            mlflow.log_artifact(artifact_name, "real_data_pred")

            # visualize every 5 epochs
            if epoch % 5 == 0:
                logger.info("Visualizing model...")
@@ -180,13 +203,29 @@ def train(
                    device=device,
                )
                logger.info("Done visualizing model.")

            # save model if loss is better
            if test_loss < best_loss:
                best_loss = test_loss
                best_loss_epoch = epoch
                logger.info("Saving model...")
                torch.save(model.state_dict(), model_name)
                mlflow.log_artifact(model_name, "models")
                logger.info("Done saving model.")
            mlflow.log_metric("best_loss", best_loss, step=epoch)
            mlflow.log_metric("best_loss_epoch", best_loss_epoch, step=epoch)

            # save model if real data loss is better
            if real_data_loss < best_real_data_loss:
                best_real_data_loss = real_data_loss
                best_real_data_loss_epoch = epoch
                logger.info("Saving model...")
                model_name_real = "model_nTRACE_real.pt"
                torch.save(model.state_dict(), model_name_real)
                mlflow.log_artifact(model_name_real, "models")
                logger.info("Done saving model.")
            mlflow.log_metric("best_real_data_loss", best_real_data_loss, step=epoch)
            mlflow.log_metric("best_real_data_loss_epoch", best_real_data_loss_epoch, step=epoch)

            # if loss becomes nan, break the loop
            if np.isnan(test_loss):
@@ -196,7 +235,7 @@ def train(
        logger.info("Training complete, saving model...")
        model_name_final = "model_gpt_final.pt"
        torch.save(model.state_dict(), model_name_final)
        mlflow.log_artifact(model_name_final, "models")
        mlflow.log_artifact(model_name_final)
        logger.info("Done saving model.")

        # get final loss
+67 −13
Original line number Diff line number Diff line
@@ -83,26 +83,28 @@ def generate_data(
    """
    # generate the reference parameters
    # NOTE: the n-Trace model is more expressive with relaxed bounds

    # cu_film, 6 layer
    parameters_ref = np.column_stack(
        [
            np.random.uniform(-1.0, 7.0, n_dataset),  # electrolyte_sld, including air and H2O
            np.random.uniform(5.0, 7.0, n_dataset),  # electrolyte_sld,
            np.random.uniform(5, 120, n_dataset),  # electrolyte_roughness,

            np.random.uniform(-5.0, 6.5, n_dataset),  # sei_sld,
            np.random.uniform(10, 500, n_dataset),  # sei_thickness,
            np.random.uniform(1, 80, n_dataset),  # sei_roughness,

            np.random.uniform(-2, 7, n_dataset),  # bulk_3_sld,
            np.random.uniform(10, 200, n_dataset),  # bulk_3_thickness,
            np.random.uniform(1, 55, n_dataset),  # bulk_3_roughness,
            np.random.uniform(-2, 6, n_dataset),  # material_sld,
            np.random.uniform(10, 200, n_dataset),  # material_thickness,
            np.random.uniform(1, 35, n_dataset),  # material_roughness,

            np.random.uniform(2, 7, n_dataset),  # bulk_2_sld,
            np.random.uniform(20, 700, n_dataset),  # bulk_2_thickness (cu_thickness),
            np.random.uniform(1, 55, n_dataset),  # bulk_2_roughness (cu_roughness),
            np.random.uniform(6, 7, n_dataset),  # cu_sld,
            np.random.uniform(20, 700, n_dataset),  # cu_thickness,
            np.random.uniform(1, 35, n_dataset),  # cu_roughness,

            np.random.uniform(-3.5, 7, n_dataset),  # bulk_1_sld,
            np.random.uniform(10, 200, n_dataset),  # bulk_1_thickness,
            np.random.uniform(1, 55, n_dataset),  # bulk_1_roughness,
            np.random.uniform(-3.5, 0, n_dataset),  # ti_sld,
            np.random.uniform(10, 100, n_dataset),  # ti_thickness,
            np.random.uniform(1, 35, n_dataset),  # ti_roughness,

            np.random.uniform(1, 4.2, n_dataset),  # oxide_sld,
            np.random.uniform(5, 50, n_dataset),  # oxide_thickness,
@@ -110,6 +112,58 @@ def generate_data(
        ]
    )

    # mo_film, 5 layer
    # parameters_ref = np.column_stack(
    #     [
    #         np.random.uniform(-1.0, 7.0, n_dataset),  # electrolyte_sld,  I changed the lower bound to -1 to include air and H2O
    #         np.random.uniform(5, 120, n_dataset),  # electrolyte_roughness,

    #         np.random.uniform(-5.0, 6.5, n_dataset),  # sei_sld,
    #         np.random.uniform(10, 500, n_dataset),  # sei_thickness,
    #         np.random.uniform(1, 80, n_dataset),  # sei_roughness,

    #         np.random.uniform(-2, 7, n_dataset),  # bulk_3_sld,
    #         np.random.uniform(10, 200, n_dataset),  # material_thickness,
    #         np.random.uniform(1, 55, n_dataset),  # material_roughness,
            
    #         np.random.uniform(2, 6, n_dataset),  # bulk_2_sld,
    #         np.random.uniform(20, 700, n_dataset),  # cu_thickness,
    #         np.random.uniform(1, 55, n_dataset),  # cu_roughness,

    #         np.random.uniform(1, 4.2, n_dataset),  # oxide_sld,
    #         np.random.uniform(5, 50, n_dataset),  # oxide_thickness,
    #         np.random.uniform(1, 10, n_dataset),  # oxide_roughness,
    #     ]
    # )

    # extended 6 layer model
    # parameters_ref = np.column_stack(
    #     [
    #         np.random.uniform(-1.0, 7.0, n_dataset),  # electrolyte_sld, including air and H2O
    #         np.random.uniform(5, 120, n_dataset),  # electrolyte_roughness,

    #         np.random.uniform(-5.0, 6.5, n_dataset),  # sei_sld,
    #         np.random.uniform(10, 500, n_dataset),  # sei_thickness,
    #         np.random.uniform(1, 80, n_dataset),  # sei_roughness,

    #         np.random.uniform(-2, 7, n_dataset),  # bulk_3_sld,
    #         np.random.uniform(10, 200, n_dataset),  # bulk_3_thickness,
    #         np.random.uniform(1, 55, n_dataset),  # bulk_3_roughness,
            
    #         np.random.uniform(2, 7, n_dataset),  # bulk_2_sld,
    #         np.random.uniform(20, 700, n_dataset),  # bulk_2_thickness (cu_thickness),
    #         np.random.uniform(1, 55, n_dataset),  # bulk_2_roughness (cu_roughness),
            
    #         np.random.uniform(-3.5, 7, n_dataset),  # bulk_1_sld,
    #         np.random.uniform(10, 200, n_dataset),  # bulk_1_thickness,
    #         np.random.uniform(1, 55, n_dataset),  # bulk_1_roughness,
            
    #         np.random.uniform(1, 4.2, n_dataset),  # oxide_sld,
    #         np.random.uniform(5, 50, n_dataset),  # oxide_thickness,
    #         np.random.uniform(1, 10, n_dataset),  # oxide_roughness,
    #     ]
    # )

    # generate the reference rcurves
    r_curves = np.apply_along_axis(param_to_rcurve, 1, parameters_ref)
    logger.debug(f"r_curves.shape: {r_curves.shape}")