data/cu_film/reference_params.csv (LFS) +2 −2

 version https://git-lfs.github.com/spec/v1
-oid sha256:a7e50fb0aae4b53a3ad319cc849a53fe24b925c43cf8012c715c88593473205a
-size 4704
+oid sha256:1a6260fe3082bf7e5abd18a04acbb89a6d5e77850ab01afb73c489d3fb92a359
+size 4705
data/mo_film/reference_params.csv +4 −4

-IPTS,Run,THF SLD,THF roughness,SEI thick,SEI SLD,SEI rough,Plated thick,Plated SLD,Plated rough,Mo thick,Mo SLD,Mo rough,SiOnan thick,SiOnan SLD,SiOnan rough
-29196,201083,6.1,nan,nan,6.1,nan,6.1,12.2,787,4.4,6.7,nan,nan,nan,nan
-29196,201095,6.0,55,222,5.8,11.6,13.1,2.0,15,772,3.4,15,nan,nan,nan
-Mo-A,207407,6.2,nan,nan,6.2,42.5,3.8,6.6,726,4.2,18,nan,nan,nan,nan
+Label,IPTS,Run,THF SLD,THF roughness,SEI thick,SEI SLD,SEI rough,Plated thick,Plated SLD,Plated rough,Mo thick,Mo SLD,Mo rough,SiOnan thick,SiOnan SLD,SiOnan rough
+Mo_0,29196,201083,6.1,nan,nan,6.1,nan,6.1,12.2,787,4.4,6.7,nan,nan,nan,nan
+Mo_0,29196,201095,6.0,55,222,5.8,11.6,13.1,2.0,15,772,3.4,15,nan,nan,nan
+Mo_A,29196,207407,6.2,nan,nan,6.2,42.5,3.8,6.6,726,4.2,18,nan,nan,nan,nan
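The new Label column disambiguates runs that share an IPTS number and lets the evaluator locate data files under per-label subdirectories. A minimal sketch (not part of this diff) of how the table is consumed, assuming it is read with pandas as in the RealDataEvaluator class below; literal "nan" fields parse to NaN and are later treated as "any value is acceptable" for that parameter:

import pandas as pd

# Sketch only: build the "{Label}_{IPTS}_{Run}" lookup key that RealDataEvaluator
# (added in src/tgreft/analysis/evaluate.py below) uses for its rcurve cache.
df = pd.read_csv("data/mo_film/reference_params.csv")
keys = [f"{row['Label']}_{row['IPTS']}_{row['Run']}" for _, row in df.iterrows()]
print(keys)  # e.g. ['Mo_0_29196_201083', 'Mo_0_29196_201095', 'Mo_A_29196_207407']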
src/tgreft/analysis/evaluate.py +128 −1

#!/usr/bin/env python
"""Functions used to evaluate the performance of the model."""
import torch
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple, Optional
from refl1d.names import Experiment, QProbe, Parameter

@@ -9,7 +11,132 @@ from refl1d.names import FitProblem
 from bumps.fitters import fit
 from tgreft.utils.data.data_synthesis import RCurveGenerator
 from tgreft.utils.data.data_loader import param_to_rcurve
-from tgreft.analysis.utils import interpolate_data, load_csv
+from tgreft.analysis.utils import interpolate_data, load_csv, get_rcurve_from_csv


class RealDataEvaluator:
    """Evaluate the model performance against real data.

    The real data evaluator uses the real experiment data and the reference
    fitting results produced by domain scientists to check the model
    performance, recording the results in a dataframe.

    Parameters
    ----------
    datadir : str
        The directory of the real data.
    """

    def __init__(self, datadir: str):
        self.datadir = datadir
        # per-system data directories
        self.cu_film_dir = os.path.join(self.datadir, "cu_film")
        self.mo_film_dir = os.path.join(self.datadir, "mo_film")
        # load reference data into dataframes
        self.df_cu_film_ref = pd.read_csv(os.path.join(self.cu_film_dir, "reference_params.csv"))
        self.df_mo_film_ref = pd.read_csv(os.path.join(self.mo_film_dir, "reference_params.csv"))
        # cache experiment rcurves
        self.rcurve_cache = self._load_rcurves()

    def _load_rcurves(self):
        """Load the experiment rcurves into memory."""
        rcurve_cache = {}
        # walk both reference tables and cache each run's reflectivity curve
        for df, datadir in zip(
            [self.df_cu_film_ref, self.df_mo_film_ref],
            [self.cu_film_dir, self.mo_film_dir],
        ):
            for _, row in df.iterrows():
                data_path = os.path.join(datadir, row["Label"], f"IPTS_{row['IPTS']}_{row['Run']}.txt")
                search_key = f"{row['Label']}_{row['IPTS']}_{row['Run']}"
                rcurve, _ = get_rcurve_from_csv(data_path)
                rcurve_cache[search_key] = rcurve
        return rcurve_cache

    def evaluate(self, model: torch.nn.Module, device: torch.device, system: str = "cu") -> float:
        """Evaluate the model against real data and return the evaluation score.

        Parameters
        ----------
        model : torch.nn.Module
            The model to be evaluated.
        device : torch.device
            The device to run the model on.
        system : str
            The system to be evaluated, either "cu" or "mo".

        Returns
        -------
        float
            The RMSE of the model predictions against the reference fits.
        """
        # select the reference table for the requested system
        if system == "cu":
            df_ref = self.df_cu_film_ref
        elif system == "mo":
            df_ref = self.df_mo_film_ref
        else:
            raise ValueError(f"Unknown system: {system}")
        # evaluate the model
        df_pred = self._eval(model, device, df_ref)
        # per-parameter differences between prediction and reference fit
        diff = df_pred.drop(columns=["Label", "IPTS", "Run"]) - df_ref.drop(columns=["Label", "IPTS", "Run"])
        # set nan to zero -> nan means any value for that parameter is acceptable
        diff = diff.fillna(0)
        # compute the root-mean-square error over all parameters
        loss = float(np.sqrt(np.mean(diff.to_numpy() ** 2)))
        # save the predicted data to disk as a csv file
        df_pred.to_csv(f"predicted_{system}.csv", index=False)
        return loss

    def _eval(self, model: torch.nn.Module, device: torch.device, df: pd.DataFrame) -> pd.DataFrame:
        """Evaluate the model against the given reference dataframe.

        Parameters
        ----------
        model : torch.nn.Module
            The model to be evaluated.
        device : torch.device
            The device to run the model on.
        df : pd.DataFrame
            The dataframe containing the reference data.

        Returns
        -------
        pd.DataFrame
            The predicted layer parameters from the model.
        """
        df_pred = []
        for _, row in df.iterrows():
            search_key = f"{row['Label']}_{row['IPTS']}_{row['Run']}"
            r_curve = self.rcurve_cache[search_key]
            pred = self._get_pred(model, device, r_curve)
            entry = [row["Label"], row["IPTS"], row["Run"]] + list(pred)
            df_pred.append(entry)
        df_pred = pd.DataFrame(df_pred, columns=df.columns)
        return df_pred

    def _get_pred(self, model: torch.nn.Module, device: torch.device, r_curve: np.ndarray) -> np.ndarray:
        """Run the model on a single experimental reflectivity curve.

        Parameters
        ----------
        model : torch.nn.Module
            The model to be evaluated.
        device : torch.device
            The device to run the model on.
        r_curve : np.ndarray
            The rcurve of the experiment data.

        Returns
        -------
        np.ndarray
            The predicted layer parameters from the model.
        """
        with torch.no_grad():
            r_curve = torch.tensor(r_curve, device=device, dtype=torch.float32)
            r_curve[r_curve <= 0] = 1  # avoid log(0) or log(negative)
            pred = model(r_curve.unsqueeze(0)).squeeze(0).cpu().numpy()
        return pred


def parameters_refine(