Commit 593eaef7 authored by Zhang, Chen's avatar Zhang, Chen
Browse files

add evaluator for real data

parent 7d3517b7
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
version https://git-lfs.github.com/spec/v1
oid sha256:a7e50fb0aae4b53a3ad319cc849a53fe24b925c43cf8012c715c88593473205a
size 4704
oid sha256:1a6260fe3082bf7e5abd18a04acbb89a6d5e77850ab01afb73c489d3fb92a359
size 4705
+4 −4
Original line number Diff line number Diff line
IPTS,Run,THF SLD,THF roughness,SEI thick,SEI SLD,SEI rough,Plated thick,Plated SLD,Plated rough,Mo thick,Mo SLD,Mo rough,SiOnan thick,SiOnan SLD,SiOnan rough
29196,201083,6.1,nan,nan,6.1,nan,6.1,12.2,787,4.4,6.7,nan,nan,nan,nan
29196,201095,6.0,55,222,5.8,11.6,13.1,2.0,15,772,3.4,15,nan,nan,nan
Mo-A,207407,6.2,nan,nan,6.2,42.5,3.8,6.6,726,4.2,18,nan,nan,nan,nan
Label,IPTS,Run,THF SLD,THF roughness,SEI thick,SEI SLD,SEI rough,Plated thick,Plated SLD,Plated rough,Mo thick,Mo SLD,Mo rough,SiOnan thick,SiOnan SLD,SiOnan rough
Mo_0,29196,201083,6.1,nan,nan,6.1,nan,6.1,12.2,787,4.4,6.7,nan,nan,nan,nan
Mo_0,29196,201095,6.0,55,222,5.8,11.6,13.1,2.0,15,772,3.4,15,nan,nan,nan
Mo_A,29196,207407,6.2,nan,nan,6.2,42.5,3.8,6.6,726,4.2,18,nan,nan,nan,nan
+128 −1
Original line number Diff line number Diff line
"""Functions used to evaluate the performance of the model."""
#!/usr/bin/env python
import torch
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple, Optional
from refl1d.names import Experiment, QProbe, Parameter
@@ -9,7 +11,132 @@ from refl1d.names import FitProblem
from bumps.fitters import fit
from tgreft.utils.data.data_synthesis import RCurveGenerator
from tgreft.utils.data.data_loader import param_to_rcurve
from tgreft.analysis.utils import interpolate_data, load_csv
from tgreft.analysis.utils import interpolate_data, load_csv, get_rcurve_from_csv


class RealDataEvaluator:
    """Evaluate the model performance against real data.

    The real data evaluator uses real experiment data and reference fitting
    results produced by domain scientists to check the model performance,
    recording the results in a dataframe.

    Parameters
    ----------
    datadir : str
        The directory of the real data. It is expected to contain ``cu_film``
        and ``mo_film`` subdirectories, each holding a ``reference_params.csv``
        and per-run measurement files.
    """

    def __init__(self, datadir: str):
        self.datadir = datadir
        # per-system data directories
        self.cu_film_dir = os.path.join(self.datadir, "cu_film")
        self.mo_film_dir = os.path.join(self.datadir, "mo_film")
        # load reference (scientist-fitted) parameters into dataframes
        self.df_cu_film_ref = pd.read_csv(os.path.join(self.cu_film_dir, "reference_params.csv"))
        self.df_mo_film_ref = pd.read_csv(os.path.join(self.mo_film_dir, "reference_params.csv"))
        # cache experiment rcurves so repeated evaluations avoid disk I/O
        self.rcurve_cache = self._load_rcurves()

    def _load_rcurves(self) -> dict:
        """Load the experiment rcurves from disk into memory.

        Returns
        -------
        dict
            Maps ``"{Label}_{IPTS}_{Run}"`` to the rcurve array loaded from the
            corresponding ``IPTS_{IPTS}_{Run}.txt`` file.
        """
        rcurve_cache = {}
        for df, datadir in zip([self.df_cu_film_ref, self.df_mo_film_ref], [self.cu_film_dir, self.mo_film_dir]):
            for _, row in df.iterrows():
                data_path = os.path.join(datadir, row["Label"], f"IPTS_{row['IPTS']}_{row['Run']}.txt")
                search_key = f"{row['Label']}_{row['IPTS']}_{row['Run']}"
                rcurve, _ = get_rcurve_from_csv(data_path)
                rcurve_cache[search_key] = rcurve
        return rcurve_cache

    def evaluate(self, model: torch.nn.Module, device: torch.device, system: str = "cu") -> float:
        """Evaluate the model performance against real data, and return the evaluation score.

        Parameters
        ----------
        model : torch.nn.Module
            The model to be evaluated.
        device : torch.device
            The device to run the model.
        system : str
            The system to be evaluated, either "cu" or "mo".

        Returns
        -------
        float
            The RMSE of the model predictions against the reference parameters.

        Raises
        ------
        ValueError
            If ``system`` is neither "cu" nor "mo".
        """
        # select the reference dataframe for the requested system
        if system == "cu":
            df_ref = self.df_cu_film_ref
        elif system == "mo":
            df_ref = self.df_mo_film_ref
        else:
            raise ValueError(f"Unknown system: {system}")
        # run the model on every cached rcurve
        df_pred = self._eval(model, device, df_ref)
        # element-wise differences over the parameter columns only
        diff = df_pred.drop(columns=["Label", "IPTS", "Run"]) - df_ref.drop(columns=["Label", "IPTS", "Run"])
        # set nan to zero -> nan means any parameter will be fine
        diff = diff.fillna(0)
        # RMSE over all entries; reduce over the ndarray so a scalar float is
        # returned (np.mean on a DataFrame would yield a per-column Series)
        loss = float(np.sqrt(np.mean(diff.to_numpy() ** 2)))
        # save the predicted data to disk as csv files
        df_pred.to_csv(f"predicted_{system}.csv", index=False)
        # return the average loss
        return loss

    def _eval(self, model: torch.nn.Module, device: torch.device, df: pd.DataFrame) -> pd.DataFrame:
        """Run the model on every experiment listed in the reference dataframe.

        Parameters
        ----------
        model : torch.nn.Module
            The model to be evaluated.
        device : torch.device
            The device to run the model.
        df : pd.DataFrame
            The dataframe containing the reference data; must have "Label",
            "IPTS" and "Run" columns matching the rcurve cache keys.

        Returns
        -------
        pd.DataFrame
            The predicted layer parameters from the model, one row per
            reference entry, with the same columns as ``df``.
        """
        df_pred = []
        for _, row in df.iterrows():
            search_key = f"{row['Label']}_{row['IPTS']}_{row['Run']}"
            r_curve = self.rcurve_cache[search_key]
            pred = self._get_pred(model, device, r_curve)
            entry = [row["Label"], row["IPTS"], row["Run"]] + list(pred)
            df_pred.append(entry)
        df_pred = pd.DataFrame(df_pred, columns=df.columns)
        return df_pred

    def _get_pred(self, model: torch.nn.Module, device: torch.device, r_curve: np.ndarray) -> np.ndarray:
        """Run the model on a single rcurve and return the predicted parameters.

        Parameters
        ----------
        model : torch.nn.Module
            The model to be evaluated.
        device : torch.device
            The device to run the model.
        r_curve : np.ndarray
            The rcurve of the experiment data.

        Returns
        -------
        np.ndarray
            The predicted layer parameters from the model.
        """
        # inference mode: disable dropout/batch-norm updates and autograd
        model.eval()
        with torch.no_grad():
            r_curve = torch.tensor(r_curve, device=device, dtype=torch.float32)
            r_curve[r_curve <= 0] = 1  # avoid log(0) or log(negative)
            # move the output back to host before converting: .numpy() raises
            # on CUDA tensors, so .cpu() is required for non-CPU devices
            pred = model(r_curve.unsqueeze(0)).squeeze(0).cpu().numpy()
        return pred


def parameters_refine(