# Source code for summit.benchmarks.experimental_emulator

import os
import os.path as osp

from summit.experiment import Experiment

import numpy as np

from summit.benchmarks.experiment_emulator.bnn_emulator import BNNEmulator
from summit.utils.dataset import DataSet
from summit.domain import *
from summit.utils import jsonify_dict, unjsonify_dict


class ExperimentalEmulator(Experiment):
    """Experimental Emulator

    A virtual experiment whose results are predicted by an emulator (a
    regression model) instead of being measured.

    Parameters
    ----------
    domain: summit.domain.Domain
        The domain of the experiment
    dataset: :class:`~summit.utils.dataset.DataSet`, optional
        A DataSet with data for training where the data columns correspond to the domain and the data rows correspond to the training points.
        By default: None
    csv_dataset: string, optional
        Path to csv_file with data for training where columns correspond to the domain and the rows correspond to the training points.
        Note that the first row should exactly match the variable names of the domain and the second row should only have "DATA" as entry.
        If given, it takes precedence over ``dataset``.
        By default: None
    model_name: string, optional
        Name of the model that is used for saving model parameters. Should be unique.
        By default: "dataset_name_emulator_bnn"
    regressor_type: string, optional
        Type of the regressor that is used within the emulator (available: "BNN").
        By default: "BNN"
    cat_to_descr: Boolean, optional
        If True, transform categorical variable to one or more continuous variable(s)
        corresponding to the descriptors of the categorical variable (else do nothing).
        By default: False
    **kwargs
        Further options; they are passed on to the emulator together with
        ``cat_to_descr``.

    Raises
    ------
    NotImplementedError
        If ``regressor_type`` is anything other than "BNN".

    Examples
    --------
    >>> test_domain = ReizmanSuzukiEmulator().domain
    >>> e = ExperimentalEmulator(domain=test_domain, model_name="Pytest")
    No trained model for Pytest. Train this model with ExperimentalEmulator.train() in order to use this Emulator as an virtual Experiment.
    >>> columns = [v.name for v in e.domain.variables]
    >>> train_values = {("catalyst", "DATA"): ["P1-L2", "P1-L7", "P1-L3", "P1-L3"], ("t_res", "DATA"): [60, 120, 110, 250], ("temperature", "DATA"): [110, 30, 70, 80], ("catalyst_loading", "DATA"): [0.508, 0.6, 1.4, 1.3], ("yield", "DATA"): [20, 40, 60, 34], ("ton", "DATA"): [33, 34, 21, 22]}
    >>> train_dataset = DataSet(train_values, columns=columns)
    >>> e.train(train_dataset, verbose=False, cv_fold=2, test_size=0.25)
    >>> columns = [v.name for v in e.domain.variables]
    >>> values = [float(v.bounds[0] + 0.6 * (v.bounds[1] - v.bounds[0])) if v.variable_type == 'continuous' else v.levels[-1] for v in e.domain.variables]
    >>> values = np.array(values)
    >>> values = np.atleast_2d(values)
    >>> conditions = DataSet(values, columns=columns)
    >>> results = e.run_experiments(conditions)

    """

    def __init__(
        self,
        domain,
        dataset=None,
        csv_dataset=None,
        model_name="dataset_name_emulator_bnn",
        regressor_type="BNN",
        cat_to_descr=False,
        **kwargs
    ):
        super().__init__(domain)
        dataset = self._check_datasets(dataset, csv_dataset)

        # The emulator receives cat_to_descr as part of its kwargs dictionary.
        kwargs["cat_to_descr"] = cat_to_descr

        if regressor_type != "BNN":
            raise NotImplementedError(
                "Regressor type <{}> not implemented yet".format(str(regressor_type))
            )

        self.emulator = BNNEmulator(
            domain=domain, dataset=dataset, model_name=model_name, kwargs=kwargs
        )
        # Loading a previously trained model is best effort: a missing model is
        # reported, not fatal. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        try:
            self.extras = [self.emulator._load_model(model_name)]
        except Exception:
            print(
                "No trained model for {}. Train this model with ExperimentalEmulator.train() in order to use this Emulator as an virtual Experiment.".format(
                    self.emulator.model_name
                )
            )

    def _run(self, conditions, **kwargs):
        """Predict the outcomes for a single set of conditions.

        ``conditions`` must provide ``to_frame()`` (presumably a pandas
        Series holding one row of conditions -- confirm against the caller).
        Each key returned by the emulator is written back into ``conditions``
        under ``(key, "DATA")``.

        Returns
        -------
        tuple
            ``(conditions, None)``.
        """
        condition = DataSet.from_df(conditions.to_frame().T)
        infer_dict = self.emulator.infer_model(dataset=condition)
        for k, v in infer_dict.items():
            conditions[(k, "DATA")] = v
        return conditions, None

    def train(self, dataset=None, csv_dataset=None, verbose=True, **kwargs):
        """Train the emulator.

        Parameters
        ----------
        dataset: DataSet, optional
            Training data.
        csv_dataset: string, optional
            Path to a csv file with training data; takes precedence over
            ``dataset``.
        verbose: Boolean, optional
            Passed on to the emulator's ``train_model``.
        **kwargs
            Training hyperparameters, passed on to the emulator.
        """
        dataset = self._check_datasets(dataset, csv_dataset)
        self.emulator.set_training_hyperparameters(kwargs=kwargs)
        self.emulator.train_model(dataset=dataset, verbose=verbose, kwargs=kwargs)
        self.extras = [self.emulator.output_models]

    def validate(
        self,
        dataset=None,
        csv_dataset=None,
        parity_plots=False,
        get_pred=False,
        **kwargs
    ):
        """Validate the emulator.

        If a dataset (or csv file) is given, the emulator is validated on it.
        Otherwise the emulator's ``validate_model`` is called without a
        dataset, i.e. on the data it already holds.

        Returns
        -------
        Whatever the emulator's ``validate_model`` returns.

        Raises
        ------
        ValueError
            If no dataset is given and validation without a dataset fails.
        """
        dataset = self._check_datasets(dataset, csv_dataset)
        if dataset is not None:
            return self.emulator.validate_model(
                dataset=dataset,
                parity_plots=parity_plots,
                get_pred=get_pred,
                kwargs=kwargs,
            )

        print("Evaluation based on training and test set.")
        try:
            return self.emulator.validate_model(parity_plots=parity_plots)
        except Exception as exc:
            # Keep the original failure attached as the cause.
            raise ValueError("No dataset to evaluate.") from exc

    def _check_datasets(self, dataset=None, csv_dataset=None):
        """Return the dataset to use, preferring the csv file if one is given.

        When ``csv_dataset`` is truthy it is read with ``DataSet.read_csv`` and
        replaces ``dataset`` (a notice is printed if both were supplied).
        Otherwise ``dataset`` is returned unchanged, which may be None.
        """
        if csv_dataset:
            # Explicit None check: the truth value of a DataFrame-like object
            # is ambiguous and raises ValueError.
            if dataset is not None:
                print(
                    "Dataset and csv.dataset are given, hence dataset will be overwritten by csv.data."
                )
            dataset = DataSet.read_csv(csv_dataset, index_col=None)
        return dataset

    def to_dict(self, **kwargs):
        """Serialize the class to a dictionary

        Subclasses can add a experiment_params dictionary
        key with custom parameters for the experiment
        """
        kwargs.update(
            dict(
                model_name=self.emulator.model_name,
                dataset=self.emulator._dataset.to_dict()
                if self.emulator._dataset is not None
                else None,
                output_models=self.emulator.output_models,
            )
        )
        return super().to_dict(**kwargs)

    @classmethod
    def from_dict(cls, d):
        """Create an emulator from a dictionary produced by ``to_dict``.

        The serialized dataset under ``d["experiment_params"]["dataset"]`` is
        converted back into a DataSet; a value of None (written by ``to_dict``
        when the emulator has no dataset) is passed through unchanged. The
        input dictionary itself is not modified.
        """
        # Work on shallow copies so the same dictionary can be deserialized
        # more than once.
        params = dict(d["experiment_params"])
        if params["dataset"] is not None:
            params["dataset"] = DataSet.from_dict(params["dataset"])
        exp = super().from_dict({**d, "experiment_params": params})
        exp.emulator.output_models = params["output_models"]
        return exp


class ReizmanSuzukiEmulator(ExperimentalEmulator):
    """Reizman Suzuki Emulator

    Virtual experiments representing the Suzuki-Miyaura Cross-Coupling reaction
    similar to Reizman et al. (2016). Experimental outcomes are based on an
    emulator that is trained on the experimental data published by Reizman et al.

    Parameters
    ----------
    case: int, optional, default=1
        Reizman et al. (2016) reported experimental data for 4 different
        cases. The case number refers to the cases they reported.
        Please see their paper for more information on the cases.
    **kwargs
        Accepted but not used by this class.

    Examples
    --------
    >>> reizman_emulator = ReizmanSuzukiEmulator(case=1)

    Notes
    -----
    This benchmark is based on data from [Reizman]_ et al.

    References
    ----------
    .. [Reizman] B. J. Reizman et al., React. Chem. Eng., 2016, 1, 658–666.
       DOI: `10.1039/C6RE00153J <https://doi.org/10.1039/C6RE00153J>`_.

    """

    def __init__(self, case=1, **kwargs):
        # The case number selects both the model name and the data file.
        model_name = "reizman_suzuki_case" + str(case)
        domain = self.setup_domain()
        dataset_file = osp.join(
            osp.dirname(osp.realpath(__file__)),
            "experiment_emulator",
            "data",
            model_name + "_train_test.csv",
        )
        super().__init__(domain=domain, csv_dataset=dataset_file, model_name=model_name)

    def setup_domain(self):
        """Build the domain: four decision variables and two objectives."""
        domain = Domain()

        # Decision variables
        des_1 = "Catalyst type - different ligands"
        domain += CategoricalVariable(
            name="catalyst",
            description=des_1,
            levels=[
                "P1-L1",
                "P2-L1",
                "P1-L2",
                "P1-L3",
                "P1-L4",
                "P1-L5",
                "P1-L6",
                "P1-L7",
            ],
        )

        des_2 = "Residence time in seconds (s)"
        domain += ContinuousVariable(name="t_res", description=des_2, bounds=[60, 600])

        des_3 = "Reactor temperature in degrees Celsius (ºC)"
        domain += ContinuousVariable(
            name="temperature", description=des_3, bounds=[30, 110]
        )

        des_4 = "Catalyst loading in mol%"
        domain += ContinuousVariable(
            name="catalyst_loading", description=des_4, bounds=[0.5, 2.5]
        )

        # Objectives
        des_5 = (
            "Turnover number - moles product generated divided by moles catalyst used"
        )
        domain += ContinuousVariable(
            name="ton",
            description=des_5,
            bounds=[0, 200],  # TODO: not sure about bounds, maybe redefine
            is_objective=True,
            maximize=True,
        )

        des_6 = "Yield"
        domain += ContinuousVariable(
            name="yield",
            description=des_6,
            bounds=[0, 100],
            is_objective=True,
            maximize=True,
        )

        return domain

    def to_dict(self):
        """Serialize the class to a dictionary

        The case is recovered from the last character of the model name, so it
        is stored as a one-character string.
        """
        experiment_params = dict(
            case=self.emulator.model_name[-1],
        )
        return super().to_dict(**experiment_params)


class BaumgartnerCrossCouplingEmulator(ExperimentalEmulator):
    """Baumgartner Cross Coupling Emulator

    Virtual experiments representing the Aniline Cross-Coupling reaction
    similar to Baumgartner et al. (2019). Experimental outcomes are based on an
    emulator that is trained on the experimental data published by Baumgartner et al.

    This is a five dimensional optimisation of temperature, residence time, base equivalents,
    catalyst and base.

    The categorical variables (catalyst and base) contain descriptors
    calculated using COSMO-RS. Specifically, the descriptors are the first two sigma moments.

    Parameters
    ----------
    model_name: string, optional
        Name of the model. By default: "baumgartner_aniline_cn_crosscoupling"
    dataset_file: string, optional
        File name of the training data inside the ``experiment_emulator/data``
        directory next to this module.
        By default: "baumgartner_aniline_cn_crosscoupling.csv"

    Examples
    --------
    >>> bemul = BaumgartnerCrossCouplingEmulator()

    Notes
    -----
    This benchmark is based on data from [Baumgartner]_ et al.

    References
    ----------

    .. [Baumgartner] L. M. Baumgartner et al., Org. Process Res. Dev., 2019, 23, 1594–1601
       DOI: `10.1021/acs.oprd.9b00236 <https://doi.org/10.1021/acs.oprd.9b00236>`_

    """

    def __init__(self, **kwargs):
        # Only model_name and dataset_file are read from kwargs.
        model_name = kwargs.get("model_name", "baumgartner_aniline_cn_crosscoupling")
        dataset_file = kwargs.get(
            "dataset_file", "baumgartner_aniline_cn_crosscoupling.csv"
        )
        domain = self.setup_domain()
        dataset_file = osp.join(
            osp.dirname(osp.realpath(__file__)),
            "experiment_emulator/data/" + dataset_file,
        )
        super().__init__(domain=domain, csv_dataset=dataset_file, model_name=model_name)

    def setup_domain(self):
        """Build the domain: five decision variables and the yield objective."""
        domain = Domain()

        # Decision variables
        des_1 = "Catalyst type"
        # One descriptor row per catalyst; the trailing comments keep further
        # descriptor values that are not used in this domain.
        catalyst_df = DataSet(
            [
                [460.7543, 67.2057],  # 30.8413, 2.3043, 0], #, 424.64, 421.25040226],
                [518.8408, 89.8738],  # 39.4424, 2.5548, 0], #, 487.7, 781.11247064],
                [819.933, 129.0808],  # 83.2017, 4.2959, 0], #, 815.06, 880.74916884],
            ],
            index=["tBuXPhos", "tBuBrettPhos", "AlPhos"],
            columns=[
                "area_cat",
                "M2_cat",
            ],  # , 'M3_cat', 'Macc3_cat', 'Mdon3_cat'] #,'mol_weight', 'sol']
        )
        domain += CategoricalVariable(
            name="catalyst",
            description=des_1,
            levels=["tBuXPhos", "tBuBrettPhos", "AlPhos"],
            descriptors=catalyst_df,
        )

        des_2 = "Base"
        # One descriptor row per base, in the order given by `index`.
        base_df = DataSet(
            [
                [162.2992, 25.8165],  # 40.9469, 3.0278, 0], #101.19, 642.2973283],
                [165.5447, 81.4847],  # 107.0287, 10.215, 0.0169], # 115.18, 534.01544123],
                [227.3523, 30.554],  # 14.3676, 1.1196, 0.0127], # 171.28, 839.81215],
                [192.4693, 59.8367],  # 82.0661, 7.42, 0], # 152.24, 1055.82799],
            ],
            index=["TEA", "TMG", "BTMG", "DBU"],
            columns=["area", "M2"],  # , 'M3', 'Macc3', 'Mdon3'], # 'mol_weight', 'sol']
        )
        domain += CategoricalVariable(
            name="base",
            description=des_2,
            levels=["DBU", "BTMG", "TMG", "TEA"],
            descriptors=base_df,
        )

        des_3 = "Base equivalents"
        domain += ContinuousVariable(
            name="base_equivalents", description=des_3, bounds=[1.0, 2.5]
        )

        des_4 = "Temperature in degrees Celsius (ºC)"
        domain += ContinuousVariable(
            name="temperature", description=des_4, bounds=[30, 100]
        )

        des_5 = "residence time in seconds (s)"
        domain += ContinuousVariable(name="t_res", description=des_5, bounds=[60, 1800])

        # Objective
        des_6 = "Yield"
        domain += ContinuousVariable(
            name="yld",
            description=des_6,
            bounds=[0.0, 1.0],
            is_objective=True,
            maximize=True,
        )

        return domain


class BaumgartnerCrossCouplingDescriptorEmulator(ExperimentalEmulator):
    """Baumgartner Cross Coupling Descriptor Emulator

    Virtual experiments representing the Aniline Cross-Coupling reaction
    similar to Baumgartner et al. (2019). Experimental outcomes are based on an
    emulator that is trained on the experimental data published by Baumgartner et al.

    The difference with this model is that it uses descriptors for the catalyst and base
    instead of one-hot encoding the options. Each catalyst and base carries five
    descriptor columns (area, M2, M3, Macc3 and Mdon3) from COSMO-RS.

    Parameters
    ----------
    model_name: string, optional
        Name of the model.
        By default: "baumgartner_aniline_cn_crosscoupling_descriptors"
    dataset_file: string, optional
        File name of the training data inside the ``experiment_emulator/data``
        directory next to this module.
        By default: "baumgartner_aniline_cn_crosscoupling_descriptors.csv"

    Examples
    --------
    >>> bemul = BaumgartnerCrossCouplingDescriptorEmulator()

    Notes
    -----
    This benchmark is based on data from [Baumgartner]_ et al.

    References
    ----------

    .. [Baumgartner] L. M. Baumgartner et al., Org. Process Res. Dev., 2019, 23, 1594–1601
       DOI: `10.1021/acs.oprd.9b00236 <https://doi.org/10.1021/acs.oprd.9b00236>`_

    """

    def __init__(self, **kwargs):
        # Only model_name and dataset_file are read from kwargs.
        model_name = kwargs.get(
            "model_name", "baumgartner_aniline_cn_crosscoupling_descriptors"
        )
        dataset_file = kwargs.get(
            "dataset_file", "baumgartner_aniline_cn_crosscoupling_descriptors.csv"
        )
        domain = self.setup_domain()
        dataset_file = osp.join(
            osp.dirname(osp.realpath(__file__)),
            "experiment_emulator/data/" + dataset_file,
        )
        super().__init__(domain=domain, csv_dataset=dataset_file, model_name=model_name)

    def setup_domain(self):
        """Build the domain: five decision variables and the yield objective.

        The categorical variables are defined through their descriptor tables
        only; no explicit ``levels`` are passed.
        """
        domain = Domain()

        # Decision variables
        des_1 = "Catalyst type with descriptors"
        # One descriptor row per catalyst; trailing comments keep values for
        # the unused 'mol_weight' and 'sol' columns.
        catalyst_df = DataSet(
            [
                [460.7543, 67.2057, 30.8413, 2.3043, 0],  # , 424.64, 421.25040226],
                [518.8408, 89.8738, 39.4424, 2.5548, 0],  # , 487.7, 781.11247064],
                [819.933, 129.0808, 83.2017, 4.2959, 0],  # , 815.06, 880.74916884],
            ],
            index=["tBuXPhos", "tBuBrettPhos", "AlPhos"],
            columns=[
                "area_cat",
                "M2_cat",
                "M3_cat",
                "Macc3_cat",
                "Mdon3_cat",
            ],  # ,'mol_weight', 'sol']
        )
        domain += CategoricalVariable(
            name="catalyst", description=des_1, descriptors=catalyst_df
        )

        des_2 = "Base type with descriptors"
        # One descriptor row per base, in the order given by `index`.
        base_df = DataSet(
            [
                [162.2992, 25.8165, 40.9469, 3.0278, 0],  # 101.19, 642.2973283],
                [165.5447, 81.4847, 107.0287, 10.215, 0.0169],  # 115.18, 534.01544123],
                [227.3523, 30.554, 14.3676, 1.1196, 0.0127],  # 171.28, 839.81215],
                [192.4693, 59.8367, 82.0661, 7.42, 0],  # 152.24, 1055.82799],
            ],
            index=["TEA", "TMG", "BTMG", "DBU"],
            columns=["area", "M2", "M3", "Macc3", "Mdon3"],  # 'mol_weight', 'sol']
        )
        domain += CategoricalVariable(
            name="base", description=des_2, descriptors=base_df
        )

        des_3 = "Base equivalents"
        domain += ContinuousVariable(
            name="base_equivalents", description=des_3, bounds=[1.0, 2.5]
        )

        des_4 = "Temperature in degrees Celsius (ºC)"
        domain += ContinuousVariable(
            name="temperature", description=des_4, bounds=[30, 100]
        )

        des_5 = "residence time in seconds (s)"
        domain += ContinuousVariable(name="t_res", description=des_5, bounds=[60, 1800])

        # Objective
        des_6 = "Yield"
        domain += ContinuousVariable(
            name="yield",
            description=des_6,
            bounds=[0.0, 1.0],
            is_objective=True,
            maximize=True,
        )

        return domain


class BaumgartnerCrossCouplingEmulator_Yield_Cost(BaumgartnerCrossCouplingEmulator):
    """Baumgartner Cross Coupling Emulator (yield and cost)

    Virtual experiments representing the Aniline Cross-Coupling reaction
    similar to Baumgartner et al. (2019). Experimental outcomes are based on an
    emulator that is trained on the experimental data published by Baumgartner et al.

    This is a multiobjective version for optimizing yield and cost simultaneously.
    The yield comes from the emulator; the cost is calculated from the
    catalyst, base and base equivalents of each experiment.

    Parameters
    ----------
    **kwargs
        Passed on to :class:`BaumgartnerCrossCouplingEmulator`
        (``model_name``, ``dataset_file``).

    Examples
    --------
    >>> bemul = BaumgartnerCrossCouplingEmulator_Yield_Cost()

    Notes
    -----
    This benchmark is based on data from [Baumgartner]_ et al.

    References
    ----------

    .. [Baumgartner] L. M. Baumgartner et al., Org. Process Res. Dev., 2019, 23, 1594–1601
       DOI: `10.1021/acs.oprd.9b00236 <https://doi.org/10.1021/acs.oprd.9b00236>`_

    """

    def __init__(self, **kwargs):
        # Forward the options instead of silently dropping them.
        super().__init__(**kwargs)
        # init_domain is the domain the predictive model works with;
        # mod_domain additionally exposes the calculated cost objective.
        self.init_domain = self._domain
        self.mod_domain = self._domain + ContinuousVariable(
            name="cost",
            description="cost in USD of 40 uL reaction",
            bounds=[0.0, 1.0],
            is_objective=True,
            maximize=False,
        )
        self._domain = self.mod_domain

    def _run(self, conditions, **kwargs):
        """Predict the yield and add the calculated cost.

        Returns
        -------
        tuple
            ``(conditions, {})`` where ``conditions`` has gained a
            ``("cost", "DATA")`` entry.
        """
        # Change to original domain for running predictive model
        self._domain = self.init_domain
        try:
            conditions, _ = super()._run(conditions=conditions, **kwargs)

            # Calculate costs
            costs = self._calculate_costs(conditions)
            conditions[("cost", "DATA")] = costs
        finally:
            # Change back to modified domain, even if the prediction or the
            # cost calculation failed.
            self._domain = self.mod_domain
        return conditions, {}

    @classmethod
    def _calculate_costs(cls, conditions):
        """Calculate the reagent cost for the given conditions.

        Reads the "catalyst", "base" and "base_equivalents" entries of
        ``conditions`` (each must expose ``.values``).

        Returns
        -------
        The summed cost of triflate, aniline, catalyst and base: a scalar when
        there is exactly one experiment, otherwise an array with one cost per
        experiment.

        Raises
        ------
        KeyError
            If a catalyst or base has no known price.
        """
        catalyst = conditions["catalyst"].values
        base = conditions["base"].values
        base_equiv = conditions["base_equivalents"].values

        # Calculate amounts
        droplet_vol = 40 * 1e-3  # mL
        mmol_triflate = 0.91 * droplet_vol
        mmol_aniline = 1.6 * mmol_triflate
        # Catalyst amount relative to the triflate amount
        catalyst_equiv = {
            "tBuXPhos": 0.0095,
            "tBuBrettPhos": 0.0094,
            "AlPhos": 0.0094,
        }
        mmol_catalyst = [catalyst_equiv[c] * mmol_triflate for c in catalyst]
        mmol_base = base_equiv * mmol_triflate

        # Calculate costs
        cost_triflate = mmol_triflate * 5.91  # triflate is $5.91/mmol
        cost_aniline = mmol_aniline * 0.01  # aniline is $0.01/mmol
        cost_catalyst = np.array(
            [cls._get_catalyst_cost(c, m) for c, m in zip(catalyst, mmol_catalyst)]
        )
        cost_base = np.array(
            [cls._get_base_cost(b, m) for b, m in zip(base, mmol_base)]
        )
        tot_cost = cost_triflate + cost_aniline + cost_catalyst + cost_base
        # A single experiment yields a scalar rather than a one-element array.
        if len(tot_cost) == 1:
            tot_cost = tot_cost[0]
        return tot_cost

    @staticmethod
    def _get_catalyst_cost(catalyst, catalyst_mmol):
        """Return the cost of ``catalyst_mmol`` of the named catalyst as a float."""
        # presumably prices in $/mmol, as for the bases -- confirm
        catalyst_prices = {
            "tBuXPhos": 94.08,
            "tBuBrettPhos": 182.85,
            "AlPhos": 594.18,
        }
        return float(catalyst_prices[catalyst] * catalyst_mmol)

    @staticmethod
    def _get_base_cost(base, mmol_base):
        """Return the cost of ``mmol_base`` of the named base as a float."""
        # prices in $/mmol
        base_prices = {
            "DBU": 0.03,
            "BTMG": 1.2,
            "TMG": 0.001,
            "TEA": 0.01,
        }
        return float(base_prices[base] * mmol_base)