# Source code for summit.strategies.gryffin

from .base import Strategy
from summit.domain import Domain, DomainError
from summit.utils.dataset import DataSet
from summit import get_summit_config_path

import numpy as np
import pandas as pd

from gryffin import Gryffin
import json

import os
import copy
import pickle
import uuid
import pathlib


class GRYFFIN(Strategy):
    """Gryffin is a single objective Bayesian optimisation strategy.

    It is designed to work well with mixed domains (i.e., categorical and continuous variables).

    Parameters
    ----------
    domain: :class:`~summit.domain.Domain`
        The Summit domain describing the optimization problem.
    transform : :class:`~summit.strategies.base.Transform`, optional
        A transform object. By default no transformation will be done
        on the input variables or objectives.
    use_descriptors: bool, optional
        Whether descriptors of categorical variables are used.
        If not, auto_desc_gen must be True when categorical variables are used.
        Default is True.
    auto_desc_gen: bool, optional
        Whether Dynamic Gryffin is used if descriptors are provided.
        Gryffin applies automatic descriptor generation, hence transforms the given descriptors with a non-linear transformation to new descriptors (more "meaningful" or higher-correlated ones).
        Defaults to False (i.e., Static Gryffin with originally given descriptors is used).
    sampling_strategies: int, optional
        Number of sampling strategies (similar to sampling of GPs).
        One factor (next to batches) for the number of suggested new points in one optimization step.
        Total number of suggested points: sampling_strategies x batches.
        Defaults to 4.
    batches: int, optional
        Number of suggested points within one sampling strategy.
        One factor (next to sampling_strategies) for the number of suggested new points in one optimization step.
        Total number of suggested points: sampling_strategies x batches. Defaults to 1.
    logging: int, optional
        Corresponds to the verbosity level of logging of Gryffin. See the Notes for potential logging levels.
        Defaults to -1
    parallel: Boolean, optional
        Run optimisation in parallel. Default True.
    boosted: Boolean, optional
        Whether "pseudo-boosting" is applied. See the original paper in the references below for more details.
        Default is True.
    sampler: string, optional
        A priori distribution of categorical variables. By default: 'uniform'
    softness: float, optional
        Softness of Chimera. By default: 0.001
    continuous_optimizer: string, optional
        Optimizer type for continuous variables (available: "adam").
        By default: 'adam'
    categorical_optimizer: string, optional
        Optimizer type for categorical variables (available: "naive").
        By default: naive
    discrete_optimizer: string, optional
        Optimizer type for discrete variables (available: "naive").
        By default: naive

    Attributes
    ----------

    xbest : internal state
        Best point from all iterations.
    fbest : internal state
        Objective value at best point from all iterations.
    param : internal state
        A list containing all evaluated X and corresponding Y values.

    Examples
    --------

    >>> from summit.domain import *
    >>> from summit.strategies import GRYFFIN
    >>> import numpy as np
    >>> domain = Domain()
    >>> domain += ContinuousVariable(name="temperature", description="reaction temperature in celsius", bounds=[50, 100])
    >>> domain += CategoricalVariable(name="flowrate_a", description="flow of reactant a in mL/min", levels=[1,2,3,4,5])
    >>> base_df = DataSet([[1,2,3],[2,3,4],[8,8,8]], index = ["solv1","solv2","solv3"], columns=["MP","mol_weight","area"])
    >>> domain += CategoricalVariable(name="solvent", description="solvent type - categorical", descriptors=base_df)
    >>> domain += ContinuousVariable(name="yield", description="yield of reaction", bounds=[0,100], is_objective=True)
    >>> strategy = GRYFFIN(domain, auto_desc_gen=True)
    >>> next_experiments = strategy.suggest_experiments()

    Notes
    -----

    verbosity_levels:
    * -1= ''
    * 0= ['INFO', 'FATAL']
    * 1= ['INFO', 'ERROR', 'FATAL']
    * 2= ['INFO', 'WARNING', 'ERROR', 'FATAL']
    * 3= ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL']

    Gryffin was created by the Aspuru-Guzik group. See the paper by [Hase]_ or the
    `Github repository <https://github.com/aspuru-guzik-group/gryffin>`_.

    References
    ----------

    .. [Hase] Häse, F., Roch, L.M. and Aspuru-Guzik, A., 2020. Gryffin: An algorithm for Bayesian
           optimization for categorical variables informed by physical intuition with applications to chemistry.
           arXiv preprint `arXiv:2003.12127 <https://arxiv.org/pdf/2003.12127.pdf>`_.

    """

    def __init__(
        self,
        domain,
        transform=None,
        use_descriptors=True,
        auto_desc_gen=False,
        sampling_strategies=4,
        batches=1,
        logging=-1,
        parallel=True,
        boosted=True,
        sampler="uniform",
        softness=0.001,
        continuous_optimizer="adam",
        categorical_optimizer="naive",
        discrete_optimizer="naive",
        **kwargs,
    ):
        """Collect the Gryffin settings and, unless delayed, build the optimiser.

        The named arguments are described in the class docstring. Remaining
        ``kwargs`` are forwarded to :class:`Strategy`; a truthy
        ``delay_setup`` entry skips the call to ``_setup_gryffin``.
        """
        # Forced to False regardless of what the caller supplied.
        kwargs["transform_descriptors"] = False
        Strategy.__init__(self, domain, transform=transform, **kwargs)

        # Populated later by _create_gryffin_domain / suggest_experiments.
        self.domain_inputs = []
        self.domain_objectives = []
        self.prev_param = None

        # Also assigns self.uuid_val and creates the directory on disk.
        tmp_dir = self._get_tmp_dir()

        self.use_descriptors = use_descriptors

        # The same verbosity level is applied to every Gryffin logger listed.
        verbosity = dict.fromkeys(
            ("default", "bayesian_network", "random_sampler"), logging
        )
        general = dict(
            auto_desc_gen=auto_desc_gen,
            parallel=parallel,
            boosted=boosted,
            sampling_strategies=sampling_strategies,
            batches=batches,
            sampler=sampler,
            softness=softness,
            continuous_optimizer=continuous_optimizer,
            categorical_optimizer=categorical_optimizer,
            discrete_optimizer=discrete_optimizer,
            verbosity=verbosity,
        )
        config_dict = {"general": general}

        if kwargs.get("delay_setup", False):
            return
        self._setup_gryffin(config_dict, tmp_dir)

[docs]    def suggest_experiments(self, prev_res: DataSet = None, **kwargs):
        """Suggest experiments using Gryffin optimization strategy

        Parameters
        ----------
        prev_res: :class:`~summit.utils.data.DataSet`, optional
            Dataset with data from previous experiments of previous iteration.
            If no data is passed, then random sampling will
            be used to suggest an initial design.

        Returns
        -------
        next_experiments : :class:`~summit.utils.data.DataSet`
            A Dataset object with the suggested experiments

        """

        param = None
        xbest = np.zeros(self.domain.num_continuous_dimensions())
        obj = self.domain.output_variables[0]
        fbest = float("inf")

        # Suggest random initial design
        if prev_res is None:
            request = self.gryffin.recommend(observations=[])
        else:
            # Get inputs and outputs
            inputs, outputs = self.transform.transform_inputs_outputs(
                prev_res, transform_descriptors=False
            )

            # Set up maximization and minimization by converting maximization to minimization problem
            for v in self.domain.variables:
                if v.is_objective and v.maximize:
                    outputs[v.name] = -1 * outputs[v.name]

            inputs_dict = inputs.to_dict(orient="records")
            outputs_dict = outputs.to_dict(orient="records")
            prev_samples = [
                {
                    **{k1[0]: [v1] for k1, v1 in inputs_dict[i].items()},
                    **{k2[0]: v2 for k2, v2 in outputs_dict[i].items()},
                }
                for i in range(len(inputs_dict))
            ]

            observations = []
            if self.prev_param is not None:
                observations = self.prev_param
            observations.extend(prev_samples)
            param = observations

            request = self.gryffin.recommend(observations=observations)

            for obs in observations:
                if obs[obj.name] < fbest:
                    fbest = obs[obj.name]
                    xbest = np.asarray([v[0] for k, v in obs.items() if k != obj.name])

        # Generate DataSet object with variable values of next
        next_experiments = None
        if request is not None and len(request) != 0:
            next_experiments = {}
            for k in request[0].keys():
                next_experiments[k] = [r[k][0] for r in request]
            next_experiments = DataSet.from_df(pd.DataFrame(data=next_experiments))
            next_experiments[("strategy", "METADATA")] = "GRYFFIN"

        obj = self.domain.output_variables[0]
        objective_dir = -1.0 if obj.maximize else 1.0
        fbest = objective_dir * fbest
        self.fbest = fbest
        self.xbest = xbest
        self.prev_param = param

        # Do any necessary transformation back
        next_experiments = self.transform.un_transform(
            next_experiments, transform_descriptors=False
        )

        return next_experiments

    def _create_gryffin_domain(self, tmp_dir):
        for v in self.domain.variables:
            if not v.is_objective:
                if v.variable_type == "continuous":
                    self.domain_inputs.append(
                        {
                            "name": v.name,
                            "type": v.variable_type,
                            "low": float(v.bounds[0]),
                            "high": float(v.bounds[1]),
                            "size": 1,
                        }
                    )
                elif v.variable_type == "categorical":
                    if v.ds is not None and self.use_descriptors:
                        descriptors = [
                            v.ds.loc[[l], :].values[0].tolist() for l in v.ds.index
                        ]
                    else:
                        descriptors = None
                    self.domain_inputs.append(
                        {
                            "name": v.name,
                            "type": "categorical",
                            "size": 1,
                            "levels": v.levels,
                            "descriptors": descriptors,
                            "category_details": str(
                                tmp_dir / "CatDetails" / f"cat_details_{v.name}.pkl"
                            ),
                        }
                    )
                else:
                    raise TypeError(
                        "Unknown variable type: {}.".format(v.variable_type)
                    )
            else:
                self.domain_objectives.append(
                    {
                        "name": v.name,
                        "goal": "minimize",
                    }
                )

        if len(self.domain_objectives) > 1:
            raise ValueError(
                "Gryffin only works with single objective problems. Use a transform for multiobjective problems"
            )

        # TODO: how does GRYFFIN handle constraints?
        if self.domain.constraints != []:
            raise NotImplementedError("Gryffin can not handle constraints yet.")
            # keep SOBO constraint wrapping for later application when gryffin adds constraint handling
            # constraints = self.constr_wrapper(self.domain)
            # self.constraints = [{"name": "constr_" + str(i),
            #                     "constraint": c[0] if c[1] in ["<=", "<"] else "(" + c[0] + ")*(-1)"}
            #                    for i,c in enumerate(constraints) if not (c[1] == "==")]
        else:
            self.constraints = None

    def _setup_gryffin(self, config_dict: dict, tmp_dir: pathlib.Path):
        """Write the Gryffin config and category files, then create the optimiser.

        Parameters
        ----------
        config_dict : dict
            Configuration containing at least a ``"general"`` section. It is
            left unmodified; a deep copy is stored on ``self.config_dict``.
        tmp_dir : pathlib.Path
            Existing directory that receives ``config.json`` and the category
            detail pickles, and under which the scratch and database paths
            are placed.
        """
        # Keep a pristine copy (without file paths or domain entries) for serialisation.
        self.config_dict = copy.deepcopy(config_dict)
        self._create_gryffin_domain(tmp_dir)

        # Build the on-disk configuration on a separate copy so that the
        # caller's dictionary (e.g. the one handed to from_dict) is not altered.
        file_config = copy.deepcopy(config_dict)
        file_config["parameters"] = self.domain_inputs
        file_config["objectives"] = self.domain_objectives
        file_config["general"]["scratch_dir"] = str(tmp_dir / "scratch")
        file_config["database"] = {
            "format": "pickle",
            "path": str(tmp_dir / "SearchProgress"),
        }

        # Save config file
        config_file_path = tmp_dir / "config.json"
        with open(config_file_path, "w") as configfile:
            json.dump(file_config, configfile, indent=2)

        # Write the category detail files referenced by the categorical parameters
        category_writer = CategoryWriter(inputs=self.domain_inputs)
        category_writer.write_categories(save_dir=tmp_dir)

        # Initialize gryffin from the config file just written
        self.gryffin = Gryffin(config_file_path)

    def _get_tmp_dir(self):
        """Return a per-call scratch directory, creating it when missing.

        A fresh UUID4 is stored on ``self.uuid_val`` and used as the name of
        the directory below ``<summit config path>/gryffin``.
        """
        base_dir = get_summit_config_path()
        self.uuid_val = uuid.uuid4()  # Unique identifier for this run
        run_dir = base_dir / "gryffin" / str(self.uuid_val)
        os.makedirs(run_dir, exist_ok=True)
        return run_dir

[docs]    def reset(self):
        """Reset the internal parameters"""
        self.prev_param = None

[docs]    def to_dict(self):
        if self.prev_param is not None:
            param = self.prev_param
        else:
            param = None
        strategy_params = dict(
            config_dict=self.config_dict,
            use_descriptors=self.use_descriptors,
            prev_param=param,
        )
        return super().to_dict(**strategy_params)

[docs]    @classmethod
    def from_dict(cls, d):
        # Gather parameters
        strategy_params = d["strategy_params"]
        d["strategy_params"]["delay_setup"] = True
        param = strategy_params["prev_param"]

        # Setup gryffin
        gryffin = super().from_dict(d)
        tmp_dir = gryffin._get_tmp_dir()
        if strategy_params.get("config_dict") is not None:
            gryffin._setup_gryffin(strategy_params["config_dict"], tmp_dir)
        gryffin.prev_param = param
        return gryffin

    # TODO: update constraint wrapper when Gryffin can handle constraints
    """ 
    def constr_wrapper(self, summit_domain):
        v_input_names = [v.name for v in summit_domain.variables if not v.is_objective]
        gpyopt_constraints = []
        for c in summit_domain.constraints:
            tmp_c = c.lhs
            for v_input_index, v_input_name in enumerate(v_input_names):
                v_gpyopt_name = "x[:,"+str(v_input_index)+"]"
                tmp_c = tmp_c.replace(v_input_name, v_gpyopt_name)
            gpyopt_constraints.append([tmp_c, c.constraint_type])
        return gpyopt_constraints
    """


class CategoryWriter(object):
    """Category Writer for Gryffin (adapted from https://github.com/aspuru-guzik-group/gryffin)

    Parameters
    ----------
    inputs: array-like
        List containing the input variables. Each entry is a dictionary describing the features of the input variable.
        Only entries whose ``"type"`` is ``"categorical"`` are kept; those must
        provide the keys ``"name"``, ``"levels"`` and ``"descriptors"``.

    Notes
    ----------
    This implementation uses the software package Gryffin provided by
    the Aspuru-Guzik Group and published by Haese et al. (2020), arXiv:2003.12127.

    Copyright (C) 2020, Harvard University.
    All rights reserved.

    """

    def __init__(self, inputs):
        # Each entry: [name, levels, descriptors-or-None]
        self.cat_inputs = [
            [ent["name"], ent["levels"], ent["descriptors"]]
            for ent in inputs
            if ent["type"] == "categorical"
        ]

    def write_categories(self, save_dir):
        """Write one pickle file per categorical input.

        For every categorical input a list of option dictionaries is pickled
        to ``<save_dir>/CatDetails/cat_details_<name>.pkl``. Each option holds
        the level under ``"name"`` and, when descriptors were given, the
        level's descriptors as a NumPy array under ``"descriptors"``.

        :param save_dir: string or path, directory where category details will be saved.
        """
        if not self.cat_inputs:
            # Nothing to write, so do not create the directory either.
            return

        # Created once up front; missing parents are created as well and an
        # already existing directory is not an error.
        details_dir = pathlib.Path(save_dir) / "CatDetails"
        details_dir.mkdir(parents=True, exist_ok=True)

        for param_name, levels, descriptors in self.cat_inputs:
            if descriptors is None:
                options = [{"name": level} for level in levels]
            else:
                # Indexing (rather than zip) keeps the IndexError when fewer
                # descriptor rows than levels were supplied.
                options = [
                    {"name": level, "descriptors": np.array(descriptors[idx])}
                    for idx, level in enumerate(levels)
                ]

            details_file = details_dir / f"cat_details_{param_name}.pkl"
            # Context manager guarantees the file is flushed and closed.
            with open(details_file, "wb") as handle:
                pickle.dump(options, handle)