from summit.strategies.base import Strategy
from summit.domain import *
from summit.utils.dataset import DataSet
import string
import numpy as np
import pandas as pd
class ENTMOOT(Strategy):
    """Single-objective Bayesian optimization using gradient-boosted trees.

    Uses ENTMOOT (ENsemble Tree MOdel Optimization Tool) in place of the
    Gaussian-process surrogates used by most Bayesian optimization
    strategies. This is currently an experimental feature and requires
    Gurobipy to be installed.

    Parameters
    ----------
    domain: :class:`~summit.domain.Domain`
        The Summit domain describing the optimization problem.
    transform : :class:`~summit.strategies.base.Transform`, optional
        A transform object. By default no transformation will be done
        on the input variables or objectives.
    estimator_type: string, optional
        The ENTMOOT base_estimator type.
        By default, Gradient-Boosted Regression
    std_estimator_type: string, optional
        The ENTMOOT std_estimator
        By default, bounded data distance
    acquisition_type: string, optional
        The acquisition function type from ENTMOOT. See notes for options.
        By default, Lower Confidence Bound.
    optimizer_type: string, optional
        The optimizer used in ENTMOOT for maximization of the acquisition
        function. By default, sampling will be used.
    generator_type: string, optional
        The method for generating initial points before a model can be
        trained. By default, uniform random points will be used.
    initial_points: int, optional
        How many points to require before training models
    min_child_samples: int, optional
        Minimum size of a leaf in tree models

    Examples
    --------
    >>> from summit.domain import *
    >>> from summit.strategies.entmoot import ENTMOOT
    >>> import numpy as np
    >>> domain = Domain()
    >>> domain += ContinuousVariable(name='temperature', description='reaction temperature in celsius', bounds=[50, 100])
    >>> domain += CategoricalVariable(name='flowrate_a', description='flow of reactant a in mL/min', levels=[1,2,3,4,5])
    >>> domain += ContinuousVariable(name='flowrate_b', description='flow of reactant b in mL/min', bounds=[0.1, 0.5])
    >>> domain += ContinuousVariable(name="yld", description='yield of reaction', bounds=[0,100], is_objective=True)
    >>> # strategy = ENTMOOT(domain)
    >>> # next_experiments = strategy.suggest_experiments(5)

    Notes
    -----
    Estimator type can either by GBRT (Gradient-boosted regression trees) or
    RF (random forest from scikit-learn).
    Acquisition function type can only be LCB (lower confidence bound).
    Based on the paper from [Thebelt]_ et al.

    .. [Thebelt] A. Thebelt et al.
       "ENTMOOT: A Framework for Optimization over Ensemble Tree Models",
       `ArXiv <https://arxiv.org/abs/2003.04774>`_
    """

    def __init__(
        self,
        domain,
        transform=None,
        estimator_type=None,
        std_estimator_type=None,
        acquisition_type=None,
        optimizer_type=None,
        generator_type=None,
        initial_points=50,
        min_child_samples=5,
        **kwargs
    ):
        Strategy.__init__(self, domain, transform=transform, **kwargs)
        self.use_descriptors = kwargs.get("use_descriptors", False)

        # Build the ENTMOOT description of the input space from the Summit
        # domain. Only continuous inputs are supported at the moment.
        self.input_domain = []
        for v in self.domain.variables:
            if v.is_objective:
                continue
            if isinstance(v, ContinuousVariable):
                self.input_domain.append(
                    {
                        "name": v.name,
                        "type": v.variable_type,
                        "domain": (v.bounds[0], v.bounds[1]),
                    }
                )
            elif isinstance(v, CategoricalVariable):
                # Categorical support (with or without descriptors) is
                # deliberately disabled. NOTE(review): the original carried
                # unreachable categorical/descriptor handling after this
                # raise; it has been removed.
                raise ValueError(
                    "Categorical Variables are not yet implemented "
                    "for ENTMOOT strategy."
                )
            else:
                raise TypeError("Unknown variable type.")

        # TODO: how to handle equality constraints? Could we remove '==' from
        # constraint types as each equality constraint reduces the degrees of
        # freedom?
        if len(self.domain.constraints) != 0:
            self.constraints = self.constr_wrapper(self.domain)
        else:
            self.constraints = None

        self.input_dim = len(self.domain.input_variables)

        # Surrogate model: GBRT (gradient-boosted regression trees, default)
        # or RF (random forest from scikit-learn).
        if estimator_type in ["GBRT", "RF"]:
            self.estimator_type = estimator_type
        else:
            self.estimator_type = "GBRT"

        # Uncertainty estimator; default BDD (bounded data distance).
        if std_estimator_type in ["BDD", "L1BDD", "DDP", "L1DDP"]:
            self.std_estimator_type = std_estimator_type
        else:
            self.std_estimator_type = "BDD"

        # Acquisition function: only LCB (lower confidence bound) available.
        if acquisition_type in ["LCB"]:
            self.acquisition_type = acquisition_type
        else:
            self.acquisition_type = "LCB"

        # Method for optimization of the acquisition function:
        #   sampling: evaluate `acquisition_type` at randomly sampled points
        #   global:   global solver to find its minimum (requires gurobipy)
        if optimizer_type in ["sampling", "global"]:
            self.optimizer_type = optimizer_type
        else:
            self.optimizer_type = "sampling"

        # Constraints are only enforced by the global solver.
        # (Original used bitwise '&' and an error message whose implicitly
        # concatenated parts were missing separating spaces.)
        if self.optimizer_type == "sampling" and self.constraints is not None:
            raise ValueError(
                "Constraints can only be applied when ENTMOOT is using "
                "the global solver. Set optimizer_type = global or remove "
                "constraints."
            )

        # Record whether gurobipy is installed; without it the global solver
        # (and therefore constraints) cannot be used.
        import pkg_resources

        required = {"gurobipy"}
        installed = {pkg.key for pkg in pkg_resources.working_set}
        self.gurobi_missing = required - installed

        # Generator for initial points before a model can be trained:
        # "random" (uniform, default), "sobol", "halton", "hammersly",
        # "lhs" (latin hypercube) or "grid".
        if generator_type in [
            "random",
            "sobol",
            "halton",
            "hammersly",
            "lhs",
            "grid",
        ]:
            self.generator_type = generator_type
        else:
            self.generator_type = "random"

        self.initial_points = initial_points
        self.min_child_samples = min_child_samples
        # History [X, y] of observations passed to the optimizer so far.
        self.prev_param = None

    def suggest_experiments(
        self, num_experiments=1, prev_res: DataSet = None, **kwargs
    ):
        """Suggest experiments using ENTMOOT tree-based Bayesian Optimization

        Parameters
        ----------
        num_experiments: int, optional
            The number of experiments (i.e., samples) to generate. Default is 1.
        prev_res: :class:`~summit.utils.data.DataSet`, optional
            Dataset with data from previous experiments of previous iteration.
            If no data is passed, then random sampling will
            be used to suggest an initial design.

        Returns
        -------
        next_experiments : :class:`~summit.utils.data.DataSet`
            A Dataset object with the suggested experiments
        """
        from entmoot.optimizer.optimizer import Optimizer
        from entmoot.space.space import Space

        param = None
        xbest = np.zeros(self.domain.num_continuous_dimensions())
        obj = self.domain.output_variables[0]
        # ENTMOOT minimizes; remember the sign needed to report fbest in the
        # caller's (possibly maximizing) convention.
        objective_dir = -1.0 if obj.maximize else 1.0
        fbest = float("inf")

        bounds = [k["domain"] for k in self.input_domain]
        space = Space(bounds)

        if not self.gurobi_missing:
            from gurobipy import LinExpr
            from entmoot.optimizer.gurobi_utils import get_core_gurobi_model

            core_model = get_core_gurobi_model(space)
            gvars = core_model.getVars()
            # BUGFIX: self.constraints is None when the domain has no
            # constraints; the original iterated it unconditionally and
            # raised TypeError whenever gurobipy was installed.
            if self.constraints is not None:
                for coeffs, constant, ctype in self.constraints:
                    expr = LinExpr()
                    expr.addTerms(coeffs, gvars)
                    expr.addConstant(constant)
                    core_model.addLConstr(expr, ctype, 0)
            core_model.update()
            acq_optimizer_kwargs = {"add_model_core": core_model}
        else:
            acq_optimizer_kwargs = None

        entmoot_model = Optimizer(
            dimensions=bounds,
            base_estimator=self.estimator_type,
            std_estimator=self.std_estimator_type,
            n_initial_points=self.initial_points,
            initial_point_generator=self.generator_type,
            acq_func=self.acquisition_type,
            acq_optimizer=self.optimizer_type,
            random_state=None,
            acq_func_kwargs=None,
            acq_optimizer_kwargs=acq_optimizer_kwargs,
            base_estimator_kwargs={"min_child_samples": self.min_child_samples},
            std_estimator_kwargs=None,
            model_queue_size=None,
            verbose=False,
        )

        # If we have previous results, feed them to the model.
        if prev_res is not None:
            inputs, outputs = self.transform.transform_inputs_outputs(
                prev_res, transform_descriptors=self.use_descriptors
            )

            # Convert maximization objectives into minimization problems.
            for v in self.domain.variables:
                if v.is_objective and v.maximize:
                    outputs[v.name] = -1 * outputs[v.name]
                if isinstance(v, CategoricalVariable):
                    if not self.use_descriptors:
                        inputs[v.name] = self.categorical_wrapper(
                            inputs[v.name], v.levels
                        )

            inputs = inputs.to_numpy()
            outputs = outputs.to_numpy()

            # Append the new observations to the stored history.
            if self.prev_param is not None:
                X_step = np.vstack((self.prev_param[0], inputs))
                Y_step = np.vstack((self.prev_param[1], outputs))
            else:
                X_step = inputs
                Y_step = outputs

            # The optimizer expects plain Python lists.
            prev_X = [list(x) for x in X_step]
            prev_y = [y for row in Y_step for y in row]

            # Train the ENTMOOT model.
            entmoot_model.tell(prev_X, prev_y, fit=True)

            # Store history of suggested points and function evaluations.
            param = [X_step, Y_step]
            fbest = np.min(Y_step)
            xbest = X_step[np.argmin(Y_step)]

        request = np.array(
            entmoot_model.ask(n_points=num_experiments, strategy="cl_mean")
        )

        # Build a DataSet with the variable values of the next experiments.
        next_experiments = None
        if request is not None and len(request) != 0:
            next_experiments = {}
            i_inp = 0  # column index into `request`
            for v in self.domain.variables:
                if v.is_objective:
                    continue
                if isinstance(v, CategoricalVariable):
                    if v.ds is None or not self.use_descriptors:
                        next_experiments[v.name] = np.asarray(
                            [
                                self.categorical_unwrap(entry, v.levels)
                                for entry in request[:, i_inp]
                            ]
                        )
                        i_inp += 1
                    else:
                        # One request column per descriptor.
                        for d in v.ds.data_columns:
                            next_experiments[d] = request[:, i_inp]
                            i_inp += 1
                else:
                    next_experiments[v.name] = request[:, i_inp]
                    i_inp += 1
            next_experiments = DataSet.from_df(pd.DataFrame(data=next_experiments))
            next_experiments[("strategy", "METADATA")] = "ENTMOOT"

        self.fbest = objective_dir * fbest
        self.xbest = xbest
        self.prev_param = param

        # Undo any transformation before returning the suggestions.
        next_experiments = self.transform.un_transform(
            next_experiments, transform_descriptors=self.use_descriptors
        )
        return next_experiments

    def reset(self):
        """Reset the internal parameters"""
        self.prev_param = None

    def constr_wrapper(self, summit_domain):
        """Convert Summit linear constraints into ENTMOOT/Gurobi form.

        Each constraint LHS such as ``2x + y - 3`` is parsed into
        ``[[coefficients in input-variable order], constant, constraint_type]``.

        NOTE(review): only sums of ``[coeff]var`` terms and numeric constants
        are handled; a term written with an explicit ``*`` (e.g. ``2*x``)
        would fail to parse — confirm against the constraint format used
        elsewhere in Summit.
        """
        v_input_names = [v.name for v in summit_domain.variables if not v.is_objective]
        constraints = []
        for c in summit_domain.constraints:
            # Split the LHS into signed terms: first on '+', then on '-',
            # re-attaching the '-' sign to each following term.
            signed_terms = []
            for fragment in str.split(c.lhs, "+"):
                terms = str.split(fragment, "-")
                for i in range(len(terms)):
                    if i == 0:
                        # A non-empty first part means the leading term of
                        # this fragment was positive.
                        if terms[0] != "":
                            signed_terms.append(terms[0])
                    else:
                        signed_terms.append("-" + terms[i])

            # Split each term into coefficient and variable name; a term
            # containing no letters is the constant offset.
            constraint_dict = dict()
            for term in signed_terms:
                for i, char in enumerate(term):
                    if char in string.ascii_letters:
                        c_variable = term[i:]
                        prefix = term[:i]
                        if prefix == "":
                            c_coeff = 1.0
                        elif prefix == "-":
                            c_coeff = -1.0
                        else:
                            c_coeff = float(prefix)
                        break
                else:
                    c_variable = "constant"
                    # BUGFIX: coerce the constant to float; the original kept
                    # it as a string, which LinExpr.addConstant cannot use.
                    c_coeff = float(term)
                constraint_dict[c_variable] = c_coeff

            # Coefficients in the variable order the model expects; variables
            # absent from the constraint get a zero coefficient.
            coefficients = [
                constraint_dict.get(name, 0) for name in v_input_names
            ]
            constraints.append(
                [
                    coefficients,
                    # BUGFIX: default to 0.0 when the constraint has no
                    # constant term (the original raised KeyError).
                    constraint_dict.get("constant", 0.0),
                    c.constraint_type,
                ]
            )
        return constraints

    def to_dict(self):
        """Serialize the strategy configuration and observation history."""
        if self.prev_param is not None:
            # prev_param holds numpy arrays; convert for JSON-compatibility.
            param = [self.prev_param[0].tolist(), self.prev_param[1].tolist()]
        else:
            param = None
        strategy_params = dict(
            prev_param=param,
            use_descriptors=self.use_descriptors,
            estimator_type=self.estimator_type,
            std_estimator_type=self.std_estimator_type,
            acquisition_type=self.acquisition_type,
            optimizer_type=self.optimizer_type,
            generator_type=self.generator_type,
            initial_points=self.initial_points,
            min_child_samples=self.min_child_samples,
        )
        return super().to_dict(**strategy_params)

    @classmethod
    def from_dict(cls, d):
        """Recreate a strategy (including observation history) from `to_dict` output."""
        entmoot = super().from_dict(d)
        param = d["strategy_params"]["prev_param"]
        if param is not None:
            param = [np.array(param[0]), np.array(param[1])]
        entmoot.prev_param = param
        return entmoot

    def categorical_wrapper(self, categories, reference_categories=None):
        """Map categorical levels to integer codes.

        NOTE(review): restored from commented-out code. suggest_experiments
        calls this (and categorical_unwrap) for categorical variables, but
        the definitions previously lived only inside an unassigned string
        literal, which would have produced an AttributeError had that path
        ever been reached.
        """
        if not reference_categories:
            return [i for i, _ in enumerate(categories)]
        return [reference_categories.index(c) for c in categories]

    def categorical_unwrap(self, gpyopt_level, categories):
        """Inverse of categorical_wrapper: map an integer code back to its level."""
        return categories[int(gpyopt_level)]