from summit.strategies.base import Strategy
from summit.domain import *
from summit.utils.dataset import DataSet
import string
import numpy as np
import pandas as pd
class ENTMOOT(Strategy):
    """Single-objective Bayesian optimization using gradient-boosted trees.

    Uses ENTMOOT (ENsemble Tree MOdel Optimization Tool) in place of the
    Gaussian-process surrogates used by most Bayesian optimization
    strategies. This is currently an experimental feature and requires
    Gurobipy to be installed.

    Parameters
    ----------
    domain: :class:`~summit.domain.Domain`
        The Summit domain describing the optimization problem.
    transform : :class:`~summit.strategies.base.Transform`, optional
        A transform object. By default no transformation will be done
        on the input variables or objectives.
    estimator_type: string, optional
        The ENTMOOT base_estimator type.
        By default, Gradient-Boosted Regression
    std_estimator_type: string, optional
        The ENTMOOT std_estimator
        By default, bounded data distance
    acquisition_type: string, optional
        The acquisition function type from ENTMOOT. See notes for options.
        By default, Lower Confidence Bound.
    optimizer_type: string, optional
        The optimizer used in ENTMOOT for maximization of the acquisition
        function. By default, sampling will be used.
    generator_type: string, optional
        The method for generating initial points before a model can be
        trained. By default, uniform random points will be used.
    initial_points: int, optional
        How many points to require before training models
    min_child_samples: int, optional
        Minimum size of a leaf in tree models

    Examples
    --------
    >>> from summit.domain import *
    >>> from summit.strategies.entmoot import ENTMOOT
    >>> import numpy as np
    >>> domain = Domain()
    >>> domain += ContinuousVariable(name='temperature', description='reaction temperature in celsius', bounds=[50, 100])
    >>> domain += CategoricalVariable(name='flowrate_a', description='flow of reactant a in mL/min', levels=[1,2,3,4,5])
    >>> domain += ContinuousVariable(name='flowrate_b', description='flow of reactant b in mL/min', bounds=[0.1, 0.5])
    >>> domain += ContinuousVariable(name="yld", description='yield of reaction', bounds=[0,100], is_objective=True)
    >>> # strategy = ENTMOOT(domain)
    >>> # next_experiments = strategy.suggest_experiments(5)

    Notes
    -----
    Estimator type can either by GBRT (Gradient-boosted regression trees) or
    RF (random forest from scikit-learn).
    Acquisition function type can only be LCB (lower confidence bound).
    Based on the paper from [Thebelt]_ et al.

    .. [Thebelt] A. Thebelt et al.
       "ENTMOOT: A Framework for Optimization over Ensemble Tree Models",
       `ArXiv <https://arxiv.org/abs/2003.04774>`_
    """

    def __init__(
        self,
        domain,
        transform=None,
        estimator_type=None,
        std_estimator_type=None,
        acquisition_type=None,
        optimizer_type=None,
        generator_type=None,
        initial_points=50,
        min_child_samples=5,
        **kwargs
    ):
        Strategy.__init__(self, domain, transform=transform, **kwargs)
        self.use_descriptors = kwargs.get("use_descriptors", False)

        # Build the ENTMOOT description of the input space from the Summit
        # domain. Only continuous inputs are supported at the moment.
        self.input_domain = []
        for v in self.domain.variables:
            if v.is_objective:
                continue
            if isinstance(v, ContinuousVariable):
                self.input_domain.append(
                    {
                        "name": v.name,
                        "type": v.variable_type,
                        "domain": (v.bounds[0], v.bounds[1]),
                    }
                )
            elif isinstance(v, CategoricalVariable):
                # Categorical support (with or without descriptors) is
                # deliberately disabled. NOTE(review): the original carried
                # unreachable categorical/descriptor handling after this
                # raise; it has been removed.
                raise ValueError(
                    "Categorical Variables are not yet implemented "
                    "for ENTMOOT strategy."
                )
            else:
                raise TypeError("Unknown variable type.")

        # TODO: how to handle equality constraints? Could we remove '==' from
        # constraint types as each equality constraint reduces the degrees of
        # freedom?
        if len(self.domain.constraints) != 0:
            self.constraints = self.constr_wrapper(self.domain)
        else:
            self.constraints = None

        self.input_dim = len(self.domain.input_variables)

        # Surrogate model: GBRT (gradient-boosted regression trees, default)
        # or RF (random forest from scikit-learn).
        if estimator_type in ["GBRT", "RF"]:
            self.estimator_type = estimator_type
        else:
            self.estimator_type = "GBRT"

        # Uncertainty estimator; default BDD (bounded data distance).
        if std_estimator_type in ["BDD", "L1BDD", "DDP", "L1DDP"]:
            self.std_estimator_type = std_estimator_type
        else:
            self.std_estimator_type = "BDD"

        # Acquisition function: only LCB (lower confidence bound) available.
        if acquisition_type in ["LCB"]:
            self.acquisition_type = acquisition_type
        else:
            self.acquisition_type = "LCB"

        # Method for optimization of the acquisition function:
        #   sampling: evaluate `acquisition_type` at randomly sampled points
        #   global:   global solver to find its minimum (requires gurobipy)
        if optimizer_type in ["sampling", "global"]:
            self.optimizer_type = optimizer_type
        else:
            self.optimizer_type = "sampling"

        # Constraints are only enforced by the global solver.
        # (Original used bitwise '&' and an error message whose implicitly
        # concatenated parts were missing separating spaces.)
        if self.optimizer_type == "sampling" and self.constraints is not None:
            raise ValueError(
                "Constraints can only be applied when ENTMOOT is using "
                "the global solver. Set optimizer_type = global or remove "
                "constraints."
            )

        # Record whether gurobipy is installed; without it the global solver
        # (and therefore constraints) cannot be used.
        import pkg_resources

        required = {"gurobipy"}
        installed = {pkg.key for pkg in pkg_resources.working_set}
        self.gurobi_missing = required - installed

        # Generator for initial points before a model can be trained:
        # "random" (uniform, default), "sobol", "halton", "hammersly",
        # "lhs" (latin hypercube) or "grid".
        if generator_type in [
            "random",
            "sobol",
            "halton",
            "hammersly",
            "lhs",
            "grid",
        ]:
            self.generator_type = generator_type
        else:
            self.generator_type = "random"

        self.initial_points = initial_points
        self.min_child_samples = min_child_samples
        # History [X, y] of observations passed to the optimizer so far.
        self.prev_param = None

    def suggest_experiments(
        self, num_experiments=1, prev_res: DataSet = None, **kwargs
    ):
        """Suggest experiments using ENTMOOT tree-based Bayesian Optimization

        Parameters
        ----------
        num_experiments: int, optional
            The number of experiments (i.e., samples) to generate. Default is 1.
        prev_res: :class:`~summit.utils.data.DataSet`, optional
            Dataset with data from previous experiments of previous iteration.
            If no data is passed, then random sampling will
            be used to suggest an initial design.

        Returns
        -------
        next_experiments : :class:`~summit.utils.data.DataSet`
            A Dataset object with the suggested experiments
        """
        from entmoot.optimizer.optimizer import Optimizer
        from entmoot.space.space import Space

        param = None
        xbest = np.zeros(self.domain.num_continuous_dimensions())
        obj = self.domain.output_variables[0]
        # ENTMOOT minimizes; remember the sign needed to report fbest in the
        # caller's (possibly maximizing) convention.
        objective_dir = -1.0 if obj.maximize else 1.0
        fbest = float("inf")

        bounds = [k["domain"] for k in self.input_domain]
        space = Space(bounds)

        if not self.gurobi_missing:
            from gurobipy import LinExpr
            from entmoot.optimizer.gurobi_utils import get_core_gurobi_model

            core_model = get_core_gurobi_model(space)
            gvars = core_model.getVars()
            # BUGFIX: self.constraints is None when the domain has no
            # constraints; the original iterated it unconditionally and
            # raised TypeError whenever gurobipy was installed.
            if self.constraints is not None:
                for coeffs, constant, ctype in self.constraints:
                    expr = LinExpr()
                    expr.addTerms(coeffs, gvars)
                    expr.addConstant(constant)
                    core_model.addLConstr(expr, ctype, 0)
            core_model.update()
            acq_optimizer_kwargs = {"add_model_core": core_model}
        else:
            acq_optimizer_kwargs = None

        entmoot_model = Optimizer(
            dimensions=bounds,
            base_estimator=self.estimator_type,
            std_estimator=self.std_estimator_type,
            n_initial_points=self.initial_points,
            initial_point_generator=self.generator_type,
            acq_func=self.acquisition_type,
            acq_optimizer=self.optimizer_type,
            random_state=None,
            acq_func_kwargs=None,
            acq_optimizer_kwargs=acq_optimizer_kwargs,
            base_estimator_kwargs={"min_child_samples": self.min_child_samples},
            std_estimator_kwargs=None,
            model_queue_size=None,
            verbose=False,
        )

        # If we have previous results, feed them to the model.
        if prev_res is not None:
            inputs, outputs = self.transform.transform_inputs_outputs(
                prev_res, transform_descriptors=self.use_descriptors
            )

            # Convert maximization objectives into minimization problems.
            for v in self.domain.variables:
                if v.is_objective and v.maximize:
                    outputs[v.name] = -1 * outputs[v.name]
                if isinstance(v, CategoricalVariable):
                    if not self.use_descriptors:
                        inputs[v.name] = self.categorical_wrapper(
                            inputs[v.name], v.levels
                        )

            inputs = inputs.to_numpy()
            outputs = outputs.to_numpy()

            # Append the new observations to the stored history.
            if self.prev_param is not None:
                X_step = np.vstack((self.prev_param[0], inputs))
                Y_step = np.vstack((self.prev_param[1], outputs))
            else:
                X_step = inputs
                Y_step = outputs

            # The optimizer expects plain Python lists.
            prev_X = [list(x) for x in X_step]
            prev_y = [y for row in Y_step for y in row]

            # Train the ENTMOOT model.
            entmoot_model.tell(prev_X, prev_y, fit=True)

            # Store history of suggested points and function evaluations.
            param = [X_step, Y_step]
            fbest = np.min(Y_step)
            xbest = X_step[np.argmin(Y_step)]

        request = np.array(
            entmoot_model.ask(n_points=num_experiments, strategy="cl_mean")
        )

        # Build a DataSet with the variable values of the next experiments.
        next_experiments = None
        if request is not None and len(request) != 0:
            next_experiments = {}
            i_inp = 0  # column index into `request`
            for v in self.domain.variables:
                if v.is_objective:
                    continue
                if isinstance(v, CategoricalVariable):
                    if v.ds is None or not self.use_descriptors:
                        next_experiments[v.name] = np.asarray(
                            [
                                self.categorical_unwrap(entry, v.levels)
                                for entry in request[:, i_inp]
                            ]
                        )
                        i_inp += 1
                    else:
                        # One request column per descriptor.
                        for d in v.ds.data_columns:
                            next_experiments[d] = request[:, i_inp]
                            i_inp += 1
                else:
                    next_experiments[v.name] = request[:, i_inp]
                    i_inp += 1
            next_experiments = DataSet.from_df(pd.DataFrame(data=next_experiments))
            next_experiments[("strategy", "METADATA")] = "ENTMOOT"

        self.fbest = objective_dir * fbest
        self.xbest = xbest
        self.prev_param = param

        # Undo any transformation before returning the suggestions.
        next_experiments = self.transform.un_transform(
            next_experiments, transform_descriptors=self.use_descriptors
        )
        return next_experiments

    def reset(self):
        """Reset the internal parameters"""
        self.prev_param = None

    def constr_wrapper(self, summit_domain):
        """Convert Summit linear constraints into ENTMOOT/Gurobi form.

        Each constraint LHS such as ``2x + y - 3`` is parsed into
        ``[[coefficients in input-variable order], constant, constraint_type]``.

        NOTE(review): only sums of ``[coeff]var`` terms and numeric constants
        are handled; a term written with an explicit ``*`` (e.g. ``2*x``)
        would fail to parse — confirm against the constraint format used
        elsewhere in Summit.
        """
        v_input_names = [v.name for v in summit_domain.variables if not v.is_objective]
        constraints = []
        for c in summit_domain.constraints:
            # Split the LHS into signed terms: first on '+', then on '-',
            # re-attaching the '-' sign to each following term.
            signed_terms = []
            for fragment in str.split(c.lhs, "+"):
                terms = str.split(fragment, "-")
                for i in range(len(terms)):
                    if i == 0:
                        # A non-empty first part means the leading term of
                        # this fragment was positive.
                        if terms[0] != "":
                            signed_terms.append(terms[0])
                    else:
                        signed_terms.append("-" + terms[i])

            # Split each term into coefficient and variable name; a term
            # containing no letters is the constant offset.
            constraint_dict = dict()
            for term in signed_terms:
                for i, char in enumerate(term):
                    if char in string.ascii_letters:
                        c_variable = term[i:]
                        prefix = term[:i]
                        if prefix == "":
                            c_coeff = 1.0
                        elif prefix == "-":
                            c_coeff = -1.0
                        else:
                            c_coeff = float(prefix)
                        break
                else:
                    c_variable = "constant"
                    # BUGFIX: coerce the constant to float; the original kept
                    # it as a string, which LinExpr.addConstant cannot use.
                    c_coeff = float(term)
                constraint_dict[c_variable] = c_coeff

            # Coefficients in the variable order the model expects; variables
            # absent from the constraint get a zero coefficient.
            coefficients = [
                constraint_dict.get(name, 0) for name in v_input_names
            ]
            constraints.append(
                [
                    coefficients,
                    # BUGFIX: default to 0.0 when the constraint has no
                    # constant term (the original raised KeyError).
                    constraint_dict.get("constant", 0.0),
                    c.constraint_type,
                ]
            )
        return constraints

    def to_dict(self):
        """Serialize the strategy configuration and observation history."""
        if self.prev_param is not None:
            # prev_param holds numpy arrays; convert for JSON-compatibility.
            param = [self.prev_param[0].tolist(), self.prev_param[1].tolist()]
        else:
            param = None
        strategy_params = dict(
            prev_param=param,
            use_descriptors=self.use_descriptors,
            estimator_type=self.estimator_type,
            std_estimator_type=self.std_estimator_type,
            acquisition_type=self.acquisition_type,
            optimizer_type=self.optimizer_type,
            generator_type=self.generator_type,
            initial_points=self.initial_points,
            min_child_samples=self.min_child_samples,
        )
        return super().to_dict(**strategy_params)

    @classmethod
    def from_dict(cls, d):
        """Recreate a strategy (including observation history) from `to_dict` output."""
        entmoot = super().from_dict(d)
        param = d["strategy_params"]["prev_param"]
        if param is not None:
            param = [np.array(param[0]), np.array(param[1])]
        entmoot.prev_param = param
        return entmoot

    def categorical_wrapper(self, categories, reference_categories=None):
        """Map categorical levels to integer codes.

        NOTE(review): restored from commented-out code. suggest_experiments
        calls this (and categorical_unwrap) for categorical variables, but
        the definitions previously lived only inside an unassigned string
        literal, which would have produced an AttributeError had that path
        ever been reached.
        """
        if not reference_categories:
            return [i for i, _ in enumerate(categories)]
        return [reference_categories.index(c) for c in categories]

    def categorical_unwrap(self, gpyopt_level, categories):
        """Inverse of categorical_wrapper: map an integer code back to its level."""
        return categories[int(gpyopt_level)]