Source code for summit.strategies.sobo

from .base import Strategy
from .random import LHS
from summit.domain import *
from summit.utils.dataset import DataSet


import numpy as np
import pandas as pd
from abc import ABC, abstractmethod


class SOBO(Strategy):
    """Single-objective Bayesian Optimization (SOBO)

    This is a general BO method, since it is a wrapper around GPyOpt.

    Parameters
    ----------
    domain: :class:`~summit.domain.Domain`
        The Summit domain describing the optimization problem.
    transform : :class:`~summit.strategies.base.Transform`, optional
        A transform object. By default no transformation will be done
        on the input variables or objectives.
    gp_model_type: string, optional
        The GPy Gaussian Process model type. See notes for options.
        By default, Gaussian processes with the Matern 5/2 kernel will be used.
    use_descriptors : bool, optional
        Whether to use descriptors of categorical variables. Defaults to False.
    acquisition_type: string, optional
        The acquisition function type from GPyOpt. See notes for options.
        By default, Expected Improvement (EI).
    optimizer_type: string, optional
        The internal optimizer used in GPyOpt for maximization of the acquisition function.
        By default, lbfgs will be used.
    evaluator_type: string, optional
        The evaluator type used for batch mode (how multiple points are chosen in one iteration).
        By default, random will be used.
    kernel: :class:`~GPy.kern.Kern`, optional
        The kernel used in the GP.
        By default a Matern 5/2 kernel (GPy object) will be used.
    exact_feval: boolean, optional
        Whether the function evaluations are exact (True) or noisy (False).
        By default: False.
    ARD: boolean, optional
        Whether automatic relevance determination should be applied (True).
        By default: True.
    standardize_outputs: boolean, optional
        Whether the outputs should be standardized (True).
        By default: True.

    Examples
    --------
    >>> from summit.domain import *
    >>> from summit.strategies import SOBO
    >>> domain = Domain()
    >>> domain += ContinuousVariable(name='temperature', description='reaction temperature in celsius', bounds=[50, 100])
    >>> domain += CategoricalVariable(name='flowrate_a', description='flow of reactant a in mL/min', levels=[1,2,3,4,5])
    >>> domain += ContinuousVariable(name='flowrate_b', description='flow of reactant b in mL/min', bounds=[0.1, 0.5])
    >>> domain += ContinuousVariable(name="yld", description='yield of reaction', bounds=[0,100], is_objective=True)
    >>> strategy = SOBO(domain)
    >>> next_experiments = strategy.suggest_experiments(5)

    Notes
    -----
    Gaussian Process (GP) model types:
        GP: standard Gaussian Process
        GP_MCMC: Gaussian Process with prior on hyperparameters
        sparseGP: sparse Gaussian Process
        warpedGP: warped Gaussian Process
        InputWarpedGP: input warped Gaussian Process
        RF: random forest (scikit-learn)

    Acquisition function types:
        EI: expected improvement
        EI_MCMC: integrated expected improvement (requires GP_MCMC model)
            (https://dash.harvard.edu/bitstream/handle/1/11708816/snoek-bayesopt-nips-2012.pdf?sequence%3D1)
        LCB: lower confidence bound
        LCB_MCMC: integrated GP-lower confidence bound (requires GP_MCMC model)
        MPI: maximum probability of improvement
        MPI_MCMC: maximum probability of improvement (requires GP_MCMC model)
        LP: local penalization
        ES: entropy search

    This implementation uses the python package GPyOpt provided by
    the Machine Learning Group of the University of Sheffield.
    Github repository: https://github.com/SheffieldML/GPyOpt
    """

    def __init__(
        self,
        domain,
        transform=None,
        gp_model_type=None,
        acquisition_type=None,
        optimizer_type=None,
        evaluator_type=None,
        **kwargs
    ):
        from GPy.kern import Matern52

        Strategy.__init__(self, domain, transform=transform, **kwargs)

        self.use_descriptors = kwargs.get("use_descriptors", False)
        # TODO: notation - discrete in our model (e.g., catalyst type) = categorical?
        self.input_domain = []
        for v in self.domain.variables:
            if not v.is_objective:
                if isinstance(v, ContinuousVariable):
                    self.input_domain.append(
                        {
                            "name": v.name,
                            "type": v.variable_type,
                            "domain": (v.bounds[0], v.bounds[1]),
                        }
                    )
                elif isinstance(v, CategoricalVariable):
                    if not self.use_descriptors:
                        self.input_domain.append(
                            {
                                "name": v.name,
                                "type": "categorical",
                                "domain": tuple(self.categorical_wrapper(v.levels)),
                            }
                        )
                    elif v.ds is not None and self.use_descriptors:
                        # Represent the categorical variable by one continuous
                        # dimension per descriptor, bounded by the observed
                        # descriptor values.
                        descriptor_names = v.ds.data_columns
                        descriptors = np.asarray(
                            [
                                v.ds.loc[:, [l]].values.tolist()
                                for l in v.ds.data_columns
                            ]
                        )
                        for j, d in enumerate(descriptors):
                            self.input_domain.append(
                                {
                                    "name": descriptor_names[j],
                                    "type": "continuous",
                                    "domain": (
                                        np.min(np.asarray(d)),
                                        np.max(np.asarray(d)),
                                    ),
                                }
                            )
                    elif v.ds is None and self.use_descriptors:
                        raise ValueError(
                            "Cannot use descriptors because none are provided."
                        )
                    # TODO: GPyOpt currently does not support mixed domains
                    # w/ bandit inputs, there is a PR for this though
                else:
                    raise TypeError("Unknown variable type.")

        # TODO: how to handle equality constraints? Could we remove '==' from
        # constraint types, as each equality constraint reduces the degrees of freedom?
        if self.domain.constraints is not None:
            constraints = self.constr_wrapper(self.domain)
            self.constraints = [
                {
                    "name": "constr_" + str(i),
                    "constraint": c[0]
                    if c[1] in ["<=", "<"]
                    else "(" + c[0] + ")*(-1)",
                }
                for i, c in enumerate(constraints)
                if not (c[1] == "==")
            ]
        else:
            self.constraints = None

        self.input_dim = len(self.domain.input_variables)

        if gp_model_type in [
            "GP",
            "GP_MCMC",
            "sparseGP",
            "warpedGP",
            "InputWarpedGP",
            "RF",
        ]:
            self.gp_model_type = gp_model_type
        else:
            # default model type is a standard Gaussian Process (from the GPy package)
            self.gp_model_type = "GP"

        if acquisition_type in [
            "EI",
            "EI_MCMC",
            "LCB",
            "LCB_MCMC",
            "MPI",
            "MPI_MCMC",
            "LP",
            "ES",
        ]:
            self.acquisition_type = acquisition_type
        else:
            # default acquisition function is expected improvement
            self.acquisition_type = "EI"

        # Method for optimization of the acquisition function:
        #   lbfgs: limited-memory Broyden-Fletcher-Goldfarb-Shanno
        #   DIRECT: dividing rectangles
        #   CMA: covariance matrix adaptation
        if optimizer_type in ["lbfgs", "DIRECT", "CMA"]:
            self.optimizer_type = optimizer_type
        else:
            self.optimizer_type = "lbfgs"  # default optimizer: lbfgs

        if evaluator_type in [
            "sequential",
            "random",
            "local_penalization",
            "thompson_sampling",
        ]:
            self.evaluator_type = evaluator_type
        else:
            self.evaluator_type = "random"  # default evaluator type

        # specify GPy kernel:
        # https://gpy.readthedocs.io/en/deploy/GPy.kern.html#subpackages
        self.kernel = kwargs.get("kernel", Matern52(self.input_dim))
        # Are function values exact (w/o noise)?
        self.exact_feval = kwargs.get("exact_feval", False)
        # automatic relevance determination
        self.ARD = kwargs.get("ARD", True)
        # Standardization of outputs?
        self.standardize_outputs = kwargs.get("standardize_outputs", True)

        self.prev_param = None
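    # A minimal configuration sketch (illustrative, not part of the library):
    # the option strings are the values validated above, and `domain` is
    # assumed to be a summit Domain with at least one objective. Any GPy
    # kernel can be swapped in, as long as its input dimension matches the
    # number of input variables.
    #
    #     from GPy.kern import Exponential
    #     strategy = SOBO(
    #         domain,
    #         gp_model_type="GP_MCMC",              # marginalize hyperparameters
    #         acquisition_type="EI_MCMC",           # requires the GP_MCMC model
    #         optimizer_type="DIRECT",              # derivative-free acquisition optimizer
    #         evaluator_type="local_penalization",  # batch diversity strategy
    #         kernel=Exponential(2),                # 2 = number of input dimensions
    #         exact_feval=True,                     # noise-free evaluations
    #     )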
    def suggest_experiments(
        self, num_experiments=1, prev_res: DataSet = None, **kwargs
    ):
        """Suggest experiments using GPyOpt single-objective Bayesian optimization

        Parameters
        ----------
        num_experiments: int, optional
            The number of experiments (i.e., samples) to generate. Default is 1.
        prev_res: :class:`~summit.utils.dataset.DataSet`, optional
            Dataset with data from experiments of the previous iteration.
            If no data is passed, then random sampling will
            be used to suggest an initial design.

        Returns
        -------
        next_experiments : :class:`~summit.utils.dataset.DataSet`
            A Dataset object with the suggested experiments
        """
        import GPyOpt

        param = None
        xbest = np.zeros(self.domain.num_continuous_dimensions())
        obj = self.domain.output_variables[0]
        objective_dir = -1.0 if obj.maximize else 1.0
        fbest = float("inf")

        # Suggest a random initial design
        if prev_res is None:
            # An LHS design would not take constraints into account, i.e.
            #     lhs = LHS(self.domain)
            #     next_experiments = lhs.suggest_experiments(num_experiments)
            # so a random design is drawn from the constrained GPyOpt design
            # space instead:
            feasible_region = GPyOpt.Design_space(
                space=self.input_domain, constraints=self.constraints
            )
            request = GPyOpt.experiment_design.initial_design(
                "random", feasible_region, num_experiments
            )
        else:
            # Get inputs and outputs
            inputs, outputs = self.transform.transform_inputs_outputs(
                prev_res,
                categorical_method="descriptors" if self.use_descriptors else None,
            )

            # Set up maximization and minimization by converting
            # a maximization problem into a minimization problem
            for v in self.domain.variables:
                if v.is_objective and v.maximize:
                    outputs[v.name] = -1 * outputs[v.name]
                if isinstance(v, CategoricalVariable):
                    if not self.use_descriptors:
                        inputs[v.name] = self.categorical_wrapper(
                            inputs[v.name], v.levels
                        )

            inputs = inputs.to_numpy().astype(float)
            outputs = outputs.to_numpy()

            if self.prev_param is not None:
                X_step = self.prev_param[0].astype(float)
                Y_step = self.prev_param[1].astype(float)
                X_step = np.vstack((X_step, inputs))
                Y_step = np.vstack((Y_step, outputs))
            else:
                X_step = inputs
                Y_step = outputs

            sobo_model = GPyOpt.methods.BayesianOptimization(
                f=None,
                domain=self.input_domain,
                constraints=self.constraints,
                model_type=self.gp_model_type,
                kernel=self.kernel,
                acquisition_type=self.acquisition_type,
                acquisition_optimizer_type=self.optimizer_type,
                normalize_Y=self.standardize_outputs,
                batch_size=num_experiments,
                evaluator_type=self.evaluator_type,
                maximize=False,
                ARD=self.ARD,
                exact_feval=self.exact_feval,
                X=X_step,
                Y=Y_step,
            )
            request = sobo_model.suggest_next_locations()

            # Store parameters (history of suggested points and function evaluations)
            param = [X_step, Y_step]

            fbest = np.min(Y_step)
            xbest = X_step[np.argmin(Y_step)]

        # Generate a DataSet object with the variable values of the next experiments
        next_experiments = None
        if request is not None and len(request) != 0:
            next_experiments = {}
            i_inp = 0
            for v in self.domain.variables:
                if not v.is_objective:
                    if isinstance(v, CategoricalVariable):
                        if v.ds is None or not self.use_descriptors:
                            cat_list = []
                            for j, entry in enumerate(request[:, i_inp]):
                                cat_list.append(
                                    self.categorical_unwrap(entry, v.levels)
                                )
                            next_experiments[v.name] = np.asarray(cat_list)
                            i_inp += 1
                        else:
                            descriptor_names = v.ds.data_columns
                            for d in descriptor_names:
                                next_experiments[d] = request[:, i_inp]
                                i_inp += 1
                    else:
                        next_experiments[v.name] = request[:, i_inp]
                        i_inp += 1
            next_experiments = DataSet.from_df(pd.DataFrame(data=next_experiments))
            next_experiments[("strategy", "METADATA")] = "Single-objective BayOpt"

        self.fbest = objective_dir * fbest
        self.xbest = xbest
        self.prev_param = param

        # Do any necessary transformations back
        next_experiments = self.transform.un_transform(
            next_experiments,
            categorical_method="descriptors" if self.use_descriptors else None,
        )

        return next_experiments
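    # A closed-loop usage sketch (assumption: `experiment` is a user-supplied
    # summit Experiment, or any object with a run_experiments method that
    # evaluates the suggested conditions and returns a DataSet with the
    # objective column filled in; it is not defined here). On the first call
    # prev_res is None, so a random initial design is suggested; afterwards
    # the accumulated results drive the GP model.
    #
    #     strategy = SOBO(domain)
    #     prev_res = None
    #     for _ in range(10):
    #         next_experiments = strategy.suggest_experiments(
    #             num_experiments=4, prev_res=prev_res
    #         )
    #         prev_res = experiment.run_experiments(next_experiments)
    #     print(strategy.fbest, strategy.xbest)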
    def reset(self):
        """Reset the internal parameters"""
        self.prev_param = None
    def to_dict(self):
        if self.prev_param is not None:
            param = [
                self.prev_param[0].astype(float).tolist(),
                self.prev_param[1].astype(float).tolist(),
            ]
        else:
            param = None
        strategy_params = dict(
            prev_param=param,
            use_descriptors=self.use_descriptors,
            gp_model_type=self.gp_model_type,
            acquisition_type=self.acquisition_type,
            optimizer_type=self.optimizer_type,
            evaluator_type=self.evaluator_type,
            kernel=self.kernel.to_dict(),
            exact_feval=self.exact_feval,
            ARD=self.ARD,
            standardize_outputs=self.standardize_outputs,
        )
        return super().to_dict(**strategy_params)
    @classmethod
    def from_dict(cls, d):
        from GPy.kern import Kern

        # Get kernel
        kernel = d["strategy_params"].get("kernel")
        if kernel is not None:
            kernel = Kern.from_dict(kernel)
            d["strategy_params"]["kernel"] = kernel

        # Setup SOBO
        sobo = super().from_dict(d)
        param = d["strategy_params"]["prev_param"]
        if param is not None:
            param = [np.array(param[0]), np.array(param[1])]
        sobo.prev_param = param
        return sobo
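    # Serialization round-trip sketch: to_dict captures the strategy options,
    # the GPy kernel, and the (X, Y) history as plain lists, so a run can be
    # checkpointed and resumed later (the file name below is illustrative):
    #
    #     import json
    #     with open("sobo_checkpoint.json", "w") as f:
    #         json.dump(strategy.to_dict(), f)
    #     with open("sobo_checkpoint.json") as f:
    #         restored = SOBO.from_dict(json.load(f))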
    def constr_wrapper(self, summit_domain):
        # Translate Summit constraint expressions into GPyOpt's positional
        # notation, replacing each input variable name with "x[:,i]".
        v_input_names = [v.name for v in summit_domain.variables if not v.is_objective]
        gpyopt_constraints = []
        for c in summit_domain.constraints:
            tmp_c = c.lhs
            for v_input_index, v_input_name in enumerate(v_input_names):
                v_gpyopt_name = "x[:," + str(v_input_index) + "]"
                tmp_c = tmp_c.replace(v_input_name, v_gpyopt_name)
            gpyopt_constraints.append([tmp_c, c.constraint_type])
        return gpyopt_constraints

    def categorical_wrapper(self, categories, reference_categories=None):
        # Encode categorical levels as integer indices for GPyOpt.
        if not reference_categories:
            return [i for i, _ in enumerate(categories)]
        else:
            return [reference_categories.index(c) for c in categories]

    def categorical_unwrap(self, gpyopt_level, categories):
        # Map a GPyOpt integer level back to the original categorical level.
        return categories[int(gpyopt_level)]
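# A minimal sketch of the constraint translation (illustrative variable
# names; assumes GPy/GPyOpt are installed, since SOBO imports GPy, and that
# Constraint is exported by summit.domain as in the Summit docs):
if __name__ == "__main__":
    domain = Domain()
    domain += ContinuousVariable(name="q_a", description="flow a", bounds=[0, 1])
    domain += ContinuousVariable(name="q_b", description="flow b", bounds=[0, 1])
    domain += Constraint(lhs="q_a+q_b-1.0", constraint_type="<=")
    domain += ContinuousVariable(
        name="yld", description="yield", bounds=[0, 100], is_objective=True
    )
    strategy = SOBO(domain)
    # constr_wrapper rewrites input names into GPyOpt's positional form,
    # e.g. "q_a+q_b-1.0" becomes "x[:,0]+x[:,1]-1.0"
    print(strategy.constr_wrapper(domain))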