# Source code for summit.strategies.random

from .base import Strategy, Design, Transform
from summit.domain import *
from summit.utils.dataset import DataSet
import numpy as np
import pandas as pd
from typing import Tuple


class Random(Strategy):
    """Random strategy for experiment suggestion

    Suggests experiments by sampling uniformly at random inside the bounds
    of each continuous variable and picking uniformly among the levels of
    each categorical variable.

    Parameters
    ----------
    domain: `summit.domain.Domain`
        A summit domain object
    random_state: `np.random.RandomState``
        A random state object to seed the random generator

    Attributes
    ----------
    domain

    Examples
    -------
    >>> from summit.domain import Domain, ContinuousVariable
    >>> from summit.strategies import Random
    >>> import numpy as np
    >>> domain = Domain()
    >>> domain += ContinuousVariable(name='temperature', description='reaction temperature in celsius', bounds=[50, 100])
    >>> domain += ContinuousVariable(name='flowrate_a', description='flow of reactant a in mL/min', bounds=[0.1, 0.5])
    >>> domain += ContinuousVariable(name='flowrate_b', description='flow of reactant b in mL/min', bounds=[0.1, 0.5])
    >>> strategy = Random(domain, random_state=np.random.RandomState(3))
    >>> strategy.suggest_experiments(5)
    NAME temperature flowrate_a flowrate_b strategy
    TYPE        DATA       DATA       DATA METADATA
    0      77.539895   0.458517   0.111950   Random
    1      85.407391   0.150234   0.282733   Random
    2      64.545237   0.182897   0.359658   Random
    3      75.541380   0.120587   0.211395   Random
    4      94.647348   0.276324   0.370502   Random

    Notes
    -----
    Descriptors variables are selected randomly as if they were discrete
    variables instead of sampling evenly in the continuous space.

    """

    def __init__(
        self,
        domain: Domain,
        transform: Transform = None,
        random_state: np.random.RandomState = None,
        **kwargs,
    ):
        super().__init__(domain, transform, **kwargs)
        # Fall back to an unseeded generator when no state is supplied.
        self._rstate = random_state or np.random.RandomState()

    def suggest_experiments(self, num_experiments: int, **kwargs) -> DataSet:
        """Suggest experiments for a random experimental design

        Parameters
        ----------
        num_experiments: int
            The number of experiments (i.e., samples) to generate

        Returns
        -------
        next_experiments : :class:`~summit.utils.data.DataSet`
            A Dataset object with the suggested experiments

        Raises
        ------
        DomainError
            If an input variable is neither continuous nor categorical.
        """
        design = Design(self.domain, num_experiments, "random")
        for variable in self.domain.input_variables:
            if isinstance(variable, ContinuousVariable):
                design.add_variable(
                    variable.name,
                    self._random_continuous(variable, num_experiments),
                    indices=None,
                )
            elif isinstance(variable, CategoricalVariable):
                indices, values = self._random_categorical(variable, num_experiments)
                design.add_variable(variable.name, values, indices=indices)
            else:
                raise DomainError(
                    f"Variable {variable} is not one of the possible variable types (continuous or categorical)."
                )
        ds = design.to_dataset()
        ds[("strategy", "METADATA")] = "Random"
        return self.transform.un_transform(ds, categorical_method=None)

    def _random_continuous(
        self, variable: ContinuousVariable, num_samples: int
    ) -> np.ndarray:
        """Generate a random design for a given continuous variable"""
        low, high = variable.lower_bound, variable.upper_bound
        # Uniform samples in [0, 1) scaled onto [low, high).
        span = self._rstate.rand(num_samples, 1) * (high - low)
        values = low * np.ones([num_samples, 1]) + span
        return np.atleast_2d(values).T

    def _random_categorical(
        self, variable: CategoricalVariable, num_samples: int
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Generate a random design for a given discrete variable"""
        indices = self._rstate.randint(0, variable.num_levels, size=num_samples)
        values = np.array([variable.levels[i] for i in indices])
        # Both arrays are returned as (num_samples, 1) columns.
        return indices.reshape(num_samples, 1), values.reshape(num_samples, 1)

    def reset(self):
        """Random holds no state between calls, so there is nothing to reset."""
        pass
class LHS(Strategy):
    """Latin hypercube sampling (LHS) strategy for experiment suggestion

    LHS samples evenly throughout the continuous part of the domain, which
    can result in better data for model training.

    Parameters
    ----------
    domain: `summit.domain.Domain`
        A summit domain object
    random_state: `np.random.RandomState``
        A random state object to seed the random generator
    categorical_method : str, optional
        The method for transforming categorical variables. Either
        "one-hot" or "descriptors". Descriptors must be included in the
        categorical variables for the latter.

    Examples
    --------
    >>> from summit.domain import Domain, ContinuousVariable
    >>> from summit.strategies import LHS
    >>> import numpy as np
    >>> domain = Domain()
    >>> domain += ContinuousVariable(name='temperature', description='reaction temperature in celsius', bounds=[50, 100])
    >>> domain += ContinuousVariable(name='flowrate_a', description='flow of reactant a in mL/min', bounds=[0.1, 0.5])
    >>> domain += ContinuousVariable(name='flowrate_b', description='flow of reactant b in mL/min', bounds=[0.1, 0.5])
    >>> strategy = LHS(domain, random_state=np.random.RandomState(3))
    >>> strategy.suggest_experiments(5)
    NAME temperature flowrate_a flowrate_b strategy
    TYPE        DATA       DATA       DATA METADATA
    0           95.0       0.46       0.38      LHS
    1           65.0       0.14       0.14      LHS
    2           55.0       0.22       0.30      LHS
    3           85.0       0.30       0.46      LHS
    4           75.0       0.38       0.22      LHS

    Notes
    -----
    LHS was first introduced by [McKay]_ and coworkers in 1979. We rely on
    the implementation from `pyDoE2 <https://github.com/clicumu/pydoe2>`_.

    Our version randomly selects a categorical variable if no descriptors
    are available. If descriptors are available it samples in the continuous
    space and then chooses the closest point by Euclidean distance.

    References
    ----------
    .. [McKay] R.J. Beckman et al., Technometrics, 1979, 21, 239-245.

    """

    def __init__(
        self,
        domain: Domain,
        transform: Transform = None,
        random_state: np.random.RandomState = None,
        categorical_method: str = None,
    ):
        super().__init__(domain, transform)
        # Fall back to an unseeded generator when no state is supplied.
        self._rstate = random_state if random_state else np.random.RandomState()
        self.categorical_method = categorical_method

    def suggest_experiments(
        self, num_experiments, criterion="center", exclude=[], **kwargs
    ) -> DataSet:
        """Generate latin hypercube intial design

        Parameters
        ----------
        num_experiments: int
            The number of experiments (i.e., samples) to generate
        criterion: str, optional
            The criterion used for the LHS. Allowable values are "center" or "c", "maximin" or "m",
            "centermaximin" or "cm", and "correlation" or "corr". Default is center.
        exclude: array like, optional
            List of variable names that should be excluded from the design. Default is None.

        Returns
        -------
        next_experiments : :class:`~summit.utils.data.DataSet`
            A Dataset object with the suggested experiments

        Raises
        ------
        DomainError
            If an input variable is neither continuous nor categorical.

        Notes
        -----
        NOTE(review): ``exclude=[]`` is a mutable default argument; it is
        only read (never mutated) here, so it is harmless, but it should be
        ``exclude=None`` by convention.
        """
        # design = Design(self.domain, num_experiments, "Latin design", exclude=exclude)
        design = pd.DataFrame()
        # Instantiate the random design class to be used with categorical variables with no descriptors
        rdesigner = Random(self.domain, random_state=self._rstate)

        # Get categorical variables without descriptors
        categoricals = []
        for v in self.domain.input_variables:
            if isinstance(v, CategoricalVariable):
                if v.ds is None:
                    categoricals.append(v.name)

        # Sampling: one LHS column per continuous dimension, where descriptor
        # columns of categorical variables count as continuous dimensions.
        n = self.domain.num_continuous_dimensions(include_descriptors=True)
        if n != 0:
            samples = lhs(
                n,
                samples=num_experiments,
                criterion=criterion,
                random_state=self._rstate,
            )

        # k is the cursor into the LHS sample columns; it advances by 1 per
        # continuous variable and by num_descriptors per descriptor variable.
        k = 0
        columns = []
        for variable in self.domain.input_variables:
            if variable.name in exclude:
                continue

            # For continuous variables, use samples directly
            if isinstance(variable, ContinuousVariable):
                b = variable.lower_bound * np.ones(num_experiments)
                values = b + samples[:, k] * (
                    variable.upper_bound - variable.lower_bound
                )
                design.insert(design.shape[1], variable.name, values)
                k += 1

            # For categorical variable with no descriptors, randomly choose
            # NOTE(review): when categorical_method is None but the variable
            # *does* have descriptors, this branch is taken and k is NOT
            # advanced, even though n counted the descriptor columns —
            # confirm the sample columns still line up for any later
            # continuous variables in that configuration.
            elif (
                isinstance(variable, CategoricalVariable)
                and variable.name in categoricals
            ) or (
                isinstance(variable, CategoricalVariable)
                and self.categorical_method == None
            ):
                indices, values = rdesigner._random_categorical(
                    variable, num_experiments
                )
                design.insert(design.shape[1], variable.name, values[:, 0])

            # For categorical variable with descriptors, look in descriptors space
            # The untransform method at the end should find the closest point by euclidean distance.
            elif isinstance(variable, CategoricalVariable) and variable.ds is not None:
                num_descriptors = variable.num_descriptors
                values = samples[:, k : k + num_descriptors]

                # Scaling: rescale the unit-cube samples onto the min/max
                # range of each descriptor column.
                var_min = (
                    variable.ds.loc[:, variable.ds.data_columns].min(axis=0).to_numpy()
                )
                var_min = np.atleast_2d(var_min)
                var_max = (
                    variable.ds.loc[:, variable.ds.data_columns].max(axis=0).to_numpy()
                )
                var_max = np.atleast_2d(var_max)
                var_range = var_max - var_min

                # Rescale
                values_scaled = var_min + values * var_range
                values = values_scaled
                values.shape = (num_experiments, num_descriptors)
                k += num_descriptors

                # Add each descriptors
                names = variable.ds.columns.levels[0].to_list()
                for i in range(num_descriptors):
                    design.insert(design.shape[1], names[i], values_scaled[:, i])

            else:
                raise DomainError(
                    f"Variable {variable} is not one of the possible variable types (continuous or categorical)."
                )

            # design.add_variable(variable.name, values, indices=indices)
        design = DataSet.from_df(design)
        design[("strategy", "METADATA")] = "LHS"
        return self.transform.un_transform(
            design, categorical_method=self.categorical_method
        )

    def reset(self):
        """LHS holds no state between calls, so there is nothing to reset."""
        pass
""" The lhs code was copied from pyDoE and was originally published by the following individuals for use with Scilab: Copyright (C) 2012 - 2013 - Michael Baudin Copyright (C) 2012 - Maria Christopoulou Copyright (C) 2010 - 2011 - INRIA - Michael Baudin Copyright (C) 2009 - Yann Collette Copyright (C) 2009 - CEA - Jean-Marc Martinez website: forge.scilab.org/index.php/p/scidoe/sourcetree/master/macros Much thanks goes to these individuals. It has been converted to Python by Abraham Lee. """ def lhs(n, samples=None, criterion=None, iterations=None, random_state=None): """ Generate a latin-hypercube design Parameters ---------- n : int The number of factors to generate samples for Optional -------- samples : int The number of samples to generate for each factor (Default: n) criterion : str Allowable values are "center" or "c", "maximin" or "m", "centermaximin" or "cm", and "correlation" or "corr". If no value given, the design is simply randomized. iterations : int The number of iterations in the maximin and correlations algorithms (Default: 5). Returns ------- H : 2d-array An n-by-samples design matrix that has been normalized so factor values are uniformly spaced between zero and one. 
Example ------- >>> import numpy as np A 3-factor design (defaults to 3 samples):: >>> lhs(3, random_state=np.random.RandomState(3)) array([[0.5036092 , 0.73574763, 0.6320977 ], [0.70852844, 0.63098232, 0.09696825], [0.1835993 , 0.23604927, 0.6838224 ]]) A 4-factor design with 6 samples:: >>> lhs(4, samples=6, random_state=np.random.RandomState(3)) array([[0.3419112 , 0.54641455, 0.3383127 , 0.59847714], [0.88058751, 0.11802464, 0.61270915, 0.4094722 ], [0.09179965, 0.40680164, 0.18759755, 0.20120715], [0.67066365, 0.94885632, 0.90674229, 0.85947796], [0.60819067, 0.31604885, 0.04848412, 0.08513793], [0.31549116, 0.75980901, 0.70987541, 0.7358502 ]]) A 2-factor design with 5 centered samples:: >>> lhs(2, samples=5, criterion='center', random_state=np.random.RandomState(3)) array([[0.7, 0.7], [0.1, 0.1], [0.5, 0.9], [0.3, 0.3], [0.9, 0.5]]) A 3-factor design with 4 samples where the minimum distance between all samples has been maximized:: >>> lhs(3, samples=4, criterion='maximin', random_state=np.random.RandomState(3)) array([[0.07987376, 0.37639351, 0.92316265], [0.25650657, 0.7314332 , 0.12061145], [0.55174153, 0.00530644, 0.56933076], [0.79401553, 0.9975753 , 0.47950751]]) A 4-factor design with 5 samples where the samples are as uncorrelated as possible (within 10 iterations):: >>> lhs(4, samples=5, criterion='correlation', iterations=10, random_state=np.random.RandomState(3)) array([[0.72982881, 0.91177082, 0.73525098, 0.71817256], [0.37858939, 0.48816197, 0.40597524, 0.10216552], [0.80479638, 0.37925862, 0.85185049, 0.49136664], [0.11015958, 0.65569746, 0.22511706, 0.88302024], [0.41029344, 0.14162956, 0.05818095, 0.24144858]]) """ H = None random_state = random_state if random_state else np.random.RandomState() if samples is None: samples = n if criterion is not None: assert criterion.lower() in ( "center", "c", "maximin", "m", "centermaximin", "cm", "correlation", "corr", ), 'Invalid value for "criterion": {}'.format(criterion) else: H = _lhsclassic(n, 
samples, random_state) if criterion is None: criterion = "center" if iterations is None: iterations = 5 if H is None: if criterion.lower() in ("center", "c"): H = _lhscentered(n, samples, random_state) elif criterion.lower() in ("maximin", "m"): H = _lhsmaximin(n, samples, iterations, "maximin", random_state) elif criterion.lower() in ("centermaximin", "cm"): H = _lhsmaximin(n, samples, iterations, "centermaximin", random_state) elif criterion.lower() in ("correlation", "corr"): H = _lhscorrelate(n, samples, iterations, random_state) return H ################################################################################ def _lhsclassic(n, samples, random_state): # Generate the intervals cut = np.linspace(0, 1, samples + 1) # Fill points uniformly in each interval u = random_state.rand(samples, n) a = cut[:samples] b = cut[1 : samples + 1] rdpoints = np.zeros_like(u) for j in range(n): rdpoints[:, j] = u[:, j] * (b - a) + a # Make the random pairings H = np.zeros_like(rdpoints) for j in range(n): order = random_state.permutation(range(samples)) H[:, j] = rdpoints[order, j] return H ################################################################################ def _lhscentered(n, samples, random_state): # Generate the intervals cut = np.linspace(0, 1, samples + 1) # Fill points uniformly in each interval u = random_state.rand(samples, n) a = cut[:samples] b = cut[1 : samples + 1] _center = (a + b) / 2 # Make the random pairings H = np.zeros_like(u) for j in range(n): H[:, j] = random_state.permutation(_center) return H ################################################################################ def _lhsmaximin(n, samples, iterations, lhstype, random_state): maxdist = 0 # Maximize the minimum distance between points for i in range(iterations): if lhstype == "maximin": Hcandidate = _lhsclassic(n, samples, random_state) else: Hcandidate = _lhscentered(n, samples, random_state) d = _pdist(Hcandidate) if maxdist < np.min(d): maxdist = np.min(d) H = 
Hcandidate.copy() return H ################################################################################ def _lhscorrelate(n, samples, iterations, random_state): mincorr = np.inf # Minimize the components correlation coefficients for i in range(iterations): # Generate a random LHS Hcandidate = _lhsclassic(n, samples, random_state) R = np.corrcoef(Hcandidate) if np.max(np.abs(R[R != 1])) < mincorr: mincorr = np.max(np.abs(R - np.eye(R.shape[0]))) # print('new candidate solution found with max,abs corrcoef = {}'.format(mincorr)) H = Hcandidate.copy() return H ################################################################################ def _pdist(x): """ Calculate the pair-wise point distances of a matrix Parameters ---------- x : 2d-array An m-by-n array of scalars, where there are m points in n dimensions. Returns ------- d : array A 1-by-b array of scalars, where b = m*(m - 1)/2. This array contains all the pair-wise point distances, arranged in the order (1, 0), (2, 0), ..., (m-1, 0), (2, 1), ..., (m-1, 1), ..., (m-1, m-2). """ x = np.atleast_2d(x) assert len(x.shape) == 2, "Input array must be 2d-dimensional" m, n = x.shape if m < 2: return [] d = [] for i in range(m - 1): for j in range(i + 1, m): d.append((sum((x[j, :] - x[i, :]) ** 2)) ** 0.5) return np.array(d)