Coverage for mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py: 88%
98 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-06 00:35 +0000
1#
2# Copyright (c) Microsoft Corporation.
3# Licensed under the MIT License.
4#
5"""
6Contains the wrapper class for SMAC Bayesian optimizers.
7See Also: <https://automl.github.io/SMAC3/main/index.html>
8"""
10from logging import warning
11from pathlib import Path
12from typing import Dict, List, Optional, Union, TYPE_CHECKING
13from tempfile import TemporaryDirectory
15import ConfigSpace
16import numpy.typing as npt
17import pandas as pd
19from mlos_core.optimizers.bayesian_optimizers.bayesian_optimizer import BaseBayesianOptimizer
20from mlos_core.spaces.adapters.adapter import BaseSpaceAdapter
21from mlos_core.spaces.adapters.identity_adapter import IdentityAdapter
class SmacOptimizer(BaseBayesianOptimizer):
    """Wrapper class for SMAC based Bayesian optimization.

    Parameters
    ----------
    parameter_space : ConfigSpace.ConfigurationSpace
        The parameter space to optimize.

    space_adapter : BaseSpaceAdapter
        The space adapter class to employ for parameter space transformations.

    seed : Optional[int]
        By default SMAC uses a known seed (0) to keep results reproducible.
        However, if a `None` seed is explicitly provided, we let a random seed be produced by SMAC.

    run_name : Optional[str]
        Name of this run. This is used to easily distinguish across different runs.
        If set to `None` (default), SMAC will generate a hash from metadata.

    output_directory : Optional[str]
        The directory where SMAC output will saved. If set to `None` (default), a temporary dir will be used.

    max_trials : int
        Maximum number of trials (i.e., function evaluations) to be run. Defaults to 100.
        Note that modifying this value directly affects the value of `n_random_init`, if latter is set to `None`.

    n_random_init : Optional[int]
        Number of points evaluated at start to bootstrap the optimizer.
        Default depends on max_trials and number of parameters and max_ratio.
        Note: it can sometimes be useful to set this to 1 when pre-warming the
        optimizer from historical data.
        See Also: mlos_bench.optimizer.bulk_register

    max_ratio : Optional[float]
        Maximum ratio of max_trials to be random configurations to be evaluated
        at start to bootstrap the optimizer.
        Useful if you want to explicitly control the number of random
        configurations evaluated at start.

    use_default_config : bool
        Whether to use the default config for the first trial after random initialization.

    n_random_probability : float
        Probability of choosing to evaluate a random configuration during optimization.
        Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation.
    """

    def __init__(self, *,  # pylint: disable=too-many-locals
                 parameter_space: ConfigSpace.ConfigurationSpace,
                 space_adapter: Optional[BaseSpaceAdapter] = None,
                 seed: Optional[int] = 0,
                 run_name: Optional[str] = None,
                 output_directory: Optional[str] = None,
                 max_trials: int = 100,
                 n_random_init: Optional[int] = None,
                 max_ratio: Optional[float] = None,
                 use_default_config: bool = False,
                 n_random_probability: float = 0.1):

        super().__init__(
            parameter_space=parameter_space,
            space_adapter=space_adapter,
        )

        # Declare at the top because we need it in __del__/cleanup()
        self._temp_output_directory: Optional[TemporaryDirectory] = None

        # pylint: disable=import-outside-toplevel
        from smac import HyperparameterOptimizationFacade as Optimizer_Smac
        from smac import Scenario
        from smac.intensifier.abstract_intensifier import AbstractIntensifier
        from smac.main.config_selector import ConfigSelector
        from smac.random_design.probability_design import ProbabilityRandomDesign
        from smac.runhistory import TrialInfo

        # Store for TrialInfo instances returned by .ask()
        self.trial_info_map: Dict[ConfigSpace.Configuration, TrialInfo] = {}

        # The default when not specified is to use a known seed (0) to keep results reproducible.
        # However, if a `None` seed is explicitly provided, we let a random seed be produced by SMAC.
        # https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario
        seed = -1 if seed is None else seed

        # Create temporary directory for SMAC output (if none provided)
        if output_directory is None:
            # pylint: disable=consider-using-with
            try:
                self._temp_output_directory = TemporaryDirectory(ignore_cleanup_errors=True)  # Argument added in Python 3.10
            except TypeError:
                self._temp_output_directory = TemporaryDirectory()
            output_directory = self._temp_output_directory.name

        if n_random_init is not None:
            assert isinstance(n_random_init, int) and n_random_init >= 0
            if n_random_init == max_trials and use_default_config:
                # Increase max budgeted trials to account for use_default_config.
                max_trials += 1

        scenario: Scenario = Scenario(
            self.optimizer_parameter_space,
            name=run_name,
            output_directory=Path(output_directory),
            deterministic=True,
            use_default_config=use_default_config,
            n_trials=max_trials,
            # BUGFIX: this was previously `seed or -1`, which silently turned the
            # documented default seed of 0 into -1 (i.e., a random seed chosen by
            # SMAC), breaking the reproducibility promise in the class docstring.
            # `seed` was already normalized to -1 above when the caller passed None.
            seed=seed,
            n_workers=1,    # Use a single thread for evaluating trials
        )
        intensifier: AbstractIntensifier = Optimizer_Smac.get_intensifier(scenario, max_config_calls=1)
        config_selector: ConfigSelector = Optimizer_Smac.get_config_selector(scenario, retrain_after=1)

        # TODO: When bulk registering prior configs to rewarm the optimizer,
        # there is a way to inform SMAC's initial design that we have
        # additional_configs and can set n_configs == 0.
        # Additionally, we may want to consider encoding those values into the
        # runhistory when prewarming the optimizer so that the initial design
        # doesn't reperform random init.
        # See Also: #488

        initial_design_args: Dict[str, Union[list, int, float, Scenario]] = {
            'scenario': scenario,
            # Workaround a bug in SMAC that sets a default arg to a mutable
            # value that can cause issues when multiple optimizers are
            # instantiated with the use_default_config option within the same
            # process that use different ConfigSpaces so that the second
            # receives the default config from both as an additional config.
            'additional_configs': []
        }
        if n_random_init is not None:
            initial_design_args['n_configs'] = n_random_init
            if n_random_init > 0.25 * max_trials and max_ratio is None:
                # SMAC silently caps n_configs at max_ratio * n_trials (default
                # max_ratio is 0.25), so warn the user that their explicit
                # n_random_init may be overridden unless they also set max_ratio.
                warning(
                    'Number of random initial configurations (%d) is ' +
                    'greater than 25%% of max_trials (%d). ' +
                    'Consider setting max_ratio to avoid SMAC overriding n_random_init.',
                    n_random_init,
                    max_trials,
                )
            if max_ratio is not None:
                assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0
                initial_design_args['max_ratio'] = max_ratio

        # Use the default InitialDesign from SMAC.
        # (currently SBOL instead of LatinHypercube due to better uniformity
        # for initial sampling which results in lower overall samples required)
        initial_design = Optimizer_Smac.get_initial_design(**initial_design_args)  # type: ignore[arg-type]
        # initial_design = LatinHypercubeInitialDesign(**initial_design_args)  # type: ignore[arg-type]

        # Workaround a bug in SMAC that doesn't pass the seed to the random
        # design when generated a random_design for itself via the
        # get_random_design static method when random_design is None.
        assert isinstance(n_random_probability, float) and n_random_probability >= 0
        random_design = ProbabilityRandomDesign(probability=n_random_probability, seed=scenario.seed)

        self.base_optimizer = Optimizer_Smac(
            scenario,
            SmacOptimizer._dummy_target_func,
            initial_design=initial_design,
            intensifier=intensifier,
            random_design=random_design,
            config_selector=config_selector,
            overwrite=True,
            logging_level=False,  # Use the existing logger
        )

    def __del__(self) -> None:
        # Best-effort attempt to clean up, in case the user forgets to call .cleanup()
        self.cleanup()

    @property
    def n_random_init(self) -> int:
        """
        Gets the number of random samples to use to initialize the optimizer's search space sampling.

        Note: This may not be equal to the value passed to the initializer, due to logic present in the SMAC.
        See Also: max_ratio

        Returns
        -------
        int
            The number of random samples used to initialize the optimizer's search space sampling.
        """
        # pylint: disable=protected-access
        return self.base_optimizer._initial_design._n_configs

    @staticmethod
    def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None:
        """Dummy target function for SMAC optimizer.

        Since we only use the ask-and-tell interface, this is never called.

        Parameters
        ----------
        config : ConfigSpace.Configuration
            Configuration to evaluate.

        seed : int
            Random seed to use for the target function. Not actually used.
        """
        # NOTE: Providing a target function when using the ask-and-tell interface is an imperfection of the API
        # -- this planned to be fixed in some future release: https://github.com/automl/SMAC3/issues/946
        raise RuntimeError('This function should never be called.')

    def _register(self, configurations: pd.DataFrame, scores: pd.Series, context: Optional[pd.DataFrame] = None) -> None:
        """Registers the given configurations and scores.

        Parameters
        ----------
        configurations : pd.DataFrame
            Dataframe of configurations / parameters. The columns are parameter names and the rows are the configurations.

        scores : pd.Series
            Scores from running the configurations. The index is the same as the index of the configurations.

        context : pd.DataFrame
            Not Yet Implemented.

        Raises
        ------
        NotImplementedError
            If a non-None `context` is provided.
        """
        from smac.runhistory import StatusType, TrialInfo, TrialValue  # pylint: disable=import-outside-toplevel

        if context is not None:
            raise NotImplementedError()

        # Register each trial (one-by-one)
        for config, score in zip(self._to_configspace_configs(configurations), scores.tolist()):
            # Retrieve previously generated TrialInfo (returned by .ask()) or create new TrialInfo instance
            info: TrialInfo = self.trial_info_map.get(config, TrialInfo(config=config, seed=self.base_optimizer.scenario.seed))
            value: TrialValue = TrialValue(cost=score, time=0.0, status=StatusType.SUCCESS)
            self.base_optimizer.tell(info, value, save=False)

        # Save optimizer once we register all configs
        self.base_optimizer.optimizer.save()

    def _suggest(self, context: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Suggests a new configuration.

        Parameters
        ----------
        context : pd.DataFrame
            Not Yet Implemented.

        Returns
        -------
        configuration : pd.DataFrame
            Pandas dataframe with a single row. Column names are the parameter names.

        Raises
        ------
        NotImplementedError
            If a non-None `context` is provided.
        """
        if TYPE_CHECKING:
            from smac.runhistory import TrialInfo  # pylint: disable=import-outside-toplevel

        if context is not None:
            raise NotImplementedError()

        trial: TrialInfo = self.base_optimizer.ask()
        # Sanity-check that SMAC handed back a configuration valid for our space.
        trial.config.is_valid_configuration()
        self.optimizer_parameter_space.check_configuration(trial.config)
        assert trial.config.config_space == self.optimizer_parameter_space
        # Remember the TrialInfo so _register() can pair it back up with the score.
        self.trial_info_map[trial.config] = trial
        config_df = pd.DataFrame([trial.config], columns=list(self.optimizer_parameter_space.keys()))
        return config_df

    def register_pending(self, configurations: pd.DataFrame, context: Optional[pd.DataFrame] = None) -> None:
        # Not Yet Implemented.
        raise NotImplementedError()

    def surrogate_predict(self, configurations: pd.DataFrame, context: Optional[pd.DataFrame] = None) -> npt.NDArray:
        """Predicts the mean score of the given configurations using the surrogate model.

        Parameters
        ----------
        configurations : pd.DataFrame
            Dataframe of configurations / parameters. The columns are parameter names and the rows are the configurations.

        context : pd.DataFrame
            Not Yet Implemented.

        Returns
        -------
        npt.NDArray
            1-D array of mean predictions, one per input configuration.

        Raises
        ------
        NotImplementedError
            If a non-None `context` or a non-identity space adapter is used.
        RuntimeError
            If the surrogate model has not been trained yet (i.e., before all
            initial design points have been evaluated).
        """
        from smac.utils.configspace import convert_configurations_to_array  # pylint: disable=import-outside-toplevel

        if context is not None:
            raise NotImplementedError()
        if self._space_adapter and not isinstance(self._space_adapter, IdentityAdapter):
            raise NotImplementedError()

        # pylint: disable=protected-access
        if len(self._observations) <= self.base_optimizer._initial_design._n_configs:
            raise RuntimeError(
                'Surrogate model can make predictions *only* after all initial points have been evaluated ' +
                f'{len(self._observations)} <= {self.base_optimizer._initial_design._n_configs}')
        if self.base_optimizer._config_selector._model is None:
            raise RuntimeError('Surrogate model is not yet trained')

        configs: npt.NDArray = convert_configurations_to_array(self._to_configspace_configs(configurations))
        mean_predictions, _ = self.base_optimizer._config_selector._model.predict(configs)
        return mean_predictions.reshape(-1,)

    def acquisition_function(self, configurations: pd.DataFrame, context: Optional[pd.DataFrame] = None) -> npt.NDArray:
        """Evaluates the acquisition function on the given configurations.

        Parameters
        ----------
        configurations : pd.DataFrame
            Dataframe of configurations / parameters. The columns are parameter names and the rows are the configurations.

        context : pd.DataFrame
            Not Yet Implemented.

        Returns
        -------
        npt.NDArray
            1-D array of acquisition function values, one per input configuration.

        Raises
        ------
        NotImplementedError
            If a non-None `context` or a space adapter is used.
        RuntimeError
            If the acquisition function has not been initialized yet.
        """
        if context is not None:
            raise NotImplementedError()
        if self._space_adapter:
            raise NotImplementedError()

        # pylint: disable=protected-access
        if self.base_optimizer._config_selector._acquisition_function is None:
            raise RuntimeError('Acquisition function is not yet initialized')

        configs: list = self._to_configspace_configs(configurations)
        return self.base_optimizer._config_selector._acquisition_function(configs).reshape(-1,)

    def cleanup(self) -> None:
        # Remove the temporary output directory, if we created one.
        # Idempotent: safe to call multiple times (also invoked from __del__).
        if self._temp_output_directory is not None:
            self._temp_output_directory.cleanup()
            self._temp_output_directory = None

    def _to_configspace_configs(self, configurations: pd.DataFrame) -> List[ConfigSpace.Configuration]:
        """Convert a dataframe of configurations to a list of ConfigSpace configurations.

        Parameters
        ----------
        configurations : pd.DataFrame
            Dataframe of configurations / parameters. The columns are parameter names and the rows are the configurations.

        Returns
        -------
        configurations : list
            List of ConfigSpace configurations.
        """
        # astype('O') avoids numpy scalar types leaking into the config values.
        return [
            ConfigSpace.Configuration(self.optimizer_parameter_space, values=config.to_dict())
            for (_, config) in configurations.astype('O').iterrows()
        ]