Coverage for mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py: 88%

98 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-06 00:35 +0000

1# 

2# Copyright (c) Microsoft Corporation. 

3# Licensed under the MIT License. 

4# 

5""" 

6Contains the wrapper class for SMAC Bayesian optimizers. 

7See Also: <https://automl.github.io/SMAC3/main/index.html> 

8""" 

9 

10from logging import warning 

11from pathlib import Path 

12from typing import Dict, List, Optional, Union, TYPE_CHECKING 

13from tempfile import TemporaryDirectory 

14 

15import ConfigSpace 

16import numpy.typing as npt 

17import pandas as pd 

18 

19from mlos_core.optimizers.bayesian_optimizers.bayesian_optimizer import BaseBayesianOptimizer 

20from mlos_core.spaces.adapters.adapter import BaseSpaceAdapter 

21from mlos_core.spaces.adapters.identity_adapter import IdentityAdapter 

22 

23 

class SmacOptimizer(BaseBayesianOptimizer):
    """Wrapper class for SMAC based Bayesian optimization.

    Parameters
    ----------
    parameter_space : ConfigSpace.ConfigurationSpace
        The parameter space to optimize.

    space_adapter : BaseSpaceAdapter
        The space adapter class to employ for parameter space transformations.

    seed : Optional[int]
        By default SMAC uses a known seed (0) to keep results reproducible.
        However, if a `None` seed is explicitly provided, we let a random seed be produced by SMAC.

    run_name : Optional[str]
        Name of this run. This is used to easily distinguish across different runs.
        If set to `None` (default), SMAC will generate a hash from metadata.

    output_directory : Optional[str]
        The directory where SMAC output will be saved. If set to `None` (default), a temporary dir will be used.

    max_trials : int
        Maximum number of trials (i.e., function evaluations) to be run. Defaults to 100.
        Note that modifying this value directly affects the value of `n_random_init`, if latter is set to `None`.

    n_random_init : Optional[int]
        Number of points evaluated at start to bootstrap the optimizer.
        Default depends on max_trials and number of parameters and max_ratio.
        Note: it can sometimes be useful to set this to 1 when pre-warming the
        optimizer from historical data.
        See Also: mlos_bench.optimizer.bulk_register

    max_ratio : Optional[int]
        Maximum ratio of max_trials to be random configurations to be evaluated
        at start to bootstrap the optimizer.
        Useful if you want to explicitly control the number of random
        configurations evaluated at start.

    use_default_config: bool
        Whether to use the default config for the first trial after random initialization.

    n_random_probability: float
        Probability of choosing to evaluate a random configuration during optimization.
        Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation.
    """

    def __init__(self, *,  # pylint: disable=too-many-locals
                 parameter_space: ConfigSpace.ConfigurationSpace,
                 space_adapter: Optional[BaseSpaceAdapter] = None,
                 seed: Optional[int] = 0,
                 run_name: Optional[str] = None,
                 output_directory: Optional[str] = None,
                 max_trials: int = 100,
                 n_random_init: Optional[int] = None,
                 max_ratio: Optional[float] = None,
                 use_default_config: bool = False,
                 n_random_probability: float = 0.1):

        super().__init__(
            parameter_space=parameter_space,
            space_adapter=space_adapter,
        )

        # Declare at the top because we need it in __del__/cleanup()
        self._temp_output_directory: Optional[TemporaryDirectory] = None

        # pylint: disable=import-outside-toplevel
        from smac import HyperparameterOptimizationFacade as Optimizer_Smac
        from smac import Scenario
        from smac.intensifier.abstract_intensifier import AbstractIntensifier
        from smac.main.config_selector import ConfigSelector
        from smac.random_design.probability_design import ProbabilityRandomDesign
        from smac.runhistory import TrialInfo

        # Store for TrialInfo instances returned by .ask()
        self.trial_info_map: Dict[ConfigSpace.Configuration, TrialInfo] = {}

        # The default when not specified is to use a known seed (0) to keep results reproducible.
        # However, if a `None` seed is explicitly provided, we let a random seed be produced by SMAC.
        # https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario
        seed = -1 if seed is None else seed

        # Create temporary directory for SMAC output (if none provided)
        if output_directory is None:
            # pylint: disable=consider-using-with
            try:
                self._temp_output_directory = TemporaryDirectory(ignore_cleanup_errors=True)  # Argument added in Python 3.10
            except TypeError:
                self._temp_output_directory = TemporaryDirectory()
            output_directory = self._temp_output_directory.name

        if n_random_init is not None:
            assert isinstance(n_random_init, int) and n_random_init >= 0
            if n_random_init == max_trials and use_default_config:
                # Increase max budgeted trials to account for use_default_config.
                max_trials += 1

        scenario: Scenario = Scenario(
            self.optimizer_parameter_space,
            name=run_name,
            output_directory=Path(output_directory),
            deterministic=True,
            use_default_config=use_default_config,
            n_trials=max_trials,
            # BUGFIX: previously `seed or -1`, which silently coerced an explicit
            # (and the documented default) seed of 0 to -1, causing SMAC to pick
            # a random seed and breaking reproducibility. `None` has already
            # been mapped to -1 above, so pass the seed through unchanged.
            seed=seed,
            n_workers=1,  # Use a single thread for evaluating trials
        )
        intensifier: AbstractIntensifier = Optimizer_Smac.get_intensifier(scenario, max_config_calls=1)
        config_selector: ConfigSelector = Optimizer_Smac.get_config_selector(scenario, retrain_after=1)

        # TODO: When bulk registering prior configs to rewarm the optimizer,
        # there is a way to inform SMAC's initial design that we have
        # additional_configs and can set n_configs == 0.
        # Additionally, we may want to consider encoding those values into the
        # runhistory when prewarming the optimizer so that the initial design
        # doesn't reperform random init.
        # See Also: #488

        initial_design_args: Dict[str, Union[list, int, float, Scenario]] = {
            'scenario': scenario,
            # Workaround a bug in SMAC that sets a default arg to a mutable
            # value that can cause issues when multiple optimizers are
            # instantiated with the use_default_config option within the same
            # process that use different ConfigSpaces so that the second
            # receives the default config from both as an additional config.
            'additional_configs': []
        }
        if n_random_init is not None:
            initial_design_args['n_configs'] = n_random_init
            if n_random_init > 0.25 * max_trials and max_ratio is None:
                warning(
                    'Number of random initial configurations (%d) is ' +
                    'greater than 25%% of max_trials (%d). ' +
                    'Consider setting max_ratio to avoid SMAC overriding n_random_init.',
                    n_random_init,
                    max_trials,
                )
            if max_ratio is not None:
                assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0
                initial_design_args['max_ratio'] = max_ratio

        # Use the default InitialDesign from SMAC.
        # (currently SBOL instead of LatinHypercube due to better uniformity
        # for initial sampling which results in lower overall samples required)
        initial_design = Optimizer_Smac.get_initial_design(**initial_design_args)  # type: ignore[arg-type]
        # initial_design = LatinHypercubeInitialDesign(**initial_design_args)  # type: ignore[arg-type]

        # Workaround a bug in SMAC that doesn't pass the seed to the random
        # design when generated a random_design for itself via the
        # get_random_design static method when random_design is None.
        # NOTE(review): only a lower bound is asserted here; values > 1.0 are
        # accepted and left for SMAC to interpret — confirm whether an upper
        # bound of 1.0 should be enforced.
        assert isinstance(n_random_probability, float) and n_random_probability >= 0
        random_design = ProbabilityRandomDesign(probability=n_random_probability, seed=scenario.seed)

        self.base_optimizer = Optimizer_Smac(
            scenario,
            SmacOptimizer._dummy_target_func,
            initial_design=initial_design,
            intensifier=intensifier,
            random_design=random_design,
            config_selector=config_selector,
            overwrite=True,
            logging_level=False,  # Use the existing logger
        )

188 

189 def __del__(self) -> None: 

190 # Best-effort attempt to clean up, in case the user forgets to call .cleanup() 

191 self.cleanup() 

192 

193 @property 

194 def n_random_init(self) -> int: 

195 """ 

196 Gets the number of random samples to use to initialize the optimizer's search space sampling. 

197 

198 Note: This may not be equal to the value passed to the initializer, due to logic present in the SMAC. 

199 See Also: max_ratio 

200 

201 Returns 

202 ------- 

203 int 

204 The number of random samples used to initialize the optimizer's search space sampling. 

205 """ 

206 # pylint: disable=protected-access 

207 return self.base_optimizer._initial_design._n_configs 

208 

209 @staticmethod 

210 def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None: 

211 """Dummy target function for SMAC optimizer. 

212 

213 Since we only use the ask-and-tell interface, this is never called. 

214 

215 Parameters 

216 ---------- 

217 config : ConfigSpace.Configuration 

218 Configuration to evaluate. 

219 

220 seed : int 

221 Random seed to use for the target function. Not actually used. 

222 """ 

223 # NOTE: Providing a target function when using the ask-and-tell interface is an imperfection of the API 

224 # -- this planned to be fixed in some future release: https://github.com/automl/SMAC3/issues/946 

225 raise RuntimeError('This function should never be called.') 

226 

227 def _register(self, configurations: pd.DataFrame, scores: pd.Series, context: Optional[pd.DataFrame] = None) -> None: 

228 """Registers the given configurations and scores. 

229 

230 Parameters 

231 ---------- 

232 configurations : pd.DataFrame 

233 Dataframe of configurations / parameters. The columns are parameter names and the rows are the configurations. 

234 

235 scores : pd.Series 

236 Scores from running the configurations. The index is the same as the index of the configurations. 

237 

238 context : pd.DataFrame 

239 Not Yet Implemented. 

240 """ 

241 from smac.runhistory import StatusType, TrialInfo, TrialValue # pylint: disable=import-outside-toplevel 

242 

243 if context is not None: 

244 raise NotImplementedError() 

245 

246 # Register each trial (one-by-one) 

247 for config, score in zip(self._to_configspace_configs(configurations), scores.tolist()): 

248 # Retrieve previously generated TrialInfo (returned by .ask()) or create new TrialInfo instance 

249 info: TrialInfo = self.trial_info_map.get(config, TrialInfo(config=config, seed=self.base_optimizer.scenario.seed)) 

250 value: TrialValue = TrialValue(cost=score, time=0.0, status=StatusType.SUCCESS) 

251 self.base_optimizer.tell(info, value, save=False) 

252 

253 # Save optimizer once we register all configs 

254 self.base_optimizer.optimizer.save() 

255 

256 def _suggest(self, context: Optional[pd.DataFrame] = None) -> pd.DataFrame: 

257 """Suggests a new configuration. 

258 

259 Parameters 

260 ---------- 

261 context : pd.DataFrame 

262 Not Yet Implemented. 

263 

264 Returns 

265 ------- 

266 configuration : pd.DataFrame 

267 Pandas dataframe with a single row. Column names are the parameter names. 

268 """ 

269 if TYPE_CHECKING: 

270 from smac.runhistory import TrialInfo # pylint: disable=import-outside-toplevel 

271 

272 if context is not None: 

273 raise NotImplementedError() 

274 

275 trial: TrialInfo = self.base_optimizer.ask() 

276 trial.config.is_valid_configuration() 

277 self.optimizer_parameter_space.check_configuration(trial.config) 

278 assert trial.config.config_space == self.optimizer_parameter_space 

279 self.trial_info_map[trial.config] = trial 

280 config_df = pd.DataFrame([trial.config], columns=list(self.optimizer_parameter_space.keys())) 

281 return config_df 

282 

283 def register_pending(self, configurations: pd.DataFrame, context: Optional[pd.DataFrame] = None) -> None: 

284 raise NotImplementedError() 

285 

286 def surrogate_predict(self, configurations: pd.DataFrame, context: Optional[pd.DataFrame] = None) -> npt.NDArray: 

287 from smac.utils.configspace import convert_configurations_to_array # pylint: disable=import-outside-toplevel 

288 

289 if context is not None: 

290 raise NotImplementedError() 

291 if self._space_adapter and not isinstance(self._space_adapter, IdentityAdapter): 

292 raise NotImplementedError() 

293 

294 # pylint: disable=protected-access 

295 if len(self._observations) <= self.base_optimizer._initial_design._n_configs: 

296 raise RuntimeError( 

297 'Surrogate model can make predictions *only* after all initial points have been evaluated ' + 

298 f'{len(self._observations)} <= {self.base_optimizer._initial_design._n_configs}') 

299 if self.base_optimizer._config_selector._model is None: 

300 raise RuntimeError('Surrogate model is not yet trained') 

301 

302 configs: npt.NDArray = convert_configurations_to_array(self._to_configspace_configs(configurations)) 

303 mean_predictions, _ = self.base_optimizer._config_selector._model.predict(configs) 

304 return mean_predictions.reshape(-1,) 

305 

306 def acquisition_function(self, configurations: pd.DataFrame, context: Optional[pd.DataFrame] = None) -> npt.NDArray: 

307 if context is not None: 

308 raise NotImplementedError() 

309 if self._space_adapter: 

310 raise NotImplementedError() 

311 

312 # pylint: disable=protected-access 

313 if self.base_optimizer._config_selector._acquisition_function is None: 

314 raise RuntimeError('Acquisition function is not yet initialized') 

315 

316 configs: list = self._to_configspace_configs(configurations) 

317 return self.base_optimizer._config_selector._acquisition_function(configs).reshape(-1,) 

318 

319 def cleanup(self) -> None: 

320 if self._temp_output_directory is not None: 

321 self._temp_output_directory.cleanup() 

322 self._temp_output_directory = None 

323 

324 def _to_configspace_configs(self, configurations: pd.DataFrame) -> List[ConfigSpace.Configuration]: 

325 """Convert a dataframe of configurations to a list of ConfigSpace configurations. 

326 

327 Parameters 

328 ---------- 

329 configurations : pd.DataFrame 

330 Dataframe of configurations / parameters. The columns are parameter names and the rows are the configurations. 

331 

332 Returns 

333 ------- 

334 configurations : list 

335 List of ConfigSpace configurations. 

336 """ 

337 return [ 

338 ConfigSpace.Configuration(self.optimizer_parameter_space, values=config.to_dict()) 

339 for (_, config) in configurations.astype('O').iterrows() 

340 ]