Coverage for mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py: 87%

111 statements  

#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Contains the wrapper class for the :py:class:`.SmacOptimizer`.

Notes
-----
See the `SMAC3 Documentation <https://automl.github.io/SMAC3/main/index.html>`_ for
more details.
"""

from logging import warning
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from warnings import warn

import ConfigSpace
import numpy.typing as npt
import pandas as pd

from mlos_core.data_classes import Observation, Observations, Suggestion
from mlos_core.optimizers.bayesian_optimizers.bayesian_optimizer import (
    BaseBayesianOptimizer,
)
from mlos_core.spaces.adapters.adapter import BaseSpaceAdapter
from mlos_core.spaces.adapters.identity_adapter import IdentityAdapter


class SmacOptimizer(BaseBayesianOptimizer):
    """Wrapper class for SMAC based Bayesian optimization."""

    def __init__(
        self,
        *,  # pylint: disable=too-many-locals,too-many-arguments
        parameter_space: ConfigSpace.ConfigurationSpace,
        optimization_targets: List[str],
        objective_weights: Optional[List[float]] = None,
        space_adapter: Optional[BaseSpaceAdapter] = None,
        seed: Optional[int] = 0,
        run_name: Optional[str] = None,
        output_directory: Optional[str] = None,
        max_trials: int = 100,
        n_random_init: Optional[int] = None,
        max_ratio: Optional[float] = None,
        use_default_config: bool = False,
        n_random_probability: float = 0.1,
    ):
51 """ 

52 Instantiate a new SMAC optimizer wrapper. 

53 

54 Parameters 

55 ---------- 

56 parameter_space : ConfigSpace.ConfigurationSpace 

57 The parameter space to optimize. 

58 

59 optimization_targets : List[str] 

60 The names of the optimization targets to minimize. 

61 

62 objective_weights : Optional[List[float]] 

63 Optional list of weights of optimization targets. 

64 

65 space_adapter : BaseSpaceAdapter 

66 The space adapter class to employ for parameter space transformations. 

67 

68 seed : Optional[int] 

69 By default SMAC uses a known seed (0) to keep results reproducible. 

70 However, if a `None` seed is explicitly provided, we let a random seed 

71 be produced by SMAC. 

72 

73 run_name : Optional[str] 

74 Name of this run. This is used to easily distinguish across different runs. 

75 If set to `None` (default), SMAC will generate a hash from metadata. 

76 

77 output_directory : Optional[str] 

78 The directory where SMAC output will saved. If set to `None` (default), 

79 a temporary dir will be used. 

80 

81 max_trials : int 

82 Maximum number of trials (i.e., function evaluations) to be run. Defaults to 100. 

83 Note that modifying this value directly affects the value of 

84 `n_random_init`, if latter is set to `None`. 

85 

86 n_random_init : Optional[int] 

87 Number of points evaluated at start to bootstrap the optimizer. 

88 Default depends on max_trials and number of parameters and max_ratio. 

89 Note: it can sometimes be useful to set this to 1 when pre-warming the 

90 optimizer from historical data. See Also: 

91 :py:meth:`mlos_bench.optimizers.base_optimizer.Optimizer.bulk_register` 

92 

93 max_ratio : Optional[int] 

94 Maximum ratio of max_trials to be random configs to be evaluated 

95 at start to bootstrap the optimizer. 

96 Useful if you want to explicitly control the number of random 

97 configs evaluated at start. 

98 

99 use_default_config : bool 

100 Whether to use the default config for the first trial after random initialization. 

101 

102 n_random_probability : float 

103 Probability of choosing to evaluate a random configuration during optimization. 

104 Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. 

105 """ 

        super().__init__(
            parameter_space=parameter_space,
            optimization_targets=optimization_targets,
            objective_weights=objective_weights,
            space_adapter=space_adapter,
        )

        # Declare at the top because we need it in __del__/cleanup()
        self._temp_output_directory: Optional[TemporaryDirectory] = None

        # pylint: disable=import-outside-toplevel
        from smac import HyperparameterOptimizationFacade as Optimizer_Smac
        from smac import Scenario
        from smac.intensifier.abstract_intensifier import AbstractIntensifier
        from smac.main.config_selector import ConfigSelector
        from smac.random_design.probability_design import ProbabilityRandomDesign
        from smac.runhistory import TrialInfo
        from smac.utils.configspace import convert_configurations_to_array

        # Save this utility function as an instance attribute for later use
        # (and to satisfy the linter).
        self._convert_configurations_to_array = convert_configurations_to_array

        # Store for TrialInfo instances returned by .ask()
        self.trial_info_map: Dict[ConfigSpace.Configuration, TrialInfo] = {}

        # The default when not specified is to use a known seed (0) to keep
        # results reproducible.
        # However, if a `None` seed is explicitly provided, we let a random
        # seed be produced by SMAC.
        # https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario
        seed = -1 if seed is None else seed

        # Create a temporary directory for SMAC output (if none was provided).
        if output_directory is None:
            # pylint: disable=consider-using-with
            try:
                # Argument added in Python 3.10
                self._temp_output_directory = TemporaryDirectory(ignore_cleanup_errors=True)
            except TypeError:
                self._temp_output_directory = TemporaryDirectory()
            output_directory = self._temp_output_directory.name
        assert output_directory is not None

        if n_random_init is not None:
            assert isinstance(n_random_init, int) and n_random_init >= 0
            if n_random_init == max_trials and use_default_config:
                # Increase max budgeted trials to account for use_default_config.
                max_trials += 1

        scenario: Scenario = Scenario(
            self.optimizer_parameter_space,
            objectives=self._optimization_targets,
            name=run_name,
            output_directory=Path(output_directory),
            deterministic=True,
            use_default_config=use_default_config,
            n_trials=max_trials,
            # seed was already normalized to -1 above when `None` was given
            # (-1 makes SMAC generate a random seed internally); passing
            # `seed or -1` here would also turn the default seed (0) into -1,
            # defeating the documented reproducibility of the default.
            seed=seed,
            n_workers=1,  # Use a single thread for evaluating trials
        )
        intensifier: AbstractIntensifier = Optimizer_Smac.get_intensifier(
            scenario,
            max_config_calls=1,
        )
        config_selector: ConfigSelector = Optimizer_Smac.get_config_selector(
            scenario,
            retrain_after=1,
        )

        # TODO: When bulk registering prior configs to re-warm the optimizer,
        # there is a way to inform SMAC's initial design that we have
        # additional_configs and can set n_configs == 0.
        # Additionally, we may want to consider encoding those values into the
        # runhistory when pre-warming the optimizer so that the initial design
        # doesn't re-perform random init.
        # See Also: #488

        initial_design_args: Dict[str, Union[list, int, float, Scenario]] = {
            "scenario": scenario,
            # Workaround for a bug in SMAC that sets a default arg to a mutable
            # value: when multiple optimizers with different ConfigSpaces are
            # instantiated with the use_default_config option in the same
            # process, the second one would otherwise receive the default
            # configs of both spaces as additional configs.
            "additional_configs": [],
        }
        if n_random_init is not None:
            initial_design_args["n_configs"] = n_random_init
            if n_random_init > 0.25 * max_trials and max_ratio is None:
                warning(
                    "Number of random initial configs (%d) is "
                    "greater than 25%% of max_trials (%d). "
                    "Consider setting max_ratio to avoid SMAC overriding n_random_init.",
                    n_random_init,
                    max_trials,
                )
        if max_ratio is not None:
            assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0
            initial_design_args["max_ratio"] = max_ratio
        # Keep a copy for the max_ratio property (set even when None).
        self._max_ratio = max_ratio

        # Use the default InitialDesign from SMAC.
        # (currently Sobol instead of LatinHypercube, due to better uniformity
        # for initial sampling, which results in fewer overall samples required)
        initial_design = Optimizer_Smac.get_initial_design(
            **initial_design_args,  # type: ignore[arg-type]
        )
        # initial_design = LatinHypercubeInitialDesign(
        #     **initial_design_args,  # type: ignore[arg-type]
        # )

        # Workaround for a bug in SMAC that doesn't pass the seed to the random
        # design when generating a random_design for itself via the
        # get_random_design static method when random_design is None.
        assert isinstance(n_random_probability, float) and n_random_probability >= 0
        random_design = ProbabilityRandomDesign(
            probability=n_random_probability,
            seed=scenario.seed,
        )

        self.base_optimizer = Optimizer_Smac(
            scenario,
            SmacOptimizer._dummy_target_func,
            initial_design=initial_design,
            intensifier=intensifier,
            random_design=random_design,
            config_selector=config_selector,
            multi_objective_algorithm=Optimizer_Smac.get_multi_objective_algorithm(
                scenario,
                objective_weights=self._objective_weights,
            ),
            overwrite=True,
            logging_level=False,  # Use the existing logger
        )

    def __del__(self) -> None:
        # Best-effort attempt to clean up, in case the user forgets to call .cleanup()
        self.cleanup()

    @property
    def max_ratio(self) -> Optional[float]:
        """
        Gets the `max_ratio` parameter used in the :py:meth:`constructor <.__init__>`
        of this SmacOptimizer.

        Returns
        -------
        Optional[float]
        """
        return self._max_ratio

    @property
    def n_random_init(self) -> int:
        """
        Gets the number of random samples used to initialize the optimizer's
        search space sampling.

        Note: This may not be equal to the value passed to the initializer,
        due to logic present in SMAC.

        See Also
        --------
        :py:attr:`.max_ratio`

        Returns
        -------
        int
            The number of random samples used to initialize the optimizer's
            search space sampling.
        """
        # pylint: disable=protected-access
        return self.base_optimizer._initial_design._n_configs
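
    # Worked example of the interaction above (hedged; the capping logic lives
    # inside SMAC and these numbers are purely illustrative): with
    # max_trials=100 and n_random_init=40, SMAC's initial design applies its
    # default max_ratio of 0.25 and caps the random init phase at
    # int(0.25 * 100) == 25 configs, so this property would report 25, not 40.
    # Passing max_ratio=0.5 to the constructor would keep all 40.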

    @staticmethod
    def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None:
        """
        Dummy target function for SMAC optimizer.

        Since we only use the ask-and-tell interface, this is never called.

        Parameters
        ----------
        config : ConfigSpace.Configuration
            Configuration to evaluate.

        seed : int
            Random seed to use for the target function. Not actually used.
        """
        # NOTE: Providing a target function when using the ask-and-tell interface is
        # an imperfection of the API -- this is planned to be fixed in some future
        # release: https://github.com/automl/SMAC3/issues/946
        raise RuntimeError("This function should never be called.")

    def _register(
        self,
        observations: Observations,
    ) -> None:
        """
        Registers one or more config/score pairs (observations) with the
        underlying optimizer.

        Parameters
        ----------
        observations : Observations
            The set of configs/scores to register.
        """
        # TODO: Implement bulk registration.
        # (e.g., by rebuilding the base optimizer instance with all observations).
        for observation in observations:
            self._register_single(observation)
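
    # Pre-warming sketch (hedged): bulk-registering historical data by looping
    # over single observations, as _register() does above. `history_df`,
    # `param_cols`, and `target_cols` are hypothetical names for a user's
    # DataFrame of past results and its column subsets, and the Observation
    # constructor keywords are assumed from the attribute names used below:
    #
    #     for _, row in history_df.iterrows():
    #         opt.register(
    #             observations=Observation(
    #                 config=row[param_cols],
    #                 score=row[target_cols],
    #             )
    #         )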

    def _register_single(
        self,
        observation: Observation,
    ) -> None:
        """
        Registers the given config and its score.

        Parameters
        ----------
        observation : Observation
            The observation to register.
        """
        from smac.runhistory import (  # pylint: disable=import-outside-toplevel
            StatusType,
            TrialInfo,
            TrialValue,
        )

        if observation.context is not None:
            warn(
                f"Not Implemented: Ignoring context {list(observation.context.index)}",
                UserWarning,
            )

        # Retrieve the previously generated TrialInfo (returned by .ask()) or
        # create a new TrialInfo instance.
        config = ConfigSpace.Configuration(
            self.optimizer_parameter_space,
            values=observation.config.dropna().to_dict(),
        )
        info: TrialInfo = self.trial_info_map.get(
            config,
            TrialInfo(config=config, seed=self.base_optimizer.scenario.seed),
        )
        value = TrialValue(
            cost=list(observation.score.astype(float)),
            time=0.0,
            status=StatusType.SUCCESS,
        )
        self.base_optimizer.tell(info, value, save=False)

        # Save the optimizer state once the config has been registered.
        self.base_optimizer.optimizer.save()

    def _suggest(
        self,
        *,
        context: Optional[pd.Series] = None,
    ) -> Suggestion:
        """
        Suggests a new configuration.

        Parameters
        ----------
        context : Optional[pd.Series]
            Not Yet Implemented.

        Returns
        -------
        suggestion : Suggestion
            The suggestion to evaluate.
        """
        if TYPE_CHECKING:
            # pylint: disable=import-outside-toplevel,unused-import
            from smac.runhistory import TrialInfo

        if context is not None:
            warn(f"Not Implemented: Ignoring context {list(context.index)}", UserWarning)

        trial: TrialInfo = self.base_optimizer.ask()
        trial.config.check_valid_configuration()
        ConfigSpace.Configuration(
            self.optimizer_parameter_space,
            values=trial.config,
        ).check_valid_configuration()
        assert trial.config.config_space == self.optimizer_parameter_space
        self.trial_info_map[trial.config] = trial
        config_sr = pd.Series(dict(trial.config), dtype=object)
        return Suggestion(config=config_sr, context=context, metadata=None)

    def register_pending(self, pending: Suggestion) -> None:
        raise NotImplementedError()

    def surrogate_predict(self, suggestion: Suggestion) -> npt.NDArray:
        if suggestion.context is not None:
            warn(
                f"Not Implemented: Ignoring context {list(suggestion.context.index)}",
                UserWarning,
            )
        if self._space_adapter and not isinstance(self._space_adapter, IdentityAdapter):
            raise NotImplementedError("Space adapter not supported for surrogate_predict.")

        # pylint: disable=protected-access
        if len(self._observations) <= self.base_optimizer._initial_design._n_configs:
            raise RuntimeError(
                "Surrogate model can make predictions *only* after "
                "all initial points have been evaluated: "
                f"{len(self._observations)} <= {self.base_optimizer._initial_design._n_configs}"
            )
        if self.base_optimizer._config_selector._model is None:
            raise RuntimeError("Surrogate model is not yet trained")

        config_array = self._convert_configurations_to_array(
            [
                ConfigSpace.Configuration(
                    self.optimizer_parameter_space,
                    values=suggestion.config.to_dict(),
                )
            ]
        )
        mean_predictions, _ = self.base_optimizer._config_selector._model.predict(config_array)
        return mean_predictions.reshape(-1)

    def acquisition_function(self, suggestion: Suggestion) -> npt.NDArray:
        if suggestion.context is not None:
            warn(
                f"Not Implemented: Ignoring context {list(suggestion.context.index)}",
                UserWarning,
            )
        if self._space_adapter:
            raise NotImplementedError()

        # pylint: disable=protected-access
        if self.base_optimizer._config_selector._acquisition_function is None:
            raise RuntimeError("Acquisition function is not yet initialized")

        return self.base_optimizer._config_selector._acquisition_function(
            suggestion.config.config_to_configspace(self.optimizer_parameter_space)
        ).reshape(-1)

    def cleanup(self) -> None:
        if hasattr(self, "_temp_output_directory") and self._temp_output_directory is not None:
            self._temp_output_directory.cleanup()
            self._temp_output_directory = None

    def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace.Configuration]:
        """
        Convert a dataframe of configs to a list of ConfigSpace configs.

        Parameters
        ----------
        configs : pd.DataFrame
            Dataframe of configs / parameters. The columns are parameter names and
            the rows are the configs.

        Returns
        -------
        configs : list
            List of ConfigSpace configs.
        """
        return [
            ConfigSpace.Configuration(self.optimizer_parameter_space, values=config.to_dict())
            for (_, config) in configs.astype("O").iterrows()
        ]
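

# A minimal end-to-end ask-and-tell sketch (hedged). Guarded under __main__ so
# that importing this module never runs it. The hyperparameter space, the
# target name "score", and the toy quadratic objective are illustrative
# assumptions, not part of the library; the suggest()/register() calls assume
# the mlos_core BaseOptimizer public API, and the Observation constructor
# keywords mirror the attribute names used in _register_single() above.
if __name__ == "__main__":
    import ConfigSpace as CS

    _cs = CS.ConfigurationSpace(seed=1234)
    _cs.add_hyperparameter(CS.UniformFloatHyperparameter("x", lower=-5.0, upper=5.0))

    _opt = SmacOptimizer(
        parameter_space=_cs,
        optimization_targets=["score"],
        max_trials=10,
        n_random_init=3,
    )
    for _ in range(10):
        _suggestion = _opt.suggest()
        _x = float(_suggestion.config["x"])
        # Toy objective to minimize: a parabola with its optimum at x == 1.
        _score = pd.Series({"score": (_x - 1.0) ** 2})
        _opt.register(observations=Observation(config=_suggestion.config, score=_score))
    _opt.cleanup()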