Coverage for mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py: 87%

108 statements  

« prev     ^ index     » next       coverage.py v7.6.7, created at 2024-11-22 01:18 +0000

1# 

2# Copyright (c) Microsoft Corporation. 

3# Licensed under the MIT License. 

4# 

5""" 

6Contains the wrapper class for the :py:class:`.SmacOptimizer`. 

7 

8Notes 

9----- 

10See the `SMAC3 Documentation <https://automl.github.io/SMAC3/main/index.html>`_ for 

11more details. 

12""" 

13 

14from logging import warning 

15from pathlib import Path 

16from tempfile import TemporaryDirectory 

17from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union 

18from warnings import warn 

19 

20import ConfigSpace 

21import numpy.typing as npt 

22import pandas as pd 

23 

24from mlos_core.optimizers.bayesian_optimizers.bayesian_optimizer import ( 

25 BaseBayesianOptimizer, 

26) 

27from mlos_core.spaces.adapters.adapter import BaseSpaceAdapter 

28from mlos_core.spaces.adapters.identity_adapter import IdentityAdapter 

29from mlos_core.util import drop_nulls 

30 

31 

32class SmacOptimizer(BaseBayesianOptimizer): 

33 """Wrapper class for SMAC based Bayesian optimization.""" 

34 

35 def __init__( 

36 self, 

37 *, # pylint: disable=too-many-locals,too-many-arguments 

38 parameter_space: ConfigSpace.ConfigurationSpace, 

39 optimization_targets: List[str], 

40 objective_weights: Optional[List[float]] = None, 

41 space_adapter: Optional[BaseSpaceAdapter] = None, 

42 seed: Optional[int] = 0, 

43 run_name: Optional[str] = None, 

44 output_directory: Optional[str] = None, 

45 max_trials: int = 100, 

46 n_random_init: Optional[int] = None, 

47 max_ratio: Optional[float] = None, 

48 use_default_config: bool = False, 

49 n_random_probability: float = 0.1, 

50 ): 

51 """ 

52 Instantiate a new SMAC optimizer wrapper. 

53 

54 Parameters 

55 ---------- 

56 parameter_space : ConfigSpace.ConfigurationSpace 

57 The parameter space to optimize. 

58 

59 optimization_targets : List[str] 

60 The names of the optimization targets to minimize. 

61 

62 objective_weights : Optional[List[float]] 

63 Optional list of weights of optimization targets. 

64 

65 space_adapter : BaseSpaceAdapter 

66 The space adapter class to employ for parameter space transformations. 

67 

68 seed : Optional[int] 

69 By default SMAC uses a known seed (0) to keep results reproducible. 

70 However, if a `None` seed is explicitly provided, we let a random seed 

71 be produced by SMAC. 

72 

73 run_name : Optional[str] 

74 Name of this run. This is used to easily distinguish across different runs. 

75 If set to `None` (default), SMAC will generate a hash from metadata. 

76 

77 output_directory : Optional[str] 

78 The directory where SMAC output will saved. If set to `None` (default), 

79 a temporary dir will be used. 

80 

81 max_trials : int 

82 Maximum number of trials (i.e., function evaluations) to be run. Defaults to 100. 

83 Note that modifying this value directly affects the value of 

84 `n_random_init`, if latter is set to `None`. 

85 

86 n_random_init : Optional[int] 

87 Number of points evaluated at start to bootstrap the optimizer. 

88 Default depends on max_trials and number of parameters and max_ratio. 

89 Note: it can sometimes be useful to set this to 1 when pre-warming the 

90 optimizer from historical data. See Also: 

91 :py:meth:`mlos_bench.optimizers.base_optimizer.Optimizer.bulk_register` 

92 

93 max_ratio : Optional[int] 

94 Maximum ratio of max_trials to be random configs to be evaluated 

95 at start to bootstrap the optimizer. 

96 Useful if you want to explicitly control the number of random 

97 configs evaluated at start. 

98 

99 use_default_config : bool 

100 Whether to use the default config for the first trial after random initialization. 

101 

102 n_random_probability : float 

103 Probability of choosing to evaluate a random configuration during optimization. 

104 Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. 

105 """ 

106 super().__init__( 

107 parameter_space=parameter_space, 

108 optimization_targets=optimization_targets, 

109 objective_weights=objective_weights, 

110 space_adapter=space_adapter, 

111 ) 

112 

113 # Declare at the top because we need it in __del__/cleanup() 

114 self._temp_output_directory: Optional[TemporaryDirectory] = None 

115 

116 # pylint: disable=import-outside-toplevel 

117 from smac import HyperparameterOptimizationFacade as Optimizer_Smac 

118 from smac import Scenario 

119 from smac.intensifier.abstract_intensifier import AbstractIntensifier 

120 from smac.main.config_selector import ConfigSelector 

121 from smac.random_design.probability_design import ProbabilityRandomDesign 

122 from smac.runhistory import TrialInfo 

123 

124 # Store for TrialInfo instances returned by .ask() 

125 self.trial_info_map: Dict[ConfigSpace.Configuration, TrialInfo] = {} 

126 

127 # The default when not specified is to use a known seed (0) to keep results reproducible. 

128 # However, if a `None` seed is explicitly provided, we let a random seed be 

129 # produced by SMAC. 

130 # https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario 

131 seed = -1 if seed is None else seed 

132 

133 # Create temporary directory for SMAC output (if none provided) 

134 if output_directory is None: 

135 # pylint: disable=consider-using-with 

136 try: 

137 # Argument added in Python 3.10 

138 self._temp_output_directory = TemporaryDirectory(ignore_cleanup_errors=True) 

139 except TypeError: 

140 self._temp_output_directory = TemporaryDirectory() 

141 output_directory = self._temp_output_directory.name 

142 assert output_directory is not None 

143 

144 if n_random_init is not None: 

145 assert isinstance(n_random_init, int) and n_random_init >= 0 

146 if n_random_init == max_trials and use_default_config: 

147 # Increase max budgeted trials to account for use_default_config. 

148 max_trials += 1 

149 

150 scenario: Scenario = Scenario( 

151 self.optimizer_parameter_space, 

152 objectives=self._optimization_targets, 

153 name=run_name, 

154 output_directory=Path(output_directory), 

155 deterministic=True, 

156 use_default_config=use_default_config, 

157 n_trials=max_trials, 

158 seed=seed or -1, # if -1, SMAC will generate a random seed internally 

159 n_workers=1, # Use a single thread for evaluating trials 

160 ) 

161 intensifier: AbstractIntensifier = Optimizer_Smac.get_intensifier( 

162 scenario, 

163 max_config_calls=1, 

164 ) 

165 config_selector: ConfigSelector = Optimizer_Smac.get_config_selector( 

166 scenario, 

167 retrain_after=1, 

168 ) 

169 

170 # TODO: When bulk registering prior configs to rewarm the optimizer, 

171 # there is a way to inform SMAC's initial design that we have 

172 # additional_configs and can set n_configs == 0. 

173 # Additionally, we may want to consider encoding those values into the 

174 # runhistory when prewarming the optimizer so that the initial design 

175 # doesn't reperform random init. 

176 # See Also: #488 

177 

178 initial_design_args: Dict[str, Union[list, int, float, Scenario]] = { 

179 "scenario": scenario, 

180 # Workaround a bug in SMAC that sets a default arg to a mutable 

181 # value that can cause issues when multiple optimizers are 

182 # instantiated with the use_default_config option within the same 

183 # process that use different ConfigSpaces so that the second 

184 # receives the default config from both as an additional config. 

185 "additional_configs": [], 

186 } 

187 if n_random_init is not None: 

188 initial_design_args["n_configs"] = n_random_init 

189 if n_random_init > 0.25 * max_trials and max_ratio is None: 

190 warning( 

191 "Number of random initial configs (%d) is " 

192 + "greater than 25%% of max_trials (%d). " 

193 + "Consider setting max_ratio to avoid SMAC overriding n_random_init.", 

194 n_random_init, 

195 max_trials, 

196 ) 

197 if max_ratio is not None: 

198 assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0 

199 initial_design_args["max_ratio"] = max_ratio 

200 self._max_ratio = max_ratio 

201 

202 # Use the default InitialDesign from SMAC. 

203 # (currently SBOL instead of LatinHypercube due to better uniformity 

204 # for initial sampling which results in lower overall samples required) 

205 initial_design = Optimizer_Smac.get_initial_design( 

206 **initial_design_args, # type: ignore[arg-type] 

207 ) 

208 # initial_design = LatinHypercubeInitialDesign( 

209 # **initial_design_args, # type: ignore[arg-type] 

210 # ) 

211 

212 # Workaround a bug in SMAC that doesn't pass the seed to the random 

213 # design when generated a random_design for itself via the 

214 # get_random_design static method when random_design is None. 

215 assert isinstance(n_random_probability, float) and n_random_probability >= 0 

216 random_design = ProbabilityRandomDesign( 

217 probability=n_random_probability, 

218 seed=scenario.seed, 

219 ) 

220 

221 self.base_optimizer = Optimizer_Smac( 

222 scenario, 

223 SmacOptimizer._dummy_target_func, 

224 initial_design=initial_design, 

225 intensifier=intensifier, 

226 random_design=random_design, 

227 config_selector=config_selector, 

228 multi_objective_algorithm=Optimizer_Smac.get_multi_objective_algorithm( 

229 scenario, 

230 objective_weights=self._objective_weights, 

231 ), 

232 overwrite=True, 

233 logging_level=False, # Use the existing logger 

234 ) 

235 

236 def __del__(self) -> None: 

237 # Best-effort attempt to clean up, in case the user forgets to call .cleanup() 

238 self.cleanup() 

239 

240 @property 

241 def max_ratio(self) -> Optional[float]: 

242 """ 

243 Gets the `max_ratio` parameter used in py:meth:`constructor <.__init__>` of this 

244 SmacOptimizer. 

245 

246 Returns 

247 ------- 

248 float 

249 """ 

250 return self._max_ratio 

251 

252 @property 

253 def n_random_init(self) -> int: 

254 """ 

255 Gets the number of random samples to use to initialize the optimizer's search 

256 space sampling. 

257 

258 Note: This may not be equal to the value passed to the initializer, due to 

259 logic present in the SMAC. 

260 

261 See Also 

262 -------- 

263 :py:attr:`.max_ratio` 

264 

265 Returns 

266 ------- 

267 int 

268 The number of random samples used to initialize the optimizer's search space sampling. 

269 """ 

270 # pylint: disable=protected-access 

271 return self.base_optimizer._initial_design._n_configs 

272 

273 @staticmethod 

274 def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None: 

275 """ 

276 Dummy target function for SMAC optimizer. 

277 

278 Since we only use the ask-and-tell interface, this is never called. 

279 

280 Parameters 

281 ---------- 

282 config : ConfigSpace.Configuration 

283 Configuration to evaluate. 

284 

285 seed : int 

286 Random seed to use for the target function. Not actually used. 

287 """ 

288 # NOTE: Providing a target function when using the ask-and-tell interface is 

289 # an imperfection of the API -- this is planned to be fixed in some future 

290 # release: https://github.com/automl/SMAC3/issues/946 

291 raise RuntimeError("This function should never be called.") 

292 

293 def _register( 

294 self, 

295 *, 

296 configs: pd.DataFrame, 

297 scores: pd.DataFrame, 

298 context: Optional[pd.DataFrame] = None, 

299 metadata: Optional[pd.DataFrame] = None, 

300 ) -> None: 

301 """ 

302 Registers the given configs and scores. 

303 

304 Parameters 

305 ---------- 

306 configs : pd.DataFrame 

307 Dataframe of configs / parameters. The columns are parameter names and 

308 the rows are the configs. 

309 

310 scores : pd.DataFrame 

311 Scores from running the configs. The index is the same as the index of 

312 the configs. 

313 

314 context : pd.DataFrame 

315 Not Yet Implemented. 

316 

317 metadata: pd.DataFrame 

318 Not Yet Implemented. 

319 """ 

320 from smac.runhistory import ( # pylint: disable=import-outside-toplevel 

321 StatusType, 

322 TrialInfo, 

323 TrialValue, 

324 ) 

325 

326 if context is not None: 

327 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

328 

329 # Register each trial (one-by-one) 

330 for config, (_i, score) in zip( 

331 self._to_configspace_configs(configs=configs), scores.iterrows() 

332 ): 

333 # Retrieve previously generated TrialInfo (returned by .ask()) or create 

334 # new TrialInfo instance 

335 info: TrialInfo = self.trial_info_map.get( 

336 config, 

337 TrialInfo(config=config, seed=self.base_optimizer.scenario.seed), 

338 ) 

339 value = TrialValue(cost=list(score.astype(float)), time=0.0, status=StatusType.SUCCESS) 

340 self.base_optimizer.tell(info, value, save=False) 

341 

342 # Save optimizer once we register all configs 

343 self.base_optimizer.optimizer.save() 

344 

345 def _suggest( 

346 self, 

347 *, 

348 context: Optional[pd.DataFrame] = None, 

349 ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: 

350 """ 

351 Suggests a new configuration. 

352 

353 Parameters 

354 ---------- 

355 context : pd.DataFrame 

356 Not Yet Implemented. 

357 

358 Returns 

359 ------- 

360 configuration : pd.DataFrame 

361 Pandas dataframe with a single row. Column names are the parameter names. 

362 

363 metadata : Optional[pd.DataFrame] 

364 Not yet implemented. 

365 """ 

366 if TYPE_CHECKING: 

367 # pylint: disable=import-outside-toplevel,unused-import 

368 from smac.runhistory import TrialInfo 

369 

370 if context is not None: 

371 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

372 

373 trial: TrialInfo = self.base_optimizer.ask() 

374 trial.config.check_valid_configuration() 

375 ConfigSpace.Configuration( 

376 self.optimizer_parameter_space, 

377 values=trial.config, 

378 ).check_valid_configuration() 

379 assert trial.config.config_space == self.optimizer_parameter_space 

380 self.trial_info_map[trial.config] = trial 

381 config_df = pd.DataFrame( 

382 [trial.config], columns=list(self.optimizer_parameter_space.keys()) 

383 ) 

384 return config_df, None 

385 

386 def register_pending( 

387 self, 

388 *, 

389 configs: pd.DataFrame, 

390 context: Optional[pd.DataFrame] = None, 

391 metadata: Optional[pd.DataFrame] = None, 

392 ) -> None: 

393 raise NotImplementedError() 

394 

395 def surrogate_predict( 

396 self, 

397 *, 

398 configs: pd.DataFrame, 

399 context: Optional[pd.DataFrame] = None, 

400 ) -> npt.NDArray: 

401 # pylint: disable=import-outside-toplevel 

402 from smac.utils.configspace import convert_configurations_to_array 

403 

404 if context is not None: 

405 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

406 if self._space_adapter and not isinstance(self._space_adapter, IdentityAdapter): 

407 raise NotImplementedError("Space adapter not supported for surrogate_predict.") 

408 

409 # pylint: disable=protected-access 

410 if len(self._observations) <= self.base_optimizer._initial_design._n_configs: 

411 raise RuntimeError( 

412 "Surrogate model can make predictions *only* after " 

413 "all initial points have been evaluated " 

414 f"{len(self._observations)} <= {self.base_optimizer._initial_design._n_configs}" 

415 ) 

416 if self.base_optimizer._config_selector._model is None: 

417 raise RuntimeError("Surrogate model is not yet trained") 

418 

419 config_array: npt.NDArray = convert_configurations_to_array( 

420 self._to_configspace_configs(configs=configs) 

421 ) 

422 mean_predictions, _ = self.base_optimizer._config_selector._model.predict(config_array) 

423 return mean_predictions.reshape( 

424 -1, 

425 ) 

426 

427 def acquisition_function( 

428 self, 

429 *, 

430 configs: pd.DataFrame, 

431 context: Optional[pd.DataFrame] = None, 

432 ) -> npt.NDArray: 

433 if context is not None: 

434 warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) 

435 if self._space_adapter: 

436 raise NotImplementedError() 

437 

438 # pylint: disable=protected-access 

439 if self.base_optimizer._config_selector._acquisition_function is None: 

440 raise RuntimeError("Acquisition function is not yet initialized") 

441 

442 cs_configs: list = self._to_configspace_configs(configs=configs) 

443 return self.base_optimizer._config_selector._acquisition_function(cs_configs).reshape( 

444 -1, 

445 ) 

446 

447 def cleanup(self) -> None: 

448 if hasattr(self, "_temp_output_directory") and self._temp_output_directory is not None: 

449 self._temp_output_directory.cleanup() 

450 self._temp_output_directory = None 

451 

452 def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace.Configuration]: 

453 """ 

454 Convert a dataframe of configs to a list of ConfigSpace configs. 

455 

456 Parameters 

457 ---------- 

458 configs : pd.DataFrame 

459 Dataframe of configs / parameters. The columns are parameter names and 

460 the rows are the configs. 

461 

462 Returns 

463 ------- 

464 configs : list 

465 List of ConfigSpace configs. 

466 """ 

467 return [ 

468 ConfigSpace.Configuration( 

469 self.optimizer_parameter_space, 

470 # Remove None values for inactive parameters 

471 values=drop_nulls(config.to_dict()), 

472 allow_inactive_with_values=False, 

473 ) 

474 for (_, config) in configs.astype("O").iterrows() 

475 ]