Coverage for mlos_viz/mlos_viz/base.py: 90%

158 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-30 00:51 +0000

1# 

2# Copyright (c) Microsoft Corporation. 

3# Licensed under the MIT License. 

4# 

5"""Base functions for visualizing, explain, and gain insights from results.""" 

6 

7import re 

8import warnings 

9from collections.abc import Callable, Iterable 

10from importlib.metadata import version 

11from typing import Any, Literal 

12 

13import pandas 

14import seaborn as sns 

15from matplotlib import pyplot as plt 

16from packaging.version import Version 

17from pandas.api.types import is_numeric_dtype 

18from pandas.core.groupby.generic import SeriesGroupBy 

19 

20from mlos_bench.storage.base_experiment_data import ExperimentData 

21from mlos_viz.util import expand_results_data_args 

22 

# Installed seaborn version; checked in ignore_plotter_warnings() to decide whether
# a version-specific warning filter needs to be installed.
_SEABORN_VERS = Version(version("seaborn"))

24 

25 

def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> dict[str, Any]:
    """
    Assembles a smaller kwargs dict for the specified target function.

    Only the entries of ``kwargs`` whose names match one of the target's
    keyword-only parameters that have a default value are kept.

    Note: this only works with non-positional kwargs (e.g., those after a * arg).

    Parameters
    ----------
    target : Callable
        The callable whose keyword-only defaults (``__kwdefaults__``) are inspected.
    kwargs : dict
        The candidate keyword arguments to filter.

    Returns
    -------
    dict[str, Any]
        The subset of ``kwargs`` that ``target`` declares as defaulted keyword-only
        arguments. Empty if ``target`` exposes no such arguments.
    """
    # Not every callable has a __kwdefaults__ attribute (e.g., builtins or
    # functools.partial objects), so treat those as having no known kwargs
    # instead of raising AttributeError.
    kw_defaults = getattr(target, "__kwdefaults__", None) or {}
    return {kword: kwargs[kword] for kword in kw_defaults if kword in kwargs}

37 

38 

def ignore_plotter_warnings() -> None:
    """
    Suppress some annoying warnings from third-party data visualization packages by
    adding them to the warnings filter.

    Every entry collected below is registered with the "ignore" action, in the
    order listed.
    """
    ignored_warnings: list[dict[str, Any]] = [
        {"category": FutureWarning},
    ]

    if _SEABORN_VERS <= Version("0.13.1"):
        ignored_warnings.append(
            {
                "category": DeprecationWarning,
                "module": "seaborn",  # but actually comes from pandas
                "message": (
                    "is_categorical_dtype is deprecated and will be removed in a future version."
                ),
            }
        )

    ignored_warnings.append(
        # See Also: https://github.com/mwaskom/seaborn/issues/3804
        {
            "category": PendingDeprecationWarning,
            "module": "seaborn",  # but actually comes from matplotlib
            "message": (
                "vert: bool will be deprecated in a future version. "
                "Use orientation: {'vertical', 'horizontal'} instead."
            ),
        }
    )
    ignored_warnings.append(
        {
            "module": "matplotlib",
            "category": DeprecationWarning,
            "message": "'mode' parameter is deprecated and will be removed in Pillow 13",
        }
    )

    for filter_kwargs in ignored_warnings:
        warnings.filterwarnings("ignore", **filter_kwargs)

68 

69 

70def _add_groupby_desc_column( 

71 results_df: pandas.DataFrame, 

72 groupby_columns: list[str] | None = None, 

73) -> tuple[pandas.DataFrame, list[str], str]: 

74 """ 

75 Adds a group descriptor column to the results_df. 

76 

77 Parameters 

78 ---------- 

79 results_df: ExperimentData 

80 The experiment data to add the descriptor column to. 

81 groupby_columns: Optional[list[str]] 

82 """ 

83 # Compose a new groupby_column for display purposes that is the 

84 # concatenation of the min trial_id (the first one) of each config trial 

85 # group and the config_id. 

86 # Note: It's need to be a string (e.g., categorical) for boxplot and lineplot to 

87 # be on the same axis anyways. 

88 if groupby_columns is None: 

89 groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"] 

90 groupby_column = ",".join(groupby_columns) 

91 results_df[groupby_column] = ( 

92 results_df[groupby_columns].astype(str).apply(",".join, axis=1) 

93 ) # pylint: disable=unnecessary-lambda 

94 groupby_columns.append(groupby_column) 

95 return (results_df, groupby_columns, groupby_column) 

96 

97 

def augment_results_df_with_config_trial_group_stats(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    requested_result_cols: Iterable[str] | None = None,
) -> pandas.DataFrame:
    # pylint: disable=too-complex
    """
    Add a number of useful statistical measure columns to the results dataframe.

    In particular, for each numeric result, we add the following columns for each
    requested result column:

    - ".p50": the median of each config trial group results

    - ".p75": the p75 of each config trial group results

    - ".p90": the p90 of each config trial group results

    - ".p95": the p95 of each config trial group results

    - ".p99": the p99 of each config trial group results

    - ".mean": the mean of each config trial group results

    - ".stddev": the standard deviation of each config trial group results

    - ".var": the variance of each config trial group results

    - ".var_zscore": the zscore of this group (i.e., variance relative to the stddev
      of all group variances). This can be useful for filtering out outliers (e.g.,
      configs with high variance relative to others by restricting to abs < 2 to
      remove those two standard deviations from the mean variance across all config
      trial groups).

    Additionally, we add a "tunable_config_trial_group_size" column that indicates
    the number of trials using a particular config.

    Result columns that look like start/end timestamps, are not numeric, or hold
    only a single distinct value are skipped.

    Note: the "tunable_config_trial_group_size" column is also added to the input
    dataframe itself (it is assigned in place before the stats are concatenated).

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use the results_df property.
    requested_result_cols : Optional[Iterable[str]]
        Which results columns to augment, by default None to use all results columns
        that look numeric.
        Names may be given with or without the result column prefix.

    Returns
    -------
    pandas.DataFrame
        The augmented results dataframe.

    Raises
    ------
    ValueError
        If neither exp_data nor results_df is provided, or if the results contain
        one or fewer distinct tunable configs.
    """
    if results_df is None:
        if exp_data is None:
            raise ValueError("Either exp_data or results_df must be provided.")
        results_df = exp_data.results_df
    results_groups = results_df.groupby("tunable_config_id")
    if len(results_groups) <= 1:
        raise ValueError(f"Not enough data: {len(results_groups)}")

    prefix = ExperimentData.RESULT_COLUMN_PREFIX
    if requested_result_cols is None:
        result_cols = {col for col in results_df.columns if col.startswith(prefix)}
    else:
        # Materialize once: the iterable is scanned twice below, which would
        # silently yield nothing on the second pass for a one-shot iterator.
        requested = list(requested_result_cols)
        # Names that already carry the result column prefix.
        result_cols = {
            col for col in requested if col.startswith(prefix) and col in results_df.columns
        }
        # Names given without the prefix: accept them if the prefixed column exists.
        result_cols.update(
            {prefix + col for col in requested if prefix + col in results_df.columns}
        )

    def compute_zscore_for_group_agg(
        results_groups_perf: "SeriesGroupBy",
        stats_df: pandas.DataFrame,
        result_col: str,
        agg: Literal["mean"] | Literal["var"] | Literal["std"],
    ) -> None:
        """
        Add a ``{result_col}.{agg}_zscore`` column to stats_df (in place).

        Expects stats_df to already contain a ``{result_col}.{agg}`` column.
        """
        results_groups_perf_aggs = results_groups_perf.agg(agg)  # TODO: avoid recalculating?
        # Compute the zscore of the chosen aggregate performance of each group into
        # each row in the dataframe.
        stats_df[result_col + f".{agg}_mean"] = results_groups_perf_aggs.mean()
        stats_df[result_col + f".{agg}_stddev"] = results_groups_perf_aggs.std()
        stats_df[result_col + f".{agg}_zscore"] = (
            stats_df[result_col + f".{agg}"] - stats_df[result_col + f".{agg}_mean"]
        ) / stats_df[result_col + f".{agg}_stddev"]
        # Drop the intermediate columns created above for this aggregate.
        stats_df.drop(
            columns=[result_col + f".{agg}_{suffix}" for suffix in ("mean", "stddev")],
            inplace=True,
        )

    augmented_results_df = results_df
    augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform(
        "count"
    )
    quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
    # Sort so that the order of the added columns is deterministic across runs
    # (set iteration order of strings is not).
    for result_col in sorted(result_cols):
        if not result_col.startswith(prefix):
            continue
        if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
            # Ignore computing variance on things like that look like timestamps.
            continue
        if not is_numeric_dtype(results_df[result_col]):
            continue
        if results_df[result_col].unique().size == 1:
            # A constant column has no variance to speak of.
            continue
        results_groups_perf = results_groups[result_col]
        stats_df = pandas.DataFrame()
        stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
        stats_df[result_col + ".var"] = results_groups_perf.transform("var")
        stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)

        compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
        for quantile in quantiles:  # TODO: can we do this in one pass?
            quantile_col = f"{result_col}.p{int(quantile * 100)}"
            stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
        augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
    return augmented_results_df

224 

225 

def limit_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    top_n_configs: int = 10,
    method: Literal["mean", "median", "p50", "p75", "p90", "p95", "p99"] = "mean",
) -> tuple[pandas.DataFrame, list[int], dict[str, bool]]:
    # pylint: disable=too-many-locals
    """
    Utility function to process the results and determine the best performing configs
    including potential repeats to help assess variability.

    Parameters
    ----------
    exp_data : ExperimentData | None
        The ExperimentData (e.g., obtained from the storage layer) to operate on.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use
        :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Which result column(s) to use for sorting the configs, and in which
        direction ("min" or "max").
        By default None to automatically select the :py:attr:`.ExperimentData.objectives`.
    top_n_configs : int
        How many configs to return, including the default, by default 10.
    method: Literal["mean", "median", "p50", "p75", "p90", "p95", "p99"] = "mean",
        Which statistical method to use when sorting the config groups before
        determining the cutoff, by default "mean".
        "median" is accepted as an alias for "p50".

    Returns
    -------
    (top_n_config_results_df, top_n_config_ids, orderby_cols) :
        tuple[pandas.DataFrame, list[int], dict[str, bool]]
        The filtered results dataframe, the config ids, and the columns used to
        order the configs.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """
    # Do some input checking first.
    if method not in ["mean", "median", "p50", "p75", "p90", "p95", "p99"]:
        raise ValueError(f"Invalid method: {method}")
    # The augmented stats only provide a ".p50" column for the median, so map
    # the alias onto it instead of failing later on a missing ".median" column.
    stat_suffix = "p50" if method == "median" else method

    # Prepare the orderby columns.
    (results_df, objs_cols) = expand_results_data_args(
        exp_data,
        results_df=results_df,
        objectives=objectives,
    )
    assert isinstance(results_df, pandas.DataFrame)

    # Augment the results dataframe with some useful stats.
    results_df = augment_results_df_with_config_trial_group_stats(
        exp_data=exp_data,
        results_df=results_df,
        requested_result_cols=objs_cols.keys(),
    )
    # Note: mypy seems to lose its mind for some reason and keeps forgetting that
    # results_df is not None and is in fact a DataFrame, so we periodically assert
    # it in this func for now.
    assert results_df is not None
    orderby_cols: dict[str, bool] = {
        obj_col + f".{stat_suffix}": ascending for (obj_col, ascending) in objs_cols.items()
    }

    config_id_col = "tunable_config_id"
    group_id_col = "tunable_config_trial_group_id"  # first trial_id per config group
    trial_id_col = "trial_id"

    if exp_data is not None:
        default_config_id = exp_data.default_tunable_config_id
    else:
        # Without exp_data, fall back to the config used by the earliest trial.
        # Look up that trial's config id rather than using the trial id itself,
        # since the two id spaces need not coincide.
        first_trial_rows = results_df.loc[
            results_df[trial_id_col] == results_df[trial_id_col].min()
        ]
        default_config_id = first_trial_rows[config_id_col].iloc[0]
    assert default_config_id is not None, "Failed to determine default config id."

    # Filter out configs whose variance is too large.
    # But also make sure the default configs is still in the resulting dataframe
    # (for comparison purposes).
    for obj_col in objs_cols:
        assert results_df is not None
        # NOTE(review): for non-"mean" methods this mask keeps every multi-trial
        # group regardless of its variance zscore - confirm that is intended.
        if method == "mean":
            singletons_mask = results_df["tunable_config_trial_group_size"] == 1
        else:
            singletons_mask = results_df["tunable_config_trial_group_size"] > 1
        results_df = results_df.loc[
            (
                (results_df[f"{obj_col}.var_zscore"].abs() < 2)
                | (singletons_mask)
                | (results_df[config_id_col] == default_config_id)
            )
        ]
        assert results_df is not None

    # Also, filter results that are worse than the default.
    default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
    for orderby_col, ascending in orderby_cols.items():
        default_vals = default_config_results_df[orderby_col].unique()
        assert len(default_vals) == 1
        default_val = default_vals[0]
        assert results_df is not None
        if ascending:
            results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
        else:
            results_df = results_df.loc[(results_df[orderby_col] >= default_val)]

    # Now regroup and filter to the top-N configs by their group performance dimensions.
    assert results_df is not None
    orderby_col_names = list(orderby_cols)
    group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[
        orderby_col_names
    ]
    top_n_config_ids: list[int] = (
        group_results_df.sort_values(by=orderby_col_names, ascending=list(orderby_cols.values()))
        .head(top_n_configs)
        .index.tolist()
    )

    # Remove the default config if it's included. We'll add it back later.
    if default_config_id in top_n_config_ids:
        top_n_config_ids.remove(default_config_id)
    # Get just the top-n config results.
    # Sort by the group ids.
    top_n_config_results_df = results_df.loc[
        (results_df[config_id_col].isin(top_n_config_ids))
    ].sort_values([group_id_col, config_id_col, trial_id_col])
    # Place the default config at the top of the list.
    top_n_config_ids.insert(0, default_config_id)
    top_n_config_results_df = pandas.concat(
        [default_config_results_df, top_n_config_results_df],
        axis=0,
    )
    return (top_n_config_results_df, top_n_config_ids, orderby_cols)

356 

357 

def plot_optimizer_trends(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
) -> None:
    """
    Plots the optimizer trends for the Experiment.

    For each objective, draws one figure with a box plot of the results per
    config trial group and a line for the incumbent (best mean seen so far).

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        Optional results_df to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
    objectives : Optional[dict[str, Literal["min", "max"]]]
        Optional objectives to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
    """
    (results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    (results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)

    # Always show a title; only mention the experiment when we know its id.
    # (Previously the whole title was blank when exp_data was not provided.)
    title = "Optimizer Trends"
    if exp_data is not None:
        title += " for Experiment: " + exp_data.experiment_id

    for objective_column, ascending in obj_cols.items():
        incumbent_column = objective_column + ".incumbent"

        # Determine the mean of each config trial group to match the box plots.
        group_results_df = (
            results_df.groupby(groupby_columns)[objective_column]
            .mean()
            .reset_index()
            .sort_values(groupby_columns)
        )
        #
        # Note: technically the optimizer (usually) uses the *first* result for a
        # given config trial group before moving on to a new config (x-axis), so
        # plotting the mean may be slightly misleading when trying to understand the
        # actual path taken by the optimizer in case of high variance samples.
        # Here's a way to do that, though it can also be misleading if the optimizer
        # later gets a worse value for that config group as well.
        #
        # group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
        #   groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()

        # Calculate the incumbent (best seen so far)
        if ascending:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
        else:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummax()

        (_fig, axis) = plt.subplots(figsize=(15, 5))

        # Result of each set of trials for a config
        sns.boxplot(
            data=results_df,
            x=groupby_column,
            y=objective_column,
            ax=axis,
        )

        # Results of the best so far.
        axis = sns.lineplot(
            data=group_results_df,
            x=groupby_column,
            y=incumbent_column,
            alpha=0.7,
            label="Mean of Incumbent Config Trial Group",
            ax=axis,
        )

        plt.yscale("log")
        plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))

        plt.xlabel("Config Trial Group ID, Config ID")
        plt.xticks(rotation=90, fontsize=8)

        plt.title(title)
        plt.grid()
        plt.show()

441 

442 

def plot_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    with_scatter_plot: bool = False,
    **kwargs: Any,
) -> None:
    # pylint: disable=too-many-locals
    """
    Plots the top-N configs along with the default config for the given
    :py:class:`.ExperimentData`.

    Intended to be used from a Jupyter notebook.

    One violin plot figure is drawn per objective, titled with whether lower or
    higher values are better for that objective.

    Parameters
    ----------
    exp_data: ExperimentData
        The experiment data to plot.
    results_df : pandas.DataFrame | None
        Optional results_df to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
    objectives : Optional[dict[str, Literal["min", "max"]]]
        Optional objectives to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
    with_scatter_plot : bool
        Whether to also add scatter plot to the output figure.
    kwargs : dict
        Remaining keyword arguments are passed along to the
        :py:func:`limit_top_n_configs` function.
    """
    (results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    # Only forward the kwargs that limit_top_n_configs actually accepts.
    top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
    if "results_df" not in top_n_config_args:
        top_n_config_args["results_df"] = results_df
    if "objectives" not in top_n_config_args:
        top_n_config_args["objectives"] = objectives
    (top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(
        exp_data=exp_data,
        **top_n_config_args,
    )

    (top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(
        top_n_config_results_df,
    )
    # Don't count the default config, which is always included in the results.
    top_n = len(top_n_config_results_df[groupby_column].unique()) - 1

    for orderby_col, ascending in orderby_cols.items():
        opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
        (_fig, axis) = plt.subplots()
        sns.violinplot(
            data=top_n_config_results_df,
            x=groupby_column,
            y=orderby_col,
            ax=axis,
        )
        if with_scatter_plot:
            sns.scatterplot(
                data=top_n_config_results_df,
                x=groupby_column,
                y=orderby_col,
                legend=False,
                ax=axis,
            )
        plt.grid()
        (xticks, xlabels) = plt.xticks()
        # default should be in the first position based on top_n_configs() return
        xlabels[0] = "default"  # type: ignore[call-overload]
        plt.xticks(xticks, xlabels)  # type: ignore[arg-type]
        plt.xlabel("Config Trial Group, Config ID")
        plt.xticks(rotation=90)
        plt.ylabel(opt_tgt)
        plt.yscale("log")
        # Ascending sort order means the objective is being minimized.
        extra_title = "(lower is better)" if ascending else "(higher is better)"
        plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
        plt.show()