Coverage for mlos_viz/mlos_viz/base.py: 90%

157 statements  


#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""Base functions for visualizing, explaining, and gaining insights from results."""

import re
import warnings
from collections.abc import Callable, Iterable
from importlib.metadata import version
from typing import Any, Literal

import pandas
import seaborn as sns
from matplotlib import pyplot as plt
from packaging.version import Version
from pandas.api.types import is_numeric_dtype
from pandas.core.groupby.generic import SeriesGroupBy

from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.util import expand_results_data_args

_SEABORN_VERS = Version(version("seaborn"))


def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> dict[str, Any]:
    """
    Assembles a smaller kwargs dict for the specified target function.

    Note: this only works with non-positional kwargs (e.g., those after a * arg).
    """
    target_kwargs = {}
    for kword in target.__kwdefaults__:  # or {} # intentionally omitted for now
        if kword in kwargs:
            target_kwargs[kword] = kwargs[kword]
    return target_kwargs
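# A minimal sketch of how _get_kwarg_defaults() filters kwargs (illustrative;
# ``example`` is a hypothetical function, not part of this module):
#
#     def example(*, top_n_configs: int = 10, method: str = "mean") -> None: ...
#
#     _get_kwarg_defaults(example, top_n_configs=5, unrelated=True)
#     # -> {"top_n_configs": 5}; "unrelated" is dropped, since only the target's
#     # declared keyword-only args are passed through.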



def ignore_plotter_warnings() -> None:
    """Suppress some annoying warnings from third-party data visualization packages by
    adding them to the warnings filter.
    """
    warnings.filterwarnings("ignore", category=FutureWarning)
    if _SEABORN_VERS <= Version("0.13.1"):
        warnings.filterwarnings(
            "ignore",
            category=DeprecationWarning,
            module="seaborn",  # but actually comes from pandas
            message="is_categorical_dtype is deprecated and will be removed in a future version.",
        )
    # See Also: https://github.com/mwaskom/seaborn/issues/3804
    warnings.filterwarnings(
        "ignore",
        category=PendingDeprecationWarning,
        module="seaborn",  # but actually comes from matplotlib
        message=(
            "vert: bool will be deprecated in a future version. "
            "Use orientation: {'vertical', 'horizontal'} instead."
        ),
    )
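# Typical usage (a sketch; call once near the top of a notebook, before plotting):
#
#     from mlos_viz.base import ignore_plotter_warnings
#
#     ignore_plotter_warnings()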



def _add_groupby_desc_column(
    results_df: pandas.DataFrame,
    groupby_columns: list[str] | None = None,
) -> tuple[pandas.DataFrame, list[str], str]:
    """
    Adds a group descriptor column to the results_df.

    Parameters
    ----------
    results_df : pandas.DataFrame
        The results dataframe to add the descriptor column to.
    groupby_columns : list[str] | None
        The columns to concatenate into the descriptor, by default
        ["tunable_config_trial_group_id", "tunable_config_id"].
    """
    # Compose a new groupby_column for display purposes that is the
    # concatenation of the min trial_id (the first one) of each config trial
    # group and the config_id.
    # Note: it needs to be a string (e.g., categorical) for boxplot and lineplot
    # to share the same axis anyway.
    if groupby_columns is None:
        groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"]
    groupby_column = ",".join(groupby_columns)
    results_df[groupby_column] = (
        results_df[groupby_columns].astype(str).apply(",".join, axis=1)
    )
    groupby_columns.append(groupby_column)
    return (results_df, groupby_columns, groupby_column)
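# Worked example (illustrative; a tiny made-up frame):
#
#     df = pandas.DataFrame({
#         "tunable_config_trial_group_id": [1, 1, 4],
#         "tunable_config_id": [1, 1, 2],
#     })
#     (df, groupby_columns, groupby_column) = _add_groupby_desc_column(df)
#     # groupby_column == "tunable_config_trial_group_id,tunable_config_id"
#     # df[groupby_column] holds the strings ["1,1", "1,1", "4,2"]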



def augment_results_df_with_config_trial_group_stats(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    requested_result_cols: Iterable[str] | None = None,
) -> pandas.DataFrame:
    # pylint: disable=too-complex
    """
    Add a number of useful statistical measure columns to the results dataframe.

    In particular, for each numeric result, we add the following columns for each
    requested result column:

    - ".p50": the median of each config trial group results

    - ".p75": the p75 of each config trial group results

    - ".p90": the p90 of each config trial group results

    - ".p95": the p95 of each config trial group results

    - ".p99": the p99 of each config trial group results

    - ".mean": the mean of each config trial group results

    - ".stddev": the standard deviation of each config trial group results

    - ".var": the variance of each config trial group results

    - ".var_zscore": the zscore of this group's variance relative to the mean and
      stddev of all config trial group variances. This can be useful for filtering
      out outliers (e.g., restricting to abs < 2 removes configs whose variance is
      more than two standard deviations from the mean variance across all config
      trial groups).

    Additionally, we add a "tunable_config_trial_group_size" column that indicates
    the number of trials using a particular config.

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use the results_df property.
    requested_result_cols : Iterable[str] | None
        Which results columns to augment, by default None to use all results columns
        that look numeric.

    Returns
    -------
    pandas.DataFrame
        The augmented results dataframe.
    """
    if results_df is None:
        if exp_data is None:
            raise ValueError("Either exp_data or results_df must be provided.")
        results_df = exp_data.results_df
    results_groups = results_df.groupby("tunable_config_id")
    if len(results_groups) <= 1:
        raise ValueError(f"Not enough data: {len(results_groups)}")

    if requested_result_cols is None:
        result_cols = {
            col
            for col in results_df.columns
            if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX)
        }
    else:
        result_cols = {
            col
            for col in requested_result_cols
            if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and col in results_df.columns
        }
        result_cols.update(
            {
                ExperimentData.RESULT_COLUMN_PREFIX + col
                for col in requested_result_cols
                if ExperimentData.RESULT_COLUMN_PREFIX + col in results_df.columns
            }
        )


    def compute_zscore_for_group_agg(
        results_groups_perf: "SeriesGroupBy",
        stats_df: pandas.DataFrame,
        result_col: str,
        agg: Literal["mean"] | Literal["var"] | Literal["std"],
    ) -> None:
        results_groups_perf_aggs = results_groups_perf.agg(agg)  # TODO: avoid recalculating?
        # Compute the zscore of the chosen aggregate performance of each group into
        # each row in the dataframe.
        stats_df[result_col + f".{agg}_mean"] = results_groups_perf_aggs.mean()
        stats_df[result_col + f".{agg}_stddev"] = results_groups_perf_aggs.std()
        stats_df[result_col + f".{agg}_zscore"] = (
            stats_df[result_col + f".{agg}"] - stats_df[result_col + f".{agg}_mean"]
        ) / stats_df[result_col + f".{agg}_stddev"]
        # Drop the intermediate aggregate columns now that the zscore is computed.
        stats_df.drop(
            columns=[result_col + f".{agg}_{suffix}" for suffix in ("mean", "stddev")],
            inplace=True,
        )
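    # Worked example of the ".var_zscore" arithmetic (made-up numbers): if the
    # per-group variances are [1.0, 4.0, 10.0], their mean is 5.0 and their
    # (sample) stddev is ~4.58, so the ".var_zscore" values are roughly
    # [-0.87, -0.22, 1.09]; a filter of abs(zscore) < 2 would keep all three.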


    augmented_results_df = results_df
    augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform(
        "count"
    )
    for result_col in result_cols:
        if not result_col.startswith(ExperimentData.RESULT_COLUMN_PREFIX):
            continue
        if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
            # Skip computing variance on columns that look like timestamps.
            continue
        if not is_numeric_dtype(results_df[result_col]):
            continue
        if results_df[result_col].unique().size == 1:
            continue
        results_groups_perf = results_groups[result_col]
        stats_df = pandas.DataFrame()
        stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
        stats_df[result_col + ".var"] = results_groups_perf.transform("var")
        stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)

        compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
        quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
        for quantile in quantiles:  # TODO: can we do this in one pass?
            quantile_col = f"{result_col}.p{int(quantile * 100)}"
            stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
        augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
    return augmented_results_df
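# Usage sketch (illustrative; the tiny frame is made up, and the "result."
# prefix is assumed here to match ExperimentData.RESULT_COLUMN_PREFIX):
#
#     df = pandas.DataFrame({
#         "trial_id": [1, 2, 3, 4],
#         "tunable_config_id": [1, 1, 2, 2],
#         "tunable_config_trial_group_id": [1, 1, 3, 3],
#         "result.score": [10.0, 12.0, 8.0, 9.0],
#     })
#     augmented = augment_results_df_with_config_trial_group_stats(results_df=df)
#     # Adds "tunable_config_trial_group_size" plus "result.score.mean",
#     # "result.score.var", "result.score.stddev", "result.score.var_zscore",
#     # and "result.score.p50" ... "result.score.p99" columns.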



def limit_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    top_n_configs: int = 10,
    method: Literal["mean", "p50", "p75", "p90", "p95", "p99"] = "mean",
) -> tuple[pandas.DataFrame, list[int], dict[str, bool]]:
    # pylint: disable=too-many-locals
    """
    Utility function to process the results and determine the best performing configs
    including potential repeats to help assess variability.

    Parameters
    ----------
    exp_data : ExperimentData | None
        The ExperimentData (e.g., obtained from the storage layer) to operate on.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use
        :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Which result column(s) to use for sorting the configs, and in which
        direction ("min" or "max").
        By default None to automatically select the :py:attr:`.ExperimentData.objectives`.
    top_n_configs : int
        How many configs to return, including the default, by default 10.
    method : Literal["mean", "p50", "p75", "p90", "p95", "p99"]
        Which statistical method to use when sorting the config groups before
        determining the cutoff, by default "mean".

    Returns
    -------
    (top_n_config_results_df, top_n_config_ids, orderby_cols) :
        tuple[pandas.DataFrame, list[int], dict[str, bool]]
        The filtered results dataframe, the config ids, and the columns used to
        order the configs.
    """
    # Do some input checking first.
    if method not in ["mean", "p50", "p75", "p90", "p95", "p99"]:
        raise ValueError(f"Invalid method: {method}")

    # Prepare the orderby columns.
    (results_df, objs_cols) = expand_results_data_args(
        exp_data,
        results_df=results_df,
        objectives=objectives,
    )
    assert isinstance(results_df, pandas.DataFrame)

    # Augment the results dataframe with some useful stats.
    results_df = augment_results_df_with_config_trial_group_stats(
        exp_data=exp_data,
        results_df=results_df,
        requested_result_cols=objs_cols.keys(),
    )
    # Note: mypy seems to lose its mind for some reason and keeps forgetting that
    # results_df is not None and is in fact a DataFrame, so we periodically assert
    # it in this func for now.
    assert results_df is not None
    orderby_cols: dict[str, bool] = {
        obj_col + f".{method}": ascending for (obj_col, ascending) in objs_cols.items()
    }

    config_id_col = "tunable_config_id"
    group_id_col = "tunable_config_trial_group_id"  # first trial_id per config group
    trial_id_col = "trial_id"

    default_config_id = (
        results_df[trial_id_col].min() if exp_data is None else exp_data.default_tunable_config_id
    )
    assert default_config_id is not None, "Failed to determine default config id."

    # Filter out configs whose variance is too large.
    # But also make sure the default config is still in the resulting dataframe
    # (for comparison purposes).
    for obj_col in objs_cols:
        assert results_df is not None
        if method == "mean":
            singletons_mask = results_df["tunable_config_trial_group_size"] == 1
        else:
            singletons_mask = results_df["tunable_config_trial_group_size"] > 1
        results_df = results_df.loc[
            (
                (results_df[f"{obj_col}.var_zscore"].abs() < 2)
                | (singletons_mask)
                | (results_df[config_id_col] == default_config_id)
            )
        ]
    assert results_df is not None

    # Also, filter out results that are worse than the default.
    default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
    for orderby_col, ascending in orderby_cols.items():
        default_vals = default_config_results_df[orderby_col].unique()
        assert len(default_vals) == 1
        default_val = default_vals[0]
        assert results_df is not None
        if ascending:
            results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
        else:
            results_df = results_df.loc[(results_df[orderby_col] >= default_val)]

    # Now regroup and filter to the top-N configs by their group performance dimensions.
    assert results_df is not None
    group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[
        orderby_cols.keys()
    ]
    top_n_config_ids: list[int] = (
        group_results_df.sort_values(
            by=list(orderby_cols.keys()), ascending=list(orderby_cols.values())
        )
        .head(top_n_configs)
        .index.tolist()
    )

    # Remove the default config if it's included. We'll add it back later.
    if default_config_id in top_n_config_ids:
        top_n_config_ids.remove(default_config_id)
    # Get just the top-n config results.
    # Sort by the group ids.
    top_n_config_results_df = results_df.loc[
        (results_df[config_id_col].isin(top_n_config_ids))
    ].sort_values([group_id_col, config_id_col, trial_id_col])
    # Place the default config at the top of the list.
    top_n_config_ids.insert(0, default_config_id)
    top_n_config_results_df = pandas.concat(
        [default_config_results_df, top_n_config_results_df],
        axis=0,
    )
    return (top_n_config_results_df, top_n_config_ids, orderby_cols)
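# Usage sketch (hypothetical; assumes ``exp_data`` came from the mlos_bench
# storage layer and that "score" is one of its result columns):
#
#     (top_df, top_ids, orderby_cols) = limit_top_n_configs(
#         exp_data,
#         objectives={"score": "min"},
#         top_n_configs=5,
#         method="p90",
#     )
#     # top_ids[0] is the default config id, and orderby_cols would map, e.g.,
#     # "result.score.p90" -> True (ascending, since the objective is "min").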



def plot_optimizer_trends(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
) -> None:
    """
    Plots the optimizer trends for the Experiment.

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        Optional results_df to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Optional objectives to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
    """
    (results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    (results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)

    for objective_column, ascending in obj_cols.items():
        incumbent_column = objective_column + ".incumbent"

        # Determine the mean of each config trial group to match the box plots.
        group_results_df = (
            results_df.groupby(groupby_columns)[objective_column]
            .mean()
            .reset_index()
            .sort_values(groupby_columns)
        )
        #
        # Note: technically the optimizer (usually) uses the *first* result for a
        # given config trial group before moving on to a new config (x-axis), so
        # plotting the mean may be slightly misleading when trying to understand the
        # actual path taken by the optimizer in case of high variance samples.
        # Here's a way to do that, though it can also be misleading if the optimizer
        # later gets a worse value for that config group as well.
        #
        # group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
        #     groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()

        # Calculate the incumbent (best seen so far).
        if ascending:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
        else:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummax()

        (_fig, axis) = plt.subplots(figsize=(15, 5))

        # Result of each set of trials for a config.
        sns.boxplot(
            data=results_df,
            x=groupby_column,
            y=objective_column,
            ax=axis,
        )

        # Results of the best so far.
        axis = sns.lineplot(
            data=group_results_df,
            x=groupby_column,
            y=incumbent_column,
            alpha=0.7,
            label="Mean of Incumbent Config Trial Group",
            ax=axis,
        )

        plt.yscale("log")
        plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))

        plt.xlabel("Config Trial Group ID, Config ID")
        plt.xticks(rotation=90, fontsize=8)

        plt.title(
            "Optimizer Trends for Experiment: " + exp_data.experiment_id
            if exp_data is not None
            else ""
        )
        plt.grid()
        plt.show()
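# Usage sketch (hypothetical; assumes an ``exp_data`` obtained from the
# mlos_bench storage layer, e.g., storage.experiments["my_experiment_id"]):
#
#     from mlos_viz.base import ignore_plotter_warnings, plot_optimizer_trends
#
#     ignore_plotter_warnings()
#     plot_optimizer_trends(exp_data)  # one figure per objective column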



def plot_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    with_scatter_plot: bool = False,
    **kwargs: Any,
) -> None:
    # pylint: disable=too-many-locals
    """
    Plots the top-N configs along with the default config for the given
    :py:class:`.ExperimentData`.

    Intended to be used from a Jupyter notebook.

    Parameters
    ----------
    exp_data : ExperimentData
        The experiment data to plot.
    results_df : pandas.DataFrame | None
        Optional results_df to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
    objectives : dict[str, Literal["min", "max"]] | None
        Optional objectives to plot.
        If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
    with_scatter_plot : bool
        Whether to also add a scatter plot to the output figure.
    kwargs : dict
        Remaining keyword arguments are passed along to the
        :py:func:`limit_top_n_configs` function.
    """
    (results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
    if "results_df" not in top_n_config_args:
        top_n_config_args["results_df"] = results_df
    if "objectives" not in top_n_config_args:
        top_n_config_args["objectives"] = objectives
    (top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(
        exp_data=exp_data,
        **top_n_config_args,
    )

    (top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(
        top_n_config_results_df,
    )
    top_n = len(top_n_config_results_df[groupby_column].unique()) - 1

    for orderby_col, ascending in orderby_cols.items():
        opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
        (_fig, axis) = plt.subplots()
        sns.violinplot(
            data=top_n_config_results_df,
            x=groupby_column,
            y=orderby_col,
            ax=axis,
        )
        if with_scatter_plot:
            sns.scatterplot(
                data=top_n_config_results_df,
                x=groupby_column,
                y=orderby_col,
                legend=False,
                ax=axis,
            )
        plt.grid()
        (xticks, xlabels) = plt.xticks()
        # The default config should be in the first position based on the
        # limit_top_n_configs() return ordering.
        xlabels[0] = "default"  # type: ignore[call-overload]
        plt.xticks(xticks, xlabels)  # type: ignore[arg-type]
        plt.xlabel("Config Trial Group, Config ID")
        plt.xticks(rotation=90)
        plt.ylabel(opt_tgt)
        plt.yscale("log")
        extra_title = "(lower is better)" if ascending else "(higher is better)"
        plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
        plt.show()
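# Usage sketch (hypothetical; extra kwargs such as top_n_configs and method are
# forwarded to limit_top_n_configs() via _get_kwarg_defaults()):
#
#     plot_top_n_configs(exp_data, with_scatter_plot=True, top_n_configs=5, method="p90")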