Coverage for mlos_viz/mlos_viz/base.py: 90% (158 statements)
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""Base functions for visualizing, explaining, and gaining insights from results."""

import re
import warnings
from collections.abc import Callable, Iterable
from importlib.metadata import version
from typing import Any, Literal

import pandas
import seaborn as sns
from matplotlib import pyplot as plt
from packaging.version import Version
from pandas.api.types import is_numeric_dtype
from pandas.core.groupby.generic import SeriesGroupBy

from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.util import expand_results_data_args

_SEABORN_VERS = Version(version("seaborn"))


def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> dict[str, Any]:
    """
    Assembles a smaller kwargs dict for the specified target function.

    Note: this only works with non-positional kwargs (e.g., those after a * arg).
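
    Examples
    --------
    A minimal sketch with a hypothetical function (``fn``, ``alpha``, ``beta``, and
    ``gamma`` are illustrative placeholders):

    >>> def fn(*, alpha: int = 1, beta: int = 2) -> None: ...
    >>> # Only keyword args that fn declares are kept; others are dropped.
    >>> _get_kwarg_defaults(fn, alpha=10, gamma=30)
    {'alpha': 10}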
31 """
32 target_kwargs = {}
33 for kword in target.__kwdefaults__ or {}:
34 if kword in kwargs:
35 target_kwargs[kword] = kwargs[kword]
36 return target_kwargs


def ignore_plotter_warnings() -> None:
    """Suppress some annoying warnings from third-party data visualization packages by
    adding them to the warnings filter.
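
    Examples
    --------
    Typically called once up front (e.g., at the top of a notebook) before plotting:

    >>> ignore_plotter_warnings()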
42 """
43 warnings.filterwarnings("ignore", category=FutureWarning)
44 if _SEABORN_VERS <= Version("0.13.1"):
45 warnings.filterwarnings(
46 "ignore",
47 category=DeprecationWarning,
48 module="seaborn", # but actually comes from pandas
49 message="is_categorical_dtype is deprecated and will be removed in a future version.",
50 )
51 # See Also: https://github.com/mwaskom/seaborn/issues/3804
52 warnings.filterwarnings(
53 "ignore",
54 category=PendingDeprecationWarning,
55 module="seaborn", # but actually comes from matplotlib
56 message=(
57 "vert: bool will be deprecated in a future version. "
58 "Use orientation: {'vertical', 'horizontal'} instead."
59 ),
60 )
62 warnings.filterwarnings(
63 "ignore",
64 module="matplotlib",
65 category=DeprecationWarning,
66 message="'mode' parameter is deprecated and will be removed in Pillow 13",
67 )


def _add_groupby_desc_column(
    results_df: pandas.DataFrame,
    groupby_columns: list[str] | None = None,
) -> tuple[pandas.DataFrame, list[str], str]:
    """
    Adds a group descriptor column to the results_df.

    Parameters
    ----------
    results_df : pandas.DataFrame
        The results dataframe to add the descriptor column to.
    groupby_columns : list[str] | None
        The columns to concatenate into the descriptor, by default
        ["tunable_config_trial_group_id", "tunable_config_id"].
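
    Examples
    --------
    A minimal sketch with a hand-built dataframe (column names "a" and "b" are
    illustrative placeholders, not the defaults):

    >>> df = pandas.DataFrame({"a": [1, 2], "b": [3, 4]})
    >>> (df, groupby_columns, groupby_column) = _add_groupby_desc_column(df, ["a", "b"])
    >>> groupby_column
    'a,b'
    >>> df[groupby_column].tolist()
    ['1,3', '2,4']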
82 """
83 # Compose a new groupby_column for display purposes that is the
84 # concatenation of the min trial_id (the first one) of each config trial
85 # group and the config_id.
86 # Note: It's need to be a string (e.g., categorical) for boxplot and lineplot to
87 # be on the same axis anyways.
88 if groupby_columns is None:
89 groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"]
90 groupby_column = ",".join(groupby_columns)
91 results_df[groupby_column] = (
92 results_df[groupby_columns].astype(str).apply(",".join, axis=1)
93 ) # pylint: disable=unnecessary-lambda
94 groupby_columns.append(groupby_column)
95 return (results_df, groupby_columns, groupby_column)


def augment_results_df_with_config_trial_group_stats(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    requested_result_cols: Iterable[str] | None = None,
) -> pandas.DataFrame:
    # pylint: disable=too-complex
105 """
106 Add a number of useful statistical measure columns to the results dataframe.
108 In particular, for each numeric result, we add the following columns for each
109 requested result column:
111 - ".p50": the median of each config trial group results
113 - ".p75": the p75 of each config trial group results
115 - ".p90": the p90 of each config trial group results
117 - ".p95": the p95 of each config trial group results
119 - ".p99": the p95 of each config trial group results
121 - ".mean": the mean of each config trial group results
123 - ".stddev": the mean of each config trial group results
125 - ".var": the variance of each config trial group results
127 - ".var_zscore": the zscore of this group (i.e., variance relative to the stddev
128 of all group variances). This can be useful for filtering out outliers (e.g.,
129 configs with high variance relative to others by restricting to abs < 2 to
130 remove those two standard deviations from the mean variance across all config
131 trial groups).
133 Additionally, we add a "tunable_config_trial_group_size" column that indicates
134 the number of trials using a particular config.

    Parameters
    ----------
    exp_data : ExperimentData | None
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : pandas.DataFrame | None
        The results dataframe to augment, by default None to use the results_df property.
    requested_result_cols : Iterable[str] | None
        Which results columns to augment, by default None to use all results columns
        that look numeric.

    Returns
    -------
    pandas.DataFrame
        The augmented results dataframe.
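
    Examples
    --------
    A minimal sketch using a tiny hand-built results dataframe (real usage would
    pass ``exp_data`` or ``exp_data.results_df`` instead):

    >>> results_df = pandas.DataFrame({
    ...     "tunable_config_id": [1, 1, 2, 2],
    ...     "trial_id": [1, 2, 3, 4],
    ...     ExperimentData.RESULT_COLUMN_PREFIX + "score": [1.0, 2.0, 3.0, 5.0],
    ... })
    >>> augmented_df = augment_results_df_with_config_trial_group_stats(results_df=results_df)
    >>> result_col = ExperimentData.RESULT_COLUMN_PREFIX + "score"
    >>> # Per-row group means: config 1 trials average 1.5, config 2 trials average 4.0.
    >>> augmented_df[result_col + ".mean"].tolist()
    [1.5, 1.5, 4.0, 4.0]
    >>> augmented_df["tunable_config_trial_group_size"].tolist()
    [2, 2, 2, 2]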
150 """
    if results_df is None:
        if exp_data is None:
            raise ValueError("Either exp_data or results_df must be provided.")
        results_df = exp_data.results_df
    results_groups = results_df.groupby("tunable_config_id")
    if len(results_groups) <= 1:
        raise ValueError(f"Not enough data: {len(results_groups)}")

    if requested_result_cols is None:
        result_cols = {
            col
            for col in results_df.columns
            if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX)
        }
    else:
        result_cols = {
            col
            for col in requested_result_cols
            if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and col in results_df.columns
        }
        # Also accept unprefixed result column names by adding the prefix for them.
        result_cols.update(
            {
                ExperimentData.RESULT_COLUMN_PREFIX + col
                for col in requested_result_cols
                if ExperimentData.RESULT_COLUMN_PREFIX + col in results_df.columns
            }
        )

    def compute_zscore_for_group_agg(
        results_groups_perf: "SeriesGroupBy",
        stats_df: pandas.DataFrame,
        result_col: str,
        agg: Literal["mean"] | Literal["var"] | Literal["std"],
    ) -> None:
        results_groups_perf_aggs = results_groups_perf.agg(agg)  # TODO: avoid recalculating?
        # Compute the zscore of the chosen aggregate performance of each group into
        # each row in the dataframe.
        stats_df[result_col + f".{agg}_mean"] = results_groups_perf_aggs.mean()
        stats_df[result_col + f".{agg}_stddev"] = results_groups_perf_aggs.std()
        stats_df[result_col + f".{agg}_zscore"] = (
            stats_df[result_col + f".{agg}"] - stats_df[result_col + f".{agg}_mean"]
        ) / stats_df[result_col + f".{agg}_stddev"]
        stats_df.drop(
            columns=[result_col + ".var_" + agg for agg in ("mean", "stddev")], inplace=True
        )

    augmented_results_df = results_df
    augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform(
        "count"
    )
    for result_col in result_cols:
        if not result_col.startswith(ExperimentData.RESULT_COLUMN_PREFIX):
            continue
        if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
            # Avoid computing variance on things that look like timestamps.
            continue
        if not is_numeric_dtype(results_df[result_col]):
            continue
        if results_df[result_col].unique().size == 1:
            continue
        results_groups_perf = results_groups[result_col]
        stats_df = pandas.DataFrame()
        stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
        stats_df[result_col + ".var"] = results_groups_perf.transform("var")
        stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)

        compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
        quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
        for quantile in quantiles:  # TODO: can we do this in one pass?
            quantile_col = f"{result_col}.p{int(quantile * 100)}"
            stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
        augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
    return augmented_results_df


def limit_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    top_n_configs: int = 10,
    method: Literal["mean", "p50", "p75", "p90", "p95", "p99"] = "mean",
) -> tuple[pandas.DataFrame, list[int], dict[str, bool]]:
    # pylint: disable=too-many-locals
235 """
236 Utility function to process the results and determine the best performing configs
237 including potential repeats to help assess variability.
239 Parameters
240 ----------
241 exp_data : ExperimentData | None
242 The ExperimentData (e.g., obtained from the storage layer) to operate on.
243 results_df : pandas.DataFrame | None
244 The results dataframe to augment, by default None to use
245 :py:attr:`.ExperimentData.results_df` property.
246 objectives : Iterable[str]
247 Which result column(s) to use for sorting the configs, and in which
248 direction ("min" or "max").
249 By default None to automatically select the :py:attr:`.ExperimentData.objectives`.
250 top_n_configs : int
251 How many configs to return, including the default, by default 10.
252 method: Literal["mean", "median", "p50", "p75", "p90", "p95", "p99"] = "mean",
253 Which statistical method to use when sorting the config groups before
254 determining the cutoff, by default "mean".
256 Returns
257 -------
258 (top_n_config_results_df, top_n_config_ids, orderby_cols) :
259 tuple[pandas.DataFrame, list[int], dict[str, bool]]
260 The filtered results dataframe, the config ids, and the columns used to
261 order the configs.
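
    Examples
    --------
    An illustrative sketch (assumes ``exp_data`` was loaded from the storage layer):

    >>> (top_n_config_results_df, top_n_config_ids, orderby_cols) = limit_top_n_configs(
    ...     exp_data=exp_data,
    ...     top_n_configs=5,
    ...     method="p90",
    ... )  # doctest: +SKIP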
262 """
263 # Do some input checking first.
264 if method not in ["mean", "median", "p50", "p75", "p90", "p95", "p99"]:
265 raise ValueError(f"Invalid method: {method}")

    # Prepare the orderby columns.
    (results_df, objs_cols) = expand_results_data_args(
        exp_data,
        results_df=results_df,
        objectives=objectives,
    )
    assert isinstance(results_df, pandas.DataFrame)

    # Augment the results dataframe with some useful stats.
    results_df = augment_results_df_with_config_trial_group_stats(
        exp_data=exp_data,
        results_df=results_df,
        requested_result_cols=objs_cols.keys(),
    )
    # Note: mypy seems to lose its mind for some reason and keeps forgetting that
    # results_df is not None and is in fact a DataFrame, so we periodically assert
    # it in this func for now.
    assert results_df is not None
    orderby_cols: dict[str, bool] = {
        obj_col + f".{method}": ascending for (obj_col, ascending) in objs_cols.items()
    }

    config_id_col = "tunable_config_id"
    group_id_col = "tunable_config_trial_group_id"  # first trial_id per config group
    trial_id_col = "trial_id"

    default_config_id = (
        results_df[trial_id_col].min() if exp_data is None else exp_data.default_tunable_config_id
    )
    assert default_config_id is not None, "Failed to determine default config id."

    # Filter out configs whose variance is too large,
    # but also make sure the default config is still in the resulting dataframe
    # (for comparison purposes).
    for obj_col in objs_cols:
        assert results_df is not None
        if method == "mean":
            singletons_mask = results_df["tunable_config_trial_group_size"] == 1
        else:
            singletons_mask = results_df["tunable_config_trial_group_size"] > 1
        results_df = results_df.loc[
            (
                (results_df[f"{obj_col}.var_zscore"].abs() < 2)
                | (singletons_mask)
                | (results_df[config_id_col] == default_config_id)
            )
        ]
    assert results_df is not None

    # Also, filter out results that are worse than the default.
    default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
    for orderby_col, ascending in orderby_cols.items():
        default_vals = default_config_results_df[orderby_col].unique()
        assert len(default_vals) == 1
        default_val = default_vals[0]
        assert results_df is not None
        if ascending:
            results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
        else:
            results_df = results_df.loc[(results_df[orderby_col] >= default_val)]

    # Now regroup and filter to the top-N configs by their group performance dimensions.
    assert results_df is not None
    group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[
        orderby_cols.keys()
    ]
    top_n_config_ids: list[int] = (
        group_results_df.sort_values(
            by=list(orderby_cols.keys()), ascending=list(orderby_cols.values())
        )
        .head(top_n_configs)
        .index.tolist()
    )

    # Remove the default config if it's included. We'll add it back later.
    if default_config_id in top_n_config_ids:
        top_n_config_ids.remove(default_config_id)
    # Get just the top-n config results.
    # Sort by the group ids.
    top_n_config_results_df = results_df.loc[
        (results_df[config_id_col].isin(top_n_config_ids))
    ].sort_values([group_id_col, config_id_col, trial_id_col])
    # Place the default config at the top of the list.
    top_n_config_ids.insert(0, default_config_id)
    top_n_config_results_df = pandas.concat(
        [default_config_results_df, top_n_config_results_df],
        axis=0,
    )
    return (top_n_config_results_df, top_n_config_ids, orderby_cols)


def plot_optimizer_trends(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
) -> None:
364 """
365 Plots the optimizer trends for the Experiment.
367 Parameters
368 ----------
369 exp_data : ExperimentData
370 The ExperimentData (e.g., obtained from the storage layer) to plot.
371 results_df : pandas.DataFrame | None
372 Optional results_df to plot.
373 If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
374 objectives : Optional[dict[str, Literal["min", "max"]]]
375 Optional objectives to plot.
376 If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
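
    Examples
    --------
    An illustrative sketch (assumes ``exp_data`` was previously loaded from the
    storage layer, e.g., in a notebook):

    >>> ignore_plotter_warnings()  # doctest: +SKIP
    >>> plot_optimizer_trends(exp_data)  # doctest: +SKIP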
377 """
378 (results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
379 (results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)
381 for objective_column, ascending in obj_cols.items():
382 incumbent_column = objective_column + ".incumbent"
384 # Determine the mean of each config trial group to match the box plots.
385 group_results_df = (
386 results_df.groupby(groupby_columns)[objective_column]
387 .mean()
388 .reset_index()
389 .sort_values(groupby_columns)
390 )
391 #
392 # Note: technically the optimizer (usually) uses the *first* result for a
393 # given config trial group before moving on to a new config (x-axis), so
394 # plotting the mean may be slightly misleading when trying to understand the
395 # actual path taken by the optimizer in case of high variance samples.
396 # Here's a way to do that, though it can also be misleading if the optimizer
397 # later gets a worse value for that config group as well.
398 #
399 # group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
400 # groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()
402 # Calculate the incumbent (best seen so far)
403 if ascending:
404 group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
405 else:
406 group_results_df[incumbent_column] = group_results_df[objective_column].cummax()
408 (_fig, axis) = plt.subplots(figsize=(15, 5))
410 # Result of each set of trials for a config
411 sns.boxplot(
412 data=results_df,
413 x=groupby_column,
414 y=objective_column,
415 ax=axis,
416 )
418 # Results of the best so far.
419 axis = sns.lineplot(
420 data=group_results_df,
421 x=groupby_column,
422 y=incumbent_column,
423 alpha=0.7,
424 label="Mean of Incumbent Config Trial Group",
425 ax=axis,
426 )
428 plt.yscale("log")
429 plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))
431 plt.xlabel("Config Trial Group ID, Config ID")
432 plt.xticks(rotation=90, fontsize=8)
434 plt.title(
435 "Optimizer Trends for Experiment: " + exp_data.experiment_id
436 if exp_data is not None
437 else ""
438 )
439 plt.grid()
440 plt.show()


def plot_top_n_configs(
    exp_data: ExperimentData | None = None,
    *,
    results_df: pandas.DataFrame | None = None,
    objectives: dict[str, Literal["min", "max"]] | None = None,
    with_scatter_plot: bool = False,
    **kwargs: Any,
) -> None:
    # pylint: disable=too-many-locals
452 """
453 Plots the top-N configs along with the default config for the given
454 :py:class:`.ExperimentData`.
456 Intended to be used from a Jupyter notebook.
458 Parameters
459 ----------
460 exp_data: ExperimentData
461 The experiment data to plot.
462 results_df : pandas.DataFrame | None
463 Optional results_df to plot.
464 If not provided, defaults to :py:attr:`.ExperimentData.results_df` property.
465 objectives : Optional[dict[str, Literal["min", "max"]]]
466 Optional objectives to plot.
467 If not provided, defaults to :py:attr:`.ExperimentData.objectives` property.
468 with_scatter_plot : bool
469 Whether to also add scatter plot to the output figure.
470 kwargs : dict
471 Remaining keyword arguments are passed along to the
472 :py:func:`limit_top_n_configs` function.
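
    Examples
    --------
    An illustrative sketch (assumes ``exp_data`` was loaded from the storage layer;
    extra kwargs such as ``method`` are forwarded to :py:func:`limit_top_n_configs`):

    >>> plot_top_n_configs(exp_data, with_scatter_plot=True, method="p90")  # doctest: +SKIP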
473 """
474 (results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
475 top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
476 if "results_df" not in top_n_config_args:
477 top_n_config_args["results_df"] = results_df
478 if "objectives" not in top_n_config_args:
479 top_n_config_args["objectives"] = objectives
480 (top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(
481 exp_data=exp_data,
482 **top_n_config_args,
483 )
485 (top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(
486 top_n_config_results_df,
487 )
488 top_n = len(top_n_config_results_df[groupby_column].unique()) - 1
490 for orderby_col, ascending in orderby_cols.items():
491 opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
492 (_fig, axis) = plt.subplots()
493 sns.violinplot(
494 data=top_n_config_results_df,
495 x=groupby_column,
496 y=orderby_col,
497 ax=axis,
498 )
499 if with_scatter_plot:
500 sns.scatterplot(
501 data=top_n_config_results_df,
502 x=groupby_column,
503 y=orderby_col,
504 legend=False,
505 ax=axis,
506 )
507 plt.grid()
508 (xticks, xlabels) = plt.xticks()
509 # default should be in the first position based on top_n_configs() return
510 xlabels[0] = "default" # type: ignore[call-overload]
511 plt.xticks(xticks, xlabels) # type: ignore[arg-type]
512 plt.xlabel("Config Trial Group, Config ID")
513 plt.xticks(rotation=90)
514 plt.ylabel(opt_tgt)
515 plt.yscale("log")
        extra_title = "(lower is better)" if ascending else "(higher is better)"
        plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
        plt.show()