Coverage for mlos_viz/mlos_viz/base.py: 90%
155 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-05 00:36 +0000
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-05 00:36 +0000
1#
2# Copyright (c) Microsoft Corporation.
3# Licensed under the MIT License.
4#
5"""
6Base functions for visualizing, explaining, and gaining insights from results.
7"""
9from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union
11import re
12import warnings
14from importlib.metadata import version
16from matplotlib import pyplot as plt
17import pandas
18from pandas.api.types import is_numeric_dtype
19from pandas.core.groupby.generic import SeriesGroupBy
20import seaborn as sns
22from mlos_bench.storage.base_experiment_data import ExperimentData
23from mlos_viz.util import expand_results_data_args
26_SEABORN_VERS = version('seaborn')
def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> Dict[str, Any]:
    """
    Select from ``kwargs`` only the entries that ``target`` can accept.

    Note: this only works with non-positional kwargs (e.g., those after a * arg),
    since the set of accepted names is taken from ``target.__kwdefaults__``.

    Parameters
    ----------
    target : Callable
        The function whose keyword-only parameters (with defaults) determine
        which entries are kept.
    kwargs : Any
        The candidate keyword arguments to filter.

    Returns
    -------
    Dict[str, Any]
        The subset of ``kwargs`` whose names appear in ``target.__kwdefaults__``,
        in the order those names are declared on ``target``.
    """
    # Note: an `or {}` fallback for a missing __kwdefaults__ is intentionally
    # omitted for now.
    accepted_names = target.__kwdefaults__
    return {name: kwargs[name] for name in accepted_names if name in kwargs}
def ignore_plotter_warnings() -> None:
    """
    Suppress some annoying warnings from third-party data visualization packages by
    adding them to the warnings filter.

    All ``FutureWarning``s are ignored. Additionally, for seaborn versions up to
    and including 0.13.1, the ``is_categorical_dtype`` ``DeprecationWarning``
    attributed to the seaborn module is ignored.
    """
    warnings.filterwarnings("ignore", category=FutureWarning)
    # Compare the version numerically: a plain string comparison is
    # lexicographic and would, e.g., order "0.9.0" *after* "0.13.1".
    seaborn_vers = tuple(int(part) for part in re.findall(r"\d+", _SEABORN_VERS)[:3])
    if seaborn_vers <= (0, 13, 1):
        warnings.filterwarnings(
            "ignore",
            category=DeprecationWarning,
            module="seaborn",   # but actually comes from pandas
            message="is_categorical_dtype is deprecated and will be removed in a future version.",
        )
def _add_groupby_desc_column(results_df: pandas.DataFrame,
                             groupby_columns: Optional[List[str]] = None,
                             ) -> Tuple[pandas.DataFrame, List[str], str]:
    """
    Adds a group descriptor column to the results_df.

    The new column is named after the comma-joined ``groupby_columns`` and holds,
    for each row, the comma-joined string values of those columns.

    Note: the column is added to ``results_df`` in place.

    Parameters
    ----------
    results_df: pandas.DataFrame
        The results dataframe to add the descriptor column to.
    groupby_columns: Optional[List[str]]
        The columns to combine into the descriptor. By default None, to use
        ``["tunable_config_trial_group_id", "tunable_config_id"]``.
        The caller's list is not modified.

    Returns
    -------
    (results_df, groupby_columns, groupby_column) : Tuple[pandas.DataFrame, List[str], str]
        The dataframe (with the new column), the list of groupby columns
        extended with the new descriptor column name, and that descriptor
        column name.
    """
    # Compose a new groupby_column for display purposes that is the
    # concatenation of the min trial_id (the first one) of each config trial
    # group and the config_id.
    # Note: It needs to be a string (e.g., categorical) for boxplot and lineplot to
    # be on the same axis anyways.
    if groupby_columns is None:
        groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"]
    else:
        # Work on a copy so that the append below does not leak into the
        # caller's list.
        groupby_columns = list(groupby_columns)
    groupby_column = ",".join(groupby_columns)
    results_df[groupby_column] = results_df[groupby_columns].astype(str).apply(
        lambda x: ",".join(x), axis=1)  # pylint: disable=unnecessary-lambda
    groupby_columns.append(groupby_column)
    return (results_df, groupby_columns, groupby_column)
def augment_results_df_with_config_trial_group_stats(exp_data: Optional[ExperimentData] = None,
                                                     *,
                                                     results_df: Optional[pandas.DataFrame] = None,
                                                     requested_result_cols: Optional[Iterable[str]] = None,
                                                     ) -> pandas.DataFrame:
    # pylint: disable=too-complex
    """
    Add a number of useful statistical measure columns to the results dataframe.

    In particular, for each numeric result, we add the following columns for each
    requested result column:

    - ".p50": the median of each config trial group results

    - ".p75": the p75 of each config trial group results

    - ".p90": the p90 of each config trial group results

    - ".p95": the p95 of each config trial group results

    - ".p99": the p99 of each config trial group results

    - ".mean": the mean of each config trial group results

    - ".stddev": the standard deviation of each config trial group results

    - ".var": the variance of each config trial group results

    - ".var_zscore": the zscore of this group (i.e., variance relative to the stddev
      of all group variances). This can be useful for filtering out outliers (e.g.,
      configs with high variance relative to others by restricting to abs < 2 to
      remove those two standard deviations from the mean variance across all config
      trial groups).

    Additionally, we add a "tunable_config_trial_group_size" column that indicates
    the number of trials using a particular config.
    Note: that column is added to the input ``results_df`` in place.

    Result columns are skipped when they look like start/end timestamps, are not
    numeric, or only contain a single unique value.

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : Optional[pandas.DataFrame]
        The results dataframe to augment, by default None to use the results_df property.
    requested_result_cols : Optional[Iterable[str]]
        Which results columns to augment, by default None to use all results columns
        that look numeric. Names may be given with or without the
        ``ExperimentData.RESULT_COLUMN_PREFIX``; names that do not match a column
        in the dataframe are ignored.

    Returns
    -------
    pandas.DataFrame
        The augmented results dataframe.

    Raises
    ------
    ValueError
        If neither exp_data nor results_df is provided, or if the results contain
        at most one "tunable_config_id" group.
    """
    if results_df is None:
        if exp_data is None:
            raise ValueError("Either exp_data or results_df must be provided.")
        results_df = exp_data.results_df
    results_groups = results_df.groupby("tunable_config_id")
    if len(results_groups) <= 1:
        raise ValueError(f"Not enough data: {len(results_groups)}")

    prefix = ExperimentData.RESULT_COLUMN_PREFIX
    if requested_result_cols is None:
        result_cols = set(col for col in results_df.columns if col.startswith(prefix))
    else:
        # Materialize first: the iterable is traversed twice below and may be
        # a one-shot generator.
        requested = list(requested_result_cols)
        # Names that already carry the prefix ...
        result_cols = set(col for col in requested
                          if col.startswith(prefix) and col in results_df.columns)
        # ... and bare names, which need the prefix added to match a column.
        result_cols.update(prefix + col for col in requested
                           if prefix + col in results_df.columns)

    def compute_zscore_for_group_agg(
        results_groups_perf: "SeriesGroupBy",
        stats_df: pandas.DataFrame,
        result_col: str,
        agg: Union[Literal["mean"], Literal["var"], Literal["std"]]
    ) -> None:
        """
        Add a ``<result_col>.<agg>_zscore`` column to stats_df (in place).

        Expects stats_df to already contain the ``<result_col>.<agg>`` column.
        """
        results_groups_perf_aggs = results_groups_perf.agg(agg)    # TODO: avoid recalculating?
        # Compute the zscore of the chosen aggregate performance of each group into each row in the dataframe.
        agg_col = result_col + f".{agg}"
        stats_df[agg_col + "_mean"] = results_groups_perf_aggs.mean()
        stats_df[agg_col + "_stddev"] = results_groups_perf_aggs.std()
        stats_df[agg_col + "_zscore"] = \
            (stats_df[agg_col] - stats_df[agg_col + "_mean"]) \
            / stats_df[agg_col + "_stddev"]
        # Drop the intermediate helper columns for *this* aggregate.
        stats_df.drop(columns=[agg_col + "_" + suffix for suffix in ("mean", "stddev")], inplace=True)

    augmented_results_df = results_df
    augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform("count")
    # Iterate in sorted order so the added columns appear in a deterministic order.
    for result_col in sorted(result_cols):
        if not result_col.startswith(prefix):
            continue
        if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
            # Ignore computing variance on things like that look like timestamps.
            continue
        if not is_numeric_dtype(results_df[result_col]):
            continue
        if results_df[result_col].unique().size == 1:
            continue
        results_groups_perf = results_groups[result_col]
        stats_df = pandas.DataFrame()
        stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
        stats_df[result_col + ".var"] = results_groups_perf.transform("var")
        stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)

        compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
        quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
        for quantile in quantiles:     # TODO: can we do this in one pass?
            quantile_col = result_col + f".p{int(quantile*100)}"
            stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
        augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
    return augmented_results_df
def limit_top_n_configs(exp_data: Optional[ExperimentData] = None,
                        *,
                        results_df: Optional[pandas.DataFrame] = None,
                        objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
                        top_n_configs: int = 10,
                        method: Literal["mean", "median", "p50", "p75", "p90", "p95", "p99"] = "mean",
                        ) -> Tuple[pandas.DataFrame, List[int], Dict[str, bool]]:
    # pylint: disable=too-many-locals
    """
    Utility function to process the results and determine the best performing
    configs including potential repeats to help assess variability.

    Parameters
    ----------
    exp_data : Optional[ExperimentData]
        The ExperimentData (e.g., obtained from the storage layer) to operate on.
    results_df : Optional[pandas.DataFrame]
        The results dataframe to augment, by default None to use the results_df property.
    objectives : Optional[Dict[str, Literal["min", "max"]]]
        Which result column(s) to use for sorting the configs, and in which direction ("min" or "max").
        By default None to automatically select the experiment objectives.
    top_n_configs : int, optional
        How many of the best configs to select, by default 10.
        The default config is always part of the output: it is placed first,
        even when it did not make the top-N cut.
    method: Literal["mean", "median", "p50", "p75", "p90", "p95", "p99"] = "mean",
        Which statistical method to use when sorting the config groups before determining the cutoff, by default "mean".
        "median" is treated as an alias of "p50".

    Returns
    -------
    (top_n_config_results_df, top_n_config_ids, orderby_cols) : Tuple[pandas.DataFrame, List[int], Dict[str, bool]]
        The filtered results dataframe, the config ids, and the columns used to order the configs
        (mapped to whether they are sorted ascending).

    Raises
    ------
    ValueError
        If method is not one of the supported values.
    """
    # Do some input checking first.
    if method not in ["mean", "median", "p50", "p75", "p90", "p95", "p99"]:
        raise ValueError(f"Invalid method: {method}")
    # The augmented stats only provide a ".p50" column for the median, so map
    # the "median" alias onto it instead of looking up a missing ".median" column.
    stat_suffix = "p50" if method == "median" else method

    # Prepare the orderby columns.
    (results_df, objs_cols) = expand_results_data_args(exp_data, results_df=results_df, objectives=objectives)
    assert isinstance(results_df, pandas.DataFrame)

    # Augment the results dataframe with some useful stats.
    results_df = augment_results_df_with_config_trial_group_stats(
        exp_data=exp_data,
        results_df=results_df,
        requested_result_cols=objs_cols.keys(),
    )
    # Note: mypy seems to lose its mind for some reason and keeps forgetting that
    # results_df is not None and is in fact a DataFrame, so we periodically assert
    # it in this func for now.
    assert results_df is not None
    orderby_cols: Dict[str, bool] = {
        obj_col + f".{stat_suffix}": ascending for (obj_col, ascending) in objs_cols.items()
    }

    config_id_col = "tunable_config_id"
    group_id_col = "tunable_config_trial_group_id"  # first trial_id per config group
    trial_id_col = "trial_id"

    if exp_data is None:
        # Without experiment metadata, fall back to the config used by the
        # earliest trial. (Look up that trial's *config* id rather than using
        # the trial id itself as a config id.)
        first_trial_mask = results_df[trial_id_col] == results_df[trial_id_col].min()
        default_config_id = results_df.loc[first_trial_mask, config_id_col].iloc[0]
    else:
        default_config_id = exp_data.default_tunable_config_id
    assert default_config_id is not None, "Failed to determine default config id."

    # Filter out configs whose variance is too large.
    # But also make sure the default configs is still in the resulting dataframe
    # (for comparison purposes).
    for obj_col in objs_cols:
        assert results_df is not None
        if method == "mean":
            singletons_mask = results_df["tunable_config_trial_group_size"] == 1
        else:
            singletons_mask = results_df["tunable_config_trial_group_size"] > 1
        results_df = results_df.loc[(
            (results_df[f"{obj_col}.var_zscore"].abs() < 2)
            | (singletons_mask)
            | (results_df[config_id_col] == default_config_id)
        )]
    assert results_df is not None

    # Also, filter results that are worse than the default.
    default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
    for (orderby_col, ascending) in orderby_cols.items():
        default_vals = default_config_results_df[orderby_col].unique()
        assert len(default_vals) == 1
        default_val = default_vals[0]
        assert results_df is not None
        if ascending:
            results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
        else:
            results_df = results_df.loc[(results_df[orderby_col] >= default_val)]

    # Now regroup and filter to the top-N configs by their group performance dimensions.
    assert results_df is not None
    orderby_col_names = list(orderby_cols.keys())
    # Index with a list (not a dict_keys view) of column names.
    group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[orderby_col_names]
    top_n_config_ids: List[int] = group_results_df.sort_values(
        by=orderby_col_names, ascending=list(orderby_cols.values())).head(top_n_configs).index.tolist()

    # Remove the default config if it's included. We'll add it back later.
    if default_config_id in top_n_config_ids:
        top_n_config_ids.remove(default_config_id)
    # Get just the top-n config results.
    # Sort by the group ids.
    top_n_config_results_df = results_df.loc[(
        results_df[config_id_col].isin(top_n_config_ids)
    )].sort_values([group_id_col, config_id_col, trial_id_col])
    # Place the default config at the top of the list.
    top_n_config_ids.insert(0, default_config_id)
    top_n_config_results_df = pandas.concat([default_config_results_df, top_n_config_results_df], axis=0)
    return (top_n_config_results_df, top_n_config_ids, orderby_cols)
def plot_optimizer_trends(
    exp_data: Optional[ExperimentData] = None,
    *,
    results_df: Optional[pandas.DataFrame] = None,
    objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> None:
    """
    Plots the optimizer trends for the Experiment.

    For each objective, one figure is shown with a box plot of the results of
    each config trial group and a line plot of the incumbent (best seen so far)
    group mean, on a log-scaled y-axis.

    Parameters
    ----------
    exp_data : ExperimentData
        The ExperimentData (e.g., obtained from the storage layer) to plot.
    results_df : Optional["pandas.DataFrame"]
        Optional results_df to plot.
        If not provided, defaults to exp_data.results_df property.
    objectives : Optional[Dict[str, Literal["min", "max"]]]
        Optional objectives to plot.
        If not provided, defaults to exp_data.objectives property.
    """
    (results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    (results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)

    for (objective_column, ascending) in obj_cols.items():
        incumbent_column = objective_column + ".incumbent"

        # Determine the mean of each config trial group to match the box plots.
        group_results_df = results_df.groupby(groupby_columns)[objective_column].mean()\
            .reset_index().sort_values(groupby_columns)
        #
        # Note: technically the optimizer (usually) uses the *first* result for a
        # given config trial group before moving on to a new config (x-axis), so
        # plotting the mean may be slightly misleading when trying to understand the
        # actual path taken by the optimizer in case of high variance samples.
        # Here's a way to do that, though it can also be misleading if the optimizer
        # later gets a worse value for that config group as well.
        #
        # group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
        #   groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()

        # Calculate the incumbent (best seen so far)
        if ascending:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
        else:
            group_results_df[incumbent_column] = group_results_df[objective_column].cummax()

        (_fig, axis) = plt.subplots(figsize=(15, 5))

        # Result of each set of trials for a config
        sns.boxplot(
            data=results_df,
            x=groupby_column,
            y=objective_column,
            ax=axis,
        )

        # Results of the best so far.
        axis = sns.lineplot(
            data=group_results_df,
            x=groupby_column,
            y=incumbent_column,
            alpha=0.7,
            label="Mean of Incumbent Config Trial Group",
            ax=axis,
        )

        plt.yscale('log')
        plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))

        plt.xlabel("Config Trial Group ID, Config ID")
        plt.xticks(rotation=90, fontsize=8)

        # Parenthesize the conditional so that only the experiment id (and not
        # the whole title) is dropped when no exp_data is available.
        experiment_id = exp_data.experiment_id if exp_data is not None else ""
        plt.title("Optimizer Trends for Experiment: " + experiment_id)
        plt.grid()
        plt.show()  # type: ignore[no-untyped-call]
def plot_top_n_configs(exp_data: Optional[ExperimentData] = None,
                       *,
                       results_df: Optional[pandas.DataFrame] = None,
                       objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
                       with_scatter_plot: bool = False,
                       **kwargs: Any,
                       ) -> None:
    # pylint: disable=too-many-locals
    """
    Plots the top-N configs along with the default config for the given ExperimentData.

    Intended to be used from a Jupyter notebook.

    One violin plot figure (log-scaled y-axis) is shown per ordering column
    returned by limit_top_n_configs.

    Parameters
    ----------
    exp_data: ExperimentData
        The experiment data to plot.
    results_df : Optional["pandas.DataFrame"]
        Optional results_df to plot.
        If not provided, defaults to exp_data.results_df property.
    objectives : Optional[Dict[str, Literal["min", "max"]]]
        Optional objectives to plot.
        If not provided, defaults to exp_data.objectives property.
    with_scatter_plot : bool
        Whether to also add scatter plot to the output figure.
    kwargs : dict
        Remaining keyword arguments are passed along to the limit_top_n_configs function.
        Only those matching its keyword-only parameters are forwarded.
    """
    (results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
    top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
    if "results_df" not in top_n_config_args:
        top_n_config_args["results_df"] = results_df
    if "objectives" not in top_n_config_args:
        top_n_config_args["objectives"] = objectives
    (top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(exp_data=exp_data, **top_n_config_args)

    (top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(top_n_config_results_df)
    # Number of distinct groups shown, not counting the default config.
    top_n = len(top_n_config_results_df[groupby_column].unique()) - 1

    for (orderby_col, ascending) in orderby_cols.items():
        opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
        (_fig, axis) = plt.subplots()
        sns.violinplot(
            data=top_n_config_results_df,
            x=groupby_column,
            y=orderby_col,
            ax=axis,
        )
        if with_scatter_plot:
            sns.scatterplot(
                data=top_n_config_results_df,
                x=groupby_column,
                y=orderby_col,
                legend=None,
                ax=axis,
            )
        plt.grid()
        (xticks, xlabels) = plt.xticks()
        # default should be in the first position based on top_n_configs() return
        xlabels[0] = "default"  # type: ignore[call-overload]
        plt.xticks(xticks, xlabels)     # type: ignore[arg-type]
        plt.xlabel("Config Trial Group, Config ID")
        plt.xticks(rotation=90)
        plt.ylabel(opt_tgt)
        plt.yscale('log')
        # Ascending order means the objective is being minimized.
        extra_title = "(lower is better)" if ascending else "(higher is better)"
        plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
        plt.show()  # type: ignore[no-untyped-call]