Source code for vivainsights.identify_habit

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Identify recurring behavioral habits from Viva Insights metrics.
"""

__all__ = ['identify_habit']

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.create_boxplot import create_boxplot

def identify_habit(
    data,
    metric,
    threshold=1,
    width=1,
    max_window=4,
    hrvar=None,
    return_type="plot",
    plot_mode="time",
    figsize: tuple = None,
    fill_col=("#E5E5E5", "#0078D4"),
):
    """Identify recurring behavioral habits from a metric.

    Analyses a dataset to determine whether a habit exists based on a
    specified metric and thresholds. Can return classified data, plots, or
    summary statistics. The input DataFrame is never modified; all derived
    columns are added to an internal copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must include ``PersonId``, ``MetricDate``, and the
        *metric* column.
    metric : str
        Column name of the metric to analyse.
    threshold : int, default 1
        Minimum value for a week to count as a qualifying event
        (``metric >= threshold``).
    width : int, default 1
        Number of qualifying events, within the trailing *max_window*
        periods, required to establish a habit.
    max_window : int, default 4
        Maximum number of periods to consider for a habit.
    hrvar : str or None, default None
        Column name for grouping (used with ``plot_mode="boxplot"``).
    return_type : str, default "plot"
        ``"data"`` for a classified DataFrame, ``"plot"`` for a chart, or
        ``"summary"`` for summary statistics.
    plot_mode : str, default "time"
        ``"time"`` for a stacked bar time series, ``"boxplot"`` for a boxplot
        by group. Only consulted when ``return_type="plot"``.
    figsize : tuple or None, default None
        Figure size ``(width, height)`` in inches. Defaults to ``(8, 6)``.
    fill_col : tuple, default ("#E5E5E5", "#0078D4")
        Colours for the time plot, as ``(no-habit colour, habit colour)``.

    Returns
    -------
    pandas.DataFrame, matplotlib.figure.Figure, or dict
        Classified data, a plot, or summary statistics depending on
        *return_type*. The classified data is sorted by ``PersonId`` and
        ``MetricDate`` and carries the extra columns ``cumsum_value``,
        ``lagged_cumsum``, ``sum_last_w`` and ``IsHabit``.

    Raises
    ------
    ValueError
        If *max_window* or *width* is not a positive integer, or if
        *return_type* / *plot_mode* is not one of the supported values.

    Examples
    --------
    Return classified data with habit labels:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.identify_habit(pq_data, metric='Multitasking_hours', threshold=1, width=9, max_window=12, return_type="data")

    Return a plot of habit classification over time:

    >>> vi.identify_habit(pq_data, metric='Multitasking_hours', threshold=1, width=9, max_window=12, return_type="plot")

    Return a summary dictionary of habit statistics:

    >>> vi.identify_habit(pq_data, metric='Multitasking_hours', threshold=1, width=9, max_window=12, return_type="summary")

    Group results by an HR variable:

    >>> vi.identify_habit(
    ...     pq_data,
    ...     metric='Multitasking_hours',
    ...     threshold=1,
    ...     width=9,
    ...     max_window=12,
    ...     hrvar='Organization',
    ...     return_type="plot",
    ... )
    """
    # Validate every argument up front so bad input fails before any work.
    if not isinstance(max_window, int) or max_window <= 0:
        raise ValueError("`max_window` must be a positive integer.")
    if not isinstance(width, int) or width <= 0:
        raise ValueError("`width` must be a positive integer.")
    if return_type not in ("data", "plot", "summary"):
        raise ValueError("Invalid return type")
    if return_type == "plot" and plot_mode not in ("time", "boxplot"):
        raise ValueError("Invalid plot mode")

    # `assign` returns a new frame, so the caller's DataFrame is left
    # untouched (the date conversion and helper columns live on the copy).
    data = data.assign(
        MetricDate=pd.to_datetime(data['MetricDate'])
    ).sort_values(by=['PersonId', 'MetricDate'])

    # Running count of qualifying periods per person. Subtracting the count
    # from `max_window` rows earlier yields the number of qualifying periods
    # inside the trailing window.
    qualifying = (data[metric] >= threshold).astype(int)
    data['cumsum_value'] = qualifying.groupby(data['PersonId']).cumsum()
    data['lagged_cumsum'] = (
        data.groupby('PersonId')['cumsum_value'].shift(max_window, fill_value=0)
    )
    data['sum_last_w'] = data['cumsum_value'] - data['lagged_cumsum']
    data['IsHabit'] = data['sum_last_w'] >= width

    if return_type == "data":
        return data

    if return_type == "summary":
        return _summarise_habit(data)

    # return_type == "plot"
    if plot_mode == "time":
        return _plot_habit_over_time(data, metric, figsize, fill_col)

    # plot_mode == "boxplot": the flag is converted to 0/1 so it can be
    # plotted as a numeric metric.
    plot_data = data.assign(IsHabit=data['IsHabit'].astype(int))
    return create_boxplot(
        data=plot_data,
        metric='IsHabit',
        hrvar=hrvar,
        mingroup=1,
        return_type="plot",
    )


def _plot_habit_over_time(data, metric, figsize, fill_col):
    """Draw the share of persons with / without the habit per MetricDate."""
    counts = (
        data.groupby(['MetricDate', 'IsHabit'])
        .agg(n=('PersonId', 'nunique'))
        .reset_index()
    )
    counts['IsHabit'] = counts['IsHabit'].map({True: "Habit", False: "No Habit"})

    # Force both classes to exist in a fixed order so that colours and legend
    # entries always line up, even when one class is absent from the data.
    shares = (
        counts.pivot(index='MetricDate', columns='IsHabit', values='n')
        .reindex(columns=["Habit", "No Habit"])
        .fillna(0)
    )
    shares = shares.div(shares.sum(axis=1), axis=0)  # counts -> proportions

    fig, ax = plt.subplots(figsize=figsize if figsize else (8, 6))
    shares.plot(
        kind='bar',
        stacked=True,
        # Column order is (Habit, No Habit); fill_col is (no-habit, habit).
        color=[fill_col[1], fill_col[0]],
        ax=ax,
    )
    ax.set_title(
        f"Habitual Behavior - {metric.replace('_', ' ')}",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_ylabel("Percentage", fontsize=12)
    ax.set_xlabel("MetricDate", fontsize=12)
    # `.0%` rounds, whereas int(y * 100) would truncate float noise
    # (e.g. 0.29 -> "28%").
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))
    # Labels come from the column names, so they cannot drift out of sync
    # with the plotted series.
    ax.legend(title="Is Habit", fontsize=10)
    ax.set_xticks(range(len(shares.index)))
    ax.set_xticklabels(
        shares.index.strftime("%b %d, %y"), rotation=45, ha="right"
    )
    plt.tight_layout()
    return fig


def _summarise_habit(data):
    """Build the summary dictionary from the classified person-week data."""
    is_habit = data['IsHabit']
    latest = data.loc[data['MetricDate'] == data['MetricDate'].max(), 'IsHabit']

    return {
        # Snapshot of the latest MetricDate present in the data.
        "Most recent week - Total persons with habit": latest.sum(),
        "Most recent week - % of pop with habit": latest.mean(),
        # Distribution statistics computed over the row-level IsHabit flag.
        "Mean - % of Person-weeks with habit": is_habit.mean(),
        "Median - % of Person-weeks with habit": is_habit.median(),
        "Min - % of Person-weeks with habit": is_habit.min(),
        "Max - % of Person-weeks with habit": is_habit.max(),
        "SD - % of Person-weeks with habit": is_habit.std(),
        # Overall volumes.
        "Total Person-weeks with habit": is_habit.sum(),
        "Total Person-weeks": len(data),
        "% of Person-weeks with habit": is_habit.mean(),
        "Total Persons": data['PersonId'].nunique(),
        "Total Weeks": data['MetricDate'].nunique(),
    }