Source code for vivainsights.identify_habit

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.create_boxplot import create_boxplot

def _habit_time_plot(data, metric, fill_col):
    """Build the stacked-bar figure showing the share of persons with a habit per date."""
    habit_summary = (
        data.groupby(['MetricDate', 'IsHabit'])
        .agg(n=('PersonId', 'nunique'))
        .reset_index()
    )
    habit_summary['IsHabit'] = habit_summary['IsHabit'].map(
        {True: "Habit", False: "No Habit"}
    )
    habit_pivot = habit_summary.pivot(
        index='MetricDate', columns='IsHabit', values='n'
    ).fillna(0)
    # Pin the column order (and add a missing class as zeros) so that the
    # colour assignment and the legend entries always refer to the same
    # category, even when every person falls into a single class.
    habit_pivot = habit_pivot.reindex(columns=["Habit", "No Habit"], fill_value=0)
    # Convert counts to row-wise proportions.
    habit_pivot = habit_pivot.div(habit_pivot.sum(axis=1), axis=0)

    fig, ax = plt.subplots(figsize=(10, 6))
    habit_pivot.plot(
        kind='bar',
        stacked=True,
        # fill_col is (no-habit colour, habit colour); reversed so that the
        # "Habit" column is drawn first, i.e. stacked below "No Habit".
        color=fill_col[::-1],
        ax=ax
    )
    ax.set_title(
        f"Habitual Behavior - {metric.replace('_', ' ')}",
        fontsize=14,
        fontweight="bold"
    )
    ax.set_ylabel("Percentage", fontsize=12)
    ax.set_xlabel("MetricDate", fontsize=12)
    # Show the 0-1 proportions as whole percentages.
    ax.yaxis.set_major_formatter(
        plt.FuncFormatter(lambda y, _: f"{int(y * 100)}%")
    )
    # Labels come from the column names; passing an explicit label list
    # would be matched to the bars by position and could mislabel them.
    ax.legend(title="Is Habit", fontsize=10)
    ax.set_xticks(range(len(habit_pivot.index)))
    ax.set_xticklabels(
        habit_pivot.index.strftime("%b %d, %y"), rotation=45, ha="right"
    )
    plt.tight_layout()
    return fig


def _habit_summary(data):
    """Collect summary statistics of the `IsHabit` flag into a single dict."""
    recent_stats = data[data['MetricDate'] == data['MetricDate'].max()]
    recent_summary = {
        "Most recent week - Total persons with habit": recent_stats['IsHabit'].sum(),
        "Most recent week - % of pop with habit": recent_stats['IsHabit'].mean(),
    }
    dist_summary = {
        "Mean - % of Person-weeks with habit": data['IsHabit'].mean(),
        "Median - % of Person-weeks with habit": data['IsHabit'].median(),
        "Min - % of Person-weeks with habit": data['IsHabit'].min(),
        "Max - % of Person-weeks with habit": data['IsHabit'].max(),
        "SD - % of Person-weeks with habit": data['IsHabit'].std(),
    }
    person_week_summary = {
        "Total Person-weeks with habit": data['IsHabit'].sum(),
        "Total Person-weeks": len(data),
        "% of Person-weeks with habit": data['IsHabit'].mean(),
        "Total Persons": data['PersonId'].nunique(),
        "Total Weeks": data['MetricDate'].nunique(),
    }
    return {**recent_summary, **dist_summary, **person_week_summary}


def identify_habit(
    data,
    metric,
    threshold=1,
    width=1,
    max_window=4,
    hrvar=None,
    return_type="plot",
    plot_mode="time",
    fill_col=("#E5E5E5", "#0078D4")):
    """
    Name
    -----
    identify_habit

    Description
    -----------
    Identify habitual behavior over a given interval of time.

    For every person, each row is a "qualifying" period when `metric` is at
    least `threshold`. A row is flagged as a habit (`IsHabit`) when at least
    `width` of the most recent `max_window` rows for that person (including
    the current one) are qualifying. The input DataFrame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataset containing the data to analyze. Must include 'PersonId',
        'MetricDate', and the metric column.
    metric : str
        Column name of the metric to analyze.
    threshold : int, optional
        Minimum metric value for a row to count as qualifying. Default is 1.
    width : int, optional
        Number of qualifying counts required for a habit. Default is 1.
    max_window : int, optional
        Number of most recent rows per person over which qualifying counts
        are summed. Default is 4.
    hrvar : str, optional
        Column name for grouping (e.g., department or team); passed to
        `create_boxplot` when `plot_mode` is "boxplot". Default is None.
    return_type : str, optional
        Type of output to return. Must be one of the following:
        - "data": Returns a DataFrame with habit classification.
        - "plot": Returns a plot of habitual behavior.
        - "summary": Returns summary statistics.
        Default is "plot".
    plot_mode : str, optional
        Type of plot to generate if `return_type` is "plot". Must be one of
        the following:
        - "time": Stacked bar chart of the share of persons with/without a
          habit per `MetricDate`.
        - "boxplot": Boxplot of habitual behavior by group.
        Default is "time".
    fill_col : tuple, optional
        Colors for the time plot as (no-habit color, habit color).
        Default is ("#E5E5E5", "#0078D4").

    Returns
    -------
    Depending on the value of `return_type`, the function returns:
    - pandas.DataFrame: If `return_type` is "data". The frame is sorted by
      'PersonId' and 'MetricDate' and carries the added columns
      'cumsum_value', 'lagged_cumsum', 'sum_last_w' and 'IsHabit'.
    - matplotlib figure (or the output of `create_boxplot`): If
      `return_type` is "plot".
    - dict: Summary statistics if `return_type` is "summary".

    Raises
    ------
    ValueError
        If `max_window` or `width` is not a positive integer, or if
        `return_type` / `plot_mode` is not one of the supported values.

    Examples
    --------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.identify_habit(pq_data, metric='Multitasking_hours', threshold=1, width=9, max_window=12, return_type="data")
    >>> vi.identify_habit(pq_data, metric='Multitasking_hours', threshold=1, width=9, max_window=12, return_type="plot", plot_mode="time")
    >>> vi.identify_habit(pq_data, metric='Multitasking_hours', threshold=1, width=9, max_window=12, return_type="summary")
    """
    # Validate the numeric window arguments.
    if not isinstance(max_window, int) or max_window <= 0:
        raise ValueError("`max_window` must be a positive integer.")
    if not isinstance(width, int) or width <= 0:
        raise ValueError("`width` must be a positive integer.")

    # Fail fast on unsupported output options instead of after the
    # (potentially expensive) group-by computations.
    if return_type not in ("data", "plot", "summary"):
        raise ValueError("Invalid return type")
    if return_type == "plot" and plot_mode not in ("time", "boxplot"):
        raise ValueError("Invalid plot mode")

    # Work on a new frame so the caller's DataFrame is left untouched:
    # `assign` returns a copy with MetricDate parsed as datetime.
    data = data.assign(MetricDate=pd.to_datetime(data['MetricDate']))
    data = data.sort_values(by=['PersonId', 'MetricDate'])

    # Running count of qualifying rows per person.
    data['cumsum_value'] = data.groupby('PersonId')[metric].transform(
        lambda x: (x >= threshold).cumsum()
    )
    # Running count as it stood `max_window` rows earlier (0 before the
    # person's history starts) ...
    data['lagged_cumsum'] = data.groupby('PersonId')['cumsum_value'].shift(
        max_window, fill_value=0
    )
    # ... so the difference is the number of qualifying rows within the
    # last `max_window` rows.
    data['sum_last_w'] = data['cumsum_value'] - data['lagged_cumsum']
    data['IsHabit'] = data['sum_last_w'] >= width

    if return_type == "data":
        return data

    if return_type == "summary":
        return _habit_summary(data)

    # return_type == "plot"
    if plot_mode == "time":
        return _habit_time_plot(data, metric, fill_col)

    # plot_mode == "boxplot": create_boxplot needs a numeric metric, so the
    # boolean flag is converted to 0/1 on a copy.
    plot_data = data.copy()
    plot_data['IsHabit'] = plot_data['IsHabit'].astype(int)
    return create_boxplot(
        data=plot_data,
        metric='IsHabit',
        hrvar=hrvar,
        mingroup=1,
        return_type="plot"
    )