Source code for vivainsights.create_bar

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The code defines a function `create_bar` that calculates and visualizes the mean of a selected
metric, grouped by a selected HR variable. 

The metrics are first aggregated at a user-level prior to being aggregated at the level of the HR variable. The function `create_bar` returns either a plot object or a table, depending on the value passed to `return_type`. 
"""
import pandas as pd
import seaborn as sns
from vivainsights.extract_date_range import extract_date_range
from vivainsights.us_to_space import us_to_space
from vivainsights.totals_col import totals_col
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator
import matplotlib
    

[docs]
def create_bar_calc(
    data: pd.DataFrame,
    metric: str,
    hrvar: str,
    mingroup = 5,
    stats = False
    ):
    """
    Calculate the mean of a selected metric, grouped by a selected HR variable.

    The metric is first averaged per person (`PersonId`) within each `hrvar`
    value, and those person-level averages are then averaged per `hrvar` value.

    Parameters
    ----------
    data : pd.DataFrame
        Data containing a `PersonId` column as well as the `metric` and
        `hrvar` columns.
    metric : str
        Name of the column to average.
    hrvar : str
        Name of the column to group by.
    mingroup : int, optional
        Groups with fewer than `mingroup` distinct `PersonId` values are
        dropped from the output. Defaults to 5.
    stats : bool, optional
        If true, append the standard deviation, median, minimum and maximum
        of the person-level averages for each retained group. Defaults to False.

    Returns
    -------
    pd.DataFrame
        One row per retained group, sorted by `metric` in descending order,
        with columns `hrvar`, `metric` (group mean) and `n` (distinct
        `PersonId` count), plus `sd`, `median`, `min` and `max` when `stats`
        is true.
    """
    # Step 1: collapse to one value per person within each group
    person_level = (
        data.groupby(['PersonId', hrvar])[metric]
        .mean()
        .reset_index()
    )

    # Step 2: aggregate the person-level values per group
    by_group = person_level.groupby(hrvar)
    output = by_group.agg(
        metric = (metric, 'mean'),
        n = ('PersonId', 'nunique')
        )

    # Apply the minimum group size threshold before anything else is attached
    output = output[output['n'] >= mingroup]
    output = output.reset_index()
    output = output.sort_values(by = 'metric', ascending=False)

    if stats:
        stats_df = by_group.agg(
            sd = (metric, 'std'),
            median = (metric, 'median'),
            min = (metric, 'min'),
            max = (metric, 'max')
            )

        # Left join: keep only the groups that passed the `mingroup` filter,
        # in their sorted order. An outer join would bring the filtered-out
        # groups back (with their statistics) and re-sort the rows by key.
        output = pd.merge(output, stats_df, on=hrvar, how='left')

    return output



[docs]
def create_bar_viz(
    data: pd.DataFrame,
    metric: str,
    hrvar: str,
    mingroup = 5,
    percent: bool = False,
    plot_title = None,
    plot_subtitle = None):
    """
    Visualise the mean of a selected metric, grouped by a selected HR variable.

    The summary table is produced by `create_bar_calc()` and drawn as a
    horizontal bar chart with one bar per group, annotated with its value.

    Parameters
    ----------
    data : pd.DataFrame
        Data passed on to `create_bar_calc()` and `extract_date_range()`.
    metric : str
        Name of the metric to be plotted.
    hrvar : str
        Name of the column to group by.
    mingroup : int, optional
        Minimum group size, passed on to `create_bar_calc()`. Defaults to 5.
    percent : bool, optional
        If true, format the x-axis and the bar annotations as percentages,
        treating a value of 1.0 as 100%. Defaults to False.
    plot_title : str, optional
        Title of the plot. When None, `us_to_space(metric)` is used.
    plot_subtitle : str, optional
        Subtitle of the plot. When None, a default "Weekly average by ..."
        subtitle is used.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the bar chart.
    """
    sum_df = create_bar_calc(data, metric, hrvar, mingroup)
    caption_text = extract_date_range(data, return_type='text')

    # Title and subtitle text
    title_text = us_to_space(metric) if plot_title is None else plot_title

    if plot_subtitle is None:
        subtitle_text = f'Weekly average by {hrvar}'  # TODO: make this dynamic by date interval
    else:
        subtitle_text = plot_subtitle

    # Bars are drawn at explicit positions 0..n-1 and labelled afterwards.
    # Plotting against the raw group values only lines up with the y-limits
    # and ticks below when the groups are strings; numeric groups would be
    # placed at their numeric values instead.
    bar_positions = list(range(len(sum_df)))
    group_labels = sum_df[hrvar].astype(str)

    fig, ax = plt.subplots(figsize=(4, 6))

    # Create grid
    # Zorder tells it which layer to put it on. We are setting this to 1 and our data to 2 so the grid is behind the data.
    ax.grid(which="major", axis='x', color='#758D99', alpha=0.6, zorder=1)

    # Remove spines. Can be done 1 at a time or can slice with a list.
    ax.spines[['top', 'right', 'bottom']].set_visible(False)

    # Make left spine slightly thicker
    ax.spines['left'].set_linewidth(1.1)

    ax.barh(bar_positions, sum_df['metric'], color='#1d627e', zorder=2)

    if percent:
        # Set the x-axis format to percentage
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

    # Shrink y-lim to make plot a bit tighter
    # Using length of summary table to make it dynamic
    ax.set_ylim(-0.5, len(sum_df) - 0.5)

    # Reformat x-axis tick labels
    ax.xaxis.set_tick_params(labeltop=True,  # Put x-axis labels on top
                             labelbottom=False,  # Set no x-axis labels on bottom
                             bottom=False,  # Set no ticks on bottom
                             labelsize=9,  # Set tick label size
                             pad=-1)  # Lower tick labels a bit

    ax.yaxis.set_tick_params(pad=10,  # Pad tick labels so they don't go over y-axis
                             labelsize=9,  # Set label size
                             bottom=False)  # Set no ticks on bottom/left

    # Reformat y-axis tick labels
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(group_labels, ha='right')

    # Add in line and tag (coordinates are in figure fractions via fig.transFigure)
    ax.plot([-.35, .87],  # Set width of line
            [1.02, 1.02],  # Set height of line
            transform=fig.transFigure,  # Set location relative to plot
            clip_on=False,
            color='#fe7f4f',
            linewidth=.6)

    ax.add_patch(plt.Rectangle((-.35, 1.02),  # Set location of rectangle by lower left corner
                               0.12,  # Width of rectangle
                               -0.02,  # Height of rectangle. Negative so it goes down.
                               facecolor='#fe7f4f',
                               transform=fig.transFigure,
                               clip_on=False,
                               linewidth=0))

    # Add in title, subtitle, and caption
    ax.text(x=-.35, y=.96, s=title_text, transform=fig.transFigure, ha='left', fontsize=13, weight='bold', alpha=.8)
    ax.text(x=-.35, y=.925, s=subtitle_text, transform=fig.transFigure, ha='left', fontsize=11, alpha=.8)
    ax.text(x=-.35, y=.08, s=caption_text, transform=fig.transFigure, ha='left', fontsize=9, alpha=.7)

    # Annotate each bar with its value
    if percent:
        ax.bar_label(ax.containers[0], labels=[f"{100 * value:.0f}%" for value in sum_df['metric']], label_type="edge",
                     padding=3)
    else:
        ax.bar_label(ax.containers[0], fmt='%.0f', label_type='edge', padding=3)

    ax.margins(y=0.3)  # pad the spacing between the number and the edge of the figure

    # return the plot object
    return fig



[docs]
def create_bar(
    data: pd.DataFrame,
    metric: str,
    hrvar: str,
    mingroup: int = 5,
    percent: bool = False,
    return_type: str = "plot",
    plot_title = None,
    plot_subtitle = None):
    """
    Compute or plot the mean of a metric, grouped by an HR variable.

    This is the public entry point of the module. It delegates to
    `create_bar_viz()` when a plot is requested and to `create_bar_calc()`
    when a table is requested.

    Parameters
    ----------
    data : pd.DataFrame
        Person query data.
    metric : str
        Name of the metric to be analysed.
    hrvar : str
        Name of the organizational attribute to be used for grouping.
        When None, `data` is passed through `totals_col()` and the grouping
        column is set to "Total".
    mingroup : int, optional
        Minimum group size. Defaults to 5.
    percent : bool, optional
        Whether to display values as percentages. Only used for the plot.
        Defaults to False.
    return_type : str, optional
        "plot" to return the chart or "table" to return the summary table.
        Defaults to "plot".
    plot_title : str, optional
        Title of the plot. Defaults to None.
    plot_subtitle : str, optional
        Subtitle of the plot. Defaults to None.

    Returns
    -------
    Various
        The result of `create_bar_viz()` for "plot", the result of
        `create_bar_calc()` for "table", or an error message string for any
        other value of `return_type`.

    Example
    -------
    >>> create_bar(pq_data, metric = "Collaboration_hours", hrvar = "LevelDesignation")
    """
    # No HR variable supplied: group on the "Total" column instead
    # (presumably added by totals_col -- verify against that helper).
    if hrvar is None:
        data = totals_col(data)
        hrvar = "Total"

    if return_type == "plot":
        return create_bar_viz(
            data=data,
            metric=metric,
            hrvar=hrvar,
            percent=percent,
            mingroup=mingroup,
            plot_title=plot_title,
            plot_subtitle=plot_subtitle,
        )

    if return_type == "table":
        return create_bar_calc(
            data=data,
            metric=metric,
            hrvar=hrvar,
            mingroup=mingroup,
        )

    # Unrecognised return_type: report it as a message rather than raising
    return "Invalid input. Please check your inputs and try again."