Source code for vivainsights.network_g2g

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module returns a network plot given a data frame containing a group-to-group query.
"""
import pandas as pd
from igraph import *
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import re
import random


[docs]
def network_g2g(data, primary=None, secondary=None, metric="Group_collaboration_time_invested", algorithm="fr", node_colour="lightblue", exc_threshold=0.1, org_count=None, node_scale = 1, edge_scale = 10, subtitle="Collaboration Across Organizations", return_type="plot"):
    """
    Name
    ----
    network_g2g

    Description
    ------------
    This function returns a network plot given a data frame containing a group-to-group query.

    Parameters
    ----------
    data : data frame
        Data frame containing a group-to-group query.
    primary : str
        String containing the variable name for the Primary Collaborator column.
    secondary : str
        String containing the variable name for the SecondaryCollaborator column.
    metric: str
        String containing the variable name for metric. Defaults to `Meeting_Count`.
    algorithm : str
        String to specify the node placement algorithm to be used. 
        - Defaults to `"fr"` for the force-directed algorithm of Fruchterman and Reingold. 
        - See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
    node_colour : str or dictionary 
        String or named vector to specify the colour to be used for displaying nodes. 
        - Defaults to `"lightblue"`. 
        - If `"vary"` is supplied, a different colour is shown for each node at random. 
        - If a named dictionary is supplied, the names must match the values of the variable provided for the `primary` and `secondary` columns. 
        - See example section for details.
    exc_threshold: Numeric value between 0 and 1 specifying the exclusion threshold to apply. 
        - Defaults to 0.1, which means that the plot will only display collaboration above 10% of a node's total collaboration. 
        - This argument has no impact on `"data"` or `"table"` return.
    org_count : optional 
        Optional data frame to provide the size of each organizationin the `secondary` attribute. 
        - The data frame should contain only two columns: 
        - Name of the `secondary` attribute excluding any prefixes, e.g. `"Organization"`. 
        - Must be of character or factor type. `"n"`. Must be of numeric type. 
        - Defaults to `None`, where node sizes will be fixed.
    node_scale : Numeric value controlling the size of the nodes. 1 keeps the size of the nodes as is. 
    edge_scale: Numeric value controlling the width of the edges. 1 keeps the size of the edges as is. Defaults to 10. 
    subtitle : str 
        String to override default plot subtitle.
    return_type : str
        String specifying what to return. This must be one of the following strings:
        - `"plot"`
        - `"table"`
        - `"network"`
        - `"data"`
        - Defaults to `"plot"`.

    Returns
    -------
    A different output is returned depending on the value passed to the `return` argument:
    - `"plot"`: 'ggplot' object. A group-to-group network plot.
    - `"table"`: data frame. An interactive matrix of the network.
    - `"network`: 'igraph' object used for creating the network plot.
    - `"data"`: data frame. A long table of the underlying data.

    Example
    -------
    
    >>> network_g2g(data = vi.load_g2g_data(), metric = "Group_meeting_count")
    # Return a network visual
    
    >>> network_g2g(data = vi.load_g2g_data(), return_type = "table")
    # Return the interaction matrix
    
    >>> network_g2g(data = vi.load_g2g_data(), exc_threshold = 0)
    # Return a network visual with no exclusion threshold
    
    """ 
    if primary is None:
        #Only return first match
        primary = data.filter(regex = "^PrimaryCollaborator_").columns[0] 
        print("Primary field not provided. Assuming {} as the primary variable.".format(primary))

    if secondary is None:
        #Only return first match
        secondary = data.filter(regex = "^SecondaryCollaborator_").columns[0]
        print("Secondary field not provided. Assuming {} as the secondary variable.".format(secondary))

    #Get string of HR variable (for grouping)
    hrvar_string = re.sub("SecondaryCollaborator_", "",  string = secondary)

    #Warn if 'Within Group' is not in the data
    if "Within Group" not in data[secondary].unique().tolist():
        print("Warning: Within Group variable is not found in the " + secondary + " variable. The analysis may be excluding in-group collaboration.")

    #Run plot_data
    plot_data = data.rename(columns={primary: "PrimaryOrg", secondary: "SecondaryOrg", metric: "Metric"})
    plot_data = plot_data.assign(SecondaryOrg=np.where(plot_data.SecondaryOrg == "Within Group", plot_data.PrimaryOrg, plot_data.SecondaryOrg))    
    plot_data = plot_data.groupby(["PrimaryOrg", "SecondaryOrg"]).agg({"Metric": "mean"}).reset_index()
    plot_data = plot_data.query('PrimaryOrg != "Other_Collaborators" & SecondaryOrg != "Other_Collaborators"')
    plot_data = plot_data.groupby("PrimaryOrg")
    plot_data = plot_data.apply(lambda func: func.assign(metric_prop=func.Metric / func.Metric.sum())).reset_index(drop=True)
    plot_data = plot_data.loc[:, ["PrimaryOrg", "SecondaryOrg", "metric_prop"]]

    if return_type == "table":

        # return a 'tidy' matrix
        table = plot_data.pivot(index = "PrimaryOrg", columns = "SecondaryOrg", values = "metric_prop")
        return table
    
    elif return_type == "data":

        # return long table
        return plot_data
    
    elif return_type in ["plot", "network"]:

        # create network object - one for export, one for plotting
        # exclusion threshold ONLY applies in network output and plotting 
        mynet_em = plot_data[plot_data['metric_prop'] > exc_threshold]
        mynet_em.loc[:, ['PrimaryOrg', 'SecondaryOrg']] = mynet_em[['PrimaryOrg', 'SecondaryOrg']].apply(lambda func: func.str.replace(' ', '\n'))
        
        # a version of the graph without self-collaboration
        mynet_em_noloops = mynet_em[mynet_em['PrimaryOrg'] != mynet_em['SecondaryOrg']]
        mynet_em_noloops.loc[:, 'metric_prop'] = mynet_em_noloops['metric_prop'] * edge_scale # only scale width for the plotting graph
        
        # Set 'metric_props' as edge attribute
        g = ig.Graph.TupleList(mynet_em.itertuples(index=False), directed=False, edge_attrs = ['metric_prop'])
        g_noloops = ig.Graph.TupleList(mynet_em_noloops.itertuples(index=False), directed=False, edge_attrs = ['metric_prop'])
        
        # Org count can vary by size
        if org_count is not None:
            g.vs["org_size"] = (
                pd.DataFrame({"id": g.vs["name"]})
                .assign(id=lambda org: org["id"].str.replace("\n", " "))
                .merge(org_count, how="left", left_on="id", right_on=hrvar_string)
                .assign(n=lambda org: org["n"] / 100) #scale for plotting
                .loc[:, "n"]
                .tolist()
            )
            
            g_noloops.vs["org_size"] = (
                pd.DataFrame({"id": g_noloops.vs["name"]})
                .assign(id=lambda org: org["id"].str.replace("\n", " "))
                .merge(org_count, how="left", left_on="id", right_on=hrvar_string)
                .assign(n=lambda org: org["n"] / 100) #scale for plotting
                .loc[:, "n"]
                .tolist()
            )
        else:
            #imputed size if not specified
            g.vs['org_size'] = (
                pd.DataFrame({"id": g.vs['name']})
                .assign(id=lambda org: org['id'].str.replace('\n', ' '))
                .assign(n=0.4)
                .loc[:, 'n']
                .tolist()
            )
             
            g_noloops.vs['org_size'] = (
                pd.DataFrame({"id": g_noloops.vs['name']})
                .assign(id=lambda org: org['id'].str.replace('\n', ' '))
                .assign(n=0.4)
                .loc[:, 'n']
                .tolist()
            )

        # scale the size of the nodes
        g.vs["org_size"] = [x*node_scale for x in g.vs["org_size"]] 
        g_noloops.vs["org_size"] = [x*node_scale for x in g_noloops.vs["org_size"]]
        
        # Add edge_colour with transparent grey
        g_noloops.es["edge_colour"] = [(0.827, 0.827, 0.827, 0.5)] * g_noloops.ecount()


        if return_type == "network":
            return g # return 'igraph' object
        else:
            # Keep multiple edges, remove loops
            # g = g.simplify(multiple = True, loops = True)
            
            g = g_noloops # use version of graph with no self-collaboration
            
            # plot object
            fig, ax = plt.subplots(figsize=(8, 8))
            ig.plot(
                g,
                layout=g.layout(algorithm),
                target=ax,
                vertex_label=g.vs["name"],
                vertex_frame_width=0,
                vertex_size=g.vs["org_size"],
                vertex_color=setColor(node_colour, g.vs["name"]),  
                edge_width= g.es["metric_prop"],                
                # edge_width=mynet_em["metric_prop"] * 1,
                # edge_alpha=0.2,
                edge_color= g.es["edge_colour"]
            )
            
            plt.suptitle("Group to Group Collaboration" + '\n' + subtitle, fontsize=13)
            plt.figtext(0.95, 0.05, "Displays only collaboration above {}% of node's total collaboration".format(int(exc_threshold * 100)), ha="right", va="bottom", fontsize=8)     
            return plt.show() #return 'ggplot' object
    else:
        raise ValueError("Please enter a valid input for 'return'.")

    

[docs]
def setColor(node_colour, org):
    org = [i.replace("\n", " ") for i in org]
    if isinstance(node_colour, str) and len(node_colour) > 1:
        if node_colour == "vary": #generate a random colour for each node in the network
            node_colour = [f"#{random.randint(0, 0xFFFFFF):06x}" for _ in range(len(org))]
        else:
            node_colour = node_colour #use the colour provided
    elif isinstance(node_colour, dict): #use dictionary to map each node to a colour
        node_colour = {node: colour for node, colour in node_colour.items()}
        for node, colour in node_colour.items():
            if colour == "random": 
                node_colour[node] = f"#{random.randint(0, 0xFFFFFF):06x}"
            else:
                node_colour[node] = colour
        node_colour = [node_colour.get(node, "lightblue") for node in org] #use default colour if key not found

    else: #default colour
        node_colour = "lightblue"

    return node_colour