# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module returns a network plot given a data frame containing a group-to-group query.
"""
import pandas as pd
from igraph import *
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import re
import random
[docs]
def network_g2g(
    data,
    primary=None,
    secondary=None,
    metric="Group_collaboration_time_invested",
    algorithm="fr",
    node_colour="lightblue",
    exc_threshold=0.1,
    org_count=None,
    node_scale=1,
    edge_scale=10,
    subtitle="Collaboration Across Organizations",
    return_type="plot",
):
    """
    Build a group-to-group collaboration network from a group-to-group query.

    For every (primary, secondary) pair the metric is averaged, and each
    average is then expressed as a share of the primary group's total
    (``metric_prop``). Rows whose secondary value is ``"Within Group"`` are
    attributed to the primary group itself, and any row involving
    ``"Other_Collaborators"`` is dropped before the shares are computed.

    Parameters
    ----------
    data : pandas.DataFrame
        Data frame containing a group-to-group query.
    primary : str, optional
        Name of the primary collaborator column. When omitted, the first
        column whose name starts with ``PrimaryCollaborator_`` is used and a
        message is printed.
    secondary : str, optional
        Name of the secondary collaborator column. When omitted, the first
        column whose name starts with ``SecondaryCollaborator_`` is used and a
        message is printed.
    metric : str
        Name of the metric column. Defaults to
        ``"Group_collaboration_time_invested"``.
    algorithm : str
        Layout name passed to ``igraph.Graph.layout``. Defaults to ``"fr"``
        (Fruchterman-Reingold force-directed layout).
    node_colour : str or dict
        Colour specification forwarded to ``setColor``. Defaults to
        ``"lightblue"``. ``"vary"`` gives every node a random colour; a
        dictionary maps group names to colours.
    exc_threshold : float
        Exclusion threshold. Only edges whose ``metric_prop`` is strictly
        greater than this value are kept. Defaults to 0.1. Has no effect on
        the ``"data"`` and ``"table"`` outputs.
    org_count : pandas.DataFrame, optional
        Sizes of the groups. Must contain a column named after ``secondary``
        with the ``SecondaryCollaborator_`` prefix removed (e.g.
        ``"Organization"``) and a numeric column ``"n"``. Node size is
        ``n / 100``. Defaults to ``None``, where every node gets a fixed size
        of 0.4.
    node_scale : float
        Multiplier applied to every node size. Defaults to 1.
    edge_scale : float
        Multiplier applied to edge widths in the plotted graph only.
        Defaults to 10.
    subtitle : str
        Second line of the plot title.
    return_type : str
        One of ``"plot"`` (default), ``"table"``, ``"network"`` or ``"data"``.

    Returns
    -------
    Depends on ``return_type``:

    - ``"plot"``: draws the network (self-collaboration removed) on a
      matplotlib figure and returns the result of ``plt.show()``.
    - ``"table"``: data frame. Primary-by-secondary matrix of ``metric_prop``.
    - ``"network"``: ``igraph.Graph`` including self-collaboration loops, with
      unscaled ``metric_prop`` as edge attribute.
    - ``"data"``: data frame. Long table with columns ``PrimaryOrg``,
      ``SecondaryOrg`` and ``metric_prop``.

    Raises
    ------
    ValueError
        If ``return_type`` is not one of the supported values.

    Example
    -------
    >>> network_g2g(data = vi.load_g2g_data(), metric = "Group_meeting_count")
    # Return a network visual
    >>> network_g2g(data = vi.load_g2g_data(), return_type = "table")
    # Return the interaction matrix
    >>> network_g2g(data = vi.load_g2g_data(), exc_threshold = 0)
    # Return a network visual with no exclusion threshold
    """
    # Fail fast on a bad request instead of doing all the data work first.
    if return_type not in ("plot", "table", "network", "data"):
        raise ValueError("Please enter a valid input for 'return_type'.")

    if primary is None:
        # Only return first match
        primary = data.filter(regex="^PrimaryCollaborator_").columns[0]
        print("Primary field not provided. Assuming {} as the primary variable.".format(primary))

    if secondary is None:
        # Only return first match
        secondary = data.filter(regex="^SecondaryCollaborator_").columns[0]
        print("Secondary field not provided. Assuming {} as the secondary variable.".format(secondary))

    # Name of the HR variable; used to join `org_count` onto the node names.
    hrvar_string = re.sub("SecondaryCollaborator_", "", string=secondary)

    # Warn if 'Within Group' is not in the data
    if "Within Group" not in data[secondary].unique().tolist():
        print("Warning: Within Group variable is not found in the " + secondary + " variable. The analysis may be excluding in-group collaboration.")

    plot_data = data.rename(columns={primary: "PrimaryOrg", secondary: "SecondaryOrg", metric: "Metric"})
    # In-group collaboration is recorded against the primary group's own name.
    plot_data = plot_data.assign(
        SecondaryOrg=np.where(
            plot_data.SecondaryOrg == "Within Group",
            plot_data.PrimaryOrg,
            plot_data.SecondaryOrg,
        )
    )
    plot_data = plot_data.groupby(["PrimaryOrg", "SecondaryOrg"]).agg({"Metric": "mean"}).reset_index()

    is_other = (plot_data["PrimaryOrg"] == "Other_Collaborators") | (plot_data["SecondaryOrg"] == "Other_Collaborators")
    plot_data = plot_data[~is_other].reset_index(drop=True)

    # Share of each primary group's total. `transform` keeps the grouping
    # column in place, unlike `groupby.apply`, whose handling of grouping
    # columns differs between pandas versions.
    group_totals = plot_data.groupby("PrimaryOrg")["Metric"].transform("sum")
    plot_data = plot_data.assign(metric_prop=plot_data["Metric"] / group_totals)
    plot_data = plot_data.loc[:, ["PrimaryOrg", "SecondaryOrg", "metric_prop"]]

    if return_type == "table":
        # return a 'tidy' matrix
        return plot_data.pivot(index="PrimaryOrg", columns="SecondaryOrg", values="metric_prop")

    if return_type == "data":
        # return long table
        return plot_data

    # Exclusion threshold ONLY applies to the network output and the plot.
    # Explicit copies so the assignments below never write into a view.
    edges = plot_data[plot_data["metric_prop"] > exc_threshold].copy()
    for column in ("PrimaryOrg", "SecondaryOrg"):
        # Line breaks instead of spaces keep node labels compact.
        edges[column] = edges[column].str.replace(" ", "\n")

    # A version of the edge list without self-collaboration, used for plotting.
    edges_noloops = edges[edges["PrimaryOrg"] != edges["SecondaryOrg"]].copy()
    # Only the plotted graph gets scaled edge widths.
    edges_noloops["metric_prop"] = edges_noloops["metric_prop"] * edge_scale

    # 'metric_prop' becomes the edge attribute of both graphs.
    g = ig.Graph.TupleList(edges.itertuples(index=False), directed=False, edge_attrs=["metric_prop"])
    g_noloops = ig.Graph.TupleList(edges_noloops.itertuples(index=False), directed=False, edge_attrs=["metric_prop"])

    for graph in (g, g_noloops):
        graph.vs["org_size"] = _g2g_org_sizes(graph.vs["name"], org_count, hrvar_string, node_scale)

    # Edge colour: transparent grey
    g_noloops.es["edge_colour"] = [(0.827, 0.827, 0.827, 0.5)] * g_noloops.ecount()

    if return_type == "network":
        return g  # 'igraph' object, self-collaboration included

    # Plot the version of the graph with no self-collaboration.
    g = g_noloops
    fig, ax = plt.subplots(figsize=(8, 8))
    ig.plot(
        g,
        layout=g.layout(algorithm),
        target=ax,
        vertex_label=g.vs["name"],
        vertex_frame_width=0,
        vertex_size=g.vs["org_size"],
        vertex_color=setColor(node_colour, g.vs["name"]),
        edge_width=g.es["metric_prop"],
        edge_color=g.es["edge_colour"],
    )
    plt.suptitle("Group to Group Collaboration" + '\n' + subtitle, fontsize=13)
    plt.figtext(0.95, 0.05, "Displays only collaboration above {}% of node's total collaboration".format(int(exc_threshold * 100)), ha="right", va="bottom", fontsize=8)
    return plt.show()


def _g2g_org_sizes(names, org_count, hrvar_string, node_scale):
    """Return one plotting size per node name, multiplied by ``node_scale``."""
    if org_count is None:
        # Imputed size if not specified
        sizes = [0.4] * len(names)
    else:
        sizes = (
            pd.DataFrame({"id": names})
            # Node names carry line breaks; org_count uses the spaced names.
            .assign(id=lambda org: org["id"].str.replace("\n", " "))
            .merge(org_count, how="left", left_on="id", right_on=hrvar_string)
            .assign(n=lambda org: org["n"] / 100)  # scale for plotting
            .loc[:, "n"]
            .tolist()
        )
        # NOTE(review): groups missing from `org_count` get a NaN size from
        # the left merge — confirm whether a fallback size is wanted.
    return [size * node_scale for size in sizes]
[docs]
def setColor(node_colour, org):
    """
    Resolve the node colour specification for a list of node names.

    Parameters
    ----------
    node_colour : str or dict
        - ``"vary"``: a random hex colour is generated for every node.
        - any other string longer than one character: returned unchanged.
        - dict: maps node names (with spaces, not line breaks) to colours;
          a value of ``"random"`` is replaced by a random hex colour and
          nodes without an entry get ``"lightblue"``.
        - anything else: ``"lightblue"``.
    org : list of str
        Node names; line breaks are converted back to spaces before lookup.

    Returns
    -------
    str or list of str
        A single colour, or one colour per entry of ``org``.
    """
    fallback = "lightblue"
    labels = [name.replace("\n", " ") for name in org]

    if isinstance(node_colour, dict):
        # Work on a fresh mapping so the caller's dictionary is not modified.
        palette = {}
        for label, shade in node_colour.items():
            if shade == "random":
                shade = f"#{random.randint(0, 0xFFFFFF):06x}"
            palette[label] = shade
        return [palette.get(label, fallback) for label in labels]

    # Non-strings and strings of at most one character use the default colour.
    if not isinstance(node_colour, str) or len(node_colour) <= 1:
        return fallback

    if node_colour == "vary":
        # One random colour per node in the network
        return [f"#{random.randint(0, 0xFFFFFF):06x}" for _ in labels]

    return node_colour