In [ ]:
Copied!
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg
Run Presidio on structured / semi-structured data¶
This sample shows how Presidio can be extended to anonymize tabular data such as a table or data frame. It introduces methods for analyzing and anonymizing both lists and dicts.
Note: the sample inputs here are a Pandas DataFrame and a JSON-like nested dict, but the same approach can be used in other scenarios, such as querying SQL data or using Spark DataFrames.
Set up imports¶
In [3]:
Copied!
from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint
import pandas as pd
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import EngineResult
from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint
import pandas as pd
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import EngineResult
Example using sample tabular data¶
In [4]:
Copied!
# Column headers for the sample table.
columns = ["name phrase", "phone number phrase", "integer", "boolean"]

# One tuple per row, with values in the same order as `columns`.
sample_data = [
    ("Charlie likes this", "Please call 212-555-1234 after 2pm", 1, True),
    ("You should talk to Mike", "his number is 978-428-7111", 2, False),
    ("Mary had a little startup", "Phone number: 202-342-1234", 3, False),
]
In [5]:
Copied!
# Build a Pandas DataFrame from the sample rows, labelling the columns explicitly.
df = pd.DataFrame(data=sample_data, columns=columns)
# The bare last expression renders the frame as the cell output.
df
Out[5]:
name phrase | phone number phrase | integer | boolean | |
---|---|---|---|---|
0 | Charlie likes this | Please call 212-555-1234 after 2pm | 1 | True |
1 | You should talk to Mike | his number is 978-428-7111 | 2 | False |
2 | Mary had a little startup | Phone number: 202-342-1234 | 3 | False |
In [6]:
Copied!
# Convert the DataFrame into a plain dict that maps each column name to the
# list of that column's values (orient="list").
df_dict = df.to_dict(orient="list")
In [7]:
Copied!
# Pretty-print the column-oriented dict once to inspect its contents.
pprint.pprint(df_dict)
{'boolean': [True, False, False], 'integer': [1, 2, 3], 'name phrase': ['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], 'phone number phrase': ['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234']}
In [8]:
Copied!
# Create each engine exactly once; the cells below reuse these instances.
analyzer = AnalyzerEngine()
# The batch analyzer is built on top of the `analyzer` instance created above.
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
batch_anonymizer = BatchAnonymizerEngine()
In [9]:
Copied!
# Analyze every value of the column-oriented dict, using the English ("en") pipeline.
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
# Materialize the results into a list before displaying them.
# NOTE(review): presumably `analyze_dict` returns a lazy iterator, which is why
# it is wrapped in `list(...)` here -- confirm against the Presidio API.
analyzer_results = list(analyzer_results)
analyzer_results
Out[9]:
[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]), DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]), DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]), DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]
In [10]:
Copied!
# Anonymize the analyzed columns using the batch anonymizer.
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)
In [11]:
Copied!
# Turn the anonymized result back into a DataFrame.
scrubbed_df = pd.DataFrame(data=anonymizer_results)
In [12]:
Copied!
# Display the anonymized frame as the cell output.
scrubbed_df
Out[12]:
name phrase | phone number phrase | integer | boolean | |
---|---|---|---|---|
0 | <PERSON> likes this | Please call <PHONE_NUMBER> after <DATE_TIME> | 1 | True |
1 | You should talk to <PERSON> | his number is <PHONE_NUMBER> | 2 | False |
2 | <PERSON> had a little startup | Phone number: <PHONE_NUMBER> | 3 | False |
Example using JSON¶
In [13]:
Copied!
# Sample nested structure: a nested dict, a collection holding one string,
# a plain integer and a list of strings.
nested_dict = {
    "key_a": {"key_a1": "My phone number is 212-121-1424"},
    # NOTE(review): this is a set literal, not a list. A set is not a JSON type;
    # consider a list if this is meant to mirror a JSON document.
    "key_b": {"www.abc.com"},
    "key_c": 3,
    "names": ["James Bond", "Clark Kent", "Hakeem Olajuwon", "No name here!"],
}
# Print the input once so it can be compared with the anonymized output.
pprint.pprint(nested_dict)
{'key_a': {'key_a1': 'My phone number is 212-121-1424'}, 'key_b': {'www.abc.com'}, 'key_c': 3, 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}
In [14]:
Copied!
# Analyze dict
analyzer_results = batch_analyzer.analyze_dict(
    input_dict=nested_dict,
    language="en",
)

# Anonymize dict
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results,
)
# Print the anonymized structure once.
pprint.pprint(anonymizer_results)
{'key_a': {'key_a1': 'My phone number is <PHONE_NUMBER>'}, 'key_b': ['<URL>'], 'key_c': 3, 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}
Ignoring specific keys¶
In [15]:
Copied!
# Keys to leave out of the analysis; the recorded output shows their values
# coming back unchanged.
keys_to_skip = ["key_a1", "names"]

# Analyze dict
analyzer_results = batch_analyzer.analyze_dict(
    input_dict=nested_dict,
    language="en",
    keys_to_skip=keys_to_skip,
)

# Anonymize dict
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results,
)
# Print the anonymized structure once.
pprint.pprint(anonymizer_results)
{'key_a': {'key_a1': 'My phone number is 212-121-1424'}, 'key_b': ['<URL>'], 'key_c': 3, 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}
Ignoring nested keys¶
In [16]:
Copied!
# A dotted path addresses a key nested inside another dict
# (here `key_a1` inside `key_a`).
keys_to_skip = ["key_a.key_a1"]

# Analyze dict
analyzer_results = batch_analyzer.analyze_dict(
    input_dict=nested_dict,
    language="en",
    keys_to_skip=keys_to_skip,
)

# Anonymize dict
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results,
)
# Print the anonymized structure once.
pprint.pprint(anonymizer_results)
{'key_a': {'key_a1': 'My phone number is 212-121-1424'}, 'key_b': ['<URL>'], 'key_c': 3, 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}
Note!¶
JSON files with objects within lists, e.g.:
{
"key": [
{
"key2": "Peter Parker"
},
{
"key3": "555-1234"
}
]
}
Structures like this are not yet supported. Consider splitting the JSON into separate parts and processing each part individually if needed.