# install presidio
!pip install presidio_analyzer presidio_anonymizer
Configuring the NER model
This notebook contains a few examples to customize and configure the NER model through code. Examples:
- Changing the default model's parameters
- Using Stanza as the NER engine
- Using transformers as the NER engine
- Supporting multiple languages
This notebook complements the documentation, which primarily focuses on reading the NER configuration from file.
1. Changing the default model's parameters
In this example, we'll change the model's default confidence score (spaCy models do not generally output a confidence per prediction, so we add a default score). In addition, we'll change the types of PII entities the model returns.
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NerModelConfiguration
# Define which model to use
model_config = [{"lang_code": "en", "model_name": "en_core_web_lg"}]
# Define which entities the model returns and how they map to Presidio's
entity_mapping = dict(
    PER="PERSON",
    LOC="LOCATION",
    GPE="LOCATION",
    ORG="ORGANIZATION",
)

ner_model_configuration = NerModelConfiguration(
    default_score=0.6,
    model_to_presidio_entity_mapping=entity_mapping,
)

# Create the NLP engine based on this configuration
spacy_nlp_engine = SpacyNlpEngine(models=model_config, ner_model_configuration=ner_model_configuration)
# Helper method to use the NLP Engine as part of Presidio Analyzer, and print configuration+results
def call_analyzer_and_print_results(
    nlp_engine: NlpEngine,
    language: str = "en",
    text: str = "Bill Clinton used to be the president of the United States",
) -> None:
    """
    Instantiate the AnalyzerEngine with the provided nlp_engine and print the output.

    This method creates an AnalyzerEngine instance with the provided NlpEngine
    and three supported languages (en, es, de).
    Then, it calls the analyze method to return identified PII entities.

    :param nlp_engine: The NlpEngine instance as configured by the user
    :param language: The language of this request (in contrast to the AnalyzerEngine, which can support multiple)
    :param text: The text to look for PII entities in
    """
    print(f"Input text:\n\t{text}\n")

    # Initialize the AnalyzerEngine with the configured NLP engine
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en", "de", "es"])

    # Print the NLP engine's configuration
    print(f"NLP Engine configuration:\n\tLoaded NLP engine: {analyzer.nlp_engine.__class__.__name__}")
    print(f"\tSupported entities: {analyzer.nlp_engine.get_supported_entities()}")
    print(f"\tSupported languages: {analyzer.nlp_engine.get_supported_languages()}")
    print()

    # Call analyzer.analyze to detect PII entities (from the NLP engine and all other recognizers)
    results = analyzer.analyze(text=text, language=language, return_decision_process=True)

    # Sort results by start index
    results = sorted(results, key=lambda x: x.start)

    # Print results
    print("Returning full results, including the decision process:")
    for i, result in enumerate(results):
        print(f"\tResult {i}: {result}")
        print(f"\tDetected text: {text[result.start: result.end]}")
        print(f"\t{result.analysis_explanation.textual_explanation}")
        print("")
# Run it as part of Presidio's AnalyzerEngine
call_analyzer_and_print_results(spacy_nlp_engine)
Input text:
	Bill Clinton used to be the president of the United States

NLP Engine configuration:
	Loaded NLP engine: SpacyNlpEngine
	Supported entities: ['LOCATION', 'PERSON', 'ORGANIZATION']
	Supported languages: ['en']

Returning full results, including the decision process:
	Result 0: type: PERSON, start: 0, end: 12, score: 0.6
	Detected text: Bill Clinton
	Identified as PERSON by Spacy's Named Entity Recognition

	Result 1: type: LOCATION, start: 41, end: 58, score: 0.6
	Detected text: the United States
	Identified as LOCATION by Spacy's Named Entity Recognition
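Conceptually, the `NerModelConfiguration` settings above work as follows: each raw label emitted by the NER model is translated through the entity mapping (labels without a mapping are dropped), and every detection receives the configured default score. The snippet below is a simplified, self-contained sketch of that behavior with made-up detections, not Presidio's actual implementation:

```python
# Simplified sketch of how the entity mapping and default score are applied.
# This is an illustration only, not Presidio's internal code.

entity_mapping = {"PER": "PERSON", "LOC": "LOCATION", "GPE": "LOCATION", "ORG": "ORGANIZATION"}
default_score = 0.6

# Hypothetical raw NER model output: (label, start, end) spans
raw_detections = [("PER", 0, 12), ("GPE", 45, 58), ("MISC", 20, 25)]

mapped = [
    {"entity_type": entity_mapping[label], "start": start, "end": end, "score": default_score}
    for (label, start, end) in raw_detections
    if label in entity_mapping  # labels without a mapping are dropped
]

print(mapped)
# The "MISC" span is dropped; "PER" and "GPE" become PERSON and LOCATION with score 0.6
```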
2. Using Stanza
Stanza is an NLP package by Stanford. More details on Stanza can be found here: https://stanfordnlp.github.io/stanza/
Loading Stanza instead of spaCy is straightforward: use StanzaNlpEngine instead of SpacyNlpEngine, and define a model name supported by Stanza (for example, en instead of en_core_web_lg).
from presidio_analyzer.nlp_engine import StanzaNlpEngine, NerModelConfiguration
# Define which model to use
model_config = [{"lang_code": "en", "model_name": "en"}]
# Define which entities the model returns and how they map to Presidio's
entity_mapping = dict(
    PER="PERSON",
    LOC="LOCATION",
    GPE="LOCATION",
    ORG="ORGANIZATION",
)

ner_model_configuration = NerModelConfiguration(model_to_presidio_entity_mapping=entity_mapping)

# Create the Stanza NLP engine based on this configuration
stanza_nlp_engine = StanzaNlpEngine(models=model_config, ner_model_configuration=ner_model_configuration)
# Run it as part of Presidio's AnalyzerEngine
call_analyzer_and_print_results(stanza_nlp_engine)
3. Using transformers as the NLP engine
A third option is to use a model based on the transformers package. Note that in this case we use both spaCy and transformers: the actual PII entities are detected using a transformers model, while additional text features, such as lemmas, are extracted by a spaCy pipeline. We use a small spaCy model, as it is faster and more memory efficient.
from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration
# Define which model to use
model_config = [{
    "lang_code": "en",
    "model_name": {
        "spacy": "en_core_web_sm",
        "transformers": "obi/deid_roberta_i2b2",
    },
}]
# Map transformers model labels to Presidio's
model_to_presidio_entity_mapping = dict(
    PER="PERSON",
    PERSON="PERSON",
    LOC="LOCATION",
    LOCATION="LOCATION",
    GPE="LOCATION",
    ORG="ORGANIZATION",
    ORGANIZATION="ORGANIZATION",
    NORP="NRP",
    AGE="AGE",
    ID="ID",
    EMAIL="EMAIL",
    PATIENT="PERSON",
    STAFF="PERSON",
    HOSP="ORGANIZATION",
    PATORG="ORGANIZATION",
    DATE="DATE_TIME",
    TIME="DATE_TIME",
    PHONE="PHONE_NUMBER",
    HCW="PERSON",
    HOSPITAL="ORGANIZATION",
    FACILITY="LOCATION",
)

ner_model_configuration = NerModelConfiguration(
    model_to_presidio_entity_mapping=model_to_presidio_entity_mapping,
    aggregation_strategy="simple",
    stride=14,
)

transformers_nlp_engine = TransformersNlpEngine(
    models=model_config,
    ner_model_configuration=ner_model_configuration,
)
# Run it as part of Presidio's AnalyzerEngine
call_analyzer_and_print_results(transformers_nlp_engine)
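Two of the arguments above are passed through to the underlying transformers pipeline: aggregation_strategy controls how word-piece predictions are merged into whole entities, and stride sets the token overlap between consecutive windows when a text is longer than the model's maximum input length. The following is a rough, self-contained illustration of that windowing idea, not the actual transformers or Presidio code:

```python
# Illustrative sketch of overlapping windows controlled by a stride value.
# The idea: consecutive windows advance by (max_len - stride) tokens,
# so adjacent windows overlap by `stride` tokens.

def windows(tokens, max_len, stride):
    step = max_len - stride
    return [tokens[i:i + max_len] for i in range(0, len(tokens), step)]

chunks = windows(list(range(10)), max_len=6, stride=2)
print(chunks)
# -> [[0, 1, 2, 3, 4, 5], [4, 5, 6, 7, 8, 9], [8, 9]]
```

The overlap gives the model full context for tokens that would otherwise sit at a window boundary.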
4. Supporting multiple languages
Presidio allows the user to create a model per language:
from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration
# Define which model to use
model_config = [
    {
        "lang_code": "en",
        "model_name": {
            "spacy": "en_core_web_sm",
            "transformers": "obi/deid_roberta_i2b2",
        },
    },
    {
        "lang_code": "es",
        "model_name": {
            "spacy": "es_core_news_sm",
            "transformers": "PlanTL-GOB-ES/roberta-large-bne-capitel-ner",
        },
    },
]

transformers_nlp_engine = TransformersNlpEngine(
    models=model_config,
    ner_model_configuration=ner_model_configuration,
)
# Call in English
call_analyzer_and_print_results(
    transformers_nlp_engine,
    language="en",
    text="Bill Clinton was the president of the United States",
)

# Call in Spanish
call_analyzer_and_print_results(
    transformers_nlp_engine,
    language="es",
    text="Bill Clinton solía ser el presidente de los Estados Unidos.",
)
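As noted in the introduction, the documentation primarily covers reading the NER configuration from a file rather than defining it in code. For reference, the in-code configuration from section 1 corresponds roughly to a YAML file like the one below (a sketch; see Presidio's documentation for the exact schema), which can then be loaded with NlpEngineProvider(conf_file="..."):

```yaml
# Approximate file-based equivalent of the configuration in section 1
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
ner_model_configuration:
  default_score: 0.6
  model_to_presidio_entity_mapping:
    PER: PERSON
    LOC: LOCATION
    GPE: LOCATION
    ORG: ORGANIZATION
```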