Presidio Analyzer API Reference

AnalyzerEngine

Entry point for Presidio Analyzer.

Orchestrating the detection of PII entities and all related logic.

:param registry: instance of type RecognizerRegistry
:param nlp_engine: instance of type NlpEngine (for example SpacyNlpEngine)
:param app_tracer: instance of type AppTracer, used to trace the logic used during each request for interpretability reasons
:param log_decision_process: bool, defines whether the decision process within the analyzer should be logged
:param default_score_threshold: minimum confidence value for detected entities to be returned
:param supported_languages: list of possible languages this engine could be run on; used for loading the right NLP models and recognizers for these languages
:param context_aware_enhancer: instance of type ContextAwareEnhancer for enhancing the confidence score based on context words (a LemmaContextAwareEnhancer is created by default if None is passed)
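For example, the engine can be used with its defaults or tuned through the parameters above. A minimal sketch based on the constructor signature below:

from presidio_analyzer import AnalyzerEngine

# Default setup: creates a spaCy-based NLP engine and loads
# the predefined recognizers for English
analyzer = AnalyzerEngine()

# Customized setup: only return detections with confidence >= 0.5
strict_analyzer = AnalyzerEngine(
    default_score_threshold=0.5,
    supported_languages=["en"],
)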

Source code in presidio_analyzer/analyzer_engine.py
class AnalyzerEngine:
    """
    Entry point for Presidio Analyzer.

    Orchestrating the detection of PII entities and all related logic.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
    (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer, used to trace the logic
    used during each request for interpretability reasons.
    :param log_decision_process: bool,
    defines whether the decision process within the analyzer should be logged or not.
    :param default_score_threshold: Minimum confidence value
    for detected entities to be returned
    :param supported_languages: List of possible languages this engine could be run on.
    Used for loading the right NLP models and recognizers for these languages.
    :param context_aware_enhancer: instance of type ContextAwareEnhancer for enhancing
    confidence score based on context words (a LemmaContextAwareEnhancer will be created
    by default if None is passed)
    """

    def __init__(
        self,
        registry: RecognizerRegistry = None,
        nlp_engine: NlpEngine = None,
        app_tracer: AppTracer = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: List[str] = None,
        context_aware_enhancer: Optional[ContextAwareEnhancer] = None,
    ):
        if not supported_languages:
            supported_languages = ["en"]

        if not nlp_engine:
            logger.info("nlp_engine not provided, creating default.")
            provider = NlpEngineProvider()
            nlp_engine = provider.create_engine()

        if not registry:
            logger.info("registry not provided, creating default.")
            registry = RecognizerRegistry()
        if not app_tracer:
            app_tracer = AppTracer()
        self.app_tracer = app_tracer

        self.supported_languages = supported_languages

        self.nlp_engine = nlp_engine
        self.registry = registry

        # load all recognizers
        if not registry.recognizers:
            registry.load_predefined_recognizers(
                nlp_engine=self.nlp_engine, languages=self.supported_languages
            )

        self.log_decision_process = log_decision_process
        self.default_score_threshold = default_score_threshold

        if not context_aware_enhancer:
            logger.debug(
                "context aware enhancer not provided, creating default"
                + " lemma based enhancer."
            )
            context_aware_enhancer = LemmaContextAwareEnhancer()

        self.context_aware_enhancer = context_aware_enhancer

    def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
        """
        Return a list of PII recognizers currently loaded.

        :param language: Return the recognizers supporting a given language.
        :return: List of the currently loaded EntityRecognizer objects
        """
        if not language:
            languages = self.supported_languages
        else:
            languages = [language]

        recognizers = []
        for language in languages:
            logger.info(f"Fetching all recognizers for language {language}")
            recognizers.extend(
                self.registry.get_recognizers(language=language, all_fields=True)
            )

        return list(set(recognizers))

    def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
        """
        Return a list of the entities that can be detected.

        :param language: Return only entities supported in a specific language.
        :return: List of entity names
        """
        recognizers = self.get_recognizers(language=language)
        supported_entities = []
        for recognizer in recognizers:
            supported_entities.extend(recognizer.get_supported_entities())

        return list(set(supported_entities))

    def analyze(
        self,
        text: str,
        language: str,
        entities: Optional[List[str]] = None,
        correlation_id: Optional[str] = None,
        score_threshold: Optional[float] = None,
        return_decision_process: Optional[bool] = False,
        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
        context: Optional[List[str]] = None,
        allow_list: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Find PII entities in text using different PII recognizers for a given language.

        :param text: the text to analyze
        :param language: the language of the text
        :param entities: List of PII entities that should be looked for in the text.
        If entities=None then all entities are looked for.
        :param correlation_id: cross call ID for this request
        :param score_threshold: A minimum value for which
        to return an identified entity
        :param return_decision_process: Whether the analysis decision process steps
        are returned in the response.
        :param ad_hoc_recognizers: List of recognizers which will be used only
        for this specific request.
        :param context: List of context words to enhance confidence score if matched
        with the recognized entity's recognizer context
        :param allow_list: List of words that the user defines as being allowed to keep
        in the text
        :param nlp_artifacts: precomputed NlpArtifacts
        :return: an array of the found entities in the text

        :example:

        >>> from presidio_analyzer import AnalyzerEngine

        >>> # Set up the engine, loads the NLP module (spaCy model by default)
        >>> # and other PII recognizers
        >>> analyzer = AnalyzerEngine()

        >>> # Call analyzer to get results
        >>> results = analyzer.analyze(text='My phone number is 212-555-5555', entities=['PHONE_NUMBER'], language='en') # noqa D501
        >>> print(results)
        [type: PHONE_NUMBER, start: 19, end: 31, score: 0.85]
        """
        all_fields = not entities

        recognizers = self.registry.get_recognizers(
            language=language,
            entities=entities,
            all_fields=all_fields,
            ad_hoc_recognizers=ad_hoc_recognizers,
        )

        if all_fields:
            # Since all_fields=True, list all entities by iterating
            # over all recognizers
            entities = self.get_supported_entities(language=language)

        # run the nlp pipeline over the given text, store the results in
        # a NlpArtifacts instance
        if not nlp_artifacts:
            nlp_artifacts = self.nlp_engine.process_text(text, language)

        if self.log_decision_process:
            self.app_tracer.trace(
                correlation_id, "nlp artifacts:" + nlp_artifacts.to_json()
            )

        results = []
        for recognizer in recognizers:
            # Lazy loading of the relevant recognizers
            if not recognizer.is_loaded:
                recognizer.load()
                recognizer.is_loaded = True

            # analyze using the current recognizer and append the results
            current_results = recognizer.analyze(
                text=text, entities=entities, nlp_artifacts=nlp_artifacts
            )
            if current_results:
                # add recognizer name to recognition metadata inside results
                # if not exists
                self.__add_recognizer_id_if_not_exists(current_results, recognizer)
                results.extend(current_results)

        results = self._enhance_using_context(
            text, results, nlp_artifacts, recognizers, context
        )

        if self.log_decision_process:
            self.app_tracer.trace(
                correlation_id,
                json.dumps([str(result.to_dict()) for result in results]),
            )

        # Remove duplicates or low score results
        results = EntityRecognizer.remove_duplicates(results)
        results = self.__remove_low_scores(results, score_threshold)

        if allow_list:
            results = self._remove_allow_list(results, allow_list, text)

        if not return_decision_process:
            results = self.__remove_decision_process(results)

        return results

    def _enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """
        Enhance confidence score using context words.

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
                            context into consideration
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param recognizers: the list of recognizers
        :param context: list of context words
        """
        results = []

        for recognizer in recognizers:
            recognizer_results = [
                r
                for r in raw_results
                if r.recognition_metadata[RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]
                == recognizer.id
            ]
            other_recognizer_results = [
                r
                for r in raw_results
                if r.recognition_metadata[RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]
                != recognizer.id
            ]

            # enhance score using context in recognizer level if implemented
            recognizer_results = recognizer.enhance_using_context(
                text=text,
                # each recognizer will get access to all recognizer results
                # to allow related entities context enhancement
                raw_recognizer_results=recognizer_results,
                other_raw_recognizer_results=other_recognizer_results,
                nlp_artifacts=nlp_artifacts,
                context=context,
            )

            results.extend(recognizer_results)

        # Update results in case surrounding words or external context are relevant to
        # the context words.
        results = self.context_aware_enhancer.enhance_using_context(
            text=text,
            raw_results=results,
            nlp_artifacts=nlp_artifacts,
            recognizers=recognizers,
            context=context,
        )

        return results

    def __remove_low_scores(
        self, results: List[RecognizerResult], score_threshold: float = None
    ) -> List[RecognizerResult]:
        """
        Remove results for which the confidence is lower than the threshold.

        :param results: List of RecognizerResult
        :param score_threshold: float value for minimum possible confidence
        :return: List[RecognizerResult]
        """
        if score_threshold is None:
            score_threshold = self.default_score_threshold

        new_results = [result for result in results if result.score >= score_threshold]
        return new_results

    @staticmethod
    def _remove_allow_list(
        results: List[RecognizerResult], allow_list: List[str], text: str
    ) -> List[RecognizerResult]:
        """
        Remove results which are part of the allow list.

        :param results: List of RecognizerResult
        :param allow_list: list of allowed terms
        :param text: the text to analyze
        :return: List[RecognizerResult]
        """
        new_results = []
        for result in results:
            word = text[result.start : result.end]
            # if the word is not specified to be allowed, keep in the PII entities
            if word not in allow_list:
                new_results.append(result)

        return new_results

    @staticmethod
    def __add_recognizer_id_if_not_exists(
        results: List[RecognizerResult], recognizer: EntityRecognizer
    ):
        """Ensure recognition metadata with recognizer id existence.

        Ensure recognizer result list contains recognizer id inside recognition
        metadata dictionary, and if not create it. recognizer_id is needed
        for context aware enhancement.

        :param results: List of RecognizerResult
        :param recognizer: Entity recognizer
        """
        for result in results:
            if not result.recognition_metadata:
                result.recognition_metadata = dict()
            if (
                RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                not in result.recognition_metadata
            ):
                result.recognition_metadata[
                    RecognizerResult.RECOGNIZER_IDENTIFIER_KEY
                ] = recognizer.id
            if RecognizerResult.RECOGNIZER_NAME_KEY not in result.recognition_metadata:
                result.recognition_metadata[
                    RecognizerResult.RECOGNIZER_NAME_KEY
                ] = recognizer.name

    @staticmethod
    def __remove_decision_process(
        results: List[RecognizerResult],
    ) -> List[RecognizerResult]:
        """Remove decision process / analysis explanation from response."""

        for result in results:
            result.analysis_explanation = None

        return results

analyze(self, text, language, entities=None, correlation_id=None, score_threshold=None, return_decision_process=False, ad_hoc_recognizers=None, context=None, allow_list=None, nlp_artifacts=None)

Find PII entities in text using different PII recognizers for a given language.

:param text: the text to analyze
:param language: the language of the text
:param entities: List of PII entities that should be looked for in the text. If entities=None then all entities are looked for.
:param correlation_id: cross call ID for this request
:param score_threshold: A minimum value for which to return an identified entity
:param return_decision_process: Whether the analysis decision process steps are returned in the response.
:param ad_hoc_recognizers: List of recognizers which will be used only for this specific request.
:param context: List of context words to enhance the confidence score if matched with the recognized entity's recognizer context
:param allow_list: List of words that the user defines as being allowed to keep in the text
:param nlp_artifacts: precomputed NlpArtifacts
:return: an array of the found entities in the text

:example:

>>> from presidio_analyzer import AnalyzerEngine

>>> # Set up the engine: loads the NLP module (spaCy model by default)
>>> # and other PII recognizers
>>> analyzer = AnalyzerEngine()

>>> # Call the analyzer to get results
>>> results = analyzer.analyze(text='My phone number is 212-555-5555', entities=['PHONE_NUMBER'], language='en')
>>> print(results)
[type: PHONE_NUMBER, start: 19, end: 31, score: 0.85]

Source code in presidio_analyzer/analyzer_engine.py
def analyze(
    self,
    text: str,
    language: str,
    entities: Optional[List[str]] = None,
    correlation_id: Optional[str] = None,
    score_threshold: Optional[float] = None,
    return_decision_process: Optional[bool] = False,
    ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
    context: Optional[List[str]] = None,
    allow_list: Optional[List[str]] = None,
    nlp_artifacts: Optional[NlpArtifacts] = None,
) -> List[RecognizerResult]:
    """
    Find PII entities in text using different PII recognizers for a given language.

    :param text: the text to analyze
    :param language: the language of the text
    :param entities: List of PII entities that should be looked for in the text.
    If entities=None then all entities are looked for.
    :param correlation_id: cross call ID for this request
    :param score_threshold: A minimum value for which
    to return an identified entity
    :param return_decision_process: Whether the analysis decision process steps
    are returned in the response.
    :param ad_hoc_recognizers: List of recognizers which will be used only
    for this specific request.
    :param context: List of context words to enhance confidence score if matched
    with the recognized entity's recognizer context
    :param allow_list: List of words that the user defines as being allowed to keep
    in the text
    :param nlp_artifacts: precomputed NlpArtifacts
    :return: an array of the found entities in the text

    :example:

    >>> from presidio_analyzer import AnalyzerEngine

    >>> # Set up the engine, loads the NLP module (spaCy model by default)
    >>> # and other PII recognizers
    >>> analyzer = AnalyzerEngine()

    >>> # Call analyzer to get results
    >>> results = analyzer.analyze(text='My phone number is 212-555-5555', entities=['PHONE_NUMBER'], language='en') # noqa D501
    >>> print(results)
    [type: PHONE_NUMBER, start: 19, end: 31, score: 0.85]
    """
    all_fields = not entities

    recognizers = self.registry.get_recognizers(
        language=language,
        entities=entities,
        all_fields=all_fields,
        ad_hoc_recognizers=ad_hoc_recognizers,
    )

    if all_fields:
        # Since all_fields=True, list all entities by iterating
        # over all recognizers
        entities = self.get_supported_entities(language=language)

    # run the nlp pipeline over the given text, store the results in
    # a NlpArtifacts instance
    if not nlp_artifacts:
        nlp_artifacts = self.nlp_engine.process_text(text, language)

    if self.log_decision_process:
        self.app_tracer.trace(
            correlation_id, "nlp artifacts:" + nlp_artifacts.to_json()
        )

    results = []
    for recognizer in recognizers:
        # Lazy loading of the relevant recognizers
        if not recognizer.is_loaded:
            recognizer.load()
            recognizer.is_loaded = True

        # analyze using the current recognizer and append the results
        current_results = recognizer.analyze(
            text=text, entities=entities, nlp_artifacts=nlp_artifacts
        )
        if current_results:
            # add recognizer name to recognition metadata inside results
            # if not exists
            self.__add_recognizer_id_if_not_exists(current_results, recognizer)
            results.extend(current_results)

    results = self._enhance_using_context(
        text, results, nlp_artifacts, recognizers, context
    )

    if self.log_decision_process:
        self.app_tracer.trace(
            correlation_id,
            json.dumps([str(result.to_dict()) for result in results]),
        )

    # Remove duplicates or low score results
    results = EntityRecognizer.remove_duplicates(results)
    results = self.__remove_low_scores(results, score_threshold)

    if allow_list:
        results = self._remove_allow_list(results, allow_list, text)

    if not return_decision_process:
        results = self.__remove_decision_process(results)

    return results
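Beyond the basic example above, allow_list and ad_hoc_recognizers shape a single request. A minimal sketch (the deny-list terms and sample text are illustrative):

from presidio_analyzer import AnalyzerEngine, PatternRecognizer

analyzer = AnalyzerEngine()

# An ad hoc recognizer, used only for this request
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE", deny_list=["Mr.", "Mrs."]
)

results = analyzer.analyze(
    text="Mr. Smith's email is smith@example.com",
    language="en",
    ad_hoc_recognizers=[titles_recognizer],
    allow_list=["smith@example.com"],  # this exact match is not reported
)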

get_recognizers(self, language=None)

Return a list of PII recognizers currently loaded.

:param language: Return the recognizers supporting a given language.
:return: List of the currently loaded EntityRecognizer objects

Source code in presidio_analyzer/analyzer_engine.py
def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
    """
    Return a list of PII recognizers currently loaded.

    :param language: Return the recognizers supporting a given language.
    :return: List of the currently loaded EntityRecognizer objects
    """
    if not language:
        languages = self.supported_languages
    else:
        languages = [language]

    recognizers = []
    for language in languages:
        logger.info(f"Fetching all recognizers for language {language}")
        recognizers.extend(
            self.registry.get_recognizers(language=language, all_fields=True)
        )

    return list(set(recognizers))

get_supported_entities(self, language=None)

Return a list of the entities that can be detected.

:param language: Return only entities supported in a specific language.
:return: List of entity names
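A short usage sketch:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
# The exact list depends on which recognizers are loaded
print(analyzer.get_supported_entities(language="en"))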

Source code in presidio_analyzer/analyzer_engine.py
def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
    """
    Return a list of the entities that can be detected.

    :param language: Return only entities supported in a specific language.
    :return: List of entity names
    """
    recognizers = self.get_recognizers(language=language)
    supported_entities = []
    for recognizer in recognizers:
        supported_entities.extend(recognizer.get_supported_entities())

    return list(set(supported_entities))

RecognizerRegistry

Detect, register and hold all recognizers to be used by the analyzer.

:param recognizers: An optional list of recognizers that will be available instead of the predefined recognizers
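A registry is created implicitly by AnalyzerEngine, but it can also be built and populated explicitly. A minimal sketch using only methods documented below:

from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerRegistry

registry = RecognizerRegistry()
registry.load_predefined_recognizers()  # defaults to English

# Register a custom recognizer alongside the predefined ones
registry.add_recognizer(
    PatternRecognizer(supported_entity="TITLE", deny_list=["Dr.", "Prof."])
)

analyzer = AnalyzerEngine(registry=registry)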

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
class RecognizerRegistry:
    """
    Detect, register and hold all recognizers to be used by the analyzer.

    :param recognizers: An optional list of recognizers
    that will be available instead of the predefined recognizers
    """

    def __init__(self, recognizers: Optional[Iterable[EntityRecognizer]] = None):

        if recognizers:
            self.recognizers = recognizers
        else:
            self.recognizers = []

    def load_predefined_recognizers(
        self, languages: Optional[List[str]] = None, nlp_engine: NlpEngine = None
    ) -> None:
        """
        Load the existing recognizers into memory.

        :param languages: List of languages for which to load recognizers
        :param nlp_engine: The NLP engine to use.
        :return: None
        """
        if not languages:
            languages = ["en"]

        nlp_recognizer = self._get_nlp_recognizer(nlp_engine)
        recognizers_map = {
            "en": [
                UsBankRecognizer,
                UsLicenseRecognizer,
                UsItinRecognizer,
                UsPassportRecognizer,
                UsSsnRecognizer,
                NhsRecognizer,
                SgFinRecognizer,
                AuAbnRecognizer,
                AuAcnRecognizer,
                AuTfnRecognizer,
                AuMedicareRecognizer,
            ],
            "es": [EsNifRecognizer],
            "it": [
                ItDriverLicenseRecognizer,
                ItFiscalCodeRecognizer,
                ItVatCodeRecognizer,
                ItIdentityCardRecognizer,
                ItPassportRecognizer,
            ],
            "ALL": [
                CreditCardRecognizer,
                CryptoRecognizer,
                DateRecognizer,
                EmailRecognizer,
                IbanRecognizer,
                IpRecognizer,
                MedicalLicenseRecognizer,
                nlp_recognizer,
                PhoneRecognizer,
                UrlRecognizer,
            ],
        }
        for lang in languages:
            lang_recognizers = [rc() for rc in recognizers_map.get(lang, [])]
            self.recognizers.extend(lang_recognizers)
            all_recognizers = [
                rc(supported_language=lang) for rc in recognizers_map.get("ALL", [])
            ]
            self.recognizers.extend(all_recognizers)

    @staticmethod
    def _get_nlp_recognizer(
        nlp_engine: NlpEngine,
    ) -> Union[Type[SpacyRecognizer], Type[StanzaRecognizer]]:
        """Return the recognizer leveraging the selected NLP Engine."""

        if not nlp_engine or type(nlp_engine) == SpacyNlpEngine:
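            # An exact type check (rather than isinstance) is deliberate here,
            # as other engine types may subclass SpacyNlpEngine.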
            return SpacyRecognizer
        if isinstance(nlp_engine, StanzaNlpEngine):
            return StanzaRecognizer
        if isinstance(nlp_engine, TransformersNlpEngine):
            return TransformersRecognizer
        else:
            logger.warning(
                "nlp engine should be either SpacyNlpEngine,"
                "StanzaNlpEngine or TransformersNlpEngine"
            )
            # Returning default
            return SpacyRecognizer

    def get_recognizers(
        self,
        language: str,
        entities: Optional[List[str]] = None,
        all_fields: bool = False,
        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
    ) -> List[EntityRecognizer]:
        """
        Return a list of recognizers that support the specified entities and language.

        :param entities: the requested entities
        :param language: the requested language
        :param all_fields: a flag to return all fields of a requested language.
        :param ad_hoc_recognizers: Additional recognizers provided by the user
        as part of the request
        :return: A list of the recognizers which supports the supplied entities
        and language
        """
        if language is None:
            raise ValueError("No language provided")

        if entities is None and all_fields is False:
            raise ValueError("No entities provided")

        all_possible_recognizers = copy.copy(self.recognizers)
        if ad_hoc_recognizers:
            all_possible_recognizers.extend(ad_hoc_recognizers)

        # filter out unwanted recognizers
        to_return = set()
        if all_fields:
            to_return = [
                rec
                for rec in all_possible_recognizers
                if language == rec.supported_language
            ]
        else:
            for entity in entities:
                subset = [
                    rec
                    for rec in all_possible_recognizers
                    if entity in rec.supported_entities
                    and language == rec.supported_language
                ]

                if not subset:
                    logger.warning(
                        "Entity %s doesn't have the corresponding"
                        " recognizer in language : %s",
                        entity,
                        language,
                    )
                else:
                    to_return.update(set(subset))

        logger.debug(
            "Returning a total of %s recognizers",
            str(len(to_return)),
        )

        if not to_return:
            raise ValueError("No matching recognizers were found to serve the request.")

        return list(to_return)

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """
        Add a new recognizer to the list of recognizers.

        :param recognizer: Recognizer to add
        """
        if not isinstance(recognizer, EntityRecognizer):
            raise ValueError("Input is not of type EntityRecognizer")

        self.recognizers.append(recognizer)

    def remove_recognizer(self, recognizer_name: str) -> None:
        """
        Remove a recognizer based on its name.

        :param recognizer_name: Name of recognizer to remove
        """
        new_recognizers = [
            rec for rec in self.recognizers if rec.name != recognizer_name
        ]
        logger.info(
            "Removed %s recognizers which had the name %s",
            str(len(self.recognizers) - len(new_recognizers)),
            recognizer_name,
        )
        self.recognizers = new_recognizers

    def add_pattern_recognizer_from_dict(self, recognizer_dict: Dict):
        """
        Load a pattern recognizer from a Dict into the recognizer registry.

        :param recognizer_dict: Dict holding a serialization of a PatternRecognizer

        :example:
        >>> registry = RecognizerRegistry()
        >>> recognizer = { "name": "Titles Recognizer", "supported_language": "de","supported_entity": "TITLE", "deny_list": ["Mr.","Mrs."]} # noqa: E501
        >>> registry.add_pattern_recognizer_from_dict(recognizer)
        """

        recognizer = PatternRecognizer.from_dict(recognizer_dict)
        self.add_recognizer(recognizer)

    def add_recognizers_from_yaml(self, yml_path: Union[str, Path]):
        r"""
        Read YAML file and load recognizers into the recognizer registry.

        See example yaml file here:
        https://github.com/microsoft/presidio/blob/main/presidio-analyzer/conf/example_recognizers.yaml

        :example:
        >>> yaml_file = "recognizers.yaml"
        >>> registry = RecognizerRegistry()
        >>> registry.add_recognizers_from_yaml(yaml_file)

        """

        try:
            with open(yml_path, "r") as stream:
                yaml_recognizers = yaml.safe_load(stream)

            for yaml_recognizer in yaml_recognizers["recognizers"]:
                self.add_pattern_recognizer_from_dict(yaml_recognizer)
        except IOError as io_error:
            print(f"Error reading file {yml_path}")
            raise io_error
        except yaml.YAMLError as yaml_error:
            print(f"Failed to parse file {yml_path}")
            raise yaml_error
        except TypeError as yaml_error:
            print(f"Failed to parse file {yml_path}")
            raise yaml_error

add_pattern_recognizer_from_dict(self, recognizer_dict)

Load a pattern recognizer from a Dict into the recognizer registry.

:param recognizer_dict: Dict holding a serialization of a PatternRecognizer

:example:

>>> registry = RecognizerRegistry()
>>> recognizer = {"name": "Titles Recognizer", "supported_language": "de", "supported_entity": "TITLE", "deny_list": ["Mr.", "Mrs."]}
>>> registry.add_pattern_recognizer_from_dict(recognizer)

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
def add_pattern_recognizer_from_dict(self, recognizer_dict: Dict):
    """
    Load a pattern recognizer from a Dict into the recognizer registry.

    :param recognizer_dict: Dict holding a serialization of a PatternRecognizer

    :example:
    >>> registry = RecognizerRegistry()
    >>> recognizer = { "name": "Titles Recognizer", "supported_language": "de","supported_entity": "TITLE", "deny_list": ["Mr.","Mrs."]} # noqa: E501
    >>> registry.add_pattern_recognizer_from_dict(recognizer)
    """

    recognizer = PatternRecognizer.from_dict(recognizer_dict)
    self.add_recognizer(recognizer)

add_recognizer(self, recognizer)

Add a new recognizer to the list of recognizers.

:param recognizer: Recognizer to add

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
def add_recognizer(self, recognizer: EntityRecognizer) -> None:
    """
    Add a new recognizer to the list of recognizers.

    :param recognizer: Recognizer to add
    """
    if not isinstance(recognizer, EntityRecognizer):
        raise ValueError("Input is not of type EntityRecognizer")

    self.recognizers.append(recognizer)

add_recognizers_from_yaml(self, yml_path)

Read YAML file and load recognizers into the recognizer registry.

See example yaml file here: https://github.com/microsoft/presidio/blob/main/presidio-analyzer/conf/example_recognizers.yaml

:example:

>>> yaml_file = "recognizers.yaml"
>>> registry = RecognizerRegistry()
>>> registry.add_recognizers_from_yaml(yaml_file)

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
def add_recognizers_from_yaml(self, yml_path: Union[str, Path]):
    r"""
    Read YAML file and load recognizers into the recognizer registry.

    See example yaml file here:
    https://github.com/microsoft/presidio/blob/main/presidio-analyzer/conf/example_recognizers.yaml

    :example:
    >>> yaml_file = "recognizers.yaml"
    >>> registry = RecognizerRegistry()
    >>> registry.add_recognizers_from_yaml(yaml_file)

    """

    try:
        with open(yml_path, "r") as stream:
            yaml_recognizers = yaml.safe_load(stream)

        for yaml_recognizer in yaml_recognizers["recognizers"]:
            self.add_pattern_recognizer_from_dict(yaml_recognizer)
    except IOError as io_error:
        print(f"Error reading file {yml_path}")
        raise io_error
    except yaml.YAMLError as yaml_error:
        print(f"Failed to parse file {yml_path}")
        raise yaml_error
    except TypeError as yaml_error:
        print(f"Failed to parse file {yml_path}")
        raise yaml_error

get_recognizers(self, language, entities=None, all_fields=False, ad_hoc_recognizers=None)

Return a list of recognizers that support the specified entities and language.

:param entities: the requested entities
:param language: the requested language
:param all_fields: a flag to return all fields of a requested language.
:param ad_hoc_recognizers: Additional recognizers provided by the user as part of the request
:return: A list of the recognizers which support the supplied entities and language

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
def get_recognizers(
    self,
    language: str,
    entities: Optional[List[str]] = None,
    all_fields: bool = False,
    ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
) -> List[EntityRecognizer]:
    """
    Return a list of recognizers that support the specified entities and language.

    :param entities: the requested entities
    :param language: the requested language
    :param all_fields: a flag to return all fields of a requested language.
    :param ad_hoc_recognizers: Additional recognizers provided by the user
    as part of the request
    :return: A list of the recognizers which supports the supplied entities
    and language
    """
    if language is None:
        raise ValueError("No language provided")

    if entities is None and all_fields is False:
        raise ValueError("No entities provided")

    all_possible_recognizers = copy.copy(self.recognizers)
    if ad_hoc_recognizers:
        all_possible_recognizers.extend(ad_hoc_recognizers)

    # filter out unwanted recognizers
    to_return = set()
    if all_fields:
        to_return = [
            rec
            for rec in all_possible_recognizers
            if language == rec.supported_language
        ]
    else:
        for entity in entities:
            subset = [
                rec
                for rec in all_possible_recognizers
                if entity in rec.supported_entities
                and language == rec.supported_language
            ]

            if not subset:
                logger.warning(
                    "Entity %s doesn't have the corresponding"
                    " recognizer in language : %s",
                    entity,
                    language,
                )
            else:
                to_return.update(set(subset))

    logger.debug(
        "Returning a total of %s recognizers",
        str(len(to_return)),
    )

    if not to_return:
        raise ValueError("No matching recognizers were found to serve the request.")

    return list(to_return)

load_predefined_recognizers(self, languages=None, nlp_engine=None)

Load the existing recognizers into memory.

:param languages: List of languages for which to load recognizers
:param nlp_engine: The NLP engine to use.
:return: None
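When recognizers for several languages are needed, an NLP engine covering those languages must be supplied. A sketch, assuming the named spaCy models are installed (the model names are illustrative):

from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

configuration = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "es", "model_name": "es_core_news_md"},
    ],
}
nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()

registry = RecognizerRegistry()
registry.load_predefined_recognizers(languages=["en", "es"], nlp_engine=nlp_engine)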

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
def load_predefined_recognizers(
    self, languages: Optional[List[str]] = None, nlp_engine: NlpEngine = None
) -> None:
    """
    Load the existing recognizers into memory.

    :param languages: List of languages for which to load recognizers
    :param nlp_engine: The NLP engine to use.
    :return: None
    """
    if not languages:
        languages = ["en"]

    nlp_recognizer = self._get_nlp_recognizer(nlp_engine)
    recognizers_map = {
        "en": [
            UsBankRecognizer,
            UsLicenseRecognizer,
            UsItinRecognizer,
            UsPassportRecognizer,
            UsSsnRecognizer,
            NhsRecognizer,
            SgFinRecognizer,
            AuAbnRecognizer,
            AuAcnRecognizer,
            AuTfnRecognizer,
            AuMedicareRecognizer,
        ],
        "es": [EsNifRecognizer],
        "it": [
            ItDriverLicenseRecognizer,
            ItFiscalCodeRecognizer,
            ItVatCodeRecognizer,
            ItIdentityCardRecognizer,
            ItPassportRecognizer,
        ],
        "ALL": [
            CreditCardRecognizer,
            CryptoRecognizer,
            DateRecognizer,
            EmailRecognizer,
            IbanRecognizer,
            IpRecognizer,
            MedicalLicenseRecognizer,
            nlp_recognizer,
            PhoneRecognizer,
            UrlRecognizer,
        ],
    }
    for lang in languages:
        lang_recognizers = [rc() for rc in recognizers_map.get(lang, [])]
        self.recognizers.extend(lang_recognizers)
        all_recognizers = [
            rc(supported_language=lang) for rc in recognizers_map.get("ALL", [])
        ]
        self.recognizers.extend(all_recognizers)

remove_recognizer(self, recognizer_name)

Remove a recognizer based on its name.

:param recognizer_name: Name of recognizer to remove

Source code in presidio_analyzer/recognizer_registry/recognizer_registry.py
def remove_recognizer(self, recognizer_name: str) -> None:
    """
    Remove a recognizer based on its name.

    :param recognizer_name: Name of recognizer to remove
    """
    new_recognizers = [
        rec for rec in self.recognizers if rec.name != recognizer_name
    ]
    logger.info(
        "Removed %s recognizers which had the name %s",
        str(len(self.recognizers) - len(new_recognizers)),
        recognizer_name,
    )
    self.recognizers = new_recognizers

EntityRecognizer

A class representing an abstract PII entity recognizer.

EntityRecognizer is an abstract class to be inherited by Recognizers which hold the logic for recognizing specific PII entities.

EntityRecognizer exposes a method called enhance_using_context, which can be overridden when a custom context-aware enhancement is needed in a derived recognizer class.

:param supported_entities: the entities supported by this recognizer (for example, phone number, address, etc.)
:param supported_language: the language supported by this recognizer. The supported language code is in ISO 639-1 format
:param name: the name of this recognizer (optional)
:param version: the recognizer's current version
:param context: a list of words which can help boost the confidence score when they appear in the context of the matched entity
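Concrete recognizers subclass EntityRecognizer and implement load and analyze. A toy sketch (the class name, US_ZIP_CODE entity, and 0.4 score are illustrative, and it assumes the NLP engine exposes spaCy-like tokens):

from typing import List

from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts


class ZipCodeRecognizer(EntityRecognizer):
    """Flag five-digit tokens as a hypothetical US_ZIP_CODE entity."""

    def load(self) -> None:
        pass  # no assets to load

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        results = []
        for token in nlp_artifacts.tokens:
            if token.text.isdigit() and len(token.text) == 5:
                results.append(
                    RecognizerResult(
                        entity_type="US_ZIP_CODE",
                        start=token.idx,
                        end=token.idx + len(token.text),
                        score=0.4,  # weak pattern, keep confidence low
                    )
                )
        return results


recognizer = ZipCodeRecognizer(supported_entities=["US_ZIP_CODE"])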

Source code in presidio_analyzer/entity_recognizer.py
class EntityRecognizer:
    """
    A class representing an abstract PII entity recognizer.

    EntityRecognizer is an abstract class to be inherited by
    Recognizers which hold the logic for recognizing specific PII entities.

    EntityRecognizer exposes a method called enhance_using_context which
    can be overridden when a custom context-aware enhancement is needed
    in a derived recognizer class.

    :param supported_entities: the entities supported by this recognizer
    (for example, phone number, address, etc.)
    :param supported_language: the language supported by this recognizer.
    The supported language code is in ISO 639-1 format
    :param name: the name of this recognizer (optional)
    :param version: the recognizer current version
    :param context: a list of words which can help boost confidence score
    when they appear in context of the matched entity
    """

    MIN_SCORE = 0
    MAX_SCORE = 1.0

    def __init__(
        self,
        supported_entities: List[str],
        name: str = None,
        supported_language: str = "en",
        version: str = "0.0.1",
        context: Optional[List[str]] = None,
    ):

        self.supported_entities = supported_entities

        if name is None:
            self.name = self.__class__.__name__  # assign class name as name
        else:
            self.name = name

        self._id = f"{self.name}_{id(self)}"

        self.supported_language = supported_language
        self.version = version
        self.is_loaded = False
        self.context = context if context else []

        self.load()
        logger.info("Loaded recognizer: %s", self.name)
        self.is_loaded = True

    @property
    def id(self):
        """Return a unique identifier of this recognizer."""

        return self._id

    @abstractmethod
    def load(self) -> None:
        """
        Initialize the recognizer assets if needed.

        (e.g. machine learning models)
        """

    @abstractmethod
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyze text to identify entities.

        :param text: The text to be analyzed
        :param entities: The list of entities this recognizer is able to detect
        :param nlp_artifacts: A group of attributes which are the result of
        an NLP process over the input text.
        :return: List of results detected by this recognizer.
        """
        return None

    def enhance_using_context(
        self,
        text: str,
        raw_recognizer_results: List[RecognizerResult],
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """Enhance confidence score using context of the entity.

        Override this method in a derived class when custom logic
        is needed; otherwise the return value will be equal to
        raw_results.

        In case a result score is boosted, the derived class needs to update
        result.recognition_metadata[RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY]

        :param text: The actual text that was analyzed
        :param raw_recognizer_results: This recognizer's results, to be updated
        based on recognizer specific context.
        :param other_raw_recognizer_results: Other recognizer results matched in
        the given text to allow related entity context enhancement
        :param nlp_artifacts: The nlp artifacts contains elements
                              such as lemmatized tokens for better
                              accuracy of the context enhancement process
        :param context: list of context words
        """
        return raw_recognizer_results

    def get_supported_entities(self) -> List[str]:
        """
        Return the list of entities this recognizer can identify.

        :return: A list of the supported entities by this recognizer
        """
        return self.supported_entities

    def get_supported_language(self) -> str:
        """
        Return the language this recognizer can support.

        :return: The language supported by this recognizer
        """
        return self.supported_language

    def get_version(self) -> str:
        """
        Return the version of this recognizer.

        :return: The current version of this recognizer
        """
        return self.version

    def to_dict(self) -> Dict:
        """
        Serialize self to dictionary.

        :return: a dictionary
        """
        return_dict = {
            "supported_entities": self.supported_entities,
            "supported_language": self.supported_language,
            "name": self.name,
            "version": self.version,
        }
        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "EntityRecognizer":
        """
        Create EntityRecognizer from a dict input.

        :param entity_recognizer_dict: Dict containing keys and values for instantiation
        """
        return cls(**entity_recognizer_dict)

    @staticmethod
    def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]:
        """
        Remove duplicate results.

        Remove duplicates in case two results
        have identical starts, ends, and types.
        :param results: List[RecognizerResult]
        :return: List[RecognizerResult]
        """
        results = list(set(results))
        results = sorted(results, key=lambda x: (-x.score, x.start, -(x.end - x.start)))
        filtered_results = []

        for result in results:
            if result.score == 0:
                continue

            to_keep = result not in filtered_results  # equals based comparison
            if to_keep:
                for filtered in filtered_results:
                    # If result is contained in one of the other results
                    if (
                        result.contained_in(filtered)
                        and result.entity_type == filtered.entity_type
                    ):
                        to_keep = False
                        break

            if to_keep:
                filtered_results.append(result)

        return filtered_results

id property readonly

Return a unique identifier of this recognizer.

analyze(self, text, entities, nlp_artifacts)

Analyze text to identify entities.

:param text: The text to be analyzed
:param entities: The list of entities this recognizer is able to detect
:param nlp_artifacts: A group of attributes which are the result of an NLP process over the input text.
:return: List of results detected by this recognizer.

Source code in presidio_analyzer/entity_recognizer.py
@abstractmethod
def analyze(
    self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
) -> List[RecognizerResult]:
    """
    Analyze text to identify entities.

    :param text: The text to be analyzed
    :param entities: The list of entities this recognizer is able to detect
    :param nlp_artifacts: A group of attributes which are the result of
    an NLP process over the input text.
    :return: List of results detected by this recognizer.
    """
    return None

enhance_using_context(self, text, raw_recognizer_results, other_raw_recognizer_results, nlp_artifacts, context=None)

Enhance confidence score using context of the entity.

Override this method in a derived class when custom logic is needed; otherwise the return value will be equal to raw_results.

In case a result score is boosted, the derived class needs to update result.recognition_metadata[RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY]

:param text: The actual text that was analyzed
:param raw_recognizer_results: This recognizer's results, to be updated based on recognizer specific context.
:param other_raw_recognizer_results: Other recognizer results matched in the given text to allow related entity context enhancement
:param nlp_artifacts: The nlp artifacts contain elements such as lemmatized tokens for better accuracy of the context enhancement process
:param context: list of context words
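A minimal sketch of such an override (the fixed 0.4 boost and the naive substring check are illustrative, not a recommended heuristic):

from typing import List, Optional

from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts


class BoostingRecognizer(EntityRecognizer):
    def load(self) -> None:
        pass

    def analyze(self, text, entities, nlp_artifacts):
        return []  # detection logic omitted for brevity

    def enhance_using_context(
        self,
        text: str,
        raw_recognizer_results: List[RecognizerResult],
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        if not context:
            return raw_recognizer_results
        for result in raw_recognizer_results:
            if any(word in text for word in context):
                result.score = min(result.score + 0.4, self.MAX_SCORE)
                if not result.recognition_metadata:
                    result.recognition_metadata = {}
                # Flag the boost so downstream enhancers can skip this result
                result.recognition_metadata[
                    RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
                ] = True
        return raw_recognizer_results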

Source code in presidio_analyzer/entity_recognizer.py
def enhance_using_context(
    self,
    text: str,
    raw_recognizer_results: List[RecognizerResult],
    other_raw_recognizer_results: List[RecognizerResult],
    nlp_artifacts: NlpArtifacts,
    context: Optional[List[str]] = None,
) -> List[RecognizerResult]:
    """Enhance confidence score using context of the entity.

    Override this method in a derived class when custom logic
    is needed; otherwise the return value will be equal to
    raw_results.

    In case a result score is boosted, the derived class needs to update
    result.recognition_metadata[RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY]

    :param text: The actual text that was analyzed
    :param raw_recognizer_results: This recognizer's results, to be updated
    based on recognizer specific context.
    :param other_raw_recognizer_results: Other recognizer results matched in
    the given text to allow related entity context enhancement
    :param nlp_artifacts: The nlp artifacts contains elements
                          such as lemmatized tokens for better
                          accuracy of the context enhancement process
    :param context: list of context words
    """
    return raw_recognizer_results

from_dict(entity_recognizer_dict) classmethod

Create EntityRecognizer from a dict input.

:param entity_recognizer_dict: Dict containing keys and values for instantiation

Source code in presidio_analyzer/entity_recognizer.py
@classmethod
def from_dict(cls, entity_recognizer_dict: Dict) -> "EntityRecognizer":
    """
    Create EntityRecognizer from a dict input.

    :param entity_recognizer_dict: Dict containing keys and values for instantiation
    """
    return cls(**entity_recognizer_dict)

get_supported_entities(self)

Return the list of entities this recognizer can identify.

:return: A list of the supported entities by this recognizer

Source code in presidio_analyzer/entity_recognizer.py
def get_supported_entities(self) -> List[str]:
    """
    Return the list of entities this recognizer can identify.

    :return: A list of the supported entities by this recognizer
    """
    return self.supported_entities

get_supported_language(self)

Return the language this recognizer can support.

:return: The language supported by this recognizer

Source code in presidio_analyzer/entity_recognizer.py
def get_supported_language(self) -> str:
    """
    Return the language this recognizer can support.

    :return: The language supported by this recognizer
    """
    return self.supported_language

get_version(self)

Return the version of this recognizer.

:return: The current version of this recognizer

Source code in presidio_analyzer/entity_recognizer.py
def get_version(self) -> str:
    """
    Return the version of this recognizer.

    :return: The current version of this recognizer
    """
    return self.version

load(self)

Initialize the recognizer assets if needed.

(e.g. machine learning models)

Source code in presidio_analyzer/entity_recognizer.py
@abstractmethod
def load(self) -> None:
    """
    Initialize the recognizer assets if needed.

    (e.g. machine learning models)
    """

remove_duplicates(results) staticmethod

Remove duplicate results.

Remove duplicates in case two results have identical starts, ends, and types.

:param results: List[RecognizerResult]
:return: List[RecognizerResult]
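A short usage sketch (the spans and scores are illustrative):

from presidio_analyzer import EntityRecognizer, RecognizerResult

overlapping = [
    RecognizerResult("PHONE_NUMBER", 19, 31, 0.85),
    RecognizerResult("PHONE_NUMBER", 19, 31, 0.85),  # exact duplicate
    RecognizerResult("PHONE_NUMBER", 22, 31, 0.40),  # contained, same type
]
deduplicated = EntityRecognizer.remove_duplicates(overlapping)
print(deduplicated)  # a single result for the 19-31 span remains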

Source code in presidio_analyzer/entity_recognizer.py
@staticmethod
def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]:
    """
    Remove duplicate results.

    Remove duplicates in case two results
    have identical starts, ends, and types.
    :param results: List[RecognizerResult]
    :return: List[RecognizerResult]
    """
    results = list(set(results))
    results = sorted(results, key=lambda x: (-x.score, x.start, -(x.end - x.start)))
    filtered_results = []

    for result in results:
        if result.score == 0:
            continue

        to_keep = result not in filtered_results  # equals based comparison
        if to_keep:
            for filtered in filtered_results:
                # If result is contained in one of the other results
                if (
                    result.contained_in(filtered)
                    and result.entity_type == filtered.entity_type
                ):
                    to_keep = False
                    break

        if to_keep:
            filtered_results.append(result)

    return filtered_results

to_dict(self)

Serialize self to dictionary.

:return: a dictionary

Source code in presidio_analyzer/entity_recognizer.py
def to_dict(self) -> Dict:
    """
    Serialize self to dictionary.

    :return: a dictionary
    """
    return_dict = {
        "supported_entities": self.supported_entities,
        "supported_language": self.supported_language,
        "name": self.name,
        "version": self.version,
    }
    return return_dict

RemoteRecognizer

A configuration for a recognizer that runs on a different process / remote machine.

:param supported_entities: A list of entities this recognizer can identify
:param name: name of recognizer
:param supported_language: The language this recognizer can detect entities in
:param version: Version of this recognizer
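As an illustration, a derived class might call an HTTP endpoint and map its reply into RecognizerResult objects. A sketch, assuming the requests library and a hypothetical JSON contract ({"spans": [{"entity", "start", "end", "score"}]}):

from typing import List

import requests  # assumed HTTP client, not part of presidio_analyzer

from presidio_analyzer import RecognizerResult, RemoteRecognizer
from presidio_analyzer.nlp_engine import NlpArtifacts


class HttpPiiRecognizer(RemoteRecognizer):
    """Sketch of a recognizer delegating detection to a remote service."""

    def __init__(self, url: str):
        self.url = url  # hypothetical endpoint
        super().__init__(
            supported_entities=["PERSON"],
            name="HttpPiiRecognizer",
            supported_language="en",
            version="0.0.1",
        )

    def load(self):
        pass

    def get_supported_entities(self) -> List[str]:
        return self.supported_entities

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts):
        # 1. Call the external service
        response = requests.post(self.url, json={"text": text})
        response.raise_for_status()
        # 2. Translate the reply into RecognizerResult objects
        return [
            RecognizerResult(
                entity_type=span["entity"],
                start=span["start"],
                end=span["end"],
                score=span["score"],
            )
            for span in response.json().get("spans", [])
        ]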

Source code in presidio_analyzer/remote_recognizer.py
class RemoteRecognizer(ABC, EntityRecognizer):
    """
    A configuration for a recognizer that runs on a different process / remote machine.

    :param supported_entities: A list of entities this recognizer can identify
    :param name: name of recognizer
    :param supported_language: The language this recognizer can detect entities in
    :param version: Version of this recognizer
    """

    def __init__(
        self,
        supported_entities: List[str],
        name: Optional[str],
        supported_language: str,
        version: str,
        context: Optional[List[str]] = None,
    ):
        super().__init__(
            supported_entities=supported_entities,
            name=name,
            supported_language=supported_language,
            version=version,
            context=context,
        )

    @abstractmethod
    def load(self):  # noqa D102
        pass

    @abstractmethod
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ):  # noqa ANN201
        """
        Call an external service for PII detection.

        :param text: text to be analyzed
        :param entities: Entities that should be looked for
        :param nlp_artifacts: Additional metadata from the NLP engine
        :return: List of identified PII entities
        """

        # 1. Call the external service.
        # 2. Translate results into List[RecognizerResult]
        pass

    @abstractmethod
    def get_supported_entities(self) -> List[str]:  # noqa D102
        pass

analyze(self, text, entities, nlp_artifacts)

Call an external service for PII detection.

:param text: text to be analyzed
:param entities: Entities that should be looked for
:param nlp_artifacts: Additional metadata from the NLP engine
:return: List of identified PII entities

Source code in presidio_analyzer/remote_recognizer.py
@abstractmethod
def analyze(
    self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
):  # noqa ANN201
    """
    Call an external service for PII detection.

    :param text: text to be analyzed
    :param entities: Entities that should be looked for
    :param nlp_artifacts: Additional metadata from the NLP engine
    :return: List of identified PII entities
    """

    # 1. Call the external service.
    # 2. Translate results into List[RecognizerResult]
    pass

get_supported_entities(self)

Return the list of entities this recognizer can identify.

:return: A list of the supported entities by this recognizer

Source code in presidio_analyzer/remote_recognizer.py
@abstractmethod
def get_supported_entities(self) -> List[str]:  # noqa D102
    pass

load(self)

Initialize the recognizer assets if needed.

(e.g. machine learning models)

Source code in presidio_analyzer/remote_recognizer.py
@abstractmethod
def load(self):  # noqa D102
    pass

LocalRecognizer

PII entity recognizer which runs on the same process as the AnalyzerEngine.

Source code in presidio_analyzer/local_recognizer.py
class LocalRecognizer(ABC, EntityRecognizer):
    """PII entity recognizer which runs on the same process as the AnalyzerEngine."""

PatternRecognizer

PII entity recognizer using regular expressions or deny-lists.

:param patterns: A list of patterns to detect
:param deny_list: A list of words to detect, in case our recognizer uses a predefined list of words (deny list)
:param context: list of context words
:param deny_list_score: confidence score for a term identified using a deny-list
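A minimal sketch combining a regex pattern with context words (the EMP-12345 employee-ID format is illustrative):

from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer

employee_id_pattern = Pattern(name="employee_id", regex=r"EMP-\d{5}", score=0.6)
employee_id_recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    patterns=[employee_id_pattern],
    context=["employee", "badge"],  # words that can boost the score when nearby
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(employee_id_recognizer)
results = analyzer.analyze(text="ID: EMP-12345", language="en")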

Source code in presidio_analyzer/pattern_recognizer.py
class PatternRecognizer(LocalRecognizer):
    """
    PII entity recognizer using regular expressions or deny-lists.

    :param patterns: A list of patterns to detect
    :param deny_list: A list of words to detect,
    in case our recognizer uses a predefined list of words (deny list)
    :param context: list of context words
    :param deny_list_score: confidence score for a term
    identified using a deny-list
    """

    def __init__(
        self,
        supported_entity: str,
        name: str = None,
        supported_language: str = "en",
        patterns: List[Pattern] = None,
        deny_list: List[str] = None,
        context: List[str] = None,
        deny_list_score: float = 1.0,
        version: str = "0.0.1",
    ):

        if not supported_entity:
            raise ValueError("Pattern recognizer should be initialized with entity")

        if not patterns and not deny_list:
            raise ValueError(
                "Pattern recognizer should be initialized with patterns"
                " or with deny list"
            )

        super().__init__(
            supported_entities=[supported_entity],
            supported_language=supported_language,
            name=name,
            version=version,
        )
        if patterns is None:
            self.patterns = []
        else:
            self.patterns = patterns
        self.context = context
        self.deny_list_score = deny_list_score

        if deny_list:
            deny_list_pattern = self._deny_list_to_regex(deny_list)
            self.patterns.append(deny_list_pattern)
            self.deny_list = deny_list
        else:
            self.deny_list = []

    def load(self):  # noqa D102
        pass

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts = None,
        regex_flags: int = None,
    ) -> List[RecognizerResult]:
        """
        Analyzes text to detect PII using regular expressions or deny-lists.

        :param text: Text to be analyzed
        :param entities: Entities this recognizer can detect
        :param nlp_artifacts: Output values from the NLP engine
        :param regex_flags: regex flags to use when evaluating the patterns
        :return: A list of RecognizerResult
        """
        results = []

        if self.patterns:
            pattern_result = self.__analyze_patterns(text, regex_flags)
            results.extend(pattern_result)

        return results

    def _deny_list_to_regex(self, deny_list: List[str]) -> Pattern:
        """
        Convert a list of words to a matching regex.

        To be analyzed by the analyze method as any other regex patterns.

        :param deny_list: the list of words to detect
        :return:the regex of the words for detection
        """

        # Escape deny list elements as preparation for regex
        escaped_deny_list = [re.escape(element) for element in deny_list]
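        # Match whole terms only: require start of text or a non-word
        # character before the term, and a non-word character or end of
        # text after it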
        regex = r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"
        return Pattern(name="deny_list", regex=regex, score=self.deny_list_score)

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to be validated.
        Only the part of the text that was detected by the regex engine
        :return: A bool indicating whether the validation was successful.
        """
        return None

    def invalidate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Logic to check for result invalidation by running pruning logic.

        For example, each SSN number group should not consist of all the same digits.

        :param pattern_text: the text to be validated.
        Only the part of the text that was detected by the regex engine
        :return: A bool indicating whether the result is invalidated
        """
        return None

    @staticmethod
    def build_regex_explanation(
        recognizer_name: str,
        pattern_name: str,
        pattern: str,
        original_score: float,
        validation_result: bool,
    ) -> AnalysisExplanation:
        """
        Construct an explanation for why this entity was detected.

        :param recognizer_name: Name of recognizer detecting the entity
        :param pattern_name: Regex pattern name which detected the entity
        :param pattern: Regex pattern logic
        :param original_score: Score given by the recognizer
        :param validation_result: Whether validation was used and its result
        :return: Analysis explanation
        """
        explanation = AnalysisExplanation(
            recognizer=recognizer_name,
            original_score=original_score,
            pattern_name=pattern_name,
            pattern=pattern,
            validation_result=validation_result,
        )
        return explanation

    def __analyze_patterns(
        self, text: str, flags: int = None
    ) -> List[RecognizerResult]:
        """
        Evaluate all patterns in the provided text.

        Including words in the provided deny-list

        :param text: text to analyze
        :param flags: regex flags
        :return: A list of RecognizerResult
        """
        flags = flags if flags else re.DOTALL | re.MULTILINE
        results = []
        for pattern in self.patterns:
            match_start_time = datetime.datetime.now()
            matches = re.finditer(pattern.regex, text, flags=flags)
            match_time = datetime.datetime.now() - match_start_time
            logger.debug(
                "--- match_time[%s]: %s.%s seconds",
                pattern.name,
                match_time.seconds,
                match_time.microseconds,
            )

            for match in matches:
                start, end = match.span()
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score

                validation_result = self.validate_result(current_match)
                description = self.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    entity_type=self.supported_entities[0],
                    start=start,
                    end=end,
                    score=score,
                    analysis_explanation=description,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
                        RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
                    },
                )

                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                invalidation_result = self.invalidate_result(current_match)
                if invalidation_result is not None and invalidation_result:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

                # Update analysis explanation score following validation or invalidation
                description.score = pattern_result.score

        results = EntityRecognizer.remove_duplicates(results)
        return results

    def to_dict(self) -> Dict:
        """Serialize instance into a dictionary."""
        return_dict = super().to_dict()

        return_dict["patterns"] = [pat.to_dict() for pat in self.patterns]
        return_dict["deny_list"] = self.deny_list
        return_dict["context"] = self.context
        return_dict["supported_entity"] = return_dict["supported_entities"][0]
        del return_dict["supported_entities"]

        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer":
        """Create instance from a serialized dict."""
        patterns = entity_recognizer_dict.get("patterns")
        if patterns:
            patterns_list = [Pattern.from_dict(pat) for pat in patterns]
            entity_recognizer_dict["patterns"] = patterns_list

        return cls(**entity_recognizer_dict)
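
A short usage sketch (the entity name, pattern, and deny-list term below are hypothetical examples, not built-in recognizers):

from presidio_analyzer import Pattern, PatternRecognizer

# Hypothetical pattern for badge IDs such as EMP-12345.
employee_id_pattern = Pattern(name="employee_id", regex=r"EMP-\d{5}", score=0.6)

recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    patterns=[employee_id_pattern],
    deny_list=["TEMP-WORKER"],  # exact terms, matched via the generated deny-list regex
)

results = recognizer.analyze(
    text="Badge EMP-12345 was issued to a TEMP-WORKER.",
    entities=["EMPLOYEE_ID"],
)
for result in results:
    print(result)  # e.g. type: EMPLOYEE_ID, start: 6, end: 15, score: 0.6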

analyze(self, text, entities, nlp_artifacts=None, regex_flags=None)

Analyzes text to detect PII using regular expressions or deny-lists.

:param text: Text to be analyzed
:param entities: Entities this recognizer can detect
:param nlp_artifacts: Output values from the NLP engine
:param regex_flags: regex flags to use when evaluating the patterns
:return: A list of RecognizerResult

Source code in presidio_analyzer/pattern_recognizer.py
def analyze(
    self,
    text: str,
    entities: List[str],
    nlp_artifacts: NlpArtifacts = None,
    regex_flags: int = None,
) -> List[RecognizerResult]:
    """
    Analyzes text to detect PII using regular expressions or deny-lists.

    :param text: Text to be analyzed
    :param entities: Entities this recognizer can detect
    :param nlp_artifacts: Output values from the NLP engine
    :param regex_flags: regex flags to use when evaluating the patterns
    :return: A list of RecognizerResult
    """
    results = []

    if self.patterns:
        pattern_result = self.__analyze_patterns(text, regex_flags)
        results.extend(pattern_result)

    return results

build_regex_explanation(recognizer_name, pattern_name, pattern, original_score, validation_result) staticmethod

Construct an explanation for why this entity was detected.

:param recognizer_name: Name of recognizer detecting the entity
:param pattern_name: Regex pattern name which detected the entity
:param pattern: Regex pattern logic
:param original_score: Score given by the recognizer
:param validation_result: Whether validation was used and its result
:return: Analysis explanation

Source code in presidio_analyzer/pattern_recognizer.py
@staticmethod
def build_regex_explanation(
    recognizer_name: str,
    pattern_name: str,
    pattern: str,
    original_score: float,
    validation_result: bool,
) -> AnalysisExplanation:
    """
    Construct an explanation for why this entity was detected.

    :param recognizer_name: Name of recognizer detecting the entity
    :param pattern_name: Regex pattern name which detected the entity
    :param pattern: Regex pattern logic
    :param original_score: Score given by the recognizer
    :param validation_result: Whether validation was used and its result
    :return: Analysis explanation
    """
    explanation = AnalysisExplanation(
        recognizer=recognizer_name,
        original_score=original_score,
        pattern_name=pattern_name,
        pattern=pattern,
        validation_result=validation_result,
    )
    return explanation

from_dict(entity_recognizer_dict) classmethod

Create instance from a serialized dict.

Source code in presidio_analyzer/pattern_recognizer.py
@classmethod
def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer":
    """Create instance from a serialized dict."""
    patterns = entity_recognizer_dict.get("patterns")
    if patterns:
        patterns_list = [Pattern.from_dict(pat) for pat in patterns]
        entity_recognizer_dict["patterns"] = patterns_list

    return cls(**entity_recognizer_dict)

invalidate_result(self, pattern_text)

Logic to check for result invalidation by running pruning logic.

For example, each SSN number group should not consist of all the same digits.

:param pattern_text: the text to be validated. Only the part of the text that was detected by the regex engine
:return: A bool indicating whether the result is invalidated

Source code in presidio_analyzer/pattern_recognizer.py
def invalidate_result(self, pattern_text: str) -> Optional[bool]:
    """
    Logic to check for result invalidation by running pruning logic.

    For example, each SSN number group should not consist of all the same digits.

    :param pattern_text: the text to be validated.
    Only the part of the text that was detected by the regex engine
    :return: A bool indicating whether the result is invalidated
    """
    return None

load(self)

Initialize the recognizer assets if needed.

(e.g. machine learning models)

Source code in presidio_analyzer/pattern_recognizer.py
def load(self):  # noqa D102
    pass

to_dict(self)

Serialize instance into a dictionary.

Source code in presidio_analyzer/pattern_recognizer.py
def to_dict(self) -> Dict:
    """Serialize instance into a dictionary."""
    return_dict = super().to_dict()

    return_dict["patterns"] = [pat.to_dict() for pat in self.patterns]
    return_dict["deny_list"] = self.deny_list
    return_dict["context"] = self.context
    return_dict["supported_entity"] = return_dict["supported_entities"][0]
    del return_dict["supported_entities"]

    return return_dict

validate_result(self, pattern_text)

Validate the pattern logic e.g., by running checksum on a detected pattern.

:param pattern_text: the text to be validated. Only the part of the text that was detected by the regex engine
:return: A bool indicating whether the validation was successful.

Source code in presidio_analyzer/pattern_recognizer.py
def validate_result(self, pattern_text: str) -> Optional[bool]:
    """
    Validate the pattern logic e.g., by running checksum on a detected pattern.

    :param pattern_text: the text to be validated.
    Only the part of the text that was detected by the regex engine
    :return: A bool indicating whether the validation was successful.
    """
    return None
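
Both hooks return None by default; subclasses override them to confirm or prune matches. A toy sketch (the entity, pattern, and checksum rule are invented for illustration):

from typing import Optional

from presidio_analyzer import Pattern, PatternRecognizer


class ToyAccountRecognizer(PatternRecognizer):
    """Hypothetical recognizer for 8-digit account numbers with a toy checksum."""

    def __init__(self):
        super().__init__(
            supported_entity="TOY_ACCOUNT",
            patterns=[Pattern(name="toy_account", regex=r"\b\d{8}\b", score=0.4)],
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        # Toy checksum: digit sum divisible by 10. True lifts the score to
        # MAX_SCORE, False drops it to MIN_SCORE (see __analyze_patterns).
        return sum(int(digit) for digit in pattern_text) % 10 == 0

    def invalidate_result(self, pattern_text: str) -> Optional[bool]:
        # Prune matches in which all digits are identical, e.g. 55555555.
        return len(set(pattern_text)) == 1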

NlpArtifacts

NlpArtifacts is an abstraction layer over the results of an NLP pipeline.

Given the results of NLP processing over a text, it holds attributes such as entities, tokens and lemmas, which can be used by any recognizer.

Source code in presidio_analyzer/nlp_engine/nlp_artifacts.py
class NlpArtifacts:
    """
    NlpArtifacts is an abstraction layer over the results of an NLP pipeline.

    Given the results of NLP processing over a text, it holds attributes
    such as entities, tokens and lemmas, which can be used by any recognizer
    """

    def __init__(
        self,
        entities: List[Span],
        tokens: Doc,
        tokens_indices: List[int],
        lemmas: List[str],
        nlp_engine,  # noqa ANN001
        language: str,
    ):
        self.entities = entities
        self.tokens = tokens
        self.lemmas = lemmas
        self.tokens_indices = tokens_indices
        self.keywords = self.set_keywords(nlp_engine, lemmas, language)
        self.nlp_engine = nlp_engine

    @staticmethod
    def set_keywords(
        nlp_engine, lemmas: List[str], language: str  # noqa ANN001
    ) -> List[str]:
        """
        Return keywords for text.

        Extracts lemmas with certain conditions as keywords.
        """
        if not nlp_engine:
            return []
        keywords = [
            k.lower()
            for k in lemmas
            if not nlp_engine.is_stopword(k, language)
            and not nlp_engine.is_punct(k, language)
            and k != "-PRON-"
            and k != "be"
        ]

        # best effort: try to further break tokens into sub-tokens;
        # this can reduce false negatives
        keywords = [i.split(":") for i in keywords]

        # splitting can turn the list into a list of lists,
        # so we flatten it
        keywords = [item for sublist in keywords for item in sublist]
        return keywords

    def to_json(self) -> str:
        """Convert nlp artifacts to json."""

        return_dict = self.__dict__.copy()

        # Ignore NLP engine as it's not serializable currently
        del return_dict["nlp_engine"]

        # Converting spaCy tokens and spans to string as they are not serializable
        if "tokens" in return_dict:
            return_dict["tokens"] = [token.text for token in self.tokens]
        if "entities" in return_dict:
            return_dict["entities"] = [entity.text for entity in self.entities]

        return json.dumps(return_dict)
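
To see the artifacts in practice, one can run an NLP engine over a short text; this sketch assumes the default spaCy configuration and that its model (en_core_web_lg) is installed:

from presidio_analyzer.nlp_engine import NlpEngineProvider

nlp_engine = NlpEngineProvider().create_engine()
artifacts = nlp_engine.process_text("David lives in London", language="en")

print([token.text for token in artifacts.tokens])  # spaCy tokens
print(artifacts.lemmas)                            # one lemma per token
print([ent.text for ent in artifacts.entities])    # NER spans, e.g. David, London
print(artifacts.keywords)                          # lemmas minus stop words and punctuation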

set_keywords(nlp_engine, lemmas, language) staticmethod

Return keywords for text.

Extracts lemmas with certain conditions as keywords.

Source code in presidio_analyzer/nlp_engine/nlp_artifacts.py
@staticmethod
def set_keywords(
    nlp_engine, lemmas: List[str], language: str  # noqa ANN001
) -> List[str]:
    """
    Return keywords for text.

    Extracts lemmas with certain conditions as keywords.
    """
    if not nlp_engine:
        return []
    keywords = [
        k.lower()
        for k in lemmas
        if not nlp_engine.is_stopword(k, language)
        and not nlp_engine.is_punct(k, language)
        and k != "-PRON-"
        and k != "be"
    ]

    # best effort: try to further break tokens into sub-tokens;
    # this can reduce false negatives
    keywords = [i.split(":") for i in keywords]

    # splitting can turn the list into a list of lists,
    # so we flatten it
    keywords = [item for sublist in keywords for item in sublist]
    return keywords

to_json(self)

Convert nlp artifacts to json.

Source code in presidio_analyzer/nlp_engine/nlp_artifacts.py
def to_json(self) -> str:
    """Convert nlp artifacts to json."""

    return_dict = self.__dict__.copy()

    # Ignore NLP engine as it's not serializable currently
    del return_dict["nlp_engine"]

    # Converting spaCy tokens and spans to string as they are not serializable
    if "tokens" in return_dict:
        return_dict["tokens"] = [token.text for token in self.tokens]
    if "entities" in return_dict:
        return_dict["entities"] = [entity.text for entity in self.entities]

    return json.dumps(return_dict)

NlpEngine

NlpEngine is an abstraction layer over the nlp module.

It provides NLP preprocessing functionality as well as other queries on tokens.

Source code in presidio_analyzer/nlp_engine/nlp_engine.py
class NlpEngine(ABC):
    """
    NlpEngine is an abstraction layer over the nlp module.

    It provides NLP preprocessing functionality as well as other queries
    on tokens.
    """

    @abstractmethod
    def process_text(self, text: str, language: str) -> NlpArtifacts:
        """Execute the NLP pipeline on the given text and language."""

    @abstractmethod
    def process_batch(
        self, texts: Iterable[str], language: str, **kwargs
    ) -> Iterator[Tuple[str, NlpArtifacts]]:
        """Execute the NLP pipeline on a batch of texts.

        Returns a tuple of (text, NlpArtifacts)
        """

    @abstractmethod
    def is_stopword(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a stop word.

        (within the given language)
        """

    @abstractmethod
    def is_punct(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a punctuation word.

        (within the given language)
        """

is_punct(self, word, language)

Return true if the given word is a punctuation word.

(within the given language)

Source code in presidio_analyzer/nlp_engine/nlp_engine.py
@abstractmethod
def is_punct(self, word: str, language: str) -> bool:
    """
    Return true if the given word is a punctuation word.

    (within the given language)
    """

is_stopword(self, word, language)

Return true if the given word is a stop word.

(within the given language)

Source code in presidio_analyzer/nlp_engine/nlp_engine.py
@abstractmethod
def is_stopword(self, word: str, language: str) -> bool:
    """
    Return true if the given word is a stop word.

    (within the given language)
    """

process_batch(self, texts, language, **kwargs)

Execute the NLP pipeline on a batch of texts.

Returns a tuple of (text, NlpArtifacts)

Source code in presidio_analyzer/nlp_engine/nlp_engine.py
@abstractmethod
def process_batch(
    self, texts: Iterable[str], language: str, **kwargs
) -> Iterator[Tuple[str, NlpArtifacts]]:
    """Execute the NLP pipeline on a batch of texts.

    Returns a tuple of (text, NlpArtifacts)
    """

process_text(self, text, language)

Execute the NLP pipeline on the given text and language.

Source code in presidio_analyzer/nlp_engine/nlp_engine.py
@abstractmethod
def process_text(self, text: str, language: str) -> NlpArtifacts:
    """Execute the NLP pipeline on the given text and language."""

SpacyNlpEngine

SpacyNlpEngine is an abstraction layer over the nlp module.

It provides processing functionality as well as other queries on tokens. The SpacyNlpEngine uses SpaCy as its NLP module.

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
class SpacyNlpEngine(NlpEngine):
    """
    SpacyNlpEngine is an abstraction layer over the nlp module.

    It provides processing functionality as well as other queries
    on tokens.
    The SpacyNlpEngine uses SpaCy as its NLP module
    """

    engine_name = "spacy"
    is_available = bool(spacy)

    def __init__(self, models: Optional[Dict[str, str]] = None):
        """
        Initialize a wrapper on spaCy functionality.

        :param models: Dictionary with the name of the spaCy model per language.
        For example: models = {"en": "en_core_web_lg"}
        """
        if not models:
            models = {"en": "en_core_web_lg"}
        logger.debug(f"Loading SpaCy models: {models.values()}")

        self.nlp = {
            lang_code: spacy.load(model_name, disable=["parser"])
            for lang_code, model_name in models.items()
        }

    def process_text(self, text: str, language: str) -> NlpArtifacts:
        """Execute the SpaCy NLP pipeline on the given text and language."""

        doc = self.nlp[language](text)
        return self._doc_to_nlp_artifact(doc, language)

    def process_batch(
        self,
        texts: Union[List[str], List[Tuple[str, object]]],
        language: str,
        as_tuples: bool = False,
    ) -> Iterator[Tuple[str, NlpArtifacts]]:
        """Execute the NLP pipeline on a batch of texts using spacy pipe."""
        texts = (str(text) for text in texts)
        docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
        for doc in docs:
            yield doc.text, self._doc_to_nlp_artifact(doc, language)

    def is_stopword(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a stop word.

        (within the given language)
        """
        return self.nlp[language].vocab[word].is_stop

    def is_punct(self, word: str, language: str) -> bool:
        """
        Return true if the given word is a punctuation word.

        (within the given language).
        """
        return self.nlp[language].vocab[word].is_punct

    def get_nlp(self, language: str) -> Language:
        """
        Return the language model loaded for a language.

        :param language: Name of language
        :return: Language model from spaCy
        """
        return self.nlp[language]

    def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
        lemmas = [token.lemma_ for token in doc]
        tokens_indices = [token.idx for token in doc]
        entities = doc.ents
        return NlpArtifacts(
            entities=entities,
            tokens=doc,
            tokens_indices=tokens_indices,
            lemmas=lemmas,
            nlp_engine=self,
            language=language,
        )
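
A brief usage sketch, assuming the referenced spaCy model is installed (python -m spacy download en_core_web_lg):

from presidio_analyzer.nlp_engine import SpacyNlpEngine

engine = SpacyNlpEngine(models={"en": "en_core_web_lg"})

artifacts = engine.process_text("My name is Dan", language="en")
print([ent.text for ent in artifacts.entities])  # e.g. ['Dan']

print(engine.is_stopword("the", "en"))  # True
print(engine.is_punct(",", "en"))       # True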

__init__(self, models=None) special

Initialize a wrapper on spaCy functionality.

:param models: Dictionary with the name of the spaCy model per language. For example: models = {"en": "en_core_web_lg"}

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
def __init__(self, models: Optional[Dict[str, str]] = None):
    """
    Initialize a wrapper on spaCy functionality.

    :param models: Dictionary with the name of the spaCy model per language.
    For example: models = {"en": "en_core_web_lg"}
    """
    if not models:
        models = {"en": "en_core_web_lg"}
    logger.debug(f"Loading SpaCy models: {models.values()}")

    self.nlp = {
        lang_code: spacy.load(model_name, disable=["parser"])
        for lang_code, model_name in models.items()
    }

get_nlp(self, language)

Return the language model loaded for a language.

:param language: Name of language
:return: Language model from spaCy

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
def get_nlp(self, language: str) -> Language:
    """
    Return the language model loaded for a language.

    :param language: Name of language
    :return: Language model from spaCy
    """
    return self.nlp[language]

is_punct(self, word, language)

Return true if the given word is a punctuation word.

(within the given language).

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
def is_punct(self, word: str, language: str) -> bool:
    """
    Return true if the given word is a punctuation word.

    (within the given language).
    """
    return self.nlp[language].vocab[word].is_punct

is_stopword(self, word, language)

Return true if the given word is a stop word.

(within the given language)

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
def is_stopword(self, word: str, language: str) -> bool:
    """
    Return true if the given word is a stop word.

    (within the given language)
    """
    return self.nlp[language].vocab[word].is_stop

process_batch(self, texts, language, as_tuples=False)

Execute the NLP pipeline on a batch of texts using spacy pipe.

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
def process_batch(
    self,
    texts: Union[List[str], List[Tuple[str, object]]],
    language: str,
    as_tuples: bool = False,
) -> Iterator[Tuple[str, NlpArtifacts]]:
    """Execute the NLP pipeline on a batch of texts using spacy pipe."""
    texts = (str(text) for text in texts)
    docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
    for doc in docs:
        yield doc.text, self._doc_to_nlp_artifact(doc, language)
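
For example, streaming several texts through the pipe (again assuming an installed English model):

from presidio_analyzer.nlp_engine import SpacyNlpEngine

engine = SpacyNlpEngine()  # defaults to {"en": "en_core_web_lg"}

texts = ["John lives in Paris.", "Anna works in Berlin."]
for text, artifacts in engine.process_batch(texts, language="en"):
    print(text, "->", [ent.text for ent in artifacts.entities])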

process_text(self, text, language)

Execute the SpaCy NLP pipeline on the given text and language.

Source code in presidio_analyzer/nlp_engine/spacy_nlp_engine.py
def process_text(self, text: str, language: str) -> NlpArtifacts:
    """Execute the SpaCy NLP pipeline on the given text and language."""

    doc = self.nlp[language](text)
    return self._doc_to_nlp_artifact(doc, language)

NlpEngineProvider

Create different NLP engines from configuration.

:param nlp_engines: List of available NLP engines.
Default: (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)
:param nlp_configuration: Dict containing nlp configuration
:example: configuration:
        {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en",
                        "model_name": "en_core_web_lg"
                      }]
        }
Nlp engine names available by default: spacy, stanza, transformers.
:param conf_file: Path to yaml file containing nlp engine configuration.

Source code in presidio_analyzer/nlp_engine/nlp_engine_provider.py
class NlpEngineProvider:
    """Create different NLP engines from configuration.

    :param nlp_engines: List of available NLP engines.
    Default: (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)
    :param nlp_configuration: Dict containing nlp configuration
    :example: configuration:
            {
                "nlp_engine_name": "spacy",
                "models": [{"lang_code": "en",
                            "model_name": "en_core_web_lg"
                          }]
            }
    Nlp engine names available by default: spacy, stanza, transformers.
    :param conf_file: Path to yaml file containing nlp engine configuration.
    """

    def __init__(
        self,
        nlp_engines: Optional[Tuple] = None,
        conf_file: Optional[Union[Path, str]] = None,
        nlp_configuration: Optional[Dict] = None,
    ):

        if not nlp_engines:
            nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)

        self.nlp_engines = {
            engine.engine_name: engine for engine in nlp_engines if engine.is_available
        }
        logger.debug(
            f"Loaded these available nlp engines: {list(self.nlp_engines.keys())}"
        )

        if conf_file and nlp_configuration:
            raise ValueError(
                "Either conf_file or nlp_configuration should be provided, not both."
            )

        if nlp_configuration:
            self.nlp_configuration = nlp_configuration

        if conf_file:
            self.nlp_configuration = self._read_nlp_conf(conf_file)

        if not conf_file and not nlp_configuration:
            conf_file = self._get_full_conf_path()
            logger.debug(f"Reading default conf file from {conf_file}")
            self.nlp_configuration = self._read_nlp_conf(conf_file)

    def create_engine(self) -> NlpEngine:
        """Create an NLP engine instance."""
        if (
            not self.nlp_configuration
            or not self.nlp_configuration.get("models")
            or not self.nlp_configuration.get("nlp_engine_name")
        ):
            raise ValueError(
                "Illegal nlp configuration. "
                "Configuration should include nlp_engine_name and models "
                "(list of model_name for each lang_code)."
            )
        nlp_engine_name = self.nlp_configuration["nlp_engine_name"]
        if nlp_engine_name not in self.nlp_engines:
            raise ValueError(
                f"NLP engine '{nlp_engine_name}' is not available. "
                "Make sure you have all required packages installed"
            )
        try:
            nlp_engine_class = self.nlp_engines[nlp_engine_name]
            nlp_engine_opts = {
                m["lang_code"]: m["model_name"]
                for m in self.nlp_configuration["models"]
            }
            engine = nlp_engine_class(nlp_engine_opts)
            logger.info(
                f"Created NLP engine: {engine.engine_name}. "
                f"Loaded models: {list(engine.nlp.keys())}"
            )
            return engine
        except KeyError:
            raise ValueError("Wrong NLP engine configuration")

    @staticmethod
    def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
        """Read the nlp configuration from a provided yaml file."""

        if not Path(conf_file).exists():
            nlp_configuration = {
                "nlp_engine_name": "spacy",
                "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
            }
            logger.warning(
                f"configuration file {conf_file} not found.  "
                f"Using default config: {nlp_configuration}."
            )

        else:
            with open(conf_file) as file:
                nlp_configuration = yaml.safe_load(file)

        return nlp_configuration

    @staticmethod
    def _get_full_conf_path(
        default_conf_file: Union[Path, str] = "default.yaml"
    ) -> Path:
        """Return a Path to the default conf file."""
        return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file)
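
A typical usage sketch, building an engine from an in-memory configuration and handing it to an AnalyzerEngine (assumes en_core_web_sm is installed; any installed spaCy model works):

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
}

provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
results = analyzer.analyze(text="My phone number is 212-555-5555", language="en")
print(results)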

create_engine(self)

Create an NLP engine instance.

Source code in presidio_analyzer/nlp_engine/nlp_engine_provider.py
def create_engine(self) -> NlpEngine:
    """Create an NLP engine instance."""
    if (
        not self.nlp_configuration
        or not self.nlp_configuration.get("models")
        or not self.nlp_configuration.get("nlp_engine_name")
    ):
        raise ValueError(
            "Illegal nlp configuration. "
            "Configuration should include nlp_engine_name and models "
            "(list of model_name for each lang_code)."
        )
    nlp_engine_name = self.nlp_configuration["nlp_engine_name"]
    if nlp_engine_name not in self.nlp_engines:
        raise ValueError(
            f"NLP engine '{nlp_engine_name}' is not available. "
            "Make sure you have all required packages installed"
        )
    try:
        nlp_engine_class = self.nlp_engines[nlp_engine_name]
        nlp_engine_opts = {
            m["lang_code"]: m["model_name"]
            for m in self.nlp_configuration["models"]
        }
        engine = nlp_engine_class(nlp_engine_opts)
        logger.info(
            f"Created NLP engine: {engine.engine_name}. "
            f"Loaded models: {list(engine.nlp.keys())}"
        )
        return engine
    except KeyError:
        raise ValueError("Wrong NLP engine configuration")
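
Equivalently, the configuration can be read from a YAML file via conf_file; the file path below is made up for the example:

from pathlib import Path

from presidio_analyzer.nlp_engine import NlpEngineProvider

# Write a minimal YAML configuration; the keys mirror the dict form above.
# Assumes en_core_web_lg is installed.
conf_path = Path("example-nlp-config.yml")
conf_path.write_text(
    "nlp_engine_name: spacy\n"
    "models:\n"
    "  - lang_code: en\n"
    "    model_name: en_core_web_lg\n"
)

engine = NlpEngineProvider(conf_file=conf_path).create_engine()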

RecognizerResult

Recognizer Result represents the findings of the detected entity.

Result of a recognizer analyzing the text.

:param entity_type: the type of the entity
:param start: the start location of the detected entity
:param end: the end location of the detected entity
:param score: the score of the detection
:param analysis_explanation: contains the explanation of why this entity was identified
:param recognition_metadata: a dictionary of metadata to be used in recognizer specific cases, for example specific recognized context words and recognizer name

Source code in presidio_analyzer/recognizer_result.py
class RecognizerResult:
    """
    Recognizer Result represents the findings of the detected entity.

    Result of a recognizer analyzing the text.

    :param entity_type: the type of the entity
    :param start: the start location of the detected entity
    :param end: the end location of the detected entity
    :param score: the score of the detection
    :param analysis_explanation: contains the explanation of why this
                                 entity was identified
    :param recognition_metadata: a dictionary of metadata to be used in
    recognizer specific cases, for example specific recognized context words
    and recognizer name
    """

    # Keys for recognizer metadata
    RECOGNIZER_NAME_KEY = "recognizer_name"
    RECOGNIZER_IDENTIFIER_KEY = "recognizer_identifier"

    # Key of a flag inside recognition_metadata dictionary
    # which is set to true if the result enhanced by context
    IS_SCORE_ENHANCED_BY_CONTEXT_KEY = "is_score_enhanced_by_context"

    logger = logging.getLogger("presidio-analyzer")

    def __init__(
        self,
        entity_type: str,
        start: int,
        end: int,
        score: float,
        analysis_explanation: AnalysisExplanation = None,
        recognition_metadata: Dict = None,
    ):

        self.entity_type = entity_type
        self.start = start
        self.end = end
        self.score = score
        self.analysis_explanation = analysis_explanation

        if not recognition_metadata:
            self.logger.debug(
                "recognition_metadata should be passed, "
                "containing a recognizer_name value"
            )

        self.recognition_metadata = recognition_metadata

    def append_analysis_explanation_text(self, text: str) -> None:
        """Add text to the analysis explanation."""
        if self.analysis_explanation:
            self.analysis_explanation.append_textual_explanation_line(text)

    def to_dict(self) -> Dict:
        """
        Serialize self to dictionary.

        :return: a dictionary
        """
        return self.__dict__

    @classmethod
    def from_json(cls, data: Dict) -> "RecognizerResult":
        """
        Create RecognizerResult from json.

        :param data: e.g. {
            "start": 24,
            "end": 32,
            "score": 0.8,
            "entity_type": "NAME"
        }
        :return: RecognizerResult
        """
        score = data.get("score")
        entity_type = data.get("entity_type")
        start = data.get("start")
        end = data.get("end")
        return cls(entity_type, start, end, score)

    def __repr__(self) -> str:
        """Return a string representation of the instance."""
        return self.__str__()

    def intersects(self, other: "RecognizerResult") -> int:
        """
        Check if self intersects with a different RecognizerResult.

        :return: If intersecting, returns the number of
        intersecting characters.
        If not, returns 0
        """
        # if they do not overlap the intersection is 0
        if self.end < other.start or other.end < self.start:
            return 0

        # otherwise the intersection is min(end) - max(start)
        return min(self.end, other.end) - max(self.start, other.start)

    def contained_in(self, other: "RecognizerResult") -> bool:
        """
        Check if self is contained in a different RecognizerResult.

        :return: true if contained
        """
        return self.start >= other.start and self.end <= other.end

    def contains(self, other: "RecognizerResult") -> bool:
        """
        Check if one result is contained or equal to another result.

        :param other: another RecognizerResult
        :return: bool
        """
        return self.start <= other.start and self.end >= other.end

    def equal_indices(self, other: "RecognizerResult") -> bool:
        """
        Check if the indices are equal between two results.

        :param other: another RecognizerResult
        :return: bool
        """
        return self.start == other.start and self.end == other.end

    def __gt__(self, other: "RecognizerResult") -> bool:
        """
        Check if one result is greater by using the results indices in the text.

        :param other: another RecognizerResult
        :return: bool
        """
        if self.start == other.start:
            return self.end > other.end
        return self.start > other.start

    def __eq__(self, other: "RecognizerResult") -> bool:
        """
        Check two results are equal by using all class fields.

        :param other: another RecognizerResult
        :return: bool
        """
        equal_type = self.entity_type == other.entity_type
        equal_score = self.score == other.score
        return self.equal_indices(other) and equal_type and equal_score

    def __hash__(self):
        """
        Hash the result data by using all class fields.

        :return: int
        """
        return hash(
            f"{str(self.start)} {str(self.end)} {str(self.score)} {self.entity_type}"
        )

    def __str__(self) -> str:
        """Return a string representation of the instance."""
        return (
            f"type: {self.entity_type}, "
            f"start: {self.start}, "
            f"end: {self.end}, "
            f"score: {self.score}"
        )

    def has_conflict(self, other: "RecognizerResult") -> bool:
        """
        Check if two recognizer results are conflicted or not.

        I have a conflict if:
        1. My indices are the same as the other's and my score is lower.
        2. My indices are contained in another result.

        :param other: RecognizerResult
        :return: bool
        """
        if self.equal_indices(other):
            return self.score <= other.score
        return other.contains(self)
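
A small worked example of the span relations defined below:

from presidio_analyzer import RecognizerResult

a = RecognizerResult(entity_type="PERSON", start=10, end=18, score=0.85)
b = RecognizerResult(entity_type="PERSON", start=12, end=18, score=0.6)

print(a.intersects(b))    # 6 -> min(end) - max(start) = 18 - 12
print(a.contains(b))      # True: b's span lies within a's
print(b.contained_in(a))  # True
print(b.has_conflict(a))  # True: b is contained in a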

__eq__(self, other) special

Check two results are equal by using all class fields.

:param other: another RecognizerResult
:return: bool

Source code in presidio_analyzer/recognizer_result.py
def __eq__(self, other: "RecognizerResult") -> bool:
    """
    Check two results are equal by using all class fields.

    :param other: another RecognizerResult
    :return: bool
    """
    equal_type = self.entity_type == other.entity_type
    equal_score = self.score == other.score
    return self.equal_indices(other) and equal_type and equal_score

__gt__(self, other) special

Check if one result is greater by using the results indices in the text.

:param other: another RecognizerResult
:return: bool

Source code in presidio_analyzer/recognizer_result.py
def __gt__(self, other: "RecognizerResult") -> bool:
    """
    Check if one result is greater by using the results indices in the text.

    :param other: another RecognizerResult
    :return: bool
    """
    if self.start == other.start:
        return self.end > other.end
    return self.start > other.start

__hash__(self) special

Hash the result data by using all class fields.

:return: int

Source code in presidio_analyzer/recognizer_result.py
def __hash__(self):
    """
    Hash the result data by using all class fields.

    :return: int
    """
    return hash(
        f"{str(self.start)} {str(self.end)} {str(self.score)} {self.entity_type}"
    )

__repr__(self) special

Return a string representation of the instance.

Source code in presidio_analyzer/recognizer_result.py
def __repr__(self) -> str:
    """Return a string representation of the instance."""
    return self.__str__()

__str__(self) special

Return a string representation of the instance.

Source code in presidio_analyzer/recognizer_result.py
def __str__(self) -> str:
    """Return a string representation of the instance."""
    return (
        f"type: {self.entity_type}, "
        f"start: {self.start}, "
        f"end: {self.end}, "
        f"score: {self.score}"
    )

append_analysis_explanation_text(self, text)

Add text to the analysis explanation.

Source code in presidio_analyzer/recognizer_result.py
def append_analysis_explanation_text(self, text: str) -> None:
    """Add text to the analysis explanation."""
    if self.analysis_explanation:
        self.analysis_explanation.append_textual_explanation_line(text)

contained_in(self, other)

Check if self is contained in a different RecognizerResult.

:return: true if contained

Source code in presidio_analyzer/recognizer_result.py
def contained_in(self, other: "RecognizerResult") -> bool:
    """
    Check if self is contained in a different RecognizerResult.

    :return: true if contained
    """
    return self.start >= other.start and self.end <= other.end

contains(self, other)

Check if one result is contained or equal to another result.

:param other: another RecognizerResult
:return: bool

Source code in presidio_analyzer/recognizer_result.py
def contains(self, other: "RecognizerResult") -> bool:
    """
    Check if one result is contained or equal to another result.

    :param other: another RecognizerResult
    :return: bool
    """
    return self.start <= other.start and self.end >= other.end

equal_indices(self, other)

Check if the indices are equal between two results.

:param other: another RecognizerResult
:return: bool

Source code in presidio_analyzer/recognizer_result.py
def equal_indices(self, other: "RecognizerResult") -> bool:
    """
    Check if the indices are equal between two results.

    :param other: another RecognizerResult
    :return: bool
    """
    return self.start == other.start and self.end == other.end

from_json(data) classmethod

Create RecognizerResult from json.

:param data: e.g. {
    "start": 24,
    "end": 32,
    "score": 0.8,
    "entity_type": "NAME"
}
:return: RecognizerResult

Source code in presidio_analyzer/recognizer_result.py
@classmethod
def from_json(cls, data: Dict) -> "RecognizerResult":
    """
    Create RecognizerResult from json.

    :param data: e.g. {
        "start": 24,
        "end": 32,
        "score": 0.8,
        "entity_type": "NAME"
    }
    :return: RecognizerResult
    """
    score = data.get("score")
    entity_type = data.get("entity_type")
    start = data.get("start")
    end = data.get("end")
    return cls(entity_type, start, end, score)
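
Using the dictionary from the docstring above:

from presidio_analyzer import RecognizerResult

data = {"start": 24, "end": 32, "score": 0.8, "entity_type": "NAME"}
result = RecognizerResult.from_json(data)
print(result)  # type: NAME, start: 24, end: 32, score: 0.8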

has_conflict(self, other)

Check if two recognizer results are conflicted or not.

I have a conflict if:

1. My indices are the same as the other's and my score is lower.
2. My indices are contained in another result.

:param other: RecognizerResult
:return: bool

Source code in presidio_analyzer/recognizer_result.py
def has_conflict(self, other: "RecognizerResult") -> bool:
    """
    Check if two recognizer results are conflicted or not.

    I have a conflict if:
    1. My indices are the same as the other's and my score is lower.
    2. My indices are contained in another result.

    :param other: RecognizerResult
    :return: bool
    """
    if self.equal_indices(other):
        return self.score <= other.score
    return other.contains(self)

intersects(self, other)

Check if self intersects with a different RecognizerResult.

:return: If intersecting, returns the number of intersecting characters. If not, returns 0

Source code in presidio_analyzer/recognizer_result.py
def intersects(self, other: "RecognizerResult") -> int:
    """
    Check if self intersects with a different RecognizerResult.

    :return: If intersecting, returns the number of
    intersecting characters.
    If not, returns 0
    """
    # if they do not overlap the intersection is 0
    if self.end < other.start or other.end < self.start:
        return 0

    # otherwise the intersection is min(end) - max(start)
    return min(self.end, other.end) - max(self.start, other.start)

to_dict(self)

Serialize self to dictionary.

:return: a dictionary

Source code in presidio_analyzer/recognizer_result.py
def to_dict(self) -> Dict:
    """
    Serialize self to dictionary.

    :return: a dictionary
    """
    return self.__dict__

Pattern

A class that represents a regex pattern.

:param name: the name of the pattern
:param regex: the regex pattern to detect
:param score: the pattern's strength (values vary between 0 and 1)

Source code in presidio_analyzer/pattern.py
class Pattern:
    """
    A class that represents a regex pattern.

    :param name: the name of the pattern
    :param regex: the regex pattern to detect
    :param score: the pattern's strength (values vary between 0 and 1)
    """

    def __init__(self, name: str, regex: str, score: float):

        self.name = name
        self.regex = regex
        self.score = score

    def to_dict(self) -> Dict:
        """
        Turn this instance into a dictionary.

        :return: a dictionary
        """
        return_dict = {"name": self.name, "score": self.score, "regex": self.regex}
        return return_dict

    @classmethod
    def from_dict(cls, pattern_dict: Dict) -> "Pattern":
        """
        Load an instance from a dictionary.

        :param pattern_dict: a dictionary holding the pattern's parameters
        :return: a Pattern instance
        """
        return cls(**pattern_dict)

    def __repr__(self):
        """Return string representation of instance."""
        return json.dumps(self.to_dict())

    def __str__(self):
        """Return string representation of instance."""
        return json.dumps(self.to_dict())
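
A short round-trip sketch (the pattern itself is a made-up example):

from presidio_analyzer import Pattern

pattern = Pattern(name="us_zip", regex=r"\b\d{5}\b", score=0.3)

as_dict = pattern.to_dict()
print(as_dict)  # {'name': 'us_zip', 'score': 0.3, 'regex': '\\b\\d{5}\\b'}

restored = Pattern.from_dict(as_dict)
print(restored)  # __str__ returns the JSON form of to_dict()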

__repr__(self) special

Return string representation of instance.

Source code in presidio_analyzer/pattern.py
def __repr__(self):
    """Return string representation of instance."""
    return json.dumps(self.to_dict())

__str__(self) special

Return string representation of instance.

Source code in presidio_analyzer/pattern.py
def __str__(self):
    """Return string representation of instance."""
    return json.dumps(self.to_dict())

from_dict(pattern_dict) classmethod

Load an instance from a dictionary.

:param pattern_dict: a dictionary holding the pattern's parameters :return: a Pattern instance

Source code in presidio_analyzer/pattern.py
@classmethod
def from_dict(cls, pattern_dict: Dict) -> "Pattern":
    """
    Load an instance from a dictionary.

    :param pattern_dict: a dictionary holding the pattern's parameters
    :return: a Pattern instance
    """
    return cls(**pattern_dict)

to_dict(self)

Turn this instance into a dictionary.

:return: a dictionary

Source code in presidio_analyzer/pattern.py
def to_dict(self) -> Dict:
    """
    Turn this instance into a dictionary.

    :return: a dictionary
    """
    return_dict = {"name": self.name, "score": self.score, "regex": self.regex}
    return return_dict