Skip to content

Presidio Image Redactor API Reference

ImageRedactorEngine class

ImageRedactorEngine performs OCR + PII detection + bounding box redaction.

:param image_analyzer_engine: Engine which performs OCR + PII detection.

Source code in presidio_image_redactor/image_redactor_engine.py
class ImageRedactorEngine:
    """ImageRedactorEngine performs OCR + PII detection + bounding box redaction.

    :param image_analyzer_engine: Engine which performs OCR + PII detection.
    """

    def __init__(self, image_analyzer_engine: ImageAnalyzerEngine = None):
        # Fall back to a default analyzer engine when none is supplied.
        if image_analyzer_engine:
            self.image_analyzer_engine = image_analyzer_engine
        else:
            self.image_analyzer_engine = ImageAnalyzerEngine()

    def redact(
        self, image: Image,
        fill: Union[int, Tuple[int, int, int]] = (0, 0, 0),
        **kwargs,
    ) -> Image:
        """Redact method to redact the given image.

        Please notice, this method duplicates the image, creates a new
        instance and manipulates it.

        :param image: PIL Image to be processed
        :param fill: colour to fill the shape - int (0-255) for
            grayscale or Tuple(R, G, B) for RGB
        :param kwargs: Additional values for the analyze method in AnalyzerEngine

        :return: the redacted image
        """
        # Work on a copy so the caller's original image is left untouched.
        redacted_image = ImageChops.duplicate(image)

        detected_regions = self.image_analyzer_engine.analyze(redacted_image, **kwargs)
        drawer = ImageDraw.Draw(redacted_image)

        # Paint a filled rectangle over every detected PII bounding box.
        for region in detected_regions:
            left, top = region.left, region.top
            right = left + region.width
            bottom = top + region.height
            drawer.rectangle([left, top, right, bottom], fill=fill)

        return redacted_image

redact(self, image, fill=(0, 0, 0), **kwargs)

Redact method to redact the given image.

Please notice, this method duplicates the image, creates a new instance and manipulates it.

:param image: PIL Image to be processed
:param fill: colour to fill the shape - int (0-255) for grayscale or Tuple(R, G, B) for RGB
:param kwargs: Additional values for the analyze method in AnalyzerEngine

:return: the redacted image

Source code in presidio_image_redactor/image_redactor_engine.py
def redact(
    self, image: Image,
    fill: Union[int, Tuple[int, int, int]] = (0, 0, 0),
    **kwargs,
) -> Image:
    """Redact method to redact the given image.

    Please notice, this method duplicates the image, creates a new
    instance and manipulates it.

    :param image: PIL Image to be processed
    :param fill: colour to fill the shape - int (0-255) for
        grayscale or Tuple(R, G, B) for RGB
    :param kwargs: Additional values for the analyze method in AnalyzerEngine

    :return: the redacted image
    """
    # Work on a copy so the caller's original image is left untouched.
    redacted_image = ImageChops.duplicate(image)

    detected_regions = self.image_analyzer_engine.analyze(redacted_image, **kwargs)
    drawer = ImageDraw.Draw(redacted_image)

    # Paint a filled rectangle over every detected PII bounding box.
    for region in detected_regions:
        left, top = region.left, region.top
        right = left + region.width
        bottom = top + region.height
        drawer.rectangle([left, top, right, bottom], fill=fill)

    return redacted_image

ImageAnalyzerEngine class

ImageAnalyzerEngine class.

:param analyzer_engine: The Presidio AnalyzerEngine instance to be used to detect PII in text :param ocr: the OCR object to be used to detect text in images.

Source code in presidio_image_redactor/image_analyzer_engine.py
class ImageAnalyzerEngine:
    """ImageAnalyzerEngine class.

    :param analyzer_engine: The Presidio AnalyzerEngine instance
        to be used to detect PII in text
    :param ocr: the OCR object to be used to detect text in images.
    """

    def __init__(self, analyzer_engine: AnalyzerEngine = None, ocr: OCR = None):
        # Default to a fresh AnalyzerEngine / TesseractOCR when not provided.
        if not analyzer_engine:
            analyzer_engine = AnalyzerEngine()
        self.analyzer_engine = analyzer_engine

        if not ocr:
            ocr = TesseractOCR()
        self.ocr = ocr

    def analyze(self, image: object, **kwargs) -> List[ImageRecognizerResult]:
        """Analyse method to analyse the given image.

        :param image: PIL Image/numpy array or file path(str) to be processed
        :param kwargs: Additional values for the analyze method in AnalyzerEngine

        :return: list of the extracted entities with image bounding boxes
        """
        # OCR first, then flatten the OCR dict into plain text for the analyzer.
        ocr_result = self.ocr.perform_ocr(image)
        text = self.ocr.get_text_from_ocr_dict(ocr_result)

        analyzer_result = self.analyzer_engine.analyze(
            text=text, language="en", **kwargs
        )
        # Translate character-offset results back onto image coordinates.
        bboxes = self.map_analyzer_results_to_bounding_boxes(
            analyzer_result, ocr_result, text
        )
        return bboxes

    @staticmethod
    def map_analyzer_results_to_bounding_boxes(
        text_analyzer_results: List[RecognizerResult], ocr_result: dict, text: str
    ) -> List[ImageRecognizerResult]:
        """Map extracted PII entities to image bounding boxes.

        Matching is based on the position of the recognized entity from analyzer
        and word (in ocr dict) in the text.

        :param text_analyzer_results: PII entities recognized by presidio analyzer
        :param ocr_result: dict results with words and bboxes from OCR
        :param text: text the results are based on

        :return: list of extracted entities with image bounding boxes
        """
        # Nothing to map when either OCR output or analyzer output is empty.
        if (not ocr_result) or (not text_analyzer_results):
            return []

        bboxes = []
        proc_indexes = 0  # number of analyzer results matched so far
        indexes = len(text_analyzer_results)

        # `pos` tracks the character offset of the current OCR word in `text`,
        # assuming one separator character between consecutive words.
        pos = 0
        iter_ocr = enumerate(ocr_result["text"])
        for index, word in iter_ocr:
            if not word:
                # An empty OCR token still occupies one separator character.
                pos += 1
            else:
                for element in text_analyzer_results:
                    text_element = text[element.start : element.end]
                    # check position and text of ocr word matches recognized entity
                    if (
                        max(pos, element.start) < min(element.end, pos + len(word))
                    ) and ((text_element in word) or (word in text_element)):
                        bboxes.append(
                            ImageRecognizerResult(
                                element.entity_type,
                                element.start,
                                element.end,
                                element.score,
                                ocr_result["left"][index],
                                ocr_result["top"][index],
                                ocr_result["width"][index],
                                ocr_result["height"][index],
                            )
                        )

                        # add bounding boxes for all words in ocr dict
                        # contained within the text of recognized entity
                        # based on relative position in the full text
                        while pos + len(word) < element.end:
                            prev_word = word
                            try:
                                index, word = next(iter_ocr)
                            except StopIteration:
                                # BUGFIX: the entity's end offset can extend
                                # past the last OCR word (offset drift between
                                # the joined text and the OCR tokens); stop
                                # extending instead of crashing with an
                                # uncaught StopIteration.
                                break
                            if word:
                                bboxes.append(
                                    ImageRecognizerResult(
                                        element.entity_type,
                                        element.start,
                                        element.end,
                                        element.score,
                                        ocr_result["left"][index],
                                        ocr_result["top"][index],
                                        ocr_result["width"][index],
                                        ocr_result["height"][index],
                                    )
                                )
                            pos += len(prev_word) + 1
                        proc_indexes += 1

                # Stop early once every analyzer result has been matched.
                if proc_indexes == indexes:
                    break
                pos += len(word) + 1

        return bboxes

analyze(self, image, **kwargs)

Analyse method to analyse the given image.

:param image: PIL Image/numpy array or file path(str) to be processed :param kwargs: Additional values for the analyze method in AnalyzerEngine

:return: list of the extracted entities with image bounding boxes

Source code in presidio_image_redactor/image_analyzer_engine.py
def analyze(self, image: object, **kwargs) -> List[ImageRecognizerResult]:
    """Analyse method to analyse the given image.

    :param image: PIL Image/numpy array or file path(str) to be processed
    :param kwargs: Additional values for the analyze method in AnalyzerEngine

    :return: list of the extracted entities with image bounding boxes
    """
    # Run OCR first, then flatten the OCR dict into plain text.
    ocr_result = self.ocr.perform_ocr(image)
    extracted_text = self.ocr.get_text_from_ocr_dict(ocr_result)

    # Detect PII entities in the extracted text (English only).
    recognizer_results = self.analyzer_engine.analyze(
        text=extracted_text, language="en", **kwargs
    )

    # Translate character-offset results back onto image coordinates.
    return self.map_analyzer_results_to_bounding_boxes(
        recognizer_results, ocr_result, extracted_text
    )

map_analyzer_results_to_bounding_boxes(text_analyzer_results, ocr_result, text) staticmethod

Map extracted PII entities to image bounding boxes.

Matching is based on the position of the recognized entity from analyzer and word (in ocr dict) in the text.

:param text_analyzer_results: PII entities recognized by presidio analyzer

:param ocr_result: dict results with words and bboxes from OCR

:param text: text the results are based on

:return: list of extracted entities with image bounding boxes

Source code in presidio_image_redactor/image_analyzer_engine.py
@staticmethod
def map_analyzer_results_to_bounding_boxes(
    text_analyzer_results: List[RecognizerResult], ocr_result: dict, text: str
) -> List[ImageRecognizerResult]:
    """Map extracted PII entities to image bounding boxes.

    Matching is based on the position of the recognized entity from analyzer
    and word (in ocr dict) in the text.

    :param text_analyzer_results: PII entities recognized by presidio analyzer
    :param ocr_result: dict results with words and bboxes from OCR
    :param text: text the results are based on

    :return: list of extracted entities with image bounding boxes
    """
    # Nothing to map when either OCR output or analyzer output is empty.
    if (not ocr_result) or (not text_analyzer_results):
        return []

    bboxes = []
    proc_indexes = 0  # number of analyzer results matched so far
    indexes = len(text_analyzer_results)

    # `pos` tracks the character offset of the current OCR word within `text`,
    # assuming one separator character between consecutive OCR words.
    pos = 0
    iter_ocr = enumerate(ocr_result["text"])
    for index, word in iter_ocr:
        if not word:
            # An empty OCR token still occupies one separator character.
            pos += 1
        else:
            for element in text_analyzer_results:
                text_element = text[element.start : element.end]
                # check position and text of ocr word matches recognized entity
                # (character ranges overlap AND one string contains the other)
                if (
                    max(pos, element.start) < min(element.end, pos + len(word))
                ) and ((text_element in word) or (word in text_element)):
                    bboxes.append(
                        ImageRecognizerResult(
                            element.entity_type,
                            element.start,
                            element.end,
                            element.score,
                            ocr_result["left"][index],
                            ocr_result["top"][index],
                            ocr_result["width"][index],
                            ocr_result["height"][index],
                        )
                    )

                    # add bounding boxes for all words in ocr dict
                    # contained within the text of recognized entity
                    # based on relative position in the full text
                    # NOTE(review): `next(iter_ocr)` advances the same iterator
                    # the outer for-loop consumes; if the entity end extends
                    # past the last OCR word this raises an uncaught
                    # StopIteration — confirm offsets always stay in range.
                    while pos + len(word) < element.end:
                        prev_word = word
                        index, word = next(iter_ocr)
                        if word:
                            bboxes.append(
                                ImageRecognizerResult(
                                    element.entity_type,
                                    element.start,
                                    element.end,
                                    element.score,
                                    ocr_result["left"][index],
                                    ocr_result["top"][index],
                                    ocr_result["width"][index],
                                    ocr_result["height"][index],
                                )
                            )
                        pos += len(prev_word) + 1
                    proc_indexes += 1

            # Stop early once every analyzer result has been matched.
            if proc_indexes == indexes:
                break
            pos += len(word) + 1

    return bboxes