# Source code for genalog.text.ner_label

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# ---------------------------------------------------------

import itertools
import re
import string

from genalog.text import alignment, anchor
from genalog.text import preprocess

# All three regexes below share the following behavior:
#   1. whitespace-tolerant at the start of the string; the BEGIN and INSIDE
#      regexes are additionally whitespace-tolerant at, and anchored to, the
#      end of the string
#   2. separate the token into two groups:
#       For example, given a label 'B-PLACE'
#       Group 1 (denoted by \1): Label Indicator (B-)
#       Group 2 (denoted by \2): Label Name (PLACE)
# NOTE: '|' is a literal character inside a character class, so the classes
# are spelled [a-zA-Z] and [BI]; writing [a-z|A-Z] or [B|I] would also accept
# a stray '|' as part of a label.
# NOTE: MULTI_TOKEN_LABEL_REGEX has no trailing '$', so it only needs to match
# a prefix of the string.
MULTI_TOKEN_BEGIN_LABEL_REGEX = r"^\s*(B-)([a-zA-Z]+)\s*$"
MULTI_TOKEN_INSIDE_LABEL_REGEX = r"^\s*(I-)([a-zA-Z]+)\s*$"
MULTI_TOKEN_LABEL_REGEX = r"^\s*([BI]-)([a-zA-Z]+)\s*"

# To avoid confusion in the Python interpreter, the gap char must never be
# one of these special characters: every whitespace character
# (' ', '\t', '\n', '\r', '\x0b', '\x0c') plus the single quote.
SPECIAL_CHAR = set(string.whitespace) | {"'"}
# Every other printable ASCII character is a possible gap char.
GAP_CHAR_SET = set(string.printable) - SPECIAL_CHAR
# GAP_CHAR_SET = '!"#$%&()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'


class GapCharError(Exception):
    """Error related to the gap character used for text alignment.

    Raised by ``propagate_label_to_ocr`` when every candidate gap character
    already appears in the input tokens, and by ``_propagate_label_to_ocr``
    when an invalid token contains the gap character.
    """


def _is_begin_label(label):
    """Tell whether ``label`` is a begin label (e.g. ``B-PLACE``).

    Arguments:
        label (str) : an NER label

    Returns:
        True if ``label`` matches ``MULTI_TOKEN_BEGIN_LABEL_REGEX``, False otherwise
    """
    # A match object is always truthy, so bool() maps "matched" to True.
    return bool(re.match(MULTI_TOKEN_BEGIN_LABEL_REGEX, label))


def _is_inside_label(label):
    """Tell whether ``label`` is an inside label (e.g. ``I-PLACE``).

    Arguments:
        label (str) : an NER label

    Returns:
        True if ``label`` matches ``MULTI_TOKEN_INSIDE_LABEL_REGEX``, False otherwise
    """
    matched = re.match(MULTI_TOKEN_INSIDE_LABEL_REGEX, label)
    return matched is not None


def _is_multi_token_label(label):
    """Tell whether ``label`` is a multi-token label (e.g. ``B-PLACE`` or ``I-PLACE``).

    Arguments:
        label (str) : an NER label

    Returns:
        True if ``label`` matches ``MULTI_TOKEN_LABEL_REGEX``, False otherwise
    """
    if re.match(MULTI_TOKEN_LABEL_REGEX, label) is None:
        return False
    return True


def _clean_multi_token_label(label):
    """Strip the whitespace around a multi-token label.

    The part of ``label`` matched by ``MULTI_TOKEN_LABEL_REGEX`` is replaced
    by its Label Indicator (group 1) immediately followed by its Label Name
    (group 2). A label that does not match is returned unchanged.

    Arguments:
        label (str) : an NER label

    Returns:
        the cleaned NER label
    """
    indicator_then_name = r"\1\2"
    return re.sub(MULTI_TOKEN_LABEL_REGEX, indicator_then_name, label)


def _convert_to_begin_label(label):
    """Turn an inside label (I-label, ex. I-PLACE) into a begin label (B-label, ex. B-PLACE)

    Arguments:
        label (str) : an NER label

    Returns:
        an NER label. Anything that is not an inside label is returned untouched.
    """
    # Guard: only inside labels are rewritten
    if not _is_inside_label(label):
        return label
    # Swap the Label Indicator (\1) for 'B-' and keep the Label Name (\2);
    # whitespace surrounding the label is part of the match and is dropped.
    return re.sub(MULTI_TOKEN_INSIDE_LABEL_REGEX, r"B-\2", label)


def _convert_to_inside_label(label):
    """Turn a begin label (B-label, ex. B-PLACE) into an inside label (I-label, ex. I-PLACE)

    Arguments:
        label (str) : an NER label

    Returns:
        an NER label. Anything that is not a begin label is returned untouched.
    """
    # Guard: only begin labels are rewritten
    if not _is_begin_label(label):
        return label
    # Swap the Label Indicator (\1) for 'I-' and keep the Label Name (\2);
    # whitespace surrounding the label is part of the match and is dropped.
    return re.sub(MULTI_TOKEN_BEGIN_LABEL_REGEX, r"I-\2", label)


def _is_missing_begin_label(begin_label, inside_label):
    """Check whether an inside label lacks its matching begin label

    Arguments:
        begin_label (str) : the begin NER label currently in effect
            (an empty string when there is none)
        inside_label (str) : the inside label to validate

    Returns:
        True if ``inside_label`` is an inside label that is NOT paired with
        ``begin_label`` (no begin label at all, or a different label name).
        False if the two labels are paired, or if ``inside_label`` is not an
        inside label.
    """
    if not _is_inside_label(inside_label):
        return False

    # No begin label in effect: the inside label is orphaned
    if not begin_label:
        return True

    # Compare the whitespace-free labels: the inside label, rewritten as a
    # begin label, must be the same string as the current begin label
    # (e.g. B-LOC followed by I-ORG means I-ORG is missing its begin label)
    expected_begin = _convert_to_begin_label(_clean_multi_token_label(inside_label))
    actual_begin = _clean_multi_token_label(begin_label)
    return expected_begin != actual_begin


def correct_ner_labels(labels):
    """Correct the given list of labels for the following case:

    1. Missing B-Label (i.e. I-PLACE I-PLACE -> B-PLACE I-PLACE)

    An inside label counts as missing its B-Label when it is not preceded by
    a run of multi-token labels that started with a begin label of the same
    name. Any label that is not a multi-token label ends the current run.

    NOTE: the correction is done IN PLACE; the list passed in is modified
    and the very same list object is returned.

    Arguments:
        labels (list) : list of NER labels

    Returns:
        a list of NER labels (the same object as ``labels``)
    """
    cur_begin_label = ""  # begin label of the current run, "" when there is none
    for i, label in enumerate(labels):
        if not _is_multi_token_label(label):
            # Not a B-/I- label: the current multi-token run is over
            cur_begin_label = ""
        elif _is_begin_label(label):
            cur_begin_label = label
        elif _is_missing_begin_label(cur_begin_label, label):
            # Orphaned inside label: promote it to a begin label ...
            labels[i] = _convert_to_begin_label(label)
            # ... which now opens a new run
            cur_begin_label = labels[i]
    return labels


def _select_from_multiple_ner_labels(label_indices):
    r"""Private method to pick one NER label out of several candidates

    Note: this method handles the situation where multiple gt tokens
    are aligned to ONE ocr_token

    For example:

        gt_labels:  B-p   I-p    O   O
                     |     |     |   |
            gt:     New   York   is big
                     |      \   /   |
            ocr:    New     Yorkis  big
                     |        |      |
       ocr_labels:  B-p      I-p     O

    The token "Yorkis" could be labeled as "I-place", "o" or both.
    Currently the FIRST candidate takes precedence.

    Arguments:
        label_indices (list) : a list of token indices

    Returns:
        the selected index (the first element of ``label_indices``)
    """
    # TODO: may need a more sophisticated way to select from multiple NER labels
    first_candidate = label_indices[0]
    return first_candidate


def _find_gap_char_candidates(gt_tokens, ocr_tokens):
    """Collect the GAP_CHAR candidates that do not occur in the input tokens

    Arguments:
        gt_tokens (list) : a list of tokens
        ocr_tokens (list) : a list of tokens

    Returns:
        (set, set) -- a 2-element tuple of
            1. the set of suitable GAP_CHARs
               (members of GAP_CHAR_SET absent from every token)
            2. the set of input characters
    """
    # Every distinct character used by either token list
    all_text = "".join(itertools.chain(gt_tokens, ocr_tokens))
    used_chars = set(all_text)
    # Candidates are the possible GAP_CHARs that the input never uses
    unused_gap_chars = GAP_CHAR_SET - used_chars
    return unused_gap_chars, used_chars


def propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens, use_anchor=True):
    """Propagate NER label for ground truth tokens to ocr tokens.

    A gap character that does not occur in any input token is selected
    automatically, then the work is delegated to ``_propagate_label_to_ocr``.

        NOTE that `gt_tokens` and `ocr_tokens` MUST NOT contain invalid tokens.
            Invalid tokens are:
                1. non-atomic tokens, or space-separated string ("New York")
                2. empty string ("")
                3. string with spaces ("  ")

    Arguments:
        gt_labels (list) : a list of NER label for ground truth token
        gt_tokens (list) : a list of ground truth string tokens
        ocr_tokens (list) : a list of OCR'ed text tokens
        use_anchor (bool, optional) : use faster alignment method with anchors if set to True. Defaults to True.

    Raises:
        GapCharError:
            when every possible gap character (GAP_CHAR_SET) already
            appears in the input tokens, so none is left for the alignment.
            Errors raised by ``_propagate_label_to_ocr`` are not caught here.

    Returns:
        tuple : a tuple of 4 elements ``(ocr_labels, aligned_gt, aligned_ocr, gap_char)`` where
        1. ``ocr_labels`` is a list of NER label for the corresponding ocr tokens
        2. ``aligned_gt`` is the ground truth string aligned with the ocr text
        3. ``aligned_ocr`` is the ocr text aligned with ground truth
        4. ``gap_char`` is the char used by the alignment for inserting gaps
    """
    # Find the set of suitable GAP_CHARs, i.e. those not in the set of input characters
    gap_char_candidates, input_char_set = _find_gap_char_candidates(
        gt_tokens, ocr_tokens
    )
    if not gap_char_candidates:
        raise GapCharError(
            "Exhausted all possible GAP_CHAR candidates for alignment."
            + " Consider reducing cardinality of the input character set.\n"
            + f"The set of possible GAP_CHAR candidates is: '{''.join(sorted(GAP_CHAR_SET))}'\n"
            + f"The set of input character is: '{''.join(sorted(input_char_set))}'"
        )

    if alignment.GAP_CHAR in gap_char_candidates:
        gap_char = alignment.GAP_CHAR  # prefer to use default GAP_CHAR
    else:
        # Pick the smallest candidate rather than set.pop(): pop() returns an
        # arbitrary element, so the chosen gap char could differ between runs.
        gap_char = min(gap_char_candidates)
    return _propagate_label_to_ocr(
        gt_labels, gt_tokens, ocr_tokens, gap_char=gap_char, use_anchor=use_anchor
    )


def _propagate_label_to_ocr(
    gt_labels, gt_tokens, ocr_tokens, gap_char=alignment.GAP_CHAR, use_anchor=True
):
    r"""Propagate NER label for ground truth tokens to ocr tokens. Low level implementation

    The tokens are joined into two strings, the strings are aligned (with
    ``anchor.align_w_anchor`` or ``alignment.align``), and the token mappings
    parsed from that alignment decide which gt label each ocr token receives.

        NOTE: that `gt_tokens` and `ocr_tokens` MUST NOT contain invalid tokens.
        Invalid tokens are:
        1. non-atomic tokens, or space-separated string ("New York")
        2. multiple occurrences of the GAP_CHAR ('@@@')
        3. empty string ("")
        4. string with spaces ("  ")

    ::

        Case Analysis:
        ******************************** MULTI-TOKEN-LABELS ********************************

                    Case 1:         Case 2:         Case 3:         Case 4:         Case 5:
                    one-to-many     many-to-one     many-to-many    missing tokens  missing tokens
                                                   (Case 1&2 comb)  (I-label)       (B-label)
        gt label     B-p    I-p      B-p I-p        B-p   I-p       B-p  I-p        B-p  I-p  I-p
                      |      |        |   |          |     |         |    |          |   |     |
        gt_token     New    York     New York       New  York       New York        New York City
                     / \    / \        \ /           /\   /          |                   |     |
       ocr_token    N   ew Yo  rk    NewYork        N ew@York       New                 York City
                    |   |   |   |       |           |    |           |                   |     |
       ocr label   B-p I-p I-p I-p     B-p          B-p I-p         B-p                 B-p   I-p

        ******************************** SINGLE-TOKEN-LABELS ********************************

                    Case 1:         Case 2:         Case 3:         Case 4:
                    one-to-many     many-to-one     many-to-many    missing tokens
                                                   (Case 1&2 comb)
        gt label         O           V    O          O   V   W       O   O
                         |           |    |          |   |   |       |   |
        gt_token     something       is  big       this is huge      is big
                     / \    \          \ /          /\  /\ /         |
       ocr_token    so  me  thing     isbig       th isi shuge       is
                    |   |     |         |          |  |    |         |
       ocr label    o   o     o         V          O  O    V         O

    Arguments:
        gt_labels (list) : a list of NER label for ground truth token
        gt_tokens (list) : a list of ground truth string tokens
        ocr_tokens (list) : a list of OCR'ed text tokens
        gap_char (char, optional) : gap char used in alignment algorithm. Defaults to ``alignment.GAP_CHAR``.
        use_anchor (bool, optional) : use faster alignment method with anchors if set to True.
                            Defaults to True.
    Raises:
        ValueError: when
        1. there is unequal number of gt_tokens and gt_labels
        2. there is a non-atomic token in gt_tokens or ocr_tokens
        3. there is an empty string in gt_tokens or ocr_tokens
        4. there is a token full of space characters only in gt_tokens or ocr_tokens
        5. gt_to_ocr_mapping does not have the same number of tokens as gt_tokens
        GapCharError: when
        1. there is an invalid token containing the GAP_CHAR
           (e.g. a token consisting of GAP_CHAR only)


    Returns:
        a tuple of 4 elements: (ocr_labels, aligned_gt, aligned_ocr, gap_char)
        where
        `ocr_labels` is a list of NER label for the corresponding ocr tokens
        `aligned_gt` is the ground truth string aligned with the ocr text
        `aligned_ocr` is the ocr text aligned with ground truth
        `gap_char` is the char used by the alignment for inserting gaps

    For example, given input::

        _propagate_label_to_ocr(
            ["B-place", "I-place", "o", "o"],
            ["New", "York", "is", "big"],
            ["N", "ewYork", "big"]
        )

    the output is::

        (["B-place", "I-place", "o"], "N@ew York is big", "N ew@York@@@ big", '@')

    """
    # Pseudo-algorithm:

    #                                             ocr_to_gt_mapping = [
    # gt_labels:   B-P I-P  I-P  O  O  B-P I-P          [1, 2], ('YorkCity' maps to 'York' and 'City')
    #               |   |    |   |  |  |   |            [3],    ('i' maps to 'is')
    # gt_txt:     "New York City is in New York"        [3, 4], ('sin' maps to 'is' and 'in')
    #                     \/     /\  |  /\              [5],    ('N' maps to 'New')
    # ocr_txt:        "YorkCity  i sin N ew"            [5]     ('ew' maps to 'New')
    #                     |      |  |  |  |            ]
    #                    I-P     O  O B-P B-P

    # STEP 1: naively propagate NER label based on text-alignment
    #   ** If an ocr token is made of two or more gt tokens, the ocr token
    #      takes the label from the FIRST gt token.
    #            Please see '_select_from_multiple_ner_labels()' from above
    #   ** If a gt token is split into two or more ocr tokens, ALL ocr tokens
    #      share the same gt label
    #

    #                                             gt_to_ocr_mapping = [
    # gt_labels:   B-P I-P  I-P  O  O  B-P I-P          [],     ('New' does not map to any ocr token)
    #               |   |    |   |  |  |   |            [0],    ('York' maps to 'YorkCity')
    # gt_txt:     "New York City is in New York"        [0],    ('City' maps to 'YorkCity')
    #                     \/     /\  |  /\              [1, 2], ('is' maps to 'i' and 'sin')
    # ocr_txt:        "YorkCity  i sin N ew"            [2],    ('in' maps to 'sin')
    #                     |      |  |  |  |             [3,4],  ('New' maps to 'N' and 'ew')
    #                    I-P     O  O B-P B-P           []      ('York' does not map to any ocr token)
    #                                                 ]

    # STEP 2, clean up corner cases from multi-token-labels
    #   ** At this point, Step 1 should've taken care of all single-token-label cases
    #   ** We need to correct the following corner cases with multi-token-labels
    #       1. Trailing B-labels (MULTI-TOKEN-LABELS Case 1)
    #           Ex: B-PLACE B-PLACE
    #                  N        ew
    #       2. Missing B-label (MULTI-TOKEN-LABELS Case 5)
    #           Ex: I-PLACE
    #               YorkCity

    # We can address MULTI-TOKEN-LABELS Case 1 with following pseudo-algorithm:
    # 1. For each gt_token in gt_to_ocr_mapping:
    #    1. If the gt_token is mapped to 2 or more ocr_tokens AND the gt_token has a B-label
    #       1. For all the ocr_tokens this gt_token mapped to
    #          1. Keep the B-label for the 1st ocr_token
    #          2. For the rest of the ocr_token, convert the B-label to an I-label

    # We can address the MULTI-TOKEN-LABELS Case 5 with the 'correct_ner_labels()' method

    # Sanity check:
    if len(gt_tokens) != len(gt_labels):
        raise ValueError(
            f"Unequal number of gt_tokens ({len(gt_tokens)})"
            + f" to that of gt_labels ({len(gt_labels)})"
        )

    for tk in gt_tokens + ocr_tokens:
        if len(preprocess.tokenize(tk)) > 1:
            raise ValueError(f"Invalid token '{tk}'. Tokens must be atomic.")
        if not alignment._is_valid_token(tk, gap_char=gap_char):
            # Tell apart the two kinds of invalid token: one containing the
            # GAP_CHAR versus an empty / whitespace-only one
            if re.search(rf"{re.escape(gap_char)}+", tk):  # Escape special regex chars
                raise GapCharError(
                    f"Invalid token '{tk}'. Tokens cannot be a chain repetition of the GAP_CHAR '{gap_char}'"
                )
            else:
                raise ValueError(
                    f"Invalid token '{tk}'. Tokens cannot be an empty string or a mix of space characters (spaces, tabs, newlines)"
                )

    # Stitch tokens together into one string for alignment
    gt_txt = preprocess.join_tokens(gt_tokens)
    ocr_txt = preprocess.join_tokens(ocr_tokens)
    # Align the ground truth and ocr text first
    if use_anchor:
        aligned_gt, aligned_ocr = anchor.align_w_anchor(
            gt_txt, ocr_txt, gap_char=gap_char
        )
    else:
        aligned_gt, aligned_ocr = alignment.align(gt_txt, ocr_txt, gap_char=gap_char)
    gt_to_ocr_mapping, ocr_to_gt_mapping = alignment.parse_alignment(
        aligned_gt, aligned_ocr, gap_char=gap_char
    )
    # Check invariant: the alignment must yield one mapping entry per gt token
    if len(gt_to_ocr_mapping) != len(gt_tokens):
        raise ValueError(
            "Alignment modified number of gt_tokens. aligned_gt_tokens to gt_tokens: "
            + f"{len(gt_to_ocr_mapping)}:{len(gt_tokens)}. \nCheck alignment.parse_alignment()."
        )

    ocr_labels = []
    # STEP 1: naively propagate NER label based on text-alignment
    for ocr_to_gt_token_relationship in ocr_to_gt_mapping:
        # if is not mapping to a missing token (Case 4)
        # NOTE(review): an ocr token with an empty mapping gets NO label here,
        # which would leave ocr_labels shorter than ocr_tokens and shift the
        # indices used in STEP 2a -- confirm whether parse_alignment() can
        # ever return an empty entry in ocr_to_gt_mapping.
        if ocr_to_gt_token_relationship:
            # Find the corresponding gt_token it is aligned to
            ner_label_index = _select_from_multiple_ner_labels(
                ocr_to_gt_token_relationship
            )
            # Get the NER label for that particular gt_token
            ocr_labels.append(gt_labels[ner_label_index])

    # STEP 2a: resolve MULTI-TOKEN-LABELS Case 1 (Trailing B-label)
    for gt_token_index, gt_to_ocr_token_relationship in enumerate(gt_to_ocr_mapping):
        if len(gt_to_ocr_token_relationship) > 1 and _is_begin_label(
            gt_labels[gt_token_index]
        ):
            # The 1st ocr token keeps its label; every later ocr token mapped
            # from this gt token has a B-label turned into an I-label
            for ocr_token_index in gt_to_ocr_token_relationship[1:]:
                ocr_labels[ocr_token_index] = _convert_to_inside_label(
                    ocr_labels[ocr_token_index]
                )

    # STEP 2b: resolve MULTI-TOKEN-LABELS Case 5 (Missing B-label)
    ocr_labels = correct_ner_labels(ocr_labels)

    return ocr_labels, aligned_gt, aligned_ocr, gap_char


def format_labels(tokens, labels, label_top=True):
    """Format tokens and their NER label for display

    Each token/label pair is padded with spaces to the width of the longer
    of the two and followed by one separating space, so that every label
    starts in the same column as the token it is labeling.

    Tokens and labels are paired positionally; if the two lists differ in
    length, the extra items of the longer list are ignored.

    Arguments:
        tokens (list) : a list of word tokens
        labels (list) : a list of NER labels
        label_top (bool, optional) : True if label is placed on top of the token.
                                     Defaults to True.

    Returns:
        a str with NER label aligned to the token it is labeling. It is made
        of two lines, each one terminated by a newline character.

    ::

        Given inputs:
            tokens: ["New", "York", "is", "big"]
            labels: ["B-place", "I-place", "o", "o"]
            label_top: True

        Outputs (each line also carries a trailing space):
            "B-place I-place o  o   "
            "New     York    is big "


    """
    label_cells = []
    token_cells = []
    for label, token in zip(labels, tokens):
        # Pad whichever is shorter so both cells are equally wide
        width = max(len(label), len(token))
        label_cells.append(label.ljust(width) + " ")
        token_cells.append(token.ljust(width) + " ")
    # Join once at the end instead of growing the strings inside the loop
    formatted_labels = "".join(label_cells)
    formatted_tokens = "".join(token_cells)
    if label_top:
        return formatted_labels + "\n" + formatted_tokens + "\n"
    return formatted_tokens + "\n" + formatted_labels + "\n"


def format_label_propagation(
    gt_tokens,
    gt_labels,
    ocr_tokens,
    ocr_labels,
    aligned_gt,
    aligned_ocr,
    show_alignment=True,
):
    """Format label propagation for display

    The ground truth tokens are printed with their labels on top, the OCR'ed
    tokens with their labels underneath, and (optionally) the alignment
    produced by ``alignment._format_alignment`` is placed in between.

    Arguments:
        gt_tokens (list) : list of ground truth tokens
        gt_labels (list) : list of NER labels for ground truth tokens
        ocr_tokens (list) : list of OCR'ed text tokens
        ocr_labels (list) : list of NER labels for the OCR'ed tokens
        aligned_gt (str) : ground truth string aligned with the OCR'ed text
        aligned_ocr (str) : OCR'ed text aligned with ground truth
        show_alignment (bool, optional) : if true, show alignment result. Defaults to True.

    Returns:
        str: a string formatted for display as follows:

    .. code-block:: python

        if show_alignment:

            "B-PLACE I-PLACE V  O"      # [gt_labels]
            "New     York    is big"    # [gt_txt]
            "New York is big"           # [aligned_gt]
            "||||....|||||||"
            "New @@@@ is big"           # [aligned_ocr]
            "New     is big "           # [ocr_txt]
            "B-PLACE V  O   "           # [ocr_labels]

        else:

            "B-PLACE I-PLACE V  O"     # [gt_labels]
            "New     York    is big"   # [gt_txt]
            "New     is big"           # [ocr_txt]
            "B-PLACE V  O"             # [ocr_labels]

    """
    # Ground truth: labels on top of the tokens
    gt_label_str = format_labels(gt_tokens, gt_labels)
    # OCR: tokens on top of the labels (mirrors the ground truth section)
    label_str = format_labels(ocr_tokens, ocr_labels, label_top=False)
    if not show_alignment:
        return gt_label_str + label_str
    alignment_str = alignment._format_alignment(aligned_gt, aligned_ocr)
    return gt_label_str + alignment_str + label_str