Validation

filter_overlaps(example)

Show source code in recon/validation.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
@operation("recon.v1.filter_overlaps")
def filter_overlaps(example: Example) -> Example:
    """Resolve overlapping entity spans, keeping the longest span of each overlap.

    The spans are ordered by start offset, handed to
    `remove_overlapping_entities`, and the result is written back onto the
    same example object.

    Args:
        example (Example): Input Example; its `spans` attribute is replaced

    Returns:
        Example: The same Example with overlapping spans removed
    """
    spans_by_start: List[Span] = sorted(example.spans, key=lambda span: span.start)
    example.spans = remove_overlapping_entities(spans_by_start)
    return example

Filter overlapping entity spans by picking the longest one.

Parameters

Name Type Description Default
example Example Input Example required

Returns

Type Description
Example Example: Example with fixed overlaps

remove_overlapping_entities(sorted_spans)

Show source code in recon/validation.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def remove_overlapping_entities(sorted_spans: List[Span]) -> List[Span]:
    """
    Removes overlapping entities from the entity set, by greedily taking the longest
    entity from each overlapping chain. The input list of entities should be sorted
    and follow the spacy format.

    A chain is a run of consecutive spans in which each span overlaps the
    range covered by the spans collected before it. Every finished chain is
    reduced with `select_subset_of_overlapping_chain`.

    Args:
        sorted_spans (List[Span]): Spans to filter; each must expose `start`
            and `end`.

    Returns:
        List[Span]: The spans kept from every chain, sorted by `start`.
    """
    spans_without_overlap: List[Span] = []
    current_overlapping_chain: List[Span] = []
    current_overlapping_chain_start = 0
    current_overlapping_chain_end = 0
    for current_entity in sorted_spans:
        current_entity_start = current_entity.start
        current_entity_end = current_entity.end

        if current_overlapping_chain:
            min_end = min(current_entity_end, current_overlapping_chain_end)
            max_start = max(current_entity_start, current_overlapping_chain_start)
            if min_end - max_start > 0:
                # The entity shares at least one position with the chain's
                # covered range: grow the chain and widen that range.
                current_overlapping_chain.append(current_entity)
                current_overlapping_chain_start = min(
                    current_entity_start, current_overlapping_chain_start
                )
                current_overlapping_chain_end = max(
                    current_entity_end, current_overlapping_chain_end
                )
                continue

            # No overlap: the chain is complete, so resolve it before the
            # current entity starts a new one.
            spans_without_overlap.extend(
                select_subset_of_overlapping_chain(current_overlapping_chain)
            )

        # Reached for the very first entity and after every finished chain.
        current_overlapping_chain = [current_entity]
        current_overlapping_chain_start = current_entity_start
        current_overlapping_chain_end = current_entity_end

    # Resolve whatever chain is still open once the input is exhausted.
    spans_without_overlap.extend(select_subset_of_overlapping_chain(current_overlapping_chain))

    return sorted(spans_without_overlap, key=lambda x: x.start)

Removes overlapping entities from the entity set, by greedily taking the longest entity from each overlapping chain. The input list of entities should be sorted and follow the spacy format.

select_subset_of_overlapping_chain(chain)

Show source code in recon/validation.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def select_subset_of_overlapping_chain(chain: List[Span]) -> List[Span]:
    """
    Select the subset of entities in an overlapping chain to return by greedily choosing the
    longest entity in the chain until there are no entities remaining.

    Args:
        chain (List[Span]): Entities to choose from; each must expose `start`
            and `end`. The list itself is not modified.

    Returns:
        List[Span]: The chosen entities in selection order, longest first.
            Entities of equal length are considered in their input order.
    """
    # sorted() is stable even with reverse=True, so ties keep their input order.
    longest_first = sorted(chain, key=lambda s: s.end - s.start, reverse=True)
    selections_from_chain: List[Span] = []
    for entity in longest_first:
        # Two spans overlap when the later start lies strictly before the
        # earlier end; spans that merely touch do not count as overlapping.
        overlaps_selection = any(
            min(entity.end, chosen.end) > max(entity.start, chosen.start)
            for chosen in selections_from_chain
        )
        if not overlaps_selection:
            selections_from_chain.append(entity)

    return selections_from_chain

Select the subset of entities in an overlapping chain to return by greedily choosing the longest entity in the chain until there are no entities remaining

upcase_labels(example)

Show source code in recon/validation.py
11
12
13
14
15
16
17
18
19
20
21
22
23
@operation("recon.v1.upcase_labels")
def upcase_labels(example: Example) -> Example:
    """Normalize span labels by upper-casing each one in place.

    Args:
        example (Example): Input Example whose spans are relabeled

    Returns:
        Example: The same Example, with every span label upper-cased
    """
    for span in example.spans:
        upper_label = span.label.upper()
        span.label = upper_label
    return example

Convert all span labels to uppercase to normalize

Parameters

Name Type Description Default
example Example Input Example required

Returns

Type Description
Example Example: Example with fixed labels