Stats

calculate_entity_coverage_entropy(entity_coverage)

Show source code in recon/stats.py
def calculate_entity_coverage_entropy(entity_coverage: List[EntityCoverage],) -> float:
    """Use Entropy to calculate a metric for entity coverage.

    Args:
        entity_coverage (List[EntityCoverage]): List of EntityCoverage 
            from get_entity_coverage

    Returns:
        float: Entropy for entity coverage counts
    """
    counts = [ecs.count for ecs in entity_coverage]
    return entropy(counts, sum(counts))  # type: ignore

Use Entropy to calculate a metric for entity coverage.

Parameters

entity_coverage (List[recon.types.EntityCoverage]): List of EntityCoverage from get_entity_coverage. Required.

Returns

float: Entropy for entity coverage counts.
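
A minimal usage sketch, assuming corpus.train is a List[Example] you have already loaded (the corpus variable itself is illustrative, not part of this API):

from recon.stats import get_entity_coverage, calculate_entity_coverage_entropy

coverage = get_entity_coverage(corpus.train)  # corpus.train assumed to be List[Example]
score = calculate_entity_coverage_entropy(coverage)
# higher entropy means annotation counts are spread more evenly across text/label pairs
print(score)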

calculate_entity_coverage_similarity(x, y)

Show source code in recon/stats.py
def calculate_entity_coverage_similarity(x: List[Example], y: List[Example]) -> EntityCoverageStats:
    """Calculate how well dataset x covers the entities in dataset y.
    This function should be used to calculate how well your train set
    annotations cover the annotations in your dev/test set.

    Args:
        x (List[Example]): Dataset to compare coverage to (usually corpus.train)
        y (List[Example]): Dataset to evaluate coverage for (usually corpus.dev or corpus.test)

    Returns:
        EntityCoverageStats: Stats with 
            1. The base entity coverage (does entity in y exist in x)
            2. Count coverage (sum of the EntityCoverage.count property for 
            each EntityCoverage in y to get a more holistic coverage scaled by how
            often entities occur in each dataset x and y)
    """

    def pipeline(data: List[Example]) -> Dict[int, int]:
        ecs = get_entity_coverage(data)
        return {hash(ec): ec.count for ec in ecs}

    x_map = pipeline(x)
    y_map = pipeline(y)

    n_intersection = 0
    count_intersection = 0
    n_union = 0
    count_union = 0

    for k, count in y_map.items():
        if k in x_map:
            n_intersection += 1
            count_intersection += count
        n_union += 1
        count_union += count

    return EntityCoverageStats(
        entity=(n_intersection / n_union) * 100, count=(count_intersection / count_union) * 100,
    )

Calculate how well dataset x covers the entities in dataset y. This function should be used to calculate how well your train set annotations cover the annotations in your dev/test set.

Parameters

x (List[recon.types.Example]): Dataset to compare coverage to (usually corpus.train). Required.
y (List[recon.types.Example]): Dataset to evaluate coverage for (usually corpus.dev or corpus.test). Required.

Returns

EntityCoverageStats: Stats with 1. the base entity coverage (does an entity in y exist in x) and 2. count coverage (sum of the EntityCoverage.count property for each EntityCoverage in y to get a more holistic coverage scaled by how often entities occur in each dataset x and y).
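
A short usage sketch, assuming corpus.train and corpus.dev are List[Example] splits you have already loaded:

from recon.stats import calculate_entity_coverage_similarity

stats = calculate_entity_coverage_similarity(corpus.train, corpus.dev)
print(stats.entity)  # percentage of distinct dev entities that also appear in train
print(stats.count)   # the same coverage weighted by how often each entity occurs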

calculate_label_balance_entropy(ner_stats)

Show source code in recon/stats.py
def calculate_label_balance_entropy(ner_stats: NERStats) -> float:
    """Use Entropy to calculate a metric for label balance based on an NERStats object 

    Args:
        ner_stats (NERStats): NERStats for a dataset.

    Returns:
        float: Entropy for annotation counts of each label
    """
    total = ner_stats.n_annotations
    classes = [count for label, count in ner_stats.n_annotations_per_type.items()]
    return entropy(classes, total)

Use Entropy to calculate a metric for label balance based on an NERStats object

Parameters

ner_stats (NERStats): NERStats for a dataset. Required.

Returns

float: Entropy for annotation counts of each label.
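
A minimal sketch, assuming corpus.train is a List[Example] (the cast mirrors how get_ner_stats is used elsewhere in this module):

from typing import cast
from recon.stats import get_ner_stats, calculate_label_balance_entropy
from recon.types import NERStats

ner_stats = cast(NERStats, get_ner_stats(corpus.train))
balance = calculate_label_balance_entropy(ner_stats)
# higher entropy means annotations are distributed more evenly across labels
print(balance)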

calculate_label_distribution_similarity(x, y)

Show source code in recon/stats.py
def calculate_label_distribution_similarity(x: List[Example], y: List[Example]) -> float:
    """Calculate the similarity of the label distribution for 2 datasets.

    e.g. This can help you understand how well your train set models your dev and test sets.
    Empirically you want a similarity over **80** when comparing your train set to each of your
    dev and test sets.

        calculate_label_distribution_similarity(corpus.train, corpus.dev)
        # 98.57

        calculate_label_distribution_similarity(corpus.train, corpus.test)
        # 73.29 - This is bad, let's investigate our test set more

    Args:
        x (List[Example]): Dataset
        y (List[Example]): Dataset to compare x to

    Returns:
        float: Similarity of label distributions
    """

    def pipeline(data: List[Example]) -> Sequence[float]:
        stats = cast(NERStats, get_ner_stats(data))
        sorted_type_counts = get_sorted_type_counts(stats)
        counts_to_probs = get_probs_from_counts(sorted_type_counts)
        return counts_to_probs

    distance = jensenshannon(pipeline(x), pipeline(y))

    return (1 - distance) * 100

Calculate the similarity of the label distribution for 2 datasets.

e.g. This can help you understand how well your train set models your dev and test sets. Empirically you want a similarity over 80 when comparing your train set to each of your dev and test sets.

calculate_label_distribution_similarity(corpus.train, corpus.dev)
# 98.57

calculate_label_distribution_similarity(corpus.train, corpus.test)
# 73.29 - This is bad, let's investigate our test set more

Parameters

x (List[recon.types.Example]): Dataset. Required.
y (List[recon.types.Example]): Dataset to compare x to. Required.

Returns

float: Similarity of label distributions.

detect_outliers(seq, use_log=False)

Show source code in recon/stats.py
def detect_outliers(seq: Sequence[Any], use_log: bool = False) -> Outliers:
    """Detect outliers in a numerical sequence.

    Args:
        seq (Sequence[Any]): Sequence of ints or floats
        use_log (bool, optional): Use logarithm of seq.

    Returns:
        Outliers: Indices of low and high outliers
    """
    q1 = np.quantile(seq, 0.25)
    q3 = np.quantile(seq, 0.75)
    iqr = q3 - q1
    fence_low = math.floor(q1 - 1.5 * iqr)
    fence_high = math.floor(q3 + 1.5 * iqr)
    low_indices = [i for i, n in enumerate(seq) if n <= fence_low]
    high_indices = [i for i, n in enumerate(seq) if n > fence_high]
    return Outliers(low=low_indices, high=high_indices)

Detect outliers in a numerical sequence.

Parameters

seq (Sequence[Any]): Sequence of ints or floats. Required.
use_log (bool): Use logarithm of seq. Defaults to False.

Returns

Outliers: Outliers object with lists of low and high outlier indices.
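
A quick sketch with made-up counts (e.g. tokens per example); the values are purely illustrative:

from recon.stats import detect_outliers

token_counts = [12, 15, 14, 13, 16, 11, 120, 14, 13, 1]
outliers = detect_outliers(token_counts)
print(outliers.low)   # indices of values at or below the low IQR fence
print(outliers.high)  # indices of values above the high IQR fence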

entropy(seq, total=None)

Show source code in recon/stats.py
def entropy(seq: Union[List[int], List[float]], total: int = None) -> float:
    """Calculate Shannon Entropy for a sequence of Floats or Integers.
    If Floats, check they are probabilities
    If Integers, divide each n in seq by total and calculate entropy

    Args:
        seq (Union[List[int], List[float]]): Sequence to calculate entropy for
        total (int, optional): Total to divide by for List of int

    Raises:
        ValueError: If seq is not valid

    Returns:
        float: Entropy for sequence
    """
    if not seq:
        raise ValueError("Pass a valid non-empty sequence")

    if isinstance(seq[0], float):
        e = scipy_entropy(seq)
    elif isinstance(seq[0], int):
        e = scipy_entropy(get_probs_from_counts(seq))
    else:
        raise ValueError("Parameter seq must be a sequence of probabilities or integers.")
    return e

Calculate Shannon Entropy for a sequence of Floats or Integers. If Floats, check they are probabilities. If Integers, divide each n in seq by total and calculate entropy.

Parameters

seq (Union[List[int], List[float]]): Sequence to calculate entropy for. Required.
total (int): Total to divide by for a List of int. Defaults to None.

Exceptions

ValueError: If seq is not valid.

Returns

float: Entropy for the sequence.
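
A minimal sketch showing both accepted input forms (the numbers are illustrative):

from recon.stats import entropy

print(entropy([40, 35, 25]))       # integer counts are converted to probabilities internally
print(entropy([0.4, 0.35, 0.25]))  # a list of probabilities works as well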

get_entity_coverage(data, sep='||', use_lower=True, return_examples=False)

Show source code in recon/stats.py
def get_entity_coverage(
    data: List[Example], sep: str = "||", use_lower: bool = True, return_examples: bool = False,
) -> List[EntityCoverage]:
    """Identify how well you dataset covers an entity type. Get insights
    on the how many times certain text/label span combinations exist across your
    data so that you can focus your annotation efforts better rather than
    annotating examples your Model already understands well.

    Args:
        data (List[Example]): List of examples
        sep (str, optional): Separator used in coverage map, only change if || exists in your text
            or label.
        use_lower (bool, optional): Use the lowercase form of the span text in ents_to_label.
        return_examples (bool, optional): Return Examples that contain the entity label annotation.

    Returns:
        List[EntityCoverage]: Sorted List of EntityCoverage objects containing the text, label, count, and
            an optional list of examples where that text/label annotation exists.
    """
    coverage_map: DefaultDict[str, int] = defaultdict(int)
    examples_map: DefaultDict[str, List[Example]] = defaultdict(list)

    for example in data:
        for span in example.spans:
            text = span.text
            if use_lower:
                text = text.lower()
            key = f"{text}{sep}{span.label}"
            coverage_map[key] += 1
            examples_map[key].append(example)

    coverage = []
    for key, count in coverage_map.items():
        text, label = key.split(sep)
        record = EntityCoverage(text=text, label=label, count=count)
        if return_examples:
            record.examples = examples_map[key]
        coverage.append(record)

    sorted_coverage = sorted(coverage, key=lambda x: x.count, reverse=True)
    return sorted_coverage

Identify how well your dataset covers an entity type. Get insights on how many times certain text/label span combinations exist across your data so that you can focus your annotation efforts better rather than annotating examples your model already understands well.

Parameters

data (List[recon.types.Example]): List of examples. Required.
sep (str): Separator used in coverage map, only change if || exists in your text or label. Defaults to '||'.
use_lower (bool): Use the lowercase form of the span text in ents_to_label. Defaults to True.
return_examples (bool): Return Examples that contain the entity label annotation. Defaults to False.

Returns

List[recon.types.EntityCoverage]: Sorted List of EntityCoverage objects containing the text, label, count, and an optional list of examples where that text/label annotation exists.
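
A usage sketch, assuming corpus.train is a List[Example]:

from recon.stats import get_entity_coverage

coverage = get_entity_coverage(corpus.train, return_examples=True)
for ec in coverage[:10]:
    # most frequent text/label pairs first; ec.examples holds the examples they appear in
    print(ec.text, ec.label, ec.count, len(ec.examples))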

get_ner_stats(data, serialize=False, return_examples=False)

Show source code in recon/stats.py
def get_ner_stats(
    data: List[Example], serialize: bool = False, return_examples: bool = False
) -> Union[NERStats, str, None]:
    """Compute statistics for NER data

    Args:
        data (List[Example]): Data as a List of examples
        serialize (bool, optional): Serialize to a JSON string for printing.
        return_examples (bool, optional): Whether to return examples per type

    Returns:
        Union[NERStats, str, None]:
            NERStats for the dataset, or a JSON string if serialize is True
    """
    annotations_per_type: DefaultDict[str, Any] = defaultdict(int)
    examples: DefaultDict[str, Any] = defaultdict(list)
    n_examples_no_entities = 0
    for e in data:
        if not e.spans:
            n_examples_no_entities += 1
            examples[NONE].append(e)
        else:
            for s in e.spans:
                annotations_per_type[s.label] += 1
                examples[s.label].append(e)

    sorted_anns_by_count = {
        a[0]: a[1] for a in sorted(annotations_per_type.items(), key=lambda x: x[1], reverse=True)
    }

    stats = NERStats(
        n_examples=len(data),
        n_examples_no_entities=n_examples_no_entities,
        n_annotations=sum(annotations_per_type.values()),
        n_annotations_per_type=sorted_anns_by_count,
    )
    if return_examples:
        stats.examples_with_type = examples

    if serialize:
        return srsly.json_dumps(stats.dict(), indent=4)
    else:
        return stats

Compute statistics for NER data

Parameters

data (List[recon.types.Example]): Data as a List of examples. Required.
serialize (bool): Serialize to a JSON string for printing. Defaults to False.
return_examples (bool): Whether to return examples per type. Defaults to False.

Returns

Union[recon.types.NERStats, str, None]: NERStats for the dataset, or a JSON string if serialize is True.
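
A usage sketch, assuming corpus.train is a List[Example]:

from typing import cast
from recon.stats import get_ner_stats
from recon.types import NERStats

stats = cast(NERStats, get_ner_stats(corpus.train))
print(stats.n_examples, stats.n_annotations)
print(stats.n_annotations_per_type)  # annotation counts per label, most frequent first

# or serialize to a JSON string for printing
print(get_ner_stats(corpus.train, serialize=True))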

get_probs_from_counts(seq)

Show source code in recon/stats.py
def get_probs_from_counts(seq: Sequence[int]) -> Sequence[float]:
    """Convert a sequence of counts to a sequence of probabilties
    by dividing each n by the sum of all n in seq

    Args:
        seq (Sequence[int]): Sequence of counts

    Returns:
        Sequence[float]: Sequence of probabilities
    """
    return np.asarray(seq) / sum(seq)

Convert a sequence of counts to a sequence of probabilities by dividing each n by the sum of all n in seq

Parameters

seq (Sequence[int]): Sequence of counts. Required.

Returns

Sequence[float]: Sequence of probabilities.
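
A one-line example:

from recon.stats import get_probs_from_counts

print(get_probs_from_counts([2, 3, 5]))  # [0.2 0.3 0.5]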

get_sorted_type_counts(ner_stats)

Show source code in recon/stats.py
def get_sorted_type_counts(ner_stats: NERStats) -> List[int]:
    """Get list of counts for each type in n_annotations_per_type property 
    of an NERStats object sorted by type name

    Args:
        ner_stats (NERStats): Dataset stats

    Returns:
        List[int]: List of counts sorted by type name
    """
    annotations_per_type = ner_stats.n_annotations_per_type
    annotations_per_type[NONE] = ner_stats.n_examples_no_entities

    return [t[1] for t in sorted(annotations_per_type.items(), key=lambda p: p[0])]

Get list of counts for each type in n_annotations_per_type property of an NERStats object sorted by type name

Parameters

ner_stats (NERStats): Dataset stats. Required.

Returns

List[int]: List of counts sorted by type name.
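
A short sketch, assuming corpus.train is a List[Example]; counts come back ordered alphabetically by label name, including the NONE bucket for examples without entities:

from typing import cast
from recon.stats import get_ner_stats, get_sorted_type_counts
from recon.types import NERStats

ner_stats = cast(NERStats, get_ner_stats(corpus.train))
counts = get_sorted_type_counts(ner_stats)
print(counts)  # one count per label, sorted by label name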