@operation("recon.v1.add_tokens",pre=[spacy_pre_processor])defadd_tokens(example:Example,*,preprocessed_outputs:Dict[str,Any])->Union[Example,None]:"""Add tokens to each Example Args: example (Example): Input Example preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors Returns: Example: Example with tokens """doc=preprocessed_outputs["recon.v1.spacy"]tokens=[]token_starts={}token_ends={}fortindoc:start=t.idxend=t.idx+len(t)tokens.append(Token(text=t.text,start=start,end=end,id=t.i))token_starts[start]=ttoken_ends[end]=texample.tokens=tokensforspaninexample.spans:ifspan.startintoken_startsandspan.endintoken_ends:span.token_start=token_starts[span.start].ispan.token_end=token_ends[span.end].i+1ifspan.token_startisNoneorspan.token_endisNone:returnNonereturnexample
@operation("recon.v1.fix_tokenization_and_spacing",pre=[spacy_pre_processor])deffix_tokenization_and_spacing(example:Example,*,preprocessed_outputs:Dict[str,Any]={})->Union[Example,None]:"""Fix tokenization and spacing issues where there are annotation spans that don't fall on a token boundary. This can happen if annotations are done at the character level, not the token level. Often, when scraping web text it's easy to get two words pushed together where one is an entity so this can fix a lot of issues. Args: example (Example): Input Example preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors Returns: Example: Example with spans fixed to align to token boundaries. """doc=preprocessed_outputs["recon.v1.spacy"]tokens=[]token_starts={}token_ends={}fortindoc:start=t.idxend=t.idx+len(t)tokens.append(Token(text=t.text,start=start,end=end,id=t.i))token_starts[start]=ttoken_ends[end]=tspans_to_increment:Dict[int,int]=defaultdict(int)forspan_i,spaninenumerate(example.spans):ifspan.startintoken_startsandspan.endintoken_ends:# Aligns to token boundaries, nothing to change herecontinueifspan.startintoken_startsandspan.endnotintoken_ends:# Span start aligns to token_start but end doesn't# e.g. [customer][PERSONTYPE]s but should be annotated as [customers][PERSONTYPE]# tokenization_errors.append((example, span))# print("BAD END")ifspan.end+1intoken_ends:# Likely off by 1 annotation# e.g. [customer][PERSONTYPE]s but should be annotated as [customers][PERSONTYPE]span.end+=1span.text=example.text[span.start:span.end]# print("SPAN CORRECTED OFF BY 1", example.text, span)elifspan.end-1intoken_ends:span.end-=1span.text=example.text[span.start:span.end]else:# Likely bad tokenization# e.g. [Quadling][GPE]Country should be split to [Quadling][GPE] Countryforjinrange(span_i+1,len(example.spans)):spans_to_increment[j]+=1fe_text=example.textsplit_start=span.startiflen(spans_to_increment)>1andspan_i!=list(spans_to_increment.keys())[0]:split_start+=spans_to_increment.get(span_i,0)split_end=span.endiflen(spans_to_increment)>1andspan_i!=list(spans_to_increment.keys())[0]:split_end+=spans_to_increment.get(span_i,0)new_text=f"{fe_text[:split_start]}{span.text}{fe_text[split_end:]}"example.text=new_textelifspan.startnotintoken_startsandspan.endintoken_ends:# Bad tokenization# e.g. 
with[Raymond][PERSON] but text should be split to with [Raymond][PERSON]# print("BAD START", span.text)# tokenization_errors.append((example, span))forjinrange(span_i,len(example.spans)):spans_to_increment[j]+=1fe_text=example.textsplit_start=span.startiflen(spans_to_increment)>1andspan_i!=list(spans_to_increment.keys())[0]:split_start+=spans_to_increment.get(span_i,0)split_end=span.endiflen(spans_to_increment)>1andspan_i!=list(spans_to_increment.keys())[0]:split_end+=spans_to_increment.get(span_i,0)new_text=f"{fe_text[:split_start]}{span.text}{fe_text[split_end:]}"example.text=new_textelse:# Something is super fucked up.# print("SPAN CORRECTED OFF BY 1 unfixable", example.text, span)before=span.startafter=span.end# tokenization_errors.append((example, span))# if (before >= 0 and after < len(span.text) and span[before] not in token_starts and span[before] != ' ' and span[after] not in token_ends and span[after] != ' '):# fe_text = example.text# new_text = f"{fe_text[:span.start]} {span.text}{fe_text[span.end:]}"# spans_to_increment[span_i] += 1# for j in range(span_i + 1, len(example.spans)):# spans_to_increment[j] += 2# else:# unfixable_examples.add(example.text)# breakreturnNone# Increment the start and end characters for each spanforspan_i,countinspans_to_increment.items():example.spans[span_i].start+=countexample.spans[span_i].end+=countreturnexample
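To see the off-by-one branch in isolation, here is a small self-contained sketch (plain spaCy, with a dict standing in for a recon `Span`; the text, label, and offsets are made up for illustration). The annotation ends one character short of a token boundary, so the end is nudged forward and the span text re-sliced, exactly like the `span.end += 1` branch above:

```python
import spacy

nlp = spacy.blank("en")
text = "I spoke with the customers yesterday"
doc = nlp(text)
token_ends = {t.idx + len(t): t for t in doc}

# Character-level annotation that stops one character short of the token:
# "customer" (chars 17-25) instead of "customers" (chars 17-26)
span = {"start": 17, "end": 25, "text": text[17:25], "label": "PERSONTYPE"}

# The "off by 1" branch: extend the end to the nearest token boundary
if span["end"] not in token_ends and span["end"] + 1 in token_ends:
    span["end"] += 1
    span["text"] = text[span["start"]:span["end"]]

print(span["text"])  # customers
```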
Fix tokenization and spacing issues where there are annotation spans that don't fall on a token boundary. This can happen if annotations are done at the character level, not the token level. Often, when scraping web text, it's easy to get two words pushed together where one is an entity, so this can fix a lot of issues.
Parameters

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `example` | `Example` | Input Example | required |
| `preprocessed_outputs` | `Dict[str, Any]` | Outputs of preprocessors | `{}` |
Returns

| Type | Description |
| --- | --- |
| `Union[recon.types.Example, NoneType]` | Example with spans fixed to align to token boundaries. |
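Both operations are registered under the names in their `@operation` decorators, so they can be run over a dataset by name. A usage sketch, assuming recon's `Dataset` loading and `apply_` API (the file path is hypothetical, and the exact loading call may differ between recon versions):

```python
from recon import Dataset

# Hypothetical path to annotated examples; adjust to your data and recon version
ds = Dataset("train")
ds.from_disk("./data/train.jsonl")

# Fix span/token misalignments first, then attach token info to each example
ds.apply_("recon.v1.fix_tokenization_and_spacing")
ds.apply_("recon.v1.add_tokens")
```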