# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# ---------------------------------------------------------
import re

# Characters treated as marking the end of a token: space, tab and newline.
END_OF_TOKEN = {
    " ",
    "\t",
    "\n",
}

# Default stand-in used by ``remove_non_ascii`` when stripping non-ASCII
# characters would otherwise leave a non-empty token empty.
NON_ASCII_REPLACEMENT = "_"
def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT):
    """Strip the non-ASCII characters from a token.

    Arguments:
        token (str) : a word token
        replacement (str, optional) : value returned in place of a non-empty
            token that consists solely of non-ASCII characters.
            Defaults to ``NON_ASCII_REPLACEMENT``.

    Returns:
        str -- the token with its non-ASCII characters removed, or
        ``replacement`` when removing them left a non-empty token empty.
        An empty token is returned unchanged.
    """
    # Encode directly to ASCII and drop anything that does not fit. Unlike a
    # UTF-8 encode followed by an ASCII decode, this does not raise
    # UnicodeEncodeError when the token contains a lone surrogate.
    ascii_token = token.encode("ascii", "ignore").decode("ascii")

    # Everything was stripped from a token that originally had content.
    if token and not ascii_token:
        return replacement
    return ascii_token
def tokenize(s):
    """Break a string into its whitespace-separated tokens.

    Arguments:
        s (str) : aligned string

    Returns:
        a list of tokens; the separating whitespace is not part of any token
    """
    # Calling split() without a separator treats every run of whitespace
    # (spaces, tabs, newlines, ...) as one delimiter and yields no empty tokens.
    tokens = s.split()
    return tokens
def join_tokens(tokens):
    """Concatenate tokens into one string, separated by single spaces.

    Arguments:
        tokens (list) : a list of tokens

    Returns:
        a string with space-separated tokens
    """
    separator = " "
    return separator.join(tokens)
def _is_spacing(c):
    """Determine if the character is ignorable.

    Arguments:
        c (str) : the character to check

    Returns:
        bool -- True if ``c`` is a member of ``END_OF_TOKEN``, False otherwise
    """
    # The membership test already yields a bool, so no conditional is needed.
    return c in END_OF_TOKEN
def split_sentences(text, delimiter="\n"):
    """Split a text into sentences by inserting a delimiter after each one.

    A sentence end is one or more ``.``, ``!`` or ``?`` marks, each preceded
    by a space and optionally prefixed with ``/``, followed by a space. The
    matched text is kept and ``delimiter`` is appended right after it.

    Arguments:
        text (str) : the text to split
        delimiter (str, optional) : string inserted after every sentence end.
            It is inserted verbatim; backslashes and digits in it have no
            special meaning. Defaults to a newline.

    Returns:
        str -- ``text`` with ``delimiter`` inserted after every sentence end
    """

    # A callable replacement keeps the delimiter out of the regex replacement
    # template. Interpolated into the template, a delimiter starting with a
    # digit would extend the ``\1`` group reference (e.g. to ``\11``) and one
    # containing a backslash would be parsed as an escape sequence.
    def _append_delimiter(match):
        return match.group(1) + delimiter

    return re.sub(r"(( /?[.!?])+ )", _append_delimiter, text)
def is_sentence_separator(token):
    """Return True if the token is a sentence splitter.

    A sentence splitter is exactly one of ``.``, ``!`` or ``?``, optionally
    prefixed with ``/``.

    Arguments:
        token (str) : the token to check

    Returns:
        bool -- True if the whole token is a sentence splitter, False otherwise
    """
    # fullmatch requires the entire token to match. An anchored ``$`` would
    # also succeed just before a trailing newline and so accept ".\n".
    return re.fullmatch(r"/?[.!?]", token) is not None