Source code for genalog.text.preprocess

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# ---------------------------------------------------------

import re

END_OF_TOKEN = {" ", "\t", "\n"}
NON_ASCII_REPLACEMENT = "_"


def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT):
    """Remove non-ASCII characters in a token

    Arguments:
        token (str) : a word token
        replacement (str, optional) : a replacement character for non-ASCII
            characters. Defaults to ``NON_ASCII_REPLACEMENT``.

    Returns:
        str -- a word token with non-ASCII characters removed
    """
    # Remove non-ASCII characters in the token
    ascii_token = str(token.encode("utf-8").decode("ascii", "ignore"))
    # If the token becomes an empty string as a result,
    # replace it with the default replacement character
    if len(ascii_token) == 0 and len(token) != 0:
        ascii_token = replacement
    return ascii_token
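

# Illustrative usage sketch (doctest-style), following the implementation above:
#
#   >>> remove_non_ascii("café")
#   'caf'
#   >>> remove_non_ascii("日本語")
#   '_'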


def tokenize(s):
    """Tokenize string

    Arguments:
        s (str) : aligned string

    Returns:
        a list of tokens
    """
    # Split the aligned string by spaces, tabs and newlines (excluding them from the tokens)
    return s.split()


def join_tokens(tokens):
    """Join a list of tokens into a string

    Arguments:
        tokens (list) : a list of tokens

    Returns:
        a string with space-separated tokens
    """
    return " ".join(tokens)
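

# Illustrative usage sketch (doctest-style): tokenize and join_tokens form a
# round trip over whitespace-separated text, though runs of spaces, tabs and
# newlines collapse to single spaces:
#
#   >>> tokenize("New\tYork  is\nbig")
#   ['New', 'York', 'is', 'big']
#   >>> join_tokens(["New", "York", "is", "big"])
#   'New York is big'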


def _is_spacing(c):
    """Determine if the character is ignorable"""
    return c in END_OF_TOKEN


def split_sentences(text, delimiter="\n"):
    """Split a text into sentences with a delimiter"""
    return re.sub(r"(( /?[.!?])+ )", rf"\1{delimiter}", text)


def is_sentence_separator(token):
    """Returns true if the token is a sentence separator"""
    return re.match(r"^/?[.!?]$", token) is not None
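

# Illustrative usage sketch (doctest-style): split_sentences appends the
# delimiter after space-separated sentence punctuation, and
# is_sentence_separator tests a single token:
#
#   >>> split_sentences("First sentence . Second sentence . ")
#   'First sentence . \nSecond sentence . \n'
#   >>> is_sentence_separator(".")
#   True
#   >>> is_sentence_separator("word")
#   False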