# Source code for genalog.text.preprocess

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# ---------------------------------------------------------

import re

# Characters treated as token boundaries: space, tab and newline.
END_OF_TOKEN = {" ", "\t", "\n"}
# Default stand-in returned when a token consists solely of non-ASCII characters.
NON_ASCII_REPLACEMENT = "_"


def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT):
    """Remove non-ASCII characters from a token.

    Arguments:
        token (str) : a word token
        replacement (str, optional) : value returned in place of the token when
                                      *every* character of a non-empty token is
                                      non-ASCII (i.e. stripping would leave an
                                      empty string). It is NOT substituted
                                      character by character.
                                      Defaults to ``NON_ASCII_REPLACEMENT``.
    Returns:
        str -- the token with non-ASCII characters removed, ``replacement`` if
        nothing is left of a non-empty token, or ``""`` for an empty token
    """
    # Encode straight to ASCII and drop whatever cannot be represented.
    # Going through UTF-8 first would raise UnicodeEncodeError on lone
    # surrogates (e.g. text read with ``errors="surrogateescape"``).
    ascii_token = token.encode("ascii", "ignore").decode("ascii")
    # A non-empty token must never collapse into an empty one.
    if not ascii_token and token:
        return replacement
    return ascii_token


def tokenize(s):
    """Tokenize a string on whitespace.

    Arguments:
        s (str) : aligned string

    Returns:
        a list of tokens; empty if ``s`` is empty or contains only whitespace
    """
    # str.split() without a separator splits on runs of any whitespace
    # (spaces, tabs, newlines, ...), excludes the whitespace from the tokens
    # and never yields empty tokens.
    return s.split()


def join_tokens(tokens):
    """Join a list of tokens into a single string.

    Arguments:
        tokens (list) : a list of string tokens

    Returns:
        a string with the tokens separated by single spaces
        (an empty string for an empty list)
    """
    return " ".join(tokens)


def _is_spacing(c):
    """ Tell whether the character is one of the ``END_OF_TOKEN`` characters """
    return c in END_OF_TOKEN


def split_sentences(text, delimiter="\n"):
    """Split a text into sentences by inserting a delimiter.

    A sentence boundary is one or more space-prefixed terminators
    (``.``, ``!`` or ``?``, each optionally preceded by ``/``) followed by a
    space, e.g. ``" . "`` or ``" /? "``. The boundary itself is kept and
    ``delimiter`` is inserted right after it.

    Arguments:
        text (str) : space-tokenized text
        delimiter (str, optional) : string inserted after each sentence
                                    boundary. It is inserted literally.
                                    Defaults to a newline.

    Returns:
        str -- the text with ``delimiter`` added after every sentence boundary
    """
    # Use a callable replacement so the delimiter is never parsed as part of
    # the replacement template: interpolating it after ``\1`` breaks for
    # delimiters starting with a digit (``\11`` becomes a reference to group 11)
    # or containing backslashes.
    return re.sub(
        r"(( /?[.!?])+ )",
        lambda boundary: boundary.group(1) + delimiter,
        text,
    )


def is_sentence_separator(token):
    """Return True if the token is a sentence splitter.

    A sentence splitter is exactly one of ``.``, ``!`` or ``?``, optionally
    preceded by a single ``/``.

    Arguments:
        token (str) : a word token

    Returns:
        bool -- True if the whole token is a sentence splitter
    """
    # fullmatch rather than ``^...$``: ``$`` also matches just before a
    # trailing newline, which would wrongly accept a token such as ".\n".
    return re.fullmatch(r"/?[.!?]", token) is not None