# Source code for lexlib.structure

#!/usr/bin/env python3

"""
Functions related to the structure of words.
"""

# module: structure
# copyright: 2016-2019 R. Steiner
# license: MIT License

import itertools as it


def clusters(words, vowels, sep=None, unique=False, case_sensitive=True):
    """
    Separates a list of *words* into clusters. Clusters are defined as
    sequences of characters that do not contain any of the characters
    in the list of *vowels*.

    If *sep* is defined, it will be used as the delimiter string (for
    example, with `sep="."`, the word "a.bc.de" will be treated as the
    three-character sequence `["a", "bc", "de"]`).

    If *unique* is `True`, returns each cluster only once, in order of
    first occurrence. If *unique* is `False` (the default), returns each
    cluster as many times as it occurs.

    If *case_sensitive* is `True` (the default), uppercase and lowercase
    characters will be treated as two different characters (e.g., "a"
    will be seen as different from "A"). If *case_sensitive* is `False`,
    uppercase and lowercase characters will be treated as the same
    character, and the output will be lowercase (e.g., "a" and "A" will
    both be treated as "a").

    Returns a flat list of cluster strings gathered from every word.

    Raises `AttributeError` if *unique* or *case_sensitive* is not a
    boolean value.
    """
    if unique not in [True, False]:
        raise AttributeError("'unique' must be either True or False.")
    if case_sensitive not in [True, False]:
        raise AttributeError("'case_sensitive' must be either True or False.")
    # case_sensitive has to be forwarded explicitly; otherwise the per-word
    # helper silently falls back to its own default of True.
    per_word = (
        clusters_word(word, vowels, sep, case_sensitive=case_sensitive)
        for word in words
    )
    flattened = list(it.chain.from_iterable(per_word))
    if unique:
        # dict.fromkeys drops duplicates while keeping a stable order,
        # unlike set(), whose iteration order is arbitrary.
        return list(dict.fromkeys(flattened))
    return flattened


def clusters_word(word, vowels, sep=None, case_sensitive=True):
    """
    Separates a *word* into clusters, defined as sequences of
    characters that do not contain any of the characters in the list of
    *vowels*.

    If *sep* is defined, it will be used as the delimiter string (for
    example, with `sep="."`, the word "a.bc.de" will be treated as the
    three-character sequence `["a", "bc", "de"]`). Clusters made of
    several units are joined back together with the same delimiter.

    If *case_sensitive* is `True` (the default), uppercase and lowercase
    characters will be treated as two different characters (e.g., "a"
    will be seen as different from "A"). If *case_sensitive* is `False`,
    uppercase and lowercase characters will be treated as the same
    character, and the output will be lowercase (e.g., "a" and "A" will
    both be treated as "a").

    Returns a list of cluster strings in the order they occur in the
    word; the list is empty if the word is empty or consists only of
    vowels.

    Raises `AttributeError` if *case_sensitive* is not a boolean value.
    """
    if case_sensitive not in [True, False]:
        raise AttributeError("'case_sensitive' must be either True or False.")
    if not case_sensitive:
        word = word.lower()
        vowels = [v.lower() for v in vowels]
    if not word:
        # "".split(sep) yields [""], which would otherwise be reported as
        # a single empty cluster.
        return []
    # A falsy sep (None or "") means "one unit per character"; the same
    # value is reused as the glue when a cluster is reassembled.
    glue = sep if sep else ""
    units = word.split(sep) if sep else list(word)
    found = []
    pending = []
    for unit in units:
        if unit not in vowels:
            pending.append(unit)
        elif pending:
            # A vowel closes the cluster collected so far.
            found.append(glue.join(pending))
            pending = []
    if pending:
        # Word-final cluster: append it as ONE item (using += with a string
        # would splice it into the list character by character).
        found.append(glue.join(pending))
    return found


def filter_by_nsyll(words, vowels, nsyll, sep=None):
    """
    Given a list of *words*, return a list containing only the words
    with the desired number of syllables, determined by the number of
    characters from the *vowels* list found in that word.

    The number of syllables, *nsyll* can be either an integer or a list
    of integers. If it is a list, the returned list will contain words
    of any syllable length included in *nsyll*.

    If *sep* is defined, it will be used as the delimiter string (for
    example, with `sep="."`, the word "a.bc.de" will be treated as the
    three-character sequence `["a", "bc", "de"]`).

    The matching words are returned in their original order.
    """
    # Materialise the accepted counts once so that a one-shot iterable is
    # not exhausted by the membership test on the first word.
    wanted = (nsyll,) if isinstance(nsyll, int) else tuple(nsyll)
    return [w for w in words if __is_desired_nsyll(w, vowels, sep, wanted)]


def get_cv(word, vowels, sep=None):
    """
    Calculate the consonant ("C") and vowel ("V") structure of the
    given word. Returns a string of the characters "C" and "V"
    corresponding to the characters in the word; every unit found in
    *vowels* maps to "V" and every other unit maps to "C". An empty
    word yields an empty string.

    *vowels* -- A list of the characters representing vowels.

    *sep* -- String used to separate phonemes (if the words are
    phonological forms). To separate into individual characters, set to
    `None` (default).
    """
    if not word:
        # "".split(sep) yields [""], which would otherwise be counted as
        # one consonant.
        return ""
    units = word.split(sep) if sep else list(word)
    return "".join("V" if unit in vowels else "C" for unit in units)


def nsyll_list(words, vowels, sep=None):
    """
    Count the number of syllables in each word in a *words* list,
    determined by the number of characters from the *vowels* list found
    in that word. Return a list of syllable counts, one per word, in
    the same order as *words* (the words themselves are not included in
    the result).

    If *sep* is defined, it will be used as the delimiter string (for
    example, with `sep="."`, the word "a.bc.de" will be treated as the
    three-character sequence `["a", "bc", "de"]`).
    """
    return [nsyll_word(w, vowels, sep) for w in words]


def nsyll_word(word, vowels, sep=None):
    """
    Count the number of syllables in a *word*, determined by the number
    of characters from the *vowels* list found in that word.

    If *sep* is defined, it will be used as the delimiter string (for
    example, with `sep="."`, the word "a.bc.de" will be treated as the
    three-character sequence `["a", "bc", "de"]`).

    Returns the count as an integer (0 for an empty word when *vowels*
    is a list).

    Note: pass *vowels* as a list (or set) rather than a single string
    when using *sep*; with a string, `in` performs substring matching on
    multi-character phonemes.
    """
    # Testing `sep` for truthiness (rather than `is None`) also covers
    # sep="", which str.split() rejects with ValueError.
    phonemes = word.split(sep) if sep else list(word)
    return sum(1 for phoneme in phonemes if phoneme in vowels)


def __finalize_cluster(chars, sep):
    """
    Join the units in *chars* into a single cluster string, using *sep*
    as the glue when it is truthy and the empty string otherwise.
    """
    glue = sep or ""
    return glue.join(chars)


def __is_desired_nsyll(word, vowels, sep, nsyll):
    """
    Tell whether *word* has an acceptable syllable count.

    *nsyll* is a collection of desired syllable counts. The word's
    count is obtained from `nsyll_word(word, vowels, sep)`; the result
    is True when that count is a member of *nsyll* and False otherwise.
    """
    count = nsyll_word(word, vowels, sep)
    return count in nsyll