# Source code for lexlib.neighbors

#!/usr/bin/env python3

"""
Neighbor calculation functions for lexlib.
"""

# module: neighbors
# copyright: 2016-2018 R. Steiner
# license: MIT License


def get_neighbor_dict(words, **kwargs):
    """
    Compare each word in a list of *words* to each word in a *corpus*
    word list (or in the same list if *corpus* is not given), and return
    a dict where each target word is a key, and its value is a list of
    its neighbors. (If you are looking for a function to get neighbor
    pairs, see `get_neighbor_pairs()`).

    The target word is not removed from the corpus before comparison,
    so a target that also occurs in the corpus is compared against
    itself like any other corpus entry.

    keyword arguments:
        *corpus* -- List of all the words to get the neighbors from. If
        omitted or `None`, defaults to *words*.

        *sep* -- String used to separate phonemes (if the words are
        phonological forms). To separate into individual characters,
        set to `None` (default).

        *debug* -- If `True`, it prints each target word to the console
        as it is processed. Defaults to `False`.

    Returns a dict mapping each target word to the list of corpus words
    for which `check_neighbors()` is true. The inputs are never
    modified.
    """
    sep = kwargs.get("sep")
    debug = kwargs.get("debug", False)
    corpus = kwargs.get("corpus")
    # Materialise private lists so the caller's data is left untouched
    # and any iterable (not just a list) is accepted.
    words = list(words)
    corpus = words if corpus is None else list(corpus)
    neighbors = {}
    # Targets are handled last-to-first, so the keys of the result are
    # in reverse input order.
    for word in reversed(words):
        if debug:
            print(word)
        # Split the target once and reuse it for every comparison.
        w_split = word.split(sep) if sep else list(word)
        neighbors[word] = [tgt for tgt in corpus
                           if check_neighbors(w_split, tgt, sep)]
    return neighbors


def check_neighbors(a, b, sep=None):
    """
    Determine whether two words are neighbors. Returns `True` if they
    are neighbors and `False` if they are not.

    Two words are treated as neighbors when one can be turned into the
    other by a single substitution, addition or deletion of one unit
    (character or phoneme). Two identical words also pass the
    substitution check and are therefore reported as neighbors.

    *a*, *b* -- The words to compare. Either may be a string or an
    already split list of units; a list is used as it is.

    *sep* -- String used to separate phonemes (if the words are
    phonological forms). To separate into individual characters, set to
    `None` (default).
    """
    def split_units(word):
        # A list is taken to be pre-split; strings are split here.
        if isinstance(word, list):
            return word
        return word.split(sep) if sep else list(word)

    a_split = split_units(a)
    b_split = split_units(b)
    length_diff = len(b_split) - len(a_split)
    # Always hand back the helper's verdict so a failed check yields
    # False rather than falling off the end of the function.
    if length_diff == 0:
        return __check_substitution(a_split, b_split)
    if length_diff == 1:
        return __check_addition(a_split, b_split)
    if length_diff == -1:
        return __check_deletion(a_split, b_split)
    # Lengths differ by two or more: no single edit can relate them.
    return False


def get_neighbor_pairs(words, **kwargs):
    """
    Compare each word in a list of *words* to each word in a *corpus*
    word list (or in the same list if *corpus* is not given), and return
    a list of `(word, neighbor)` pairs. (If you are looking for a
    function to get lists of all the neighbors for specific words, see
    `get_neighbor_dict()`).

    keyword arguments:
        *corpus* -- List of all the words to get the neighbors from. If
        omitted or `None`, defaults to `words`.

        *sep* -- String used to separate phonemes (if the words are
        phonological forms). To separate into individual characters, set
        to `None` (default).

        *debug* -- If True, it prints each target word to the console
        as it is processed. Defaults to False.

    Returns a list of `(word, neighbor)` tuples. The inputs are never
    modified.
    """
    sep = kwargs.get("sep")
    debug = kwargs.get("debug", False)
    corpus = kwargs.get("corpus")
    # Work on private copies: the corpus is shrunk as targets are
    # processed and the caller's lists must stay intact.
    words = list(words)
    corpus = list(words if corpus is None else corpus)
    neighbors = []
    # Targets are handled last-to-first.
    for word in reversed(words):
        if debug:
            print(word)
        # Drop one occurrence of the target from the corpus so that it
        # is not paired with itself and so that a pair already reported
        # as (x, word) is not reported again as (word, x).
        if word in corpus:
            corpus.remove(word)
        w_split = word.split(sep) if sep else list(word)
        neighbors.extend((word, tgt) for tgt in corpus
                         if check_neighbors(w_split, tgt, sep))
    return neighbors


def get_neighbor_positions(neighbor_pairs, sep=None):
    """
    Given a list of `(word1, word2)` *neighbor_pairs*, return a list of
    `(word1, word2, position)` triples, where `position` is the
    1-based position in the words where the neighbor relationship is
    formed. Note that this can only be calculated for pairs of
    substitution neighbors. If the words differ in length, `position`
    will be `-1`; if the two words are identical it will be `0`.

    *sep* -- String used to separate phonemes (if the words are
    phonological forms); positions are then counted in phonemes. To
    count individual characters, set to `None` (default).

    Example::

        >>> neighbor_pairs = [("cat", "cap"), ("cat", "cut"), ("cat", "cast")]
        >>> get_neighbor_positions(neighbor_pairs)
        [('cat', 'cap', 3), ('cat', 'cut', 2), ('cat', 'cast', -1)]

    """
    triples = []
    for first, second in neighbor_pairs:
        # Forward the caller's separator and keep only the position, so
        # the triple always carries the words exactly as they were given.
        position = __get_position((first, second), sep=sep)[2]
        triples.append((first, second, position))
    return triples


def get_neighbor_types(neighbor_dict, sep=None):
    """
    Given a *neighbor_dict* (where a key is a "target" word and its
    value is a list of all of its neighbors), return a list of `(word1,
    word2, relationship)` triples, where `relationship` is one of
    "deletion", "addition" or "substitution".

    The relationship is decided purely by comparing lengths (in
    characters, or in phonemes when *sep* is given): equal lengths give
    "substitution", a target longer than its neighbor gives "deletion",
    and a target shorter than its neighbor gives "addition". The words
    are not checked to actually be neighbors.

    *sep* -- String used to separate phonemes (if the words are
    phonological forms). To count individual characters, set to `None`
    (default).
    """
    types = []
    # Targets are visited last-to-first, so the output lists the
    # dict's keys in reverse order.
    for target in reversed(list(neighbor_dict)):
        target_len = len(target.split(sep) if sep else target)
        for neighbor in neighbor_dict[target]:
            neighbor_len = len(neighbor.split(sep) if sep else neighbor)
            if target_len == neighbor_len:
                relationship = "substitution"
            elif target_len > neighbor_len:
                relationship = "deletion"
            else:
                relationship = "addition"
            # Report the words in their original, unsplit form.
            types.append((target, neighbor, relationship))
    return types


def __check_addition(base, candidate):
    # Return True when *candidate* equals *base* with one extra unit
    # inserted somewhere. The caller is expected to pass a candidate
    # that is exactly one unit longer than the base.
    skipped = 0
    for index, unit in enumerate(base):
        # On a mismatch, assume the candidate holds an inserted unit
        # here: skip over it and retry the same base unit. Only one
        # skip is allowed; needing a second means they are not
        # neighbors.
        while unit != candidate[index + skipped]:
            skipped += 1
            if skipped >= 2:
                return False
    return True


def __check_deletion(base, candidate):
    # Return True when *candidate* equals *base* with one unit removed.
    # The caller is expected to pass a candidate that is exactly one
    # unit shorter than the base.
    skipped = 0
    for index, unit in enumerate(candidate):
        # On a mismatch, assume the base holds the removed unit here:
        # skip over it and retry the same candidate unit. Only one skip
        # is allowed; needing a second means they are not neighbors.
        while base[index + skipped] != unit:
            skipped += 1
            if skipped >= 2:
                return False
    return True


def __check_substitution(base, candidate):
    # Return True when the two equal-length sequences differ in at most
    # one position (identical sequences therefore also return True).
    mismatches = 0
    for index, unit in enumerate(base):
        if unit != candidate[index]:
            mismatches += 1
            # A second differing position rules out a single substitution.
            if mismatches >= 2:
                return False
    return True


def __get_position(neighbors, sep=None):
    # Return a (first, second, position) triple for one neighbor pair.
    #
    # *neighbors* is a (first, second) pair of words and *sep* is the
    # optional phoneme separator. `position` is the 1-based index of the
    # first unit at which the words differ, -1 when the words differ in
    # length, and 0 when no differing unit is found.
    first, second = neighbors
    # Split into separate names so the words themselves are returned
    # unchanged, not as lists of phonemes.
    first_units = first.split(sep) if sep else first
    second_units = second.split(sep) if sep else second
    if len(first_units) != len(second_units):
        return (first, second, -1)
    pairs = zip(first_units, second_units)
    for position, (unit_a, unit_b) in enumerate(pairs, start=1):
        if unit_a != unit_b:
            return (first, second, position)
    return (first, second, 0)