dotfiles/talon/community/community-cursorless-0.4.0/code/create_spoken_forms.py

from typing import List, Optional
import itertools
from talon import registry
import re

from .extensions import file_extensions
from .numbers import digits_map
from .abbreviate import abbreviations

# TODO: 'Whats application': 'WhatsApp' (Should keep "whats app" as well?)
# TODO: 'V O X': 'VOX' (should keep "VOX" as well?)
# These could be handled either by generating all alternatives for such names, or by keeping a
# hard-coded list of names that need special treatment.

# Default for the shortest spoken form that create_spoken_forms will keep.
DEFAULT_MINIMUM_TERM_LENGTH = 3

# A run of lowercase letters with an optional leading capital ("every", "Every").
SMALL_WORD = r"[A-Z]?[a-z]+"
# TODO: We want "AXEvery" to be ["AX", "Every"]
# A run of capital letters ("VOX").
UPPERCASE_WORD = r"[A-Z]+"
# Regex alternation is ordered: the first alternative that matches wins, even
# if a later one would match more text. Sorting the extensions longest-first
# keeps an extension that is a prefix of another from shadowing it (the sort is
# stable, so equally long extensions keep their original relative order).
# Empty values are skipped because an empty alternative matches the empty
# string at every position.
FILE_EXTENSIONS_REGEX = "|".join(
    re.escape(file_extension)
    for file_extension in sorted(
        (extension for extension in file_extensions.values() if extension),
        key=len,
        reverse=True,
    )
)
# Each digit is matched on its own, so "42" tokenizes as "4", "2".
DIGITS_REGEX = r"\d"
# Tokenizer used to cut a source string into pieces. Alternatives are tried in
# the order listed; an empty alternative (e.g. no file extensions configured)
# is left out so that it cannot swallow every position as an empty match.
FULL_REGEX = re.compile(
    "|".join(
        alternative
        for alternative in (
            DIGITS_REGEX,
            FILE_EXTENSIONS_REGEX,
            SMALL_WORD,
            UPPERCASE_WORD,
        )
        if alternative
    )
)

# Inverted view (value -> key) of the abbreviation, file-extension and digit
# maps, used to look a written token up and get back the key it was listed
# under. Digit values are stringified so they can be looked up by text.
# Pairs are fed in the order abbreviations, file extensions, digits; when the
# same written token appears more than once, the later source wins.
REVERSE_PRONUNCIATION_MAP = dict(
    itertools.chain(
        ((written, spoken) for spoken, written in abbreviations.items()),
        ((extension, spoken) for spoken, extension in file_extensions.items()),
        ((str(digit), spoken) for spoken, digit in digits_map.items()),
    )
)


def create_single_spoken_form(source: str):
    """Return the spoken form for a single token.

    The token is looked up case-insensitively (by its lowercased text) in
    REVERSE_PRONUNCIATION_MAP; when there is no entry, the token itself is
    used unchanged. If the resulting text is entirely uppercase, it is
    spelled out letter by letter, separated by spaces ("VOX" -> "V O X").
    """
    spoken = REVERSE_PRONUNCIATION_MAP.get(source.lower(), source)
    # Joining a string with " " puts a space between each of its characters.
    return " ".join(spoken) if spoken.isupper() else spoken


def create_spoken_forms(
    source: str,
    words_to_exclude: Optional[List[str]] = None,
    minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
) -> List[str]:
    """Build the list of spoken forms that can refer to ``source``.

    ``source`` is tokenized with FULL_REGEX and every token is converted with
    create_single_spoken_form. The candidates are then:

    * each individual spoken word,
    * each cumulative prefix of the word sequence ("a", "a b", "a b c", ...),
    * ``source`` itself, stripped of surrounding whitespace.

    Args:
        source: The text to create spoken forms for.
        words_to_exclude: Terms that must not appear in the result
            (compared against the whole term, not its individual words).
        minimum_term_length: Terms with fewer characters than this are
            dropped.

    Returns:
        The unique surviving terms, in first-occurrence order (words, then
        prefixes, then the source). Empty terms are never returned.
    """
    if words_to_exclude is None:
        words_to_exclude = []

    spoken_pieces = [
        create_single_spoken_form(match.group(0))
        for match in FULL_REGEX.finditer(source)
    ]

    # A single piece can expand to several words, so the pieces are joined and
    # split again. Whitespace splitting (rather than split(" ")) yields an
    # empty list, not [""], when nothing matched, and never emits "" terms.
    term_sequence = " ".join(spoken_pieces).split()

    # Each word carries a trailing space so that accumulate() concatenates them
    # into space-separated prefixes; the trailing space is stripped below.
    cumulative_prefixes = itertools.accumulate(f"{term} " for term in term_sequence)

    candidates = itertools.chain(term_sequence, cumulative_prefixes, [source])

    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is the same on every run (set iteration order for strings is not).
    unique_terms = dict.fromkeys(candidate.strip() for candidate in candidates)

    return [
        term
        for term in unique_terms
        if term
        and term not in words_to_exclude
        and len(term) >= minimum_term_length
    ]