dotfiles/talon/user/community/core/vocabulary/vocabulary.py

import logging
import os
import re
from typing import Sequence, Union

from talon import Context, Module, actions
from talon.grammar import Phrase

from ..user_settings import append_to_csv, track_csv_list

mod = Module()
ctx = Context()

mod.list("vocabulary", desc="additional vocabulary words")

# Default words that will need to be capitalized.
# DON'T EDIT THIS. Edit settings/words_to_replace.csv instead.
# These defaults and those later in this file are ONLY used when
# auto-creating the corresponding settings/*.csv files. Those csv files
# determine the contents of user.vocabulary and dictate.word_map. Once they
# exist, the contents of the lists/dictionaries below are irrelevant.
_capitalize_defaults = [
    # NB. the lexicon now capitalizes January/February by default, but not the
    # others below. Not sure why.
    "January",
    "February",
    # March omitted because it's a regular word too
    "April",
    # May omitted because it's a regular word too
    "June",
    "July",
    "August",  # technically also an adjective but the month is far more common
    "September",
    "October",
    "November",
    "December",
]

# Default words that need to be remapped.
_word_map_defaults = {
    # E.g:
    # "cash": "cache",
    # This is the opposite ordering to words_to_replace.csv (the latter has the target word first)
}
_word_map_defaults.update({word.lower(): word for word in _capitalize_defaults})
phrases_to_replace = {}


class PhraseReplacer:
    """Utility for replacing phrases by other phrases inside text or word lists.

    Replacing longer phrases has priority.

    Args:
      - phrase_dict: dictionary mapping recognized/spoken forms to written forms
    """

    def __init__(self):
        self.phrase_index = {}

    def update(self, phrase_dict: dict[str, str]):
        # Index phrases by first word, then number of subsequent words n_next
        phrase_index = dict()
        for spoken_form, written_form in phrase_dict.items():
            words = spoken_form.split()
            if not words:
                logging.warning(
                    "Found empty spoken form for written form"
                    f"{written_form}, ignored"
                )
                continue
            first_word, n_next = words[0], len(words) - 1
            phrase_index.setdefault(first_word, {}).setdefault(n_next, {})[
                tuple(words[1:])
            ] = written_form

        # Sort n_next index so longer phrases have priority
        self.phrase_index = {
            first_word: sorted(same_first_word.items(), key=lambda x: -x[0])
            for first_word, same_first_word in phrase_index.items()
        }

    def replace(self, input_words: Sequence[str]) -> Sequence[str]:
        input_words = tuple(input_words)  # tuple to ensure hashability of slices
        output_words = []
        first_word_i = 0
        while first_word_i < len(input_words):
            first_word = input_words[first_word_i]
            next_word_i = first_word_i + 1
            # Could this word be the first of a phrase we should replace?
            for n_next, phrases_n_next in self.phrase_index.get(first_word, []):
                # Yes. Perhaps a phrase with n_next subsequent words?
                continuation = input_words[next_word_i : next_word_i + n_next]
                if continuation in phrases_n_next:
                    # Found a match!
                    output_words.append(phrases_n_next[continuation])
                    first_word_i += 1 + n_next
                    break
            else:
                # No match, just add the word to the result
                output_words.append(first_word)
                first_word_i += 1
        return output_words

    # Wrapper used for testing.
    def replace_string(self, text: str) -> str:
        return " ".join(self.replace(text.split()))


# Unit tests for PhraseReplacer
rep = PhraseReplacer()
rep.update(
    {
        "this": "foo",
        "that": "bar",
        "this is": "stopping early",
        "this is a test": "it worked!",
    }
)
assert rep.replace_string("gnork") == "gnork"
assert rep.replace_string("this") == "foo"
assert rep.replace_string("this that this") == "foo bar foo"
assert rep.replace_string("this is a test") == "it worked!"
assert rep.replace_string("well this is a test really") == "well it worked! really"
assert rep.replace_string("try this is too") == "try stopping early too"
assert rep.replace_string("this is a tricky one") == "stopping early a tricky one"

phrase_replacer = PhraseReplacer()


# phrases_to_replace is a spoken form -> written form map, used by our
# implementation of `dictate.replace_words` (at bottom of file) to rewrite words
# and phrases Talon recognized. This does not change the priority with which
# Talon recognizes particular phrases over others.
@track_csv_list(
    "words_to_replace.csv",
    headers=("Replacement", "Original"),
    default=_word_map_defaults,
)
def on_word_map(values):
    global phrases_to_replace
    phrases_to_replace = values
    phrase_replacer.update(values)

    # "dictate.word_map" is used by Talon's built-in default implementation of
    # `dictate.replace_words`, but supports only single-word replacements.
    # Multi-word phrases are ignored.
    ctx.settings["dictate.word_map"] = values


@ctx.action_class("dictate")
class OverwrittenActions:
    def replace_words(words: Sequence[str]) -> Sequence[str]:
        try:
            return phrase_replacer.replace(words)
        except:
            # fall back to default implementation for error-robustness
            logging.error("phrase replacer failed!")
            return actions.next(words)


def _create_vocabulary_entries(spoken_form, written_form, type):
    """Expands the provided spoken form and written form into multiple variants based on
    the provided type, which can be either "name" to add a possessive variant or "noun"
    to add plural.
    """
    entries = {spoken_form: written_form}
    if type == "name":
        # Note that we use the spoken form without apostrophe because this seems to generally lead
        # to better recognition on Conformer b108.
        entries[f"{spoken_form}s"] = f"{written_form}'s"
    elif type == "noun":
        # Note that we simply append an "s", but we could use something more sophisticated like
        # https://github.com/jpvanhal/inflection. The downside is that this is less predictable,
        # and this feature is likely to be used in ways that are unlike common English prose, which
        # is already included in the lexicon. For example, made up identifiers used in programming.
        entries[f"{spoken_form}s"] = f"{written_form}s"
    return entries


# See https://github.com/wolfmanstout/talon-vocabulary-editor for an experimental version
# of this which tests if the default spoken form can be used instead of the provided phrase.
def _add_selection_to_file(
    phrase: Union[Phrase, str],
    type: str,
    file_name: str,
    file_contents: dict[str, str],
    skip_identical_replacement: bool,
):
    written_form = actions.edit.selected_text().strip()
    if phrase:
        spoken_form = " ".join(actions.dictate.parse_words(phrase))
    else:
        is_acronym = re.fullmatch(r"[A-Z]+", written_form)
        spoken_form = " ".join(written_form) if is_acronym else written_form
    entries = _create_vocabulary_entries(spoken_form, written_form, type)
    added_some_phrases = False

    # until we add support for parsing or otherwise getting the active
    # vocabulary.talon-list, skip the logic for checking for duplicates etc
    if file_contents:
        # clear the new entries dictionary
        new_entries = {}
        for spoken_form, written_form in entries.items():
            if skip_identical_replacement and spoken_form == written_form:
                actions.app.notify(f'Skipping identical replacement: "{spoken_form}"')
            elif spoken_form in file_contents:
                actions.app.notify(
                    f'Spoken form "{spoken_form}" is already in {file_name}'
                )
            else:
                new_entries[spoken_form] = written_form
                added_some_phrases = True
    else:
        new_entries = entries
        added_some_phrases = True

    if file_name.endswith(".csv"):
        append_to_csv(file_name, new_entries)
    elif file_name == "vocabulary.talon-list":
        append_to_vocabulary(new_entries)

    if added_some_phrases:
        actions.app.notify(f"Added to {file_name}: {new_entries}")


def append_to_vocabulary(rows: dict[str, str]):
    vocabulary_file_path = actions.user.get_vocabulary_file_path()
    with open(str(vocabulary_file_path)) as file:
        line = None
        for line in file:
            pass
        needs_newline = line is not None and not line.endswith("\n")

    with open(vocabulary_file_path, "a", encoding="utf-8") as file:
        if needs_newline:
            file.write("\n")
        for key, value in rows.items():
            if key == value:
                file.write(f"{key}\n")
            else:
                value = repr(value)
                file.write(f"{key}: {value}\n")


@mod.action_class
class Actions:
    # this is implemented as an action so it may be overridden in other contexts
    def get_vocabulary_file_path():
        """Returns the path for the active vocabulary file"""
        vocabulary_directory = os.path.dirname(os.path.realpath(__file__))
        vocabulary_file_path = os.path.join(
            vocabulary_directory, "vocabulary.talon-list"
        )
        return vocabulary_file_path

    def add_selection_to_vocabulary(phrase: Union[Phrase, str] = "", type: str = ""):
        """Permanently adds the currently selected text to the vocabulary with the provided
        spoken form and adds variants based on the type ("noun" or "name").
        """
        _add_selection_to_file(
            phrase,
            type,
            "vocabulary.talon-list",
            None,
            False,
        )

    def add_selection_to_words_to_replace(phrase: Phrase, type: str = ""):
        """Permanently adds the currently selected text as replacement for the provided
        original form and adds variants based on the type ("noun" or "name").
        """
        _add_selection_to_file(
            phrase,
            type,
            "words_to_replace.csv",
            phrases_to_replace,
            True,
        )