import logging
import os
import re
from typing import Optional, Sequence, Union

from talon import Context, Module, actions
from talon.grammar import Phrase

from ..user_settings import append_to_csv, track_csv_list

mod = Module()
ctx = Context()

mod.list("vocabulary", desc="additional vocabulary words")

# Default words that will need to be capitalized.
# DON'T EDIT THIS. Edit settings/words_to_replace.csv instead.
# These defaults and those later in this file are ONLY used when
# auto-creating the corresponding settings/*.csv files. Those csv files
# determine the contents of user.vocabulary and dictate.word_map. Once they
# exist, the contents of the lists/dictionaries below are irrelevant.
_capitalize_defaults = [
    # NB. the lexicon now capitalizes January/February by default, but not the
    # others below. Not sure why.
    "January",
    "February",
    # March omitted because it's a regular word too
    "April",
    # May omitted because it's a regular word too
    "June",
    "July",
    "August",  # technically also an adjective but the month is far more common
    "September",
    "October",
    "November",
    "December",
]

# Default words that need to be remapped.
_word_map_defaults = {
    # E.g:
    # "cash": "cache",
    # This is the opposite ordering to words_to_replace.csv (the latter has the target word first)
}
_word_map_defaults.update({word.lower(): word for word in _capitalize_defaults})
phrases_to_replace = {}


class PhraseReplacer:
    """Utility for replacing phrases by other phrases inside text or word lists.

    Replacing longer phrases has priority.

    The replacer starts out empty; call `update` with a dictionary mapping
    recognized/spoken forms to written forms before calling `replace`.
    """

    def __init__(self):
        # Maps the first word of each spoken form to a list of
        # (n_next, {tuple_of_following_words: written_form}) pairs, sorted by
        # descending n_next so that longer phrases are tried first.
        self.phrase_index = {}

    def update(self, phrase_dict: dict[str, str]):
        """Rebuild the lookup index from scratch.

        Args:
        - phrase_dict: dictionary mapping recognized/spoken forms to written
          forms. Spoken forms are split on whitespace; entries whose spoken
          form contains no words are skipped with a warning.
        """
        # Index phrases by first word, then number of subsequent words n_next
        phrase_index = {}
        for spoken_form, written_form in phrase_dict.items():
            words = spoken_form.split()
            if not words:
                logging.warning(
                    "Found empty spoken form for written form %s, ignored",
                    written_form,
                )
                continue
            first_word, *next_words = words
            phrase_index.setdefault(first_word, {}).setdefault(len(next_words), {})[
                tuple(next_words)
            ] = written_form

        # Sort n_next index so longer phrases have priority
        self.phrase_index = {
            first_word: sorted(
                same_first_word.items(), key=lambda item: item[0], reverse=True
            )
            for first_word, same_first_word in phrase_index.items()
        }

    def replace(self, input_words: Sequence[str]) -> Sequence[str]:
        """Return a list of words with every indexed phrase replaced.

        The input is scanned left to right. At each position the longest
        indexed phrase starting there wins; its written form is appended as a
        single item and the scan resumes after the matched phrase. Words that
        start no matching phrase are copied through unchanged.
        """
        input_words = tuple(input_words)  # tuple to ensure hashability of slices
        output_words = []
        first_word_i = 0
        while first_word_i < len(input_words):
            first_word = input_words[first_word_i]
            next_word_i = first_word_i + 1
            # Could this word be the first of a phrase we should replace?
            for n_next, phrases_n_next in self.phrase_index.get(first_word, []):
                # Yes. Perhaps a phrase with n_next subsequent words?
                # Near the end of the input the slice is shorter than n_next,
                # so it simply fails the lookup below.
                continuation = input_words[next_word_i : next_word_i + n_next]
                if continuation in phrases_n_next:
                    # Found a match!
                    output_words.append(phrases_n_next[continuation])
                    first_word_i += 1 + n_next
                    break
            else:
                # No match, just add the word to the result
                output_words.append(first_word)
                first_word_i += 1
        return output_words

    # Wrapper used for testing.
    def replace_string(self, text: str) -> str:
        """Split `text` on whitespace, apply `replace`, and re-join with spaces."""
        return " ".join(self.replace(text.split()))


# Unit tests for PhraseReplacer
rep = PhraseReplacer()
rep.update(
    {
        "this": "foo",
        "that": "bar",
        "this is": "stopping early",
        "this is a test": "it worked!",
    }
)
assert rep.replace_string("gnork") == "gnork"
assert rep.replace_string("this") == "foo"
assert rep.replace_string("this that this") == "foo bar foo"
assert rep.replace_string("this is a test") == "it worked!"
assert rep.replace_string("well this is a test really") == "well it worked! really"
assert rep.replace_string("try this is too") == "try stopping early too"
assert rep.replace_string("this is a tricky one") == "stopping early a tricky one"

phrase_replacer = PhraseReplacer()


# phrases_to_replace is a spoken form -> written form map, used by our
# implementation of `dictate.replace_words` (at bottom of file) to rewrite words
# and phrases Talon recognized. This does not change the priority with which
# Talon recognizes particular phrases over others.
@track_csv_list(
    "words_to_replace.csv",
    headers=("Replacement", "Original"),
    default=_word_map_defaults,
)
def on_word_map(values):
    global phrases_to_replace
    phrases_to_replace = values
    phrase_replacer.update(values)

    # "dictate.word_map" is used by Talon's built-in default implementation of
    # `dictate.replace_words`, but supports only single-word replacements.
    # Multi-word phrases are ignored.
    ctx.settings["dictate.word_map"] = values


@ctx.action_class("dictate")
class OverwrittenActions:
    def replace_words(words: Sequence[str]) -> Sequence[str]:
        try:
            return phrase_replacer.replace(words)
        except Exception:
            # fall back to default implementation for error-robustness.
            # Only Exception is caught so KeyboardInterrupt/SystemExit still
            # propagate; logging.exception records the traceback.
            logging.exception("phrase replacer failed!")
            return actions.next(words)


def _create_vocabulary_entries(spoken_form, written_form, type):
    """Expands the provided spoken form and written form into multiple variants based on
    the provided type, which can be either "name" to add a possessive variant or "noun"
    to add plural. Any other type yields only the base entry.

    Returns a dictionary mapping each spoken form to its written form.
    """
    entries = {spoken_form: written_form}
    if type == "name":
        # Note that we use the spoken form without apostrophe because this seems to generally lead
        # to better recognition on Conformer b108.
        entries[f"{spoken_form}s"] = f"{written_form}'s"
    elif type == "noun":
        # Note that we simply append an "s", but we could use something more sophisticated like
        # https://github.com/jpvanhal/inflection. The downside is that this is less predictable,
        # and this feature is likely to be used in ways that are unlike common English prose, which
        # is already included in the lexicon. For example, made up identifiers used in programming.
        entries[f"{spoken_form}s"] = f"{written_form}s"
    return entries


# See https://github.com/wolfmanstout/talon-vocabulary-editor for an experimental version
# of this which tests if the default spoken form can be used instead of the provided phrase.
def _add_selection_to_file(
    phrase: Union[Phrase, str],
    type: str,
    file_name: str,
    file_contents: Optional[dict[str, str]],
    skip_identical_replacement: bool,
):
    """Add the currently selected text (plus variants for `type`) to `file_name`.

    Args:
    - phrase: spoken form to use; when empty, the spoken form is derived from
      the selected text (all-caps selections are spelled out letter by letter).
    - type: "name", "noun" or anything else; see `_create_vocabulary_entries`.
    - file_name: a "*.csv" settings file or "vocabulary.talon-list".
    - file_contents: the current spoken form -> written form entries of the
      file, used to avoid adding duplicates, or None when they are unknown.
    - skip_identical_replacement: when true, entries whose spoken and written
      forms are equal are not added.
    """
    written_form = actions.edit.selected_text().strip()
    if not written_form:
        # Without a selection we would append an empty/meaningless entry.
        actions.app.notify(f"No text selected; nothing added to {file_name}")
        return

    if phrase:
        spoken_form = " ".join(actions.dictate.parse_words(phrase))
    else:
        is_acronym = re.fullmatch(r"[A-Z]+", written_form)
        spoken_form = " ".join(written_form) if is_acronym else written_form
    entries = _create_vocabulary_entries(spoken_form, written_form, type)

    # until we add support for parsing or otherwise getting the active
    # vocabulary.talon-list, its contents are passed as None and the duplicate
    # check below has nothing to compare against. An empty dict is a real (but
    # empty) file, so the identical-replacement check still applies to it.
    known_entries = file_contents if file_contents is not None else {}
    new_entries = {}
    for entry_spoken, entry_written in entries.items():
        if skip_identical_replacement and entry_spoken == entry_written:
            actions.app.notify(f'Skipping identical replacement: "{entry_spoken}"')
        elif entry_spoken in known_entries:
            actions.app.notify(
                f'Spoken form "{entry_spoken}" is already in {file_name}'
            )
        else:
            new_entries[entry_spoken] = entry_written

    if not new_entries:
        # Everything was filtered out; don't touch the file needlessly.
        return

    if file_name.endswith(".csv"):
        append_to_csv(file_name, new_entries)
    elif file_name == "vocabulary.talon-list":
        append_to_vocabulary(new_entries)

    actions.app.notify(f"Added to {file_name}: {new_entries}")


def append_to_vocabulary(rows: dict[str, str]):
    """Append `rows` (spoken form -> written form) to the active vocabulary file.

    Entries whose spoken and written forms are equal are written as a bare
    word; all others as `spoken: <repr of written>`. A newline is inserted
    first if the file does not already end with one.
    """
    vocabulary_file_path = actions.user.get_vocabulary_file_path()

    # Read with the same encoding we write with, so non-ASCII vocabulary does
    # not depend on the platform's default encoding.
    with open(vocabulary_file_path, encoding="utf-8") as file:
        line = None
        for line in file:
            pass
        needs_newline = line is not None and not line.endswith("\n")

    with open(vocabulary_file_path, "a", encoding="utf-8") as file:
        if needs_newline:
            file.write("\n")
        for key, value in rows.items():
            if key == value:
                file.write(f"{key}\n")
            else:
                file.write(f"{key}: {value!r}\n")


@mod.action_class
class Actions:
    # this is implemented as an action so it may be overridden in other contexts
    def get_vocabulary_file_path():
        """Returns the path for the active vocabulary file"""
        vocabulary_directory = os.path.dirname(os.path.realpath(__file__))
        vocabulary_file_path = os.path.join(
            vocabulary_directory, "vocabulary.talon-list"
        )
        return vocabulary_file_path

    def add_selection_to_vocabulary(phrase: Union[Phrase, str] = "", type: str = ""):
        """Permanently adds the currently selected text to the vocabulary with the provided
        spoken form and adds variants based on the type ("noun" or "name").
        """
        _add_selection_to_file(
            phrase,
            type,
            "vocabulary.talon-list",
            None,
            False,
        )

    def add_selection_to_words_to_replace(phrase: Phrase, type: str = ""):
        """Permanently adds the currently selected text as replacement for the provided
        original form and adds variants based on the type ("noun" or "name").
        """
        _add_selection_to_file(
            phrase,
            type,
            "words_to_replace.csv",
            phrases_to_replace,
            True,
        )