279 lines
10 KiB
Python
279 lines
10 KiB
Python
import logging
|
|
import os
|
|
import re
|
|
from typing import Sequence, Union
|
|
|
|
from talon import Context, Module, actions
|
|
from talon.grammar import Phrase
|
|
|
|
from ..user_settings import append_to_csv, track_csv_list
|
|
|
|
# Talon registration objects used throughout this file: `mod` declares the
# list and actions below, `ctx` carries the settings/action overrides.
mod = Module()
ctx = Context()

# Declare the `vocabulary` list on the module.
mod.list(
    "vocabulary",
    desc="additional vocabulary words",
)
|
|
|
|
# Words that are written capitalized by default.
#
# DON'T EDIT THIS. Edit settings/words_to_replace.csv instead.
#
# The defaults here (and the other defaults in this file) are ONLY consulted
# when the matching settings/*.csv files are auto-created. Those csv files
# decide what ends up in user.vocabulary and dictate.word_map; once they
# exist, nothing below has any effect.
_capitalize_defaults = [
    # NB. the lexicon now capitalizes January/February by default, but not the
    # others below. Not sure why.
    "January",
    "February",
    # "March" is left out: it is an ordinary word as well.
    "April",
    # "May" is left out: it is an ordinary word as well.
    "June",
    "July",
    "August",  # technically also an adjective but the month is far more common
    "September",
    "October",
    "November",
    "December",
]

# Default spoken form -> written form remappings.
_word_map_defaults = {
    # E.g:
    # "cash": "cache",
    # This is the opposite ordering to words_to_replace.csv (the latter has the
    # target word first).
    #
    # Every capitalized default is reachable from its lower-case spelling.
    **{word.lower(): word for word in _capitalize_defaults},
}

# Current spoken form -> written form map; starts out empty.
phrases_to_replace = {}
|
|
|
|
|
|
class PhraseReplacer:
    """Utility for replacing phrases by other phrases inside text or word lists.

    Replacing longer phrases has priority.

    A new instance has no replacements. Call `update` with a dictionary
    mapping recognized/spoken forms to written forms to (re)build the index
    that `replace` consults.
    """

    def __init__(self):
        # Maps the first word of each spoken form to a list of
        # (n_next, {tuple of the n_next following words: written form}) pairs,
        # ordered by descending n_next.
        self.phrase_index = {}

    def update(self, phrase_dict: dict[str, str]):
        """Rebuild the index from `phrase_dict`, discarding the previous one.

        Args:
            phrase_dict: maps spoken forms (whitespace-separated words) to
                written forms. Entries whose spoken form contains no words
                are logged and skipped.
        """
        # Index phrases by first word, then number of subsequent words n_next
        phrase_index = {}
        for spoken_form, written_form in phrase_dict.items():
            words = spoken_form.split()
            if not words:
                logging.warning(
                    "Found empty spoken form for written form %s, ignored",
                    written_form,
                )
                continue
            first_word, n_next = words[0], len(words) - 1
            phrase_index.setdefault(first_word, {}).setdefault(n_next, {})[
                tuple(words[1:])
            ] = written_form

        # Sort n_next index so longer phrases have priority
        self.phrase_index = {
            first_word: sorted(same_first_word.items(), key=lambda x: -x[0])
            for first_word, same_first_word in phrase_index.items()
        }

    def replace(self, input_words: Sequence[str]) -> Sequence[str]:
        """Return `input_words` as a list with every indexed phrase replaced.

        The input is scanned left to right. At each position the longest
        indexed phrase starting there is replaced by its written form (added
        as a single output item) and scanning resumes after the phrase;
        words that start no matching phrase are copied unchanged.
        """
        input_words = tuple(input_words)  # tuple to ensure hashability of slices
        output_words = []
        first_word_i = 0
        while first_word_i < len(input_words):
            first_word = input_words[first_word_i]
            next_word_i = first_word_i + 1
            # Could this word be the first of a phrase we should replace?
            for n_next, phrases_n_next in self.phrase_index.get(first_word, []):
                # Yes. Perhaps a phrase with n_next subsequent words?
                # Near the end of the input the slice is shorter than n_next
                # and therefore cannot match.
                continuation = input_words[next_word_i : next_word_i + n_next]
                if continuation in phrases_n_next:
                    # Found a match!
                    output_words.append(phrases_n_next[continuation])
                    first_word_i += 1 + n_next
                    break
            else:
                # No match, just add the word to the result
                output_words.append(first_word)
                first_word_i += 1
        return output_words

    # Wrapper used for testing.
    def replace_string(self, text: str) -> str:
        """Split `text` on whitespace, run `replace`, and re-join with spaces."""
        return " ".join(self.replace(text.split()))
|
|
|
|
|
|
# Unit tests for PhraseReplacer, executed when this file is loaded.
rep = PhraseReplacer()
rep.update(
    {
        "this": "foo",
        "that": "bar",
        "this is": "stopping early",
        "this is a test": "it worked!",
    }
)
# Each key is an input string, each value the expected replacement result.
assert all(
    rep.replace_string(text) == expected
    for text, expected in {
        "gnork": "gnork",
        "this": "foo",
        "this that this": "foo bar foo",
        "this is a test": "it worked!",
        "well this is a test really": "well it worked! really",
        "try this is too": "try stopping early too",
        "this is a tricky one": "stopping early a tricky one",
    }.items()
)
|
|
|
|
# Shared replacer used by the `dictate.replace_words` override in this file.
phrase_replacer = PhraseReplacer()


# `phrases_to_replace` maps spoken forms to written forms. Our implementation
# of `dictate.replace_words` (at bottom of file) uses it to rewrite the words
# and phrases Talon recognized. It has no influence on the priority with
# which Talon recognizes particular phrases over others.
@track_csv_list(
    "words_to_replace.csv",
    headers=("Replacement", "Original"),
    default=_word_map_defaults,
)
def on_word_map(values):
    """Install `values` as the active replacement map.

    Registered through `track_csv_list` for words_to_replace.csv.
    NOTE(review): presumably invoked with the spoken -> written mapping each
    time that file is (re)loaded - confirm against `track_csv_list`.
    """
    global phrases_to_replace
    phrases_to_replace = values
    phrase_replacer.update(phrases_to_replace)

    # Talon's built-in default implementation of `dictate.replace_words` reads
    # "dictate.word_map". It only supports single-word replacements, so
    # multi-word phrases are ignored there.
    ctx.settings["dictate.word_map"] = phrases_to_replace
|
|
|
|
|
|
# Context implementation of the `dictate` actions: routes word replacement
# through the module-level `phrase_replacer`.
@ctx.action_class("dictate")
class OverwrittenActions:
    def replace_words(words: Sequence[str]) -> Sequence[str]:
        try:
            return phrase_replacer.replace(words)
        except Exception:
            # Fall back to the default implementation for error-robustness.
            # `except Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt/SystemExit propagate, and `logging.exception`
            # records the traceback alongside the message.
            logging.exception("phrase replacer failed!")
            return actions.next(words)
|
|
|
|
|
|
def _create_vocabulary_entries(spoken_form, written_form, type):
    """Build the spoken form -> written form entries for one vocabulary item.

    The result always contains the pair itself. Depending on `type`, one
    extra variant is added:

    - "name": a possessive variant (spoken "<spoken>s", written "<written>'s")
    - "noun": a plural variant (spoken "<spoken>s", written "<written>s")

    Any other `type` yields just the single pair.
    """
    if type == "name":
        # The spoken form deliberately has no apostrophe because this seems to
        # generally lead to better recognition on Conformer b108.
        written_suffix = "'s"
    elif type == "noun":
        # Simply append an "s". Something more sophisticated such as
        # https://github.com/jpvanhal/inflection could be used, but it is less
        # predictable, and this feature is likely to be used in ways that are
        # unlike common English prose (which is already included in the
        # lexicon) - for example, made up identifiers used in programming.
        written_suffix = "s"
    else:
        return {spoken_form: written_form}

    return {
        spoken_form: written_form,
        f"{spoken_form}s": f"{written_form}{written_suffix}",
    }
|
|
|
|
|
|
# See https://github.com/wolfmanstout/talon-vocabulary-editor for an experimental version
|
|
# of this which tests if the default spoken form can be used instead of the provided phrase.
|
|
def _add_selection_to_file(
    phrase: Union[Phrase, str],
    type: str,
    file_name: str,
    file_contents: Union[dict[str, str], None],
    skip_identical_replacement: bool,
):
    """Add the currently selected text (and its variants) to `file_name`.

    Args:
        phrase: spoken form for the selection. When empty, the spoken form is
            derived from the selected text itself (an all-caps selection is
            spelled out letter by letter).
        type: forwarded to `_create_vocabulary_entries` ("name", "noun" or
            anything else for no extra variant).
        file_name: target file; names ending in ".csv" go through
            `append_to_csv`, "vocabulary.talon-list" goes through
            `append_to_vocabulary`.
        file_contents: spoken form -> written form map of what the file
            already holds, or None when that is not known. None disables the
            duplicate and identical-replacement checks; an empty dict does
            not.
        skip_identical_replacement: when true, entries whose spoken and
            written forms are equal are not added.
    """
    written_form = actions.edit.selected_text().strip()
    if phrase:
        spoken_form = " ".join(actions.dictate.parse_words(phrase))
    else:
        is_acronym = re.fullmatch(r"[A-Z]+", written_form)
        spoken_form = " ".join(written_form) if is_acronym else written_form
    entries = _create_vocabulary_entries(spoken_form, written_form, type)

    # until we add support for parsing or otherwise getting the active
    # vocabulary.talon-list, skip the logic for checking for duplicates etc
    if file_contents is None:
        new_entries = entries
    else:
        # Compare against None rather than truthiness: an empty (but known)
        # file must still honour skip_identical_replacement.
        new_entries = {}
        for entry_spoken, entry_written in entries.items():
            if skip_identical_replacement and entry_spoken == entry_written:
                actions.app.notify(f'Skipping identical replacement: "{entry_spoken}"')
            elif entry_spoken in file_contents:
                actions.app.notify(
                    f'Spoken form "{entry_spoken}" is already in {file_name}'
                )
            else:
                new_entries[entry_spoken] = entry_written

    if not new_entries:
        # Every candidate was rejected above; leave the file untouched.
        return

    if file_name.endswith(".csv"):
        append_to_csv(file_name, new_entries)
    elif file_name == "vocabulary.talon-list":
        append_to_vocabulary(new_entries)

    actions.app.notify(f"Added to {file_name}: {new_entries}")
|
|
|
|
|
|
def append_to_vocabulary(rows: dict[str, str]):
    """Append `rows` to the file returned by `user.get_vocabulary_file_path`.

    Args:
        rows: spoken form -> written form. A row whose two forms are equal is
            written as the bare word; any other row is written as
            "<spoken>: <repr of written>".

    The file must already exist: it is first read to find out whether its
    last line ends with a newline, and one is added before the new rows if
    it does not.
    """
    vocabulary_file_path = actions.user.get_vocabulary_file_path()

    # Read with the same encoding used for appending below, instead of the
    # platform default, so non-ASCII content does not break the read pass.
    with open(vocabulary_file_path, encoding="utf-8") as file:
        last_line = None
        for last_line in file:
            pass
    needs_newline = last_line is not None and not last_line.endswith("\n")

    with open(vocabulary_file_path, "a", encoding="utf-8") as file:
        if needs_newline:
            file.write("\n")
        for key, value in rows.items():
            if key == value:
                file.write(f"{key}\n")
            else:
                file.write(f"{key}: {value!r}\n")
|
|
|
|
|
|
# NOTE(review): signatures and docstrings below are kept verbatim - Talon may
# surface action docstrings to users; confirm before rewording them.
@mod.action_class
class Actions:
    # this is implemented as an action so it may be overridden in other contexts
    def get_vocabulary_file_path():
        """Returns the path for the active vocabulary file"""
        # The list file sits in the same directory as this (symlink-resolved)
        # source file.
        this_directory = os.path.dirname(os.path.realpath(__file__))
        return os.path.join(this_directory, "vocabulary.talon-list")

    def add_selection_to_vocabulary(phrase: Union[Phrase, str] = "", type: str = ""):
        """Permanently adds the currently selected text to the vocabulary with the provided
        spoken form and adds variants based on the type ("noun" or "name").
        """
        # No known file contents are passed, and identical spoken/written
        # forms are allowed.
        _add_selection_to_file(
            phrase=phrase,
            type=type,
            file_name="vocabulary.talon-list",
            file_contents=None,
            skip_identical_replacement=False,
        )

    def add_selection_to_words_to_replace(phrase: Phrase, type: str = ""):
        """Permanently adds the currently selected text as replacement for the provided
        original form and adds variants based on the type ("noun" or "name").
        """
        # Checked against the currently loaded replacement map; a replacement
        # identical to its original is skipped.
        _add_selection_to_file(
            phrase=phrase,
            type=type,
            file_name="words_to_replace.csv",
            file_contents=phrases_to_replace,
            skip_identical_replacement=True,
        )
|