86 lines
No EOL
2.3 KiB
Python
86 lines
No EOL
2.3 KiB
Python
from typing import List, Optional
|
|
import itertools
|
|
from talon import registry
|
|
import re
|
|
|
|
from .extensions import file_extensions
|
|
from .numbers import digits_map
|
|
from .abbreviate import abbreviations
|
|
|
|
# TODO: 'Whats application': 'WhatsApp' (should we keep "whats app" as well?)
# TODO: 'V O X': 'VOX' (should we keep "VOX" as well?)
# These could be handled either by generating all alternatives for such terms,
# or by keeping a hard-coded list of terms that we want to handle specially.
|
|
|
|
# Default lower bound on the length of a term returned by create_spoken_forms.
DEFAULT_MINIMUM_TERM_LENGTH = 3

# A run of lowercase letters, optionally preceded by one capital ("word", "Word").
SMALL_WORD = r"[A-Z]?[a-z]+"
# TODO: We want "AXEvery" to be ["AX", "Every"]
# A run of capital letters ("VOX").
UPPERCASE_WORD = r"[A-Z]+"
# Regex alternation accepts the first alternative that matches, so the
# extensions are deduplicated and ordered longest-first; otherwise a short
# extension would shadow a longer one that starts with the same characters.
# Empty values are dropped because an empty alternative matches everywhere.
FILE_EXTENSIONS_REGEX = "|".join(
    re.escape(file_extension)
    for file_extension in sorted(
        {extension for extension in file_extensions.values() if extension},
        key=lambda extension: (-len(extension), extension),
    )
)
# A single decimal digit; each digit becomes its own piece.
DIGITS_REGEX = r"\d"
# Tokenizer used to split a source string into pieces. Alternatives are tried
# in this order: digit, file extension, small word, uppercase word.
FULL_REGEX = re.compile(
    "|".join(
        pattern
        for pattern in (
            DIGITS_REGEX,
            FILE_EXTENSIONS_REGEX,
            SMALL_WORD,
            UPPERCASE_WORD,
        )
        # Skip an empty sub-pattern (no file extensions configured): it would
        # match the empty string before the word patterns get a chance.
        if pattern
    )
)
|
|
|
|
# Inverted lookup table: each value of the abbreviation, file-extension and
# digit mappings becomes a key pointing back at its original key (digit values
# are stringified first). Sources are merged in the order listed, so on a key
# collision the later source wins.
# NOTE(review): presumably the originals map spoken form -> written form, which
# makes this written form -> spoken form; confirm against the source modules.
REVERSE_PRONUNCIATION_MAP = dict(
    itertools.chain(
        ((written, spoken) for spoken, written in abbreviations.items()),
        ((written, spoken) for spoken, written in file_extensions.items()),
        ((str(written), spoken) for spoken, written in digits_map.items()),
    )
)
|
|
|
|
|
|
def create_single_spoken_form(source: str):
    """Return the spoken form of a single piece of text.

    The lowercased piece is looked up in REVERSE_PRONUNCIATION_MAP; when it is
    not present, the piece itself is used unchanged. If the resulting string
    is entirely uppercase, its characters are separated by spaces
    (e.g. "VOX" -> "V O X").
    """
    spoken = REVERSE_PRONUNCIATION_MAP.get(source.lower(), source)
    # Space out all-caps strings character by character.
    return " ".join(spoken) if spoken.isupper() else spoken
|
|
|
|
|
|
def create_spoken_forms(
    source: str,
    words_to_exclude: Optional[List[str]] = None,
    minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
) -> List[str]:
    """Build the list of spoken forms that can refer to ``source``.

    ``source`` is split into pieces with FULL_REGEX and each piece is converted
    with create_single_spoken_form. The candidates are:

    * every individual word of the converted sequence,
    * every cumulative prefix of that sequence ("a", "a b", "a b c", ...),
    * ``source`` itself.

    Args:
        source: The text to create spoken forms for.
        words_to_exclude: Terms that must not appear in the result. ``None``
            means nothing is excluded.
        minimum_term_length: Terms with fewer characters than this are dropped.

    Returns:
        The unique candidate terms, stripped of surrounding whitespace, in
        first-seen order, excluding filtered terms.
    """
    if words_to_exclude is None:
        words_to_exclude = []

    spoken_pieces = [
        create_single_spoken_form(match.group(0))
        for match in FULL_REGEX.finditer(source)
    ]
    # A single piece may expand to several words, so join and re-split to get
    # a flat sequence of individual words.
    term_sequence = " ".join(spoken_pieces).split(" ")

    # Accumulating "word " strings yields every cumulative prefix of the
    # sequence; the trailing space is removed by the strip() below.
    candidates = (
        term_sequence
        + list(itertools.accumulate(f"{term} " for term in term_sequence))
        + [source]
    )

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set's iteration order for strings
    # varies between interpreter runs).
    unique_terms = dict.fromkeys(candidate.strip() for candidate in candidates)

    return [
        term
        for term in unique_terms
        if term not in words_to_exclude and len(term) >= minimum_term_length
    ]