dotfiles/talon/user/community/core/numbers/numbers.py

273 lines
10 KiB
Python

import math
from typing import Iterator, Union
from talon import Context, Module
mod = Module()
ctx = Context()
digit_list = "zero one two three four five six seven eight nine".split()
teens = "ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen".split()
tens = "twenty thirty forty fifty sixty seventy eighty ninety".split()
scales = "hundred thousand million billion trillion quadrillion quintillion sextillion septillion octillion nonillion decillion".split()
digits_map = {n: i for i, n in enumerate(digit_list)}
digits_map["oh"] = 0
teens_map = {n: i + 10 for i, n in enumerate(teens)}
tens_map = {n: 10 * (i + 2) for i, n in enumerate(tens)}
scales_map = {n: 10 ** (3 * (i + 1)) for i, n in enumerate(scales[1:])}
scales_map["hundred"] = 100
# Maps number words to integers values that are used to compute numeric values.
numbers_map = digits_map.copy()
numbers_map.update(teens_map)
numbers_map.update(tens_map)
numbers_map.update(scales_map)
def get_spoken_form_under_one_hundred(
start,
end,
*,
include_oh_variant_for_single_digits=False,
include_default_variant_for_single_digits=False,
include_double_digits=False,
):
"""Helper function to get dictionary of spoken forms for non-negative numbers in the range [start, end] under 100"""
result = {}
for value in range(start, end + 1):
digit_index = value % 10
if value < 10:
# oh prefix digit: "oh five"-> `05`
if include_oh_variant_for_single_digits:
result[f"oh {digit_list[digit_index]}"] = f"0{value}"
# default digit: "five" -> `5`
if include_default_variant_for_single_digits:
result[f"{digit_list[digit_index]}"] = f"{value}"
elif value < 20:
teens_index = value - 10
result[f"{teens[teens_index]}"] = f"{value}"
elif value < 100:
tens_index = math.floor(value / 10) - 2
if digit_index > 0:
spoken_form = f"{tens[tens_index]} {digit_list[digit_index]}"
else:
spoken_form = f"{tens[tens_index]}"
result[spoken_form] = f"{value}"
else:
raise ValueError(f"Value {value} is not in the range [0, 100)")
# double digits: "five one" -> `51`
if include_double_digits and value > 9:
tens_index = math.floor(value / 10)
spoken_form = f"{digit_list[tens_index]} {digit_list[digit_index]}"
result[spoken_form] = f"{value}"
return result
def parse_number(l: list[str]) -> str:
"""Parses a list of words into a number/digit string."""
l = list(scan_small_numbers(l))
for scale in scales:
l = parse_scale(scale, l)
return "".join(str(n) for n in l)
def scan_small_numbers(l: list[str]) -> Iterator[Union[str, int]]:
"""
Takes a list of number words, yields a generator of mixed numbers & strings.
Translates small number terms (<100) into corresponding numbers.
Drops all occurrences of "and".
Smashes digits onto tens words, eg. ["twenty", "one"] -> [21].
But note that "ten" and "zero" are excluded, ie:
["ten", "three"] -> [10, 3]
["fifty", "zero"] -> [50, 0]
Does nothing to scale words ("hundred", "thousand", "million", etc).
"""
# reversed so that repeated pop() visits in left-to-right order
l = [x for x in reversed(l) if x != "and"]
while l:
n = l.pop()
# fuse tens onto digits, eg. "twenty", "one" -> 21
if n in tens_map and l and digits_map.get(l[-1], 0) != 0:
d = l.pop()
yield numbers_map[n] + numbers_map[d]
# turn small number terms into corresponding numbers
elif n not in scales_map:
yield numbers_map[n]
else:
yield n
def parse_scale(scale: str, l: list[Union[str, int]]) -> list[Union[str, int]]:
"""Parses a list of mixed numbers & strings for occurrences of the following
pattern:
<multiplier> <scale> <remainder>
where <scale> is a scale word like "hundred", "thousand", "million", etc and
multiplier and remainder are numbers or strings of numbers of the
appropriate size. For example:
parse_scale("hundred", [1, "hundred", 2]) -> [102]
parse_scale("thousand", [12, "thousand", 3, 45]) -> [12345]
We assume that all scales of lower magnitude have already been parsed; don't
call parse_scale("thousand") until you've called parse_scale("hundred").
"""
scale_value = scales_map[scale]
scale_digits = len(str(scale_value))
# Split the list on the desired scale word, then parse from left to right.
left, *splits = split_list(scale, l)
for right in splits:
# (1) Figure out the multiplier by looking to the left of the scale
# word. We ignore non-integers because they are scale words that we
# haven't processed yet; this strategy means that "thousand hundred"
# gets parsed as 1,100 instead of 100,000, but "hundred thousand" is
# parsed correctly as 100,000.
before = 1 # default multiplier
if left and isinstance(left[-1], int) and left[-1] != 0:
before = left.pop()
# (2) Absorb numbers to the right, eg. in [1, "thousand", 1, 26], "1
# thousand" absorbs ["1", "26"] to make 1,126. We pull numbers off
# `right` until we fill up the desired number of digits.
after = ""
while right and isinstance(right[0], int):
next = after + str(right[0])
if len(next) >= scale_digits:
break
after = next
right.pop(0)
after = int(after) if after else 0
# (3) Push the parsed number into place, append whatever was left
# unparsed, and continue.
left.append(before * scale_value + after)
left.extend(right)
return left
def split_list(value, l: list) -> Iterator:
"""Splits a list by occurrences of a given value."""
start = 0
while True:
try:
i = l.index(value, start)
except ValueError:
break
yield l[start:i]
start = i + 1
yield l[start:]
# # ---------- TESTS (uncomment to run) ----------
# def test_number(expected, string):
# print('testing:', string)
# l = list(scan_small_numbers(string.split()))
# print(" scan --->", l)
# for scale in scales:
# old = l
# l = parse_scale(scale, l)
# if scale in old: print(" parse -->", l)
# else: assert old == l, "parse_scale should do nothing if the scale does not occur in the list"
# result = "".join(str(n) for n in l)
# assert result == parse_number(string.split())
# assert str(expected) == result, f"parsing {string!r}, expected {expected}, got {result}"
# test_number(105000, "one hundred and five thousand")
# test_number(1000000, "one thousand thousand")
# test_number(1501000, "one million five hundred one thousand")
# test_number(1501106, "one million five hundred and one thousand one hundred and six")
# test_number(123, "one two three")
# test_number(123, "one twenty three")
# test_number(104, "ten four") # borderline, but valid in some dialects
# test_number(1066, "ten sixty six") # a common way of saying years
# test_number(1906, "nineteen oh six") # year
# test_number(2001, "twenty oh one") # year
# test_number(2020, "twenty twenty")
# test_number(1001, "one thousand one")
# test_number(1010, "one thousand ten")
# test_number(123456, "one hundred and twenty three thousand and four hundred and fifty six")
# test_number(123456, "one twenty three thousand four fifty six")
# ## failing (and somewhat debatable) tests from old numbers.py
# #test_number(10000011, "one million one one")
# #test_number(100001010, "one million ten ten")
# #test_number(1050006000, "one hundred thousand and five thousand and six thousand")
# ---------- CAPTURES ----------
alt_digits = "(" + "|".join(digits_map.keys()) + ")"
alt_teens = "(" + "|".join(teens_map.keys()) + ")"
alt_tens = "(" + "|".join(tens_map.keys()) + ")"
alt_scales = "(" + "|".join(scales_map.keys()) + ")"
number_word = "(" + "|".join(numbers_map.keys()) + ")"
# don't allow numbers to start with scale words like "hundred", "thousand", etc
leading_words = numbers_map.keys() - scales_map.keys()
leading_words -= {"oh", "o"} # comment out to enable bare/initial "oh"
number_word_leading = f"({'|'.join(leading_words)})"
mod.list("number_small", "List of small (0-99) numbers")
mod.tag("unprefixed_numbers", desc="Dont require prefix when saying a number")
ctx.lists["user.number_small"] = get_spoken_form_under_one_hundred(
0,
99,
include_default_variant_for_single_digits=True,
include_double_digits=True,
)
# TODO: allow things like "double eight" for 88
@ctx.capture("digit_string", rule=f"({alt_digits} | {alt_teens} | {alt_tens})+")
def digit_string(m) -> str:
return parse_number(list(m))
@ctx.capture("digits", rule="<digit_string>")
def digits(m) -> int:
"""Parses a phrase representing a digit sequence, returning it as an integer."""
return int(m.digit_string)
@mod.capture(rule=f"{number_word_leading} ([and] {number_word})*")
def number_string(m) -> str:
"""Parses a number phrase, returning that number as a string."""
return parse_number(list(m))
@mod.capture(rule="<user.number_string> ((point | dot) <user.number_string>)+")
def number_decimal_string(m) -> str:
"""Parses a decimal number phrase, returning that number as a string."""
return ".".join(m.number_string_list)
@ctx.capture("number", rule="<user.number_string>")
def number(m) -> int:
"""Parses a number phrase, returning it as an integer."""
return int(m.number_string)
@ctx.capture("number_signed", rule=f"[negative|minus] <number>")
def number_signed(m):
number = m[-1]
return -number if (m[0] in ["negative", "minus"]) else number
@ctx.capture("number_small", rule="{user.number_small}")
def number_small(m) -> int:
return int(m.number_small)
@mod.capture(rule=f"[negative|minus] <number_small>")
def number_signed_small(m) -> int:
"""Parses an integer between -99 and 99."""
number = m[-1]
return -number if (m[0] in ["negative", "minus"]) else number