Feat: filter out numerical

This commit is contained in:
Grail Finder
2025-06-19 08:41:21 +03:00
parent 7dea19dcad
commit 09fbc53f65
3 changed files with 5429 additions and 5619 deletions

File diff suppressed because it is too large Load Diff

33
main.py
View File

@ -7,6 +7,37 @@ NPATH = os.environ["NLTK_DATA"]
COMMON_WORDS = {}
HOW_COMMON = 10
# Spelled-out number words that the noun filter treats as non-playable.
# Each group is written as one whitespace-separated string and unpacked
# into a single set so membership checks stay O(1).
numerical = {
    # Cardinal numbers 0-19
    *"zero one two three four five six seven eight nine".split(),
    *"ten eleven twelve thirteen fourteen fifteen sixteen".split(),
    *"seventeen eighteen nineteen".split(),
    # Multiples of ten, 20-90
    *"twenty thirty forty fifty sixty seventy eighty ninety".split(),
    # Larger magnitudes
    *"hundred thousand".split(),
    # Ordinal numbers 1st-19th
    *"first second third fourth fifth sixth seventh eighth ninth".split(),
    *"tenth eleventh twelfth thirteenth fourteenth fifteenth".split(),
    *"sixteenth seventeenth eighteenth nineteenth".split(),
    # Ordinal multiples of ten, 20th-90th
    *"twentieth thirtieth fortieth fiftieth".split(),
    *"sixtieth seventieth eightieth ninetieth".split(),
    # Ordinal magnitudes
    *"hundredth thousandth".split(),
    # Traditional counting units
    *"dozen score gross".split(),
}
def load_data():
nltk.download('brown', download_dir=NPATH)
nltk.download('wordnet', download_dir=NPATH)
@ -49,6 +80,8 @@ def get_words():
continue # winning, twisting; only want win, twist or feelings
if word.endswith("s") and not word.endswith("ss"):
continue # leave duchess but skip provisions
if word in numerical: # skip numbers
continue
# If base form matches the word, it's singular/uncountable
if base_form == word:
nouns.add(word)

File diff suppressed because it is too large Load Diff