Feat: filter out numerical

2025-06-19 08:41:21 +03:00
parent 7dea19dcad
commit 09fbc53f65
3 changed files with 5429 additions and 5619 deletions
--- a/adjectives_10.txt
+++ b/adjectives_10.txt
--- a/main.py
+++ b/main.py
@@ -7,6 +7,37 @@ NPATH = os.environ["NLTK_DATA"]
 COMMON_WORDS = {}
 HOW_COMMON = 10

+numerical = {  # lowercase number words; the `word in numerical` check skips these before a word is added to nouns
+    # Cardinals (0-19)
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
+    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
+    "seventeen", "eighteen", "nineteen",
+    
+    # Tens (20-90)
+    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
+    
+    # Hundreds
+    "hundred",
+    
+    # Thousands
+    "thousand",
+    
+    # Ordinals (1st-19th)
+    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
+    "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
+    "sixteenth", "seventeenth", "eighteenth", "nineteenth",
+    
+    # Tens ordinals (20th-90th)
+    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
+    "eightieth", "ninetieth",
+    
+    # Hundredth/Thousandth
+    "hundredth", "thousandth",
+    
+    # Special cases -- NOTE(review): "second" (above), "score" and "gross" are also everyday nouns and get filtered too; confirm intended
+    "dozen", "score", "gross"  # Traditional counting units
+}
+
 def load_data():
    nltk.download('brown', download_dir=NPATH)
    nltk.download('wordnet', download_dir=NPATH)
@@ -49,6 +80,8 @@ def get_words():
                     continue # winning, twisting; only want win, twist or feelings
                if word.endswith("s") and not word.endswith("ss"):
                     continue # leave duchess but skip provisions
+                if word in numerical: # skip numbers
+                    continue
                # If base form matches the word, it's singular/uncountable
                if base_form == word:
                    nouns.add(word)
--- a/nouns_10.txt
+++ b/nouns_10.txt