Feat: filter out numerical
This commit is contained in:
4046
adjectives_10.txt
4046
adjectives_10.txt
File diff suppressed because it is too large
Load Diff
33
main.py
33
main.py
@ -7,6 +7,37 @@ NPATH = os.environ["NLTK_DATA"]
|
||||
COMMON_WORDS = {}
|
||||
HOW_COMMON = 10
|
||||
|
||||
numerical = {
|
||||
# Cardinals (0-19)
|
||||
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
|
||||
"ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
|
||||
"seventeen", "eighteen", "nineteen",
|
||||
|
||||
# Tens (20-90)
|
||||
"twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
|
||||
|
||||
# Hundreds
|
||||
"hundred",
|
||||
|
||||
# Thousands
|
||||
"thousand",
|
||||
|
||||
# Ordinals (1st-19th)
|
||||
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
|
||||
"ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
|
||||
"sixteenth", "seventeenth", "eighteenth", "nineteenth",
|
||||
|
||||
# Tens ordinals (20th-90th)
|
||||
"twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
|
||||
"eightieth", "ninetieth",
|
||||
|
||||
# Hundredth/Thousandth
|
||||
"hundredth", "thousandth",
|
||||
|
||||
# Special cases
|
||||
"dozen", "score", "gross" # Traditional counting units
|
||||
}
|
||||
|
||||
def load_data():
|
||||
nltk.download('brown', download_dir=NPATH)
|
||||
nltk.download('wordnet', download_dir=NPATH)
|
||||
@ -49,6 +80,8 @@ def get_words():
|
||||
continue # winning, twisting; only want win, twist or feelings
|
||||
if word.endswith("s") and not word.endswith("ss"):
|
||||
continue # leave duchess but skip provisions
|
||||
if word in numerical: # skip numbers
|
||||
continue
|
||||
# If base form matches the word, it's singular/uncountable
|
||||
if base_form == word:
|
||||
nouns.add(word)
|
||||
|
6969
nouns_10.txt
6969
nouns_10.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user