Feat: filter out numerical words
This commit is contained in:
4046
adjectives_10.txt
4046
adjectives_10.txt
File diff suppressed because it is too large
Load Diff
33
main.py
33
main.py
@ -7,6 +7,37 @@ NPATH = os.environ["NLTK_DATA"]
|
|||||||
COMMON_WORDS = {}
|
COMMON_WORDS = {}
|
||||||
HOW_COMMON = 10
|
HOW_COMMON = 10
|
||||||
|
|
||||||
|
numerical = {
|
||||||
|
# Cardinals (0-19)
|
||||||
|
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
|
||||||
|
"ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
|
||||||
|
"seventeen", "eighteen", "nineteen",
|
||||||
|
|
||||||
|
# Tens (20-90)
|
||||||
|
"twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
|
||||||
|
|
||||||
|
# Hundreds
|
||||||
|
"hundred",
|
||||||
|
|
||||||
|
# Thousands
|
||||||
|
"thousand",
|
||||||
|
|
||||||
|
# Ordinals (1st-19th)
|
||||||
|
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
|
||||||
|
"ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
|
||||||
|
"sixteenth", "seventeenth", "eighteenth", "nineteenth",
|
||||||
|
|
||||||
|
# Tens ordinals (20th-90th)
|
||||||
|
"twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
|
||||||
|
"eightieth", "ninetieth",
|
||||||
|
|
||||||
|
# Hundredth/Thousandth
|
||||||
|
"hundredth", "thousandth",
|
||||||
|
|
||||||
|
# Special cases
|
||||||
|
"dozen", "score", "gross" # Traditional counting units
|
||||||
|
}
|
||||||
|
|
||||||
def load_data():
|
def load_data():
|
||||||
nltk.download('brown', download_dir=NPATH)
|
nltk.download('brown', download_dir=NPATH)
|
||||||
nltk.download('wordnet', download_dir=NPATH)
|
nltk.download('wordnet', download_dir=NPATH)
|
||||||
@ -49,6 +80,8 @@ def get_words():
|
|||||||
continue # winning, twisting; only want win, twist or feelings
|
continue # winning, twisting; only want win, twist or feelings
|
||||||
if word.endswith("s") and not word.endswith("ss"):
|
if word.endswith("s") and not word.endswith("ss"):
|
||||||
continue # leave duchess but skip provisions
|
continue # leave duchess but skip provisions
|
||||||
|
if word in numerical: # skip numbers
|
||||||
|
continue
|
||||||
# If base form matches the word, it's singular/uncountable
|
# If base form matches the word, it's singular/uncountable
|
||||||
if base_form == word:
|
if base_form == word:
|
||||||
nouns.add(word)
|
nouns.add(word)
|
||||||
|
6969
nouns_10.txt
6969
nouns_10.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user