Feat: more blacklists

This commit is contained in:
Grail Finder
2025-06-19 09:12:42 +03:00
parent 09fbc53f65
commit 067ad1821c

22
main.py
View File

@ -7,37 +7,43 @@ NPATH = os.environ["NLTK_DATA"]
COMMON_WORDS = {} COMMON_WORDS = {}
HOW_COMMON = 10 HOW_COMMON = 10
# Words to remove (lists compiled with LLM help)
numerical = { numerical = {
# Cardinals (1-19) # Cardinals (1-19)
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
"ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
"seventeen", "eighteen", "nineteen", "seventeen", "eighteen", "nineteen",
# Tens (20-90) # Tens (20-90)
"twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
# Hundreds # Hundreds
"hundred", "hundred",
# Thousands # Thousands
"thousand", "thousand",
# Ordinals (1st-19th) # Ordinals (1st-19th)
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
"ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
"sixteenth", "seventeenth", "eighteenth", "nineteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
# Tens ordinals (20th-90th) # Tens ordinals (20th-90th)
"twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
"eightieth", "ninetieth", "eightieth", "ninetieth",
# Hundredth/Thousandth # Hundredth/Thousandth
"hundredth", "thousandth", "hundredth", "thousandth",
# Special cases # Special cases
"dozen", "score", "gross" # Traditional counting units "dozen", "score", "gross", # Traditional counting units
} }
# Lowercase name words. Presumably a blacklist applied when picking words;
# the code that consumes this set is outside this view.
names = {
    "ada",
    "ben",
    "charlotte",
    "dick",
    "earl",
    "geneva",
    "john",
    "martin",
    "maxwell",
    "ruth",
    "timothy",
}

# Specialised / technical vocabulary. Presumably blacklisted as too hard;
# the code that consumes this set is outside this view.
difficult = {
    "aerator",
    "anionic",
    "anode",
    "antibody",
    "antigen",
    "barometer",
    "baroque",
    "binomial",
    "cathode",
    "characteristic",
    "congruence",
    "electrode",
    "gyro",
    "isotope",
    "muzzle",
    "polymer",
    "polynomial",
    "secant",
    "substrate",
    "tetrachloride",
    "thyroglobulin",
    "transducer",
    "tsh",
}
def load_data(): def load_data():
nltk.download('brown', download_dir=NPATH) nltk.download('brown', download_dir=NPATH)
nltk.download('wordnet', download_dir=NPATH) nltk.download('wordnet', download_dir=NPATH)