Feat: more blacklists
This commit is contained in:
22
main.py
22
main.py
@ -7,37 +7,43 @@ NPATH = os.environ["NLTK_DATA"]
|
||||
# Module-level mapping that starts out empty.
# NOTE(review): presumably filled at runtime by code outside this excerpt — confirm.
COMMON_WORDS = {}
|
||||
# Numeric tuning constant.
# NOTE(review): presumably the cutoff for how frequent a word must be to count
# as "common"; its use is not visible in this excerpt — confirm.
HOW_COMMON = 10
|
||||
|
||||
# Word blacklists compiled with LLM help; these words are to be removed.
|
||||
# Spelled-out number words: cardinals, ordinals and traditional counting units.
# Every row ends with a trailing comma so that adjacent string literals can
# never be fused by implicit concatenation (e.g. "gross" "dozen").
numerical = {
    # Cardinals (0-19)
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen",

    # Tens (20-90)
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",

    # Hundreds
    "hundred",

    # Thousands
    "thousand",

    # Ordinals (1st-19th)
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
    "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth",

    # Tens ordinals (20th-90th)
    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
    "eightieth", "ninetieth",

    # Hundredth/Thousandth
    "hundredth", "thousandth",

    # Special cases
    "dozen", "score", "gross",  # Traditional counting units
}
|
||||
|
||||
# Given names, all lowercase. Several of them double as ordinary words or
# place names (e.g. "earl", "geneva").
names = {
    "martin",
    "ben",
    "john",
    "maxwell",
    "ruth",
    "charlotte",
    "ada",
    "dick",
    "timothy",
    "earl",
    "geneva",
}
|
||||
|
||||
# Specialist / technical vocabulary, all lowercase.
# NOTE(review): the name suggests these are excluded as too hard; how the set
# is applied is not visible in this excerpt — confirm.
difficult = {
    "aerator", "transducer", "substrate",
    "characteristic", "congruence", "secant",
    "tetrachloride", "binomial", "thyroglobulin", "anode", "antigen",
    "baroque", "muzzle", "anionic",
    "tsh", "polynomial", "antibody", "gyro", "polymer",
    "isotope", "barometer", "cathode", "electrode",
}
|
||||
|
||||
|
||||
def load_data():
|
||||
nltk.download('brown', download_dir=NPATH)
|
||||
nltk.download('wordnet', download_dir=NPATH)
|
||||
|
Reference in New Issue
Block a user