diff --git a/main.py b/main.py index d372b05..d14d2ca 100644 --- a/main.py +++ b/main.py @@ -7,37 +7,43 @@ NPATH = os.environ["NLTK_DATA"] COMMON_WORDS = {} HOW_COMMON = 10 +# llm help to remove words numerical = { # Cardinals (1-19) "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", - # Tens (20-90) "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", - # Hundreds "hundred", - # Thousands "thousand", - # Ordinals (1st-19th) "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth", - # Tens ordinals (20th-90th) "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth", - # Hundredth/Thousandth "hundredth", "thousandth", - # Special cases - "dozen", "score", "gross" # Traditional counting units + "dozen", "score", "gross", # Traditional counting units } +names = {"martin", "ben", "john", "maxwell", "ruth", "charlotte", "ada", "dick", "timothy", "earl", "geneva"} + +difficult = { + "aerator", "transducer", "substrate", + "characteristic", "congruence", "secant", + "tetrachloride", "binomial", "thyroglobulin", "anode", "antigen", + "baroque", "muzzle", "anionic", + "tsh", "polynomial", "antibody", "gyro", "polymer", + "isotope", "barometer", "cathode", "electrode", +} + + def load_data(): nltk.download('brown', download_dir=NPATH) nltk.download('wordnet', download_dir=NPATH)