import os

import nltk
from nltk.corpus import brown, wordnet as wn

NPATH = os.environ["NLTK_DATA"]
COMMON_WORDS = set()
HOW_COMMON = 10

# Compiled with LLM help: words to remove.
numerical = {
    # Cardinals (1-19)
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen",
    # Tens (20-90)
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    # Hundreds
    "hundred",
    # Thousands
    "thousand",
    # Ordinals (1st-19th)
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
    "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth",
    # Tens ordinals (20th-90th)
    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
    "eightieth", "ninetieth",
    # Hundredth/Thousandth
    "hundredth", "thousandth",
    # Special cases: traditional counting units
    "dozen", "score", "gross",
}

names = {
    "martin", "ben", "john", "maxwell", "ruth", "charlotte", "ada", "dick",
    "timothy", "earl", "geneva",
}

difficult = {
    "aerator", "transducer", "substrate", "characteristic", "congruence", "secant",
    "tetrachloride", "binomial", "thyroglobulin", "anode", "antigen", "baroque",
    "muzzle", "anionic", "tsh", "polynomial", "antibody", "gyro", "polymer",
    "isotope", "barometer", "cathode", "electrode",
}


def load_data():
    nltk.download('brown', download_dir=NPATH)
    nltk.download('wordnet', download_dir=NPATH)
    nltk.download('omw-1.4', download_dir=NPATH)

    # Build a frequency distribution from the Brown Corpus.
    freq_dist = nltk.FreqDist(word.lower() for word in brown.words())

    # Create a set of common words (adjust the HOW_COMMON threshold as needed).
    global COMMON_WORDS
    COMMON_WORDS = {word for word, count in freq_dist.items() if count >= HOW_COMMON}


def is_common(word):
    # The word must appear in the Brown Corpus with at least the minimal frequency.
    is_frequent = word in COMMON_WORDS
    # The word must also have at least one WordNet synset.
    synset_count = len(wn.synsets(word))
    return is_frequent and synset_count >= 1


def filter_common(words):
    return {word for word in words if is_common(word)}


def get_words():
    nouns = set()
    adjectives = set()

    # Iterate over all synsets in WordNet.
    for synset in wn.all_synsets():
        pos = synset.pos()
        for lemma in synset.lemmas():
            word = lemma.name().replace('_', ' ').lower()  # normalize word
            # No need for compound or abbreviated words.
            if "-" in word or " " in word or "'" in word or len(word) < 3 or "." in word:
                continue
            # Skip capitalized lemmas (mostly proper nouns).
            if lemma.name().istitle():
                continue
            # Nouns: keep only singular/uncountable base forms.
            if pos == 'n':
                # Use WordNet's morphy to get the base form.
                base_form = wn.morphy(word, pos='n')
                if word.endswith("ing"):
                    # Skip gerunds/abstract nouns: want "win", "twist",
                    # not "winning", "twisting".
                    continue
                if word.endswith("s") and not word.endswith("ss"):
                    # Skip likely plurals ("provisions", "feelings"),
                    # but keep words like "duchess".
                    continue
                if word in numerical:
                    # Skip number words.
                    continue
                # If the base form matches the word, it's singular/uncountable.
                if base_form == word:
                    nouns.add(word)
            # Adjectives, including satellite adjectives.
            elif pos in ('a', 's'):
                adjectives.add(word)

    # Filter using Brown Corpus frequency and synset count.
    nouns = filter_common(nouns)
    adjectives = filter_common(adjectives)
    return nouns, adjectives


def writefile(fname, data):
    with open(fname, "w") as lf:
        lf.write("\n".join(sorted(data)))  # sort for stable, diff-friendly output


if __name__ == "__main__":
    load_data()
    nouns, adjectives = get_words()
    writefile(f"nouns_{HOW_COMMON}.txt", nouns)
    writefile(f"adjectives_{HOW_COMMON}.txt", adjectives)
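
# A minimal usage sketch (the script name "build_wordlists.py" is hypothetical):
#   NLTK_DATA=/path/to/nltk_data python build_wordlists.py
# With HOW_COMMON = 10, this writes nouns_10.txt and adjectives_10.txt to the
# current directory, one word per line.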