"""Generate lists of common English nouns and adjectives.

Candidate words are taken from WordNet lemmas and kept only when they also
occur at least ``MIN_BROWN_COUNT`` times in the Brown Corpus. The results are
written to ``nouns.txt`` and ``adjectives.txt`` in the current directory.
"""

import os
from collections import defaultdict

import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet as wn

# Directory the NLTK resources are downloaded into. The lookup is deliberately
# strict: importing this module raises KeyError when NLTK_DATA is not set.
NPATH = os.environ["NLTK_DATA"]

# Minimum number of Brown Corpus occurrences for a word to count as common.
MIN_BROWN_COUNT = 5

# Lower-cased Brown Corpus words seen at least MIN_BROWN_COUNT times.
# Empty until load_data() has been called.
COMMON_WORDS = set()

# Characters that mark a lemma as a compound, contraction or abbreviation.
_EXCLUDED_CHARS = ("-", " ", "'", ".")


def load_data():
    """Download the required NLTK resources and populate ``COMMON_WORDS``.

    Downloads the Brown Corpus, WordNet and the Open Multilingual WordNet
    data into ``NPATH``, then rebinds the module-level ``COMMON_WORDS`` to the
    set of lower-cased Brown Corpus words whose frequency is at least
    ``MIN_BROWN_COUNT``.
    """
    global COMMON_WORDS

    for resource in ("brown", "wordnet", "omw-1.4"):
        nltk.download(resource, download_dir=NPATH)

    # Frequency distribution over the lower-cased Brown Corpus tokens.
    freq_dist = nltk.FreqDist(word.lower() for word in brown.words())

    COMMON_WORDS = {
        word for word, count in freq_dist.items() if count >= MIN_BROWN_COUNT
    }


def is_common(word):
    """Return True if *word* is frequent in Brown and known to WordNet.

    A word qualifies when it is in ``COMMON_WORDS`` and WordNet has at least
    one synset for it.
    """
    # Cheap set membership first, so the WordNet lookup only runs for words
    # that already passed the frequency test.
    if word not in COMMON_WORDS:
        return False
    return len(wn.synsets(word)) >= 1


def filter_common(words):
    """Return the subset of *words* for which ``is_common`` is true, as a set."""
    return {word for word in words if is_common(word)}


def _is_simple_word(word):
    """Return True for words of 3+ characters without compound/abbreviation marks."""
    if len(word) < 3:
        return False
    return not any(char in word for char in _EXCLUDED_CHARS)


def get_words():
    """Collect common single-word nouns and adjectives from WordNet.

    Returns:
        A ``(nouns, adjectives)`` tuple of sets of lower-cased words. Nouns
        are kept only when WordNet's ``morphy`` maps them to themselves;
        adjectives include satellite adjectives. Both sets are passed through
        ``filter_common`` before being returned.
    """
    nouns = set()
    adjectives = set()

    for synset in wn.all_synsets():
        pos = synset.pos()
        for lemma in synset.lemmas():
            name = lemma.name()
            # WordNet joins multi-word lemmas with underscores; turning them
            # into spaces lets the compound check below reject them.
            word = name.replace("_", " ").lower()

            # No need for compound words, contractions or abbreviations.
            if not _is_simple_word(word):
                continue
            # Title-cased lemma names are skipped (typically proper nouns).
            if name.istitle():
                continue

            if pos == "n":
                # morphy() yields the base form (or None); a word equal to its
                # own base form is treated as singular/uncountable.
                if wn.morphy(word, pos="n") == word:
                    nouns.add(word)
            elif pos in ("a", "s"):
                # 'a' = adjective, 's' = satellite adjective.
                adjectives.add(word)

    # Keep only words that are frequent in Brown and have a synset.
    return filter_common(nouns), filter_common(adjectives)


def writefile(fname, data):
    """Write the strings in *data* to *fname*, one per line.

    The entries are sorted before writing so that the file content is
    reproducible from run to run (iteration order of a set of strings is
    not). No trailing newline is written.
    """
    with open(fname, "w", encoding="utf-8") as lf:
        lf.write("\n".join(sorted(data)))


if __name__ == "__main__":
    load_data()
    nouns, adjectives = get_words()
    writefile("nouns.txt", nouns)
    writefile("adjectives.txt", adjectives)