Enha: add HOW_COMMON constant to set the common-word frequency threshold

This commit is contained in:
Grail Finder
2025-06-19 06:17:39 +03:00
parent 91f0e93961
commit a12a2f4385
3 changed files with 4 additions and 3 deletions

View File

@ -5,6 +5,7 @@ from collections import defaultdict
NPATH = os.environ["NLTK_DATA"] NPATH = os.environ["NLTK_DATA"]
COMMON_WORDS = {} COMMON_WORDS = {}
HOW_COMMON = 10
def load_data(): def load_data():
nltk.download('brown', download_dir=NPATH) nltk.download('brown', download_dir=NPATH)
@ -14,7 +15,7 @@ def load_data():
freq_dist = nltk.FreqDist(word.lower() for word in brown.words()) freq_dist = nltk.FreqDist(word.lower() for word in brown.words())
# Create a set of common words (adjust threshold as needed) # Create a set of common words (adjust threshold as needed)
global COMMON_WORDS global COMMON_WORDS
COMMON_WORDS = {word for word, count in freq_dist.items() if count >= 5} COMMON_WORDS = {word for word, count in freq_dist.items() if count >= HOW_COMMON}
def is_common(word): def is_common(word):
# Check if word exists in Brown Corpus with minimal frequency # Check if word exists in Brown Corpus with minimal frequency
@ -62,5 +63,5 @@ def writefile(fname, data):
if __name__ == "__main__": if __name__ == "__main__":
load_data() load_data()
nouns, adjectives = get_words() nouns, adjectives = get_words()
writefile("nouns.txt", nouns) writefile(f"nouns_{HOW_COMMON}.txt", nouns)
writefile("adjectives.txt", adjectives) writefile(f"adjectives_{HOW_COMMON}.txt", adjectives)