Enhancement: set how common words are
This commit is contained in:
7
main.py
7
main.py
@ -5,6 +5,7 @@ from collections import defaultdict
|
|||||||
|
|
||||||
NPATH = os.environ["NLTK_DATA"]
|
NPATH = os.environ["NLTK_DATA"]
|
||||||
COMMON_WORDS = {}
|
COMMON_WORDS = {}
|
||||||
|
HOW_COMMON = 10
|
||||||
|
|
||||||
def load_data():
|
def load_data():
|
||||||
nltk.download('brown', download_dir=NPATH)
|
nltk.download('brown', download_dir=NPATH)
|
||||||
@ -14,7 +15,7 @@ def load_data():
|
|||||||
freq_dist = nltk.FreqDist(word.lower() for word in brown.words())
|
freq_dist = nltk.FreqDist(word.lower() for word in brown.words())
|
||||||
# Create a set of common words (adjust threshold as needed)
|
# Create a set of common words (adjust threshold as needed)
|
||||||
global COMMON_WORDS
|
global COMMON_WORDS
|
||||||
COMMON_WORDS = {word for word, count in freq_dist.items() if count >= 5}
|
COMMON_WORDS = {word for word, count in freq_dist.items() if count >= HOW_COMMON}
|
||||||
|
|
||||||
def is_common(word):
|
def is_common(word):
|
||||||
# Check if word exists in Brown Corpus with minimal frequency
|
# Check if word exists in Brown Corpus with minimal frequency
|
||||||
@ -62,5 +63,5 @@ def writefile(fname, data):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
load_data()
|
load_data()
|
||||||
nouns, adjectives = get_words()
|
nouns, adjectives = get_words()
|
||||||
writefile("nouns.txt", nouns)
|
writefile(f"nouns_{HOW_COMMON}.txt", nouns)
|
||||||
writefile("adjectives.txt", adjectives)
|
writefile(f"adjectives_{HOW_COMMON}.txt", adjectives)
|
||||||
|
Reference in New Issue
Block a user