init

2025-05-01 14:12:35 +03:00
commit 91f0e93961
6 changed files with 9176 additions and 0 deletions
--- a/main.py
+++ b/main.py
@ -0,0 +1,66 @@
+import nltk
+from nltk.corpus import wordnet as wn, brown
+import os
+from collections import defaultdict
+
+NPATH = os.environ["NLTK_DATA"]
+COMMON_WORDS = {}
+
+def load_data():
+    nltk.download('brown', download_dir=NPATH)
+    nltk.download('wordnet', download_dir=NPATH)
+    nltk.download('omw-1.4', download_dir=NPATH)
+    # Load frequency distribution from Brown Corpus
+    freq_dist = nltk.FreqDist(word.lower() for word in brown.words())
+    # Create a set of common words (adjust threshold as needed)
+    global COMMON_WORDS
+    COMMON_WORDS = {word for word, count in freq_dist.items() if count >= 5}
+
+def is_common(word):
+    # Check if word exists in Brown Corpus with minimal frequency
+    is_frequent = word in COMMON_WORDS
+    # Check if the word has multiple synsets (indicates broader usage)
+    synset_count = len(wn.synsets(word))
+    # Adjust thresholds: require frequency AND at least 1 synset
+    return is_frequent and synset_count >= 1
+
+def filter_common(words):
+    return {word for word in words if is_common(word)}
+
+def get_words():
+    nouns = set()
+    adjectives = set()
+    # Iterate over all synsets in WordNet
+    for synset in wn.all_synsets():
+        pos = synset.pos()
+        for lemma in synset.lemmas():
+            word = lemma.name().replace('_', ' ').lower()  # Normalize word
+            # no need for compoud words
+            if "-" in word or " " in word or "'" in word or len(word) < 3 or "." in word:
+                continue
+            if lemma.name().istitle():
+                continue
+            # Check for nouns (singular/uncountable)
+            if pos == 'n':
+                # Use WordNet's morphy to get base form
+                base_form = wn.morphy(word, pos='n')
+                # If base form matches the word, it's singular/uncountable
+                if base_form == word:
+                    nouns.add(word)
+            # Check for adjectives (including satellite adjectives)
+            elif pos in ('a', 's'):
+                adjectives.add(word)
+    # Filter using Brown Corpus frequency and synset count
+    nouns = filter_common(nouns)
+    adjectives = filter_common(adjectives)
+    return nouns, adjectives
+
+def writefile(fname, data):
+    with open(fname, "w") as lf:
+        lf.write("\n".join(data))
+
+if __name__ == "__main__":
+    load_data()
+    nouns, adjectives = get_words()
+    writefile("nouns.txt", nouns)
+    writefile("adjectives.txt", adjectives)