init

2025-05-01 14:12:35 +03:00
commit 91f0e93961
6 changed files with 9176 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 venv/
--- a/adjectives.txt
+++ b/adjectives.txt
--- a/assign_colours.py
+++ b/assign_colours.py
@@ -0,0 +1,30 @@
 #!/usr/bin/env python
 from random import choice
 # File the candidate words are read from, one word per line.
 FNAME = "nouns.txt"
 # How many words each colour receives: 9 blue, 8 red, 1 black, 7 white.
 BLUENUM, REDNUM, BLACKNUM, WHITENUM = 9, 8, 1, 7
 # Colour name -> word count, in the order the colours are processed.
 ALLNUMS = dict(blue=BLUENUM, red=REDNUM, black=BLACKNUM, white=WHITENUM)
 # Colour name -> set of assigned words; filled in by the script entry point.
 di = dict()
 def assign_words(words, num):
    """Pick ``num`` distinct words at random from ``words``.

    Every entry is stripped of surrounding whitespace, blank entries are
    dropped and duplicates are collapsed before sampling, so the function
    can no longer spin forever when too few distinct words are available.

    Args:
        words: Iterable of candidate strings (e.g. raw ``readlines()`` output).
        num: Number of distinct words to return.

    Returns:
        A set with exactly ``num`` words; an empty set when ``num`` is not
        positive.

    Raises:
        ValueError: If fewer than ``num`` distinct non-blank words exist.
    """
    # Local import: the module header only brings ``choice`` into scope.
    from random import sample

    if num <= 0:
        return set()
    # dict.fromkeys de-duplicates while keeping a sequence, which sample() needs.
    candidates = list(dict.fromkeys(w.strip() for w in words if w.strip()))
    if len(candidates) < num:
        raise ValueError(
            "need {} distinct words but only {} available".format(num, len(candidates))
        )
    return set(sample(candidates, num))
 if __name__ == "__main__":
    # Script entry point: read the word list, give each colour its share of
    # words and print the resulting colour -> words mapping.
    with open(FNAME) as lf:
        # Strip, drop blank lines and de-duplicate while keeping file order.
        pool = list(dict.fromkeys(line.strip() for line in lf if line.strip()))
    needed = sum(ALLNUMS.values())
    if len(pool) < needed:
        # Fail early instead of letting the random draw loop without end.
        raise SystemExit(
            "{} has {} usable words, {} are needed".format(FNAME, len(pool), needed)
        )
    for k, num in ALLNUMS.items():
        di[k] = assign_words(pool, num)
        # Drop the words just handed out so no word ends up in two colours.
        pool = [w for w in pool if w not in di[k]]
    print(di)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,66 @@
 import nltk
 from nltk.corpus import wordnet as wn, brown
 import os
 from collections import defaultdict
 # Directory passed to nltk.download(). When NLTK_DATA is unset this is None,
 # which lets NLTK fall back to its default download location instead of the
 # module failing at import time with a bare KeyError.
 NPATH = os.environ.get("NLTK_DATA")
 # Lower-cased words that are frequent in the Brown Corpus; rebuilt by
 # load_data(). Starts as an empty set so its type matches what load_data() stores.
 COMMON_WORDS = set()
 def load_data(min_count=5):
    """Download the required NLTK corpora and rebuild ``COMMON_WORDS``.

    Args:
        min_count: Minimum number of occurrences in the Brown Corpus for a
            word to be treated as common. Defaults to 5, the threshold that
            used to be hard-coded.
    """
    global COMMON_WORDS
    for package in ('brown', 'wordnet', 'omw-1.4'):
        nltk.download(package, download_dir=NPATH)
    # Case-insensitive frequency distribution over the Brown Corpus.
    freq_dist = nltk.FreqDist(word.lower() for word in brown.words())
    COMMON_WORDS = {word for word, count in freq_dist.items() if count >= min_count}
 def is_common(word):
    """Return True when ``word`` is frequent and known to WordNet.

    A word qualifies when it is a member of ``COMMON_WORDS`` and WordNet
    reports at least one synset for it.

    Args:
        word: The word to check, as stored in ``COMMON_WORDS`` (lower-case).

    Returns:
        bool: Whether both conditions hold.
    """
    # Cheap set lookup first: rejected words never trigger a WordNet query.
    if word not in COMMON_WORDS:
        return False
    # One synset is enough; the threshold is "at least 1", not "multiple".
    return len(wn.synsets(word)) >= 1
 def filter_common(words):
    """Return the set of entries in ``words`` accepted by ``is_common``."""
    return set(filter(is_common, words))
 def get_words():
    """Collect single-word nouns and adjectives from WordNet.

    A lemma is kept only if its original name is not title-cased and, after
    lower-casing and turning underscores into spaces, it is at least three
    characters long and contains no hyphen, space, apostrophe or period.
    Nouns must additionally be their own ``wn.morphy`` base form. Both sets
    are finally narrowed with ``filter_common``.

    Returns:
        A ``(nouns, adjectives)`` tuple of sets of lower-case words.
    """
    nouns = set()
    adjectives = set()
    # Iterate over all synsets in WordNet
    for synset in wn.all_synsets():
        pos = synset.pos()
        # Only nouns and (satellite) adjectives are collected, so lemmas of
        # any other part of speech need not be visited at all.
        if pos not in ('n', 'a', 's'):
            continue
        for lemma in synset.lemmas():
            name = lemma.name()
            # Title-cased lemma names are skipped.
            if name.istitle():
                continue
            word = name.replace('_', ' ').lower()  # Normalize word
            # No need for compound, dotted or very short words.
            if len(word) < 3 or any(ch in word for ch in "- '."):
                continue
            if pos == 'n':
                # morphy yields the base form; a match means the word is
                # already singular/uncountable.
                if wn.morphy(word, pos='n') == word:
                    nouns.add(word)
            else:
                adjectives.add(word)
    # Filter using Brown Corpus frequency and synset count
    nouns = filter_common(nouns)
    adjectives = filter_common(adjectives)
    return nouns, adjectives
 def writefile(fname, data):
    """Write the words in ``data`` to ``fname``, one per line.

    Words are written in sorted order so that repeated runs over the same
    words produce an identical file (set iteration order is not stable
    between interpreter runs), and every line, including the last, ends
    with a newline. Empty ``data`` yields an empty file.

    Args:
        fname: Path of the file to (over)write.
        data: Iterable of strings to store.
    """
    lines = sorted(data)
    with open(fname, "w") as lf:
        lf.writelines(line + "\n" for line in lines)
 if __name__ == "__main__":
    # Script entry point: prepare the corpora, then dump each word set to
    # its own text file (nouns first, adjectives second).
    load_data()
    for target, collected in zip(("nouns.txt", "adjectives.txt"), get_words()):
        writefile(target, collected)
--- a/nouns.txt
+++ b/nouns.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1 @@
 nltk