init
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
venv/
|
3169
adjectives.txt
Normal file
3169
adjectives.txt
Normal file
File diff suppressed because it is too large
Load Diff
30
assign_colours.py
Normal file
30
assign_colours.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env python
"""Randomly assign words from a word list to the colours blue, red, black and white."""

from random import choice

# Word list read by the script entry point, one word per line.
FNAME = "nouns.txt"

# Number of words handed out per colour.
BLUENUM = 9
REDNUM = 8
BLACKNUM = 1
WHITENUM = 7

# Colour name -> how many words that colour receives.
ALLNUMS = {
    "blue": BLUENUM,
    "red": REDNUM,
    "black": BLACKNUM,
    "white": WHITENUM,
}

# Colour name -> set of assigned words; populated by the script entry point.
di = {}
|
||||||
|
|
||||||
|
def assign_words(words, num):
    """Pick ``num`` distinct words at random from ``words``.

    Every entry is stripped of surrounding whitespace (entries typically
    still carry their trailing newline); blank entries and duplicates are
    ignored.

    Args:
        words: iterable of candidate strings.
        num: how many distinct words to return.

    Returns:
        A set of exactly ``num`` stripped words (empty when ``num <= 0``).

    Raises:
        ValueError: if fewer than ``num`` distinct non-blank words exist.
            The earlier retry loop would never terminate in that case.
    """
    from random import sample  # local import: the module itself only imports ``choice``

    if num <= 0:
        return set()

    # dict.fromkeys de-duplicates while giving sample() an ordered sequence.
    pool = [w for w in dict.fromkeys(entry.strip() for entry in words) if w]
    if num > len(pool):
        raise ValueError(
            "need %d distinct words but only %d available" % (num, len(pool))
        )
    return set(sample(pool, num))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Read the candidate words once, dropping blank lines and duplicates.
    with open(FNAME) as lf:
        pool = {line.strip() for line in lf if line.strip()}

    for k, num in ALLNUMS.items():
        # Draw from the words that are still unassigned so that no word
        # can end up under two colours.
        di[k] = assign_words(sorted(pool), num)
        pool -= di[k]

    print(di)
|
66
main.py
Normal file
66
main.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
import nltk
|
||||||
|
from nltk.corpus import wordnet as wn, brown
|
||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Directory the NLTK corpora are downloaded into, taken from the NLTK_DATA
# environment variable. None when the variable is unset, instead of failing
# at import time with KeyError.
# NOTE(review): nltk.download() is expected to use its default directory
# when download_dir is None -- confirm against the NLTK downloader docs.
NPATH = os.environ.get("NLTK_DATA")

# Lower-cased words considered common; rebuilt by load_data().
COMMON_WORDS = set()
|
||||||
|
|
||||||
|
def load_data(min_count=5):
    """Download the required NLTK data and build the common-word set.

    Fetches the 'brown', 'wordnet' and 'omw-1.4' packages into ``NPATH``,
    counts the lower-cased tokens of the Brown corpus, and rebinds the
    module-level ``COMMON_WORDS`` to the set of tokens seen at least
    ``min_count`` times.

    Args:
        min_count: minimum number of Brown corpus occurrences for a word
            to count as common. Defaults to 5.
    """
    global COMMON_WORDS

    for package in ('brown', 'wordnet', 'omw-1.4'):
        nltk.download(package, download_dir=NPATH)

    # Frequency distribution over the lower-cased Brown corpus tokens.
    freq_dist = nltk.FreqDist(word.lower() for word in brown.words())

    COMMON_WORDS = {
        word for word, count in freq_dist.items() if count >= min_count
    }
|
||||||
|
|
||||||
|
def is_common(word):
    """Return True if ``word`` is in COMMON_WORDS and has a WordNet synset.

    Args:
        word: the word to test, in the same lower-cased form that is
            stored in ``COMMON_WORDS``.

    Returns:
        bool: True only when both conditions hold.
    """
    # Cheap set lookup first: words that fail it never need the
    # comparatively expensive WordNet query.
    if word not in COMMON_WORDS:
        return False
    # At least one synset is required.
    return len(wn.synsets(word)) >= 1
|
||||||
|
|
||||||
|
def filter_common(words):
    """Return the set of entries in ``words`` accepted by ``is_common``."""
    kept = set()
    for candidate in words:
        if is_common(candidate):
            kept.add(candidate)
    return kept
|
||||||
|
|
||||||
|
def get_words():
    """Collect common single-word nouns and adjectives from WordNet.

    Walks every WordNet synset and gathers lower-cased lemma names that
    are at least three characters long and contain no hyphen, space,
    apostrophe or period. Title-cased lemma names are skipped. Nouns are
    kept only when ``wn.morphy`` maps them to themselves; adjectives
    include satellite adjectives. Both collections are finally reduced
    with ``filter_common``.

    Returns:
        tuple: ``(nouns, adjectives)``, each a set of strings.
    """
    nouns = set()
    adjectives = set()

    for synset in wn.all_synsets():
        pos = synset.pos()
        # Only nouns ('n') and adjectives ('a', satellite 's') are ever
        # collected, so skip the lemmas of every other synset outright.
        if pos not in ('n', 'a', 's'):
            continue

        for lemma in synset.lemmas():
            raw_name = lemma.name()
            # Title-cased lemma names are excluded
            # (presumably proper names -- verify against WordNet data).
            if raw_name.istitle():
                continue

            word = raw_name.replace('_', ' ').lower()  # Normalize word
            # No need for compound words, abbreviations or very short words.
            if len(word) < 3 or any(ch in word for ch in "- '."):
                continue

            if pos == 'n':
                # A noun already collected needs no second morphy lookup.
                if word in nouns:
                    continue
                # If the base form matches the word, it is singular/uncountable.
                if wn.morphy(word, pos='n') == word:
                    nouns.add(word)
            else:
                adjectives.add(word)

    # Filter using Brown Corpus frequency and synset count.
    nouns = filter_common(nouns)
    adjectives = filter_common(adjectives)
    return nouns, adjectives
|
||||||
|
|
||||||
|
def writefile(fname, data):
    """Write the strings in ``data`` to ``fname``, one per line.

    Args:
        fname: path of the file to create or overwrite.
        data: iterable of strings. Sets (which have no stable order) are
            written in sorted order so repeated runs give identical files;
            any other iterable keeps its own order.

    The file is written as UTF-8 and every entry, including the last one,
    is terminated by a newline. Empty ``data`` produces an empty file.
    """
    if isinstance(data, (set, frozenset)):
        lines = sorted(data)
    else:
        lines = list(data)

    with open(fname, "w", encoding="utf-8") as lf:
        lf.writelines(line + "\n" for line in lines)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # load_data() has to run first: get_words() filters through
    # is_common(), which reads the COMMON_WORDS set built by load_data().
    load_data()
    noun_set, adjective_set = get_words()
    for target, collected in (
        ("nouns.txt", noun_set),
        ("adjectives.txt", adjective_set),
    ):
        writefile(target, collected)
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
nltk
|
Reference in New Issue
Block a user