"""Generate filtered noun and adjective word lists.

Scans WordNet for single-token lowercase lemmas, keeps those that are
common in the Brown corpus, removes hand-maintained exclusion sets
(number words, names, jargon, inappropriate words), and writes the
results to text files.
"""
import nltk
|
|
from nltk.corpus import wordnet as wn, brown
|
|
import os
|
|
from collections import defaultdict
|
|
|
|
# Where NLTK should store/download its corpora.  Using .get() falls back to
# NLTK's own default search path when the NLTK_DATA environment variable is
# not set, instead of crashing with a KeyError at import time.
NPATH = os.environ.get("NLTK_DATA")

# Filled in by load_data(): the set of lowercase words occurring at least
# HOW_COMMON times in the Brown corpus.  (Was `{}` -- an empty *dict* --
# which only worked for membership tests by accident; now a real set.)
COMMON_WORDS = set()

# Minimum Brown-corpus frequency for a word to count as "common".
HOW_COMMON = 10
|
|
|
|
# Number words to exclude from the generated lists: cardinals (0-19, the
# tens, hundred, thousand), the matching ordinals, and traditional counting
# units.  (List originally assembled with LLM assistance.)
numerical = set(
    """
    zero one two three four five six seven eight nine
    ten eleven twelve thirteen fourteen fifteen sixteen
    seventeen eighteen nineteen
    twenty thirty forty fifty sixty seventy eighty ninety
    hundred thousand
    first second third fourth fifth sixth seventh eighth
    ninth tenth eleventh twelfth thirteenth fourteenth fifteenth
    sixteenth seventeenth eighteenth nineteenth
    twentieth thirtieth fortieth fiftieth sixtieth seventieth
    eightieth ninetieth
    hundredth thousandth
    dozen score gross
    """.split()
)
|
|
|
|
# Words excluded as inappropriate for the output lists.
# NOTE: "innapropriate" is misspelled, but the name is referenced later in
# this module, so it is kept as-is.
innapropriate = {"nigger", "vagina", "rape", "penis"}

# First names that appear as lowercase lemmas in WordNet.
names = {
    "ada", "ben", "bobby", "charlotte", "dick", "earl",
    "geneva", "john", "martin", "maxwell", "ruth", "timothy",
}

# Technical jargon considered too obscure for the target lists.
difficult = {
    "aerator", "anionic", "anode", "antibody", "antigen", "barometer",
    "baroque", "binomial", "cathode", "characteristic", "congruence",
    "electrode", "gyro", "isotope", "muzzle", "polymer", "polynomial",
    "secant", "substrate", "tetrachloride", "thyroglobulin", "transducer",
    "tsh",
}
|
|
|
|
|
|
def load_data():
    """Download the required NLTK corpora and build COMMON_WORDS.

    Populates the module-level COMMON_WORDS set with every lowercase
    token occurring at least HOW_COMMON times in the Brown corpus.
    """
    for corpus in ("brown", "wordnet", "omw-1.4"):
        nltk.download(corpus, download_dir=NPATH)

    # Frequency distribution over the (lowercased) Brown corpus; the
    # threshold decides what counts as a "common" word.
    counts = nltk.FreqDist(token.lower() for token in brown.words())
    global COMMON_WORDS
    COMMON_WORDS = {token for token, n in counts.items() if n >= HOW_COMMON}
|
|
|
|
def is_common(word):
    """Return True when *word* is both frequent in Brown and known to WordNet."""
    # Must clear the Brown-corpus frequency threshold (see load_data)...
    if word not in COMMON_WORDS:
        return False
    # ...and have at least one WordNet synset, i.e. some recognized sense.
    return len(wn.synsets(word)) >= 1
|
|
|
|
def filter_common(words):
    """Return the subset of *words* that pass the is_common() test."""
    return set(filter(is_common, words))
|
|
|
|
def get_words():
    """Scan all of WordNet and return ``(nouns, adjectives)`` as two sets.

    Nouns are restricted to singular/uncountable base forms and cleaned
    against the module's exclusion sets; both result sets are then filtered
    by Brown-corpus frequency via ``filter_common``.  Requires ``load_data``
    to have been called first so that COMMON_WORDS is populated.
    """
    nouns = set()
    adjectives = set()
    # Iterate over all synsets in WordNet
    for synset in wn.all_synsets():
        pos = synset.pos()
        for lemma in synset.lemmas():
            word = lemma.name().replace('_', ' ').lower()  # Normalize word
            # No need for compound words, contractions, abbreviations, or
            # very short tokens -- only plain words of 3+ letters survive.
            if "-" in word or " " in word or "'" in word or len(word) < 3 or "." in word:
                continue
            # Title-cased lemmas are almost always proper nouns; skip them.
            # (Checked on the raw lemma name, since `word` is already lowercased.)
            if lemma.name().istitle():
                continue
            # Check for nouns (singular/uncountable)
            if pos == 'n':
                # Use WordNet's morphy to get base form
                base_form = wn.morphy(word, pos='n')
                # Gerunds let in many abstract nouns; drop all "-ing" words.
                if word.endswith("ing"):
                    continue  # winning, twisting; only want win, twist or feelings
                # Drop likely plurals, but keep "-ss" words.
                if word.endswith("s") and not word.endswith("ss"):
                    continue  # leave duchess but skip provisions
                # Exceptions: hand-maintained exclusion sets defined at module top.
                if word in numerical or word in names or word in difficult or word in innapropriate:
                    continue
                # If base form matches the word, it's singular/uncountable
                if base_form == word:
                    nouns.add(word)
            # Check for adjectives (including satellite adjectives).
            # NOTE(review): the exclusion sets above are applied only to the
            # noun branch, so e.g. ordinal adjectives can still land here.
            elif pos in ('a', 's'):
                adjectives.add(word)
    # Filter using Brown Corpus frequency and synset count
    nouns = filter_common(nouns)
    adjectives = filter_common(adjectives)
    return nouns, adjectives
|
|
|
|
def writefile(fname, data):
    """Write *data* (an iterable of strings) to *fname*, one item per line.

    No trailing newline is added after the final item.
    """
    content = "\n".join(data)
    with open(fname, "w") as out:
        out.write(content)
|
|
|
|
def main():
    """Download corpora, extract the word lists, and write them to disk."""
    load_data()
    nouns, adjectives = get_words()
    # Output filenames encode the frequency threshold used.
    writefile(f"nouns_{HOW_COMMON}.txt", nouns)
    writefile(f"adjectives_{HOW_COMMON}.txt", adjectives)


if __name__ == "__main__":
    main()
|