"""Generate filtered noun and adjective word lists.

Scans WordNet for single-token lowercase lemmas, keeps those that are
common in the Brown corpus, removes hand-maintained exclusion sets
(number words, names, jargon, inappropriate words), and writes the
results to text files.
"""
import nltk
|
|
from nltk.corpus import wordnet as wn, brown
|
|
import os
|
|
from collections import defaultdict
|
|
|
|
# Where NLTK should store/download its corpora.  Using .get() falls back to
# NLTK's own default search path when the NLTK_DATA environment variable is
# not set, instead of crashing with a KeyError at import time.
NPATH = os.environ.get("NLTK_DATA")

# Filled in by load_data(): the set of lowercase words occurring at least
# HOW_COMMON times in the Brown corpus.  (Was `{}` -- an empty *dict* --
# which only worked for membership tests by accident; now a real set.)
COMMON_WORDS = set()

# Minimum Brown-corpus frequency for a word to count as "common".
HOW_COMMON = 10
|
|
|
|
# Number words to exclude from the generated lists: cardinals (0-19, the
# tens, hundred, thousand), the matching ordinals, and traditional counting
# units.  (List originally assembled with LLM assistance.)
numerical = set(
    """
    zero one two three four five six seven eight nine
    ten eleven twelve thirteen fourteen fifteen sixteen
    seventeen eighteen nineteen
    twenty thirty forty fifty sixty seventy eighty ninety
    hundred thousand
    first second third fourth fifth sixth seventh eighth
    ninth tenth eleventh twelfth thirteenth fourteenth fifteenth
    sixteenth seventeenth eighteenth nineteenth
    twentieth thirtieth fortieth fiftieth sixtieth seventieth
    eightieth ninetieth
    hundredth thousandth
    dozen score gross
    """.split()
)
|
|
|
|
# Words excluded as inappropriate for the output lists.
# NOTE: "innapropriate" is misspelled, but the name is referenced later in
# this module, so it is kept as-is.
innapropriate = {"nigger", "vagina", "rape", "penis"}

# First names that appear as lowercase lemmas in WordNet.
names = {
    "ada", "ben", "bobby", "charlotte", "dick", "earl",
    "geneva", "john", "martin", "maxwell", "ruth", "timothy",
}

# Technical jargon considered too obscure for the target lists.
difficult = {
    "aerator", "anionic", "anode", "antibody", "antigen", "barometer",
    "baroque", "binomial", "cathode", "characteristic", "congruence",
    "electrode", "gyro", "isotope", "muzzle", "polymer", "polynomial",
    "secant", "substrate", "tetrachloride", "thyroglobulin", "transducer",
    "tsh",
}
|
|
|
|
|
|
def load_data():
    """Download the required NLTK corpora and build COMMON_WORDS.

    Populates the module-level COMMON_WORDS set with every lowercase
    token occurring at least HOW_COMMON times in the Brown corpus.
    """
    for corpus in ("brown", "wordnet", "omw-1.4"):
        nltk.download(corpus, download_dir=NPATH)

    # Frequency distribution over the (lowercased) Brown corpus; the
    # threshold decides what counts as a "common" word.
    counts = nltk.FreqDist(token.lower() for token in brown.words())
    global COMMON_WORDS
    COMMON_WORDS = {token for token, n in counts.items() if n >= HOW_COMMON}
|
|
|
|
def is_common(word):
    """Return True when *word* is both frequent in Brown and known to WordNet."""
    # Must clear the Brown-corpus frequency threshold (see load_data)...
    if word not in COMMON_WORDS:
        return False
    # ...and have at least one WordNet synset, i.e. some recognized sense.
    return len(wn.synsets(word)) >= 1
|
|
|
|
def filter_common(words):
    """Return the subset of *words* that pass the is_common() test."""
    return set(filter(is_common, words))
|
|
|
|
def get_words():
    """Scan all of WordNet and return ``(nouns, adjectives)`` as two sets.

    Nouns are restricted to singular/uncountable base forms and cleaned
    against the module's exclusion sets; both result sets are then filtered
    by Brown-corpus frequency via ``filter_common``.  Requires ``load_data``
    to have been called first so that COMMON_WORDS is populated.
    """
    nouns = set()
    adjectives = set()
    # Iterate over all synsets in WordNet
    for synset in wn.all_synsets():
        pos = synset.pos()
        for lemma in synset.lemmas():
            word = lemma.name().replace('_', ' ').lower()  # Normalize word
            # No need for compound words, contractions, abbreviations, or
            # very short tokens -- only plain words of 3+ letters survive.
            if "-" in word or " " in word or "'" in word or len(word) < 3 or "." in word:
                continue
            # Title-cased lemmas are almost always proper nouns; skip them.
            # (Checked on the raw lemma name, since `word` is already lowercased.)
            if lemma.name().istitle():
                continue
            # Check for nouns (singular/uncountable)
            if pos == 'n':
                # Use WordNet's morphy to get base form
                base_form = wn.morphy(word, pos='n')
                # Gerunds let in many abstract nouns; drop all "-ing" words.
                if word.endswith("ing"):
                    continue  # winning, twisting; only want win, twist or feelings
                # Drop likely plurals, but keep "-ss" words.
                if word.endswith("s") and not word.endswith("ss"):
                    continue  # leave duchess but skip provisions
                # Exceptions: hand-maintained exclusion sets defined at module top.
                if word in numerical or word in names or word in difficult or word in innapropriate:
                    continue
                # If base form matches the word, it's singular/uncountable
                if base_form == word:
                    nouns.add(word)
            # Check for adjectives (including satellite adjectives).
            # NOTE(review): the exclusion sets above are applied only to the
            # noun branch, so e.g. ordinal adjectives can still land here.
            elif pos in ('a', 's'):
                adjectives.add(word)
    # Filter using Brown Corpus frequency and synset count
    nouns = filter_common(nouns)
    adjectives = filter_common(adjectives)
    return nouns, adjectives
|
|
|
|
def writefile(fname, data):
    """Write *data* (an iterable of strings) to *fname*, one item per line.

    No trailing newline is added after the final item.
    """
    content = "\n".join(data)
    with open(fname, "w") as out:
        out.write(content)
|
|
|
|
def main():
    """Download corpora, extract the word lists, and write them to disk."""
    load_data()
    nouns, adjectives = get_words()
    # Output filenames encode the frequency threshold used.
    writefile(f"nouns_{HOW_COMMON}.txt", nouns)
    writefile(f"adjectives_{HOW_COMMON}.txt", adjectives)


if __name__ == "__main__":
    main()
|