// ML implementation: TF-IDF + Logistic Regression for article filtering.
//
// Why title-only: Avoids content scraping overhead, titles are already informative.
// MinDF=2: Removes typos and rare terms that don't generalize.
// MaxDF=0.8: Removes common words that appear in >80% of documents.
// λ=0.001: Light L2 regularization to prevent overfitting on small datasets.
//
// Public API:
// - TFIDFVectorizer.Fit(): Learn vocabulary from documents
// - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors
// - LogisticRegression.Fit(): Train classifier on vectors
// - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model
// - PredictScore(): Score article using trained weights
package core

import (
	"fmt"
	"math"
	"regexp"
	"sort"
	"strings"
)

// ============================================================================
// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓
// ┃┏┛┣╸ ┃ ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛
// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸
// ============================================================================

var wordHyphenRegex = regexp.MustCompile("[^a-zA-Z0-9-]+")

// StopWords: Common words that don't help distinguish articles.
// Why: Reduces noise and improves model generalization.
var stopWords = map[string]struct{}{
	// Single letters and symbols
	"s": {}, "-": {}, "0": {}, "1": {}, "2": {}, "3": {}, "4": {},
	"5": {}, "6": {}, "7": {}, "8": {}, "9": {},

	// Common English stop words
	"a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {}, "al": {},
	"all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "aren't": {},
	"as": {}, "at": {}, "be": {}, "because": {}, "been": {}, "before": {}, "being": {},
	"below": {}, "between": {}, "both": {}, "but": {}, "by": {}, "can't": {}, "cannot": {},
	"could": {}, "couldn't": {}, "did": {}, "didn't": {}, "do": {}, "does": {}, "doesn't": {},
	"doing": {}, "don't": {}, "down": {}, "during": {}, "each": {}, "et": {}, "few": {},
	"for": {}, "from": {}, "further": {}, "had": {}, "hadn't": {}, "has": {}, "hasn't": {},
	"have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "he's": {},
	"her": {}, "here": {}, "here's": {}, "hers": {}, "herself": {}, "him": {}, "himself": {},
	"his": {}, "how": {}, "how's": {}, "i": {}, "i'd": {}, "i'll": {}, "i'm": {},
	"i've": {}, "if": {}, "in": {}, "into": {}, "is": {}, "isn't": {}, "it": {},
	"it's": {}, "its": {}, "itself": {}, "let's": {}, "me": {}, "more": {}, "most": {},
	"mustn't": {}, "my": {}, "myself": {}, "no": {}, "nor": {}, "not": {}, "of": {},
	"off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {}, "ought": {},
	"our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {}, "same": {},
	"shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {},
	"so": {}, "some": {}, "such": {}, "than": {}, "that": {}, "that's": {}, "the": {},
	"their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "there": {}, "there's": {},
	"these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "this": {},
	"those": {}, "through": {}, "to": {}, "too": {}, "under": {}, "until": {}, "up": {},
	"very": {}, "was": {}, "wasn't": {}, "we": {}, "we'd": {}, "we'll": {}, "we're": {},
	"we've": {}, "were": {}, "weren't": {}, "what": {}, "what's": {}, "when": {}, "when's": {},
	"where": {}, "where's": {}, "which": {}, "while": {}, "who": {}, "who's": {}, "whom": {},
	"why": {}, "why's": {}, "with": {}, "won't": {}, "would": {}, "wouldn't": {}, "you": {},
	"you'd": {}, "you'll": {}, "you're": {}, "you've": {}, "your": {}, "yours": {}, "yourself": {},
	"yourselves": {},
}
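
// Worked example of the MinDF/MaxDF cutoffs described in the header (the
// document counts here are illustrative, not taken from a real run): with 10
// training titles, MinDF=2 drops any n-gram that appears in only one title,
// and MaxDF=0.8 drops any n-gram that appears in more than int(0.8*10) = 8
// titles; only terms seen in 2..8 titles enter the vocabulary.
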
// TFIDFVectorizer learns a vocabulary with IDF weights and converts documents
// into TF-IDF feature vectors.
type TFIDFVectorizer struct {
	Vocabulary   map[string]float64
	OrderedVocab []string
	NgramMin     int
	NgramMax     int
	MinDF        int     // Minimum document frequency (absolute)
	MaxDF        float64 // Maximum document frequency (ratio)
	VocabCap     int
}

// CreateVectorizerFromModel reconstructs a vectorizer from a saved model.
func CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer {
	return &TFIDFVectorizer{
		Vocabulary:   model.Vectorizer,
		OrderedVocab: model.OrderedVocab,
	}
}

// Fit learns the vocabulary and IDF weights from documents.
func (v *TFIDFVectorizer) Fit(documents []string) {
	numDocs := len(documents)
	docFreqs := make(map[string]int)
	for _, doc := range documents {
		unigrams := Tokenize(doc)
		ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax)
		seenInDoc := make(map[string]struct{})
		for _, ngram := range ngrams {
			if _, seen := seenInDoc[ngram]; !seen {
				docFreqs[ngram]++
				seenInDoc[ngram] = struct{}{}
			}
		}
	}

	maxDocs := int(v.MaxDF * float64(numDocs))
	filteredVocab := make(map[string]int)
	for term, freq := range docFreqs {
		if freq >= v.MinDF && freq <= maxDocs {
			filteredVocab[term] = freq
		}
	}

	if v.VocabCap > 0 && len(filteredVocab) > v.VocabCap {
		type termFreq struct {
			term string
			freq int
		}
		terms := make([]termFreq, 0, len(filteredVocab))
		for term, freq := range filteredVocab {
			terms = append(terms, termFreq{term, freq})
		}
		sort.Slice(terms, func(i, j int) bool {
			return terms[i].freq > terms[j].freq
		})
		cappedTerms := terms[:v.VocabCap]
		filteredVocab = make(map[string]int, v.VocabCap)
		for _, tf := range cappedTerms {
			filteredVocab[tf.term] = tf.freq
		}
	}

	v.OrderedVocab = make([]string, 0, len(filteredVocab))
	for term := range filteredVocab {
		v.OrderedVocab = append(v.OrderedVocab, term)
	}
	sort.Strings(v.OrderedVocab) // deterministic order

	v.Vocabulary = make(map[string]float64, len(v.OrderedVocab))
	for _, term := range v.OrderedVocab {
		// IDF = log(total num of docs / num of docs with term)
		idf := math.Log(float64(numDocs) / float64(filteredVocab[term]))
		v.Vocabulary[term] = idf
	}
}

// Transform converts documents to TF-IDF vectors using the learned vocabulary.
func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 {
	vectors := make([][]float64, len(documents))
	for i, doc := range documents {
		unigrams := Tokenize(doc)
		ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax)
		vector := make([]float64, len(v.OrderedVocab))
		if len(ngrams) > 0 {
			// tf: term frequency (normalized count of each n-gram in document)
			tf := make(map[string]float64)
			for _, ngram := range ngrams {
				tf[ngram]++
			}
			numNgrams := float64(len(ngrams))
			for ngram, count := range tf {
				tf[ngram] = count / numNgrams
			}
			for j, term := range v.OrderedVocab {
				if tfValue, ok := tf[term]; ok {
					// Only score terms that were in our training vocabulary.
					if idfValue, inVocab := v.Vocabulary[term]; inVocab {
						vector[j] = tfValue * idfValue
					}
				}
			}
		}
		vectors[i] = vector
	}
	return vectors
}

// Tokenize lowercases text, splits on non-alphanumeric characters (keeping
// hyphens), and drops empty tokens and stop words.
func Tokenize(text string) []string {
	text = strings.ToLower(text)
	words := wordHyphenRegex.Split(text, -1)
	tokens := make([]string, 0, len(words))
	for _, word := range words {
		if word == "" {
			continue
		}
		if _, isStopWord := stopWords[word]; isStopWord {
			continue
		}
		tokens = append(tokens, word)
	}
	return tokens
}
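
// Worked example of the tokenization path (the title and the 1..2 n-gram range
// are illustrative, not values this file fixes): Tokenize("Rust Async Patterns
// Explained") yields ["rust", "async", "patterns", "explained"], and
// generateNgrams(tokens, 1, 2) expands that to
// ["rust", "async", "patterns", "explained", "rust async", "async patterns",
// "patterns explained"]. Stop words and punctuation are removed before n-grams
// are formed, so an n-gram can span a dropped word.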
func generateNgrams(tokens []string, minN, maxN int) []string {
	if minN <= 0 {
		minN = 1
	}
	if maxN < minN {
		maxN = minN
	}
	numTokens := len(tokens)
	estimatedCap := 0
	for n := minN; n <= maxN; n++ {
		if numTokens >= n {
			estimatedCap += numTokens - n + 1
		}
	}
	ngrams := make([]string, 0, estimatedCap)
	for n := minN; n <= maxN; n++ {
		if numTokens < n {
			continue
		}
		for i := 0; i <= numTokens-n; i++ {
			ngrams = append(ngrams, strings.Join(tokens[i:i+n], " "))
		}
	}
	return ngrams
}

// ============================================================================
// ┏━╸╻ ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓
// ┃ ┃ ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛
// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹ ╹┗━╸╹┗╸
// ============================================================================

// LogisticRegression is a binary logistic regression classifier with L2
// regularization. The bias term is stored separately and is not regularized.
type LogisticRegression struct {
	LearningRate float64
	Lambda       float64 // L2 regularization parameter
	Iterations   int
	Tolerance    float64 // Convergence tolerance on loss improvement
}

// Validate checks and clamps hyperparameters to reasonable bounds.
func (lr *LogisticRegression) Validate() *LogisticRegression {
	const (
		defaultLearningRate = 0.5
		defaultIterations   = 500
		defaultTolerance    = 0.000001
	)
	if lr.LearningRate <= 0 {
		lr.LearningRate = defaultLearningRate
	}
	if lr.LearningRate > 10 {
		lr.LearningRate = 10.0
	}
	if lr.Lambda < 0 {
		lr.Lambda = 0.0
	}
	if lr.Iterations <= 0 {
		lr.Iterations = defaultIterations
	}
	if lr.Tolerance <= 0 {
		lr.Tolerance = defaultTolerance
	}
	return lr
}

// Fit trains via full-batch gradient descent with L2 regularization on the
// feature weights (not the bias).
// Class weights reweight samples; unused in our pipeline (we downsample instead).
// Returns the weights with the bias as the last element.
func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) {
	if len(vectors) == 0 {
		return nil, fmt.Errorf("cannot train on empty dataset")
	}
	if len(vectors) != len(labels) {
		return nil, fmt.Errorf(
			"mismatch between number of vectors (%d) and labels (%d)",
			len(vectors), len(labels),
		)
	}
	for i, y := range labels {
		if y != 0 && y != 1 {
			return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y)
		}
	}
	numFeatures := len(vectors[0])
	if numFeatures == 0 {
		return nil, fmt.Errorf("cannot train with zero-length feature vectors")
	}
	for i := 1; i < len(vectors); i++ {
		if len(vectors[i]) != numFeatures {
			return nil, fmt.Errorf(
				"inconsistent feature vector length at index %d: got %d, expected %d",
				i, len(vectors[i]), numFeatures,
			)
		}
	}

	useUniformWeights := classWeights == nil
	if useUniformWeights {
		classWeights = map[float64]float64{0.0: 1.0, 1.0: 1.0}
	}
	numSamples := float64(len(vectors))
	var totalWeight float64
	if useUniformWeights {
		totalWeight = numSamples
	} else {
		for _, y := range labels {
			totalWeight += classWeights[y]
		}
	}
	if totalWeight == 0 {
		totalWeight = numSamples // Fallback
	}

	weights := make([]float64, numFeatures)
	var bias float64
	prevLoss := math.MaxFloat64

	for i := 0; i < lr.Iterations; i++ {
		gradWeights := make([]float64, numFeatures)
		var gradBias float64
		var currentLoss float64

		for j, x := range vectors {
			y := labels[j]
			sampleWeight := classWeights[y]
			z, err := dot(weights, x)
			if err != nil {
				return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err)
			}
			p := Sigmoid(z + bias)

			// Compute prediction error. This term gets multiplied by each feature value
			// to accumulate gradients (higher error pushes weights harder).
			errTerm := p - y
			for k := 0; k < numFeatures; k++ {
				gradWeights[k] += sampleWeight * errTerm * x[k]
			}
			gradBias += sampleWeight * errTerm

			cp := clamp(p)
			currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp)))
		}

		// Update weights with L2 regularization (only on feature weights, not bias).
		// This pulls weights toward zero, preventing overfitting on small datasets.
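		// Concretely, with W = totalWeight, s_j the sample's class weight, and
		// p_j = Sigmoid(w·x_j + b), each iteration applies the full-batch step:
		//
		//	w_k ← w_k − η * (Σ_j s_j*(p_j − y_j)*x_j[k] / W  +  λ*w_k)
		//	b   ← b   − η * (Σ_j s_j*(p_j − y_j) / W)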
		for k := 0; k < numFeatures; k++ {
			regularizedGrad := (gradWeights[k] / totalWeight) + (lr.Lambda * weights[k])
			weights[k] -= lr.LearningRate * regularizedGrad
		}
		gradBias /= totalWeight
		bias -= lr.LearningRate * gradBias

		// Check convergence: if loss change is below tolerance, we're done.
		// We include the L2 penalty in total loss to assess true convergence.
		avgLoss := currentLoss / totalWeight
		var l2Penalty float64
		for _, w := range weights {
			l2Penalty += w * w
		}
		totalLoss := avgLoss + 0.5*lr.Lambda*l2Penalty
		if math.Abs(prevLoss-totalLoss) < lr.Tolerance {
			break
		}
		prevLoss = totalLoss
	}

	// The bias is stored as the last element.
	return append(weights, bias), nil
}

// PredictScore computes the probability for a single vector given weights.
// The last element of weights is the bias.
func PredictScore(vector []float64, weights []float64) (float64, error) {
	if len(weights) == 0 {
		return 0, fmt.Errorf("weights cannot be empty")
	}
	if len(vector) != len(weights)-1 {
		return 0, fmt.Errorf(
			"vector length mismatch: expected %d features, got %d",
			len(weights)-1, len(vector),
		)
	}
	for i, v := range vector {
		if math.IsNaN(v) || math.IsInf(v, 0) {
			return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, v)
		}
	}
	for i, w := range weights {
		if math.IsNaN(w) || math.IsInf(w, 0) {
			return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w)
		}
	}

	featureWeights := weights[:len(weights)-1]
	bias := weights[len(weights)-1]
	z, err := dot(featureWeights, vector)
	if err != nil {
		return 0, fmt.Errorf("failed to compute dot product: %w", err)
	}
	return Sigmoid(z + bias), nil
}

// ============================================================================
// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓
// ┃┃┃┣━┫ ┃ ┣━┫┗━┓
// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛
// ============================================================================

// Sigmoid computes the logistic function 1/(1+e^-z), using the numerically
// stable form for negative z to avoid overflow in math.Exp.
func Sigmoid(z float64) float64 {
	if z >= 0 {
		return 1.0 / (1.0 + math.Exp(-z))
	}
	ez := math.Exp(z)
	return ez / (1.0 + ez)
}

func dot(a, b []float64) (float64, error) {
	if len(a) != len(b) {
		return 0, fmt.Errorf("vector length mismatch: %d != %d", len(a), len(b))
	}
	var sum float64
	for i := range a {
		sum += a[i] * b[i]
	}
	return sum, nil
}

func clamp(p float64) float64 {
	const probabilityClamp = 1e-15
	if p < probabilityClamp {
		return probabilityClamp
	}
	if p > 1.0-probabilityClamp {
		return 1.0 - probabilityClamp
	}
	return p
}
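
// exampleTrainAndScore is an illustrative sketch of how the pieces above fit
// together; it is not part of the public API listed in the header. MinDF,
// MaxDF and Lambda follow the header's values; the 1..2 n-gram range is an
// assumption for the example, not something this file fixes.
func exampleTrainAndScore(titles []string, labels []float64, newTitle string) (float64, error) {
	// Learn the vocabulary and IDF weights, then vectorize the training titles.
	vec := &TFIDFVectorizer{NgramMin: 1, NgramMax: 2, MinDF: 2, MaxDF: 0.8}
	vec.Fit(titles)
	vectors := vec.Transform(titles)

	// Train the classifier; nil class weights mean uniform sample weighting.
	clf := (&LogisticRegression{Lambda: 0.001}).Validate()
	weights, err := clf.Fit(vectors, labels, nil)
	if err != nil {
		return 0, err
	}

	// Score an unseen title; the bias is the last element of weights.
	return PredictScore(vec.Transform([]string{newTitle})[0], weights)
}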