aboutsummaryrefslogtreecommitdiff
path: root/core/ml.go
diff options
context:
space:
mode:
Diffstat (limited to 'core/ml.go')
-rw-r--r--core/ml.go427
1 files changed, 427 insertions, 0 deletions
diff --git a/core/ml.go b/core/ml.go
new file mode 100644
index 0000000..afdd2f3
--- /dev/null
+++ b/core/ml.go
@@ -0,0 +1,427 @@
+// ML implementation: TF-IDF + Logistic Regression for article filtering.
+//
+// Why title-only: Avoids content scraping overhead, titles are already informative.
+// MinDF=2: Removes typos and rare terms that don't generalize.
+// MaxDF=0.8: Removes common words that appear in >80% of documents.
+// λ=0.001: Light L2 regularization to prevent overfitting on small datasets.
+//
+// Public API:
+// - TFIDFVectorizer.Fit(): Learn vocabulary from documents
+// - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors
+// - LogisticRegression.Fit(): Train classifier on vectors
+// - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model
+// - PredictScore(): Score article using trained weights
+package core
+
+import (
+ "fmt"
+ "math"
+ "regexp"
+ "sort"
+ "strings"
+)
+
+
+// ============================================================================
+// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓
+// ┃┏┛┣╸ ┃ ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛
+// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸
+// ============================================================================
+
+
// wordHyphenRegex matches every run of characters that is not an ASCII
// letter, an ASCII digit, or a hyphen. Tokenize splits text on these runs,
// so hyphenated words survive as a single token.
var wordHyphenRegex = regexp.MustCompile(`[^a-zA-Z0-9-]+`)
+
// stopWords is the set of lower-case tokens that Tokenize drops because they
// carry no signal for telling articles apart. Removing them reduces noise and
// helps the model generalize.
var stopWords = func() map[string]struct{} {
	words := []string{
		// Single letters and symbols.
		"s", "-", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",

		// Common English stop words.
		"a", "about", "above", "after", "again", "against", "al", "all", "am", "an",
		"and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before",
		"being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't",
		"did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each",
		"et", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have",
		"haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
		"herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
		"i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
		"let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not",
		"of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
		"ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's",
		"should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their",
		"theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
		"they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
		"very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't",
		"what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's",
		"whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll",
		"you're", "you've", "your", "yours", "yourself", "yourselves",
	}

	set := make(map[string]struct{}, len(words))
	for _, word := range words {
		set[word] = struct{}{}
	}
	return set
}()
+
// TFIDFVectorizer turns documents into TF-IDF feature vectors over a learned
// vocabulary of word n-grams.
type TFIDFVectorizer struct {
	// Vocabulary maps each retained term to its IDF weight.
	Vocabulary map[string]float64
	// OrderedVocab lists the retained terms; a term's index here is its
	// column in every vector produced by Transform.
	OrderedVocab []string
	// NgramMin and NgramMax bound the n-gram sizes generated from tokens.
	NgramMin, NgramMax int
	// MinDF is the minimum document frequency, as an absolute count.
	MinDF int
	// MaxDF is the maximum document frequency, as a ratio of all documents.
	MaxDF float64
	// VocabCap limits the vocabulary size; values <= 0 disable the cap.
	VocabCap int
}
+
+func CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer {
+ return &TFIDFVectorizer{
+ Vocabulary: model.Vectorizer,
+ OrderedVocab: model.OrderedVocab,
+ }
+}
+
+
+// Learns vocabulary and IDF from documents
+func (v *TFIDFVectorizer) Fit(documents []string) {
+ numDocs := len(documents)
+ docFreqs := make(map[string]int)
+
+ for _, doc := range documents {
+ unigrams := Tokenize(doc)
+ ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax)
+ seenInDoc := make(map[string]struct{})
+ for _, ngram := range ngrams {
+ if _, seen := seenInDoc[ngram]; !seen {
+ docFreqs[ngram]++
+ seenInDoc[ngram] = struct{}{}
+ }
+ }
+ }
+
+ maxDocs := int(v.MaxDF * float64(numDocs))
+ filteredVocab := make(map[string]int)
+ for term, freq := range docFreqs {
+ if freq >= v.MinDF && freq <= maxDocs {
+ filteredVocab[term] = freq
+ }
+ }
+
+ if v.VocabCap > 0 && len(filteredVocab) > v.VocabCap {
+ type termFreq struct {
+ term string
+ freq int
+ }
+ terms := make([]termFreq, 0, len(filteredVocab))
+ for term, freq := range filteredVocab {
+ terms = append(terms, termFreq{term, freq})
+ }
+ sort.Slice(terms, func(i, j int) bool {
+ return terms[i].freq > terms[j].freq
+ })
+
+ cappedTerms := terms[:v.VocabCap]
+ filteredVocab = make(map[string]int, v.VocabCap)
+ for _, tf := range cappedTerms {
+ filteredVocab[tf.term] = tf.freq
+ }
+ }
+
+ v.OrderedVocab = make([]string, 0, len(filteredVocab))
+ for term := range filteredVocab {
+ v.OrderedVocab = append(v.OrderedVocab, term)
+ }
+ sort.Strings(v.OrderedVocab) // deterministic order
+
+ v.Vocabulary = make(map[string]float64, len(v.OrderedVocab))
+ for _, term := range v.OrderedVocab {
+ // IDF = log(total num of docs / num of docs with term)
+ idf := math.Log(float64(numDocs) / float64(filteredVocab[term]))
+ v.Vocabulary[term] = idf
+ }
+}
+
+// Converts documents to TF-IDF vectors using learned vocabulary
+func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 {
+ vectors := make([][]float64, len(documents))
+
+ for i, doc := range documents {
+ unigrams := Tokenize(doc)
+ ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax)
+ vector := make([]float64, len(v.OrderedVocab))
+
+ if len(ngrams) > 0 {
+ // tf: term frequency (normalized count of each n-gram in document)
+ tf := make(map[string]float64)
+ for _, ngram := range ngrams {
+ tf[ngram]++
+ }
+ numNgrams := float64(len(ngrams))
+ for ngram, count := range tf {
+ tf[ngram] = count / numNgrams
+ }
+
+ for j, term := range v.OrderedVocab {
+ if tfValue, ok := tf[term]; ok {
+ // only score terms that were in our training vocabulary
+ if idfValue, inVocab := v.Vocabulary[term]; inVocab {
+ vector[j] = tfValue * idfValue
+ }
+ }
+ }
+ }
+ vectors[i] = vector
+ }
+
+ return vectors
+}
+
+func Tokenize(text string) []string {
+ text = strings.ToLower(text)
+ words := wordHyphenRegex.Split(text, -1)
+ tokens := make([]string, 0, len(words))
+ for _, word := range words {
+ if word == "" {
+ continue
+ }
+ if _, isStopWord := stopWords[word]; isStopWord {
+ continue
+ }
+ tokens = append(tokens, word)
+ }
+ return tokens
+}
+
// generateNgrams returns every contiguous n-gram of tokens for n from minN to
// maxN, smaller sizes first, each joined with a single space.
//
// A minN below 1 is treated as 1, and a maxN below minN is treated as minN.
// Sizes larger than the number of tokens produce nothing, so the loops stop
// at len(tokens). This keeps the cost independent of an oversized maxN and
// prevents the size counter from overflowing when maxN is the largest int.
// The result is never nil.
func generateNgrams(tokens []string, minN, maxN int) []string {
	if minN <= 0 {
		minN = 1
	}
	if maxN < minN {
		maxN = minN
	}

	numTokens := len(tokens)
	// No n-gram can be longer than the token list.
	if maxN > numTokens {
		maxN = numTokens
	}

	estimatedCap := 0
	for n := minN; n <= maxN; n++ {
		estimatedCap += numTokens - n + 1
	}
	ngrams := make([]string, 0, estimatedCap)

	for n := minN; n <= maxN; n++ {
		for i := 0; i+n <= numTokens; i++ {
			ngrams = append(ngrams, strings.Join(tokens[i:i+n], " "))
		}
	}
	return ngrams
}
+
+
+// ============================================================================
+// ┏━╸╻ ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓
+// ┃ ┃ ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛
+// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹ ╹┗━╸╹┗╸
+// ============================================================================
+
+
// LogisticRegression holds the hyperparameters of a binary logistic
// regression classifier trained with L2 regularization. The bias term is
// kept apart from the feature weights and is not regularized.
type LogisticRegression struct {
	// LearningRate is the gradient step size; Lambda is the L2
	// regularization strength.
	LearningRate, Lambda float64
	// Iterations is the maximum number of training passes.
	Iterations int
	// Tolerance is the convergence threshold on loss improvement.
	Tolerance float64
}
+
+// validate checks and clamps hyperparams to reasonable bounds.
+func (lr *LogisticRegression) Validate() *LogisticRegression {
+ const (
+ defaultLearningRate = 0.5
+ defaultIterations = 500
+ defaultTolerance = 0.000001
+ )
+
+ if lr.LearningRate <= 0 {
+ lr.LearningRate = defaultLearningRate
+ }
+ if lr.LearningRate > 10 {
+ lr.LearningRate = 10.0
+ }
+ if lr.Lambda < 0 {
+ lr.Lambda = 0.0
+ }
+ if lr.Iterations <= 0 {
+ lr.Iterations = defaultIterations
+ }
+ if lr.Tolerance <= 0 {
+ lr.Tolerance = defaultTolerance
+ }
+ return lr
+}
+
+// Fit trains via SGD with L2 regularization on feature weights (not bias).
+// Class weights reweight samples; unused in our pipeline (we downsample instead).
+// Returns weights with bias as last element.
+func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) {
+ if len(vectors) == 0 {
+ return nil, fmt.Errorf("cannot train on empty dataset")
+ }
+ if len(vectors) != len(labels) {
+ return nil, fmt.Errorf(
+ "mismatch between number of vectors (%d) and labels (%d)",
+ len(vectors),
+ len(labels),
+ )
+ }
+
+ for i, y := range labels {
+ if y != 0 && y != 1 {
+ return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y)
+ }
+ }
+
+ numFeatures := len(vectors[0])
+ if numFeatures == 0 {
+ return nil, fmt.Errorf("cannot train with zero-length feature vectors")
+ }
+ for i := 1; i < len(vectors); i++ {
+ if len(vectors[i]) != numFeatures {
+ return nil, fmt.Errorf(
+ "inconsistent feature vector length at index %d: got %d, expected %d",
+ i,
+ len(vectors[i]),
+ numFeatures,
+ )
+ }
+ }
+ useUniformWeights := classWeights == nil
+ if useUniformWeights {
+ classWeights = map[float64]float64{0.0: 1.0, 1.0: 1.0}
+ }
+
+ numSamples := float64(len(vectors))
+ var totalWeight float64
+ if useUniformWeights {
+ totalWeight = numSamples
+ } else {
+ for _, y := range labels {
+ totalWeight += classWeights[y]
+ }
+ }
+ if totalWeight == 0 {
+ totalWeight = numSamples // Fallback
+ }
+
+ weights := make([]float64, numFeatures)
+ var bias float64
+
+ prevLoss := math.MaxFloat64
+
+ for i := 0; i < lr.Iterations; i++ {
+ gradWeights := make([]float64, numFeatures)
+ var gradBias float64
+ var currentLoss float64
+
+ for j, x := range vectors {
+ y := labels[j]
+ sampleWeight := classWeights[y]
+
+ z, err := dot(weights, x)
+ if err != nil {
+ return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err)
+ }
+ p := Sigmoid(z + bias)
+
+ // Compute prediction error. This term gets multiplied by each feature value
+ // to accumulate gradients (higher error pushes weights harder).
+ errTerm := p - y
+ for k := 0; k < numFeatures; k++ {
+ gradWeights[k] += sampleWeight * errTerm * x[k]
+ }
+ gradBias += sampleWeight * errTerm
+
+ cp := clamp(p)
+ currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp)))
+ }
+
+ // Update weights with L2 regularization (only on feature weights, not bias).
+ // This pulls weights toward zero, preventing overfitting on small datasets.
+ for k := 0; k < numFeatures; k++ {
+ regularizedGrad := (gradWeights[k] / totalWeight) + (lr.Lambda * weights[k])
+ weights[k] -= lr.LearningRate * regularizedGrad
+ }
+ gradBias /= totalWeight
+ bias -= lr.LearningRate * gradBias
+
+ // Check convergence: if loss change is below tolerance, we're done.
+ // We include the L2 penalty in total loss to assess true convergence.
+ avgLoss := currentLoss / totalWeight
+ var l2Penalty float64
+ for _, w := range weights {
+ l2Penalty += w * w
+ }
+ totalLoss := avgLoss + 0.5*lr.Lambda*l2Penalty
+ if math.Abs(prevLoss-totalLoss) < lr.Tolerance {
+ break
+ }
+ prevLoss = totalLoss
+ }
+
+ // bias is stored as the last element
+ return append(weights, bias), nil
+}
+
+// PredictScore computes the probability for a single vec given weights.
+// the last element of weights is the bias.
+func PredictScore(vector []float64, weights []float64) (float64, error) {
+ if len(weights) == 0 {
+ return 0, fmt.Errorf("weights cannot be empty")
+ }
+ if len(vector) != len(weights)-1 {
+ return 0, fmt.Errorf(
+ "vector length mismatch: expected %d features, got %d",
+ len(weights)-1,
+ len(vector),
+ )
+ }
+
+ for i, v := range vector {
+ if math.IsNaN(v) || math.IsInf(v, 0) {
+ return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, v)
+ }
+ }
+ for i, w := range weights {
+ if math.IsNaN(w) || math.IsInf(w, 0) {
+ return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w)
+ }
+ }
+
+ featureWeights := weights[:len(weights)-1]
+ bias := weights[len(weights)-1]
+
+ z, err := dot(featureWeights, vector)
+ if err != nil {
+ return 0, fmt.Errorf("failed to compute dot product: %w", err)
+ }
+ return Sigmoid(z + bias), nil
+}
+
+
+// ============================================================================
+// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓
+// ┃┃┃┣━┫ ┃ ┣━┫┗━┓
+// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛
+// ============================================================================
+
+
// Sigmoid returns the logistic function 1 / (1 + e^-z).
//
// The two branches are algebraically equal; each one only ever exponentiates
// a non-positive number, so math.Exp cannot overflow for large |z|.
func Sigmoid(z float64) float64 {
	if z < 0 {
		expZ := math.Exp(z)
		return expZ / (1.0 + expZ)
	}
	return 1.0 / (1.0 + math.Exp(-z))
}
+
// dot returns the inner product of a and b, or an error when the two slices
// differ in length.
func dot(a, b []float64) (float64, error) {
	if n, m := len(a), len(b); n != m {
		return 0, fmt.Errorf("vector length mismatch: %d != %d", n, m)
	}
	total := 0.0
	for i, x := range a {
		total += x * b[i]
	}
	return total, nil
}
+
// clamp limits a probability to [1e-15, 1-1e-15] so that taking log(p) or
// log(1-p) never evaluates log(0). Values already inside the range are
// returned unchanged.
func clamp(p float64) float64 {
	const (
		floor   = 1e-15
		ceiling = 1.0 - floor
	)
	switch {
	case p < floor:
		return floor
	case p > ceiling:
		return ceiling
	default:
		return p
	}
}