core/scoring.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14

// Score conversion utilities.
//
// ScoreToScale: Maps probability (0-1) to user-friendly 1-10 scale.
// Why: Users understand "8/10" better than "0.82 probability".
package core

import "math"

// ScoreToScale converts a raw probability into a 1-10 display score.
//
// The gap between rawScore and threshold is fed through a logistic curve with
// a fixed steepness of 10, so a score equal to threshold sits at the curve's
// midpoint. The curve output (0..1) is then stretched linearly onto 1..10 and
// rounded to the nearest integer.
func ScoreToScale(rawScore, threshold float64) int {
	const steepness = 10.0

	offset := rawScore - threshold
	curve := 1.0 / (1.0 + math.Exp(-steepness*offset))
	scaled := 1.0 + curve*9.0
	return int(math.Round(scaled))
}
// ML implementation: TF-IDF + Logistic Regression for article filtering.
//
// Why title-only: Avoids content scraping overhead, titles are already informative.
// MinDF=2: Removes typos and rare terms that don't generalize.
// MaxDF=0.8: Removes common words that appear in >80% of documents.
// λ=0.001: Light L2 regularization to prevent overfitting on small datasets.
//
// Public API:
//   - TFIDFVectorizer.Fit(): Learn vocabulary from documents
//   - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors
//   - LogisticRegression.Fit(): Train classifier on vectors
//   - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model
//   - PredictScore(): Score article using trained weights
package core

import (
	"fmt"
	"math"
	"regexp"
	"sort"
	"strings"
)


// ============================================================================
// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓
// ┃┏┛┣╸ ┃   ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛
// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸
// ============================================================================


// wordHyphenRegex matches every run of characters that is not an ASCII
// letter, digit, or hyphen. Tokenize splits on these runs, so hyphenated
// terms survive as single tokens.
var wordHyphenRegex = regexp.MustCompile(`[^a-zA-Z0-9-]+`)

// stopWords is the set of tokens that Tokenize drops because they do little
// to tell articles apart; removing them reduces noise and helps the model
// generalize.
//
// NOTE(review): Tokenize splits text on apostrophes before consulting this
// set, so the contraction keys (e.g. "aren't") do not look reachable from
// that path - confirm before relying on them.
var stopWords = map[string]struct{}{
	// Single characters: the lone letter "s", a bare hyphen, and the digits.
	"s": {}, "-": {},
	"0": {}, "1": {}, "2": {}, "3": {}, "4": {}, "5": {}, "6": {}, "7": {}, "8": {}, "9": {},

	// Common English stop words, one line per initial letter.
	"a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {}, "al": {}, "all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "aren't": {}, "as": {}, "at": {},
	"be": {}, "because": {}, "been": {}, "before": {}, "being": {}, "below": {}, "between": {}, "both": {}, "but": {}, "by": {},
	"can't": {}, "cannot": {}, "could": {}, "couldn't": {},
	"did": {}, "didn't": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "don't": {}, "down": {}, "during": {},
	"each": {}, "et": {},
	"few": {}, "for": {}, "from": {}, "further": {},
	"had": {}, "hadn't": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "he's": {}, "her": {}, "here": {}, "here's": {}, "hers": {}, "herself": {}, "him": {}, "himself": {}, "his": {}, "how": {}, "how's": {},
	"i": {}, "i'd": {}, "i'll": {}, "i'm": {}, "i've": {}, "if": {}, "in": {}, "into": {}, "is": {}, "isn't": {}, "it": {}, "it's": {}, "its": {}, "itself": {},
	"let's": {},
	"me": {}, "more": {}, "most": {}, "mustn't": {}, "my": {}, "myself": {},
	"no": {}, "nor": {}, "not": {},
	"of": {}, "off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {}, "ought": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {},
	"same": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "so": {}, "some": {}, "such": {},
	"than": {}, "that": {}, "that's": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "there": {}, "there's": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "this": {}, "those": {}, "through": {}, "to": {}, "too": {},
	"under": {}, "until": {}, "up": {},
	"very": {},
	"was": {}, "wasn't": {}, "we": {}, "we'd": {}, "we'll": {}, "we're": {}, "we've": {}, "were": {}, "weren't": {}, "what": {}, "what's": {}, "when": {}, "when's": {}, "where": {}, "where's": {}, "which": {}, "while": {}, "who": {}, "who's": {}, "whom": {}, "why": {}, "why's": {}, "with": {}, "won't": {}, "would": {}, "wouldn't": {},
	"you": {}, "you'd": {}, "you'll": {}, "you're": {}, "you've": {}, "your": {}, "yours": {}, "yourself": {}, "yourselves": {},
}

// TFIDFVectorizer turns text documents into TF-IDF feature vectors.
//
// Fit fills Vocabulary and OrderedVocab from a corpus; Transform then uses
// them to vectorize documents.
type TFIDFVectorizer struct {
	// Vocabulary maps each retained term to its IDF weight.
	Vocabulary map[string]float64
	// OrderedVocab lists the retained terms; a term's position is its column
	// in the vectors produced by Transform.
	OrderedVocab []string
	// NgramMin and NgramMax bound the n-gram sizes built from each
	// document's tokens.
	NgramMin, NgramMax int
	// MinDF is the minimum document frequency a term needs (absolute count).
	MinDF int
	// MaxDF is the maximum document frequency a term may have (ratio of the
	// corpus size).
	MaxDF float64
	// VocabCap, when positive, caps the number of terms Fit keeps.
	VocabCap int
}

// CreateVectorizerFromModel rebuilds a vectorizer from a saved model by
// reusing the model's Vectorizer map as the vocabulary and its OrderedVocab as
// the column order. Both are shared with the model, not copied.
//
// Only Vocabulary and OrderedVocab are set; NgramMin, NgramMax, MinDF, MaxDF
// and VocabCap keep their zero values.
//
// NOTE(review): Transform passes NgramMin/NgramMax to generateNgrams, which
// falls back to unigrams when both are zero. A model trained with multi-word
// n-grams would therefore never match those terms unless the caller sets the
// range afterwards - confirm this is intended.
func CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer {
	vectorizer := new(TFIDFVectorizer)
	vectorizer.Vocabulary = model.Vectorizer
	vectorizer.OrderedVocab = model.OrderedVocab
	return vectorizer
}


// Fit learns the vocabulary and IDF weights from documents.
//
// Each document is tokenized and expanded into n-grams (NgramMin..NgramMax).
// A term is kept when its document frequency df satisfies
// MinDF <= df <= int(MaxDF * len(documents)); note that a zero MaxDF
// therefore rejects every term. When VocabCap is positive and more terms
// survive the filter, only the VocabCap terms with the highest df are kept.
// Ties at the cut-off are broken alphabetically so the same corpus always
// yields the same vocabulary.
//
// On return OrderedVocab holds the kept terms in sorted order and Vocabulary
// maps each of them to ln(len(documents) / df).
func (v *TFIDFVectorizer) Fit(documents []string) {
	numDocs := len(documents)

	// Document frequency: in how many documents each n-gram occurs at least once.
	docFreqs := make(map[string]int)
	for _, doc := range documents {
		ngrams := generateNgrams(Tokenize(doc), v.NgramMin, v.NgramMax)
		seenInDoc := make(map[string]struct{}, len(ngrams))
		for _, ngram := range ngrams {
			if _, seen := seenInDoc[ngram]; seen {
				continue
			}
			seenInDoc[ngram] = struct{}{}
			docFreqs[ngram]++
		}
	}

	// Drop terms that are too rare (below MinDF) or too common (above MaxDF).
	maxDocs := int(v.MaxDF * float64(numDocs))
	terms := make([]string, 0, len(docFreqs))
	for term, freq := range docFreqs {
		if freq >= v.MinDF && freq <= maxDocs {
			terms = append(terms, term)
		}
	}

	if v.VocabCap > 0 && len(terms) > v.VocabCap {
		// Most frequent first. The alphabetical tie-break matters: terms come
		// out of a map in random order and sort.Slice is not stable, so
		// without it equally frequent terms would be picked at random.
		sort.Slice(terms, func(i, j int) bool {
			fi, fj := docFreqs[terms[i]], docFreqs[terms[j]]
			if fi != fj {
				return fi > fj
			}
			return terms[i] < terms[j]
		})
		terms = terms[:v.VocabCap:v.VocabCap]
	}
	sort.Strings(terms) // deterministic column order

	v.OrderedVocab = terms
	v.Vocabulary = make(map[string]float64, len(terms))
	for _, term := range terms {
		// IDF = log(total number of docs / number of docs containing the term)
		v.Vocabulary[term] = math.Log(float64(numDocs) / float64(docFreqs[term]))
	}
}

// Transform converts each document into a TF-IDF vector laid out in
// OrderedVocab order and returns one vector per document.
//
// A document's term frequency is the n-gram's count divided by the total
// number of n-grams in that document. Only terms that are both listed in
// OrderedVocab and present in Vocabulary receive a value (tf * idf); every
// other column stays zero, as does the whole vector of a document that yields
// no n-grams.
func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 {
	vectors := make([][]float64, 0, len(documents))

	for _, doc := range documents {
		row := make([]float64, len(v.OrderedVocab))
		ngrams := generateNgrams(Tokenize(doc), v.NgramMin, v.NgramMax)

		if total := float64(len(ngrams)); total > 0 {
			// Raw occurrence count of every n-gram in this document.
			counts := make(map[string]float64, len(ngrams))
			for _, ngram := range ngrams {
				counts[ngram]++
			}

			for col, term := range v.OrderedVocab {
				count, present := counts[term]
				if !present {
					continue
				}
				// Terms without a learned IDF are left at zero.
				if idf, known := v.Vocabulary[term]; known {
					row[col] = (count / total) * idf
				}
			}
		}
		vectors = append(vectors, row)
	}

	return vectors
}

// Tokenize lower-cases text, splits it with wordHyphenRegex (runs of anything
// other than ASCII letters, digits and hyphens act as separators), and
// returns the remaining pieces in their original order with empty pieces and
// stop words removed. The result is never nil.
func Tokenize(text string) []string {
	pieces := wordHyphenRegex.Split(strings.ToLower(text), -1)

	kept := make([]string, 0, len(pieces))
	for _, piece := range pieces {
		_, isStop := stopWords[piece]
		if piece != "" && !isStop {
			kept = append(kept, piece)
		}
	}
	return kept
}

// generateNgrams returns every contiguous n-gram of tokens for n from minN to
// maxN, each joined with a single space. Results are ordered by n-gram size
// first and by starting position second.
//
// A minN of zero or less is treated as 1, and a maxN below minN is raised to
// minN. Sizes larger than len(tokens) contribute nothing. The result is never
// nil.
func generateNgrams(tokens []string, minN, maxN int) []string {
	if minN < 1 {
		minN = 1
	}
	if maxN < minN {
		maxN = minN
	}

	// Count the output first so the result slice is allocated exactly once.
	total := 0
	for size := minN; size <= maxN && size <= len(tokens); size++ {
		total += len(tokens) - size + 1
	}

	out := make([]string, 0, total)
	for size := minN; size <= maxN && size <= len(tokens); size++ {
		for end := size; end <= len(tokens); end++ {
			out = append(out, strings.Join(tokens[end-size:end], " "))
		}
	}
	return out
}


// ============================================================================
// ┏━╸╻  ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓
// ┃  ┃  ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛
// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹  ╹┗━╸╹┗╸
// ============================================================================


// LogisticRegression holds the hyperparameters for training a binary
// logistic-regression classifier with L2 regularization. The bias term is kept
// separate from the feature weights and is not regularized.
type LogisticRegression struct {
	// LearningRate is the step size applied to every gradient update.
	LearningRate float64
	// Lambda is the L2 regularization strength on the feature weights.
	Lambda float64
	// Iterations is the maximum number of training passes.
	Iterations int
	// Tolerance ends training early once the loss changes by less than this
	// amount between two consecutive iterations.
	Tolerance float64
}

// Validate clamps the hyperparameters to usable bounds in place and returns
// the receiver so calls can be chained.
//
//   - LearningRate: non-positive or NaN becomes 0.5; values above 10 become 10.
//   - Lambda: negative or NaN becomes 0.
//   - Iterations: non-positive becomes 500.
//   - Tolerance: non-positive or NaN becomes 0.000001.
func (lr *LogisticRegression) Validate() *LogisticRegression {
	const (
		defaultLearningRate = 0.5
		maxLearningRate     = 10.0
		defaultIterations   = 500
		defaultTolerance    = 0.000001
	)

	// NaN compares false against everything, so range checks alone would let
	// it through; it is tested for explicitly on every float field.
	switch {
	case math.IsNaN(lr.LearningRate) || lr.LearningRate <= 0:
		lr.LearningRate = defaultLearningRate
	case lr.LearningRate > maxLearningRate:
		lr.LearningRate = maxLearningRate
	}
	if math.IsNaN(lr.Lambda) || lr.Lambda < 0 {
		lr.Lambda = 0.0
	}
	if lr.Iterations <= 0 {
		lr.Iterations = defaultIterations
	}
	if math.IsNaN(lr.Tolerance) || lr.Tolerance <= 0 {
		lr.Tolerance = defaultTolerance
	}
	return lr
}

// Fit trains the classifier with full-batch gradient descent and returns the
// learned feature weights with the bias appended as the last element.
//
// vectors holds one feature vector per sample (all of the same non-zero
// length) and labels the matching targets, each exactly 0 or 1.
//
// classWeights optionally reweights samples by label. nil means uniform
// weights, and a label missing from the map gets weight 1 rather than
// silently dropping every sample of that class. The caller's map is never
// modified. Unused in our pipeline (we downsample instead).
//
// L2 regularization (Lambda) is applied to the feature weights only, never to
// the bias. Training runs for at most Iterations passes and stops early once
// the regularized loss changes by less than Tolerance between two consecutive
// passes.
//
// An error is returned for an empty dataset, a vectors/labels length
// mismatch, a label other than 0 or 1, zero-length feature vectors, or
// feature vectors of differing lengths.
func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) {
	if len(vectors) == 0 {
		return nil, fmt.Errorf("cannot train on empty dataset")
	}
	if len(vectors) != len(labels) {
		return nil, fmt.Errorf(
			"mismatch between number of vectors (%d) and labels (%d)",
			len(vectors),
			len(labels),
		)
	}

	for i, y := range labels {
		if y != 0 && y != 1 {
			return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y)
		}
	}

	numFeatures := len(vectors[0])
	if numFeatures == 0 {
		return nil, fmt.Errorf("cannot train with zero-length feature vectors")
	}
	for i := 1; i < len(vectors); i++ {
		if len(vectors[i]) != numFeatures {
			return nil, fmt.Errorf(
				"inconsistent feature vector length at index %d: got %d, expected %d",
				i,
				len(vectors[i]),
				numFeatures,
			)
		}
	}

	// Resolve the two class weights once. Reading a nil map is safe in Go, so
	// the nil and "key missing" cases both fall back to a weight of 1.
	negWeight, posWeight := 1.0, 1.0
	if w, ok := classWeights[0]; ok {
		negWeight = w
	}
	if w, ok := classWeights[1]; ok {
		posWeight = w
	}

	// Gradients and loss are normalized by the total sample weight.
	numSamples := float64(len(vectors))
	var totalWeight float64
	for _, y := range labels {
		if y == 1 {
			totalWeight += posWeight
		} else {
			totalWeight += negWeight
		}
	}
	if totalWeight == 0 {
		totalWeight = numSamples // Fallback
	}

	weights := make([]float64, numFeatures)
	gradWeights := make([]float64, numFeatures) // reused across iterations
	var bias float64

	prevLoss := math.MaxFloat64

	for iter := 0; iter < lr.Iterations; iter++ {
		for k := range gradWeights {
			gradWeights[k] = 0
		}
		var gradBias, currentLoss float64

		for j, x := range vectors {
			y := labels[j]
			sampleWeight := negWeight
			if y == 1 {
				sampleWeight = posWeight
			}

			z, err := dot(weights, x)
			if err != nil {
				return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err)
			}
			p := Sigmoid(z + bias)

			// Weighted prediction error. It is multiplied by each feature
			// value to accumulate the gradient (larger errors push harder).
			scaledErr := sampleWeight * (p - y)
			for k, xk := range x {
				gradWeights[k] += scaledErr * xk
			}
			gradBias += scaledErr

			// Clamp before the logarithm so p == 0 or p == 1 cannot yield Inf.
			cp := clamp(p)
			currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp)))
		}

		// Gradient step with L2 regularization on the feature weights only;
		// the penalty pulls weights toward zero to limit overfitting.
		for k := range weights {
			regularizedGrad := gradWeights[k]/totalWeight + lr.Lambda*weights[k]
			weights[k] -= lr.LearningRate * regularizedGrad
		}
		bias -= lr.LearningRate * (gradBias / totalWeight)

		// Convergence check on the regularized loss (data loss + L2 penalty).
		var l2Penalty float64
		for _, w := range weights {
			l2Penalty += w * w
		}
		totalLoss := currentLoss/totalWeight + 0.5*lr.Lambda*l2Penalty
		if math.Abs(prevLoss-totalLoss) < lr.Tolerance {
			break
		}
		prevLoss = totalLoss
	}

	// bias is stored as the last element
	return append(weights, bias), nil
}

// PredictScore returns the predicted probability for one feature vector.
//
// weights holds one weight per feature followed by the bias as its last
// element, so vector must be exactly one element shorter than weights. The
// result is Sigmoid(weights·vector + bias).
//
// An error is returned when weights is empty, when the lengths do not line
// up, or when either slice contains NaN or an infinity (vector is checked
// first).
func PredictScore(vector []float64, weights []float64) (float64, error) {
	numFeatures := len(weights) - 1
	switch {
	case len(weights) == 0:
		return 0, fmt.Errorf("weights cannot be empty")
	case len(vector) != numFeatures:
		return 0, fmt.Errorf(
			"vector length mismatch: expected %d features, got %d",
			numFeatures,
			len(vector),
		)
	}

	// Reject non-finite inputs up front so they cannot poison the score.
	for i, x := range vector {
		if math.IsNaN(x) || math.IsInf(x, 0) {
			return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, x)
		}
	}
	for i, w := range weights {
		if math.IsNaN(w) || math.IsInf(w, 0) {
			return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w)
		}
	}

	z, err := dot(weights[:numFeatures], vector)
	if err != nil {
		return 0, fmt.Errorf("failed to compute dot product: %w", err)
	}
	return Sigmoid(z + weights[numFeatures]), nil
}


// ============================================================================
// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓
// ┃┃┃┣━┫ ┃ ┣━┫┗━┓
// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛
// ============================================================================


// Sigmoid returns the logistic function 1 / (1 + e^-z).
//
// The two branches are algebraically equal; each one only ever exponentiates
// a non-positive number, so math.Exp cannot overflow for large |z|.
func Sigmoid(z float64) float64 {
	if z < 0 {
		e := math.Exp(z)
		return e / (1.0 + e)
	}
	return 1.0 / (1.0 + math.Exp(-z))
}

// dot returns the dot product of a and b, or an error when the two slices
// have different lengths. Two empty slices yield 0.
func dot(a, b []float64) (float64, error) {
	if na, nb := len(a), len(b); na != nb {
		return 0, fmt.Errorf("vector length mismatch: %d != %d", na, nb)
	}

	total := 0.0
	for i, x := range a {
		total += x * b[i]
	}
	return total, nil
}

// clamp confines a probability to [1e-15, 1 - 1e-15] so that taking the
// logarithm of p or of 1-p never evaluates log(0). Values already inside the
// range are returned unchanged.
func clamp(p float64) float64 {
	const (
		lower = 1e-15
		upper = 1.0 - lower
	)

	switch {
	case p < lower:
		return lower
	case p > upper:
		return upper
	default:
		return p
	}
}