|           |                                          |                           |
|-----------|------------------------------------------|---------------------------|
| author    | Sam Scholten                             | 2025-12-15 19:34:17 +1000 |
| committer | Sam Scholten                             | 2025-12-15 19:34:59 +1000 |
| commit    | 9f5978186ac3de07f4325975fecf4f538fe713b6 |                           |
| tree      | 41440b703054fe59eb561ba81d80fd60380c1f7a | /core                     |
Init v0.1.0
Diffstat (limited to 'core')
| mode       | file              | lines |
|------------|-------------------|-------|
| -rw-r--r-- | core/constants.go | 21    |
| -rw-r--r-- | core/http.go      | 196   |
| -rw-r--r-- | core/ml.go        | 427   |
| -rw-r--r-- | core/model.go     | 20    |
| -rw-r--r-- | core/scoring.go   | 14    |
| -rw-r--r-- | core/text.go      | 36    |
| -rw-r--r-- | core/types.go     | 84    |
7 files changed, 798 insertions, 0 deletions
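
Taken together, the files in this commit define a small title-classification pipeline: `TFIDFVectorizer` learns a vocabulary with IDF weights, `LogisticRegression` trains on the resulting vectors, `ModelEnvelope` persists the trained state, and `PredictScore`/`ScoreToScale` turn a new title into a probability and a 1-10 display score. The following is a minimal end-to-end sketch, not part of the commit: the module import path is hypothetical, `MinDF` is relaxed to 1 because the toy corpus is tiny (the documented default is 2), and only unigrams are used since `CreateVectorizerFromModel` restores just the vocabulary and feature ordering, not the n-gram range.

```go
package main

import (
	"fmt"
	"log"
	"time"

	"example.com/scholscan/core" // hypothetical import path
)

func main() {
	// Toy training set: positives are paper-like titles, negatives are unrelated headlines.
	titles := []string{
		"Deep learning for protein structure prediction",
		"Graph neural networks for drug discovery",
		"Local council approves new parking rules",
		"Weekend sports roundup and match results",
	}
	labels := []float64{1, 1, 0, 0}

	// Learn vocabulary and IDF values, then vectorize the training titles.
	vec := &core.TFIDFVectorizer{NgramMin: 1, NgramMax: 1, MinDF: 1, MaxDF: 0.8}
	vec.Fit(titles)
	X := vec.Transform(titles)

	// Train the classifier; Validate() fills in defaults for unset hyperparameters.
	lr := (&core.LogisticRegression{Lambda: 0.001}).Validate()
	weights, err := lr.Fit(X, labels, nil) // nil class weights = uniform
	if err != nil {
		log.Fatal(err)
	}

	// Persist everything needed to score later.
	model := &core.ModelEnvelope{
		Algorithm:    "tfidf+logreg",
		Version:      "0.1.0",
		CreatedAt:    time.Now().UTC(),
		Vectorizer:   vec.Vocabulary,
		OrderedVocab: vec.OrderedVocab,
		Weights:      weights,
	}

	// Scoring path: rebuild the vectorizer from the envelope and score one new title.
	scorer := core.CreateVectorizerFromModel(model)
	v := scorer.Transform([]string{"Protein structure prediction with transformers"})
	p, err := core.PredictScore(v[0], model.Weights)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("probability=%.3f display=%d/10\n", p, core.ScoreToScale(p, core.DefaultScoreThreshold))
}
```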
diff --git a/core/constants.go b/core/constants.go new file mode 100644 index 0000000..2dadac4 --- /dev/null +++ b/core/constants.go @@ -0,0 +1,21 @@ +// Default configuration constants. +// +// Timeouts are defensive: 30s for HTTP requests, 5s for graceful shutdown. +// Score threshold 0.5 is neutral; models should learn their own. +// MinTitleLength filters junk/broken titles (<15 chars rarely meaningful). +// ChunkSize 50 balances memory usage vs batch efficiency. +package core + +import "time" + +const ( + DefaultHTTPTimeout = 30 * time.Second + DefaultContextTimeout = 10 * time.Second + DefaultReadTimeout = 30 * time.Second + DefaultWriteTimeout = 30 * time.Second + DefaultIdleTimeout = 120 * time.Second + DefaultShutdownTimeout = 5 * time.Second + DefaultScoreThreshold = 0.5 + MinTitleLength = 15 + DefaultChunkSize = 50 +) diff --git a/core/http.go b/core/http.go new file mode 100644 index 0000000..8629676 --- /dev/null +++ b/core/http.go @@ -0,0 +1,196 @@ +// HTTP client with exponential backoff retry. +// +// Handles transient network failures, timeouts, and rate limiting. +// - Backoff: 500ms → 1s → 2s → 4s max +// - Jitter prevents thundering herd +// - Respects 429 Retry-After header +package core + +import ( + "context" + "errors" + "fmt" + "math/rand" + "net" + "net/http" + "os" + "strconv" + "strings" + "time" +) + + +// ============================================================================ +// ╻ ╻╺┳╸╺┳╸┏━┓ ┏━┓┏━╸╺┳╸┏━┓╻ ╻ +// ┣━┫ ┃ ┃ ┣━┛ ┣┳┛┣╸ ┃ ┣┳┛┗┳┛ +// ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸ ╹ ╹┗╸ ╹ +// ============================================================================ + + +const PoliteUserAgent = "scholscan/1.0 (https://github.com/mrichman/scholscan; mailto:matt@mrichman.net)" + +var DefaultHTTPClient = &http.Client{ + Timeout: 30 * time.Second, +} + +var ( + retryMaxAttempts = 4 + retryInitialBackoff = 500 * time.Millisecond + retryMaxBackoff = 5 * time.Second +) + +// Makes HTTP request with exponential backoff retry +func DoRequestWithRetry( + ctx context.Context, + client *http.Client, + req *http.Request, +) (*http.Response, error) { + if client == nil { + client = DefaultHTTPClient + } + var lastErr error + backoff := retryInitialBackoff + + for attempt := 1; attempt <= retryMaxAttempts; attempt++ { + // Make the request cancellable + reqWithCtx := req.WithContext(ctx) + resp, err := client.Do(reqWithCtx) + if err == nil { + if isRetriableStatus(resp.StatusCode) { + retryAfter := parseRetryAfter(resp.Header.Get("Retry-After")) + _ = resp.Body.Close() + sleep := backoff + if retryAfter > sleep { + sleep = retryAfter + } + + // Add jitter to avoid thundering herd. + jitter := time.Duration(rand.Intn(int(backoff / 2))) + sleep += jitter + + // Make sleep cancellable + timer := time.NewTimer(sleep) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + + backoff = minDuration(backoff*2, retryMaxBackoff) + continue + } + return resp, nil + } + // Check for context cancellation + if ctx.Err() != nil { + return nil, ctx.Err() + } + // Network error: retry on timeouts, context deadline, transient net errors, and HTTP/2 stream errors + if os.IsTimeout(err) || errors.Is(err, context.DeadlineExceeded) || isTransientNetError(err) || isHTTP2StreamErr(err) { + lastErr = err + + // Add jitter to avoid thundering herd. 
+ jitter := time.Duration(rand.Intn(int(backoff / 2))) + sleep := backoff + jitter + + // Make sleep cancellable + timer := time.NewTimer(sleep) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + + backoff = minDuration(backoff*2, retryMaxBackoff) + continue + } + // Non-retriable error + return nil, err + } + if lastErr == nil { + lastErr = fmt.Errorf("request retries exhausted") + } + return nil, lastErr +} + + +// ============================================================================ +// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓ +// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓ +// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛ +// ============================================================================ + + +func isRetriableStatus(code int) bool { + if code == http.StatusTooManyRequests { + return true + } + return code >= 500 && code != http.StatusNotImplemented +} + +func parseRetryAfter(v string) time.Duration { + if v == "" { + return 0 + } + if secs, err := strconv.Atoi(strings.TrimSpace(v)); err == nil && secs > 0 { + return time.Duration(secs) * time.Second + } + if t, err := http.ParseTime(v); err == nil { + if d := time.Until(t); d > 0 { + return d + } + } + return 0 +} + +func minDuration(a, b time.Duration) time.Duration { + if a < b { + return a + } + return b +} + +// isTransientNetError returns true for network errors which are commonly transient, +// such as timeouts and common connection reset/closed cases. +func isTransientNetError(err error) bool { + if err == nil { + return false + } + var ne net.Error + if errors.As(err, &ne) { + if ne.Timeout() { + return true + } + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "use of closed network connection"): + return true + case strings.Contains(msg, "connection reset by peer"): + return true + case strings.Contains(msg, "connection aborted"): + return true + case strings.Contains(msg, "broken pipe"): + return true + case strings.Contains(msg, "eof"): + // Treat unexpected EOFs as transient when occurring at transport level. + return true + default: + return false + } +} + +// isHTTP2StreamErr detects HTTP/2 stream-level errors which are often transient. +func isHTTP2StreamErr(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "stream error") || + strings.Contains(msg, "internal_error") || + strings.Contains(msg, "rst_stream") || + strings.Contains(msg, "goaway") || + strings.Contains(msg, "http2:") +} diff --git a/core/ml.go b/core/ml.go new file mode 100644 index 0000000..afdd2f3 --- /dev/null +++ b/core/ml.go @@ -0,0 +1,427 @@ +// ML implementation: TF-IDF + Logistic Regression for article filtering. +// +// Why title-only: Avoids content scraping overhead, titles are already informative. +// MinDF=2: Removes typos and rare terms that don't generalize. +// MaxDF=0.8: Removes common words that appear in >80% of documents. +// λ=0.001: Light L2 regularization to prevent overfitting on small datasets. 
+// +// Public API: +// - TFIDFVectorizer.Fit(): Learn vocabulary from documents +// - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors +// - LogisticRegression.Fit(): Train classifier on vectors +// - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model +// - PredictScore(): Score article using trained weights +package core + +import ( + "fmt" + "math" + "regexp" + "sort" + "strings" +) + + +// ============================================================================ +// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓ +// ┃┏┛┣╸ ┃ ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛ +// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸ +// ============================================================================ + + +var wordHyphenRegex = regexp.MustCompile("[^a-zA-Z0-9-]+") + +// StopWords: Common words that don't help distinguish articles. +// Why: Reduces noise and improves model generalization. +var stopWords = map[string]struct{}{ + // Single letters and symbols + "s": {}, "-": {}, "0": {}, "1": {}, "2": {}, "3": {}, "4": {}, "5": {}, "6": {}, "7": {}, "8": {}, "9": {}, + + // Common English stop words + "a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {}, "al": {}, "all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "aren't": {}, "as": {}, "at": {}, "be": {}, "because": {}, "been": {}, "before": {}, "being": {}, "below": {}, "between": {}, "both": {}, "but": {}, "by": {}, "can't": {}, "cannot": {}, "could": {}, "couldn't": {}, "did": {}, "didn't": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "don't": {}, "down": {}, "during": {}, "each": {}, "et": {}, "few": {}, "for": {}, "from": {}, "further": {}, "had": {}, "hadn't": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "he's": {}, "her": {}, "here": {}, "here's": {}, "hers": {}, "herself": {}, "him": {}, "himself": {}, "his": {}, "how": {}, "how's": {}, "i": {}, "i'd": {}, "i'll": {}, "i'm": {}, "i've": {}, "if": {}, "in": {}, "into": {}, "is": {}, "isn't": {}, "it": {}, "it's": {}, "its": {}, "itself": {}, "let's": {}, "me": {}, "more": {}, "most": {}, "mustn't": {}, "my": {}, "myself": {}, "no": {}, "nor": {}, "not": {}, "of": {}, "off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {}, "ought": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {}, "same": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "so": {}, "some": {}, "such": {}, "than": {}, "that": {}, "that's": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "there": {}, "there's": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "this": {}, "those": {}, "through": {}, "to": {}, "too": {}, "under": {}, "until": {}, "up": {}, "very": {}, "was": {}, "wasn't": {}, "we": {}, "we'd": {}, "we'll": {}, "we're": {}, "we've": {}, "were": {}, "weren't": {}, "what": {}, "what's": {}, "when": {}, "when's": {}, "where": {}, "where's": {}, "which": {}, "while": {}, "who": {}, "who's": {}, "whom": {}, "why": {}, "why's": {}, "with": {}, "won't": {}, "would": {}, "wouldn't": {}, "you": {}, "you'd": {}, "you'll": {}, "you're": {}, "you've": {}, "your": {}, "yours": {}, "yourself": {}, "yourselves": {}, +} + +type TFIDFVectorizer struct { + Vocabulary map[string]float64 + OrderedVocab []string + NgramMin int + NgramMax int + MinDF int // Minimum document frequency (absolute) + MaxDF float64 // Maximum document frequency (ratio) + VocabCap int +} + +func 
CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer { + return &TFIDFVectorizer{ + Vocabulary: model.Vectorizer, + OrderedVocab: model.OrderedVocab, + } +} + + +// Learns vocabulary and IDF from documents +func (v *TFIDFVectorizer) Fit(documents []string) { + numDocs := len(documents) + docFreqs := make(map[string]int) + + for _, doc := range documents { + unigrams := Tokenize(doc) + ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax) + seenInDoc := make(map[string]struct{}) + for _, ngram := range ngrams { + if _, seen := seenInDoc[ngram]; !seen { + docFreqs[ngram]++ + seenInDoc[ngram] = struct{}{} + } + } + } + + maxDocs := int(v.MaxDF * float64(numDocs)) + filteredVocab := make(map[string]int) + for term, freq := range docFreqs { + if freq >= v.MinDF && freq <= maxDocs { + filteredVocab[term] = freq + } + } + + if v.VocabCap > 0 && len(filteredVocab) > v.VocabCap { + type termFreq struct { + term string + freq int + } + terms := make([]termFreq, 0, len(filteredVocab)) + for term, freq := range filteredVocab { + terms = append(terms, termFreq{term, freq}) + } + sort.Slice(terms, func(i, j int) bool { + return terms[i].freq > terms[j].freq + }) + + cappedTerms := terms[:v.VocabCap] + filteredVocab = make(map[string]int, v.VocabCap) + for _, tf := range cappedTerms { + filteredVocab[tf.term] = tf.freq + } + } + + v.OrderedVocab = make([]string, 0, len(filteredVocab)) + for term := range filteredVocab { + v.OrderedVocab = append(v.OrderedVocab, term) + } + sort.Strings(v.OrderedVocab) // deterministic order + + v.Vocabulary = make(map[string]float64, len(v.OrderedVocab)) + for _, term := range v.OrderedVocab { + // IDF = log(total num of docs / num of docs with term) + idf := math.Log(float64(numDocs) / float64(filteredVocab[term])) + v.Vocabulary[term] = idf + } +} + +// Converts documents to TF-IDF vectors using learned vocabulary +func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 { + vectors := make([][]float64, len(documents)) + + for i, doc := range documents { + unigrams := Tokenize(doc) + ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax) + vector := make([]float64, len(v.OrderedVocab)) + + if len(ngrams) > 0 { + // tf: term frequency (normalized count of each n-gram in document) + tf := make(map[string]float64) + for _, ngram := range ngrams { + tf[ngram]++ + } + numNgrams := float64(len(ngrams)) + for ngram, count := range tf { + tf[ngram] = count / numNgrams + } + + for j, term := range v.OrderedVocab { + if tfValue, ok := tf[term]; ok { + // only score terms that were in our training vocabulary + if idfValue, inVocab := v.Vocabulary[term]; inVocab { + vector[j] = tfValue * idfValue + } + } + } + } + vectors[i] = vector + } + + return vectors +} + +func Tokenize(text string) []string { + text = strings.ToLower(text) + words := wordHyphenRegex.Split(text, -1) + tokens := make([]string, 0, len(words)) + for _, word := range words { + if word == "" { + continue + } + if _, isStopWord := stopWords[word]; isStopWord { + continue + } + tokens = append(tokens, word) + } + return tokens +} + +func generateNgrams(tokens []string, minN, maxN int) []string { + if minN <= 0 { + minN = 1 + } + if maxN < minN { + maxN = minN + } + + numTokens := len(tokens) + + estimatedCap := 0 + for n := minN; n <= maxN; n++ { + if numTokens >= n { + estimatedCap += numTokens - n + 1 + } + } + ngrams := make([]string, 0, estimatedCap) + + for n := minN; n <= maxN; n++ { + if numTokens < n { + continue + } + for i := 0; i <= numTokens-n; i++ { + ngrams = 
append(ngrams, strings.Join(tokens[i:i+n], " ")) + } + } + return ngrams +} + + +// ============================================================================ +// ┏━╸╻ ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓ +// ┃ ┃ ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛ +// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹ ╹┗━╸╹┗╸ +// ============================================================================ + + +// Binary logistic regression with L2 regularization +// Bias term stored separately (not regularized) +type LogisticRegression struct { + LearningRate float64 + Lambda float64 // L2 regularization parameter + Iterations int + Tolerance float64 // Convergence tolerance on loss improvement +} + +// validate checks and clamps hyperparams to reasonable bounds. +func (lr *LogisticRegression) Validate() *LogisticRegression { + const ( + defaultLearningRate = 0.5 + defaultIterations = 500 + defaultTolerance = 0.000001 + ) + + if lr.LearningRate <= 0 { + lr.LearningRate = defaultLearningRate + } + if lr.LearningRate > 10 { + lr.LearningRate = 10.0 + } + if lr.Lambda < 0 { + lr.Lambda = 0.0 + } + if lr.Iterations <= 0 { + lr.Iterations = defaultIterations + } + if lr.Tolerance <= 0 { + lr.Tolerance = defaultTolerance + } + return lr +} + +// Fit trains via SGD with L2 regularization on feature weights (not bias). +// Class weights reweight samples; unused in our pipeline (we downsample instead). +// Returns weights with bias as last element. +func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) { + if len(vectors) == 0 { + return nil, fmt.Errorf("cannot train on empty dataset") + } + if len(vectors) != len(labels) { + return nil, fmt.Errorf( + "mismatch between number of vectors (%d) and labels (%d)", + len(vectors), + len(labels), + ) + } + + for i, y := range labels { + if y != 0 && y != 1 { + return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y) + } + } + + numFeatures := len(vectors[0]) + if numFeatures == 0 { + return nil, fmt.Errorf("cannot train with zero-length feature vectors") + } + for i := 1; i < len(vectors); i++ { + if len(vectors[i]) != numFeatures { + return nil, fmt.Errorf( + "inconsistent feature vector length at index %d: got %d, expected %d", + i, + len(vectors[i]), + numFeatures, + ) + } + } + useUniformWeights := classWeights == nil + if useUniformWeights { + classWeights = map[float64]float64{0.0: 1.0, 1.0: 1.0} + } + + numSamples := float64(len(vectors)) + var totalWeight float64 + if useUniformWeights { + totalWeight = numSamples + } else { + for _, y := range labels { + totalWeight += classWeights[y] + } + } + if totalWeight == 0 { + totalWeight = numSamples // Fallback + } + + weights := make([]float64, numFeatures) + var bias float64 + + prevLoss := math.MaxFloat64 + + for i := 0; i < lr.Iterations; i++ { + gradWeights := make([]float64, numFeatures) + var gradBias float64 + var currentLoss float64 + + for j, x := range vectors { + y := labels[j] + sampleWeight := classWeights[y] + + z, err := dot(weights, x) + if err != nil { + return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err) + } + p := Sigmoid(z + bias) + + // Compute prediction error. This term gets multiplied by each feature value + // to accumulate gradients (higher error pushes weights harder). 
+ errTerm := p - y + for k := 0; k < numFeatures; k++ { + gradWeights[k] += sampleWeight * errTerm * x[k] + } + gradBias += sampleWeight * errTerm + + cp := clamp(p) + currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp))) + } + + // Update weights with L2 regularization (only on feature weights, not bias). + // This pulls weights toward zero, preventing overfitting on small datasets. + for k := 0; k < numFeatures; k++ { + regularizedGrad := (gradWeights[k] / totalWeight) + (lr.Lambda * weights[k]) + weights[k] -= lr.LearningRate * regularizedGrad + } + gradBias /= totalWeight + bias -= lr.LearningRate * gradBias + + // Check convergence: if loss change is below tolerance, we're done. + // We include the L2 penalty in total loss to assess true convergence. + avgLoss := currentLoss / totalWeight + var l2Penalty float64 + for _, w := range weights { + l2Penalty += w * w + } + totalLoss := avgLoss + 0.5*lr.Lambda*l2Penalty + if math.Abs(prevLoss-totalLoss) < lr.Tolerance { + break + } + prevLoss = totalLoss + } + + // bias is stored as the last element + return append(weights, bias), nil +} + +// PredictScore computes the probability for a single vec given weights. +// the last element of weights is the bias. +func PredictScore(vector []float64, weights []float64) (float64, error) { + if len(weights) == 0 { + return 0, fmt.Errorf("weights cannot be empty") + } + if len(vector) != len(weights)-1 { + return 0, fmt.Errorf( + "vector length mismatch: expected %d features, got %d", + len(weights)-1, + len(vector), + ) + } + + for i, v := range vector { + if math.IsNaN(v) || math.IsInf(v, 0) { + return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, v) + } + } + for i, w := range weights { + if math.IsNaN(w) || math.IsInf(w, 0) { + return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w) + } + } + + featureWeights := weights[:len(weights)-1] + bias := weights[len(weights)-1] + + z, err := dot(featureWeights, vector) + if err != nil { + return 0, fmt.Errorf("failed to compute dot product: %w", err) + } + return Sigmoid(z + bias), nil +} + + +// ============================================================================ +// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓ +// ┃┃┃┣━┫ ┃ ┣━┫┗━┓ +// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛ +// ============================================================================ + + +func Sigmoid(z float64) float64 { + if z >= 0 { + return 1.0 / (1.0 + math.Exp(-z)) + } + ez := math.Exp(z) + return ez / (1.0 + ez) +} + +func dot(a, b []float64) (float64, error) { + if len(a) != len(b) { + return 0, fmt.Errorf("vector length mismatch: %d != %d", len(a), len(b)) + } + var sum float64 + for i := range a { + sum += a[i] * b[i] + } + return sum, nil +} + +func clamp(p float64) float64 { + const probabilityClamp = 1e-15 + if p < probabilityClamp { + return probabilityClamp + } + if p > 1.0-probabilityClamp { + return 1.0 - probabilityClamp + } + return p +} diff --git a/core/model.go b/core/model.go new file mode 100644 index 0000000..28f4045 --- /dev/null +++ b/core/model.go @@ -0,0 +1,20 @@ +// Model envelope persists trained model to JSON. Contains Vectorizer for IDF values, +// OrderedVocab for feature ordering, and Weights for logistic regression. +// To score: recreate TFIDFVectorizer, transform, then PredictScore. 
+package core + +import ( + "time" +) + +// ModelEnvelope - complete trained model for scoring articles +type ModelEnvelope struct { + Algorithm string `json:"algorithm"` + Impl string `json:"impl"` + Version string `json:"version"` + CreatedAt time.Time `json:"created_at"` + Meta map[string]any `json:"meta"` + Vectorizer map[string]float64 `json:"vectorizer"` + OrderedVocab []string `json:"ordered_vocab"` + Weights []float64 `json:"weights"` +} diff --git a/core/scoring.go b/core/scoring.go new file mode 100644 index 0000000..9896c80 --- /dev/null +++ b/core/scoring.go @@ -0,0 +1,14 @@ +// Score conversion utilities. +// +// ScoreToScale: Maps probability (0-1) to user-friendly 1-10 scale. +// Why: Users understand "8/10" better than "0.82 probability". +package core + +import "math" + +// ScoreToScale turns probability into 1-10 display score +func ScoreToScale(rawScore, threshold float64) int { + k := 10.0 + adjustedScore := 1.0 / (1.0 + math.Exp(-k*(rawScore-threshold))) + return int(math.Round(1.0 + (adjustedScore * 9.0))) +} diff --git a/core/text.go b/core/text.go new file mode 100644 index 0000000..ef4f861 --- /dev/null +++ b/core/text.go @@ -0,0 +1,36 @@ +// Text processing for RSS feed content. +// Used for web UI previews and search indexing - not ML (title-only scoring). +package core + +import ( + "regexp" + "strings" +) + +// CleanFeedContent strips HTML, normalizes whitespace, truncates to 5KB +func CleanFeedContent(content string) string { + if content == "" { + return "" + } + + content = StripHTMLTags(content) + content = NormalizeSpace(content) + + maxLength := 5000 + if len(content) > maxLength { + content = content[:maxLength] + "..." + } + + return content +} + +// StripHTMLTags removes HTML tags +func StripHTMLTags(content string) string { + re := regexp.MustCompile(`<[^>]*>`) + return re.ReplaceAllString(content, "") +} + +// NormalizeSpace collapses whitespace and trims +func NormalizeSpace(s string) string { + return strings.Join(strings.Fields(strings.TrimSpace(s)), " ") +} diff --git a/core/types.go b/core/types.go new file mode 100644 index 0000000..3bfa311 --- /dev/null +++ b/core/types.go @@ -0,0 +1,84 @@ +// Core type definitions for article filtering. +// +// Article: Represents paper with metadata, URL, title, optional content. +// +// Score, LabelPositive, Classification for ML pipeline state. +// +// Config: Application settings (timeouts, user agent, enrich). +// Command: Interface for CLI subcommands (train, scan, serve). +package core + +import ( + "io" + "time" +) + +// Article represents a single article with enriched metadata and scoring. 
+type Article struct { + // Basic article information + Title string `json:"title"` + Content string `json:"content,omitempty"` + URL string `json:"url"` + + // Enrichment metadata + FetchedAt *time.Time `json:"fetched_at,omitempty"` + PublishedAt *time.Time `json:"published_at,omitempty"` + Source string `json:"source,omitempty"` + + // Machine learning fields + Score *float64 `json:"score,omitempty"` + LabelPositive *bool `json:"label_positive,omitempty"` + Classification string `json:"classification,omitempty"` + + // Additional metadata + Authors []string `json:"authors,omitempty"` + Journal string `json:"journal,omitempty"` + Year *int `json:"year,omitempty"` + DOI string `json:"doi,omitempty"` + + // Raw extracted text from APIs or HTML + // Fields that may populate Title/Content + RawTitle string `json:"raw_title,omitempty"` + RawContent string `json:"raw_content,omitempty"` +} + +// Config represents the application configuration. +type Config struct { + // Default model and threshold + Defaults struct { + Model string `json:"model"` + Threshold *float64 `json:"threshold"` + EventsOut string `json:"events_out"` + } `json:"defaults"` + + // HTTP behavior + UserAgent string `json:"user_agent"` + ContactEmail string `json:"contact_email"` + + // Enrichment settings + Enrich struct { + MinTitleLength int `json:"min_title_length"` + ChunkSize int `json:"chunk_size"` + } `json:"enrich"` + + // API provider settings + Providers struct { + SemanticScholar struct { + APIKey string `json:"api_key"` + } `json:"semantic_scholar"` + } `json:"providers"` +} + +// Command defines the interface that all CLI subcommands must implement. +type Command interface { + // Name returns the command name (e.g., "train", "scan", "clean"). + Name() string + + // Init parses command-line arguments and initializes the command. + // It should return flag.ErrHelp if --help was requested. + Init(args []string) error + + // Run executes the command, reading from stdin and writing to stdout. + // The command should handle its own error reporting to stderr. + Run(stdin io.Reader, stdout io.Writer) error +} |
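
On the HTTP side, the retrying client and the feed-text helpers compose as in the sketch below. This is an illustration only: the feed URL and module import path are placeholders, not part of this commit.

```go
package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"

	"example.com/scholscan/core" // hypothetical import path
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
	defer cancel()

	// Placeholder URL: any feed or landing page to fetch politely.
	req, err := http.NewRequest(http.MethodGet, "https://example.org/feed.xml", nil)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("User-Agent", core.PoliteUserAgent)

	// A nil client falls back to DefaultHTTPClient. 429 and most 5xx responses,
	// plus transient network errors, are retried with jittered exponential backoff.
	resp, err := core.DoRequestWithRetry(ctx, nil, req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// Strip HTML, collapse whitespace, and cap the preview at roughly 5 KB.
	fmt.Println(core.CleanFeedContent(string(body)))
}
```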
