|           |                                          |                           |
|-----------|------------------------------------------|---------------------------|
| author    | Sam Scholten                             | 2025-12-15 19:34:17 +1000 |
| committer | Sam Scholten                             | 2025-12-15 19:34:59 +1000 |
| commit    | 9f5978186ac3de07f4325975fecf4f538fe713b6 |                           |
| tree      | 41440b703054fe59eb561ba81d80fd60380c1f7a | /core                     |
Init v0.1.0
Diffstat (limited to 'core')
| mode       | file              | lines |
|------------|-------------------|-------|
| -rw-r--r-- | core/constants.go | 21    |
| -rw-r--r-- | core/http.go      | 196   |
| -rw-r--r-- | core/ml.go        | 427   |
| -rw-r--r-- | core/model.go     | 20    |
| -rw-r--r-- | core/scoring.go   | 14    |
| -rw-r--r-- | core/text.go      | 36    |
| -rw-r--r-- | core/types.go     | 84    |
7 files changed, 798 insertions, 0 deletions
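
Taken together, the files in this commit define a small title-classification pipeline: `TFIDFVectorizer` learns a vocabulary with IDF weights, `LogisticRegression` trains on the resulting vectors, `ModelEnvelope` persists the trained state, and `PredictScore`/`ScoreToScale` turn a new title into a probability and a 1-10 display score. The following is a minimal end-to-end sketch, not part of the commit: the module import path is hypothetical, `MinDF` is relaxed to 1 because the toy corpus is tiny (the documented default is 2), and only unigrams are used since `CreateVectorizerFromModel` restores just the vocabulary and feature ordering, not the n-gram range.

```go
package main

import (
	"fmt"
	"log"
	"time"

	"example.com/scholscan/core" // hypothetical import path
)

func main() {
	// Toy training set: positives are paper-like titles, negatives are unrelated headlines.
	titles := []string{
		"Deep learning for protein structure prediction",
		"Graph neural networks for drug discovery",
		"Local council approves new parking rules",
		"Weekend sports roundup and match results",
	}
	labels := []float64{1, 1, 0, 0}

	// Learn vocabulary and IDF values, then vectorize the training titles.
	vec := &core.TFIDFVectorizer{NgramMin: 1, NgramMax: 1, MinDF: 1, MaxDF: 0.8}
	vec.Fit(titles)
	X := vec.Transform(titles)

	// Train the classifier; Validate() fills in defaults for unset hyperparameters.
	lr := (&core.LogisticRegression{Lambda: 0.001}).Validate()
	weights, err := lr.Fit(X, labels, nil) // nil class weights = uniform
	if err != nil {
		log.Fatal(err)
	}

	// Persist everything needed to score later.
	model := &core.ModelEnvelope{
		Algorithm:    "tfidf+logreg",
		Version:      "0.1.0",
		CreatedAt:    time.Now().UTC(),
		Vectorizer:   vec.Vocabulary,
		OrderedVocab: vec.OrderedVocab,
		Weights:      weights,
	}

	// Scoring path: rebuild the vectorizer from the envelope and score one new title.
	scorer := core.CreateVectorizerFromModel(model)
	v := scorer.Transform([]string{"Protein structure prediction with transformers"})
	p, err := core.PredictScore(v[0], model.Weights)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("probability=%.3f display=%d/10\n", p, core.ScoreToScale(p, core.DefaultScoreThreshold))
}
```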
diff --git a/core/constants.go b/core/constants.go new file mode 100644 index 0000000..2dadac4 --- /dev/null +++ b/core/constants.go @@ -0,0 +1,21 @@ +// Default configuration constants. +// +// Timeouts are defensive: 30s for HTTP requests, 5s for graceful shutdown. +// Score threshold 0.5 is neutral; models should learn their own. +// MinTitleLength filters junk/broken titles (<15 chars rarely meaningful). +// ChunkSize 50 balances memory usage vs batch efficiency. +package core + +import "time" + +const ( + DefaultHTTPTimeout = 30 * time.Second + DefaultContextTimeout = 10 * time.Second + DefaultReadTimeout = 30 * time.Second + DefaultWriteTimeout = 30 * time.Second + DefaultIdleTimeout = 120 * time.Second + DefaultShutdownTimeout = 5 * time.Second + DefaultScoreThreshold = 0.5 + MinTitleLength = 15 + DefaultChunkSize = 50 +) diff --git a/core/http.go b/core/http.go new file mode 100644 index 0000000..8629676 --- /dev/null +++ b/core/http.go @@ -0,0 +1,196 @@ +// HTTP client with exponential backoff retry. +// +// Handles transient network failures, timeouts, and rate limiting. +// - Backoff: 500ms → 1s → 2s → 4s max +// - Jitter prevents thundering herd +// - Respects 429 Retry-After header +package core + +import ( + "context" + "errors" + "fmt" + "math/rand" + "net" + "net/http" + "os" + "strconv" + "strings" + "time" +) + + +// ============================================================================ +// ╻ ╻╺┳╸╺┳╸┏━┓ ┏━┓┏━╸╺┳╸┏━┓╻ ╻ +// ┣━┫ ┃ ┃ ┣━┛ ┣┳┛┣╸ ┃ ┣┳┛┗┳┛ +// ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸ ╹ ╹┗╸ ╹ +// ============================================================================ + + +const PoliteUserAgent = "scholscan/1.0 (https://github.com/mrichman/scholscan; mailto:matt@mrichman.net)" + +var DefaultHTTPClient = &http.Client{ + Timeout: 30 * time.Second, +} + +var ( + retryMaxAttempts = 4 + retryInitialBackoff = 500 * time.Millisecond + retryMaxBackoff = 5 * time.Second +) + +// Makes HTTP request with exponential backoff retry +func DoRequestWithRetry( + ctx context.Context, + client *http.Client, + req *http.Request, +) (*http.Response, error) { + if client == nil { + client = DefaultHTTPClient + } + var lastErr error + backoff := retryInitialBackoff + + for attempt := 1; attempt <= retryMaxAttempts; attempt++ { + // Make the request cancellable + reqWithCtx := req.WithContext(ctx) + resp, err := client.Do(reqWithCtx) + if err == nil { + if isRetriableStatus(resp.StatusCode) { + retryAfter := parseRetryAfter(resp.Header.Get("Retry-After")) + _ = resp.Body.Close() + sleep := backoff + if retryAfter > sleep { + sleep = retryAfter + } + + // Add jitter to avoid thundering herd. + jitter := time.Duration(rand.Intn(int(backoff / 2))) + sleep += jitter + + // Make sleep cancellable + timer := time.NewTimer(sleep) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + + backoff = minDuration(backoff*2, retryMaxBackoff) + continue + } + return resp, nil + } + // Check for context cancellation + if ctx.Err() != nil { + return nil, ctx.Err() + } + // Network error: retry on timeouts, context deadline, transient net errors, and HTTP/2 stream errors + if os.IsTimeout(err) || errors.Is(err, context.DeadlineExceeded) || isTransientNetError(err) || isHTTP2StreamErr(err) { + lastErr = err + + // Add jitter to avoid thundering herd. 
+ jitter := time.Duration(rand.Intn(int(backoff / 2))) + sleep := backoff + jitter + + // Make sleep cancellable + timer := time.NewTimer(sleep) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + + backoff = minDuration(backoff*2, retryMaxBackoff) + continue + } + // Non-retriable error + return nil, err + } + if lastErr == nil { + lastErr = fmt.Errorf("request retries exhausted") + } + return nil, lastErr +} + + +// ============================================================================ +// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓ +// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓ +// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛ +// ============================================================================ + + +func isRetriableStatus(code int) bool { + if code == http.StatusTooManyRequests { + return true + } + return code >= 500 && code != http.StatusNotImplemented +} + +func parseRetryAfter(v string) time.Duration { + if v == "" { + return 0 + } + if secs, err := strconv.Atoi(strings.TrimSpace(v)); err == nil && secs > 0 { + return time.Duration(secs) * time.Second + } + if t, err := http.ParseTime(v); err == nil { + if d := time.Until(t); d > 0 { + return d + } + } + return 0 +} + +func minDuration(a, b time.Duration) time.Duration { + if a < b { + return a + } + return b +} + +// isTransientNetError returns true for network errors which are commonly transient, +// such as timeouts and common connection reset/closed cases. +func isTransientNetError(err error) bool { + if err == nil { + return false + } + var ne net.Error + if errors.As(err, &ne) { + if ne.Timeout() { + return true + } + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "use of closed network connection"): + return true + case strings.Contains(msg, "connection reset by peer"): + return true + case strings.Contains(msg, "connection aborted"): + return true + case strings.Contains(msg, "broken pipe"): + return true + case strings.Contains(msg, "eof"): + // Treat unexpected EOFs as transient when occurring at transport level. + return true + default: + return false + } +} + +// isHTTP2StreamErr detects HTTP/2 stream-level errors which are often transient. +func isHTTP2StreamErr(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "stream error") || + strings.Contains(msg, "internal_error") || + strings.Contains(msg, "rst_stream") || + strings.Contains(msg, "goaway") || + strings.Contains(msg, "http2:") +} diff --git a/core/ml.go b/core/ml.go new file mode 100644 index 0000000..afdd2f3 --- /dev/null +++ b/core/ml.go @@ -0,0 +1,427 @@ +// ML implementation: TF-IDF + Logistic Regression for article filtering. +// +// Why title-only: Avoids content scraping overhead, titles are already informative. +// MinDF=2: Removes typos and rare terms that don't generalize. +// MaxDF=0.8: Removes common words that appear in >80% of documents. +// λ=0.001: Light L2 regularization to prevent overfitting on small datasets. 
+// +// Public API: +// - TFIDFVectorizer.Fit(): Learn vocabulary from documents +// - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors +// - LogisticRegression.Fit(): Train classifier on vectors +// - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model +// - PredictScore(): Score article using trained weights +package core + +import ( + "fmt" + "math" + "regexp" + "sort" + "strings" +) + + +// ============================================================================ +// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓ +// ┃┏┛┣╸ ┃ ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛ +// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸ +// ============================================================================ + + +var wordHyphenRegex = regexp.MustCompile("[^a-zA-Z0-9-]+") + +// StopWords: Common words that don't help distinguish articles. +// Why: Reduces noise and improves model generalization. +var stopWords = map[string]struct{}{ + // Single letters and symbols + "s": {}, "-": {}, "0": {}, "1": {}, "2": {}, "3": {}, "4": {}, "5": {}, "6": {}, "7": {}, "8": {}, "9": {}, + + // Common English stop words + "a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {}, "al": {}, "all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "aren't": {}, "as": {}, "at": {}, "be": {}, "because": {}, "been": {}, "before": {}, "being": {}, "below": {}, "between": {}, "both": {}, "but": {}, "by": {}, "can't": {}, "cannot": {}, "could": {}, "couldn't": {}, "did": {}, "didn't": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "don't": {}, "down": {}, "during": {}, "each": {}, "et": {}, "few": {}, "for": {}, "from": {}, "further": {}, "had": {}, "hadn't": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "he's": {}, "her": {}, "here": {}, "here's": {}, "hers": {}, "herself": {}, "him": {}, "himself": {}, "his": {}, "how": {}, "how's": {}, "i": {}, "i'd": {}, "i'll": {}, "i'm": {}, "i've": {}, "if": {}, "in": {}, "into": {}, "is": {}, "isn't": {}, "it": {}, "it's": {}, "its": {}, "itself": {}, "let's": {}, "me": {}, "more": {}, "most": {}, "mustn't": {}, "my": {}, "myself": {}, "no": {}, "nor": {}, "not": {}, "of": {}, "off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {}, "ought": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {}, "same": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "so": {}, "some": {}, "such": {}, "than": {}, "that": {}, "that's": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "there": {}, "there's": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "this": {}, "those": {}, "through": {}, "to": {}, "too": {}, "under": {}, "until": {}, "up": {}, "very": {}, "was": {}, "wasn't": {}, "we": {}, "we'd": {}, "we'll": {}, "we're": {}, "we've": {}, "were": {}, "weren't": {}, "what": {}, "what's": {}, "when": {}, "when's": {}, "where": {}, "where's": {}, "which": {}, "while": {}, "who": {}, "who's": {}, "whom": {}, "why": {}, "why's": {}, "with": {}, "won't": {}, "would": {}, "wouldn't": {}, "you": {}, "you'd": {}, "you'll": {}, "you're": {}, "you've": {}, "your": {}, "yours": {}, "yourself": {}, "yourselves": {}, +} + +type TFIDFVectorizer struct { + Vocabulary map[string]float64 + OrderedVocab []string + NgramMin int + NgramMax int + MinDF int // Minimum document frequency (absolute) + MaxDF float64 // Maximum document frequency (ratio) + VocabCap int +} + +func 
CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer { + return &TFIDFVectorizer{ + Vocabulary: model.Vectorizer, + OrderedVocab: model.OrderedVocab, + } +} + + +// Learns vocabulary and IDF from documents +func (v *TFIDFVectorizer) Fit(documents []string) { + numDocs := len(documents) + docFreqs := make(map[string]int) + + for _, doc := range documents { + unigrams := Tokenize(doc) + ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax) + seenInDoc := make(map[string]struct{}) + for _, ngram := range ngrams { + if _, seen := seenInDoc[ngram]; !seen { + docFreqs[ngram]++ + seenInDoc[ngram] = struct{}{} + } + } + } + + maxDocs := int(v.MaxDF * float64(numDocs)) + filteredVocab := make(map[string]int) + for term, freq := range docFreqs { + if freq >= v.MinDF && freq <= maxDocs { + filteredVocab[term] = freq + } + } + + if v.VocabCap > 0 && len(filteredVocab) > v.VocabCap { + type termFreq struct { + term string + freq int + } + terms := make([]termFreq, 0, len(filteredVocab)) + for term, freq := range filteredVocab { + terms = append(terms, termFreq{term, freq}) + } + sort.Slice(terms, func(i, j int) bool { + return terms[i].freq > terms[j].freq + }) + + cappedTerms := terms[:v.VocabCap] + filteredVocab = make(map[string]int, v.VocabCap) + for _, tf := range cappedTerms { + filteredVocab[tf.term] = tf.freq + } + } + + v.OrderedVocab = make([]string, 0, len(filteredVocab)) + for term := range filteredVocab { + v.OrderedVocab = append(v.OrderedVocab, term) + } + sort.Strings(v.OrderedVocab) // deterministic order + + v.Vocabulary = make(map[string]float64, len(v.OrderedVocab)) + for _, term := range v.OrderedVocab { + // IDF = log(total num of docs / num of docs with term) + idf := math.Log(float64(numDocs) / float64(filteredVocab[term])) + v.Vocabulary[term] = idf + } +} + +// Converts documents to TF-IDF vectors using learned vocabulary +func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 { + vectors := make([][]float64, len(documents)) + + for i, doc := range documents { + unigrams := Tokenize(doc) + ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax) + vector := make([]float64, len(v.OrderedVocab)) + + if len(ngrams) > 0 { + // tf: term frequency (normalized count of each n-gram in document) + tf := make(map[string]float64) + for _, ngram := range ngrams { + tf[ngram]++ + } + numNgrams := float64(len(ngrams)) + for ngram, count := range tf { + tf[ngram] = count / numNgrams + } + + for j, term := range v.OrderedVocab { + if tfValue, ok := tf[term]; ok { + // only score terms that were in our training vocabulary + if idfValue, inVocab := v.Vocabulary[term]; inVocab { + vector[j] = tfValue * idfValue + } + } + } + } + vectors[i] = vector + } + + return vectors +} + +func Tokenize(text string) []string { + text = strings.ToLower(text) + words := wordHyphenRegex.Split(text, -1) + tokens := make([]string, 0, len(words)) + for _, word := range words { + if word == "" { + continue + } + if _, isStopWord := stopWords[word]; isStopWord { + continue + } + tokens = append(tokens, word) + } + return tokens +} + +func generateNgrams(tokens []string, minN, maxN int) []string { + if minN <= 0 { + minN = 1 + } + if maxN < minN { + maxN = minN + } + + numTokens := len(tokens) + + estimatedCap := 0 + for n := minN; n <= maxN; n++ { + if numTokens >= n { + estimatedCap += numTokens - n + 1 + } + } + ngrams := make([]string, 0, estimatedCap) + + for n := minN; n <= maxN; n++ { + if numTokens < n { + continue + } + for i := 0; i <= numTokens-n; i++ { + ngrams = 
append(ngrams, strings.Join(tokens[i:i+n], " ")) + } + } + return ngrams +} + + +// ============================================================================ +// ┏━╸╻ ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓ +// ┃ ┃ ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛ +// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹ ╹┗━╸╹┗╸ +// ============================================================================ + + +// Binary logistic regression with L2 regularization +// Bias term stored separately (not regularized) +type LogisticRegression struct { + LearningRate float64 + Lambda float64 // L2 regularization parameter + Iterations int + Tolerance float64 // Convergence tolerance on loss improvement +} + +// validate checks and clamps hyperparams to reasonable bounds. +func (lr *LogisticRegression) Validate() *LogisticRegression { + const ( + defaultLearningRate = 0.5 + defaultIterations = 500 + defaultTolerance = 0.000001 + ) + + if lr.LearningRate <= 0 { + lr.LearningRate = defaultLearningRate + } + if lr.LearningRate > 10 { + lr.LearningRate = 10.0 + } + if lr.Lambda < 0 { + lr.Lambda = 0.0 + } + if lr.Iterations <= 0 { + lr.Iterations = defaultIterations + } + if lr.Tolerance <= 0 { + lr.Tolerance = defaultTolerance + } + return lr +} + +// Fit trains via SGD with L2 regularization on feature weights (not bias). +// Class weights reweight samples; unused in our pipeline (we downsample instead). +// Returns weights with bias as last element. +func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) { + if len(vectors) == 0 { + return nil, fmt.Errorf("cannot train on empty dataset") + } + if len(vectors) != len(labels) { + return nil, fmt.Errorf( + "mismatch between number of vectors (%d) and labels (%d)", + len(vectors), + len(labels), + ) + } + + for i, y := range labels { + if y != 0 && y != 1 { + return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y) + } + } + + numFeatures := len(vectors[0]) + if numFeatures == 0 { + return nil, fmt.Errorf("cannot train with zero-length feature vectors") + } + for i := 1; i < len(vectors); i++ { + if len(vectors[i]) != numFeatures { + return nil, fmt.Errorf( + "inconsistent feature vector length at index %d: got %d, expected %d", + i, + len(vectors[i]), + numFeatures, + ) + } + } + useUniformWeights := classWeights == nil + if useUniformWeights { + classWeights = map[float64]float64{0.0: 1.0, 1.0: 1.0} + } + + numSamples := float64(len(vectors)) + var totalWeight float64 + if useUniformWeights { + totalWeight = numSamples + } else { + for _, y := range labels { + totalWeight += classWeights[y] + } + } + if totalWeight == 0 { + totalWeight = numSamples // Fallback + } + + weights := make([]float64, numFeatures) + var bias float64 + + prevLoss := math.MaxFloat64 + + for i := 0; i < lr.Iterations; i++ { + gradWeights := make([]float64, numFeatures) + var gradBias float64 + var currentLoss float64 + + for j, x := range vectors { + y := labels[j] + sampleWeight := classWeights[y] + + z, err := dot(weights, x) + if err != nil { + return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err) + } + p := Sigmoid(z + bias) + + // Compute prediction error. This term gets multiplied by each feature value + // to accumulate gradients (higher error pushes weights harder). 
+ errTerm := p - y + for k := 0; k < numFeatures; k++ { + gradWeights[k] += sampleWeight * errTerm * x[k] + } + gradBias += sampleWeight * errTerm + + cp := clamp(p) + currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp))) + } + + // Update weights with L2 regularization (only on feature weights, not bias). + // This pulls weights toward zero, preventing overfitting on small datasets. + for k := 0; k < numFeatures; k++ { + regularizedGrad := (gradWeights[k] / totalWeight) + (lr.Lambda * weights[k]) + weights[k] -= lr.LearningRate * regularizedGrad + } + gradBias /= totalWeight + bias -= lr.LearningRate * gradBias + + // Check convergence: if loss change is below tolerance, we're done. + // We include the L2 penalty in total loss to assess true convergence. + avgLoss := currentLoss / totalWeight + var l2Penalty float64 + for _, w := range weights { + l2Penalty += w * w + } + totalLoss := avgLoss + 0.5*lr.Lambda*l2Penalty + if math.Abs(prevLoss-totalLoss) < lr.Tolerance { + break + } + prevLoss = totalLoss + } + + // bias is stored as the last element + return append(weights, bias), nil +} + +// PredictScore computes the probability for a single vec given weights. +// the last element of weights is the bias. +func PredictScore(vector []float64, weights []float64) (float64, error) { + if len(weights) == 0 { + return 0, fmt.Errorf("weights cannot be empty") + } + if len(vector) != len(weights)-1 { + return 0, fmt.Errorf( + "vector length mismatch: expected %d features, got %d", + len(weights)-1, + len(vector), + ) + } + + for i, v := range vector { + if math.IsNaN(v) || math.IsInf(v, 0) { + return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, v) + } + } + for i, w := range weights { + if math.IsNaN(w) || math.IsInf(w, 0) { + return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w) + } + } + + featureWeights := weights[:len(weights)-1] + bias := weights[len(weights)-1] + + z, err := dot(featureWeights, vector) + if err != nil { + return 0, fmt.Errorf("failed to compute dot product: %w", err) + } + return Sigmoid(z + bias), nil +} + + +// ============================================================================ +// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓ +// ┃┃┃┣━┫ ┃ ┣━┫┗━┓ +// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛ +// ============================================================================ + + +func Sigmoid(z float64) float64 { + if z >= 0 { + return 1.0 / (1.0 + math.Exp(-z)) + } + ez := math.Exp(z) + return ez / (1.0 + ez) +} + +func dot(a, b []float64) (float64, error) { + if len(a) != len(b) { + return 0, fmt.Errorf("vector length mismatch: %d != %d", len(a), len(b)) + } + var sum float64 + for i := range a { + sum += a[i] * b[i] + } + return sum, nil +} + +func clamp(p float64) float64 { + const probabilityClamp = 1e-15 + if p < probabilityClamp { + return probabilityClamp + } + if p > 1.0-probabilityClamp { + return 1.0 - probabilityClamp + } + return p +} diff --git a/core/model.go b/core/model.go new file mode 100644 index 0000000..28f4045 --- /dev/null +++ b/core/model.go @@ -0,0 +1,20 @@ +// Model envelope persists trained model to JSON. Contains Vectorizer for IDF values, +// OrderedVocab for feature ordering, and Weights for logistic regression. +// To score: recreate TFIDFVectorizer, transform, then PredictScore. 
+package core + +import ( + "time" +) + +// ModelEnvelope - complete trained model for scoring articles +type ModelEnvelope struct { + Algorithm string `json:"algorithm"` + Impl string `json:"impl"` + Version string `json:"version"` + CreatedAt time.Time `json:"created_at"` + Meta map[string]any `json:"meta"` + Vectorizer map[string]float64 `json:"vectorizer"` + OrderedVocab []string `json:"ordered_vocab"` + Weights []float64 `json:"weights"` +} diff --git a/core/scoring.go b/core/scoring.go new file mode 100644 index 0000000..9896c80 --- /dev/null +++ b/core/scoring.go @@ -0,0 +1,14 @@ +// Score conversion utilities. +// +// ScoreToScale: Maps probability (0-1) to user-friendly 1-10 scale. +// Why: Users understand "8/10" better than "0.82 probability". +package core + +import "math" + +// ScoreToScale turns probability into 1-10 display score +func ScoreToScale(rawScore, threshold float64) int { + k := 10.0 + adjustedScore := 1.0 / (1.0 + math.Exp(-k*(rawScore-threshold))) + return int(math.Round(1.0 + (adjustedScore * 9.0))) +} diff --git a/core/text.go b/core/text.go new file mode 100644 index 0000000..ef4f861 --- /dev/null +++ b/core/text.go @@ -0,0 +1,36 @@ +// Text processing for RSS feed content. +// Used for web UI previews and search indexing - not ML (title-only scoring). +package core + +import ( + "regexp" + "strings" +) + +// CleanFeedContent strips HTML, normalizes whitespace, truncates to 5KB +func CleanFeedContent(content string) string { + if content == "" { + return "" + } + + content = StripHTMLTags(content) + content = NormalizeSpace(content) + + maxLength := 5000 + if len(content) > maxLength { + content = content[:maxLength] + "..." + } + + return content +} + +// StripHTMLTags removes HTML tags +func StripHTMLTags(content string) string { + re := regexp.MustCompile(`<[^>]*>`) + return re.ReplaceAllString(content, "") +} + +// NormalizeSpace collapses whitespace and trims +func NormalizeSpace(s string) string { + return strings.Join(strings.Fields(strings.TrimSpace(s)), " ") +} diff --git a/core/types.go b/core/types.go new file mode 100644 index 0000000..3bfa311 --- /dev/null +++ b/core/types.go @@ -0,0 +1,84 @@ +// Core type definitions for article filtering. +// +// Article: Represents paper with metadata, URL, title, optional content. +// +// Score, LabelPositive, Classification for ML pipeline state. +// +// Config: Application settings (timeouts, user agent, enrich). +// Command: Interface for CLI subcommands (train, scan, serve). +package core + +import ( + "io" + "time" +) + +// Article represents a single article with enriched metadata and scoring. 
+type Article struct { + // Basic article information + Title string `json:"title"` + Content string `json:"content,omitempty"` + URL string `json:"url"` + + // Enrichment metadata + FetchedAt *time.Time `json:"fetched_at,omitempty"` + PublishedAt *time.Time `json:"published_at,omitempty"` + Source string `json:"source,omitempty"` + + // Machine learning fields + Score *float64 `json:"score,omitempty"` + LabelPositive *bool `json:"label_positive,omitempty"` + Classification string `json:"classification,omitempty"` + + // Additional metadata + Authors []string `json:"authors,omitempty"` + Journal string `json:"journal,omitempty"` + Year *int `json:"year,omitempty"` + DOI string `json:"doi,omitempty"` + + // Raw extracted text from APIs or HTML + // Fields that may populate Title/Content + RawTitle string `json:"raw_title,omitempty"` + RawContent string `json:"raw_content,omitempty"` +} + +// Config represents the application configuration. +type Config struct { + // Default model and threshold + Defaults struct { + Model string `json:"model"` + Threshold *float64 `json:"threshold"` + EventsOut string `json:"events_out"` + } `json:"defaults"` + + // HTTP behavior + UserAgent string `json:"user_agent"` + ContactEmail string `json:"contact_email"` + + // Enrichment settings + Enrich struct { + MinTitleLength int `json:"min_title_length"` + ChunkSize int `json:"chunk_size"` + } `json:"enrich"` + + // API provider settings + Providers struct { + SemanticScholar struct { + APIKey string `json:"api_key"` + } `json:"semantic_scholar"` + } `json:"providers"` +} + +// Command defines the interface that all CLI subcommands must implement. +type Command interface { + // Name returns the command name (e.g., "train", "scan", "clean"). + Name() string + + // Init parses command-line arguments and initializes the command. + // It should return flag.ErrHelp if --help was requested. + Init(args []string) error + + // Run executes the command, reading from stdin and writing to stdout. + // The command should handle its own error reporting to stderr. + Run(stdin io.Reader, stdout io.Writer) error +} |
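
On the HTTP side, the retrying client and the feed-text helpers compose as in the sketch below. This is an illustration only: the feed URL and module import path are placeholders, not part of this commit.

```go
package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"

	"example.com/scholscan/core" // hypothetical import path
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
	defer cancel()

	// Placeholder URL: any feed or landing page to fetch politely.
	req, err := http.NewRequest(http.MethodGet, "https://example.org/feed.xml", nil)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("User-Agent", core.PoliteUserAgent)

	// A nil client falls back to DefaultHTTPClient. 429 and most 5xx responses,
	// plus transient network errors, are retried with jittered exponential backoff.
	resp, err := core.DoRequestWithRetry(ctx, nil, req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// Strip HTML, collapse whitespace, and cap the preview at roughly 5 KB.
	fmt.Println(core.CleanFeedContent(string(body)))
}
```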
