aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKlimperfix2025-10-03 22:57:37 +0200
committerKlimperfix2025-10-03 23:58:14 +0200
commit624977bb9fd209b0c7c5f60a1332718de1d460d4 (patch)
tree9a9fe5fcf9a087c822de48800c6d856ad4f14a71
parent161fc7010cb863e1af534ce1d173136401816a32 (diff)
downloadbibiman-624977bb9fd209b0c7c5f60a1332718de1d460d4.tar.gz
bibiman-624977bb9fd209b0c7c5f60a1332718de1d460d4.zip
macro-sani: started impl new algorithm
-rw-r--r--Cargo.lock146
-rw-r--r--Cargo.toml4
-rw-r--r--src/bibiman/bibisetup.rs76
-rw-r--r--src/bibiman/sanitize.rs94
-rw-r--r--src/bibiman/sanitize/optimized_sanitize.rs86
5 files changed, 241 insertions, 165 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 535b929..22a5a48 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -90,6 +90,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
+name = "beef"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1"
+
+[[package]]
name = "bibiman"
version = "0.14.1"
dependencies = [
@@ -99,13 +105,14 @@ dependencies = [
"crossterm",
"dirs",
"editor-command",
- "fancy-regex",
"figment",
"futures",
"itertools",
"lexopt",
+ "logos",
"nucleo-matcher",
"owo-colors",
+ "phf",
"rand",
"ratatui",
"regex",
@@ -114,7 +121,6 @@ dependencies = [
"tokio",
"tokio-util",
"tui-input",
- "unicodeit",
"ureq",
"walkdir",
]
@@ -133,21 +139,6 @@ dependencies = [
]
[[package]]
-name = "bit-set"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
-dependencies = [
- "bit-vec",
-]
-
-[[package]]
-name = "bit-vec"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
-
-[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -427,17 +418,6 @@ dependencies = [
]
[[package]]
-name = "fancy-regex"
-version = "0.16.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
-dependencies = [
- "bit-set",
- "regex-automata",
- "regex-syntax",
-]
-
-[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -894,6 +874,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
+name = "logos"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff472f899b4ec2d99161c51f60ff7075eeb3097069a36050d8037a6325eb8154"
+dependencies = [
+ "logos-derive",
+]
+
+[[package]]
+name = "logos-codegen"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "192a3a2b90b0c05b27a0b2c43eecdb7c415e29243acc3f89cc8247a5b693045c"
+dependencies = [
+ "beef",
+ "fnv",
+ "lazy_static",
+ "proc-macro2",
+ "quote",
+ "regex-syntax",
+ "rustc_version",
+ "syn",
+]
+
+[[package]]
+name = "logos-derive"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "605d9697bcd5ef3a42d38efc51541aa3d6a4a25f7ab6d1ed0da5ac632a26b470"
+dependencies = [
+ "logos-codegen",
+]
+
+[[package]]
name = "lru"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1127,6 +1141,49 @@ dependencies = [
]
[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_macros",
+ "phf_shared",
+ "serde",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
+[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1330,6 +1387,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f"
[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
name = "rustix"
version = "0.38.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1418,6 +1484,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
+name = "semver"
+version = "1.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
+
+[[package]]
name = "serde"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1504,6 +1576,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]]
+name = "siphasher"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
+
+[[package]]
name = "slab"
version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1892,18 +1970,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]]
-name = "unicodeit"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1069c222ea63347e2e59763aa12d32c9c6a4e595931c7724a769f6a75bfbc553"
-dependencies = [
- "aho-corasick",
- "cfg-if",
- "memchr",
- "regex",
-]
-
-[[package]]
name = "unscanny"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 2d596de..a01a7e7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -39,5 +39,5 @@ ureq = "2.12.1"
serde = { version = "1.0.217", features = ["serde_derive"] }
figment = { version = "0.10.19", features = [ "toml", "test" ]}
owo-colors = "4.2.2"
-unicodeit = { version = "0.2.0", features = ["naive-impl"] }
-fancy-regex = "0.16.2"
+logos = "0.15.1"
+phf = { version = "0.13.1", features = ["macros"] }
diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs
index 48046e9..37b0b01 100644
--- a/src/bibiman/bibisetup.rs
+++ b/src/bibiman/bibisetup.rs
@@ -26,7 +26,7 @@ use std::{fs, path::PathBuf};
use walkdir::WalkDir;
use crate::app;
-use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata};
+use crate::bibiman::sanitize::sanitize_one_bibidata;
use crate::cliargs::{self};
use crate::config::BibiConfig;
@@ -159,9 +159,6 @@ impl BibiData {
/// Generates the SanitizedBibiData for the BibiData.
///
/// Consumes self and returns a new BibiData struct.
- ///
- /// If multiple SanitizedBibiData are to be generated,
- /// one should use the [`mass_sanitize`] function instead.
pub fn gen_sanitized(mut self) -> Self {
self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self));
self
@@ -325,42 +322,41 @@ impl BibiSetup {
// this may cause longer startup-load-times.
//
//
- mass_sanitize(
- citekeys
- .iter()
- .enumerate()
- .map(|(i, k)| {
- let filepaths: (Option<Vec<OsString>>, bool) =
- { Self::get_filepath(k, bibliography, &mut pdf_files) };
-
- BibiData {
- id: i as u32,
- authors: Self::get_authors(k, bibliography),
- short_author: String::new(),
- title: Self::get_title(k, bibliography),
- year: Self::get_year(k, bibliography),
- custom_field: (
- cfg.general.custom_column.clone(),
- Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
- ),
- keywords: Self::get_keywords(k, bibliography),
- citekey: k.to_owned(),
- abstract_text: Self::get_abstract(k, bibliography),
- doi_url: Self::get_weblink(k, bibliography),
- filepath: filepaths.0,
- file_field: filepaths.1,
- subtitle: Self::get_subtitle(k, bibliography),
- notes: if note_files.is_some() {
- Self::get_notepath(k, &mut note_files, &ext)
- } else {
- None
- },
- symbols: [None, None, None],
- sanitized_bibi_data: None,
- }
- })
- .collect(),
- )
+ citekeys
+ .iter()
+ .enumerate()
+ .map(|(i, k)| {
+ let filepaths: (Option<Vec<OsString>>, bool) =
+ { Self::get_filepath(k, bibliography, &mut pdf_files) };
+
+ BibiData {
+ id: i as u32,
+ authors: Self::get_authors(k, bibliography),
+ short_author: String::new(),
+ title: Self::get_title(k, bibliography),
+ year: Self::get_year(k, bibliography),
+ custom_field: (
+ cfg.general.custom_column.clone(),
+ Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
+ ),
+ keywords: Self::get_keywords(k, bibliography),
+ citekey: k.to_owned(),
+ abstract_text: Self::get_abstract(k, bibliography),
+ doi_url: Self::get_weblink(k, bibliography),
+ filepath: filepaths.0,
+ file_field: filepaths.1,
+ subtitle: Self::get_subtitle(k, bibliography),
+ notes: if note_files.is_some() {
+ Self::get_notepath(k, &mut note_files, &ext)
+ } else {
+ None
+ },
+ symbols: [None, None, None],
+ sanitized_bibi_data: None,
+ }
+ .gen_sanitized()
+ })
+ .collect()
}
// get list of citekeys from the given bibfile
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
index 823b91c..9ccf4c4 100644
--- a/src/bibiman/sanitize.rs
+++ b/src/bibiman/sanitize.rs
@@ -15,80 +15,23 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>.
/////
-use fancy_regex::Regex;
-use unicodeit::replace as unicode_replace;
-
use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData};
-/// Sanitizing process rules as regex cmds.
-///
-/// Only macros that are not already covered by unicodeit should be processed in this way.
-///
-// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})`
-// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
-//
-const SANITIZE_REGEX: &[(&str, &str)] = &[
- (
- r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
- "\"${1}\"",
- ),
- (r"\\hyphen", "-"),
-];
-
-/// Function to build the sanitization regex vector:
-fn regex_vector() -> Vec<(Regex, &'static str)> {
- let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
- // build regex
- for (search, replace) in SANITIZE_REGEX {
- regex.push((Regex::new(search).unwrap(), replace));
- }
- regex
-}
-
-fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> {
- let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
-
- // process strings
- let result_len = result.len();
- for (re, replace) in regex {
- for i in 0..result_len {
- result[i] = re.replace_all(&result[i], *replace).to_string();
- }
- }
- for i in 0..result_len {
- result[i] = unicode_replace(&result[i]);
- }
-
- // return result
- result
-}
+mod optimized_sanitize;
+use optimized_sanitize::optimized_sanitize;
/// Helper macro to sanitize bibidata structs.
/// Here lives the code that generates SanitizedBibiData
/// structs from BibiData structs.
macro_rules! optimized_sanitize_bibidata {
- ($bibidata:expr, $regex:expr) => {
- match &$bibidata.subtitle {
- None => {
- let sanitized_data =
- optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex);
- SanitizedBibiData {
- title: sanitized_data[0].clone(),
- subtitle: None,
- abstract_text: sanitized_data[1].clone(),
- }
- }
- Some(subtitle) => {
- let sanitized_data = optimized_sanitize(
- vec![&$bibidata.title, subtitle, &$bibidata.abstract_text],
- &$regex,
- );
- SanitizedBibiData {
- title: sanitized_data[0].clone(),
- subtitle: Some(sanitized_data[1].clone()),
- abstract_text: sanitized_data[2].clone(),
- }
- }
+ ($bibidata:expr) => {
+ SanitizedBibiData {
+ title: optimized_sanitize(&$bibidata.title),
+ subtitle: match &$bibidata.subtitle {
+ None => None,
+ Some(subtitle) => Some(optimized_sanitize(subtitle)),
+ },
+ abstract_text: optimized_sanitize(&$bibidata.abstract_text),
}
};
}
@@ -96,20 +39,5 @@ macro_rules! optimized_sanitize_bibidata {
/// Sanitize one BibiData and return a SanitizedBibiData struct.
/// This function does ignore any existing sanitization of the bibidata struct.
pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
- let regex = regex_vector();
- optimized_sanitize_bibidata!(bibidata, regex)
-}
-
-/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one.
-pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> {
- let regex: Vec<(Regex, &str)> = regex_vector();
-
- let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len());
- for entry in bibidata {
- result.push(BibiData {
- sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)),
- ..entry
- });
- }
- result
+ optimized_sanitize_bibidata!(bibidata)
}
diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs
new file mode 100644
index 0000000..b3bf90d
--- /dev/null
+++ b/src/bibiman/sanitize/optimized_sanitize.rs
@@ -0,0 +1,86 @@
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025 lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use phf::phf_map;
+use std::collections::HashMap;
+
+use logos::Logos;
+
+static LOOKUP: phf::Map<&'static str, &'static str> = phf_map! {
+ " " => " ", // str a forced space should substitute to.
+};
+
+#[derive(Logos, Debug)]
+enum Token {
+ #[token("{")]
+ OpenCurlyBracket,
+ #[token("}")]
+ ClosedCurlyBracket,
+ #[regex(r"\\\w+")]
+ LaTeXMacro,
+ #[token(r"\ ")]
+ ForcedSpace,
+}
+
+pub fn optimized_sanitize(input_text: &str) -> String {
+ let mut out: Vec<&str> = Vec::new();
+ let mut bracket_counter: u32 = 0;
+ let mut counter_actions: HashMap<u32, String> = HashMap::new();
+ let mut lex = Token::lexer(input_text);
+ while let Some(sometoken) = lex.next() {
+ match sometoken {
+ Ok(token) => match token {
+ Token::ForcedSpace => {
+ out.push(
+ LOOKUP
+ .get(" ")
+ .expect("Something is wrong with the sanitization lookup table."),
+ );
+ }
+ Token::OpenCurlyBracket => {
+ bracket_counter.saturating_add(1);
+ todo!();
+ }
+ Token::ClosedCurlyBracket => {
+ bracket_counter.saturating_sub(1);
+ todo!();
+ }
+ Token::LaTeXMacro => {
+ todo!()
+ }
+ },
+ Err(_) => {
+ out.push(lex.slice());
+ }
+ }
+ }
+ out.into_iter().collect::<String>()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::optimized_sanitize;
+
+ #[test]
+ fn check_sanitization() {
+ let result = optimized_sanitize(
+ r"\mkbibquote{Intention} und \mkbibquote{Intentionen sind \hyphen\ bibquote.}",
+ );
+ println!("{}", result);
+ panic!("Tatütata!");
+ }
+}