diff options
| author | Klimperfix | 2025-10-03 22:57:37 +0200 |
|---|---|---|
| committer | Klimperfix | 2025-10-03 23:58:14 +0200 |
| commit | 624977bb9fd209b0c7c5f60a1332718de1d460d4 (patch) | |
| tree | 9a9fe5fcf9a087c822de48800c6d856ad4f14a71 | |
| parent | 161fc7010cb863e1af534ce1d173136401816a32 (diff) | |
| download | bibiman-624977bb9fd209b0c7c5f60a1332718de1d460d4.tar.gz bibiman-624977bb9fd209b0c7c5f60a1332718de1d460d4.zip | |
macro-sani: started impl new algorithm
| -rw-r--r-- | Cargo.lock | 146 | ||||
| -rw-r--r-- | Cargo.toml | 4 | ||||
| -rw-r--r-- | src/bibiman/bibisetup.rs | 76 | ||||
| -rw-r--r-- | src/bibiman/sanitize.rs | 94 | ||||
| -rw-r--r-- | src/bibiman/sanitize/optimized_sanitize.rs | 86 |
5 files changed, 241 insertions, 165 deletions
@@ -90,6 +90,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + +[[package]] name = "bibiman" version = "0.14.1" dependencies = [ @@ -99,13 +105,14 @@ dependencies = [ "crossterm", "dirs", "editor-command", - "fancy-regex", "figment", "futures", "itertools", "lexopt", + "logos", "nucleo-matcher", "owo-colors", + "phf", "rand", "ratatui", "regex", @@ -114,7 +121,6 @@ dependencies = [ "tokio", "tokio-util", "tui-input", - "unicodeit", "ureq", "walkdir", ] @@ -133,21 +139,6 @@ dependencies = [ ] [[package]] -name = "bit-set" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" -dependencies = [ - "bit-vec", -] - -[[package]] -name = "bit-vec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" - -[[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -427,17 +418,6 @@ dependencies = [ ] [[package]] -name = "fancy-regex" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" -dependencies = [ - "bit-set", - "regex-automata", - "regex-syntax", -] - -[[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -894,6 +874,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] +name = "logos" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff472f899b4ec2d99161c51f60ff7075eeb3097069a36050d8037a6325eb8154" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "192a3a2b90b0c05b27a0b2c43eecdb7c415e29243acc3f89cc8247a5b693045c" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax", + "rustc_version", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "605d9697bcd5ef3a42d38efc51541aa3d6a4a25f7ab6d1ed0da5ac632a26b470" +dependencies = [ + "logos-codegen", +] + +[[package]] name = "lru" version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1127,6 +1141,49 @@ dependencies = [ ] [[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros", + "phf_shared", + "serde", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + +[[package]] name = "pin-project-lite" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1330,6 +1387,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" [[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] name = "rustix" version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1418,6 +1484,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] name = "serde" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1504,6 +1576,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" [[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + +[[package]] name = "slab" version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1892,18 +1970,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] -name = "unicodeit" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1069c222ea63347e2e59763aa12d32c9c6a4e595931c7724a769f6a75bfbc553" -dependencies = [ - "aho-corasick", - "cfg-if", - "memchr", - "regex", -] - -[[package]] name = "unscanny" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -39,5 +39,5 @@ ureq = "2.12.1" serde = { version = "1.0.217", features = ["serde_derive"] } figment = { version = "0.10.19", features = [ "toml", "test" ]} owo-colors = "4.2.2" -unicodeit = { version = "0.2.0", features = ["naive-impl"] } -fancy-regex = "0.16.2" +logos = "0.15.1" +phf = { version = "0.13.1", features = ["macros"] } diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 48046e9..37b0b01 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -26,7 +26,7 @@ use std::{fs, path::PathBuf}; use walkdir::WalkDir; use crate::app; -use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata}; +use crate::bibiman::sanitize::sanitize_one_bibidata; use crate::cliargs::{self}; use crate::config::BibiConfig; @@ -159,9 +159,6 @@ impl BibiData { /// Generates the SanitizedBibiData for the BibiData. /// /// Consumes self and returns a new BibiData struct. - /// - /// If multiple SanitizedBibiData are to be generated, - /// one should use the [`mass_sanitize`] function instead. pub fn gen_sanitized(mut self) -> Self { self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self)); self @@ -325,42 +322,41 @@ impl BibiSetup { // this may cause longer startup-load-times. // // - mass_sanitize( - citekeys - .iter() - .enumerate() - .map(|(i, k)| { - let filepaths: (Option<Vec<OsString>>, bool) = - { Self::get_filepath(k, bibliography, &mut pdf_files) }; - - BibiData { - id: i as u32, - authors: Self::get_authors(k, bibliography), - short_author: String::new(), - title: Self::get_title(k, bibliography), - year: Self::get_year(k, bibliography), - custom_field: ( - cfg.general.custom_column.clone(), - Self::get_custom_field(k, bibliography, &cfg.general.custom_column), - ), - keywords: Self::get_keywords(k, bibliography), - citekey: k.to_owned(), - abstract_text: Self::get_abstract(k, bibliography), - doi_url: Self::get_weblink(k, bibliography), - filepath: filepaths.0, - file_field: filepaths.1, - subtitle: Self::get_subtitle(k, bibliography), - notes: if note_files.is_some() { - Self::get_notepath(k, &mut note_files, &ext) - } else { - None - }, - symbols: [None, None, None], - sanitized_bibi_data: None, - } - }) - .collect(), - ) + citekeys + .iter() + .enumerate() + .map(|(i, k)| { + let filepaths: (Option<Vec<OsString>>, bool) = + { Self::get_filepath(k, bibliography, &mut pdf_files) }; + + BibiData { + id: i as u32, + authors: Self::get_authors(k, bibliography), + short_author: String::new(), + title: Self::get_title(k, bibliography), + year: Self::get_year(k, bibliography), + custom_field: ( + cfg.general.custom_column.clone(), + Self::get_custom_field(k, bibliography, &cfg.general.custom_column), + ), + keywords: Self::get_keywords(k, bibliography), + citekey: k.to_owned(), + abstract_text: Self::get_abstract(k, bibliography), + doi_url: Self::get_weblink(k, bibliography), + filepath: filepaths.0, + file_field: filepaths.1, + subtitle: Self::get_subtitle(k, bibliography), + notes: if note_files.is_some() { + Self::get_notepath(k, &mut note_files, &ext) + } else { + None + }, + symbols: [None, None, None], + sanitized_bibi_data: None, + } + .gen_sanitized() + }) + .collect() } // get list of citekeys from the given bibfile diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs index 823b91c..9ccf4c4 100644 --- a/src/bibiman/sanitize.rs +++ b/src/bibiman/sanitize.rs @@ -15,80 +15,23 @@ // along with this program. If not, see <https://www.gnu.org/licenses/>. ///// -use fancy_regex::Regex; -use unicodeit::replace as unicode_replace; - use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData}; -/// Sanitizing process rules as regex cmds. -/// -/// Only macros that are not already covered by unicodeit should be processed in this way. -/// -// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})` -// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}` -// -const SANITIZE_REGEX: &[(&str, &str)] = &[ - ( - r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}", - "\"${1}\"", - ), - (r"\\hyphen", "-"), -]; - -/// Function to build the sanitization regex vector: -fn regex_vector() -> Vec<(Regex, &'static str)> { - let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); - // build regex - for (search, replace) in SANITIZE_REGEX { - regex.push((Regex::new(search).unwrap(), replace)); - } - regex -} - -fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> { - let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect(); - - // process strings - let result_len = result.len(); - for (re, replace) in regex { - for i in 0..result_len { - result[i] = re.replace_all(&result[i], *replace).to_string(); - } - } - for i in 0..result_len { - result[i] = unicode_replace(&result[i]); - } - - // return result - result -} +mod optimized_sanitize; +use optimized_sanitize::optimized_sanitize; /// Helper macro to sanitize bibidata structs. /// Here lives the code that generates SanitizedBibiData /// structs from BibiData structs. macro_rules! optimized_sanitize_bibidata { - ($bibidata:expr, $regex:expr) => { - match &$bibidata.subtitle { - None => { - let sanitized_data = - optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex); - SanitizedBibiData { - title: sanitized_data[0].clone(), - subtitle: None, - abstract_text: sanitized_data[1].clone(), - } - } - Some(subtitle) => { - let sanitized_data = optimized_sanitize( - vec![&$bibidata.title, subtitle, &$bibidata.abstract_text], - &$regex, - ); - SanitizedBibiData { - title: sanitized_data[0].clone(), - subtitle: Some(sanitized_data[1].clone()), - abstract_text: sanitized_data[2].clone(), - } - } + ($bibidata:expr) => { + SanitizedBibiData { + title: optimized_sanitize(&$bibidata.title), + subtitle: match &$bibidata.subtitle { + None => None, + Some(subtitle) => Some(optimized_sanitize(subtitle)), + }, + abstract_text: optimized_sanitize(&$bibidata.abstract_text), } }; } @@ -96,20 +39,5 @@ macro_rules! optimized_sanitize_bibidata { /// Sanitize one BibiData and return a SanitizedBibiData struct. /// This function does ignore any existing sanitization of the bibidata struct. pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData { - let regex = regex_vector(); - optimized_sanitize_bibidata!(bibidata, regex) -} - -/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one. -pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> { - let regex: Vec<(Regex, &str)> = regex_vector(); - - let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len()); - for entry in bibidata { - result.push(BibiData { - sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)), - ..entry - }); - } - result + optimized_sanitize_bibidata!(bibidata) } diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs new file mode 100644 index 0000000..b3bf90d --- /dev/null +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -0,0 +1,86 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +///// + +use phf::phf_map; +use std::collections::HashMap; + +use logos::Logos; + +static LOOKUP: phf::Map<&'static str, &'static str> = phf_map! { + " " => " ", // str a forced space should substitute to. +}; + +#[derive(Logos, Debug)] +enum Token { + #[token("{")] + OpenCurlyBracket, + #[token("}")] + ClosedCurlyBracket, + #[regex(r"\\\w+")] + LaTeXMacro, + #[token(r"\ ")] + ForcedSpace, +} + +pub fn optimized_sanitize(input_text: &str) -> String { + let mut out: Vec<&str> = Vec::new(); + let mut bracket_counter: u32 = 0; + let mut counter_actions: HashMap<u32, String> = HashMap::new(); + let mut lex = Token::lexer(input_text); + while let Some(sometoken) = lex.next() { + match sometoken { + Ok(token) => match token { + Token::ForcedSpace => { + out.push( + LOOKUP + .get(" ") + .expect("Something is wrong with the sanitization lookup table."), + ); + } + Token::OpenCurlyBracket => { + bracket_counter.saturating_add(1); + todo!(); + } + Token::ClosedCurlyBracket => { + bracket_counter.saturating_sub(1); + todo!(); + } + Token::LaTeXMacro => { + todo!() + } + }, + Err(_) => { + out.push(lex.slice()); + } + } + } + out.into_iter().collect::<String>() +} + +#[cfg(test)] +mod tests { + use super::optimized_sanitize; + + #[test] + fn check_sanitization() { + let result = optimized_sanitize( + r"\mkbibquote{Intention} und \mkbibquote{Intentionen sind \hyphen\ bibquote.}", + ); + println!("{}", result); + panic!("Tatütata!"); + } +} |
