From 7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Tue, 30 Sep 2025 21:49:31 +0200 Subject: Implemented basic sanitizing. The default Regex crate does not support the required regex features, so I'm using the fancy_regex crate that does. --- Cargo.lock | 40 +++++++++++++++++++++++++++ Cargo.toml | 2 ++ src/bibiman.rs | 3 ++ src/bibiman/sanitize.rs | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ tests/biblatex-test.bib | 13 ++++++--- 5 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 src/bibiman/sanitize.rs diff --git a/Cargo.lock b/Cargo.lock index 3661e99..535b929 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,6 +99,7 @@ dependencies = [ "crossterm", "dirs", "editor-command", + "fancy-regex", "figment", "futures", "itertools", @@ -113,6 +114,7 @@ dependencies = [ "tokio", "tokio-util", "tui-input", + "unicodeit", "ureq", "walkdir", ] @@ -130,6 +132,21 @@ dependencies = [ "unscanny", ] +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -409,6 +426,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "fancy-regex" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -1863,6 +1891,18 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +[[package]] +name = "unicodeit" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1069c222ea63347e2e59763aa12d32c9c6a4e595931c7724a769f6a75bfbc553" +dependencies = [ + "aho-corasick", + "cfg-if", + "memchr", + "regex", +] + [[package]] name = "unscanny" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index b3fc774..2d596de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,3 +39,5 @@ ureq = "2.12.1" serde = { version = "1.0.217", features = ["serde_derive"] } figment = { version = "0.10.19", features = [ "toml", "test" ]} owo-colors = "4.2.2" +unicodeit = { version = "0.2.0", features = ["naive-impl"] } +fancy-regex = "0.16.2" diff --git a/src/bibiman.rs b/src/bibiman.rs index 6d21f8c..c423ce1 100644 --- a/src/bibiman.rs +++ b/src/bibiman.rs @@ -44,6 +44,9 @@ pub mod entries; pub mod keywords; pub mod search; +/// Module with function to sanitize text with LaTeX Macros into readable unicode text. +pub mod sanitize; + // Areas in which actions are possible #[derive(Debug)] pub enum CurrentArea { diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs new file mode 100644 index 0000000..aaf81ad --- /dev/null +++ b/src/bibiman/sanitize.rs @@ -0,0 +1,73 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. 
If not, see . +///// + +use fancy_regex::Regex; +use unicodeit::replace as unicode_replace; + +/// Sanitizing process rules as regex cmds. +/// +/// Only macros that are not already covered by unicodeit should be processed in this way. +/// +// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})` +// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}` +// +const SANITIZE_REGEX: &[(&str, &str)] = &[ + ( + r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}", + "\"${1}\"", + ), + (r"\\hyphen", "-"), +]; + +/// Sanitize one String with LaTeX Macros into a more readable one without. +/// +/// If one is going to mass-sanitize strings, one should use the [`sanitize`] +/// function for performance reasons instead, to process multiple strings at once. +/// +/// This is just a shortcut for the sanitize function. +pub fn sanitize_one(input_text: &str) -> String { + // This does not panic, the sanitize function always returns + // as many elements in the returned list as it get's elements + // in the input vector. + sanitize(vec![input_text]).get(0).unwrap().to_string() +} + +/// Sanitize multiple Strings with LaTeX Macros into more readable unicode equivalents. +/// +/// This function does always return the same amount of Strings as it gets in the input list. 
+pub fn sanitize(input_text: Vec<&str>) -> Vec { + let mut result: Vec = input_text.into_iter().map(|s| s.to_string()).collect(); + let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); + // build regex + for (search, replace) in SANITIZE_REGEX { + regex.push((Regex::new(search).unwrap(), replace)); + } + + // process strings + let result_len = result.len(); + for (re, replace) in regex { + for i in 0..result_len { + result[i] = re.replace_all(&result[i], replace).to_string(); + } + } + for i in 0..result_len { + result[i] = unicode_replace(&result[i]); + } + + // return result + result +} diff --git a/tests/biblatex-test.bib b/tests/biblatex-test.bib index cfcdc80..fcc5085 100644 --- a/tests/biblatex-test.bib +++ b/tests/biblatex-test.bib @@ -28,9 +28,9 @@ @article{aksin, title = {Effect of immobilization on catalytic characteristics of saturated {Pd-N}-heterocyclic carbenes in {Mizoroki-Heck} reactions}, - author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok, Levent and { - \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\"u}y{\"u}kg{\"u}ng{\" - o}r, Orhan and {\"O}zkal, Erhan}, + author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok , Levent and + { \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\" u}y{ \"u}kg{\"u} + ng{ \" o}r, Orhan and {\"O}zkal, Erhan}, volume = {691}, number = {13}, pages = {3027--3036}, @@ -82,7 +82,12 @@ langidopts = {variant=american}, file = {~/Documents/coding/projects/bibiman/tests/aristotle_physics.pdf}, annotation = {A \texttt{book} entry with a \texttt{translator} field}, - abstract = {The Physics is a work by Aristotle dedicated to the study of nature. Regarded by Heidegger as "the fundamental work of Western philosophy", it presents the renowned distinction between the four types of cause, as well as reflections on chance, motion, infinity, and other fundamental concepts. 
It is here that Aristotle sets out his celebrated paradox of time.}, + abstract = {The Physics is a work by Aristotle dedicated to the study of + nature. Regarded by Heidegger as "the fundamental work of Western + philosophy", it presents the renowned distinction between the + four types of cause, as well as reflections on chance, motion, + infinity, and other fundamental concepts. It is here that + Aristotle sets out his celebrated paradox of time.}, } @book{aristotle_poetics, -- cgit v1.2.3 From dfb7edde13ca39af3e23b80e40272e02aa093919 Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Fri, 3 Oct 2025 12:37:16 +0200 Subject: Sanitization hooked into bibiman. --- src/bibiman/bibisetup.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 3bcb717..8466169 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -26,6 +26,7 @@ use std::{fs, path::PathBuf}; use walkdir::WalkDir; use crate::app; +use crate::bibiman::sanitize::sanitize_one; use crate::cliargs::{self}; use crate::config::BibiConfig; @@ -294,11 +295,28 @@ impl BibiSetup { let filepaths: (Option>, bool) = { Self::get_filepath(k, bibliography, &mut pdf_files) }; + // bibiman will sanitize some fields at this point, + // this may cause longer startup-load-times. + // + // It may be better to sanitize them somewhere else, so bibiman + // does not loose the original text-information including the + // LaTeX macros present in the bibfile. From here on, they will be + // gone. + // + // The following fields are going to be sanitized: + // + // - title + // - subtitle + // - abstract_text + // + // TODO: Once the final decision to implement the sanitization at + // this point, one could write a constructor for the BibiData struct + // which handles the sanitization. 
BibiData { id: i as u32, authors: Self::get_authors(k, bibliography), short_author: String::new(), - title: Self::get_title(k, bibliography), + title: sanitize_one(&Self::get_title(k, bibliography)), year: Self::get_year(k, bibliography), custom_field: ( cfg.general.custom_column.clone(), @@ -306,11 +324,14 @@ impl BibiSetup { ), keywords: Self::get_keywords(k, bibliography), citekey: k.to_owned(), - abstract_text: Self::get_abstract(k, bibliography), + abstract_text: sanitize_one(&Self::get_abstract(k, bibliography)), doi_url: Self::get_weblink(k, bibliography), filepath: filepaths.0, file_field: filepaths.1, - subtitle: Self::get_subtitle(k, bibliography), + subtitle: match Self::get_subtitle(k, bibliography) { + None => None, + Some(x) => Some(sanitize_one(&x)), + }, notes: if note_files.is_some() { Self::get_notepath(k, &mut note_files, &ext) } else { -- cgit v1.2.3 From 26befd38aedbfdd278c3096644baf69e4a1fb051 Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Fri, 3 Oct 2025 16:56:30 +0200 Subject: Now storing the sanitized data seperately, keeping the original. --- src/bibiman/bibisetup.rs | 154 ++++++++++++++++++++++++++--------------------- src/bibiman/entries.rs | 8 ++- src/bibiman/sanitize.rs | 96 +++++++++++++++++++++++------ src/bibiman/search.rs | 4 +- 4 files changed, 175 insertions(+), 87 deletions(-) diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 8466169..48046e9 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -26,7 +26,7 @@ use std::{fs, path::PathBuf}; use walkdir::WalkDir; use crate::app; -use crate::bibiman::sanitize::sanitize_one; +use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata}; use crate::cliargs::{self}; use crate::config::BibiConfig; @@ -77,6 +77,18 @@ pub struct BibiData { pub subtitle: Option, pub notes: Option>, pub symbols: [Option; 3], + /// This field should be set to None when initially creating a BibiData instance. 
+ /// It then can be generated from the constructed BibiData Object using + /// `BibiData::gen_sanitized()` + pub sanitized_bibi_data: Option, +} + +/// Struct that holds sanitized bibidata data. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct SanitizedBibiData { + pub title: String, + pub subtitle: Option, + pub abstract_text: String, } #[derive(Debug, Clone, PartialEq)] @@ -120,22 +132,41 @@ impl BibiData { // self.pubtype(), // &self.symbols, // ] - - BibiRow { - authors: { - if self.short_author.is_empty() { - self.authors() - } else { - &self.short_author - } - }, - title: self.title(), - year: self.year(), - custom_field_value: self.custom_field_value(), - symbols: &self.symbols, + let author_ref = if self.short_author.is_empty() { + self.authors() + } else { + &self.short_author + }; + if let Some(sanidata) = &self.sanitized_bibi_data { + BibiRow { + authors: author_ref, + title: &sanidata.title, + year: self.year(), + custom_field_value: self.custom_field_value(), + symbols: &self.symbols, + } + } else { + BibiRow { + authors: author_ref, + title: self.title(), + year: self.year(), + custom_field_value: self.custom_field_value(), + symbols: &self.symbols, + } } } + /// Generates the SanitizedBibiData for the BibiData. + /// + /// Consumes self and returns a new BibiData struct. + /// + /// If multiple SanitizedBibiData are to be generated, + /// one should use the [`mass_sanitize`] function instead. + pub fn gen_sanitized(mut self) -> Self { + self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self)); + self + } + pub fn entry_id(&self) -> &u32 { &self.id } @@ -288,59 +319,48 @@ impl BibiSetup { } else { None }; - citekeys - .iter() - .enumerate() - .map(|(i, k)| { - let filepaths: (Option>, bool) = - { Self::get_filepath(k, bibliography, &mut pdf_files) }; - - // bibiman will sanitize some fields at this point, - // this may cause longer startup-load-times. 
- // - // It may be better to sanitize them somewhere else, so bibiman - // does not loose the original text-information including the - // LaTeX macros present in the bibfile. From here on, they will be - // gone. - // - // The following fields are going to be sanitized: - // - // - title - // - subtitle - // - abstract_text - // - // TODO: Once the final decision to implement the sanitization at - // this point, one could write a constructor for the BibiData struct - // which handles the sanitization. - BibiData { - id: i as u32, - authors: Self::get_authors(k, bibliography), - short_author: String::new(), - title: sanitize_one(&Self::get_title(k, bibliography)), - year: Self::get_year(k, bibliography), - custom_field: ( - cfg.general.custom_column.clone(), - Self::get_custom_field(k, bibliography, &cfg.general.custom_column), - ), - keywords: Self::get_keywords(k, bibliography), - citekey: k.to_owned(), - abstract_text: sanitize_one(&Self::get_abstract(k, bibliography)), - doi_url: Self::get_weblink(k, bibliography), - filepath: filepaths.0, - file_field: filepaths.1, - subtitle: match Self::get_subtitle(k, bibliography) { - None => None, - Some(x) => Some(sanitize_one(&x)), - }, - notes: if note_files.is_some() { - Self::get_notepath(k, &mut note_files, &ext) - } else { - None - }, - symbols: [None, None, None], - } - }) - .collect() + // + // + // bibiman will sanitize some fields at this point, + // this may cause longer startup-load-times. 
+ // + // + mass_sanitize( + citekeys + .iter() + .enumerate() + .map(|(i, k)| { + let filepaths: (Option>, bool) = + { Self::get_filepath(k, bibliography, &mut pdf_files) }; + + BibiData { + id: i as u32, + authors: Self::get_authors(k, bibliography), + short_author: String::new(), + title: Self::get_title(k, bibliography), + year: Self::get_year(k, bibliography), + custom_field: ( + cfg.general.custom_column.clone(), + Self::get_custom_field(k, bibliography, &cfg.general.custom_column), + ), + keywords: Self::get_keywords(k, bibliography), + citekey: k.to_owned(), + abstract_text: Self::get_abstract(k, bibliography), + doi_url: Self::get_weblink(k, bibliography), + filepath: filepaths.0, + file_field: filepaths.1, + subtitle: Self::get_subtitle(k, bibliography), + notes: if note_files.is_some() { + Self::get_notepath(k, &mut note_files, &ext) + } else { + None + }, + symbols: [None, None, None], + sanitized_bibi_data: None, + } + }) + .collect(), + ) } // get list of citekeys from the given bibfile diff --git a/src/bibiman/entries.rs b/src/bibiman/entries.rs index db6d6bf..0b35a8b 100644 --- a/src/bibiman/entries.rs +++ b/src/bibiman/entries.rs @@ -174,7 +174,9 @@ mod tests { subtitle: None, notes: None, symbols: [None, None, None], - }; + sanitized_bibi_data: None, + } + .gen_sanitized(); let entry_vec = BibiData::ref_vec(&mut entry, &cfg); @@ -194,7 +196,9 @@ mod tests { subtitle: None, notes: None, symbols: [None, None, None], - }; + sanitized_bibi_data: None, + } + .gen_sanitized(); let entry_vec_editors = BibiData::ref_vec(&mut entry_editors, &cfg); diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs index aaf81ad..614ed11 100644 --- a/src/bibiman/sanitize.rs +++ b/src/bibiman/sanitize.rs @@ -18,6 +18,8 @@ use fancy_regex::Regex; use unicodeit::replace as unicode_replace; +use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData}; + /// Sanitizing process rules as regex cmds. 
/// /// Only macros that are not already covered by unicodeit should be processed in this way. @@ -33,6 +35,71 @@ const SANITIZE_REGEX: &[(&str, &str)] = &[ (r"\\hyphen", "-"), ]; +/// Function to build the sanitization regex vector: +fn regex_vector() -> Vec<(Regex, &'static str)> { + let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); + // build regex + for (search, replace) in SANITIZE_REGEX { + regex.push((Regex::new(search).unwrap(), replace)); + } + regex +} + +fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec { + let mut result: Vec = input_text.into_iter().map(|s| s.to_string()).collect(); + + // process strings + let result_len = result.len(); + for (re, replace) in regex { + for i in 0..result_len { + result[i] = re.replace_all(&result[i], *replace).to_string(); + } + } + for i in 0..result_len { + result[i] = unicode_replace(&result[i]); + } + + // return result + result +} + +/// Helper macro to sanitize bibidata structs. +/// Here lives the code that generates SanitizedBibiData +/// structs from BibiData structs. +macro_rules! optimized_sanitize_bibidata { + ($bibidata:expr, $regex:expr) => { + match &$bibidata.subtitle { + None => { + let sanitized_data = + optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex); + SanitizedBibiData { + title: sanitized_data[0].clone(), + subtitle: None, + abstract_text: sanitized_data[1].clone(), + } + } + Some(subtitle) => { + let sanitized_data = optimized_sanitize( + vec![&$bibidata.title, subtitle, &$bibidata.abstract_text], + &$regex, + ); + SanitizedBibiData { + title: sanitized_data[0].clone(), + subtitle: Some(sanitized_data[1].clone()), + abstract_text: sanitized_data[2].clone(), + } + } + } + }; +} + +/// Sanitize one BibiData and return a SanitizedBibiData struct. +/// This function does ignore any existing sanitization of the bibidata struct. 
+pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData { + let regex = regex_vector(); + optimized_sanitize_bibidata!(bibidata, regex) +} + /// Sanitize one String with LaTeX Macros into a more readable one without. /// /// If one is going to mass-sanitize strings, one should use the [`sanitize`] @@ -50,24 +117,19 @@ pub fn sanitize_one(input_text: &str) -> String { /// /// This function does always return the same amount of Strings as it gets in the input list. pub fn sanitize(input_text: Vec<&str>) -> Vec { - let mut result: Vec = input_text.into_iter().map(|s| s.to_string()).collect(); - let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); - // build regex - for (search, replace) in SANITIZE_REGEX { - regex.push((Regex::new(search).unwrap(), replace)); - } + optimized_sanitize(input_text, ®ex_vector()) +} - // process strings - let result_len = result.len(); - for (re, replace) in regex { - for i in 0..result_len { - result[i] = re.replace_all(&result[i], replace).to_string(); - } - } - for i in 0..result_len { - result[i] = unicode_replace(&result[i]); - } +/// Sanitize a whole `Vec`, returning a new sanitized one. 
+pub fn mass_sanitize(bibidata: Vec) -> Vec { + let regex: Vec<(Regex, &str)> = regex_vector(); - // return result + let mut result: Vec = Vec::with_capacity(bibidata.len()); + for entry in bibidata { + result.push(BibiData { + sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)), + ..entry + }); + } result } diff --git a/src/bibiman/search.rs b/src/bibiman/search.rs index e0c5f17..2156634 100644 --- a/src/bibiman/search.rs +++ b/src/bibiman/search.rs @@ -141,7 +141,9 @@ mod tests { subtitle: None, notes: None, symbols: [None, None, None], - }; + sanitized_bibi_data: None, + } + .gen_sanitized(); let joined_vec = BibiSearch::convert_to_string(&bibvec); -- cgit v1.2.3 From 161fc7010cb863e1af534ce1d173136401816a32 Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Fri, 3 Oct 2025 18:19:25 +0200 Subject: Removed unused sanitization functions. --- src/bibiman/sanitize.rs | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs index 614ed11..823b91c 100644 --- a/src/bibiman/sanitize.rs +++ b/src/bibiman/sanitize.rs @@ -100,26 +100,6 @@ pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData { optimized_sanitize_bibidata!(bibidata, regex) } -/// Sanitize one String with LaTeX Macros into a more readable one without. -/// -/// If one is going to mass-sanitize strings, one should use the [`sanitize`] -/// function for performance reasons instead, to process multiple strings at once. -/// -/// This is just a shortcut for the sanitize function. -pub fn sanitize_one(input_text: &str) -> String { - // This does not panic, the sanitize function always returns - // as many elements in the returned list as it get's elements - // in the input vector. - sanitize(vec![input_text]).get(0).unwrap().to_string() -} - -/// Sanitize multiple Strings with LaTeX Macros into more readable unicode equivalents. 
-/// -/// This function does always return the same amount of Strings as it gets in the input list. -pub fn sanitize(input_text: Vec<&str>) -> Vec { - optimized_sanitize(input_text, ®ex_vector()) -} - /// Sanitize a whole `Vec`, returning a new sanitized one. pub fn mass_sanitize(bibidata: Vec) -> Vec { let regex: Vec<(Regex, &str)> = regex_vector(); -- cgit v1.2.3 From 624977bb9fd209b0c7c5f60a1332718de1d460d4 Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Fri, 3 Oct 2025 22:57:37 +0200 Subject: macro-sani: started impl new algorithm --- Cargo.lock | 146 +++++++++++++++++++++-------- Cargo.toml | 4 +- src/bibiman/bibisetup.rs | 76 +++++++-------- src/bibiman/sanitize.rs | 94 +++---------------- src/bibiman/sanitize/optimized_sanitize.rs | 86 +++++++++++++++++ 5 files changed, 241 insertions(+), 165 deletions(-) create mode 100644 src/bibiman/sanitize/optimized_sanitize.rs diff --git a/Cargo.lock b/Cargo.lock index 535b929..22a5a48 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,6 +89,12 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "bibiman" version = "0.14.1" @@ -99,13 +105,14 @@ dependencies = [ "crossterm", "dirs", "editor-command", - "fancy-regex", "figment", "futures", "itertools", "lexopt", + "logos", "nucleo-matcher", "owo-colors", + "phf", "rand", "ratatui", "regex", @@ -114,7 +121,6 @@ dependencies = [ "tokio", "tokio-util", "tui-input", - "unicodeit", "ureq", "walkdir", ] @@ -132,21 +138,6 @@ dependencies = [ "unscanny", ] -[[package]] -name = "bit-set" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" -dependencies 
= [ - "bit-vec", -] - -[[package]] -name = "bit-vec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" - [[package]] name = "bitflags" version = "1.3.2" @@ -426,17 +417,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "fancy-regex" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" -dependencies = [ - "bit-set", - "regex-automata", - "regex-syntax", -] - [[package]] name = "fastrand" version = "2.3.0" @@ -893,6 +873,40 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "logos" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff472f899b4ec2d99161c51f60ff7075eeb3097069a36050d8037a6325eb8154" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "192a3a2b90b0c05b27a0b2c43eecdb7c415e29243acc3f89cc8247a5b693045c" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax", + "rustc_version", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "605d9697bcd5ef3a42d38efc51541aa3d6a4a25f7ab6d1ed0da5ac632a26b470" +dependencies = [ + "logos-codegen", +] + [[package]] name = "lru" version = "0.12.5" @@ -1126,6 +1140,49 @@ dependencies = [ "indexmap", ] +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros", + "phf_shared", + "serde", +] + 
+[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -1329,6 +1386,15 @@ version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.38.44" @@ -1417,6 +1483,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + [[package]] name = "serde" version = "1.0.219" @@ -1503,6 +1575,12 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +[[package]] +name = "siphasher" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.10" @@ -1891,18 +1969,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" -[[package]] -name = "unicodeit" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1069c222ea63347e2e59763aa12d32c9c6a4e595931c7724a769f6a75bfbc553" -dependencies = [ - "aho-corasick", - "cfg-if", - "memchr", - "regex", -] - [[package]] name = "unscanny" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 2d596de..a01a7e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,5 +39,5 @@ ureq = "2.12.1" serde = { version = "1.0.217", features = ["serde_derive"] } figment = { version = "0.10.19", features = [ "toml", "test" ]} owo-colors = "4.2.2" -unicodeit = { version = "0.2.0", features = ["naive-impl"] } -fancy-regex = "0.16.2" +logos = "0.15.1" +phf = { version = "0.13.1", features = ["macros"] } diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 48046e9..37b0b01 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -26,7 +26,7 @@ use std::{fs, path::PathBuf}; use walkdir::WalkDir; use crate::app; -use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata}; +use crate::bibiman::sanitize::sanitize_one_bibidata; use crate::cliargs::{self}; use crate::config::BibiConfig; @@ -159,9 +159,6 @@ impl BibiData { /// Generates the SanitizedBibiData for the BibiData. /// /// Consumes self and returns a new BibiData struct. - /// - /// If multiple SanitizedBibiData are to be generated, - /// one should use the [`mass_sanitize`] function instead. 
pub fn gen_sanitized(mut self) -> Self { self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self)); self @@ -325,42 +322,41 @@ impl BibiSetup { // this may cause longer startup-load-times. // // - mass_sanitize( - citekeys - .iter() - .enumerate() - .map(|(i, k)| { - let filepaths: (Option>, bool) = - { Self::get_filepath(k, bibliography, &mut pdf_files) }; - - BibiData { - id: i as u32, - authors: Self::get_authors(k, bibliography), - short_author: String::new(), - title: Self::get_title(k, bibliography), - year: Self::get_year(k, bibliography), - custom_field: ( - cfg.general.custom_column.clone(), - Self::get_custom_field(k, bibliography, &cfg.general.custom_column), - ), - keywords: Self::get_keywords(k, bibliography), - citekey: k.to_owned(), - abstract_text: Self::get_abstract(k, bibliography), - doi_url: Self::get_weblink(k, bibliography), - filepath: filepaths.0, - file_field: filepaths.1, - subtitle: Self::get_subtitle(k, bibliography), - notes: if note_files.is_some() { - Self::get_notepath(k, &mut note_files, &ext) - } else { - None - }, - symbols: [None, None, None], - sanitized_bibi_data: None, - } - }) - .collect(), - ) + citekeys + .iter() + .enumerate() + .map(|(i, k)| { + let filepaths: (Option>, bool) = + { Self::get_filepath(k, bibliography, &mut pdf_files) }; + + BibiData { + id: i as u32, + authors: Self::get_authors(k, bibliography), + short_author: String::new(), + title: Self::get_title(k, bibliography), + year: Self::get_year(k, bibliography), + custom_field: ( + cfg.general.custom_column.clone(), + Self::get_custom_field(k, bibliography, &cfg.general.custom_column), + ), + keywords: Self::get_keywords(k, bibliography), + citekey: k.to_owned(), + abstract_text: Self::get_abstract(k, bibliography), + doi_url: Self::get_weblink(k, bibliography), + filepath: filepaths.0, + file_field: filepaths.1, + subtitle: Self::get_subtitle(k, bibliography), + notes: if note_files.is_some() { + Self::get_notepath(k, &mut note_files, &ext) + } else { + 
None + }, + symbols: [None, None, None], + sanitized_bibi_data: None, + } + .gen_sanitized() + }) + .collect() } // get list of citekeys from the given bibfile diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs index 823b91c..9ccf4c4 100644 --- a/src/bibiman/sanitize.rs +++ b/src/bibiman/sanitize.rs @@ -15,80 +15,23 @@ // along with this program. If not, see . ///// -use fancy_regex::Regex; -use unicodeit::replace as unicode_replace; - use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData}; -/// Sanitizing process rules as regex cmds. -/// -/// Only macros that are not already covered by unicodeit should be processed in this way. -/// -// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})` -// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}` -// -const SANITIZE_REGEX: &[(&str, &str)] = &[ - ( - r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}", - "\"${1}\"", - ), - (r"\\hyphen", "-"), -]; - -/// Function to build the sanitization regex vector: -fn regex_vector() -> Vec<(Regex, &'static str)> { - let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); - // build regex - for (search, replace) in SANITIZE_REGEX { - regex.push((Regex::new(search).unwrap(), replace)); - } - regex -} - -fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec { - let mut result: Vec = input_text.into_iter().map(|s| s.to_string()).collect(); - - // process strings - let result_len = result.len(); - for (re, replace) in regex { - for i in 0..result_len { - result[i] = re.replace_all(&result[i], *replace).to_string(); - } - } - for i in 0..result_len { - result[i] = unicode_replace(&result[i]); - } - - // return result - result -} +mod optimized_sanitize; +use optimized_sanitize::optimized_sanitize; /// Helper macro to sanitize bibidata structs. /// Here lives the code that generates SanitizedBibiData /// structs from BibiData structs. macro_rules! 
optimized_sanitize_bibidata { - ($bibidata:expr, $regex:expr) => { - match &$bibidata.subtitle { - None => { - let sanitized_data = - optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex); - SanitizedBibiData { - title: sanitized_data[0].clone(), - subtitle: None, - abstract_text: sanitized_data[1].clone(), - } - } - Some(subtitle) => { - let sanitized_data = optimized_sanitize( - vec![&$bibidata.title, subtitle, &$bibidata.abstract_text], - &$regex, - ); - SanitizedBibiData { - title: sanitized_data[0].clone(), - subtitle: Some(sanitized_data[1].clone()), - abstract_text: sanitized_data[2].clone(), - } - } + ($bibidata:expr) => { + SanitizedBibiData { + title: optimized_sanitize(&$bibidata.title), + subtitle: match &$bibidata.subtitle { + None => None, + Some(subtitle) => Some(optimized_sanitize(subtitle)), + }, + abstract_text: optimized_sanitize(&$bibidata.abstract_text), } }; } @@ -96,20 +39,5 @@ macro_rules! optimized_sanitize_bibidata { /// Sanitize one BibiData and return a SanitizedBibiData struct. /// This function does ignore any existing sanitization of the bibidata struct. pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData { - let regex = regex_vector(); - optimized_sanitize_bibidata!(bibidata, regex) -} - -/// Sanitize a whole `Vec`, returning a new sanitized one. 
-pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> { - let regex: Vec<(Regex, &str)> = regex_vector(); - - let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len()); - for entry in bibidata { - result.push(BibiData { - sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)), - ..entry - }); - } - result + optimized_sanitize_bibidata!(bibidata) } diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs new file mode 100644 index 0000000..b3bf90d --- /dev/null +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -0,0 +1,86 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +///// + +use phf::phf_map; +use std::collections::HashMap; + +use logos::Logos; + +static LOOKUP: phf::Map<&'static str, &'static str> = phf_map! { + " " => " ", // str a forced space should substitute to. 
+}; + +#[derive(Logos, Debug)] +enum Token { + #[token("{")] + OpenCurlyBracket, + #[token("}")] + ClosedCurlyBracket, + #[regex(r"\\\w+")] + LaTeXMacro, + #[token(r"\ ")] + ForcedSpace, +} + +pub fn optimized_sanitize(input_text: &str) -> String { + let mut out: Vec<&str> = Vec::new(); + let mut bracket_counter: u32 = 0; + let mut counter_actions: HashMap = HashMap::new(); + let mut lex = Token::lexer(input_text); + while let Some(sometoken) = lex.next() { + match sometoken { + Ok(token) => match token { + Token::ForcedSpace => { + out.push( + LOOKUP + .get(" ") + .expect("Something is wrong with the sanitization lookup table."), + ); + } + Token::OpenCurlyBracket => { + bracket_counter.saturating_add(1); + todo!(); + } + Token::ClosedCurlyBracket => { + bracket_counter.saturating_sub(1); + todo!(); + } + Token::LaTeXMacro => { + todo!() + } + }, + Err(_) => { + out.push(lex.slice()); + } + } + } + out.into_iter().collect::() +} + +#[cfg(test)] +mod tests { + use super::optimized_sanitize; + + #[test] + fn check_sanitization() { + let result = optimized_sanitize( + r"\mkbibquote{Intention} und \mkbibquote{Intentionen sind \hyphen\ bibquote.}", + ); + println!("{}", result); + panic!("Tatütata!"); + } +} -- cgit v1.2.3 From 3ba8f024577e52c51833cd34b07ad90d14cb6338 Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Sat, 4 Oct 2025 12:00:11 +0200 Subject: macro-sani: Implemented new algorithm to replace macros. --- src/bibiman/sanitize/optimized_sanitize.rs | 61 +++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs index b3bf90d..8788d39 100644 --- a/src/bibiman/sanitize/optimized_sanitize.rs +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -20,8 +20,13 @@ use std::collections::HashMap; use logos::Logos; -static LOOKUP: phf::Map<&'static str, &'static str> = phf_map! { - " " => " ", // str a forced space should substitute to. 
+static LOOKUP: phf::Map<&'static str, (&'static str, Option<&'static str>)> = phf_map! { + r"\mkbibquote" => ("\"", Some("\"")), + r"\enquote*" => ("\'", Some("\'")), + r"\enquote" => ("\"", Some("\"")), + r"\hyphen" => ("-", None), + r"\textbf" => ("", Some("")), + r"\textit" => ("", Some("")), }; #[derive(Logos, Debug)] @@ -30,41 +35,59 @@ enum Token { OpenCurlyBracket, #[token("}")] ClosedCurlyBracket, - #[regex(r"\\\w+")] + #[regex(r"\\[\*\w]+")] LaTeXMacro, #[token(r"\ ")] ForcedSpace, } pub fn optimized_sanitize(input_text: &str) -> String { - let mut out: Vec<&str> = Vec::new(); + let mut out: Vec<&str> = Vec::with_capacity(input_text.chars().count()); let mut bracket_counter: u32 = 0; - let mut counter_actions: HashMap = HashMap::new(); + let mut bc_up: bool = false; + let mut counter_actions: HashMap = HashMap::new(); let mut lex = Token::lexer(input_text); while let Some(sometoken) = lex.next() { match sometoken { Ok(token) => match token { Token::ForcedSpace => { - out.push( - LOOKUP - .get(" ") - .expect("Something is wrong with the sanitization lookup table."), - ); + out.push(" "); + bc_up = false; } Token::OpenCurlyBracket => { - bracket_counter.saturating_add(1); - todo!(); + if bc_up { + bracket_counter = bracket_counter.saturating_add(1); + } else { + out.push("{") + } } Token::ClosedCurlyBracket => { - bracket_counter.saturating_sub(1); - todo!(); + if bracket_counter == 0 { + out.push("}") + } else { + match counter_actions.remove(&bracket_counter) { + None => out.push("}"), + Some(a) => out.push(a), + } + bracket_counter = bracket_counter - 1; + } } Token::LaTeXMacro => { - todo!() + let texmacro = lex.slice(); + if let Some(x) = LOOKUP.get(&texmacro) { + if let Some(end) = x.1 { + bc_up = true; + counter_actions.insert(bracket_counter + 1, end); + } + out.push(x.0); + } else { + out.push(texmacro) + } } }, Err(_) => { out.push(lex.slice()); + bc_up = false; } } } @@ -78,9 +101,11 @@ mod tests { #[test] fn check_sanitization() { let result = 
optimized_sanitize( - r"\mkbibquote{Intention} und \mkbibquote{Intentionen sind \hyphen\ bibquote.}", + r"\mkbibquote{Intention} und \mkbibquote{Intentionen \mkbibquote{sind} \hyphen\ bibquote.}", ); - println!("{}", result); - panic!("Tatütata!"); + assert_eq!( + "\"Intention\" und \"Intentionen \"sind\" - bibquote.\"", + result + ) } } -- cgit v1.2.3 From d80ce65ad5efb64fcce313a4c44b7f46fc5e7798 Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Sat, 4 Oct 2025 12:30:22 +0200 Subject: macro-sani: skipping the algorithm, if no macro is in the string. --- src/bibiman/sanitize/optimized_sanitize.rs | 94 +++++++++++++++++------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs index 8788d39..3a9dc67 100644 --- a/src/bibiman/sanitize/optimized_sanitize.rs +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -42,56 +42,68 @@ enum Token { } pub fn optimized_sanitize(input_text: &str) -> String { - let mut out: Vec<&str> = Vec::with_capacity(input_text.chars().count()); - let mut bracket_counter: u32 = 0; - let mut bc_up: bool = false; - let mut counter_actions: HashMap = HashMap::new(); - let mut lex = Token::lexer(input_text); - while let Some(sometoken) = lex.next() { - match sometoken { - Ok(token) => match token { - Token::ForcedSpace => { - out.push(" "); - bc_up = false; - } - Token::OpenCurlyBracket => { - if bc_up { - bracket_counter = bracket_counter.saturating_add(1); - } else { - out.push("{") + let mut char_counter: usize = 0; + let mut contains_macro: bool = false; + for char in input_text.chars() { + if char == '\\' { + contains_macro = true; + } + char_counter = char_counter.saturating_add(1); + } + if !contains_macro { + input_text.to_string() + } else { + let mut out: Vec<&str> = Vec::with_capacity(char_counter); + let mut bracket_counter: u32 = 0; + let mut bc_up: bool = false; + let mut counter_actions: HashMap = HashMap::new(); + let mut lex = 
Token::lexer(input_text); + while let Some(sometoken) = lex.next() { + match sometoken { + Ok(token) => match token { + Token::ForcedSpace => { + out.push(" "); + bc_up = false; } - } - Token::ClosedCurlyBracket => { - if bracket_counter == 0 { - out.push("}") - } else { - match counter_actions.remove(&bracket_counter) { - None => out.push("}"), - Some(a) => out.push(a), + Token::OpenCurlyBracket => { + if bc_up { + bracket_counter = bracket_counter.saturating_add(1); + } else { + out.push("{") } - bracket_counter = bracket_counter - 1; } - } - Token::LaTeXMacro => { - let texmacro = lex.slice(); - if let Some(x) = LOOKUP.get(&texmacro) { - if let Some(end) = x.1 { - bc_up = true; - counter_actions.insert(bracket_counter + 1, end); + Token::ClosedCurlyBracket => { + if bracket_counter == 0 { + out.push("}") + } else { + match counter_actions.remove(&bracket_counter) { + None => out.push("}"), + Some(a) => out.push(a), + } + bracket_counter = bracket_counter - 1; + } + } + Token::LaTeXMacro => { + let texmacro = lex.slice(); + if let Some(x) = LOOKUP.get(&texmacro) { + if let Some(end) = x.1 { + bc_up = true; + counter_actions.insert(bracket_counter + 1, end); + } + out.push(x.0); + } else { + out.push(texmacro) } - out.push(x.0); - } else { - out.push(texmacro) } + }, + Err(_) => { + out.push(lex.slice()); + bc_up = false; } - }, - Err(_) => { - out.push(lex.slice()); - bc_up = false; } } + out.into_iter().collect::<String>() } - out.into_iter().collect::<String>() } #[cfg(test)] -- cgit v1.2.3 From f5adcd0fad71828646b5047c661a0d8524a3fc9c Mon Sep 17 00:00:00 2001 From: Klimperfix Date: Sat, 4 Oct 2025 12:37:40 +0200 Subject: macro-sani: Fixed whitespace handling after latex macro. 
--- src/bibiman/sanitize/optimized_sanitize.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs index 3a9dc67..8ee0115 100644 --- a/src/bibiman/sanitize/optimized_sanitize.rs +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -35,7 +35,7 @@ enum Token { OpenCurlyBracket, #[token("}")] ClosedCurlyBracket, - #[regex(r"\\[\*\w]+")] + #[regex(r"\\[\*\w]+ ?")] LaTeXMacro, #[token(r"\ ")] ForcedSpace, @@ -85,7 +85,7 @@ pub fn optimized_sanitize(input_text: &str) -> String { } Token::LaTeXMacro => { let texmacro = lex.slice(); - if let Some(x) = LOOKUP.get(&texmacro) { + if let Some(x) = LOOKUP.get(&texmacro.trim_end()) { if let Some(end) = x.1 { bc_up = true; counter_actions.insert(bracket_counter + 1, end); @@ -113,10 +113,10 @@ mod tests { #[test] fn check_sanitization() { let result = optimized_sanitize( - r"\mkbibquote{Intention} und \mkbibquote{Intentionen \mkbibquote{sind} \hyphen\ bibquote.}", + r"\mkbibquote {Intention} und \mkbibquote{Intentionen \mkbibquote{sind} \hyphen\ bibquote\hyphen .}", ); assert_eq!( - "\"Intention\" und \"Intentionen \"sind\" - bibquote.\"", + "\"Intention\" und \"Intentionen \"sind\" - bibquote-.\"", result ) } -- cgit v1.2.3 From 606716f064c1151ab9e8617ff76fd4b95f4a2c57 Mon Sep 17 00:00:00 2001 From: lukeflo Date: Sun, 5 Oct 2025 11:59:24 +0200 Subject: add functions to make sanitized data from PR #57 visible in the information tab too --- src/bibiman/bibisetup.rs | 22 +++++++++++++++++++--- src/tui/ui.rs | 6 +++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 37b0b01..b3f788c 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -173,7 +173,11 @@ impl BibiData { } pub fn title(&self) -> &str { - &self.title + if let Some(sani_data) = &self.sanitized_bibi_data { + &sani_data.title + } else { + &self.title + } } pub fn year(&self) 
-> &str { @@ -204,8 +208,20 @@ impl BibiData { .collect_vec() } - pub fn subtitle(&self) -> &str { - self.subtitle.as_ref().unwrap() + pub fn subtitle(&self) -> Option<&str> { + if let Some(sani_data) = &self.sanitized_bibi_data { + sani_data.subtitle.as_ref().map(|s| s.as_str()) + } else { + self.subtitle.as_ref().map(|s| s.as_str()) + } + } + + pub fn get_abstract(&self) -> &str { + if let Some(sani_data) = &self.sanitized_bibi_data { + &sani_data.abstract_text + } else { + &self.abstract_text + } } fn create_symbols(&self, cfg: &BibiConfig) -> [Option; 3] { diff --git a/src/tui/ui.rs b/src/tui/ui.rs index 3e6e24c..87d8c29 100644 --- a/src/tui/ui.rs +++ b/src/tui/ui.rs @@ -894,7 +894,7 @@ pub fn render_selected_item(app: &mut App, cfg: &BibiConfig, frame: &mut Frame, Style::new().fg(cfg.colors.author_color), ), ])); - if cur_entry.subtitle.is_some() { + if let Some(subtitle) = cur_entry.subtitle() { lines.push(Line::from(vec![ Span::styled("Title: ", style_value), Span::styled( @@ -910,7 +910,7 @@ pub fn render_selected_item(app: &mut App, cfg: &BibiConfig, frame: &mut Frame, .add_modifier(Modifier::ITALIC), ), Span::styled( - cur_entry.subtitle(), + subtitle, Style::new() .fg(cfg.colors.title_color) .add_modifier(Modifier::ITALIC), @@ -999,7 +999,7 @@ pub fn render_selected_item(app: &mut App, cfg: &BibiConfig, frame: &mut Frame, } lines.push(Line::from("")); lines.push(Line::from(vec![Span::styled( - cur_entry.abstract_text.clone(), + cur_entry.get_abstract(), Style::new().fg(cfg.colors.main_text_color), )])); lines -- cgit v1.2.3 From f84ebacd1ea47b09c58dd1ef1eaaf70feaacbe0f Mon Sep 17 00:00:00 2001 From: lukeflo Date: Sun, 5 Oct 2025 13:16:26 +0200 Subject: add some further text macros to be hidden --- src/bibiman/sanitize/optimized_sanitize.rs | 2 ++ tests/biblatex-test.bib | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs index 8ee0115..336cc56 
100644 --- a/src/bibiman/sanitize/optimized_sanitize.rs +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -27,6 +27,8 @@ static LOOKUP: phf::Map<&'static str, (&'static str, Option<&'static str>)> = ph r"\hyphen" => ("-", None), r"\textbf" => ("", Some("")), r"\textit" => ("", Some("")), + r"\texttt" => ("", Some("")), + r"\textsc" => ("", Some("")), }; #[derive(Logos, Debug)] diff --git a/tests/biblatex-test.bib b/tests/biblatex-test.bib index fcc5085..2149e7c 100644 --- a/tests/biblatex-test.bib +++ b/tests/biblatex-test.bib @@ -107,7 +107,7 @@ } @mvbook{aristotle_rhetoric, - title = {The Rhetoric of {Aristotle} with a commentary by the late {Edward + title = {The \textbf{Rhetoric} of {Aristotle} with a commentary by the late {Edward Meredith Cope}}, shorttitle = {Rhetoric}, author = {Aristotle}, @@ -127,7 +127,7 @@ } @book{augustine, - title = {Heterogeneous catalysis for the synthetic chemist}, + title = {Heterogeneous catalysis for the synthetic \textit{chemist}}, shorttitle = {Heterogeneous catalysis}, author = {Augustine, Robert L.}, location = {New York}, @@ -289,7 +289,7 @@ annotation = {An \texttt{article} entry with an \texttt{eid} and a \texttt{ doi} field. Note that the \textsc{doi} is transformed into a clickable link if \texttt{hyperref} support has been enabled}, - abstract = {The computation of ionic solvation free energies from atomistic + abstract = {The computation of \texttt{ionic} solvation free energies from atomistic simulations is a surprisingly difficult problem that has found no satisfactory solution for more than 15 years. The reason is that the charging free energies evaluated from such simulations are -- cgit v1.2.3