diff options
| author | Klimperfix | 2025-10-03 16:56:30 +0200 |
|---|---|---|
| committer | Klimperfix | 2025-10-03 17:58:53 +0200 |
| commit | 26befd38aedbfdd278c3096644baf69e4a1fb051 (patch) | |
| tree | fefa546a14b2bffead3a0b866b6826f65a4ef175 | |
| parent | dfb7edde13ca39af3e23b80e40272e02aa093919 (diff) | |
| download | bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.tar.gz bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.zip | |
Now storing the sanitized data separately, keeping the original.
| -rw-r--r-- | src/bibiman/bibisetup.rs | 154 | ||||
| -rw-r--r-- | src/bibiman/entries.rs | 8 | ||||
| -rw-r--r-- | src/bibiman/sanitize.rs | 96 | ||||
| -rw-r--r-- | src/bibiman/search.rs | 4 |
4 files changed, 175 insertions, 87 deletions
diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 8466169..48046e9 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -26,7 +26,7 @@ use std::{fs, path::PathBuf}; use walkdir::WalkDir; use crate::app; -use crate::bibiman::sanitize::sanitize_one; +use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata}; use crate::cliargs::{self}; use crate::config::BibiConfig; @@ -77,6 +77,18 @@ pub struct BibiData { pub subtitle: Option<String>, pub notes: Option<Vec<OsString>>, pub symbols: [Option<String>; 3], + /// This field should be set to None when initially creating a BibiData instance. + /// It then can be generated from the constructed BibiData Object using + /// `BibiData::gen_sanitized()` + pub sanitized_bibi_data: Option<SanitizedBibiData>, +} + +/// Struct that holds sanitized bibidata data. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct SanitizedBibiData { + pub title: String, + pub subtitle: Option<String>, + pub abstract_text: String, } #[derive(Debug, Clone, PartialEq)] @@ -120,22 +132,41 @@ impl BibiData { // self.pubtype(), // &self.symbols, // ] - - BibiRow { - authors: { - if self.short_author.is_empty() { - self.authors() - } else { - &self.short_author - } - }, - title: self.title(), - year: self.year(), - custom_field_value: self.custom_field_value(), - symbols: &self.symbols, + let author_ref = if self.short_author.is_empty() { + self.authors() + } else { + &self.short_author + }; + if let Some(sanidata) = &self.sanitized_bibi_data { + BibiRow { + authors: author_ref, + title: &sanidata.title, + year: self.year(), + custom_field_value: self.custom_field_value(), + symbols: &self.symbols, + } + } else { + BibiRow { + authors: author_ref, + title: self.title(), + year: self.year(), + custom_field_value: self.custom_field_value(), + symbols: &self.symbols, + } } } + /// Generates the SanitizedBibiData for the BibiData. + /// + /// Consumes self and returns a new BibiData struct. 
+ /// + /// If multiple SanitizedBibiData are to be generated, + /// one should use the [`mass_sanitize`] function instead. + pub fn gen_sanitized(mut self) -> Self { + self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self)); + self + } + pub fn entry_id(&self) -> &u32 { &self.id } @@ -288,59 +319,48 @@ impl BibiSetup { } else { None }; - citekeys - .iter() - .enumerate() - .map(|(i, k)| { - let filepaths: (Option<Vec<OsString>>, bool) = - { Self::get_filepath(k, bibliography, &mut pdf_files) }; - - // bibiman will sanitize some fields at this point, - // this may cause longer startup-load-times. - // - // It may be better to sanitize them somewhere else, so bibiman - // does not loose the original text-information including the - // LaTeX macros present in the bibfile. From here on, they will be - // gone. - // - // The following fields are going to be sanitized: - // - // - title - // - subtitle - // - abstract_text - // - // TODO: Once the final decision to implement the sanitization at - // this point, one could write a constructor for the BibiData struct - // which handles the sanitization. 
- BibiData { - id: i as u32, - authors: Self::get_authors(k, bibliography), - short_author: String::new(), - title: sanitize_one(&Self::get_title(k, bibliography)), - year: Self::get_year(k, bibliography), - custom_field: ( - cfg.general.custom_column.clone(), - Self::get_custom_field(k, bibliography, &cfg.general.custom_column), - ), - keywords: Self::get_keywords(k, bibliography), - citekey: k.to_owned(), - abstract_text: sanitize_one(&Self::get_abstract(k, bibliography)), - doi_url: Self::get_weblink(k, bibliography), - filepath: filepaths.0, - file_field: filepaths.1, - subtitle: match Self::get_subtitle(k, bibliography) { - None => None, - Some(x) => Some(sanitize_one(&x)), - }, - notes: if note_files.is_some() { - Self::get_notepath(k, &mut note_files, &ext) - } else { - None - }, - symbols: [None, None, None], - } - }) - .collect() + // + // + // bibiman will sanitize some fields at this point, + // this may cause longer startup-load-times. + // + // + mass_sanitize( + citekeys + .iter() + .enumerate() + .map(|(i, k)| { + let filepaths: (Option<Vec<OsString>>, bool) = + { Self::get_filepath(k, bibliography, &mut pdf_files) }; + + BibiData { + id: i as u32, + authors: Self::get_authors(k, bibliography), + short_author: String::new(), + title: Self::get_title(k, bibliography), + year: Self::get_year(k, bibliography), + custom_field: ( + cfg.general.custom_column.clone(), + Self::get_custom_field(k, bibliography, &cfg.general.custom_column), + ), + keywords: Self::get_keywords(k, bibliography), + citekey: k.to_owned(), + abstract_text: Self::get_abstract(k, bibliography), + doi_url: Self::get_weblink(k, bibliography), + filepath: filepaths.0, + file_field: filepaths.1, + subtitle: Self::get_subtitle(k, bibliography), + notes: if note_files.is_some() { + Self::get_notepath(k, &mut note_files, &ext) + } else { + None + }, + symbols: [None, None, None], + sanitized_bibi_data: None, + } + }) + .collect(), + ) } // get list of citekeys from the given bibfile diff 
--git a/src/bibiman/entries.rs b/src/bibiman/entries.rs index db6d6bf..0b35a8b 100644 --- a/src/bibiman/entries.rs +++ b/src/bibiman/entries.rs @@ -174,7 +174,9 @@ mod tests { subtitle: None, notes: None, symbols: [None, None, None], - }; + sanitized_bibi_data: None, + } + .gen_sanitized(); let entry_vec = BibiData::ref_vec(&mut entry, &cfg); @@ -194,7 +196,9 @@ mod tests { subtitle: None, notes: None, symbols: [None, None, None], - }; + sanitized_bibi_data: None, + } + .gen_sanitized(); let entry_vec_editors = BibiData::ref_vec(&mut entry_editors, &cfg); diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs index aaf81ad..614ed11 100644 --- a/src/bibiman/sanitize.rs +++ b/src/bibiman/sanitize.rs @@ -18,6 +18,8 @@ use fancy_regex::Regex; use unicodeit::replace as unicode_replace; +use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData}; + /// Sanitizing process rules as regex cmds. /// /// Only macros that are not already covered by unicodeit should be processed in this way. @@ -33,6 +35,71 @@ const SANITIZE_REGEX: &[(&str, &str)] = &[ (r"\\hyphen", "-"), ]; +/// Function to build the sanitization regex vector: +fn regex_vector() -> Vec<(Regex, &'static str)> { + let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); + // build regex + for (search, replace) in SANITIZE_REGEX { + regex.push((Regex::new(search).unwrap(), replace)); + } + regex +} + +fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> { + let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect(); + + // process strings + let result_len = result.len(); + for (re, replace) in regex { + for i in 0..result_len { + result[i] = re.replace_all(&result[i], *replace).to_string(); + } + } + for i in 0..result_len { + result[i] = unicode_replace(&result[i]); + } + + // return result + result +} + +/// Helper macro to sanitize bibidata structs. 
+/// Here lives the code that generates SanitizedBibiData +/// structs from BibiData structs. +macro_rules! optimized_sanitize_bibidata { + ($bibidata:expr, $regex:expr) => { + match &$bibidata.subtitle { + None => { + let sanitized_data = + optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex); + SanitizedBibiData { + title: sanitized_data[0].clone(), + subtitle: None, + abstract_text: sanitized_data[1].clone(), + } + } + Some(subtitle) => { + let sanitized_data = optimized_sanitize( + vec![&$bibidata.title, subtitle, &$bibidata.abstract_text], + &$regex, + ); + SanitizedBibiData { + title: sanitized_data[0].clone(), + subtitle: Some(sanitized_data[1].clone()), + abstract_text: sanitized_data[2].clone(), + } + } + } + }; +} + +/// Sanitize one BibiData and return a SanitizedBibiData struct. +/// This function does ignore any existing sanitization of the bibidata struct. +pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData { + let regex = regex_vector(); + optimized_sanitize_bibidata!(bibidata, regex) +} + /// Sanitize one String with LaTeX Macros into a more readable one without. /// /// If one is going to mass-sanitize strings, one should use the [`sanitize`] @@ -50,24 +117,19 @@ pub fn sanitize_one(input_text: &str) -> String { /// /// This function does always return the same amount of Strings as it gets in the input list. 
pub fn sanitize(input_text: Vec<&str>) -> Vec<String> { - let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect(); - let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); - // build regex - for (search, replace) in SANITIZE_REGEX { - regex.push((Regex::new(search).unwrap(), replace)); - } + optimized_sanitize(input_text, ®ex_vector()) +} - // process strings - let result_len = result.len(); - for (re, replace) in regex { - for i in 0..result_len { - result[i] = re.replace_all(&result[i], replace).to_string(); - } - } - for i in 0..result_len { - result[i] = unicode_replace(&result[i]); - } +/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one. +pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> { + let regex: Vec<(Regex, &str)> = regex_vector(); - // return result + let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len()); + for entry in bibidata { + result.push(BibiData { + sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)), + ..entry + }); + } result } diff --git a/src/bibiman/search.rs b/src/bibiman/search.rs index e0c5f17..2156634 100644 --- a/src/bibiman/search.rs +++ b/src/bibiman/search.rs @@ -141,7 +141,9 @@ mod tests { subtitle: None, notes: None, symbols: [None, None, None], - }; + sanitized_bibi_data: None, + } + .gen_sanitized(); let joined_vec = BibiSearch::convert_to_string(&bibvec); |
