Now storing the sanitized data separately, keeping the original.

author: Klimperfix 2025-10-03 16:56:30 +0200
committer: Klimperfix 2025-10-03 17:58:53 +0200
commit: 26befd38aedbfdd278c3096644baf69e4a1fb051 (patch)
tree: fefa546a14b2bffead3a0b866b6826f65a4ef175
parent: dfb7edde13ca39af3e23b80e40272e02aa093919 (diff)
download: bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.tar.gz
bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.zip
4 files changed, 175 insertions, 87 deletions
diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs
index 8466169..48046e9 100644
--- a/src/bibiman/bibisetup.rs
+++ b/src/bibiman/bibisetup.rs
@@ -26,7 +26,7 @@ use std::{fs, path::PathBuf};
 use walkdir::WalkDir;
 
 use crate::app;
-use crate::bibiman::sanitize::sanitize_one;
+use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata};
 use crate::cliargs::{self};
 use crate::config::BibiConfig;
 
@@ -77,6 +77,18 @@ pub struct BibiData {
     pub subtitle: Option<String>,
     pub notes: Option<Vec<OsString>>,
     pub symbols: [Option<String>; 3],
+    /// This field should be set to None when initially creating a BibiData instance.
+    /// It then can be generated from the constructed BibiData Object using
+    /// `BibiData::gen_sanitized()`
+    pub sanitized_bibi_data: Option<SanitizedBibiData>,
+}
+
+/// Struct that holds sanitized bibidata data.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SanitizedBibiData {
+    pub title: String,
+    pub subtitle: Option<String>,
+    pub abstract_text: String,
 }
 
 #[derive(Debug, Clone, PartialEq)]
@@ -120,22 +132,41 @@ impl BibiData {
         //     self.pubtype(),
         //     &self.symbols,
         // ]
-
-        BibiRow {
-            authors: {
-                if self.short_author.is_empty() {
-                    self.authors()
-                } else {
-                    &self.short_author
-                }
-            },
-            title: self.title(),
-            year: self.year(),
-            custom_field_value: self.custom_field_value(),
-            symbols: &self.symbols,
+        let author_ref = if self.short_author.is_empty() {
+            self.authors()
+        } else {
+            &self.short_author
+        };
+        if let Some(sanidata) = &self.sanitized_bibi_data {
+            BibiRow {
+                authors: author_ref,
+                title: &sanidata.title,
+                year: self.year(),
+                custom_field_value: self.custom_field_value(),
+                symbols: &self.symbols,
+            }
+        } else {
+            BibiRow {
+                authors: author_ref,
+                title: self.title(),
+                year: self.year(),
+                custom_field_value: self.custom_field_value(),
+                symbols: &self.symbols,
+            }
         }
     }
 
+    /// Generates the SanitizedBibiData for the BibiData.
+    ///
+    /// Consumes self and returns a new BibiData struct.
+    ///
+    /// If multiple SanitizedBibiData are to be generated,
+    /// one should use the [`mass_sanitize`] function instead.
+    pub fn gen_sanitized(mut self) -> Self {
+        self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self));
+        self
+    }
+
     pub fn entry_id(&self) -> &u32 {
         &self.id
     }
@@ -288,59 +319,48 @@ impl BibiSetup {
             } else {
                 None
             };
-        citekeys
-            .iter()
-            .enumerate()
-            .map(|(i, k)| {
-                let filepaths: (Option<Vec<OsString>>, bool) =
-                    { Self::get_filepath(k, bibliography, &mut pdf_files) };
-
-                // bibiman will sanitize some fields at this point,
-                // this may cause longer startup-load-times.
-                //
-                // It may be better to sanitize them somewhere else, so bibiman
-                // does not loose the original text-information including the
-                // LaTeX macros present in the bibfile. From here on, they will be
-                // gone.
-                //
-                // The following fields are going to be sanitized:
-                //
-                // - title
-                // - subtitle
-                // - abstract_text
-                //
-                // TODO: Once the final decision to implement the sanitization at
-                // this point, one could write a constructor for the BibiData struct
-                // which handles the sanitization.
-                BibiData {
-                    id: i as u32,
-                    authors: Self::get_authors(k, bibliography),
-                    short_author: String::new(),
-                    title: sanitize_one(&Self::get_title(k, bibliography)),
-                    year: Self::get_year(k, bibliography),
-                    custom_field: (
-                        cfg.general.custom_column.clone(),
-                        Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
-                    ),
-                    keywords: Self::get_keywords(k, bibliography),
-                    citekey: k.to_owned(),
-                    abstract_text: sanitize_one(&Self::get_abstract(k, bibliography)),
-                    doi_url: Self::get_weblink(k, bibliography),
-                    filepath: filepaths.0,
-                    file_field: filepaths.1,
-                    subtitle: match Self::get_subtitle(k, bibliography) {
-                        None => None,
-                        Some(x) => Some(sanitize_one(&x)),
-                    },
-                    notes: if note_files.is_some() {
-                        Self::get_notepath(k, &mut note_files, &ext)
-                    } else {
-                        None
-                    },
-                    symbols: [None, None, None],
-                }
-            })
-            .collect()
+        //
+        //
+        // bibiman will sanitize some fields at this point,
+        // this may cause longer startup-load-times.
+        //
+        //
+        mass_sanitize(
+            citekeys
+                .iter()
+                .enumerate()
+                .map(|(i, k)| {
+                    let filepaths: (Option<Vec<OsString>>, bool) =
+                        { Self::get_filepath(k, bibliography, &mut pdf_files) };
+
+                    BibiData {
+                        id: i as u32,
+                        authors: Self::get_authors(k, bibliography),
+                        short_author: String::new(),
+                        title: Self::get_title(k, bibliography),
+                        year: Self::get_year(k, bibliography),
+                        custom_field: (
+                            cfg.general.custom_column.clone(),
+                            Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
+                        ),
+                        keywords: Self::get_keywords(k, bibliography),
+                        citekey: k.to_owned(),
+                        abstract_text: Self::get_abstract(k, bibliography),
+                        doi_url: Self::get_weblink(k, bibliography),
+                        filepath: filepaths.0,
+                        file_field: filepaths.1,
+                        subtitle: Self::get_subtitle(k, bibliography),
+                        notes: if note_files.is_some() {
+                            Self::get_notepath(k, &mut note_files, &ext)
+                        } else {
+                            None
+                        },
+                        symbols: [None, None, None],
+                        sanitized_bibi_data: None,
+                    }
+                })
+                .collect(),
+        )
     }
 
     // get list of citekeys from the given bibfile
diff --git a/src/bibiman/entries.rs b/src/bibiman/entries.rs
index db6d6bf..0b35a8b 100644
--- a/src/bibiman/entries.rs
+++ b/src/bibiman/entries.rs
@@ -174,7 +174,9 @@ mod tests {
             subtitle: None,
             notes: None,
             symbols: [None, None, None],
-        };
+            sanitized_bibi_data: None,
+        }
+        .gen_sanitized();
 
         let entry_vec = BibiData::ref_vec(&mut entry, &cfg);
 
@@ -194,7 +196,9 @@ mod tests {
             subtitle: None,
             notes: None,
             symbols: [None, None, None],
-        };
+            sanitized_bibi_data: None,
+        }
+        .gen_sanitized();
 
         let entry_vec_editors = BibiData::ref_vec(&mut entry_editors, &cfg);
 
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
index aaf81ad..614ed11 100644
--- a/src/bibiman/sanitize.rs
+++ b/src/bibiman/sanitize.rs
@@ -18,6 +18,8 @@
 use fancy_regex::Regex;
 use unicodeit::replace as unicode_replace;
 
+use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData};
+
 /// Sanitizing process rules as regex cmds.
 ///
 /// Only macros that are not already covered by unicodeit should be processed in this way.
@@ -33,6 +35,71 @@ const SANITIZE_REGEX: &[(&str, &str)] = &[
     (r"\\hyphen", "-"),
 ];
 
+/// Function to build the sanitization regex vector:
+fn regex_vector() -> Vec<(Regex, &'static str)> {
+    let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
+    // build regex
+    for (search, replace) in SANITIZE_REGEX {
+        regex.push((Regex::new(search).unwrap(), replace));
+    }
+    regex
+}
+
+fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> {
+    let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
+
+    // process strings
+    let result_len = result.len();
+    for (re, replace) in regex {
+        for i in 0..result_len {
+            result[i] = re.replace_all(&result[i], *replace).to_string();
+        }
+    }
+    for i in 0..result_len {
+        result[i] = unicode_replace(&result[i]);
+    }
+
+    // return result
+    result
+}
+
+/// Helper macro to sanitize bibidata structs.
+/// Here lives the code that generates SanitizedBibiData
+/// structs from BibiData structs.
+macro_rules! optimized_sanitize_bibidata {
+    ($bibidata:expr, $regex:expr) => {
+        match &$bibidata.subtitle {
+            None => {
+                let sanitized_data =
+                    optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex);
+                SanitizedBibiData {
+                    title: sanitized_data[0].clone(),
+                    subtitle: None,
+                    abstract_text: sanitized_data[1].clone(),
+                }
+            }
+            Some(subtitle) => {
+                let sanitized_data = optimized_sanitize(
+                    vec![&$bibidata.title, subtitle, &$bibidata.abstract_text],
+                    &$regex,
+                );
+                SanitizedBibiData {
+                    title: sanitized_data[0].clone(),
+                    subtitle: Some(sanitized_data[1].clone()),
+                    abstract_text: sanitized_data[2].clone(),
+                }
+            }
+        }
+    };
+}
+
+/// Sanitize one BibiData and return a SanitizedBibiData struct.
+/// This function does ignore any existing sanitization of the bibidata struct.
+pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
+    let regex = regex_vector();
+    optimized_sanitize_bibidata!(bibidata, regex)
+}
+
 /// Sanitize one String with LaTeX Macros into a more readable one without.
 ///
 /// If one is going to mass-sanitize strings, one should use the [`sanitize`]
@@ -50,24 +117,19 @@ pub fn sanitize_one(input_text: &str) -> String {
 ///
 /// This function does always return the same amount of Strings as it gets in the input list.
 pub fn sanitize(input_text: Vec<&str>) -> Vec<String> {
-    let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
-    let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
-    // build regex
-    for (search, replace) in SANITIZE_REGEX {
-        regex.push((Regex::new(search).unwrap(), replace));
-    }
+    optimized_sanitize(input_text, &regex_vector())
+}
 
-    // process strings
-    let result_len = result.len();
-    for (re, replace) in regex {
-        for i in 0..result_len {
-            result[i] = re.replace_all(&result[i], replace).to_string();
-        }
-    }
-    for i in 0..result_len {
-        result[i] = unicode_replace(&result[i]);
-    }
+/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one.
+pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> {
+    let regex: Vec<(Regex, &str)> = regex_vector();
 
-    // return result
+    let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len());
+    for entry in bibidata {
+        result.push(BibiData {
+            sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)),
+            ..entry
+        });
+    }
     result
 }
diff --git a/src/bibiman/search.rs b/src/bibiman/search.rs
index e0c5f17..2156634 100644
--- a/src/bibiman/search.rs
+++ b/src/bibiman/search.rs
@@ -141,7 +141,9 @@ mod tests {
             subtitle: None,
             notes: None,
             symbols: [None, None, None],
-        };
+            sanitized_bibi_data: None,
+        }
+        .gen_sanitized();
 
         let joined_vec = BibiSearch::convert_to_string(&bibvec);
author	Klimperfix	2025-10-03 16:56:30 +0200
committer	Klimperfix	2025-10-03 17:58:53 +0200
commit	26befd38aedbfdd278c3096644baf69e4a1fb051 (patch)
tree	fefa546a14b2bffead3a0b866b6826f65a4ef175
parent	dfb7edde13ca39af3e23b80e40272e02aa093919 (diff)
download	bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.tar.gz bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.zip