3 files changed, 133 insertions, 123 deletions
diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs
index 48046e9..37b0b01 100644
--- a/src/bibiman/bibisetup.rs
+++ b/src/bibiman/bibisetup.rs
@@ -26,7 +26,7 @@ use std::{fs, path::PathBuf};
 use walkdir::WalkDir;
 
 use crate::app;
-use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata};
+use crate::bibiman::sanitize::sanitize_one_bibidata;
 use crate::cliargs::{self};
 use crate::config::BibiConfig;
 
@@ -159,9 +159,6 @@ impl BibiData {
     /// Generates the SanitizedBibiData for the BibiData.
     ///
     /// Consumes self and returns a new BibiData struct.
-    ///
-    /// If multiple SanitizedBibiData are to be generated,
-    /// one should use the [`mass_sanitize`] function instead.
     pub fn gen_sanitized(mut self) -> Self {
         self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self));
         self
@@ -325,42 +322,41 @@ impl BibiSetup {
         // this may cause longer startup-load-times.
         //
         //
-        mass_sanitize(
-            citekeys
-                .iter()
-                .enumerate()
-                .map(|(i, k)| {
-                    let filepaths: (Option<Vec<OsString>>, bool) =
-                        { Self::get_filepath(k, bibliography, &mut pdf_files) };
-
-                    BibiData {
-                        id: i as u32,
-                        authors: Self::get_authors(k, bibliography),
-                        short_author: String::new(),
-                        title: Self::get_title(k, bibliography),
-                        year: Self::get_year(k, bibliography),
-                        custom_field: (
-                            cfg.general.custom_column.clone(),
-                            Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
-                        ),
-                        keywords: Self::get_keywords(k, bibliography),
-                        citekey: k.to_owned(),
-                        abstract_text: Self::get_abstract(k, bibliography),
-                        doi_url: Self::get_weblink(k, bibliography),
-                        filepath: filepaths.0,
-                        file_field: filepaths.1,
-                        subtitle: Self::get_subtitle(k, bibliography),
-                        notes: if note_files.is_some() {
-                            Self::get_notepath(k, &mut note_files, &ext)
-                        } else {
-                            None
-                        },
-                        symbols: [None, None, None],
-                        sanitized_bibi_data: None,
-                    }
-                })
-                .collect(),
-        )
+        citekeys
+            .iter()
+            .enumerate()
+            .map(|(i, k)| {
+                let filepaths: (Option<Vec<OsString>>, bool) =
+                    { Self::get_filepath(k, bibliography, &mut pdf_files) };
+
+                BibiData {
+                    id: i as u32,
+                    authors: Self::get_authors(k, bibliography),
+                    short_author: String::new(),
+                    title: Self::get_title(k, bibliography),
+                    year: Self::get_year(k, bibliography),
+                    custom_field: (
+                        cfg.general.custom_column.clone(),
+                        Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
+                    ),
+                    keywords: Self::get_keywords(k, bibliography),
+                    citekey: k.to_owned(),
+                    abstract_text: Self::get_abstract(k, bibliography),
+                    doi_url: Self::get_weblink(k, bibliography),
+                    filepath: filepaths.0,
+                    file_field: filepaths.1,
+                    subtitle: Self::get_subtitle(k, bibliography),
+                    notes: if note_files.is_some() {
+                        Self::get_notepath(k, &mut note_files, &ext)
+                    } else {
+                        None
+                    },
+                    symbols: [None, None, None],
+                    sanitized_bibi_data: None,
+                }
+                .gen_sanitized()
+            })
+            .collect()
     }
 
     // get list of citekeys from the given bibfile
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
index 823b91c..9ccf4c4 100644
--- a/src/bibiman/sanitize.rs
+++ b/src/bibiman/sanitize.rs
@@ -15,80 +15,23 @@
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 /////
 
-use fancy_regex::Regex;
-use unicodeit::replace as unicode_replace;
-
 use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData};
 
-/// Sanitizing process rules as regex cmds.
-///
-/// Only macros that are not already covered by unicodeit should be processed in this way.
-///
-// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})`
-// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
-//
-const SANITIZE_REGEX: &[(&str, &str)] = &[
-    (
-        r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
-        "\"${1}\"",
-    ),
-    (r"\\hyphen", "-"),
-];
-
-/// Function to build the sanitization regex vector:
-fn regex_vector() -> Vec<(Regex, &'static str)> {
-    let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
-    // build regex
-    for (search, replace) in SANITIZE_REGEX {
-        regex.push((Regex::new(search).unwrap(), replace));
-    }
-    regex
-}
-
-fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> {
-    let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
-
-    // process strings
-    let result_len = result.len();
-    for (re, replace) in regex {
-        for i in 0..result_len {
-            result[i] = re.replace_all(&result[i], *replace).to_string();
-        }
-    }
-    for i in 0..result_len {
-        result[i] = unicode_replace(&result[i]);
-    }
-
-    // return result
-    result
-}
+mod optimized_sanitize;
+use optimized_sanitize::optimized_sanitize;
 
 /// Helper macro to sanitize bibidata structs.
 /// Here lives the code that generates SanitizedBibiData
 /// structs from BibiData structs.
 macro_rules! optimized_sanitize_bibidata {
-    ($bibidata:expr, $regex:expr) => {
-        match &$bibidata.subtitle {
-            None => {
-                let sanitized_data =
-                    optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex);
-                SanitizedBibiData {
-                    title: sanitized_data[0].clone(),
-                    subtitle: None,
-                    abstract_text: sanitized_data[1].clone(),
-                }
-            }
-            Some(subtitle) => {
-                let sanitized_data = optimized_sanitize(
-                    vec![&$bibidata.title, subtitle, &$bibidata.abstract_text],
-                    &$regex,
-                );
-                SanitizedBibiData {
-                    title: sanitized_data[0].clone(),
-                    subtitle: Some(sanitized_data[1].clone()),
-                    abstract_text: sanitized_data[2].clone(),
-                }
-            }
+    ($bibidata:expr) => {
+        // Every text field is sanitized independently of the others.
+        SanitizedBibiData {
+            title: optimized_sanitize(&$bibidata.title),
+            // `subtitle` is optional: borrow it as `&str` and sanitize only if present.
+            subtitle: $bibidata.subtitle.as_deref().map(optimized_sanitize),
+            abstract_text: optimized_sanitize(&$bibidata.abstract_text),
+            // `$bibidata` is only borrowed above, so the caller keeps ownership.
         }
     };
 }
@@ -96,20 +39,5 @@ macro_rules! optimized_sanitize_bibidata {
 /// Sanitize one BibiData and return a SanitizedBibiData struct.
 /// This function does ignore any existing sanitization of the bibidata struct.
 pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
-    let regex = regex_vector();
-    optimized_sanitize_bibidata!(bibidata, regex)
-}
-
-/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one.
-pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> {
-    let regex: Vec<(Regex, &str)> = regex_vector();
-
-    let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len());
-    for entry in bibidata {
-        result.push(BibiData {
-            sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)),
-            ..entry
-        });
-    }
-    result
+    optimized_sanitize_bibidata!(bibidata)
 }
diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs
new file mode 100644
index 0000000..b3bf90d
--- /dev/null
+++ b/src/bibiman/sanitize/optimized_sanitize.rs
@@ -0,0 +1,86 @@
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025  lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use phf::phf_map;
+use std::collections::HashMap;
+
+use logos::Logos;
+
+static LOOKUP: phf::Map<&'static str, &'static str> = phf_map! {
+    " " => " ", // Text emitted in place of a LaTeX forced space (`\ `); looked up under key " ".
+};
+
+#[derive(Logos, Debug)]
+enum Token { // Lexer tokens that `optimized_sanitize` reacts to.
+    #[token("{")]
+    OpenCurlyBracket, // a literal `{`
+    #[token("}")]
+    ClosedCurlyBracket, // a literal `}`
+    #[regex(r"\\\w+")]
+    LaTeXMacro, // a backslash followed by one or more word characters, e.g. `\hyphen`
+    #[token(r"\ ")]
+    ForcedSpace, // a backslash followed by a single space
+}
+
+/// Sanitizes LaTeX markup in `input_text` with the `Token` lexer.
+///
+/// Unrecognised text is copied through unchanged and a forced space (`\ `) is
+/// replaced via `LOOKUP`. Work in progress: any `{`, `}` or `\macro` in the
+/// input still panics through `todo!()`.
+pub fn optimized_sanitize(input_text: &str) -> String {
+    let mut out: Vec<&str> = Vec::new();
+    let mut bracket_counter: u32 = 0;
+    let mut counter_actions: HashMap<u32, String> = HashMap::new();
+    let mut lex = Token::lexer(input_text);
+    // `while let` instead of `for`: `lex.slice()` must borrow the lexer in the body.
+    while let Some(sometoken) = lex.next() {
+        match sometoken {
+            Ok(Token::ForcedSpace) => out.push(
+                LOOKUP
+                    .get(" ")
+                    .expect("Something is wrong with the sanitization lookup table."),
+            ),
+            Ok(Token::OpenCurlyBracket) => {
+                // `saturating_add` returns the new value; it must be stored back.
+                bracket_counter = bracket_counter.saturating_add(1);
+                todo!();
+            }
+            Ok(Token::ClosedCurlyBracket) => {
+                bracket_counter = bracket_counter.saturating_sub(1);
+                todo!();
+            }
+            Ok(Token::LaTeXMacro) => todo!(),
+            // Input that no token rule matches is plain text: copy it verbatim.
+            Err(_) => out.push(lex.slice()),
+        }
+    }
+    out.into_iter().collect::<String>()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::optimized_sanitize;
+
+    // Only plain text and forced spaces are handled so far, so the input sticks
+    // to those. Add `\mkbibquote{...}` and `\hyphen` cases once the `todo!()`
+    // arms of `optimized_sanitize` are implemented.
+    #[test]
+    fn check_sanitization() {
+        let result = optimized_sanitize(r"Intention\ und Intentionen\ sind");
+        assert_eq!(result, "Intention und Intentionen sind");
+    }
+}