From 7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d Mon Sep 17 00:00:00 2001
From: Klimperfix
Date: Tue, 30 Sep 2025 21:49:31 +0200
Subject: Implemented basic sanitizing.

The default Regex crate does not support the required regex features, so
I'm using the fancy_regex crate that does.
---
 src/bibiman.rs          |  3 ++
 src/bibiman/sanitize.rs | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 src/bibiman/sanitize.rs

(limited to 'src')
diff --git a/src/bibiman.rs b/src/bibiman.rs
index 6d21f8c..c423ce1 100644
--- a/src/bibiman.rs
+++ b/src/bibiman.rs
@@ -44,6 +44,9 @@ pub mod entries;
 pub mod keywords;
 pub mod search;
 
+/// Module with functions to sanitize text containing LaTeX macros into readable Unicode text.
+pub mod sanitize;
+
 // Areas in which actions are possible
 #[derive(Debug)]
 pub enum CurrentArea {
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
new file mode 100644
index 0000000..aaf81ad
--- /dev/null
+++ b/src/bibiman/sanitize.rs
@@ -0,0 +1,73 @@
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025  lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use fancy_regex::Regex;
+use unicodeit::replace as unicode_replace;
+
+/// Sanitizing rules as `(regex pattern, replacement)` pairs.
+///
+/// Only macros that are not already covered by unicodeit should be processed in this way.
+///
+// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})`
+// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
+// In a replacement, `${1}` stands for the text matched by capture group 1.
+const SANITIZE_REGEX: &[(&str, &str)] = &[
+    (
+        r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
+        "\"${1}\"",
+    ),
+    (r"\\hyphen", "-"),
+];
+
+/// Sanitize a single string with LaTeX macros into a more readable one without them.
+///
+/// If one is going to mass-sanitize strings, one should use the [`sanitize`]
+/// function instead for performance reasons, to process multiple strings at once.
+///
+/// This is just a shortcut for calling [`sanitize`] with a one-element list.
+pub fn sanitize_one(input_text: &str) -> String {
+    // `sanitize` returns exactly one output per input, so the iterator is
+    // never empty here; moving the element out avoids cloning it.
+    let mut sanitized = sanitize(vec![input_text]).into_iter();
+    sanitized.next().expect("sanitize returns one output per input")
+}
+
+/// Sanitize multiple strings with LaTeX macros into more readable Unicode equivalents.
+///
+/// Returns exactly one output per input, in input order; panics if a rule regex is invalid.
+pub fn sanitize(input_text: Vec<&str>) -> Vec<String> {
+    // Compile every rule once per call so that all inputs share the compiled regexes.
+    let rules: Vec<(Regex, &str)> = SANITIZE_REGEX
+        .iter()
+        .map(|&(pattern, replacement)| {
+            let re = Regex::new(pattern).expect("SANITIZE_REGEX patterns must be valid");
+            (re, replacement)
+        })
+        .collect();
+
+    // Apply the regex rules in table order first, then let unicodeit handle the rest.
+    input_text
+        .into_iter()
+        .map(|text| {
+            let mut sanitized = text.to_string();
+            for (re, replacement) in &rules {
+                sanitized = re.replace_all(&sanitized, *replacement).into_owned();
+            }
+            unicode_replace(&sanitized)
+        })
+        .collect()
+}
-- 
cgit v1.2.3