From 7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d Mon Sep 17 00:00:00 2001
From: Klimperfix
Date: Tue, 30 Sep 2025 21:49:31 +0200
Subject: Implemented basic sanitizing.
The default Regex crate does not support the required regex features, so
I'm using the fancy_regex crate that does.
---
src/bibiman.rs | 3 ++
src/bibiman/sanitize.rs | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 76 insertions(+)
create mode 100644 src/bibiman/sanitize.rs
(limited to 'src')
diff --git a/src/bibiman.rs b/src/bibiman.rs
index 6d21f8c..c423ce1 100644
--- a/src/bibiman.rs
+++ b/src/bibiman.rs
@@ -44,6 +44,9 @@ pub mod entries;
pub mod keywords;
pub mod search;
+/// Module with functions to sanitize text containing LaTeX macros into readable Unicode text.
+pub mod sanitize;
+
// Areas in which actions are possible
#[derive(Debug)]
pub enum CurrentArea {
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
new file mode 100644
index 0000000..aaf81ad
--- /dev/null
+++ b/src/bibiman/sanitize.rs
@@ -0,0 +1,73 @@
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025 lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use fancy_regex::Regex;
+use unicodeit::replace as unicode_replace;
+
+/// Sanitizing rules as `(regex pattern, replacement)` pairs, listed in the order they are applied.
+///
+/// Only macros that are not already covered by unicodeit should be processed in this way.
+/// The first rule wraps the argument of `\mkbibquote{...}` in double quotes; the second turns `\hyphen` into `-`.
+// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})`
+// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
+// NOTE(review): `${1}` in a replacement presumably expands to capture group 1 — confirm against fancy_regex docs.
+const SANITIZE_REGEX: &[(&str, &str)] = &[
+ (
+ r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
+ "\"${1}\"",
+ ),
+ (r"\\hyphen", "-"),
+];
+
+/// Sanitize a single string with LaTeX macros into a more readable one without.
+///
+/// This is a shortcut that wraps the input in a one-element list and calls
+/// [`sanitize`]. When sanitizing many strings, call [`sanitize`] directly with
+/// all of them at once for performance reasons.
+///
+/// # Panics
+///
+/// Panics only if [`sanitize`] panics or breaks its contract of returning one
+/// output string per input string.
+pub fn sanitize_one(input_text: &str) -> String {
+    // Take the single result by value instead of cloning it out of the vector.
+    // `sanitize` returns as many elements as it gets in the input vector, so
+    // the iterator always yields exactly one item here.
+    sanitize(vec![input_text])
+        .into_iter()
+        .next()
+        .expect("sanitize returns exactly one output per input")
+}
+
+/// Sanitize multiple strings with LaTeX macros into more readable unicode equivalents.
+///
+/// Each input is first rewritten by every rule in `SANITIZE_REGEX`, applied in
+/// table order, and the result is then passed through `unicodeit::replace`.
+/// The rule regexes are compiled once per call, so passing many strings in one
+/// call is cheaper than calling this function once per string.
+///
+/// This function always returns exactly as many strings as it receives, in the
+/// same order as the input.
+///
+/// # Panics
+///
+/// Panics if a pattern in `SANITIZE_REGEX` is not a valid regex.
+pub fn sanitize(input_text: Vec<&str>) -> Vec<String> {
+    // Build the regexes up front so they are shared by all input strings.
+    let rules: Vec<(Regex, &str)> = SANITIZE_REGEX
+        .iter()
+        .map(|&(pattern, replacement)| {
+            let re = Regex::new(pattern).expect("SANITIZE_REGEX patterns must be valid regexes");
+            (re, replacement)
+        })
+        .collect();
+
+    // Process every string: regex rules first, unicodeit last.
+    input_text
+        .into_iter()
+        .map(|text| {
+            let mut current = text.to_string();
+            for (re, replacement) in &rules {
+                // `into_owned` reuses the buffer when a replacement happened
+                // and only copies when the text was left untouched.
+                current = re.replace_all(&current, *replacement).into_owned();
+            }
+            unicode_replace(&current)
+        })
+        .collect()
+}
--
cgit v1.2.3