diff options
| author | Klimperfix | 2025-09-30 21:49:31 +0200 |
|---|---|---|
| committer | Klimperfix | 2025-10-03 12:20:14 +0200 |
| commit | 7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d (patch) | |
| tree | 19128cc2ced032d9988adfa6d3a658f803b5c930 /src | |
| parent | db0deb9b6ee59c07ab2f04972184b154925034bd (diff) | |
| download | bibiman-7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d.tar.gz bibiman-7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d.zip | |
Implemented basic sanitizing.
The default Regex crate does not support the required regex features, so
I'm using the fancy_regex crate that does.
Diffstat (limited to 'src')
| -rw-r--r-- | src/bibiman.rs | 3 | ||||
| -rw-r--r-- | src/bibiman/sanitize.rs | 73 |
2 files changed, 76 insertions, 0 deletions
diff --git a/src/bibiman.rs b/src/bibiman.rs index 6d21f8c..c423ce1 100644 --- a/src/bibiman.rs +++ b/src/bibiman.rs @@ -44,6 +44,9 @@ pub mod entries; pub mod keywords; pub mod search; +/// Module with function to sanitize text with LaTeX Macros into readable unicode text. +pub mod sanitize; + // Areas in which actions are possible #[derive(Debug)] pub enum CurrentArea { diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs new file mode 100644 index 0000000..aaf81ad --- /dev/null +++ b/src/bibiman/sanitize.rs @@ -0,0 +1,73 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +///// + +use fancy_regex::Regex; +use unicodeit::replace as unicode_replace; + +/// Sanitizing process rules as regex cmds. +/// +/// Only macros that are not already covered by unicodeit should be processed in this way. +/// +// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})` +// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}` +// +const SANITIZE_REGEX: &[(&str, &str)] = &[ + ( + r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}", + "\"${1}\"", + ), + (r"\\hyphen", "-"), +]; + +/// Sanitize one String with LaTeX Macros into a more readable one without. +/// +/// If one is going to mass-sanitize strings, one should use the [`sanitize`] +/// function for performance reasons instead, to process multiple strings at once. +/// +/// This is just a shortcut for the sanitize function. +pub fn sanitize_one(input_text: &str) -> String { + // This does not panic, the sanitize function always returns + // as many elements in the returned list as it get's elements + // in the input vector. + sanitize(vec![input_text]).get(0).unwrap().to_string() +} + +/// Sanitize multiple Strings with LaTeX Macros into more readable unicode equivalents. +/// +/// This function does always return the same amount of Strings as it gets in the input list. +pub fn sanitize(input_text: Vec<&str>) -> Vec<String> { + let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect(); + let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); + // build regex + for (search, replace) in SANITIZE_REGEX { + regex.push((Regex::new(search).unwrap(), replace)); + } + + // process strings + let result_len = result.len(); + for (re, replace) in regex { + for i in 0..result_len { + result[i] = re.replace_all(&result[i], replace).to_string(); + } + } + for i in 0..result_len { + result[i] = unicode_replace(&result[i]); + } + + // return result + result +} |
