aboutsummaryrefslogtreecommitdiff
path: root/src/bibiman/sanitize.rs
blob: aaf81adac5d2926e3db6254d1682da99fe95bca9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// bibiman - a TUI for managing BibLaTeX databases
// Copyright (C) 2025  lukeflo
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.
/////

use fancy_regex::Regex;
use unicodeit::replace as unicode_replace;

/// Regex-based sanitizing rules, given as `(pattern, replacement)` pairs.
///
/// Each pattern is compiled as a `fancy_regex` expression and every match is
/// substituted with the paired replacement string. `${1}` in a replacement
/// refers to the first capture group of the pattern.
///
/// Only macros that are not already covered by unicodeit should be processed in this way.
///
// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})`
// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
//
const SANITIZE_REGEX: &[(&str, &str)] = &[
    // `\mkbibquote{...}` -> the braced content wrapped in double quotes.
    (
        r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
        "\"${1}\"",
    ),
    // `\hyphen` -> a plain ASCII hyphen-minus.
    (r"\\hyphen", "-"),
];

/// Sanitize a single string containing LaTeX macros into a more readable one.
///
/// This is a convenience shortcut that wraps `input_text` in a one-element
/// list, passes it to [`sanitize`] and returns the single resulting string.
/// When many strings have to be processed, prefer calling [`sanitize`] once
/// with all of them.
///
/// # Panics
///
/// Panics only if [`sanitize`] returns an empty list for a one-element
/// input, which would violate its documented contract.
pub fn sanitize_one(input_text: &str) -> String {
    // Move the only element out of the returned vector instead of cloning it:
    // `sanitize` always returns as many elements as it gets in its input.
    sanitize(vec![input_text])
        .into_iter()
        .next()
        .expect("sanitize returns exactly one output string per input string")
}

/// Sanitize multiple strings with LaTeX macros into more readable unicode equivalents.
///
/// Every input string is first run through all rules of `SANITIZE_REGEX`, in
/// the order they are listed there, and the result is then passed to
/// unicodeit's `replace` function.
///
/// The returned list always has the same length and order as `input_text`;
/// element `i` of the output is the sanitized form of element `i` of the input.
///
/// # Panics
///
/// Panics if a pattern in `SANITIZE_REGEX` is not a valid regular expression.
pub fn sanitize(input_text: Vec<&str>) -> Vec<String> {
    // The rule table is constant, so compile the patterns only once and reuse
    // them across calls instead of rebuilding every regex on each invocation.
    static RULES: std::sync::OnceLock<Vec<(Regex, &'static str)>> = std::sync::OnceLock::new();
    let rules = RULES.get_or_init(|| {
        SANITIZE_REGEX
            .iter()
            .map(|&(pattern, replacement)| {
                let re = Regex::new(pattern)
                    .expect("every pattern in SANITIZE_REGEX must be a valid regex");
                (re, replacement)
            })
            .collect()
    });

    input_text
        .into_iter()
        .map(|text| {
            let mut current = text.to_string();
            // Apply the regex rules in table order; each rule sees the output
            // of the previous one.
            for (re, replacement) in rules {
                current = re.replace_all(&current, *replacement).into_owned();
            }
            // Let unicodeit translate the remaining macros it knows about.
            unicode_replace(&current)
        })
        .collect()
}