src/bibiman/sanitize.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

// bibiman - a TUI for managing BibLaTeX databases
// Copyright (C) 2025  lukeflo
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.
/////

use fancy_regex::Regex;
use unicodeit::replace as unicode_replace;

use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData};

/// Sanitizing rules, given as `(search pattern, replacement)` pairs.
///
/// Every search pattern is compiled into a `fancy_regex::Regex` by `regex_vector`,
/// and the rules are applied to each text in the order in which they are listed here.
///
/// Only macros that are not already covered by unicodeit should be processed in this way.
///
// Regex to capture content between braces: `(\{(?:[^{}]++|(\1))*+\})`
// Alternative without capturing the outer braces: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
//
const SANITIZE_REGEX: &[(&str, &str)] = &[
    // `\mkbibquote{...}`: keep the macro argument (capture group 1) and wrap it
    // in plain double quotes.
    (
        r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
        "\"${1}\"",
    ),
    // `\hyphen` is replaced by a literal `-`.
    (r"\\hyphen", "-"),
];

/// Compile every rule of `SANITIZE_REGEX` into a ready-to-use
/// `(Regex, replacement)` pair, keeping the order of the table.
///
/// # Panics
///
/// Panics if one of the patterns in `SANITIZE_REGEX` is not a valid regex.
fn regex_vector() -> Vec<(Regex, &'static str)> {
    SANITIZE_REGEX
        .iter()
        .map(|&(pattern, replacement)| (Regex::new(pattern).unwrap(), replacement))
        .collect()
}

/// Sanitize a batch of strings.
///
/// Each input string is first run through every `(regex, replacement)` rule in
/// `regex`, in the order given, and afterwards through unicodeit's `replace`.
/// The returned vector has exactly one entry per input string, in input order.
///
/// `regex` is taken as a slice; callers holding a `Vec` can keep passing `&vec`.
fn optimized_sanitize(input_text: Vec<&str>, regex: &[(Regex, &str)]) -> Vec<String> {
    input_text
        .into_iter()
        .map(|raw| {
            // Apply the regex rules one after another. `into_owned` reuses the
            // string produced by a replacement instead of copying it again.
            let substituted = regex
                .iter()
                .fold(raw.to_string(), |text, (re, replacement)| {
                    re.replace_all(&text, *replacement).into_owned()
                });
            // unicodeit runs last, on the already regex-sanitized text.
            unicode_replace(&substituted)
        })
        .collect()
}

/// Helper macro to sanitize bibidata structs.
/// Here lives the code that generates SanitizedBibiData
/// structs from BibiData structs.
///
/// `$bibidata` is an expression whose `title`, `subtitle` and `abstract_text`
/// fields are read by reference; `$regex` is the rule list that is passed on
/// (by reference) to `optimized_sanitize`. The macro evaluates to a
/// `SanitizedBibiData` whose `subtitle` is `Some` exactly when the input has one.
macro_rules! optimized_sanitize_bibidata {
    ($bibidata:expr, $regex:expr) => {
        match &$bibidata.subtitle {
            None => {
                // Outputs come back in input order: title, abstract.
                let mut sanitized =
                    optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex)
                        .into_iter();
                // Move the strings out of the result instead of cloning them.
                let sanitized_title = sanitized
                    .next()
                    .expect("optimized_sanitize returns one string per input");
                let sanitized_abstract = sanitized
                    .next()
                    .expect("optimized_sanitize returns one string per input");
                SanitizedBibiData {
                    title: sanitized_title,
                    subtitle: None,
                    abstract_text: sanitized_abstract,
                }
            }
            Some(subtitle) => {
                // Outputs come back in input order: title, subtitle, abstract.
                let mut sanitized = optimized_sanitize(
                    vec![&$bibidata.title, subtitle, &$bibidata.abstract_text],
                    &$regex,
                )
                .into_iter();
                let sanitized_title = sanitized
                    .next()
                    .expect("optimized_sanitize returns one string per input");
                let sanitized_subtitle = sanitized
                    .next()
                    .expect("optimized_sanitize returns one string per input");
                let sanitized_abstract = sanitized
                    .next()
                    .expect("optimized_sanitize returns one string per input");
                SanitizedBibiData {
                    title: sanitized_title,
                    subtitle: Some(sanitized_subtitle),
                    abstract_text: sanitized_abstract,
                }
            }
        }
    };
}

/// Build the sanitized counterpart of a single `BibiData`.
///
/// The result is always computed afresh from `title`, `subtitle` and
/// `abstract_text`; a sanitization already stored on `bibidata` is not looked at.
/// The rule list is compiled through `regex_vector` on every call.
pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
    let rules = regex_vector();
    optimized_sanitize_bibidata!(bibidata, rules)
}

/// Sanitize every entry of a `Vec<BibiData>`.
///
/// Consumes the input and returns the same entries, in the same order, with
/// `sanitized_bibi_data` set to a freshly computed `Some(..)`; all other fields
/// are carried over unchanged. The rule list is compiled once for the whole batch.
pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> {
    let rules: Vec<(Regex, &str)> = regex_vector();

    bibidata
        .into_iter()
        .map(|entry| {
            // Compute the sanitized view while `entry` is still intact, then
            // move the remaining fields into the new struct.
            let sanitized = optimized_sanitize_bibidata!(entry, rules);
            BibiData {
                sanitized_bibi_data: Some(sanitized),
                ..entry
            }
        })
        .collect()
}