1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
// bibiman - a TUI for managing BibLaTeX databases
// Copyright (C) 2025 lukeflo
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
/////
use fancy_regex::Regex;
use unicodeit::replace as unicode_replace;
use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData};
/// Sanitization rules, written as `(regex pattern, replacement)` pairs.
///
/// Each pattern is compiled by `regex_vector` and applied with `replace_all`
/// in `optimized_sanitize`, in the order listed here.
///
/// Only macros that are not already covered by unicodeit should be processed in this way.
///
// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})`
// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
//
// NOTE(review): `(\1)` reads like an attempt to recurse into group 1, but `\1` is
// backreference syntax in most regex engines - confirm that nested braces are
// matched as intended.
const SANITIZE_REGEX: &[(&str, &str)] = &[
// `\mkbibquote{...}`: keep the argument (capture group 1) and wrap it in double quotes.
(
r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
"\"${1}\"",
),
// `\hyphen` is replaced by a plain `-`.
(r"\\hyphen", "-"),
];
/// Compile every entry of `SANITIZE_REGEX` into a `(Regex, replacement)` pair.
///
/// The returned vector keeps the order of `SANITIZE_REGEX`, which is the order
/// in which the rules are later applied.
///
/// # Panics
///
/// Panics if `Regex::new` rejects one of the patterns.
fn regex_vector() -> Vec<(Regex, &'static str)> {
    SANITIZE_REGEX
        .iter()
        .map(|&(pattern, replacement)| (Regex::new(pattern).unwrap(), replacement))
        .collect()
}
/// Sanitize a batch of strings with the given rules.
///
/// Every input string is first passed through each `(Regex, replacement)` rule
/// in `regex`, in list order, using `replace_all`; the outcome is then handed to
/// `unicode_replace`.
///
/// Returns one owned string per input string, in the same order as `input_text`.
fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> {
    input_text
        .into_iter()
        .map(|raw| {
            // Chain the regex rules: the output of one rule is the input of the next.
            let replaced = regex
                .iter()
                .fold(raw.to_string(), |text, (pattern, substitute)| {
                    pattern.replace_all(&text, *substitute).to_string()
                });
            // The unicodeit pass always runs last, after all regex rules.
            unicode_replace(&replaced)
        })
        .collect()
}
/// Helper macro to sanitize bibidata structs.
///
/// Here lives the code that generates `SanitizedBibiData` structs from
/// `BibiData` structs.
///
/// * `$bibidata` - an expression whose `title`, `subtitle` and `abstract_text`
///   fields are readable (an owned value or a reference both work).
/// * `$regex` - the compiled rules; passed to `optimized_sanitize` as `&$regex`.
///
/// The fields are sanitized in a single `optimized_sanitize` call. The subtitle
/// is only included in that call when it is `Some`, and stays `None` otherwise.
/// The sanitized strings are moved out of the result vector instead of being
/// cloned, since the vector is not needed afterwards.
macro_rules! optimized_sanitize_bibidata {
    ($bibidata:expr, $regex:expr) => {
        match &$bibidata.subtitle {
            None => {
                // Input order: title, abstract.
                let mut sanitized = optimized_sanitize(
                    vec![&$bibidata.title, &$bibidata.abstract_text],
                    &$regex,
                )
                .into_iter();
                // Struct fields are evaluated top to bottom, matching the input order.
                SanitizedBibiData {
                    title: sanitized
                        .next()
                        .expect("optimized_sanitize returns one string per input"),
                    subtitle: None,
                    abstract_text: sanitized
                        .next()
                        .expect("optimized_sanitize returns one string per input"),
                }
            }
            Some(subtitle) => {
                // Input order: title, subtitle, abstract.
                let mut sanitized = optimized_sanitize(
                    vec![&$bibidata.title, subtitle, &$bibidata.abstract_text],
                    &$regex,
                )
                .into_iter();
                // Struct fields are evaluated top to bottom, matching the input order.
                SanitizedBibiData {
                    title: sanitized
                        .next()
                        .expect("optimized_sanitize returns one string per input"),
                    subtitle: Some(
                        sanitized
                            .next()
                            .expect("optimized_sanitize returns one string per input"),
                    ),
                    abstract_text: sanitized
                        .next()
                        .expect("optimized_sanitize returns one string per input"),
                }
            }
        }
    };
}
/// Build a fresh `SanitizedBibiData` from a single `BibiData`.
///
/// The title, subtitle (if any) and abstract of `bibidata` are sanitized.
/// Any sanitized data already stored on `bibidata` is ignored, and the
/// sanitization rules are compiled anew on every call.
pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
    let rules = regex_vector();
    optimized_sanitize_bibidata!(bibidata, rules)
}
/// Sanitize every entry of a `Vec<BibiData>` and return the updated entries.
///
/// The rules are compiled once and shared by all entries. Each returned entry
/// has `sanitized_bibi_data` set to `Some(..)` with freshly sanitized data; all
/// other fields are moved over unchanged, and the input order is preserved.
pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> {
    let rules = regex_vector();
    bibidata
        .into_iter()
        .map(|entry| {
            // Sanitize while `entry` is only borrowed, then move its remaining fields.
            let sanitized = optimized_sanitize_bibidata!(entry, rules);
            BibiData {
                sanitized_bibi_data: Some(sanitized),
                ..entry
            }
        })
        .collect()
}
|