aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKlimperfix2025-09-30 21:49:31 +0200
committerKlimperfix2025-10-03 12:20:14 +0200
commit7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d (patch)
tree19128cc2ced032d9988adfa6d3a658f803b5c930
parentdb0deb9b6ee59c07ab2f04972184b154925034bd (diff)
downloadbibiman-7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d.tar.gz
bibiman-7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d.zip
Implemented basic sanitizing.
The default Regex crate does not support the required regex features, so I'm using the fancy_regex crate that does.
-rw-r--r--Cargo.lock40
-rw-r--r--Cargo.toml2
-rw-r--r--src/bibiman.rs3
-rw-r--r--src/bibiman/sanitize.rs73
-rw-r--r--tests/biblatex-test.bib13
5 files changed, 127 insertions, 4 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 3661e99..535b929 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -99,6 +99,7 @@ dependencies = [
"crossterm",
"dirs",
"editor-command",
+ "fancy-regex",
"figment",
"futures",
"itertools",
@@ -113,6 +114,7 @@ dependencies = [
"tokio",
"tokio-util",
"tui-input",
+ "unicodeit",
"ureq",
"walkdir",
]
@@ -131,6 +133,21 @@ dependencies = [
]
[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
+[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -410,6 +427,17 @@ dependencies = [
]
[[package]]
+name = "fancy-regex"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
+dependencies = [
+ "bit-set",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1864,6 +1892,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]]
+name = "unicodeit"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1069c222ea63347e2e59763aa12d32c9c6a4e595931c7724a769f6a75bfbc553"
+dependencies = [
+ "aho-corasick",
+ "cfg-if",
+ "memchr",
+ "regex",
+]
+
+[[package]]
name = "unscanny"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index b3fc774..2d596de 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -39,3 +39,5 @@ ureq = "2.12.1"
serde = { version = "1.0.217", features = ["serde_derive"] }
figment = { version = "0.10.19", features = [ "toml", "test" ]}
owo-colors = "4.2.2"
+unicodeit = { version = "0.2.0", features = ["naive-impl"] }
+fancy-regex = "0.16.2"
diff --git a/src/bibiman.rs b/src/bibiman.rs
index 6d21f8c..c423ce1 100644
--- a/src/bibiman.rs
+++ b/src/bibiman.rs
@@ -44,6 +44,9 @@ pub mod entries;
pub mod keywords;
pub mod search;
+/// Module with functions to sanitize text containing LaTeX macros into readable unicode text.
+pub mod sanitize;
+
// Areas in which actions are possible
#[derive(Debug)]
pub enum CurrentArea {
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
new file mode 100644
index 0000000..aaf81ad
--- /dev/null
+++ b/src/bibiman/sanitize.rs
@@ -0,0 +1,73 @@
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025 lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use fancy_regex::Regex;
+use unicodeit::replace as unicode_replace;
+
+/// Sanitizing rules as `(pattern, replacement)` pairs: regex source and replacement text.
+///
+/// Only macros that are not already covered by unicodeit should be processed in this way.
+/// In a replacement string, `${1}` refers to the first capture group of the pattern.
+// Regex to capture content between braces: `(\{(?:[^{}]++|(\1))*+\})`
+// Alternative without capturing the outer braces: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}`
+// (the alternative form is the one used by the `\mkbibquote` rule below).
+const SANITIZE_REGEX: &[(&str, &str)] = &[
+ (
+ r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}",
+ "\"${1}\"",
+ ),
+ (r"\\hyphen", "-"),
+];
+
+/// Sanitize a single string, replacing LaTeX macros with more readable text.
+///
+/// To sanitize many strings, prefer [`sanitize`]: it compiles the regex rules once
+/// per call and applies them to every input, whereas this recompiles them each time.
+///
+/// This is just a shortcut that calls [`sanitize`] with a one-element vector.
+pub fn sanitize_one(input_text: &str) -> String {
+ // The `unwrap` does not panic: `sanitize` always returns exactly
+ // as many elements as it gets in the input vector, so a
+ // one-element input yields a one-element output.
+ sanitize(vec![input_text]).get(0).unwrap().to_string()
+}
+
+/// Sanitize multiple strings, replacing LaTeX macros with more readable unicode equivalents.
+///
+/// Always returns exactly one output string per input string, in the same order as the input.
+pub fn sanitize(input_text: Vec<&str>) -> Vec<String> {
+ let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
+ let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
+ // Compile every SANITIZE_REGEX pattern on each call; `unwrap` panics on an invalid pattern.
+ for (search, replace) in SANITIZE_REGEX {
+ regex.push((Regex::new(search).unwrap(), replace));
+ }
+
+ // Apply each rule to every string, in rule order, then run unicodeit over the results.
+ let result_len = result.len();
+ for (re, replace) in regex {
+ for i in 0..result_len {
+ result[i] = re.replace_all(&result[i], replace).to_string();
+ }
+ }
+ for i in 0..result_len {
+ result[i] = unicode_replace(&result[i]);
+ }
+
+ // One sanitized string per input, in input order.
+ result
+}
diff --git a/tests/biblatex-test.bib b/tests/biblatex-test.bib
index cfcdc80..fcc5085 100644
--- a/tests/biblatex-test.bib
+++ b/tests/biblatex-test.bib
@@ -28,9 +28,9 @@
@article{aksin,
title = {Effect of immobilization on catalytic characteristics of saturated
{Pd-N}-heterocyclic carbenes in {Mizoroki-Heck} reactions},
- author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok, Levent and {
- \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\"u}y{\"u}kg{\"u}ng{\"
- o}r, Orhan and {\"O}zkal, Erhan},
+ author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok , Levent and
+ { \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\" u}y{ \"u}kg{\"u}
+ ng{ \" o}r, Orhan and {\"O}zkal, Erhan},
volume = {691},
number = {13},
pages = {3027--3036},
@@ -82,7 +82,12 @@
langidopts = {variant=american},
file = {~/Documents/coding/projects/bibiman/tests/aristotle_physics.pdf},
annotation = {A \texttt{book} entry with a \texttt{translator} field},
- abstract = {The Physics is a work by Aristotle dedicated to the study of nature. Regarded by Heidegger as "the fundamental work of Western philosophy", it presents the renowned distinction between the four types of cause, as well as reflections on chance, motion, infinity, and other fundamental concepts. It is here that Aristotle sets out his celebrated paradox of time.},
+ abstract = {The Physics is a work by Aristotle dedicated to the study of
+ nature. Regarded by Heidegger as "the fundamental work of Western
+ philosophy", it presents the renowned distinction between the
+ four types of cause, as well as reflections on chance, motion,
+ infinity, and other fundamental concepts. It is here that
+ Aristotle sets out his celebrated paradox of time.},
}
@book{aristotle_poetics,