diff options
| author | Klimperfix | 2025-09-30 21:49:31 +0200 |
|---|---|---|
| committer | Klimperfix | 2025-10-03 12:20:14 +0200 |
| commit | 7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d (patch) | |
| tree | 19128cc2ced032d9988adfa6d3a658f803b5c930 | |
| parent | db0deb9b6ee59c07ab2f04972184b154925034bd (diff) | |
| download | bibiman-7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d.tar.gz bibiman-7350c7382bda85618c3dae1d74cc8cbe7ddd4b9d.zip | |
Implemented basic sanitizing.
The default `regex` crate does not support the regex features the sanitizing
patterns need (backreferences and possessive quantifiers), so I'm using the
`fancy-regex` crate, which does.
| -rw-r--r-- | Cargo.lock | 40 | ||||
| -rw-r--r-- | Cargo.toml | 2 | ||||
| -rw-r--r-- | src/bibiman.rs | 3 | ||||
| -rw-r--r-- | src/bibiman/sanitize.rs | 73 | ||||
| -rw-r--r-- | tests/biblatex-test.bib | 13 |
5 files changed, 127 insertions, 4 deletions
@@ -99,6 +99,7 @@ dependencies = [ "crossterm", "dirs", "editor-command", + "fancy-regex", "figment", "futures", "itertools", @@ -113,6 +114,7 @@ dependencies = [ "tokio", "tokio-util", "tui-input", + "unicodeit", "ureq", "walkdir", ] @@ -131,6 +133,21 @@ dependencies = [ ] [[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -410,6 +427,17 @@ dependencies = [ ] [[package]] +name = "fancy-regex" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1864,6 +1892,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] +name = "unicodeit" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1069c222ea63347e2e59763aa12d32c9c6a4e595931c7724a769f6a75bfbc553" +dependencies = [ + "aho-corasick", + "cfg-if", + "memchr", + "regex", +] + +[[package]] name = "unscanny" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -39,3 +39,5 @@ ureq = "2.12.1" serde = { version = "1.0.217", features = ["serde_derive"] } figment = { version = "0.10.19", features = [ "toml", "test" ]} owo-colors = "4.2.2" +unicodeit = { version = 
"0.2.0", features = ["naive-impl"] } +fancy-regex = "0.16.2" diff --git a/src/bibiman.rs b/src/bibiman.rs index 6d21f8c..c423ce1 100644 --- a/src/bibiman.rs +++ b/src/bibiman.rs @@ -44,6 +44,9 @@ pub mod entries; pub mod keywords; pub mod search; +/// Module with function to sanitize text with LaTeX Macros into readable unicode text. +pub mod sanitize; + // Areas in which actions are possible #[derive(Debug)] pub enum CurrentArea { diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs new file mode 100644 index 0000000..aaf81ad --- /dev/null +++ b/src/bibiman/sanitize.rs @@ -0,0 +1,73 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +///// + +use fancy_regex::Regex; +use unicodeit::replace as unicode_replace; + +/// Sanitizing process rules as regex cmds. +/// +/// Only macros that are not already covered by unicodeit should be processed in this way. +/// +// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})` +// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}` +// +const SANITIZE_REGEX: &[(&str, &str)] = &[ + ( + r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}", + "\"${1}\"", + ), + (r"\\hyphen", "-"), +]; + +/// Sanitize one String with LaTeX Macros into a more readable one without. 
+/// +/// If one is going to mass-sanitize strings, one should use the [`sanitize`] +/// function for performance reasons instead, to process multiple strings at once. +/// +/// This is just a shortcut for the sanitize function. +pub fn sanitize_one(input_text: &str) -> String { + // This does not panic, the sanitize function always returns + // as many elements in the returned list as it get's elements + // in the input vector. + sanitize(vec![input_text]).get(0).unwrap().to_string() +} + +/// Sanitize multiple Strings with LaTeX Macros into more readable unicode equivalents. +/// +/// This function does always return the same amount of Strings as it gets in the input list. +pub fn sanitize(input_text: Vec<&str>) -> Vec<String> { + let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect(); + let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); + // build regex + for (search, replace) in SANITIZE_REGEX { + regex.push((Regex::new(search).unwrap(), replace)); + } + + // process strings + let result_len = result.len(); + for (re, replace) in regex { + for i in 0..result_len { + result[i] = re.replace_all(&result[i], replace).to_string(); + } + } + for i in 0..result_len { + result[i] = unicode_replace(&result[i]); + } + + // return result + result +} diff --git a/tests/biblatex-test.bib b/tests/biblatex-test.bib index cfcdc80..fcc5085 100644 --- a/tests/biblatex-test.bib +++ b/tests/biblatex-test.bib @@ -28,9 +28,9 @@ @article{aksin, title = {Effect of immobilization on catalytic characteristics of saturated {Pd-N}-heterocyclic carbenes in {Mizoroki-Heck} reactions}, - author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok, Levent and { - \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\"u}y{\"u}kg{\"u}ng{\" - o}r, Orhan and {\"O}zkal, Erhan}, + author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok , Levent and + { \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\" u}y{ \"u}kg{\"u} + ng{ \" o}r, Orhan and 
{\"O}zkal, Erhan}, volume = {691}, number = {13}, pages = {3027--3036}, @@ -82,7 +82,12 @@ langidopts = {variant=american}, file = {~/Documents/coding/projects/bibiman/tests/aristotle_physics.pdf}, annotation = {A \texttt{book} entry with a \texttt{translator} field}, - abstract = {The Physics is a work by Aristotle dedicated to the study of nature. Regarded by Heidegger as "the fundamental work of Western philosophy", it presents the renowned distinction between the four types of cause, as well as reflections on chance, motion, infinity, and other fundamental concepts. It is here that Aristotle sets out his celebrated paradox of time.}, + abstract = {The Physics is a work by Aristotle dedicated to the study of + nature. Regarded by Heidegger as "the fundamental work of Western + philosophy", it presents the renowned distinction between the + four types of cause, as well as reflections on chance, motion, + infinity, and other fundamental concepts. It is here that + Aristotle sets out his celebrated paradox of time.}, } @book{aristotle_poetics, |
