diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/bibiman/bibisetup.rs | 76 | ||||
| -rw-r--r-- | src/bibiman/sanitize.rs | 94 | ||||
| -rw-r--r-- | src/bibiman/sanitize/optimized_sanitize.rs | 86 |
3 files changed, 133 insertions, 123 deletions
diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs index 48046e9..37b0b01 100644 --- a/src/bibiman/bibisetup.rs +++ b/src/bibiman/bibisetup.rs @@ -26,7 +26,7 @@ use std::{fs, path::PathBuf}; use walkdir::WalkDir; use crate::app; -use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata}; +use crate::bibiman::sanitize::sanitize_one_bibidata; use crate::cliargs::{self}; use crate::config::BibiConfig; @@ -159,9 +159,6 @@ impl BibiData { /// Generates the SanitizedBibiData for the BibiData. /// /// Consumes self and returns a new BibiData struct. - /// - /// If multiple SanitizedBibiData are to be generated, - /// one should use the [`mass_sanitize`] function instead. pub fn gen_sanitized(mut self) -> Self { self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self)); self @@ -325,42 +322,41 @@ impl BibiSetup { // this may cause longer startup-load-times. // // - mass_sanitize( - citekeys - .iter() - .enumerate() - .map(|(i, k)| { - let filepaths: (Option<Vec<OsString>>, bool) = - { Self::get_filepath(k, bibliography, &mut pdf_files) }; - - BibiData { - id: i as u32, - authors: Self::get_authors(k, bibliography), - short_author: String::new(), - title: Self::get_title(k, bibliography), - year: Self::get_year(k, bibliography), - custom_field: ( - cfg.general.custom_column.clone(), - Self::get_custom_field(k, bibliography, &cfg.general.custom_column), - ), - keywords: Self::get_keywords(k, bibliography), - citekey: k.to_owned(), - abstract_text: Self::get_abstract(k, bibliography), - doi_url: Self::get_weblink(k, bibliography), - filepath: filepaths.0, - file_field: filepaths.1, - subtitle: Self::get_subtitle(k, bibliography), - notes: if note_files.is_some() { - Self::get_notepath(k, &mut note_files, &ext) - } else { - None - }, - symbols: [None, None, None], - sanitized_bibi_data: None, - } - }) - .collect(), - ) + citekeys + .iter() + .enumerate() + .map(|(i, k)| { + let filepaths: (Option<Vec<OsString>>, bool) = + { Self::get_filepath(k, bibliography, &mut pdf_files) }; + + BibiData { + id: i as u32, + authors: Self::get_authors(k, bibliography), + short_author: String::new(), + title: Self::get_title(k, bibliography), + year: Self::get_year(k, bibliography), + custom_field: ( + cfg.general.custom_column.clone(), + Self::get_custom_field(k, bibliography, &cfg.general.custom_column), + ), + keywords: Self::get_keywords(k, bibliography), + citekey: k.to_owned(), + abstract_text: Self::get_abstract(k, bibliography), + doi_url: Self::get_weblink(k, bibliography), + filepath: filepaths.0, + file_field: filepaths.1, + subtitle: Self::get_subtitle(k, bibliography), + notes: if note_files.is_some() { + Self::get_notepath(k, &mut note_files, &ext) + } else { + None + }, + symbols: [None, None, None], + sanitized_bibi_data: None, + } + .gen_sanitized() + }) + .collect() } // get list of citekeys from the given bibfile diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs index 823b91c..9ccf4c4 100644 --- a/src/bibiman/sanitize.rs +++ b/src/bibiman/sanitize.rs @@ -15,80 +15,23 @@ // along with this program. If not, see <https://www.gnu.org/licenses/>. ///// -use fancy_regex::Regex; -use unicodeit::replace as unicode_replace; - use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData}; -/// Sanitizing process rules as regex cmds. -/// -/// Only macros that are not already covered by unicodeit should be processed in this way. -/// -// Regex to capture content between brackets: `(\{(?:[^{}]++|(\1))*+\})` -// Alternative without capturing the outer brackets: `\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}` -// -const SANITIZE_REGEX: &[(&str, &str)] = &[ - ( - r"\\mkbibquote\{((?:(\{(?:[^}{]|(\1))*+\})|[^{])*?)\}", - "\"${1}\"", - ), - (r"\\hyphen", "-"), -]; - -/// Function to build the sanitization regex vector: -fn regex_vector() -> Vec<(Regex, &'static str)> { - let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len()); - // build regex - for (search, replace) in SANITIZE_REGEX { - regex.push((Regex::new(search).unwrap(), replace)); - } - regex -} - -fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> { - let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect(); - - // process strings - let result_len = result.len(); - for (re, replace) in regex { - for i in 0..result_len { - result[i] = re.replace_all(&result[i], *replace).to_string(); - } - } - for i in 0..result_len { - result[i] = unicode_replace(&result[i]); - } - - // return result - result -} +mod optimized_sanitize; +use optimized_sanitize::optimized_sanitize; /// Helper macro to sanitize bibidata structs. /// Here lives the code that generates SanitizedBibiData /// structs from BibiData structs. macro_rules! optimized_sanitize_bibidata { - ($bibidata:expr, $regex:expr) => { - match &$bibidata.subtitle { - None => { - let sanitized_data = - optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex); - SanitizedBibiData { - title: sanitized_data[0].clone(), - subtitle: None, - abstract_text: sanitized_data[1].clone(), - } - } - Some(subtitle) => { - let sanitized_data = optimized_sanitize( - vec![&$bibidata.title, subtitle, &$bibidata.abstract_text], - &$regex, - ); - SanitizedBibiData { - title: sanitized_data[0].clone(), - subtitle: Some(sanitized_data[1].clone()), - abstract_text: sanitized_data[2].clone(), - } - } + ($bibidata:expr) => { + SanitizedBibiData { + title: optimized_sanitize(&$bibidata.title), + subtitle: match &$bibidata.subtitle { + None => None, + Some(subtitle) => Some(optimized_sanitize(subtitle)), + }, + abstract_text: optimized_sanitize(&$bibidata.abstract_text), } }; } @@ -96,20 +39,5 @@ macro_rules! optimized_sanitize_bibidata { /// Sanitize one BibiData and return a SanitizedBibiData struct. /// This function does ignore any existing sanitization of the bibidata struct. pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData { - let regex = regex_vector(); - optimized_sanitize_bibidata!(bibidata, regex) -} - -/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one. -pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> { - let regex: Vec<(Regex, &str)> = regex_vector(); - - let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len()); - for entry in bibidata { - result.push(BibiData { - sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)), - ..entry - }); - } - result + optimized_sanitize_bibidata!(bibidata) } diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs new file mode 100644 index 0000000..b3bf90d --- /dev/null +++ b/src/bibiman/sanitize/optimized_sanitize.rs @@ -0,0 +1,86 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +///// + +use phf::phf_map; +use std::collections::HashMap; + +use logos::Logos; + +static LOOKUP: phf::Map<&'static str, &'static str> = phf_map! { + " " => " ", // str a forced space should substitute to. +}; + +#[derive(Logos, Debug)] +enum Token { + #[token("{")] + OpenCurlyBracket, + #[token("}")] + ClosedCurlyBracket, + #[regex(r"\\\w+")] + LaTeXMacro, + #[token(r"\ ")] + ForcedSpace, +} + +pub fn optimized_sanitize(input_text: &str) -> String { + let mut out: Vec<&str> = Vec::new(); + let mut bracket_counter: u32 = 0; + let mut counter_actions: HashMap<u32, String> = HashMap::new(); + let mut lex = Token::lexer(input_text); + while let Some(sometoken) = lex.next() { + match sometoken { + Ok(token) => match token { + Token::ForcedSpace => { + out.push( + LOOKUP + .get(" ") + .expect("Something is wrong with the sanitization lookup table."), + ); + } + Token::OpenCurlyBracket => { + bracket_counter.saturating_add(1); + todo!(); + } + Token::ClosedCurlyBracket => { + bracket_counter.saturating_sub(1); + todo!(); + } + Token::LaTeXMacro => { + todo!() + } + }, + Err(_) => { + out.push(lex.slice()); + } + } + } + out.into_iter().collect::<String>() +} + +#[cfg(test)] +mod tests { + use super::optimized_sanitize; + + #[test] + fn check_sanitization() { + let result = optimized_sanitize( + r"\mkbibquote{Intention} und \mkbibquote{Intentionen sind \hyphen\ bibquote.}", + ); + println!("{}", result); + panic!("Tatütata!"); + } +} |
