aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKlimperfix2025-10-03 16:56:30 +0200
committerKlimperfix2025-10-03 17:58:53 +0200
commit26befd38aedbfdd278c3096644baf69e4a1fb051 (patch)
treefefa546a14b2bffead3a0b866b6826f65a4ef175
parentdfb7edde13ca39af3e23b80e40272e02aa093919 (diff)
downloadbibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.tar.gz
bibiman-26befd38aedbfdd278c3096644baf69e4a1fb051.zip
Now storing the sanitized data separately, keeping the original.
-rw-r--r--src/bibiman/bibisetup.rs154
-rw-r--r--src/bibiman/entries.rs8
-rw-r--r--src/bibiman/sanitize.rs96
-rw-r--r--src/bibiman/search.rs4
4 files changed, 175 insertions, 87 deletions
diff --git a/src/bibiman/bibisetup.rs b/src/bibiman/bibisetup.rs
index 8466169..48046e9 100644
--- a/src/bibiman/bibisetup.rs
+++ b/src/bibiman/bibisetup.rs
@@ -26,7 +26,7 @@ use std::{fs, path::PathBuf};
use walkdir::WalkDir;
use crate::app;
-use crate::bibiman::sanitize::sanitize_one;
+use crate::bibiman::sanitize::{mass_sanitize, sanitize_one_bibidata};
use crate::cliargs::{self};
use crate::config::BibiConfig;
@@ -77,6 +77,18 @@ pub struct BibiData {
pub subtitle: Option<String>,
pub notes: Option<Vec<OsString>>,
pub symbols: [Option<String>; 3],
+ /// This field should be set to None when initially creating a BibiData instance.
+ /// It then can be generated from the constructed BibiData Object using
+ /// `BibiData::gen_sanitized()`
+ pub sanitized_bibi_data: Option<SanitizedBibiData>,
+}
+
+/// Struct that holds sanitized bibidata data.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SanitizedBibiData {
+ pub title: String,
+ pub subtitle: Option<String>,
+ pub abstract_text: String,
}
#[derive(Debug, Clone, PartialEq)]
@@ -120,22 +132,41 @@ impl BibiData {
// self.pubtype(),
// &self.symbols,
// ]
-
- BibiRow {
- authors: {
- if self.short_author.is_empty() {
- self.authors()
- } else {
- &self.short_author
- }
- },
- title: self.title(),
- year: self.year(),
- custom_field_value: self.custom_field_value(),
- symbols: &self.symbols,
+ let author_ref = if self.short_author.is_empty() {
+ self.authors()
+ } else {
+ &self.short_author
+ };
+ if let Some(sanidata) = &self.sanitized_bibi_data {
+ BibiRow {
+ authors: author_ref,
+ title: &sanidata.title,
+ year: self.year(),
+ custom_field_value: self.custom_field_value(),
+ symbols: &self.symbols,
+ }
+ } else {
+ BibiRow {
+ authors: author_ref,
+ title: self.title(),
+ year: self.year(),
+ custom_field_value: self.custom_field_value(),
+ symbols: &self.symbols,
+ }
}
}
+ /// Generates the SanitizedBibiData for the BibiData.
+ ///
+ /// Consumes self and returns a new BibiData struct.
+ ///
+ /// If multiple SanitizedBibiData are to be generated,
+ /// one should use the [`mass_sanitize`] function instead.
+ pub fn gen_sanitized(mut self) -> Self {
+ self.sanitized_bibi_data = Some(sanitize_one_bibidata(&self));
+ self
+ }
+
pub fn entry_id(&self) -> &u32 {
&self.id
}
@@ -288,59 +319,48 @@ impl BibiSetup {
} else {
None
};
- citekeys
- .iter()
- .enumerate()
- .map(|(i, k)| {
- let filepaths: (Option<Vec<OsString>>, bool) =
- { Self::get_filepath(k, bibliography, &mut pdf_files) };
-
- // bibiman will sanitize some fields at this point,
- // this may cause longer startup-load-times.
- //
- // It may be better to sanitize them somewhere else, so bibiman
- // does not loose the original text-information including the
- // LaTeX macros present in the bibfile. From here on, they will be
- // gone.
- //
- // The following fields are going to be sanitized:
- //
- // - title
- // - subtitle
- // - abstract_text
- //
- // TODO: Once the final decision to implement the sanitization at
- // this point, one could write a constructor for the BibiData struct
- // which handles the sanitization.
- BibiData {
- id: i as u32,
- authors: Self::get_authors(k, bibliography),
- short_author: String::new(),
- title: sanitize_one(&Self::get_title(k, bibliography)),
- year: Self::get_year(k, bibliography),
- custom_field: (
- cfg.general.custom_column.clone(),
- Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
- ),
- keywords: Self::get_keywords(k, bibliography),
- citekey: k.to_owned(),
- abstract_text: sanitize_one(&Self::get_abstract(k, bibliography)),
- doi_url: Self::get_weblink(k, bibliography),
- filepath: filepaths.0,
- file_field: filepaths.1,
- subtitle: match Self::get_subtitle(k, bibliography) {
- None => None,
- Some(x) => Some(sanitize_one(&x)),
- },
- notes: if note_files.is_some() {
- Self::get_notepath(k, &mut note_files, &ext)
- } else {
- None
- },
- symbols: [None, None, None],
- }
- })
- .collect()
+ //
+ //
+ // bibiman will sanitize some fields at this point,
+ // this may cause longer startup-load-times.
+ //
+ //
+ mass_sanitize(
+ citekeys
+ .iter()
+ .enumerate()
+ .map(|(i, k)| {
+ let filepaths: (Option<Vec<OsString>>, bool) =
+ { Self::get_filepath(k, bibliography, &mut pdf_files) };
+
+ BibiData {
+ id: i as u32,
+ authors: Self::get_authors(k, bibliography),
+ short_author: String::new(),
+ title: Self::get_title(k, bibliography),
+ year: Self::get_year(k, bibliography),
+ custom_field: (
+ cfg.general.custom_column.clone(),
+ Self::get_custom_field(k, bibliography, &cfg.general.custom_column),
+ ),
+ keywords: Self::get_keywords(k, bibliography),
+ citekey: k.to_owned(),
+ abstract_text: Self::get_abstract(k, bibliography),
+ doi_url: Self::get_weblink(k, bibliography),
+ filepath: filepaths.0,
+ file_field: filepaths.1,
+ subtitle: Self::get_subtitle(k, bibliography),
+ notes: if note_files.is_some() {
+ Self::get_notepath(k, &mut note_files, &ext)
+ } else {
+ None
+ },
+ symbols: [None, None, None],
+ sanitized_bibi_data: None,
+ }
+ })
+ .collect(),
+ )
}
// get list of citekeys from the given bibfile
diff --git a/src/bibiman/entries.rs b/src/bibiman/entries.rs
index db6d6bf..0b35a8b 100644
--- a/src/bibiman/entries.rs
+++ b/src/bibiman/entries.rs
@@ -174,7 +174,9 @@ mod tests {
subtitle: None,
notes: None,
symbols: [None, None, None],
- };
+ sanitized_bibi_data: None,
+ }
+ .gen_sanitized();
let entry_vec = BibiData::ref_vec(&mut entry, &cfg);
@@ -194,7 +196,9 @@ mod tests {
subtitle: None,
notes: None,
symbols: [None, None, None],
- };
+ sanitized_bibi_data: None,
+ }
+ .gen_sanitized();
let entry_vec_editors = BibiData::ref_vec(&mut entry_editors, &cfg);
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
index aaf81ad..614ed11 100644
--- a/src/bibiman/sanitize.rs
+++ b/src/bibiman/sanitize.rs
@@ -18,6 +18,8 @@
use fancy_regex::Regex;
use unicodeit::replace as unicode_replace;
+use crate::bibiman::bibisetup::{BibiData, SanitizedBibiData};
+
/// Sanitizing process rules as regex cmds.
///
/// Only macros that are not already covered by unicodeit should be processed in this way.
@@ -33,6 +35,71 @@ const SANITIZE_REGEX: &[(&str, &str)] = &[
(r"\\hyphen", "-"),
];
+/// Function to build the sanitization regex vector:
+fn regex_vector() -> Vec<(Regex, &'static str)> {
+ let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
+ // build regex
+ for (search, replace) in SANITIZE_REGEX {
+ regex.push((Regex::new(search).unwrap(), replace));
+ }
+ regex
+}
+
+fn optimized_sanitize(input_text: Vec<&str>, regex: &Vec<(Regex, &str)>) -> Vec<String> {
+ let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
+
+ // process strings
+ let result_len = result.len();
+ for (re, replace) in regex {
+ for i in 0..result_len {
+ result[i] = re.replace_all(&result[i], *replace).to_string();
+ }
+ }
+ for i in 0..result_len {
+ result[i] = unicode_replace(&result[i]);
+ }
+
+ // return result
+ result
+}
+
+/// Helper macro to sanitize bibidata structs.
+/// Here lives the code that generates SanitizedBibiData
+/// structs from BibiData structs.
+macro_rules! optimized_sanitize_bibidata {
+ ($bibidata:expr, $regex:expr) => {
+ match &$bibidata.subtitle {
+ None => {
+ let sanitized_data =
+ optimized_sanitize(vec![&$bibidata.title, &$bibidata.abstract_text], &$regex);
+ SanitizedBibiData {
+ title: sanitized_data[0].clone(),
+ subtitle: None,
+ abstract_text: sanitized_data[1].clone(),
+ }
+ }
+ Some(subtitle) => {
+ let sanitized_data = optimized_sanitize(
+ vec![&$bibidata.title, subtitle, &$bibidata.abstract_text],
+ &$regex,
+ );
+ SanitizedBibiData {
+ title: sanitized_data[0].clone(),
+ subtitle: Some(sanitized_data[1].clone()),
+ abstract_text: sanitized_data[2].clone(),
+ }
+ }
+ }
+ };
+}
+
+/// Sanitize one BibiData and return a SanitizedBibiData struct.
+/// This function does ignore any existing sanitization of the bibidata struct.
+pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
+ let regex = regex_vector();
+ optimized_sanitize_bibidata!(bibidata, regex)
+}
+
/// Sanitize one String with LaTeX Macros into a more readable one without.
///
/// If one is going to mass-sanitize strings, one should use the [`sanitize`]
@@ -50,24 +117,19 @@ pub fn sanitize_one(input_text: &str) -> String {
///
/// This function does always return the same amount of Strings as it gets in the input list.
pub fn sanitize(input_text: Vec<&str>) -> Vec<String> {
- let mut result: Vec<String> = input_text.into_iter().map(|s| s.to_string()).collect();
- let mut regex: Vec<(Regex, &str)> = Vec::with_capacity(SANITIZE_REGEX.len());
- // build regex
- for (search, replace) in SANITIZE_REGEX {
- regex.push((Regex::new(search).unwrap(), replace));
- }
+ optimized_sanitize(input_text, &regex_vector())
+}
- // process strings
- let result_len = result.len();
- for (re, replace) in regex {
- for i in 0..result_len {
- result[i] = re.replace_all(&result[i], replace).to_string();
- }
- }
- for i in 0..result_len {
- result[i] = unicode_replace(&result[i]);
- }
+/// Sanitize a whole `Vec<BibiData>`, returning a new sanitized one.
+pub fn mass_sanitize(bibidata: Vec<BibiData>) -> Vec<BibiData> {
+ let regex: Vec<(Regex, &str)> = regex_vector();
- // return result
+ let mut result: Vec<BibiData> = Vec::with_capacity(bibidata.len());
+ for entry in bibidata {
+ result.push(BibiData {
+ sanitized_bibi_data: Some(optimized_sanitize_bibidata!(entry, regex)),
+ ..entry
+ });
+ }
result
}
diff --git a/src/bibiman/search.rs b/src/bibiman/search.rs
index e0c5f17..2156634 100644
--- a/src/bibiman/search.rs
+++ b/src/bibiman/search.rs
@@ -141,7 +141,9 @@ mod tests {
subtitle: None,
notes: None,
symbols: [None, None, None],
- };
+ sanitized_bibi_data: None,
+ }
+ .gen_sanitized();
let joined_vec = BibiSearch::convert_to_string(&bibvec);