Working proof of concept of citekey formatting

author: lukeflo 2025-10-13 15:45:53 +0200
committer: lukeflo 2025-10-13 15:57:42 +0200
commit: 467851007e1861834326deee3116aa88fe839f5a (patch)
tree: 7e1cb113d99c32ad5b434f7e87d851cd9c9be382 /src
parent: 0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (diff)
download: bibiman-467851007e1861834326deee3116aa88fe839f5a.tar.gz
bibiman-467851007e1861834326deee3116aa88fe839f5a.zip
3 files changed, 166 insertions, 77 deletions
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 2f56947..0cec28e 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize};
 
 use crate::{
     bibiman::citekeys::citekey_utils::{build_citekey, formatting_help},
-    config::BibiConfig,
+    config::{BibiConfig, IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
 };
 
 mod citekey_utils;
@@ -60,6 +60,7 @@ pub(crate) struct CitekeyFormatting {
     case: Option<CitekeyCase>,
     old_new_keys_map: Vec<(String, String)>,
     dry_run: bool,
+    ascii_only: bool,
 }
 
 impl CitekeyFormatting {
@@ -69,14 +70,15 @@ impl CitekeyFormatting {
     ) -> color_eyre::Result<()> {
         let mut formatter = CitekeyFormatting::default();
 
-        formatter.fields = cfg
-            .citekey_formatter
-            .fields
-            .clone()
-            .ok_or_eyre("Need to define fields correctly in config file")?;
+        formatter.fields = cfg.citekey_formatter.fields.clone().ok_or_eyre(format!(
+            "Need to define {} correctly in config file",
+            "citekey pattern fields".red()
+        ))?;
 
         formatter.case = cfg.citekey_formatter.case.clone();
 
+        formatter.ascii_only = cfg.citekey_formatter.ascii_only;
+
         if formatter.fields.is_empty() {
             return Err(eyre!(
                 "To format all citekeys, you need to provide {} values in the config file",
@@ -105,13 +107,26 @@ impl CitekeyFormatting {
         formatter.bib_entries = Bibliography::parse(&bibstring)
             .map_err(|e| eyre!("Couldn't parse bibfile due to {}", e.kind))?;
 
+        let ignored_chars = if let Some(chars) = &cfg.citekey_formatter.ignored_chars {
+            chars.as_slice()
+        } else {
+            IGNORED_SPECIAL_CHARS.as_slice()
+        };
+
+        let ignored_words = if let Some(words) = &cfg.citekey_formatter.ignored_words {
+            words.as_slice()
+        } else {
+            &*IGNORED_WORDS.as_slice()
+        };
+
         formatter
-            .do_formatting()
+            .do_formatting(ignored_chars, ignored_words)
             .rev_sort_new_keys_by_len()
             .update_file()?;
 
         Ok(())
     }
+
     /// Start Citekey formatting with building a new instance of `CitekeyFormatting`
     /// Formatting is processed file by file, because `bibman` can handle
     /// multi-file setups.
@@ -144,16 +159,24 @@ impl CitekeyFormatting {
             case: cfg.citekey_formatter.case.clone(),
             old_new_keys_map: Vec::new(),
             dry_run: false,
+            ascii_only: cfg.citekey_formatter.ascii_only,
         })
     }
 
     /// Process the actual formatting. The citekey of every entry will be updated.
-    pub fn do_formatting(&mut self) -> &mut Self {
+    pub fn do_formatting(&mut self, ignored_chars: &[char], ignored_words: &[String]) -> &mut Self {
         let mut old_new_keys: Vec<(String, String)> = Vec::new();
         for entry in self.bib_entries.iter() {
             old_new_keys.push((
                 entry.key.clone(),
-                build_citekey(entry, &self.fields, self.case.as_ref()),
+                build_citekey(
+                    entry,
+                    &self.fields,
+                    self.case.as_ref(),
+                    self.ascii_only,
+                    ignored_chars,
+                    ignored_words,
+                ),
             ));
         }
 
@@ -215,12 +238,15 @@ mod tests {
 
     use biblatex::Bibliography;
 
-    use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting};
+    use crate::{
+        bibiman::citekeys::{CitekeyCase, CitekeyFormatting},
+        config::{IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
+    };
 
     #[test]
     fn format_citekey_test() {
         let src = r"
-        @article{bos_latex_metadata_and_publishing_workflows_2023,
+        @article{Bos2023,
             title         = {{LaTeX}, metadata, and publishing workflows},
             author        = {Bos, Joppe W. and {McCurley}, Kevin S.},
             year          = {2023},
@@ -232,7 +258,7 @@ mod tests {
             urldate       = {2023-08-22},
             note          = {type: article},
         }
-        @book{bhambra_colonialism_social_theory_2021,
+        @book{Bhambra2021,
             title         = {Colonialism and \textbf{Modern Social Theory}},
             author        = {Bhambra, Gurminder K. and Holmwood, John},
             location      = {Cambridge and Medford},
@@ -247,29 +273,24 @@ mod tests {
             fields: vec![
                 "entrytype;;;;:".into(),
                 "author;;;-;_".into(),
-                "title;4;3;_;_".into(),
+                "title;4;3;=;_".into(),
                 "location;;4;:;_".into(),
                 "year".into(),
             ],
-            case: None,
+            case: Some(CitekeyCase::Lower),
             old_new_keys_map: Vec::new(),
             dry_run: false,
+            ascii_only: true,
         };
-        let _ = formatting_struct.do_formatting();
+        let _ = formatting_struct
+            .do_formatting(IGNORED_SPECIAL_CHARS.as_slice(), &*IGNORED_WORDS.as_slice());
         assert_eq!(
             formatting_struct.old_new_keys_map.get(0).unwrap().1,
-            "article:Bos-McCurley_LaT_met_and_pub_Empt_2023"
+            "article:bos-mccurley_lat=met=pub=wor_2023"
         );
         assert_eq!(
             formatting_struct.old_new_keys_map.get(1).unwrap().1,
-            "book:Bhambra-Holmwood_Col_and_Mod_Soc_Camb:and:Medf_2021"
-        );
-        formatting_struct.case = Some(CitekeyCase::Lower);
-        let _ = formatting_struct.do_formatting().rev_sort_new_keys_by_len();
-        // now the longer citekey is processed first and its in lowercase!
-        assert_eq!(
-            formatting_struct.old_new_keys_map.get(0).unwrap().1,
-            "book:bhambra-holmwood_col_and_mod_soc_camb:and:medf_2021"
+            "book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021"
         );
     }
 
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
index ee2c849..5f70224 100644
--- a/src/bibiman/citekeys/citekey_utils.rs
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -16,21 +16,14 @@
 /////
 
 use biblatex::{ChunksExt, Entry, Type};
+use deunicode::deunicode;
 use indoc::formatdoc;
 use owo_colors::{
     OwoColorize,
     colors::{BrightBlue, Green, White},
 };
 
-use crate::{
-    bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully},
-    config::IGNORED_SPECIAL_CHARS,
-};
-
-const IGNORE_WORDS: [&str; 20] = [
-    "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine",
-    "eines", "des", "auf", "und", "für", "vor",
-];
+use crate::bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully};
 
 pub(super) fn formatting_help() {
     let help = vec![
@@ -104,6 +97,9 @@ pub(super) fn build_citekey(
     entry: &Entry,
     pattern_fields: &[String],
     case: Option<&CitekeyCase>,
+    ascii_only: bool,
+    ignored_chars: &[char],
+    ignored_words: &[String],
 ) -> String {
     // mut string the citekey is built from
     let mut new_citekey = String::new();
@@ -114,7 +110,7 @@ pub(super) fn build_citekey(
     // loop over pattern fields process them
     'field_loop: for pattern in pattern_fields.iter() {
         // parse single values from pattern field
-        let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) =
+        let (field_name, max_words, max_chars, inner_delimiter, cur_trailing_delimiter) =
             split_formatting_pat(pattern);
 
         // built the part of the citekey from the current pattern field
@@ -126,16 +122,9 @@ pub(super) fn build_citekey(
 
             // split at whitespaces, count fields and set counter for processed
             // splits
-            let mut split_field = field.split_whitespace();
+            let split_field = field.split_whitespace();
             let mut words_passed = 0;
             let field_count = field.split_whitespace().count();
-            let word_count = if let Some(val) = word_count
-                && val <= field_count
-            {
-                val
-            } else {
-                field_count
-            };
 
             // If there is a trailing delimiter from the previous field, push it
             if let Some(del) = trailing_delimiter {
@@ -152,47 +141,57 @@ pub(super) fn build_citekey(
             }
 
             // loop over single parts of current field and add correct delimiter
-            'word_loop: loop {
-                // process the single slices and add correct delimiter
-                if let Some(field_slice) = split_field.next() {
-                    // Create word slice char by char. We need to loop over chars
-                    // instead of a simple bytes index to also catch chars which
-                    // consist of more than one byte (äöüøæ etc...)
-                    let mut word_slice = String::new();
-                    let word_chars = field_slice.chars();
-                    let mut counter = 0;
-                    'char_loop: for mut c in word_chars {
-                        // If camelcase is set, force first char of word to uppercase
-                        if counter == 0 && case == Some(&CitekeyCase::Camel) {
-                            c = c.to_ascii_uppercase()
-                        }
-                        if let Some(len) = char_count
-                            && counter == len
-                        {
-                            break 'char_loop;
-                        }
-                        // if a word slice contains a special char, skip it
-                        if IGNORED_SPECIAL_CHARS.contains(&c) {
-                            continue 'char_loop;
-                        }
+            // process the single slices and add correct delimiter
+            'word_loop: for (idx, field_slice) in split_field.enumerate() {
+                // if the current slice is a common word from the ignore list,
+                // skip it.
+                if ignored_words.contains(&field_slice.to_lowercase()) {
+                    continue;
+                }
+
+                // Create word slice char by char. We need to loop over chars
+                // instead of a simple bytes index to also catch chars which
+                // consist of more than one byte (äöüøæ etc...)
+                let mut word_slice = String::new();
+                let word_chars = field_slice.chars();
+                let mut counter = 0;
+                'char_loop: for mut c in word_chars {
+                    // If camelcase is set, force first char of word to uppercase
+                    if counter == 0 && case == Some(&CitekeyCase::Camel) {
+                        c = c.to_ascii_uppercase()
+                    }
+                    if let Some(len) = max_chars
+                        && counter >= len
+                    {
+                        break 'char_loop;
+                    }
+                    // if a word slice contains a special char, skip it
+                    if ignored_chars.contains(&c) {
+                        continue 'char_loop;
+                    }
+                    // if ascii_only is set and the char has an ASCII transliteration, use that
+                    if let Some(chars) = deunicode::deunicode_char(c)
+                        && ascii_only
+                    {
+                        word_slice.push_str(chars);
+                        counter += chars.len();
+                    } else {
                         word_slice.push(c);
                         counter += 1;
                     }
-                    // Don't count empty slices and don't add delimiter to those
-                    if !word_slice.is_empty() {
-                        formatted_str = formatted_str + &word_slice;
-                        words_passed += 1;
-                        if word_count == words_passed {
-                            break 'word_loop;
-                        } else {
-                            formatted_str = formatted_str + inner_delimiter.unwrap_or("");
-                        }
+                }
+                // Don't count empty slices and don't add delimiter to those
+                if !word_slice.is_empty() {
+                    formatted_str = formatted_str + &word_slice;
+                    words_passed += 1;
+                    if max_words.is_some_and(|max| max == words_passed) || idx + 1 == field_count {
+                        break 'word_loop;
                     } else {
-                        continue 'word_loop;
+                        formatted_str = formatted_str + inner_delimiter.unwrap_or("");
                     }
                 } else {
-                    break 'word_loop;
-                };
+                    continue 'word_loop;
+                }
             }
             formatted_str
         };
diff --git a/src/config.rs b/src/config.rs
index b1c4b07..7c1a0f8 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -20,6 +20,7 @@ use std::{
     io::{Write, stdin},
     path::PathBuf,
     str::FromStr,
+    sync::LazyLock,
 };
 
 use color_eyre::{eyre::Result, owo_colors::OwoColorize};
@@ -40,6 +41,31 @@ pub const IGNORED_SPECIAL_CHARS: [char; 33] = [
     '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"',
 ];
 
+pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| {
+    vec![
+        String::from("the"),
+        String::from("a"),
+        String::from("an"),
+        String::from("of"),
+        String::from("for"),
+        String::from("in"),
+        String::from("at"),
+        String::from("to"),
+        String::from("and"),
+        String::from("der"),
+        String::from("die"),
+        String::from("das"),
+        String::from("ein"),
+        String::from("eine"),
+        String::from("eines"),
+        String::from("des"),
+        String::from("auf"),
+        String::from("und"),
+        String::from("für"),
+        String::from("vor"),
+    ]
+});
+
 const DEFAULT_CONFIG: &str = r##"
 # [general]
 ## Default files/dirs which are loaded on startup
@@ -118,6 +144,40 @@ const DEFAULT_CONFIG: &str = r##"
 ## Convert chars to specified case. Possible values:
 ## "upper", "uppercase", "lower", "lowercase"
 # case = "lowercase"
+
+## Map all unicode chars to their pure ascii equivalent
+# ascii_only = true
+
+## List of special chars that'll be ignored when building citekeys.
+## A custom list will overwrite the default list
+# ignored_chars = [
+#     "?", "!", "\\", "'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", "\"",
+# ]
+
+## List of words that'll be ignored when building citekeys.
+## A custom list will overwrite the default list
+# ignored_words = [
+#     "the",
+#     "a",
+#     "an",
+#     "of",
+#     "for",
+#     "in",
+#     "at",
+#     "to",
+#     "and",
+#     "der",
+#     "die",
+#     "das",
+#     "ein",
+#     "eine",
+#     "eines",
+#     "des",
+#     "auf",
+#     "und",
+#     "für",
+#     "vor",
+# ]
 "##;
 
 /// Main struct of the config file. Contains substructs/headings in toml
@@ -171,6 +231,9 @@ pub struct Colors {
 pub struct CitekeyFormatter {
     pub fields: Option<Vec<String>>,
     pub case: Option<CitekeyCase>,
+    pub ascii_only: bool,
+    pub ignored_chars: Option<Vec<char>>,
+    pub ignored_words: Option<Vec<String>>,
 }
 
 impl Default for BibiConfig {
@@ -194,6 +257,9 @@ impl Default for BibiConfig {
             citekey_formatter: CitekeyFormatter {
                 fields: None,
                 case: None,
+                ascii_only: true,
+                ignored_chars: None,
+                ignored_words: None,
             },
         }
     }
@@ -224,6 +290,9 @@ impl BibiConfig {
             citekey_formatter: CitekeyFormatter {
                 fields: None,
                 case: None,
+                ascii_only: true,
+                ignored_chars: None,
+                ignored_words: None,
             },
         }
     }
author	lukeflo	2025-10-13 15:45:53 +0200
committer	lukeflo	2025-10-13 15:57:42 +0200
commit	467851007e1861834326deee3116aa88fe839f5a (patch)
tree	7e1cb113d99c32ad5b434f7e87d851cd9c9be382 /src
parent	0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (diff)
download	bibiman-467851007e1861834326deee3116aa88fe839f5a.tar.gz bibiman-467851007e1861834326deee3116aa88fe839f5a.zip