From 67afd67d4d51a00079269d431a7058fc50750886 Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Tue, 7 Oct 2025 15:05:47 +0200
Subject: implement basic citekey formatting:

* Reads patterns and parses them.

TODO:

* **Fully** sanitize LaTeX macros
* Preprocess complex and regularly used fields like `author`
* Write changes to original bib file
---
 src/config.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src/config.rs')
diff --git a/src/config.rs b/src/config.rs
index 00a35b7..78cfef9 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -102,6 +102,7 @@ const DEFAULT_CONFIG: &str = r##"
 pub struct BibiConfig {
     pub general: General,
     pub colors: Colors,
+    pub citekey_formatter: CitekeyFormatter,
 }
 
 /// Substruct [general] in config.toml
@@ -143,6 +144,11 @@ pub struct Colors {
     pub year_color: Color,
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct CitekeyFormatter {
+    pub fields: Vec<String>,
+}
+
 impl Default for BibiConfig {
     fn default() -> Self {
         Self {
@@ -161,6 +167,7 @@ impl Default for BibiConfig {
                 custom_column: CustomField::Pubtype,
             },
             colors: Self::dark_colors(),
+            citekey_formatter: CitekeyFormatter { fields: Vec::new() },
         }
     }
 }
@@ -187,6 +194,7 @@ impl BibiConfig {
             } else {
                 Self::dark_colors()
             },
+            citekey_formatter: CitekeyFormatter { fields: Vec::new() },
         }
     }
 
-- 
cgit v1.2.3


From a07359a9a1da0c06c040f77158be31b3883b33ac Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Wed, 8 Oct 2025 13:49:06 +0200
Subject: refine matching and preformatting of fields for citekey formatting;
 add case field and enum

---
 Cross.toml                                 |   6 ++
 src/bibiman/citekeys.rs                    | 136 ++++++++++++++++++++++++++---
 src/bibiman/sanitize.rs                    |  10 ++-
 src/bibiman/sanitize/optimized_sanitize.rs |  28 +++++-
 src/config.rs                              |  24 +++--
 5 files changed, 177 insertions(+), 27 deletions(-)

(limited to 'src/config.rs')

diff --git a/Cross.toml b/Cross.toml
index e7cd27b..6140bf2 100644
--- a/Cross.toml
+++ b/Cross.toml
@@ -9,3 +9,9 @@ pre-build = [
     "dpkg --add-architecture $CROSS_DEB_ARCH",
     "apt-get update && apt-get install --assume-yes libssl-dev:$CROSS_DEB_ARCH",
 ]
+
+[target.x86_64-unknown-freebsd]
+# pre-build = [
+#     "dpkg --add-architecture $CROSS_DEB_ARCH",
+#     "apt-get update && apt-get install --assume-yes libssl-dev:$CROSS_DEB_ARCH",
+# ]
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 4c36e80..a304e92 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -1,13 +1,40 @@
-use biblatex::Bibliography;
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025  lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use biblatex::{Bibliography, ChunksExt, Entry, Type};
 use color_eyre::eyre::eyre;
 use owo_colors::OwoColorize;
+use serde::{Deserialize, Serialize};
+
+use crate::{bibiman::sanitize::sanitize_single_string_fully, config::BibiConfig};
 
-use crate::config::BibiConfig;
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub(crate) enum CitekeyCase {
+    #[serde(alias = "uppercase", alias = "upper")]
+    Upper,
+    #[serde(alias = "lowercase", alias = "lower")]
+    Lower,
+}
 
 #[derive(Debug, Default, Clone)]
 pub(crate) struct CitekeyFormatting {
     bib_entries: Bibliography,
     fields: Vec<String>,
+    case: Option<CitekeyCase>,
 }
 
 impl CitekeyFormatting {
@@ -27,6 +54,7 @@ impl CitekeyFormatting {
         Ok(Self {
             bib_entries,
             fields,
+            case: cfg.citekey_formatter.case.clone(),
         })
     }
 
@@ -38,17 +66,36 @@ impl CitekeyFormatting {
                     split_formatting_pat(pattern);
                 let formatted_field_str = {
                     let mut formatted_str = String::new();
-                    let field = entry.get_as::<String>(field).expect(&format!(
-                        "Couldn't find field {}",
-                        field.bold().bright_red()
-                    ));
+                    let field = preformat_field(field, entry);
+                    // let field = if let Ok(val) = entry.get_as::<String>(field) {
+                    //     val
+                    // } else {
+                    //     eprintln!(
+                    //         "Unable to get field {} for entry {}",
+                    //         field.bright_red(),
+                    //         &entry.key.bold()
+                    //     );
+                    //     continue;
+                    // };
+                    // let field = entry.get_as::<String>(field).expect(&format!(
+                    //     "Couldn't find field {}",
+                    //     field.bold().bright_red()
+                    // ));
                     let mut split_field = field.split_whitespace();
                     let mut words_passed = 0;
+                    let word_count = if let Some(val) = word_count {
+                        val
+                    } else {
+                        field.split_whitespace().count()
+                        // split_field.size_hint().0 + 1
+                    };
+                    dbg!(word_count);
                     loop {
                         if let Some(field_slice) = split_field.next() {
                             formatted_str = formatted_str + format_word(field_slice, char_count);
                             words_passed += 1;
-                            if word_count.is_some_and(|count| count == words_passed) {
+                            // if word_count.is_some_and(|count| count == words_passed) {
+                            if word_count == words_passed {
                                 formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
                                 break;
                             } else {
@@ -68,7 +115,51 @@ impl CitekeyFormatting {
     }
 }
 
-fn preformat_field() {}
+/// Preformat some fields which are very common to be used in citekeys
+fn preformat_field(field: &str, entry: &mut Entry) -> String {
+    match field {
+        "title" => {
+            sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("NA".into()))
+        }
+        "author" => {
+            if let Ok(authors) = entry.author() {
+                let mut last_names = String::new();
+                for a in authors.iter() {
+                    last_names = last_names + &a.name + " ";
+                }
+                dbg!(&last_names);
+                last_names
+            } else {
+                "NA".to_string()
+            }
+        }
+        "year" => {
+            if let Ok(date) = entry.date() {
+                date.to_chunks().format_verbatim()[..4].to_string()
+            } else {
+                entry.get_as::<String>(field).unwrap_or("NA".into())
+            }
+        }
+        "subtitle" => {
+            sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("NA".into()))
+        }
+        "editor" => {
+            if let Ok(editors) = entry.editors() {
+                let mut last_names = String::new();
+                for editortypes in editors.iter() {
+                    for e in editortypes.0.iter() {
+                        last_names = last_names + &e.name + " ";
+                    }
+                }
+                last_names
+            } else {
+                "NA".to_string()
+            }
+        }
+        "pubtype" | "entrytype" => entry.entry_type.to_string(),
+        _ => entry.get_as::<String>(field).unwrap_or("Empty".into()),
+    }
+}
 
 /// Cut of word at char count index if its set
 fn format_word(word: &str, count: Option<usize>) -> &str {
@@ -122,7 +213,7 @@ mod tests {
     use biblatex::Bibliography;
     use itertools::Itertools;
 
-    use crate::bibiman::citekeys::{CitekeyFormatting, split_formatting_pat};
+    use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting, split_formatting_pat};
 
     #[test]
     fn split_citekey_pattern() {
@@ -149,19 +240,36 @@ mod tests {
 
     #[test]
     fn format_citekey_test() {
-        let src = r"@book{tolkien1937, author = {Tolkien}, title = {\enquote{Lord} of the \textbf{Rings}}, year = {1937}}";
+        let src = r"
+        @book{bhambra_colonialism_social_theory_2021,
+            title         = {Colonialism and \textbf{Modern Social Theory}},
+            author        = {Bhambra, Gurminder K. and Holmwood, John},
+            location      = {Cambridge and Medford},
+            publisher     = {Polity Press},
+            date          = {2021},
+        }
+        ";
         let bibliography = Bibliography::parse(src).unwrap();
         let mut formatting_struct = CitekeyFormatting {
             bib_entries: bibliography,
             fields: vec![
-                "author;1;;-;_".into(),
-                "title;3;3;_;_".into(),
+                "entrytype;;;;:".into(),
+                "author;;;-;_".into(),
+                "title;4;3;_;_".into(),
+                "location;;4;:;_".into(),
                 "year".into(),
             ],
+            case: None,
         };
         formatting_struct.do_formatting();
         let keys = formatting_struct.bib_entries.keys().collect_vec();
-        assert_eq!(keys[0], "Tolkien_Lor_of_the_1937");
-        assert_eq!(keys[0].to_lowercase(), "tolkien_lor_of_the_1937");
+        assert_eq!(
+            keys[0],
+            "book:Bhambra-Holmwood_Col_and_Mod_Soc_Camb:and:Medf_2021"
+        );
+        assert_eq!(
+            keys[0].to_lowercase(),
+            "book:bhambra-holmwood_col_and_mod_soc_camb:and:medf_2021"
+        );
     }
 }
diff --git a/src/bibiman/sanitize.rs b/src/bibiman/sanitize.rs
index 9ccf4c4..8c1cc43 100644
--- a/src/bibiman/sanitize.rs
+++ b/src/bibiman/sanitize.rs
@@ -26,12 +26,12 @@ use optimized_sanitize::optimized_sanitize;
 macro_rules! optimized_sanitize_bibidata {
     ($bibidata:expr) => {
         SanitizedBibiData {
-            title: optimized_sanitize(&$bibidata.title),
+            title: optimized_sanitize(false, &$bibidata.title),
             subtitle: match &$bibidata.subtitle {
                 None => None,
-                Some(subtitle) => Some(optimized_sanitize(subtitle)),
+                Some(subtitle) => Some(optimized_sanitize(false, subtitle)),
             },
-            abstract_text: optimized_sanitize(&$bibidata.abstract_text),
+            abstract_text: optimized_sanitize(false, &$bibidata.abstract_text),
         }
     };
 }
@@ -41,3 +41,7 @@ macro_rules! optimized_sanitize_bibidata {
 pub fn sanitize_one_bibidata(bibidata: &BibiData) -> SanitizedBibiData {
     optimized_sanitize_bibidata!(bibidata)
 }
+
+pub fn sanitize_single_string_fully(input: &str) -> String {
+    optimized_sanitize(true, input)
+}
diff --git a/src/bibiman/sanitize/optimized_sanitize.rs b/src/bibiman/sanitize/optimized_sanitize.rs
index 336cc56..dff4d32 100644
--- a/src/bibiman/sanitize/optimized_sanitize.rs
+++ b/src/bibiman/sanitize/optimized_sanitize.rs
@@ -31,6 +31,17 @@ static LOOKUP: phf::Map<&'static str, (&'static str, Option<&'static str>)> = ph
     r"\textsc" => ("", Some("")),
 };
 
+static LOOKUP_CLEAR_ALL: phf::Map<&'static str, (&'static str, Option<&'static str>)> = phf_map! {
+    r"\mkbibquote" => ("", Some("")),
+    r"\enquote*" => ("", Some("")),
+    r"\enquote" => ("", Some("")),
+    r"\hyphen" => ("", None),
+    r"\textbf" => ("", Some("")),
+    r"\textit" => ("", Some("")),
+    r"\texttt" => ("", Some("")),
+    r"\textsc" => ("", Some("")),
+};
+
 #[derive(Logos, Debug)]
 enum Token {
     #[token("{")]
@@ -43,7 +54,12 @@ enum Token {
     ForcedSpace,
 }
 
-pub fn optimized_sanitize(input_text: &str) -> String {
+pub fn optimized_sanitize(clear_all: bool, input_text: &str) -> String {
+    let lookup = if clear_all {
+        &LOOKUP_CLEAR_ALL
+    } else {
+        &LOOKUP
+    };
     let mut char_counter: usize = 0;
     let mut contains_macro: bool = false;
     for char in input_text.chars() {
@@ -87,7 +103,7 @@ pub fn optimized_sanitize(input_text: &str) -> String {
                     }
                     Token::LaTeXMacro => {
                         let texmacro = lex.slice();
-                        if let Some(x) = LOOKUP.get(&texmacro.trim_end()) {
+                        if let Some(x) = lookup.get(&texmacro.trim_end()) {
                             if let Some(end) = x.1 {
                                 bc_up = true;
                                 counter_actions.insert(bracket_counter + 1, end);
@@ -115,11 +131,17 @@ mod tests {
     #[test]
     fn check_sanitization() {
         let result = optimized_sanitize(
+            false,
             r"\mkbibquote {Intention} und \mkbibquote{Intentionen \mkbibquote{sind} \hyphen\ bibquote\hyphen .}",
         );
         assert_eq!(
             "\"Intention\" und \"Intentionen \"sind\" - bibquote-.\"",
             result
-        )
+        );
+        let result = optimized_sanitize(
+            true,
+            r"\mkbibquote {Intention} und \mkbibquote{Intentionen \mkbibquote{sind} \hyphen\ bibquote\hyphen .}",
+        );
+        assert_eq!("Intention und Intentionen sind  bibquote.", result)
     }
 }
diff --git a/src/config.rs b/src/config.rs
index 78cfef9..8a333e4 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -16,21 +16,24 @@
 /////
 
 use std::{
-    fs::{create_dir_all, File},
-    io::{stdin, Write},
+    fs::{File, create_dir_all},
+    io::{Write, stdin},
     path::PathBuf,
     str::FromStr,
 };
 
 use color_eyre::{eyre::Result, owo_colors::OwoColorize};
 use figment::{
-    providers::{Format, Serialized, Toml},
     Figment,
+    providers::{Format, Serialized, Toml},
 };
 use ratatui::style::Color;
 use serde::{Deserialize, Serialize};
 
-use crate::{bibiman::bibisetup::CustomField, cliargs::CLIArgs};
+use crate::{
+    bibiman::{bibisetup::CustomField, citekeys::CitekeyCase},
+    cliargs::CLIArgs,
+};
 
 const DEFAULT_CONFIG: &str = r##"
 # [general]
@@ -147,6 +150,7 @@ pub struct Colors {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct CitekeyFormatter {
     pub fields: Vec<String>,
+    pub case: Option<CitekeyCase>,
 }
 
 impl Default for BibiConfig {
@@ -167,7 +171,10 @@ impl Default for BibiConfig {
                 custom_column: CustomField::Pubtype,
             },
             colors: Self::dark_colors(),
-            citekey_formatter: CitekeyFormatter { fields: Vec::new() },
+            citekey_formatter: CitekeyFormatter {
+                fields: Vec::new(),
+                case: None,
+            },
         }
     }
 }
@@ -194,7 +201,10 @@ impl BibiConfig {
             } else {
                 Self::dark_colors()
             },
-            citekey_formatter: CitekeyFormatter { fields: Vec::new() },
+            citekey_formatter: CitekeyFormatter {
+                fields: Vec::new(),
+                case: None,
+            },
         }
     }
 
@@ -352,8 +362,8 @@ fn select_opener() -> String {
 #[cfg(test)]
 mod tests {
     use figment::{
-        providers::{Format, Toml},
         Figment,
+        providers::{Format, Toml},
     };
 
     use super::BibiConfig;
-- 
cgit v1.2.3


From 8b858f92da69cfb8fa43ec861cda46eeb6ef4bbe Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Wed, 8 Oct 2025 14:39:46 +0200
Subject: case parsing from config, needs to be implemented for citekey struct

---
 src/bibiman/citekeys.rs | 95 +++++++++++++++++++++++--------------------------
 src/config.rs           |  8 +++--
 2 files changed, 49 insertions(+), 54 deletions(-)

(limited to 'src/config.rs')

diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index a304e92..118ae3e 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -23,7 +23,7 @@ use serde::{Deserialize, Serialize};
 use crate::{bibiman::sanitize::sanitize_single_string_fully, config::BibiConfig};
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub(crate) enum CitekeyCase {
+pub enum CitekeyCase {
     #[serde(alias = "uppercase", alias = "upper")]
     Upper,
     #[serde(alias = "lowercase", alias = "lower")]
@@ -44,7 +44,11 @@ impl CitekeyFormatting {
     /// The `Bibliography` inserted will be edited in place with the new citekeys.
     /// Thus, in the end the `bib_entries` field will hold the updated `Bibliography`
     pub fn new(cfg: &BibiConfig, bib_entries: Bibliography) -> color_eyre::Result<Self> {
-        let fields = cfg.citekey_formatter.fields.clone();
+        let fields = cfg
+            .citekey_formatter
+            .fields
+            .clone()
+            .expect("Need to define fields in config to format citekeys");
         if fields.is_empty() {
             return Err(eyre!(
                 "To format all citekeys, you need to provide {} values in the config file",
@@ -58,65 +62,54 @@ impl CitekeyFormatting {
         })
     }
 
+    /// Process the actual formatting. The citekey of every entry will be updated.
     pub fn do_formatting(&mut self) {
         for entry in self.bib_entries.iter_mut() {
-            let mut new_citekey = String::new();
-            for pattern in self.fields.iter() {
-                let (field, word_count, char_count, inner_delimiter, trailing_delimiter) =
-                    split_formatting_pat(pattern);
-                let formatted_field_str = {
-                    let mut formatted_str = String::new();
-                    let field = preformat_field(field, entry);
-                    // let field = if let Ok(val) = entry.get_as::<String>(field) {
-                    //     val
-                    // } else {
-                    //     eprintln!(
-                    //         "Unable to get field {} for entry {}",
-                    //         field.bright_red(),
-                    //         &entry.key.bold()
-                    //     );
-                    //     continue;
-                    // };
-                    // let field = entry.get_as::<String>(field).expect(&format!(
-                    //     "Couldn't find field {}",
-                    //     field.bold().bright_red()
-                    // ));
-                    let mut split_field = field.split_whitespace();
-                    let mut words_passed = 0;
-                    let word_count = if let Some(val) = word_count {
-                        val
+            entry.key = build_citekey(entry, &self.fields);
+        }
+    }
+}
+
+/// Build the citekey from the patterns defined in the config file
+fn build_citekey(entry: &Entry, pattern_fields: &[String]) -> String {
+    let mut new_citekey = String::new();
+    for pattern in pattern_fields.iter() {
+        let (field, word_count, char_count, inner_delimiter, trailing_delimiter) =
+            split_formatting_pat(pattern);
+        let formatted_field_str = {
+            let mut formatted_str = String::new();
+            let field = preformat_field(field, entry);
+            let mut split_field = field.split_whitespace();
+            let mut words_passed = 0;
+            let word_count = if let Some(val) = word_count {
+                val
+            } else {
+                field.split_whitespace().count()
+            };
+            loop {
+                if let Some(field_slice) = split_field.next() {
+                    formatted_str = formatted_str + format_word(field_slice, char_count);
+                    words_passed += 1;
+                    if word_count == words_passed {
+                        formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
+                        break;
                     } else {
-                        field.split_whitespace().count()
-                        // split_field.size_hint().0 + 1
-                    };
-                    dbg!(word_count);
-                    loop {
-                        if let Some(field_slice) = split_field.next() {
-                            formatted_str = formatted_str + format_word(field_slice, char_count);
-                            words_passed += 1;
-                            // if word_count.is_some_and(|count| count == words_passed) {
-                            if word_count == words_passed {
-                                formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
-                                break;
-                            } else {
-                                formatted_str = formatted_str + inner_delimiter.unwrap_or("")
-                            }
-                        } else {
-                            formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
-                            break;
-                        };
+                        formatted_str = formatted_str + inner_delimiter.unwrap_or("")
                     }
-                    formatted_str
+                } else {
+                    formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
+                    break;
                 };
-                new_citekey = new_citekey + &formatted_field_str;
             }
-            entry.key = new_citekey;
-        }
+            formatted_str
+        };
+        new_citekey = new_citekey + &formatted_field_str;
     }
+    new_citekey
 }
 
 /// Preformat some fields which are very common to be used in citekeys
-fn preformat_field(field: &str, entry: &mut Entry) -> String {
+fn preformat_field(field: &str, entry: &Entry) -> String {
     match field {
         "title" => {
             sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("NA".into()))
diff --git a/src/config.rs b/src/config.rs
index 8a333e4..a5df61c 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -149,7 +149,7 @@ pub struct Colors {
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct CitekeyFormatter {
-    pub fields: Vec<String>,
+    pub fields: Option<Vec<String>>,
     pub case: Option<CitekeyCase>,
 }
 
@@ -172,7 +172,7 @@ impl Default for BibiConfig {
             },
             colors: Self::dark_colors(),
             citekey_formatter: CitekeyFormatter {
-                fields: Vec::new(),
+                fields: None,
                 case: None,
             },
         }
@@ -202,7 +202,7 @@ impl BibiConfig {
                 Self::dark_colors()
             },
             citekey_formatter: CitekeyFormatter {
-                fields: Vec::new(),
+                fields: None,
                 case: None,
             },
         }
@@ -400,6 +400,8 @@ mod tests {
                     author_color = "38"
                     title_color = "37"
                     year_color = "135"
+
+                    [citekey_formatter]
                 "#,
             )?;
 
-- 
cgit v1.2.3


From c69b1789fabaf149916d160922d7026f2cbe33f1 Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Fri, 10 Oct 2025 14:57:53 +0200
Subject: implement const of ignored special chars for citekey formatting

* the list contains 33 special chars at the moment
* it will only affect already existing special chars in biblatex fields
* delimiters specified for citekey formatting are not affected
* char count is also not affected, ignored chars are not counted
---
 src/bibiman/citekeys.rs | 40 +++++++++++++++++++++-------------------
 src/config.rs           |  5 +++++
 tests/test-config.toml  |  2 +-
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'src/config.rs')

diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 5121741..7c06886 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -31,7 +31,10 @@ use owo_colors::{
 };
 use serde::{Deserialize, Serialize};
 
-use crate::{bibiman::sanitize::sanitize_single_string_fully, config::BibiConfig};
+use crate::{
+    bibiman::sanitize::sanitize_single_string_fully,
+    config::{BibiConfig, IGNORED_SPECIAL_CHARS},
+};
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum CitekeyCase {
@@ -354,26 +357,25 @@ fn preformat_field(field: &str, entry: &Entry) -> String {
 
 /// Cut of word at char count index if its set
 fn format_word(word: &str, count: Option<usize>) -> String {
-    if let Some(len) = count
-        && len < word.chars().count()
-    {
-        // Since chars can consist of multiple bytes, we need this more complex
-        // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...)
-        // instead of simple byte indexing
-        let mut word_slice = String::new();
-        let word_chars = word.chars();
-        let mut counter = 0;
-        for c in word_chars {
-            if counter == len {
-                break;
-            }
-            word_slice.push(c);
-            counter += 1;
+    // Since chars can consist of multiple bytes, we need this more complex
+    // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...)
+    // instead of simple byte indexing
+    let mut word_slice = String::new();
+    let word_chars = word.chars();
+    let mut counter = 0;
+    for c in word_chars {
+        if let Some(len) = count
+            && counter == len
+        {
+            break;
         }
-        word_slice
-    } else {
-        word.to_string()
+        if IGNORED_SPECIAL_CHARS.contains(&c) {
+            continue;
+        }
+        word_slice.push(c);
+        counter += 1;
     }
+    word_slice
 }
 
 /// Split a formatting pattern of kind
diff --git a/src/config.rs b/src/config.rs
index a5df61c..a4e89be 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -35,6 +35,11 @@ use crate::{
     cliargs::CLIArgs,
 };
 
+pub const IGNORED_SPECIAL_CHARS: [char; 33] = [
+    '?', '!', '\\', '\'', '.', '-', '–', ':', ',', '[', ']', '(', ')', '{', '}', '§', '$', '%',
+    '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"',
+];
+
 const DEFAULT_CONFIG: &str = r##"
 # [general]
 ## Default files/dirs which are loaded on startup
diff --git a/tests/test-config.toml b/tests/test-config.toml
index b484b69..558d216 100644
--- a/tests/test-config.toml
+++ b/tests/test-config.toml
@@ -61,5 +61,5 @@ custom_column = "series"
 # year_color = "135"
 
 [citekey_formatter]
-fields = [ "author;2;;-;_", "title;3;3;_;_", "year" ]
+fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ]
 case = "lowercase"
-- 
cgit v1.2.3


From 418d2f3874c8e86c4b58143115ee3d4181130f9c Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Fri, 10 Oct 2025 15:09:48 +0200
Subject: add dry-run information to --help function

---
 src/bibiman/citekeys.rs | 11 +++++++++++
 src/config.rs           | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'src/config.rs')

diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 7c06886..f7704fb 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -238,6 +238,17 @@ fn formatting_help() {
             ",
             "-h, --help".fg::<White>().bold()
         ),
+        formatdoc!(
+            "
+                \t{}
+                \tDon't apply any changes to the named files. Instead print all
+                \told citekeys and the formatted strings that would have been
+                \tapplied in the format: {} => {}
+            ",
+            "-d, --dry-run".fg::<White>().bold(),
+            "old_key".italic(),
+            "new_key".bold()
+        ),
         formatdoc! {"
                 \t{}
                 \tThe bibfile for which the citekey formatting should be processed.
diff --git a/src/config.rs b/src/config.rs
index a4e89be..b1c4b07 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -103,6 +103,21 @@ const DEFAULT_CONFIG: &str = r##"
 # author_color = "38"
 # title_color = "37"
 # year_color = "135"
+
+# [citekey_formatter]
+## Define the patterns for creating citekeys. Every item of the array consists of
+## five components separated by semicolons. Despite the field name every component
+## can be left blank:
+## - name of the biblatex field ("author", "title"...)
+## - number of max words from the given field
+## - number of chars used from each word
+## - delimiter to separate words of the same field
+## - trailing delimiter separating the current field from the following
+# fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ]
+
+## Convert chars to specified case. Possible values:
+## "upper", "uppercase", "lower", "lowercase"
+# case = "lowercase"
 "##;
 
 /// Main struct of the config file. Contains substructs/headings in toml
-- 
cgit v1.2.3


From 467851007e1861834326deee3116aa88fe839f5a Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Mon, 13 Oct 2025 15:45:53 +0200
Subject: Working proof of concept of citekey formatting

---
 CITEKEYS.md                           | 215 +++++++++++++++
 Cargo.lock                            |   7 +
 Cargo.toml                            |   1 +
 README.md                             |  20 ++
 src/bibiman/citekeys.rs               |  69 +++--
 src/bibiman/citekeys/citekey_utils.rs | 105 ++++----
 src/config.rs                         |  69 +++++
 tests/biblatex-test-citekeys.bib      | 476 ++++++++++++++++++++++++++++++++++
 tests/test-config.toml                |   5 +-
 9 files changed, 889 insertions(+), 78 deletions(-)
 create mode 100644 CITEKEYS.md
 create mode 100644 tests/biblatex-test-citekeys.bib

(limited to 'src/config.rs')

diff --git a/CITEKEYS.md b/CITEKEYS.md
new file mode 100644
index 0000000..912326a
--- /dev/null
+++ b/CITEKEYS.md
@@ -0,0 +1,215 @@
+# Formatting Citekeys<a name="formatting-citekeys"></a>
+
+<!-- mdformat-toc start --slug=github --maxlevel=6 --minlevel=1 -->
+
+- [Formatting Citekeys](#formatting-citekeys)
+  - [Settings](#settings)
+  - [Building Patterns](#building-patterns)
+  - [Ignore Lists and Char Case](#ignore-lists-and-char-case)
+  - [General Tips](#general-tipps)
+  - [Examples](#examples)
+
+<!-- mdformat-toc end -->
+
+`bibiman` offers the possibility to create new citekeys from the fields of
+BibLaTeX entries. This is done using an easy but powerful pattern-matching
+syntax.
+
+## Settings<a name="settings"></a>
+
+All settings for the citekey generation have to be configured in the used config
+file. The regular path is `XDG_CONFIG_DIR/bibiman/bibiman.toml`. But it can be
+set dynamically with the `-c`/`--config=` global option.
+
+Following values can be set through the config file. A detailed explanation for
+all fields follows below:
+
+```toml
+[citekey_formatter]
+fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ]
+case = "lowercase"
+ascii_only = true
+ignored_chars = [
+    "?", "!", "\\", "'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", "\"",
+]
+ignored_words = [
+    "the",
+    "a",
+    "an",
+    "of",
+    "for",
+    "in",
+    "at",
+    "to",
+    "and",
+    "der",
+    "die",
+    "das",
+    "ein",
+    "eine",
+    "eines",
+    "des",
+    "auf",
+    "und",
+    "für",
+    "vor",
+]
+```
+
+## Building Patterns<a name="building-patterns"></a>
+
+The main aspect for generating citekeys are the field patterns. They can be set
+through an array in the config file where every array-item represents a single
+BibLaTeX field to be used for generating a part of the citekey.
+
+Every field pattern consists of the following five parts separated by
+semicolons. The general pattern looks like this (every subfield is explained
+below):
+
+*biblatex field name* **;** *max word count* **;** *max char count* **;** *inner delimiter* **;** *trailing delimiter*
+
+- **BibLaTeX field**: the first part represents the field name which value
+  should be used to generate the content part of the citekey. Theoretically, any
+  BibLaTeX field can be selected by name. But there are some fields which are
+  much more common than others; e.g. `author`, `editor`, `title`, `year`/`date`
+  or `entrytype`. Those very common fields are preprocessed; meaning that for
+  instance LaTeX macros are fully stripped from the strings, or that `editor` is
+  a fallback value for `author` if the latter is empty (however, setting
+  `editor` explicitly is still possible). Also using `year` will parse the
+  `date` field too, to ensure a year number.
+- **Max Word**: Defines the maximum number of words to be used from the named
+  field. E.g. if the title consists of five words, and the max counter is set to
+  `3`, only the first three words will be used.
+- **Max Chars/Word**: Defines how many chars, counting from the start, of each
+  word will be used to build the citekey. If for instance the value is set to
+  `5`, only the first five chars of any word will be used. Thus, "archaeology"
+  would be stripped down to "archa".
+- **Inner Delimiter**: Sets the delimiter char used between words from the
+  currently named field; e.g. to separate the words of the `title` field.
+- **Trailing Delimiter**: Sets the delimiter which separates the current fields
+  value from the following. This delimiter is only printed if the following
+  field has some content.
+
+For example, to use the `title` field, print maximal three words and of those
+only the first five chars, single words separated by underscore and the whole
+field separated by equal sign, insert the following pattern field into the
+`fields` array:
+
+`title;3;5;_;=`
+
+Except the BibLaTeX field name, all other parts of the pattern can be left
+blank. If the field name is the only value set, semicolon delimiters are also
+not necessary. But if only one of the following parts should be set, all
+delimiters need to be used. E.g. those are both valid: `title` or `title;;;_;=`.
+The first would print all words of the title, no matter the length, not
+separated by any char. The last would also print all words of the title, but
+single words separated by underscores and the whole pattern value separated from
+the following by an equal sign. This is not valid: `title;;_` since `bibiman`
+can't know if the underscore means a delimiter (and which) or the max char
+count.
+
+The pattern array inside the config file takes multiple pattern fields like the
+preceding. This allows an elaborate citekey pattern which takes into account
+multiple fields.
+
+## Ignore Lists and Char Case<a name="ignore-lists-and-char-case"></a>
+
+Beside the field patterns there are some other options to define how citekeys
+should be built.
+
+`ascii_only=<BOOL>`
+: If set to `true`, which is the default, non-ascii chars are mapped to their
+  ascii equivalent. For example, the German `ä` would be mapped to `a`. The
+  Turkish `ş` or Greek `σ`/`ς` would be mapped to `s`. If set to `false` all are
+  kept as they are. But this could lead to errors running LaTeX on the file.
+
+`case=<CASE>`
+: If used, sets the case of the chars in the citekey. Valid values are
+  `uppercase`, `lowercase` or `camelcase`. The first two should be clear, the
+  latter means typical camel case also beginning the *first word* with an
+  uppercase letter; also referred to as upper camel case or Pascal case.
+
+`ignored_chars=<ARRAY>`
+: Defines chars which should be ignored during parsing (i.e. not printed).
+  The default list contains 33 special chars and is part of the default config
+  file (commented out). Be aware, setting this key will completely
+  overwrite the default list!
+
+`ignored_words=<ARRAY>`
+: A list of words which should be ignored parsing field values. The default list
+  contains about 20 very commonly used words in English and German; like
+  articles, pronouns or connector words. Like with `ignored_chars` setting this
+  key will completely overwrite the default list!
+
+## General Tips<a name="general-tipps"></a>
+
+- Most importantly: *always use the **`--dry-run`** option first*! This will
+  print a list of old and new values for all citekeys in the file without
+  changing anything.
+- After finding a good overall pattern, *use the `--output=` option* to create a
+  new file and don't overwrite your existent file. Thus, your original file
+  isn't broken if the key formatter produces some unwanted output.
+- Even though very long patterns are possible, they are not encouraged, since
+  they bloat the bibfiles.
+- The same applies to *too short* patterns; if the pattern is too unspecific,
+  it bears the risk of producing duplicates (e.g. single author and year only).
+  But the citekey generator will not check for duplicates!
+- It is possible to keep special chars and use them as delimiters. But this
+  might cause problems with other programs and CLI tools in particular, since many
+  special chars are reserved for shell operations. For instance, it will very
+  likely break the note file feature of `bibiman` which doesn't accept many
+  special chars.
+
+## Examples<a name="examples"></a>
+
+To make the process more clear a few examples might help. Following bibfile is
+assumed:
+
+```latex
+@article{Bos2023,
+    title         = {{LaTeX}, metadata, and publishing workflows},
+    author        = {Bos, Joppe W. and {McCurley}, Kevin S.},
+    year          = {2023},
+    month         = apr,
+    journal       = {arXiv},
+    number        = {{arXiv}:2301.08277},
+    doi           = {10.48550/arXiv.2301.08277},
+    url           = {http://arxiv.org/abs/2301.08277},
+    urldate       = {2023-08-22},
+    note          = {type: article},
+}
+@book{Bhambra2021,
+    title         = {Colonialism and \textbf{Modern Social Theory}},
+    author        = {Bhambra, Gurminder K. and Holmwood, John},
+    location      = {Cambridge and Medford},
+    publisher     = {Polity Press},
+    date          = {2021},
+}
+```
+
+And the following values set in the config file:
+
+```toml
+fields = [
+  # Just print the whole entrytype and a colon as trailing delimiter
+  "entrytype;;;;:", 
+  # Print all author names in full length, names separated by dash,
+  # the whole field by underscore
+  "author;;;-;_", 
+  # Print first 4 words of title, first 3 chars of every word only. Title words
+  # separated by equal sign, the whole field by underscore
+  "title;4;3;=;_", 
+  # Print all words of location, but only first 4 chars of every word. Single words
+  # separated by colon, whole field by underscore
+  "location;;4;:;_", 
+  # Just print the whole year
+  "year",
+]
+case = "lowercase"
+ascii_only = true
+```
+
+The combination of these settings will produce the following citekeys:
+
+- **`article:bos-mccurley_lat=met=pub=wor_2023`**
+- **`book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021`**
diff --git a/Cargo.lock b/Cargo.lock
index a27636e..0adb4e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -103,6 +103,7 @@ dependencies = [
  "biblatex",
  "color-eyre",
  "crossterm",
+ "deunicode",
  "dirs",
  "editor-command",
  "figment",
@@ -323,6 +324,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "deunicode"
+version = "1.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04"
+
 [[package]]
 name = "dirs"
 version = "5.0.1"
diff --git a/Cargo.toml b/Cargo.toml
index abf1eee..0c07c51 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ owo-colors = "4.2.2"
 logos = "0.15.1"
 phf = { version = "0.13.1", features = ["macros"] }
 indoc = "2.0.6"
+deunicode = "1.6.2"
 
 [workspace.metadata.cross.target.aarch64-unknown-linux-gnu]
 # Install libssl-dev:arm64, see <https://github.com/cross-rs/cross/blob/main/docs/custom_images.md#adding-dependencies-to-existing-images>
diff --git a/README.md b/README.md
index 4929509..3fb81c8 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,11 @@
     - [Ubuntu/Debian](#ubuntudebian)
     - [Void Linux](#void-linux)
   - [Usage](#usage)
+    - [CLI for citekey formatting](#cli-for-citekey-formatting)
   - [Configuration](#configuration)
     - [Location of Config File](#location-of-config-file)
     - [General Configuration](#general-configuration)
+    - [Citekey formatting](#citekey-formatting)
     - [Color Configuration](#color-configuration)
   - [Features](#features)
   - [Keybindings](#keybindings)
@@ -196,6 +198,13 @@ bibman tests/multi-files/
 bibiman tests/biblatex-test.bib tests/multi-files/
 ```
 
+### CLI for citekey formatting<a name="cli-for-citekey-formatting"></a>
+
+Besides the TUI, `bibiman` can format and replace citekeys. To make use of this
+feature run the program with the `format-citekeys` subcommand. For more
+information on this use `bibiman format-citekeys --help` and the
+[docs](./CITEKEYS.md).
+
 ## Configuration<a name="configuration"></a>
 
 ### Location of Config File<a name="location-of-config-file"></a>
@@ -268,6 +277,11 @@ note_symbol = "󰧮"
 ## Possible values are "journaltitle", "organization", "instituion", "publisher"
 ## and "pubtype" (which is the default)
 custom_column = "pubtype"
+
+[citekey_formatter]
+fields = []
+ascii_only = true
+case = "lowercase"
 ```
 
 `bibfiles`
@@ -326,6 +340,12 @@ custom_column = "pubtype"
   good advice to use a rather wide terminal window when using a value like
   `journaltitle`.
 
+### Citekey formatting<a name="citekey-formatting"></a>
+
+`bibiman` now also offers a citekey generating feature. This allows you to
+reformat all citekeys based on an elaborate pattern matching syntax. For further
+information and examples see the [docs](CITEKEYS.md).
+
 ### Color Configuration<a name="color-configuration"></a>
 
 Furthermore, it is now possible to customize the colors. The following values
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 2f56947..0cec28e 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize};
 
 use crate::{
     bibiman::citekeys::citekey_utils::{build_citekey, formatting_help},
-    config::BibiConfig,
+    config::{BibiConfig, IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
 };
 
 mod citekey_utils;
@@ -60,6 +60,7 @@ pub(crate) struct CitekeyFormatting {
     case: Option<CitekeyCase>,
     old_new_keys_map: Vec<(String, String)>,
     dry_run: bool,
+    ascii_only: bool,
 }
 
 impl CitekeyFormatting {
@@ -69,14 +70,15 @@ impl CitekeyFormatting {
     ) -> color_eyre::Result<()> {
         let mut formatter = CitekeyFormatting::default();
 
-        formatter.fields = cfg
-            .citekey_formatter
-            .fields
-            .clone()
-            .ok_or_eyre("Need to define fields correctly in config file")?;
+        formatter.fields = cfg.citekey_formatter.fields.clone().ok_or_eyre(format!(
+            "Need to define {} correctly in config file",
+            "citekey pattern fields".red()
+        ))?;
 
         formatter.case = cfg.citekey_formatter.case.clone();
 
+        formatter.ascii_only = cfg.citekey_formatter.ascii_only;
+
         if formatter.fields.is_empty() {
             return Err(eyre!(
                 "To format all citekeys, you need to provide {} values in the config file",
@@ -105,13 +107,26 @@ impl CitekeyFormatting {
         formatter.bib_entries = Bibliography::parse(&bibstring)
             .map_err(|e| eyre!("Couldn't parse bibfile due to {}", e.kind))?;
 
+        let ignored_chars = if let Some(chars) = &cfg.citekey_formatter.ignored_chars {
+            chars.as_slice()
+        } else {
+            IGNORED_SPECIAL_CHARS.as_slice()
+        };
+
+        let ignored_words = if let Some(words) = &cfg.citekey_formatter.ignored_words {
+            words.as_slice()
+        } else {
+            &*IGNORED_WORDS.as_slice()
+        };
+
         formatter
-            .do_formatting()
+            .do_formatting(ignored_chars, ignored_words)
             .rev_sort_new_keys_by_len()
             .update_file()?;
 
         Ok(())
     }
+
     /// Start Citekey formatting with building a new instance of `CitekeyFormatting`
     /// Formatting is processed file by file, because `bibman` can handle
     /// multi-file setups.
@@ -144,16 +159,24 @@ impl CitekeyFormatting {
             case: cfg.citekey_formatter.case.clone(),
             old_new_keys_map: Vec::new(),
             dry_run: false,
+            ascii_only: cfg.citekey_formatter.ascii_only,
         })
     }
 
     /// Process the actual formatting. The citekey of every entry will be updated.
-    pub fn do_formatting(&mut self) -> &mut Self {
+    pub fn do_formatting(&mut self, ignored_chars: &[char], ignored_words: &[String]) -> &mut Self {
         let mut old_new_keys: Vec<(String, String)> = Vec::new();
         for entry in self.bib_entries.iter() {
             old_new_keys.push((
                 entry.key.clone(),
-                build_citekey(entry, &self.fields, self.case.as_ref()),
+                build_citekey(
+                    entry,
+                    &self.fields,
+                    self.case.as_ref(),
+                    self.ascii_only,
+                    ignored_chars,
+                    ignored_words,
+                ),
             ));
         }
 
@@ -215,12 +238,15 @@ mod tests {
 
     use biblatex::Bibliography;
 
-    use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting};
+    use crate::{
+        bibiman::citekeys::{CitekeyCase, CitekeyFormatting},
+        config::{IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
+    };
 
     #[test]
     fn format_citekey_test() {
         let src = r"
-        @article{bos_latex_metadata_and_publishing_workflows_2023,
+        @article{Bos2023,
             title         = {{LaTeX}, metadata, and publishing workflows},
             author        = {Bos, Joppe W. and {McCurley}, Kevin S.},
             year          = {2023},
@@ -232,7 +258,7 @@ mod tests {
             urldate       = {2023-08-22},
             note          = {type: article},
         }
-        @book{bhambra_colonialism_social_theory_2021,
+        @book{Bhambra2021,
             title         = {Colonialism and \textbf{Modern Social Theory}},
             author        = {Bhambra, Gurminder K. and Holmwood, John},
             location      = {Cambridge and Medford},
@@ -247,29 +273,24 @@ mod tests {
             fields: vec![
                 "entrytype;;;;:".into(),
                 "author;;;-;_".into(),
-                "title;4;3;_;_".into(),
+                "title;4;3;=;_".into(),
                 "location;;4;:;_".into(),
                 "year".into(),
             ],
-            case: None,
+            case: Some(CitekeyCase::Lower),
             old_new_keys_map: Vec::new(),
             dry_run: false,
+            ascii_only: true,
         };
-        let _ = formatting_struct.do_formatting();
+        let _ = formatting_struct
+            .do_formatting(IGNORED_SPECIAL_CHARS.as_slice(), &*IGNORED_WORDS.as_slice());
         assert_eq!(
             formatting_struct.old_new_keys_map.get(0).unwrap().1,
-            "article:Bos-McCurley_LaT_met_and_pub_Empt_2023"
+            "article:bos-mccurley_lat=met=pub=wor_2023"
         );
         assert_eq!(
             formatting_struct.old_new_keys_map.get(1).unwrap().1,
-            "book:Bhambra-Holmwood_Col_and_Mod_Soc_Camb:and:Medf_2021"
-        );
-        formatting_struct.case = Some(CitekeyCase::Lower);
-        let _ = formatting_struct.do_formatting().rev_sort_new_keys_by_len();
-        // now the longer citekey is processed first and its in lowercase!
-        assert_eq!(
-            formatting_struct.old_new_keys_map.get(0).unwrap().1,
-            "book:bhambra-holmwood_col_and_mod_soc_camb:and:medf_2021"
+            "book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021"
         );
     }
 
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
index ee2c849..5f70224 100644
--- a/src/bibiman/citekeys/citekey_utils.rs
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -16,21 +16,14 @@
 /////
 
 use biblatex::{ChunksExt, Entry, Type};
+use deunicode::deunicode;
 use indoc::formatdoc;
 use owo_colors::{
     OwoColorize,
     colors::{BrightBlue, Green, White},
 };
 
-use crate::{
-    bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully},
-    config::IGNORED_SPECIAL_CHARS,
-};
-
-const IGNORE_WORDS: [&str; 20] = [
-    "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine",
-    "eines", "des", "auf", "und", "für", "vor",
-];
+use crate::bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully};
 
 pub(super) fn formatting_help() {
     let help = vec![
@@ -104,6 +97,9 @@ pub(super) fn build_citekey(
     entry: &Entry,
     pattern_fields: &[String],
     case: Option<&CitekeyCase>,
+    ascii_only: bool,
+    ignored_chars: &[char],
+    ignored_words: &[String],
 ) -> String {
     // mut string the citekey is built from
     let mut new_citekey = String::new();
@@ -114,7 +110,7 @@ pub(super) fn build_citekey(
     // loop over pattern fields process them
     'field_loop: for pattern in pattern_fields.iter() {
         // parse single values from pattern field
-        let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) =
+        let (field_name, max_words, max_chars, inner_delimiter, cur_trailing_delimiter) =
             split_formatting_pat(pattern);
 
         // built the part of the citekey from the current pattern field
@@ -126,16 +122,9 @@ pub(super) fn build_citekey(
 
             // split at whitespaces, count fields and set counter for processed
             // splits
-            let mut split_field = field.split_whitespace();
+            let split_field = field.split_whitespace();
             let mut words_passed = 0;
             let field_count = field.split_whitespace().count();
-            let word_count = if let Some(val) = word_count
-                && val <= field_count
-            {
-                val
-            } else {
-                field_count
-            };
 
             // If there is a trailing delimiter from the previous field, push it
             if let Some(del) = trailing_delimiter {
@@ -152,47 +141,57 @@ pub(super) fn build_citekey(
             }
 
             // loop over single parts of current field and add correct delimiter
-            'word_loop: loop {
-                // process the single slices and add correct delimiter
-                if let Some(field_slice) = split_field.next() {
-                    // Create word slice char by char. We need to loop over chars
-                    // instead of a simple bytes index to also catch chars which
-                    // consist of more than one byte (äöüøæ etc...)
-                    let mut word_slice = String::new();
-                    let word_chars = field_slice.chars();
-                    let mut counter = 0;
-                    'char_loop: for mut c in word_chars {
-                        // If camelcase is set, force first char of word to uppercase
-                        if counter == 0 && case == Some(&CitekeyCase::Camel) {
-                            c = c.to_ascii_uppercase()
-                        }
-                        if let Some(len) = char_count
-                            && counter == len
-                        {
-                            break 'char_loop;
-                        }
-                        // if a word slice contains a special char, skip it
-                        if IGNORED_SPECIAL_CHARS.contains(&c) {
-                            continue 'char_loop;
-                        }
+            // process the single slices and add correct delimiter
+            'word_loop: for (idx, field_slice) in split_field.enumerate() {
+                // if the current slice is a common word from the ignore list,
+                // skip it.
+                if ignored_words.contains(&field_slice.to_lowercase()) {
+                    continue;
+                }
+
+                // Create word slice char by char. We need to loop over chars
+                // instead of a simple bytes index to also catch chars which
+                // consist of more than one byte (äöüøæ etc...)
+                let mut word_slice = String::new();
+                let word_chars = field_slice.chars();
+                let mut counter = 0;
+                'char_loop: for mut c in word_chars {
+                    // If camelcase is set, force first char of word to uppercase
+                    if counter == 0 && case == Some(&CitekeyCase::Camel) {
+                        c = c.to_ascii_uppercase()
+                    }
+                    if let Some(len) = max_chars
+                        && counter >= len
+                    {
+                        break 'char_loop;
+                    }
+                    // if a word slice contains a special char, skip it
+                    if ignored_chars.contains(&c) {
+                        continue 'char_loop;
+                    }
+                    // if non-ascii chars should be mapped, check if needed and do it
+                    if let Some(chars) = deunicode::deunicode_char(c)
+                        && ascii_only
+                    {
+                        word_slice.push_str(chars);
+                        counter += chars.len();
+                    } else {
                         word_slice.push(c);
                         counter += 1;
                     }
-                    // Don't count empty slices and don't add delimiter to those
-                    if !word_slice.is_empty() {
-                        formatted_str = formatted_str + &word_slice;
-                        words_passed += 1;
-                        if word_count == words_passed {
-                            break 'word_loop;
-                        } else {
-                            formatted_str = formatted_str + inner_delimiter.unwrap_or("");
-                        }
+                }
+                // Don't count empty slices and don't add delimiter to those
+                if !word_slice.is_empty() {
+                    formatted_str = formatted_str + &word_slice;
+                    words_passed += 1;
+                    if max_words.is_some_and(|max| max == words_passed) || idx + 1 == field_count {
+                        break 'word_loop;
                     } else {
-                        continue 'word_loop;
+                        formatted_str = formatted_str + inner_delimiter.unwrap_or("");
                     }
                 } else {
-                    break 'word_loop;
-                };
+                    continue 'word_loop;
+                }
             }
             formatted_str
         };
diff --git a/src/config.rs b/src/config.rs
index b1c4b07..7c1a0f8 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -20,6 +20,7 @@ use std::{
     io::{Write, stdin},
     path::PathBuf,
     str::FromStr,
+    sync::LazyLock,
 };
 
 use color_eyre::{eyre::Result, owo_colors::OwoColorize};
@@ -40,6 +41,31 @@ pub const IGNORED_SPECIAL_CHARS: [char; 33] = [
     '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"',
 ];
 
+pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| {
+    vec![
+        String::from("the"),
+        String::from("a"),
+        String::from("an"),
+        String::from("of"),
+        String::from("for"),
+        String::from("in"),
+        String::from("at"),
+        String::from("to"),
+        String::from("and"),
+        String::from("der"),
+        String::from("die"),
+        String::from("das"),
+        String::from("ein"),
+        String::from("eine"),
+        String::from("eines"),
+        String::from("des"),
+        String::from("auf"),
+        String::from("und"),
+        String::from("für"),
+        String::from("vor"),
+    ]
+});
+
 const DEFAULT_CONFIG: &str = r##"
 # [general]
 ## Default files/dirs which are loaded on startup
@@ -118,6 +144,40 @@ const DEFAULT_CONFIG: &str = r##"
 ## Convert chars to specified case. Possible values:
 ## "upper", "uppercase", "lower", "lowercase"
 # case = "lowercase"
+
+## Map all unicode chars to their pure ascii equivalent
+# ascii_only = true
+
+## List of special chars that'll be ignored when building citekeys.
+## A custom list will overwrite the default list
+# ignored_chars = [
+#     "?", "!", "\\", "'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", "\"",
+# ]
+
+## List of words that'll be ignored when building citekeys.
+## A custom list will overwrite the default list
+# ignored_words = [
+#     "the",
+#     "a",
+#     "an",
+#     "of",
+#     "for",
+#     "in",
+#     "at",
+#     "to",
+#     "and",
+#     "der",
+#     "die",
+#     "das",
+#     "ein",
+#     "eine",
+#     "eines",
+#     "des",
+#     "auf",
+#     "und",
+#     "für",
+#     "vor",
+# ]
 "##;
 
 /// Main struct of the config file. Contains substructs/headings in toml
@@ -171,6 +231,9 @@ pub struct Colors {
 pub struct CitekeyFormatter {
     pub fields: Option<Vec<String>>,
     pub case: Option<CitekeyCase>,
+    pub ascii_only: bool,
+    pub ignored_chars: Option<Vec<char>>,
+    pub ignored_words: Option<Vec<String>>,
 }
 
 impl Default for BibiConfig {
@@ -194,6 +257,9 @@ impl Default for BibiConfig {
             citekey_formatter: CitekeyFormatter {
                 fields: None,
                 case: None,
+                ascii_only: true,
+                ignored_chars: None,
+                ignored_words: None,
             },
         }
     }
@@ -224,6 +290,9 @@ impl BibiConfig {
             citekey_formatter: CitekeyFormatter {
                 fields: None,
                 case: None,
+                ascii_only: true,
+                ignored_chars: None,
+                ignored_words: None,
             },
         }
     }
diff --git a/tests/biblatex-test-citekeys.bib b/tests/biblatex-test-citekeys.bib
new file mode 100644
index 0000000..9767f97
--- /dev/null
+++ b/tests/biblatex-test-citekeys.bib
@@ -0,0 +1,476 @@
+@set{set,
+    entryset = {article:herrmann-ofele_carboc=carben=as_2006,article:aksin-turkmen_effect=immobi=on_2006,article:yoon-ryu_pallad=pincer=comple_2006},
+    annotation = {A \texttt{set} with three members.},
+}
+
+@set{set,
+    entryset = {article:glashow_partia=symmet=weak_1961,article:weinberg_model=lepton_1967,salam},
+    annotation = {A \texttt{set} with three members discussing the standard
+                  model of particle physics.},
+}
+
+@collection{collection:matuz-miller_contem=litera=critic_1990gale,
+    title = {Contemporary Literary Criticism},
+    year = {1990},
+    location = {Detroit},
+    publisher = {Gale},
+    volume = {61},
+    pages = {204--208},
+    editor = {Matuz, Roger and Miller, Helen},
+    keywords = {narration},
+    langid = {english},
+    langidopts = {variant=american},
+    annotation = {A \texttt{collection} entry providing the excerpt information
+                  for the \texttt{article:doody_heming=style=jakes_1974} entry. Note the format of the \texttt{
+                  pages} field},
+}
+
+@article{article:aksin-turkmen_effect=immobi=on_2006,
+    title = {Effect of immobilization on catalytic characteristics of saturated
+             {Pd-N}-heterocyclic carbenes in {Mizoroki-Heck} reactions},
+    author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok , Levent and
+              { \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\" u}y{ \"u}kg{\"u}
+              ng{ \" o}r, Orhan and {\"O}zkal, Erhan},
+    volume = {691},
+    number = {13},
+    pages = {3027--3036},
+    journaltitle = jomch,
+    date = {2006},
+    indextitle = {Effect of immobilization on catalytic characteristics},
+}
+
+@article{article:angenendt_honore=salvat=vom_2002,
+    title = {In Honore Salvatoris~-- Vom Sinn und Unsinn der Patrozinienkunde},
+    shorttitle = {In Honore Salvatoris},
+    author = {Angenendt, Arnold},
+    volume = {97},
+    pages = {431--456, 791--823},
+    journaltitle = {Revue d'Histoire Eccl{\'e}siastique},
+    date = {2002},
+    langid = {german},
+    indextitle = {In Honore Salvatoris},
+    annotation = {A German article in a French journal. Apart from that, a
+                  typical \texttt{article} entry. Note the \texttt{indextitle}
+                  field},
+}
+
+@book{book:aristotle_de=anima_1907cambr#unive#press,
+    title = {De Anima},
+    author = {Aristotle},
+    location = {Cambridge},
+    publisher = cup,
+    date = {1907},
+    editor = {Hicks, Robert Drew},
+    keywords = {primary, ancient, philosophy, athens},
+    langid = {english},
+    langidopts = {variant=british},
+    annotation = {A \texttt{book} entry with an \texttt{author} and an \texttt{
+                  editor}},
+}
+
+@book{book:aristotle_physic_1929g#p#putna,
+    title = {Physics},
+    shorttitle = {Physics},
+    author = {Aristotle},
+    location = {New York},
+    publisher = {G. P. Putnam},
+    url = {https://www.infobooks.org/authors/classic/aristotle-books/#Physic},
+    date = {1929},
+    translator = {Wicksteed, P. H. and Cornford, F. M.},
+    keywords = {primary, ancient, philosophy},
+    langid = {english},
+    langidopts = {variant=american},
+    file = {~/Documents/coding/projects/bibiman/tests/book:aristotle_physic_1929g#p#putna.pdf},
+    annotation = {A \texttt{book} entry with a \texttt{translator} field},
+    abstract = {The Physics is a work by Aristotle dedicated to the study of
+                nature. Regarded by Heidegger as "the fundamental work of Western
+                philosophy", it presents the renowned distinction between the
+                four types of cause, as well as reflections on chance, motion,
+                infinity, and other fundamental concepts. It is here that
+                Aristotle sets out his celebrated paradox of time.},
+}
+
+@book{book:aristotle_poetic_1968clare#press,
+    title = {Poetics},
+    shorttitle = {Poetics},
+    author = {Aristotle},
+    location = {Oxford},
+    publisher = {Clarendon Press},
+    series = {Clarendon {Aristotle}},
+    date = {1968},
+    editor = {Lucas, D. W.},
+    keywords = {primary},
+    langid = {english},
+    langidopts = {variant=british},
+    annotation = {A \texttt{book} entry with an \texttt{author} and an \texttt{
+                  editor} as well as a \texttt{series} field},
+}
+
+@mvbook{mvbook:aristotle_rhetor=aristo=with_1877cambr#unive#press,
+    title = {The \textbf{Rhetoric} of {Aristotle} with a commentary by the late {Edward
+             Meredith Cope}},
+    shorttitle = {Rhetoric},
+    author = {Aristotle},
+    publisher = cup,
+    date = {1877},
+    editor = {Cope, Edward Meredith},
+    commentator = {Cope, Edward Meredith},
+    volumes = {3},
+    keywords = {primary},
+    langid = {english},
+    langidopts = {variant=british},
+    sorttitle = {Rhetoric of Aristotle},
+    indextitle = {Rhetoric of {Aristotle}, The},
+    annotation = {A commented edition. Note the concatenation of the \texttt{
+                  editor} and \texttt{commentator} fields as well as the \texttt{
+                  volumes}, \texttt{sorttitle}, and \texttt{indextitle} fields},
+}
+
+@book{book:augustine_hetero=cataly=synthe_1995marce#dekke,
+    title = {Heterogeneous catalysis for the synthetic \textit{chemist}},
+    shorttitle = {Heterogeneous catalysis},
+    author = {Augustine, Robert L.},
+    location = {New York},
+    publisher = {Marcel Dekker},
+    date = {1995},
+    langid = {english},
+    langidopts = {variant=american},
+    annotation = {A plain \texttt{book} entry},
+    keywords = {chemistry},
+}
+
+@book{book:averroes_epistl=on=possib_1982jewis#theol#semin#ameri,
+    title = {The Epistle on the Possibility of Conjunction with the Active
+             Intellect by {Ibn Rushd} with the Commentary of {Moses Narboni}},
+    shorttitle = {Possibility of Conjunction},
+    author = {Averroes},
+    location = {New York},
+    publisher = {Jewish Theological Seminary of America},
+    series = {Moreshet: Studies in {Jewish} History, Literature and Thought},
+    number = {7},
+    date = {1982},
+    editor = {Bland, Kalman P.},
+    translator = {Bland, Kalman P.},
+    keywords = {primary},
+    langid = {english},
+    langidopts = {variant=american},
+    indextitle = {Epistle on the Possibility of Conjunction, The},
+    annotation = {A \texttt{book} entry with a \texttt{series} and a \texttt{
+                  number}. Note the concatenation of the \texttt{editor} and
+                  \texttt{translator} fields as well as the \texttt{indextitle}
+                  field},
+}
+
+@article{article:baez-lauda_higher=algebr=v_2004,
+    title = {Higher-Dimensional Algebra {V}: 2-Groups},
+    author = {Baez, John C. and Lauda, Aaron D.},
+    volume = {12},
+    pages = {423--491},
+    journaltitle = {Theory and Applications of Categories},
+    date = {2004},
+    version = {3},
+    eprint = {math/0307200v3},
+    eprinttype = {arxiv},
+    langid = {english},
+    keywords = {math},
+    langidopts = {variant=american},
+    annotation = {An \texttt{article} with \texttt{eprint} and \texttt{
+                  eprinttype} fields. Note that the arXiv reference is
+                  transformed into a clickable link if \texttt{hyperref} support
+                  has been enabled. Compare \texttt{baez\slash online}, which is
+                  the same item given as an \texttt{online} entry},
+}
+
+@article{article:bertram-wentworth_gromov=invari=holomo_1996,
+    title = {Gromov invariants for holomorphic maps on {Riemann} surfaces},
+    shorttitle = {Gromov invariants},
+    author = {Bertram, Aaron and Wentworth, Richard},
+    volume = {9},
+    number = {2},
+    pages = {529--571},
+    journaltitle = jams,
+    date = {1996},
+    langid = {english},
+    langidopts = {variant=american},
+    annotation = {An \texttt{article} entry with a \texttt{volume} and a \texttt
+                  {number} field},
+}
+
+@article{article:doody_heming=style=jakes_1974,
+    title = {Hemingway's Style and {Jake's} Narration},
+    author = {Doody, Terrence},
+    year = {1974},
+    journal = {The Journal of Narrative Technique},
+    volume = {4},
+    number = {3},
+    pages = {212--225},
+    langid = {english},
+    langidopts = {variant=american},
+    related = {matuz:article:doody_heming=style=jakes_1974},
+    relatedstring = {\autocap{e}xcerpt in},
+    annotation = {An \texttt{article} entry cited as an excerpt from a \texttt{
+                  collection} entry. Note the format of the \texttt{related} and
+                  \texttt{relatedstring} fields},
+}
+
+@article{article:gillies_herder=prepar=goethe_1933,
+    title = {Herder and the Preparation of {Goethe's} Idea of World Literature},
+    author = {Gillies, Alexander},
+    series = {newseries},
+    volume = {9},
+    pages = {46--67},
+    journaltitle = {Publications of the English Goethe Society},
+    date = {1933},
+    langid = {english},
+    langidopts = {variant=british},
+    annotation = {An \texttt{article} entry with a \texttt{series} and a \texttt
+                  {volume} field. Note the format of the \texttt{series} field
+                  in the database file},
+}
+
+@article{article:glashow_partia=symmet=weak_1961,
+    title = {Partial Symmetries of Weak Interactions},
+    author = {Glashow, Sheldon},
+    volume = {22},
+    pages = {579--588},
+    journaltitle = {Nucl.~Phys.},
+    date = {1961},
+}
+
+@article{article:herrmann-ofele_carboc=carben=as_2006,
+    title = {A carbocyclic carbene as an efficient catalyst ligand for {C--C}
+             coupling reactions},
+    author = {Herrmann, Wolfgang A. and {\"O}fele, Karl and Schneider, Sabine K.
+              and Herdtweck, Eberhardt and Hoffmann, Stephan D.},
+    volume = {45},
+    number = {23},
+    pages = {3859--3862},
+    journaltitle = anch-ie,
+    date = {2006},
+    indextitle = {Carbocyclic carbene as an efficient catalyst, A},
+}
+
+@article{article:hostetler-wingate_alkane=gold=cluste_1998,
+    title = {Alkanethiolate gold cluster molecules with core diameters from 1.5
+             to 5.2~{nm}},
+    shorttitle = {Alkanethiolate gold cluster molecules},
+    author = {Hostetler, Michael J. and Wingate, Julia E. and Zhong, Chuan-Jian
+              and Harris, Jay E. and Vachet, Richard W. and Clark, Michael R. and
+              Londono, J. David and Green, Stephen J. and Stokes, Jennifer J. and
+              Wignall, George D. and Glish, Gary L. and Porter, Marc D. and Evans
+              , Neal D. and Murray, Royce W.},
+    volume = {14},
+    number = {1},
+    pages = {17--30},
+    journaltitle = {Langmuir},
+    date = {1998},
+    subtitle = {Core and monolayer properties as a function of core size},
+    langid = {english},
+    langidopts = {variant=american},
+    indextitle = {Alkanethiolate gold cluster molecules},
+    annotation = {An \texttt{article} entry with \arabic{author} authors. By
+                  default, long author and editor lists are automatically
+                  truncated. This is configurable},
+}
+
+@article{article:kastenholz-hunenberger_comput=method=ionic_2006,
+    title = {Computation of methodology\hyphen independent ionic solvation free
+             energies from molecular simulations},
+    author = {Kastenholz, M. A. and H{\"u}nenberger, Philippe H.},
+    volume = {124},
+    doi = {10.1063/1.2172593},
+    journaltitle = jchph,
+    date = {2006},
+    subtitle = {{I}. {The} electrostatic potential in molecular liquids},
+    eid = {124106},
+    langid = {english},
+    langidopts = {variant=american},
+    indextitle = {Computation of ionic solvation free energies},
+    annotation = {An \texttt{article} entry with an \texttt{eid} and a \texttt{
+                  doi} field. Note that the \textsc{doi} is transformed into a
+                  clickable link if \texttt{hyperref} support has been enabled},
+    abstract = {The computation of \texttt{ionic} solvation free energies from atomistic
+                simulations is a surprisingly difficult problem that has found no
+                satisfactory solution for more than 15 years. The reason is that
+                the charging free energies evaluated from such simulations are
+                affected by very large errors. One of these is related to the
+                choice of a specific convention for summing up the contributions
+                of solvent charges to the electrostatic potential in the ionic
+                cavity, namely, on the basis of point charges within entire
+                solvent molecules (M scheme) or on the basis of individual point
+                charges (P scheme). The use of an inappropriate convention may
+                lead to a charge-independent offset in the calculated potential,
+                which depends on the details of the summation scheme, on the
+                quadrupole-moment trace of the solvent molecule, and on the
+                approximate form used to represent electrostatic interactions in
+                the system. However, whether the M or P scheme (if any)
+                represents the appropriate convention is still a matter of
+                on-going debate. The goal of the present article is to settle
+                this long-standing controversy by carefully analyzing (both
+                analytically and numerically) the properties of the electrostatic
+                potential in molecular liquids (and inside cavities within them).
+                },
+}
+
+@article{article:sarfraz-razzak_techni=sectio=algori_2002,
+    title = {Technical section: {An} algorithm for automatic capturing of the
+             font outlines},
+    author = {M. Sarfraz and M. F. A. Razzak},
+    year = {2002},
+    journal = {Computers and Graphics},
+    volume = {26},
+    number = {5},
+    pages = {795--804},
+    issn = {0097-8493},
+    annotation = {An \texttt{article} entry with an \texttt{issn} field},
+}
+
+@article{article:reese_georgi=anglos=diplom_1958,
+    title = {Georgia in {Anglo-Spanish} Diplomacy, 1736--1739},
+    author = {Reese, Trevor R.},
+    series = {3},
+    volume = {15},
+    pages = {168--190},
+    journaltitle = {William and Mary Quarterly},
+    date = {1958},
+    langid = {english},
+    langidopts = {variant=american},
+    annotation = {An \texttt{article} entry with a \texttt{series} and a \texttt
+                  {volume} field. Note the format of the series. If the value of
+                  the \texttt{series} field is an integer, this number is printed
+                  as an ordinal and the string \enquote*{series} is appended
+                  automatically},
+}
+
+@article{article:shore_twiceb=once=concei_1991,
+    title = {Twice-Born, Once Conceived},
+    author = {Shore, Bradd},
+    series = {newseries},
+    volume = {93},
+    number = {1},
+    pages = {9--27},
+    journaltitle = {American Anthropologist},
+    date = {1991-03},
+    subtitle = {Meaning Construction and Cultural Cognition},
+    annotation = {An \texttt{article} entry with \texttt{series}, \texttt{volume
+                  }, and \texttt{number} fields. Note the format of the \texttt{
+                  series} which is a localization key},
+}
+
+@article{article:sigfridsson-ryde_compar=method=derivi_1998,
+    title = {Comparison of methods for deriving atomic charges from the
+             electrostatic potential and moments},
+    author = {Sigfridsson, Emma and Ryde, Ulf},
+    volume = {19},
+    number = {4},
+    pages = {377--395},
+    doi = {10.1002/(SICI)1096-987X(199803)19:4<377::AID-JCC1>3.0.CO;2-P},
+    journaltitle = {Journal of Computational Chemistry},
+    date = {1998},
+    langid = {english},
+    langidopts = {variant=american},
+    indextitle = {Methods for deriving atomic charges},
+    annotation = {An \texttt{article} entry with \texttt{volume}, \texttt{number
+                  }, and \texttt{doi} fields. Note that the \textsc{doi} is
+                  transformed into a clickable link if \texttt{hyperref} support
+                  has been enabled},
+    abstract = {Four methods for deriving partial atomic charges from the
+                quantum chemical electrostatic potential (CHELP, CHELPG,
+                Merz-Kollman, and RESP) have been compared and critically
+                evaluated. It is shown that charges strongly depend on how and
+                where the potential points are selected. Two alternative methods
+                are suggested to avoid the arbitrariness in the point-selection
+                schemes and van der Waals exclusion radii: CHELP-BOW, which also
+                estimates the charges from the electrostatic potential, but with
+                potential points that are Boltzmann-weighted after their
+                occurrence in actual simulations using the energy function of the
+                program in which the charges will be used, and CHELMO, which
+                estimates the charges directly from the electrostatic multipole
+                moments. Different criteria for the quality of the charges are
+                discussed.},
+}
+
+@article{article:spiegelberg_intent=intent=schola_1969,
+    title = {\mkbibquote{Intention} und \mkbibquote{Intentionalit{\"a}t} in der
+             Scholastik, bei Brentano und Husserl},
+    shorttitle = {Intention und Intentionalit{\"a}t},
+    author = {Spiegelberg, Herbert},
+    volume = {29},
+    pages = {189--216},
+    journaltitle = {Studia Philosophica},
+    date = {1969},
+    langid = {german},
+    sorttitle = {Intention und Intentionalitat in der Scholastik, bei Brentano
+                 und Husserl},
+    indexsorttitle = {Intention und Intentionalitat in der Scholastik, bei
+                      Brentano und Husserl},
+    annotation = {An \texttt{article} entry. Note the \texttt{sorttitle} and
+                  \texttt{indexsorttitle} fields and the markup of the quotes in
+                  the database file},
+}
+
+@article{article:springer_mediae=pilgri=routes_1950,
+    title = {Mediaeval Pilgrim Routes from {Scandinavia} to {Rome}},
+    shorttitle = {Mediaeval Pilgrim Routes},
+    author = {Springer, Otto},
+    volume = {12},
+    pages = {92--122},
+    journaltitle = {Mediaeval Studies},
+    date = {1950},
+    langid = {english},
+    langidopts = {variant=british},
+    annotation = {A plain \texttt{article} entry},
+}
+
+@article{article:weinberg_model=lepton_1967,
+    title = {A Model of Leptons},
+    author = {Weinberg, Steven},
+    volume = {19},
+    pages = {1264--1266},
+    journaltitle = {Phys.~Rev.~Lett.},
+    date = {1967},
+}
+
+@string{anch-ie = {Angew.~Chem. Int.~Ed.}}
+
+@string{cup = {Cambridge University Press}}
+
+@string{dtv = {Deutscher Taschenbuch-Verlag}}
+
+@string{hup = {Harvard University Press}}
+
+@string{jams = {J.~Amer. Math. Soc.}}
+
+@string{jchph = {J.~Chem. Phys.}}
+
+@string{jomch = {J.~Organomet. Chem.}}
+
+@string{pup = {Princeton University Press}}
+
+@incollection{incollection:westfahl_true=fronti,
+    title = {The True Frontier},
+    author = {Westfahl, Gary},
+    pages = {55--65},
+    subtitle = {Confronting and Avoiding the Realities of Space in {American}
+                Science Fiction Films},
+    crossref = {westfahl:frontier},
+    langid = {english},
+    langidopts = {variant=american},
+    indextitle = {True Frontier, The},
+    annotation = {A cross-referenced article from a \texttt{collection}. This is
+                  an \texttt{incollection} entry with a \texttt{crossref} field.
+                  Note the \texttt{subtitle} and \texttt{indextitle} fields},
+}
+
+@article{article:yoon-ryu_pallad=pincer=comple_2006,
+    title = {Palladium pincer complexes with reduced bond angle strain:
+             efficient catalysts for the {Heck} reaction},
+    author = {Yoon, Myeong S. and Ryu, Dowook and Kim, Jeongryul and Ahn, Kyo
+              Han},
+    volume = {25},
+    number = {10},
+    pages = {2409--2411},
+    journaltitle = {Organometallics},
+    date = {2006},
+    indextitle = {Palladium pincer complexes},
+}
diff --git a/tests/test-config.toml b/tests/test-config.toml
index d3e42c5..8dd8014 100644
--- a/tests/test-config.toml
+++ b/tests/test-config.toml
@@ -61,10 +61,13 @@ custom_column = "series"
 # year_color = "135"
 
 [citekey_formatter]
-fields = ["entrytype;;;;:", "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ]
+fields = ["shorthand;;;;+","entrytype;;;;:", "author;2;;-;_", "title;3;6;=;_", "year", "publisher;;5;#;" ]
 # fields = [ # CamelCase test
 #   "author;2;;;",
 #   "title;5;5;;",
 #   "year"
 # ]
 case = "lowercase"
+ascii_only = true
+# ignored_words = ["the"]
+# ignored_chars = ["?", "."]
-- 
cgit v1.2.3


From c62b83e02359c24973344699116acc12b4a04108 Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Tue, 14 Oct 2025 08:54:35 +0200
Subject: skip set and xdata entries by default

---
 src/bibiman/citekeys.rs               |  7 +++++--
 src/bibiman/citekeys/citekey_utils.rs |  5 +++++
 src/config.rs                         | 10 ++++++++++
 tests/biblatex-test-citekeys.bib      |  4 ++--
 4 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'src/config.rs')

diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 0cec28e..999c6cb 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -28,7 +28,7 @@ use owo_colors::OwoColorize;
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    bibiman::citekeys::citekey_utils::{build_citekey, formatting_help},
+    bibiman::citekeys::citekey_utils::{SKIPPED_ENTRIES, build_citekey, formatting_help},
     config::{BibiConfig, IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
 };
 
@@ -167,6 +167,10 @@ impl CitekeyFormatting {
     pub fn do_formatting(&mut self, ignored_chars: &[char], ignored_words: &[String]) -> &mut Self {
         let mut old_new_keys: Vec<(String, String)> = Vec::new();
         for entry in self.bib_entries.iter() {
+            // Skip specific entries
+            if SKIPPED_ENTRIES.contains(&entry.entry_type.to_string().to_lowercase().as_str()) {
+                continue;
+            }
             old_new_keys.push((
                 entry.key.clone(),
                 build_citekey(
@@ -181,7 +185,6 @@ impl CitekeyFormatting {
         }
 
         self.old_new_keys_map = old_new_keys;
-        // dbg!(&self.old_new_keys_map);
 
         self
     }
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
index 5f70224..58a8274 100644
--- a/src/bibiman/citekeys/citekey_utils.rs
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -15,6 +15,8 @@
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 /////
 
+use std::sync::LazyLock;
+
 use biblatex::{ChunksExt, Entry, Type};
 use deunicode::deunicode;
 use indoc::formatdoc;
@@ -25,6 +27,8 @@ use owo_colors::{
 
 use crate::bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully};
 
+pub(super) const SKIPPED_ENTRIES: [&str; 2] = ["set", "xdata"];
+
 pub(super) fn formatting_help() {
     let help = vec![
         formatdoc!(
@@ -200,6 +204,7 @@ pub(super) fn build_citekey(
     match case {
         Some(CitekeyCase::Lower) => new_citekey.to_lowercase(),
         Some(CitekeyCase::Upper) => new_citekey.to_uppercase(),
+        // otherwise skip, since camelcase is processed in char loop
         _ => new_citekey,
     }
 }
diff --git a/src/config.rs b/src/config.rs
index 7c1a0f8..b8d8b45 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -52,6 +52,10 @@ pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| {
         String::from("at"),
         String::from("to"),
         String::from("and"),
+        String::from("him"),
+        String::from("her"),
+        String::from("his"),
+        String::from("hers"),
         String::from("der"),
         String::from("die"),
         String::from("das"),
@@ -63,6 +67,12 @@ pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| {
         String::from("und"),
         String::from("für"),
         String::from("vor"),
+        String::from("er"),
+        String::from("sie"),
+        String::from("es"),
+        String::from("ihm"),
+        String::from("ihr"),
+        String::from("ihnen"),
     ]
 });
 
diff --git a/tests/biblatex-test-citekeys.bib b/tests/biblatex-test-citekeys.bib
index 9767f97..34c2f33 100644
--- a/tests/biblatex-test-citekeys.bib
+++ b/tests/biblatex-test-citekeys.bib
@@ -1,9 +1,9 @@
-@set{set,
+@set{SET,
     entryset = {article:herrmann-ofele_carboc=carben=as_2006,article:aksin-turkmen_effect=immobi=on_2006,article:yoon-ryu_pallad=pincer=comple_2006},
     annotation = {A \texttt{set} with three members.},
 }
 
-@set{set,
+@set{stdmodel,
     entryset = {article:glashow_partia=symmet=weak_1961,article:weinberg_model=lepton_1967,salam},
     annotation = {A \texttt{set} with three members discussing the standard
                   model of particle physics.},
-- 
cgit v1.2.3


From 2dc231247757a9a80b1925ed215f53f54eececa5 Mon Sep 17 00:00:00 2001
From: lukeflo
Date: Wed, 15 Oct 2025 07:28:20 +0200
Subject: fix tests, remove unneeded imports, add description

---
 src/bibiman/citekeys.rs               | 6 +++---
 src/bibiman/citekeys/citekey_utils.rs | 3 ---
 src/config.rs                         | 1 +
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'src/config.rs')

diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 8f70ab0..fdeed14 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -241,9 +241,9 @@ impl<'a> CitekeyFormatting<'a> {
     /// That will prevent the replacement longer key parts that equal a full shorter
     /// key.
     ///
-    /// You are **very encouraged** to call this method before `update_file()` to
-    /// prevent replacing citekeys partly which afterwards wont match the pattern
-    /// anymore.
+    /// You are **strongly encouraged** to call this method before `update_file()`
+    /// or `update_notes_pdfs` to prevent partial replacement of citekeys, which
+    /// afterwards won't match the pattern anymore.
     pub fn rev_sort_new_keys_by_len(mut self) -> Self {
         self.old_new_keys_map
             .sort_by(|a, b| b.0.len().cmp(&a.0.len()));
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
index b8f5600..773a2d2 100644
--- a/src/bibiman/citekeys/citekey_utils.rs
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -15,10 +15,7 @@
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 /////
 
-use std::sync::LazyLock;
-
 use biblatex::{ChunksExt, Entry, Type};
-use deunicode::deunicode;
 use indoc::formatdoc;
 use owo_colors::{
     OwoColorize,
diff --git a/src/config.rs b/src/config.rs
index b8d8b45..47e145c 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -501,6 +501,7 @@ mod tests {
                     year_color = "135"
 
                     [citekey_formatter]
+                    ascii_only = true
                 "#,
             )?;
 
-- 
cgit v1.2.3