add some additional ignored chars and words, check deunicoded chars for ignoring

author: lukeflo 2025-10-18 13:03:08 +0200
committer: lukeflo 2025-10-18 13:03:08 +0200
commit: 231353de2ce5713a4848178f031101534b529442 (patch)
tree: f8612664038a10f847c4c5b1ae9d8a2a2144cbf5
parent: 05dd5cbc2f5fd8252f9ed3b0e79cf4d785791f98 (diff)
download: bibiman-231353de2ce5713a4848178f031101534b529442.tar.gz
bibiman-231353de2ce5713a4848178f031101534b529442.zip
5 files changed, 107 insertions, 70 deletions
diff --git a/CITEKEYS.md b/CITEKEYS.md
index 828e557..d9855b8 100644
--- a/CITEKEYS.md
+++ b/CITEKEYS.md
@@ -30,7 +30,9 @@ fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ]
 case = "lowercase"
 ascii_only = true
 ignored_chars = [
-    "?", "!", "\\", "\'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", "\"",
+    '?', '!', '\', "'", '.', '-', '–', ':', ',', '[', ']', '(', ')', '{', '}', '§', '$', '%',
+    '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"', '»', '«', '‘', '’',
+    '“', '”',
 ]
 ignored_words = [
     "the",
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index fdeed14..a56969b 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -49,7 +49,8 @@ pub enum CitekeyCase {
         alias = "camelcase",
         alias = "camel_case",
         alias = "uppercamelcase",
-        alias = "upper_camel_case"
+        alias = "upper_camel_case",
+        alias = "pascalcase"
     )]
     Camel,
 }
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
index 2de3600..3f12f03 100644
--- a/src/bibiman/citekeys/citekey_utils.rs
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -175,24 +175,36 @@ pub(super) fn build_citekey(
                     if counter == 0 && case == Some(&CitekeyCase::Camel) {
                         c = c.to_ascii_uppercase()
                     }
-                    if let Some(len) = max_chars
-                        && counter >= len
-                    {
-                        break 'char_loop;
-                    }
-                    // if a word slice contains a special char, skip it
-                    if ignored_chars.contains(&c) {
-                        continue 'char_loop;
-                    }
                     // if non-ascii chars should be mapped, check if needed and do it
                     if let Some(chars) = deunicode::deunicode_char(c)
                         && ascii_only
                     {
-                        word_slice.push_str(chars);
-                        counter += chars.len();
+                        'deunicode_char_loop: for ch in chars.chars() {
+                            // if the deunicoded charset of the word slice's current
+                            // char contains a special char, skip it
+                            if ignored_chars.contains(&ch) {
+                                continue 'deunicode_char_loop;
+                            }
+                            word_slice.push(ch);
+                            counter += 1;
+                            if let Some(len) = max_chars
+                                && counter >= len
+                            {
+                                break 'char_loop;
+                            }
+                        }
                     } else {
+                        // if a word slice contains a special char, skip it
+                        if ignored_chars.contains(&c) {
+                            continue 'char_loop;
+                        }
                         word_slice.push(c);
                         counter += 1;
+                        if let Some(len) = max_chars
+                            && counter >= len
+                        {
+                            break 'char_loop;
+                        }
                     }
                 }
                 // Don't count empty slices and don't add delimiter to those
diff --git a/src/config.rs b/src/config.rs
index 47e145c..e0a097c 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -36,43 +36,49 @@ use crate::{
     cliargs::CLIArgs,
 };
 
-pub const IGNORED_SPECIAL_CHARS: [char; 33] = [
+pub const IGNORED_SPECIAL_CHARS: [char; 39] = [
     '?', '!', '\\', '\'', '.', '-', '–', ':', ',', '[', ']', '(', ')', '{', '}', '§', '$', '%',
-    '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"',
+    '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"', '»', '«', '‘', '’',
+    '“', '”',
 ];
 
 pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| {
     vec![
-        String::from("the"),
-        String::from("a"),
-        String::from("an"),
-        String::from("of"),
-        String::from("for"),
-        String::from("in"),
-        String::from("at"),
-        String::from("to"),
-        String::from("and"),
-        String::from("him"),
-        String::from("her"),
-        String::from("his"),
-        String::from("hers"),
-        String::from("der"),
-        String::from("die"),
-        String::from("das"),
-        String::from("ein"),
-        String::from("eine"),
-        String::from("eines"),
-        String::from("des"),
-        String::from("auf"),
-        String::from("und"),
-        String::from("für"),
-        String::from("vor"),
-        String::from("er"),
-        String::from("sie"),
-        String::from("es"),
-        String::from("ihm"),
-        String::from("ihr"),
-        String::from("ihnen"),
+        "the".into(),
+        "a".into(),
+        "an".into(),
+        "of".into(),
+        "for".into(),
+        "in".into(),
+        "at".into(),
+        "to".into(),
+        "and".into(),
+        "him".into(),
+        "her".into(),
+        "his".into(),
+        "he".into(),
+        "she".into(),
+        "it".into(),
+        "hers".into(),
+        "der".into(),
+        "die".into(),
+        "das".into(),
+        "ein".into(),
+        "eine".into(),
+        "eines".into(),
+        "des".into(),
+        "auf".into(),
+        "und".into(),
+        "für".into(),
+        "vor".into(),
+        "er".into(),
+        "sie".into(),
+        "es".into(),
+        "ihm".into(),
+        "ihr".into(),
+        "ihnen".into(),
+        "zum".into(),
+        "dem".into(),
     ]
 });
 
@@ -159,34 +165,50 @@ const DEFAULT_CONFIG: &str = r##"
 # ascii_only = true
 
 ## List of special chars that'll be ignored when building citekeys.
-## A custom list will overwrite the default list
+## A custom list will overwrite the default list. Thus, to add to the list,
+## uncomment it and add the additional chars.
 # ignored_chars = [
-#     "?", "!", "\\", "\'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", """,
-# ]
+#   '?', '!', '\', "'", '.', '-', '–', ':', ',', '[', ']', '(', ')', '{', '}', '§', '$', '%', '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"', '»', '«', '‘', '’', '“', '”' ]
 
 ## List of words that'll be ignored when building citekeys.
-## A custom list will overwrite the default list
+## A custom list will overwrite the default list. Thus, to add to the list,
+## uncomment it and add the additional words.
 # ignored_words = [
-#     "the",
-#     "a",
-#     "an",
-#     "of",
-#     "for",
-#     "in",
-#     "at",
-#     "to",
-#     "and",
-#     "der",
-#     "die",
-#     "das",
-#     "ein",
-#     "eine",
-#     "eines",
-#     "des",
-#     "auf",
-#     "und",
-#     "für",
-#     "vor",
+#   "the",
+#   "a",
+#   "an",
+#   "of",
+#   "for",
+#   "in",
+#   "at",
+#   "to",
+#   "and",
+#   "him",
+#   "her",
+#   "his",
+#   "he",
+#   "she",
+#   "it",
+#   "hers",
+#   "der",
+#   "die",
+#   "das",
+#   "ein",
+#   "eine",
+#   "eines",
+#   "des",
+#   "auf",
+#   "und",
+#   "für",
+#   "vor",
+#   "er",
+#   "sie",
+#   "es",
+#   "ihm",
+#   "ihr",
+#   "ihnen",
+#   "zum",
+#   "dem",
 # ]
 "##;
 
diff --git a/tests/biblatex-test.bib b/tests/biblatex-test.bib
index 2149e7c..941a24d 100644
--- a/tests/biblatex-test.bib
+++ b/tests/biblatex-test.bib
@@ -40,7 +40,7 @@
 }
 
 @article{angenendt,
-    title = {In Honore Salvatoris~-- Vom Sinn und Unsinn der Patrozinienkunde},
+    title = {“In Honore Salvatoris”~-- Vom Sinn und Unsinn der Patrozinienkunde},
     shorttitle = {In Honore Salvatoris},
     author = {Angenendt, Arnold},
     volume = {97},
author	lukeflo	2025-10-18 13:03:08 +0200
committer	lukeflo	2025-10-18 13:03:08 +0200
commit	231353de2ce5713a4848178f031101534b529442 (patch)
tree	f8612664038a10f847c4c5b1ae9d8a2a2144cbf5
parent	05dd5cbc2f5fd8252f9ed3b0e79cf4d785791f98 (diff)
download	bibiman-231353de2ce5713a4848178f031101534b529442.tar.gz bibiman-231353de2ce5713a4848178f031101534b529442.zip