Add ignore list for words; the inner-delimiter problem for ignored words still needs to be solved

author: lukeflo 2025-10-12 23:01:17 +0200
committer: lukeflo 2025-10-12 23:01:27 +0200
commit: 0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (patch)
tree: 3009e5c32985690cc1b346f4688fa3e9e3da7fde
parent: f112c4e13009e5ddfe3cf5c4cbe7f29f832b8553 (diff)
download: bibiman-0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb.tar.gz
bibiman-0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb.zip
3 files changed, 348 insertions, 303 deletions
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 9d17403..2f56947 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -21,27 +21,33 @@ use std::{
     path::{Path, PathBuf},
 };
 
-use biblatex::{Bibliography, ChunksExt, Entry, Type};
+use biblatex::Bibliography;
 use color_eyre::eyre::{OptionExt, eyre};
-use indoc::formatdoc;
 use lexopt::Arg::{Long, Short};
-use owo_colors::{
-    OwoColorize,
-    colors::{BrightBlue, Green, White},
-};
+use owo_colors::OwoColorize;
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    bibiman::sanitize::sanitize_single_string_fully,
-    config::{BibiConfig, IGNORED_SPECIAL_CHARS},
+    bibiman::citekeys::citekey_utils::{build_citekey, formatting_help},
+    config::BibiConfig,
 };
 
+mod citekey_utils;
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum CitekeyCase {
     #[serde(alias = "uppercase", alias = "upper")]
     Upper,
     #[serde(alias = "lowercase", alias = "lower")]
     Lower,
+    #[serde(
+        alias = "camel",
+        alias = "camelcase",
+        alias = "camel_case",
+        alias = "uppercamelcase",
+        alias = "upper_camel_case"
+    )]
+    Camel,
 }
 
 #[derive(Debug, Default, Clone)]
@@ -203,306 +209,13 @@ impl CitekeyFormatting {
     }
 }
 
-fn formatting_help() {
-    let help = vec![
-        formatdoc!(
-            "{} {}\n",
-            env!("CARGO_PKG_NAME").fg::<Green>().bold(),
-            env!("CARGO_PKG_VERSION")
-        ),
-        formatdoc!("{}", "USAGE".bold()),
-        formatdoc!(
-            "\t{} {} {} {}\n",
-            env!("CARGO_PKG_NAME").fg::<White>().bold(),
-            "format-citekeys".bold(),
-            "--source=<SOURCE>".bold(),
-            "--output=<TARGET>".bold()
-        ),
-        formatdoc!(
-            "
-                \tThis help describes the CLI usage for the citekey formatting
-                \tfunctionality of bibiman. The definition of patterns how the
-                \tcitekeys should be formatted must be set in the config file.
-                \tFor further informations how to use this patterns etc. see:
-                \t{}
-            ",
-            "https://codeberg.org/lukeflo/bibiman/src/branch/main#bibiman"
-                .italic()
-                .fg::<BrightBlue>()
-        ),
-        formatdoc!("{}", "OPTIONS".bold()),
-        formatdoc!(
-            "
-                \t{}
-                \tShow this help and exit
-            ",
-            "-h, --help".fg::<White>().bold()
-        ),
-        formatdoc!(
-            "
-                \t{}
-                \tDon't apply any changes to the named files. Instead print all
-                \told citekeys and the formatted strings that would have been
-                \tapplied in the format: {} => {}
-            ",
-            "-d, --dry-run".fg::<White>().bold(),
-            "old_key".italic(),
-            "new_key".bold()
-        ),
-        formatdoc! {"
-                \t{}
-                \tThe bibfile for which the citekey formatting should be processed.
-                \tTakes a path as argument.
-            ", "-s, -f, --source=, --file=".fg::<White>().bold()},
-        formatdoc!(
-            "
-                \t{}
-                \tThe bibfile to which the updated content should be written.
-                \tTakes a path as argument. If the file doesn't exist, it will be
-                \tcreated.
-                \tIf the argument isn't used, the original file will be {}!
-            ",
-            "-t, -o, --target=, --output=".fg::<White>().bold(),
-            "overwritten".italic(),
-        ),
-    ];
-    let help = help.join("\n");
-    println!("{}", help);
-}
-
-/// Build the citekey from the patterns defined in the config file
-fn build_citekey(entry: &Entry, pattern_fields: &[String], case: Option<&CitekeyCase>) -> String {
-    // mut string the citekey is built from
-    let mut new_citekey = String::new();
-
-    // count different fields of pattern vec
-    let fields = pattern_fields.len();
-
-    // loop over pattern fields process them
-    for (idx, pattern) in pattern_fields.iter().enumerate() {
-        // parse single values from pattern field
-        let (field_name, word_count, char_count, inner_delimiter, trailing_delimiter) =
-            split_formatting_pat(pattern);
-
-        // built the part of the citekey from the current pattern field
-        let formatted_field_str = {
-            let mut formatted_str = String::new();
-
-            // preformat the field depending on biblatex value
-            let field = preformat_field(field_name, entry);
-
-            // split at whitespaces, count fields and set counter for processed
-            // splits
-            let mut split_field = field.split_whitespace();
-            let mut words_passed = 0;
-            let field_count = field.split_whitespace().count();
-            let word_count = if let Some(val) = word_count
-                && val <= field_count
-            {
-                val
-            } else {
-                field_count
-            };
-
-            // loop over single parts of current field and add correct delimiter
-            loop {
-                // terminate loop for current field if its empty. If its also the
-                // last of the pattern vec, pop the trailing delimiter
-                if field.is_empty() {
-                    if idx + 1 == fields {
-                        let _ = new_citekey.pop();
-                    }
-                    break;
-                }
-
-                // process the single slices and add correct delimiter
-                if let Some(field_slice) = split_field.next() {
-                    // Create word slice char by char. We need to loop over chars
-                    // instead of a simple bytes index to also catch chars which
-                    // consist of more than one byte (äöüøæ etc...)
-                    let mut word_slice = String::new();
-                    let word_chars = field_slice.chars();
-                    let mut counter = 0;
-                    for c in word_chars {
-                        if let Some(len) = char_count
-                            && counter == len
-                        {
-                            break;
-                        }
-                        // if a word slice contains a special char, skip it
-                        if IGNORED_SPECIAL_CHARS.contains(&c) {
-                            continue;
-                        }
-                        word_slice.push(c);
-                        counter += 1;
-                    }
-                    // Don't count empty slices and don't add delimiter to those
-                    if !word_slice.is_empty() {
-                        formatted_str = formatted_str + &word_slice;
-                        words_passed += 1;
-                        if word_count == words_passed {
-                            formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
-                            break;
-                        } else {
-                            formatted_str = formatted_str + inner_delimiter.unwrap_or("");
-                        }
-                    } else {
-                        continue;
-                    }
-                } else {
-                    formatted_str = formatted_str + trailing_delimiter.unwrap_or("");
-                    break;
-                };
-            }
-            formatted_str
-        };
-        new_citekey = new_citekey + &formatted_field_str;
-    }
-    if let Some(case_format) = case {
-        match case_format {
-            CitekeyCase::Lower => new_citekey.to_lowercase(),
-            CitekeyCase::Upper => new_citekey.to_uppercase(),
-        }
-    } else {
-        new_citekey
-    }
-}
-
-/// Preformat some fields which are very common to be used in citekeys
-fn preformat_field(field: &str, entry: &Entry) -> String {
-    match field {
-        "title" => {
-            sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into()))
-        }
-        "author" => {
-            if let Ok(authors) = entry.author() {
-                let mut last_names = String::new();
-                for a in authors.iter() {
-                    last_names = last_names + &a.name + " ";
-                }
-                last_names
-            } else {
-                "".to_string()
-            }
-        }
-        "year" => {
-            if let Ok(date) = entry.date() {
-                date.to_chunks().format_verbatim()[..4].to_string()
-            } else {
-                entry.get_as::<String>(field).unwrap_or("".into())
-            }
-        }
-        "subtitle" => {
-            sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into()))
-        }
-        "editor" => {
-            if let Ok(editors) = entry.editors() {
-                let mut last_names = String::new();
-                for editortypes in editors.iter() {
-                    for e in editortypes.0.iter() {
-                        last_names = last_names + &e.name + " ";
-                    }
-                }
-                last_names
-            } else {
-                "".to_string()
-            }
-        }
-        "pubtype" | "entrytype" => entry.entry_type.to_string(),
-        _ => entry.get_as::<String>(field).unwrap_or("".into()),
-    }
-}
-
-/// Cut of word at char count index if its set
-fn format_word(word: &str, count: Option<usize>) -> String {
-    // Since chars can consist of multiple bytes, we need this more complex
-    // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...)
-    // instead of simple byte indexing
-    let mut word_slice = String::new();
-    let word_chars = word.chars();
-    let mut counter = 0;
-    for c in word_chars {
-        if let Some(len) = count
-            && counter == len
-        {
-            break;
-        }
-        if IGNORED_SPECIAL_CHARS.contains(&c) {
-            continue;
-        }
-        word_slice.push(c);
-        counter += 1;
-    }
-    word_slice
-}
-
-/// Split a formatting pattern of kind
-/// `<field>;<word count>;<char count>;<inside delimiter>;<trailing delimiter>`,
-/// e.g.: `title;3;3;_;:` will give `("title", 3, 3, "_", ":")`
-fn split_formatting_pat(
-    pattern: &str,
-) -> (
-    &str,
-    Option<usize>,
-    Option<usize>,
-    Option<&str>,
-    Option<&str>,
-) {
-    let mut splits = pattern.split(';');
-    (
-        splits
-            .next()
-            .expect("Need field value for formatting citekey"),
-        if let Some(next) = splits.next()
-            && next.len() > 0
-        {
-            next.parse::<usize>().ok()
-        } else {
-            None
-        },
-        if let Some(next) = splits.next()
-            && next.len() > 0
-        {
-            next.parse::<usize>().ok()
-        } else {
-            None
-        },
-        splits.next(),
-        splits.next(),
-    )
-}
-
 #[cfg(test)]
 mod tests {
     use std::path::PathBuf;
 
     use biblatex::Bibliography;
-    use itertools::Itertools;
-
-    use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting, split_formatting_pat};
-
-    #[test]
-    fn split_citekey_pattern() {
-        let pattern = "title;3;5;_;_";
-
-        assert_eq!(
-            split_formatting_pat(pattern),
-            ("title", Some(3), Some(5), Some("_"), Some("_"))
-        );
 
-        let pattern = "year";
-
-        assert_eq!(
-            split_formatting_pat(pattern),
-            ("year", None, None, None, None)
-        );
-
-        let pattern = "author;1;;;_";
-        assert_eq!(
-            split_formatting_pat(pattern),
-            ("author", Some(1), None, Some(""), Some("_"))
-        );
-    }
+    use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting};
 
     #[test]
     fn format_citekey_test() {
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
new file mode 100644
index 0000000..ee2c849
--- /dev/null
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -0,0 +1,327 @@
+// bibiman - a TUI for managing BibLaTeX databases
+// Copyright (C) 2025  lukeflo
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+/////
+
+use biblatex::{ChunksExt, Entry, Type};
+use indoc::formatdoc;
+use owo_colors::{
+    OwoColorize,
+    colors::{BrightBlue, Green, White},
+};
+
+use crate::{
+    bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully},
+    config::IGNORED_SPECIAL_CHARS,
+};
+
+const IGNORE_WORDS: [&str; 20] = [
+    "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine",
+    "eines", "des", "auf", "und", "für", "vor",
+]; // English and German function words; not referenced anywhere in this file yet
+
+pub(super) fn formatting_help() {
+    let help = vec![ // one entry per section of the help text
+        formatdoc!(
+            "{} {}\n",
+            env!("CARGO_PKG_NAME").fg::<Green>().bold(),
+            env!("CARGO_PKG_VERSION")
+        ),
+        formatdoc!("{}", "USAGE".bold()),
+        formatdoc!(
+            "\t{} {} {} {}\n",
+            env!("CARGO_PKG_NAME").fg::<White>().bold(),
+            "format-citekeys".bold(),
+            "--source=<SOURCE>".bold(),
+            "--output=<TARGET>".bold()
+        ),
+        formatdoc!(
+            "
+                \tThis help describes the CLI usage for the citekey formatting
+                \tfunctionality of bibiman. The definition of patterns how the
+                \tcitekeys should be formatted must be set in the config file.
+                \tFor further informations how to use this patterns etc. see:
+                \t{}
+            ",
+            "https://codeberg.org/lukeflo/bibiman/src/branch/main#bibiman"
+                .italic()
+                .fg::<BrightBlue>()
+        ),
+        formatdoc!("{}", "OPTIONS".bold()),
+        formatdoc!(
+            "
+                \t{}
+                \tShow this help and exit
+            ",
+            "-h, --help".fg::<White>().bold()
+        ),
+        formatdoc!(
+            "
+                \t{}
+                \tDon't apply any changes to the named files. Instead print all
+                \told citekeys and the formatted strings that would have been
+                \tapplied in the format: {} => {}
+            ",
+            "-d, --dry-run".fg::<White>().bold(),
+            "old_key".italic(),
+            "new_key".bold()
+        ),
+        formatdoc! {"
+                \t{}
+                \tThe bibfile for which the citekey formatting should be processed.
+                \tTakes a path as argument.
+            ", "-s, -f, --source=, --file=".fg::<White>().bold()},
+        formatdoc!(
+            "
+                \t{}
+                \tThe bibfile to which the updated content should be written.
+                \tTakes a path as argument. If the file doesn't exist, it will be
+                \tcreated.
+                \tIf the argument isn't used, the original file will be {}!
+            ",
+            "-t, -o, --target=, --output=".fg::<White>().bold(),
+            "overwritten".italic(),
+        ),
+    ];
+    let help = help.join("\n"); // glue the sections together with a newline between them
+    println!("{}", help); // the help text is written to stdout
+}
+
+/// Build the citekey for `entry` from the config patterns and apply the optional `case`
+pub(super) fn build_citekey(
+    entry: &Entry,
+    pattern_fields: &[String],
+    case: Option<&CitekeyCase>,
+) -> String {
+    // mutable string the citekey is built from
+    let mut new_citekey = String::new();
+
+    // trailing delimiter of the last non-empty field; only emitted before the next one
+    let mut trailing_delimiter: Option<&str> = None;
+
+    // loop over the pattern fields and process them
+    'field_loop: for pattern in pattern_fields.iter() {
+        // parse single values from pattern field
+        let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) =
+            split_formatting_pat(pattern);
+
+        // build the part of the citekey from the current pattern field
+        let formatted_field_str = {
+            let mut formatted_str = String::new();
+
+            // preformat the field depending on biblatex value
+            let field = preformat_field(field_name, entry);
+
+            // split at whitespaces, count fields and set counter for processed
+            // splits
+            let mut split_field = field.split_whitespace();
+            let mut words_passed = 0;
+            let field_count = field.split_whitespace().count();
+            let word_count = if let Some(val) = word_count
+                && val <= field_count
+            {
+                val
+            } else {
+                field_count
+            };
+
+            // If there is a trailing delimiter from the previous field, push it
+            if let Some(del) = trailing_delimiter {
+                formatted_str = del.to_string();
+            };
+
+            // If the current field isn't empty, set trailing delimiter for
+            // upcoming loop repetition. If it's empty, start next run of loop
+            // directly (the pending delimiter is kept for the next non-empty field)
+            if !field.is_empty() {
+                trailing_delimiter = cur_trailing_delimiter;
+            } else {
+                continue 'field_loop;
+            }
+
+            // loop over single parts of current field and add correct delimiter
+            'word_loop: loop {
+                // process the single slices and add correct delimiter
+                if let Some(field_slice) = split_field.next() {
+                    // Create word slice char by char. We need to loop over chars
+                    // instead of a simple bytes index to also catch chars which
+                    // consist of more than one byte (äöüøæ etc...)
+                    let mut word_slice = String::new();
+                    let word_chars = field_slice.chars();
+                    let mut counter = 0;
+                    'char_loop: for mut c in word_chars {
+                        // CamelCase: uppercase the first kept char, Unicode-aware (ä -> Ä)
+                        if counter == 0 && case == Some(&CitekeyCase::Camel) {
+                            c = c.to_uppercase().next().unwrap_or(c);
+                        }
+                        if let Some(len) = char_count
+                            && counter == len
+                        {
+                            break 'char_loop;
+                        }
+                        // if a word slice contains a special char, skip it
+                        if IGNORED_SPECIAL_CHARS.contains(&c) {
+                            continue 'char_loop;
+                        }
+                        word_slice.push(c);
+                        counter += 1;
+                    }
+                    // Don't count empty slices and don't add delimiter to those
+                    if !word_slice.is_empty() {
+                        formatted_str = formatted_str + &word_slice;
+                        words_passed += 1;
+                        if word_count == words_passed {
+                            break 'word_loop;
+                        } else {
+                            formatted_str = formatted_str + inner_delimiter.unwrap_or("");
+                        }
+                    } else {
+                        continue 'word_loop;
+                    }
+                } else {
+                    break 'word_loop;
+                };
+            }
+            formatted_str
+        };
+        new_citekey = new_citekey + &formatted_field_str;
+    }
+    match case {
+        Some(CitekeyCase::Lower) => new_citekey.to_lowercase(),
+        Some(CitekeyCase::Upper) => new_citekey.to_uppercase(),
+        _ => new_citekey,
+    }
+}
+
+/// Preformat fields commonly used in citekeys; all other fields are read as plain `String`
+pub(super) fn preformat_field(field: &str, entry: &Entry) -> String {
+    match field {
+        // Sanitize all macro code from string
+        "title" => {
+            sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into()))
+        }
+        // Get author names. Fall back to editors before setting empty string
+        "author" => {
+            if let Ok(authors) = entry.author() {
+                let mut last_names = String::new();
+                for a in authors.iter() {
+                    last_names = last_names + &a.name + " ";
+                }
+                last_names
+            } else if let Ok(editors) = entry.editors() {
+                let mut last_names = String::new();
+                for editortypes in editors.iter() {
+                    for e in editortypes.0.iter() {
+                        last_names = last_names + &e.name + " ";
+                    }
+                }
+                last_names
+            } else {
+                "".to_string()
+            }
+        }
+        // First four chars of the date field (char-wise, so it cannot panic), else year field
+        "year" => {
+            if let Ok(date) = entry.date() {
+                date.to_chunks().format_verbatim().chars().take(4).collect()
+            } else {
+                entry.get_as::<String>(field).unwrap_or("".into())
+            }
+        }
+        // Sanitize all macro code from string
+        "subtitle" => {
+            sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into()))
+        }
+        "editor" => {
+            if let Ok(editors) = entry.editors() {
+                let mut last_names = String::new();
+                for editortypes in editors.iter() {
+                    for e in editortypes.0.iter() {
+                        last_names = last_names + &e.name + " ";
+                    }
+                }
+                last_names
+            } else {
+                "".to_string()
+            }
+        }
+        "pubtype" | "entrytype" => entry.entry_type.to_string(),
+        _ => entry.get_as::<String>(field).unwrap_or("".into()),
+    }
+}
+
+/// Split a formatting pattern of kind
+/// `<field>;<word count>;<char count>;<inside delimiter>;<trailing delimiter>`,
+/// e.g.: `title;3;3;_;:` will give `("title", Some(3), Some(3), Some("_"), Some(":"))`
+pub(super) fn split_formatting_pat(
+    pattern: &str,
+) -> (
+    &str,
+    Option<usize>,
+    Option<usize>,
+    Option<&str>,
+    Option<&str>,
+) {
+    let mut splits = pattern.split(';');
+    (
+        splits
+            .next()
+            .expect("Need field value for formatting citekey"), // split() yields at least one item
+        if let Some(next) = splits.next()
+            && next.len() > 0
+        {
+            next.parse::<usize>().ok() // a non-numeric word count also ends up as None
+        } else {
+            None
+        },
+        if let Some(next) = splits.next()
+            && next.len() > 0
+        {
+            next.parse::<usize>().ok() // a non-numeric char count also ends up as None
+        } else {
+            None
+        },
+        splits.next(), // inner delimiter: Some("") when present but empty
+        splits.next(), // trailing delimiter: None when the pattern ends before it
+    )
+}
+
+#[cfg(test)]
+mod test {
+    use crate::bibiman::citekeys::citekey_utils::split_formatting_pat;
+
+    #[test]
+    fn split_citekey_pattern() {
+        let pattern = "title;3;5;_;_"; // all five parts are given
+
+        assert_eq!(
+            split_formatting_pat(pattern),
+            ("title", Some(3), Some(5), Some("_"), Some("_"))
+        );
+
+        let pattern = "year"; // field name only: every optional part is None
+
+        assert_eq!(
+            split_formatting_pat(pattern),
+            ("year", None, None, None, None)
+        );
+
+        let pattern = "author;1;;;_"; // empty char count -> None, empty inner delimiter -> Some("")
+        assert_eq!(
+            split_formatting_pat(pattern),
+            ("author", Some(1), None, Some(""), Some("_"))
+        );
+    }
+}
diff --git a/tests/test-config.toml b/tests/test-config.toml
index 2c5ac96..d3e42c5 100644
--- a/tests/test-config.toml
+++ b/tests/test-config.toml
@@ -61,5 +61,10 @@ custom_column = "series"
 # year_color = "135"
 
 [citekey_formatter]
-fields = [ "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ]
+fields = ["entrytype;;;;:", "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ]
+# fields = [ # CamelCase test
+#   "author;2;;;",
+#   "title;5;5;;",
+#   "year"
+# ]
 case = "lowercase"
author	lukeflo	2025-10-12 23:01:17 +0200
committer	lukeflo	2025-10-12 23:01:27 +0200
commit	0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (patch)
tree	3009e5c32985690cc1b346f4688fa3e9e3da7fde
parent	f112c4e13009e5ddfe3cf5c4cbe7f29f832b8553 (diff)
download	bibiman-0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb.tar.gz bibiman-0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb.zip