From 4779dbc5fe3712bce31bbb5f1f43c28c4c839420 Mon Sep 17 00:00:00 2001 From: lukeflo Date: Fri, 10 Oct 2025 13:47:07 +0200 Subject: substitute byte index for char counting loop, impl `dry-run` option for citekey formatting --- tests/test-config.toml | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tests/test-config.toml') diff --git a/tests/test-config.toml b/tests/test-config.toml index fc447f1..b484b69 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -59,3 +59,7 @@ custom_column = "series" # author_color = "38" # title_color = "37" # year_color = "135" + +[citekey_formatter] +fields = [ "author;2;;-;_", "title;3;3;_;_", "year" ] +case = "lowercase" -- cgit v1.2.3 From c69b1789fabaf149916d160922d7026f2cbe33f1 Mon Sep 17 00:00:00 2001 From: lukeflo Date: Fri, 10 Oct 2025 14:57:53 +0200 Subject: implement const of ignored special chars for citekey formatting * the list contains 33 special chars at the moment * it will only affect already existing special chars in biblatex fields * delimiter specified for citekey formatting are not affected * char count is also not affected, ignored chars are not counted --- src/bibiman/citekeys.rs | 40 +++++++++++++++++++++------------------- src/config.rs | 5 +++++ tests/test-config.toml | 2 +- 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'tests/test-config.toml') diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs index 5121741..7c06886 100644 --- a/src/bibiman/citekeys.rs +++ b/src/bibiman/citekeys.rs @@ -31,7 +31,10 @@ use owo_colors::{ }; use serde::{Deserialize, Serialize}; -use crate::{bibiman::sanitize::sanitize_single_string_fully, config::BibiConfig}; +use crate::{ + bibiman::sanitize::sanitize_single_string_fully, + config::{BibiConfig, IGNORED_SPECIAL_CHARS}, +}; #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum CitekeyCase { @@ -354,26 +357,25 @@ fn preformat_field(field: &str, entry: &Entry) -> String { /// Cut of word at char count index if its set 
fn format_word(word: &str, count: Option) -> String { - if let Some(len) = count - && len < word.chars().count() - { - // Since chars can consist of multiple bytes, we need this more complex - // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...) - // instead of simple byte indexing - let mut word_slice = String::new(); - let word_chars = word.chars(); - let mut counter = 0; - for c in word_chars { - if counter == len { - break; - } - word_slice.push(c); - counter += 1; + // Since chars can consist of multiple bytes, we need this more complex + // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...) + // instead of simple byte indexing + let mut word_slice = String::new(); + let word_chars = word.chars(); + let mut counter = 0; + for c in word_chars { + if let Some(len) = count + && counter == len + { + break; } - word_slice - } else { - word.to_string() + if IGNORED_SPECIAL_CHARS.contains(&c) { + continue; + } + word_slice.push(c); + counter += 1; } + word_slice } /// Split a formatting pattern of kind diff --git a/src/config.rs b/src/config.rs index a5df61c..a4e89be 100644 --- a/src/config.rs +++ b/src/config.rs @@ -35,6 +35,11 @@ use crate::{ cliargs::CLIArgs, }; +pub const IGNORED_SPECIAL_CHARS: [char; 33] = [ + '?', '!', '\\', '\'', '.', '-', '–', ':', ',', '[', ']', '(', ')', '{', '}', '§', '$', '%', + '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"', +]; + const DEFAULT_CONFIG: &str = r##" # [general] ## Default files/dirs which are loaded on startup diff --git a/tests/test-config.toml b/tests/test-config.toml index b484b69..558d216 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,5 +61,5 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = [ "author;2;;-;_", "title;3;3;_;_", "year" ] +fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ] case = "lowercase" -- cgit v1.2.3 From f112c4e13009e5ddfe3cf5c4cbe7f29f832b8553 Mon Sep 17 00:00:00 2001 From: 
lukeflo Date: Sun, 12 Oct 2025 21:51:21 +0200 Subject: solve double delimiters with empty fields --- src/bibiman/citekeys.rs | 36 ++++++++++++++++++++++++++++++------ tests/test-config.toml | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'tests/test-config.toml') diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs index 065d57f..9d17403 100644 --- a/src/bibiman/citekeys.rs +++ b/src/bibiman/citekeys.rs @@ -317,13 +317,37 @@ fn build_citekey(entry: &Entry, pattern_fields: &[String], case: Option<&Citekey // process the single slices and add correct delimiter if let Some(field_slice) = split_field.next() { - formatted_str = formatted_str + &format_word(field_slice, char_count); - words_passed += 1; - if word_count == words_passed { - formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); - break; + // Create word slice char by char. We need to loop over chars + // instead of a simple bytes index to also catch chars which + // consist of more than one byte (äöüøæ etc...) 
+ let mut word_slice = String::new(); + let word_chars = field_slice.chars(); + let mut counter = 0; + for c in word_chars { + if let Some(len) = char_count + && counter == len + { + break; + } + // if a word slice contains a special char, skip it + if IGNORED_SPECIAL_CHARS.contains(&c) { + continue; + } + word_slice.push(c); + counter += 1; + } + // Don't count empty slices and don't add delimiter to those + if !word_slice.is_empty() { + formatted_str = formatted_str + &word_slice; + words_passed += 1; + if word_count == words_passed { + formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); + break; + } else { + formatted_str = formatted_str + inner_delimiter.unwrap_or(""); + } } else { - formatted_str = formatted_str + inner_delimiter.unwrap_or("") + continue; } } else { formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); diff --git a/tests/test-config.toml b/tests/test-config.toml index 558d216..2c5ac96 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,5 +61,5 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ] +fields = [ "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] case = "lowercase" -- cgit v1.2.3 From 0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb Mon Sep 17 00:00:00 2001 From: lukeflo Date: Sun, 12 Oct 2025 23:01:17 +0200 Subject: ignore list for words, but need to solve inner delimiter problem for words ignored --- src/bibiman/citekeys.rs | 317 ++------------------------------ src/bibiman/citekeys/citekey_utils.rs | 327 ++++++++++++++++++++++++++++++++++ tests/test-config.toml | 7 +- 3 files changed, 348 insertions(+), 303 deletions(-) create mode 100644 src/bibiman/citekeys/citekey_utils.rs (limited to 'tests/test-config.toml') diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs index 9d17403..2f56947 100644 --- a/src/bibiman/citekeys.rs +++ b/src/bibiman/citekeys.rs @@ -21,27 +21,33 @@ use std::{ path::{Path, 
PathBuf}, }; -use biblatex::{Bibliography, ChunksExt, Entry, Type}; +use biblatex::Bibliography; use color_eyre::eyre::{OptionExt, eyre}; -use indoc::formatdoc; use lexopt::Arg::{Long, Short}; -use owo_colors::{ - OwoColorize, - colors::{BrightBlue, Green, White}, -}; +use owo_colors::OwoColorize; use serde::{Deserialize, Serialize}; use crate::{ - bibiman::sanitize::sanitize_single_string_fully, - config::{BibiConfig, IGNORED_SPECIAL_CHARS}, + bibiman::citekeys::citekey_utils::{build_citekey, formatting_help}, + config::BibiConfig, }; +mod citekey_utils; + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum CitekeyCase { #[serde(alias = "uppercase", alias = "upper")] Upper, #[serde(alias = "lowercase", alias = "lower")] Lower, + #[serde( + alias = "camel", + alias = "camelcase", + alias = "camel_case", + alias = "uppercamelcase", + alias = "upper_camel_case" + )] + Camel, } #[derive(Debug, Default, Clone)] @@ -203,306 +209,13 @@ impl CitekeyFormatting { } } -fn formatting_help() { - let help = vec![ - formatdoc!( - "{} {}\n", - env!("CARGO_PKG_NAME").fg::().bold(), - env!("CARGO_PKG_VERSION") - ), - formatdoc!("{}", "USAGE".bold()), - formatdoc!( - "\t{} {} {} {}\n", - env!("CARGO_PKG_NAME").fg::().bold(), - "format-citekeys".bold(), - "--source=".bold(), - "--output=".bold() - ), - formatdoc!( - " - \tThis help describes the CLI usage for the citekey formatting - \tfunctionality of bibiman. The definition of patterns how the - \tcitekeys should be formatted must be set in the config file. - \tFor further informations how to use this patterns etc. see: - \t{} - ", - "https://codeberg.org/lukeflo/bibiman/src/branch/main#bibiman" - .italic() - .fg::() - ), - formatdoc!("{}", "OPTIONS".bold()), - formatdoc!( - " - \t{} - \tShow this help and exit - ", - "-h, --help".fg::().bold() - ), - formatdoc!( - " - \t{} - \tDon't apply any changes to the named files. 
Instead print all - \told citekeys and the formatted strings that would have been - \tapplied in the format: {} => {} - ", - "-d, --dry-run".fg::().bold(), - "old_key".italic(), - "new_key".bold() - ), - formatdoc! {" - \t{} - \tThe bibfile for which the citekey formatting should be processed. - \tTakes a path as argument. - ", "-s, -f, --source=, --file=".fg::().bold()}, - formatdoc!( - " - \t{} - \tThe bibfile to which the updated content should be written. - \tTakes a path as argument. If the file doesn't exist, it will be - \tcreated. - \tIf the argument isn't used, the original file will be {}! - ", - "-t, -o, --target=, --output=".fg::().bold(), - "overwritten".italic(), - ), - ]; - let help = help.join("\n"); - println!("{}", help); -} - -/// Build the citekey from the patterns defined in the config file -fn build_citekey(entry: &Entry, pattern_fields: &[String], case: Option<&CitekeyCase>) -> String { - // mut string the citekey is built from - let mut new_citekey = String::new(); - - // count different fields of pattern vec - let fields = pattern_fields.len(); - - // loop over pattern fields process them - for (idx, pattern) in pattern_fields.iter().enumerate() { - // parse single values from pattern field - let (field_name, word_count, char_count, inner_delimiter, trailing_delimiter) = - split_formatting_pat(pattern); - - // built the part of the citekey from the current pattern field - let formatted_field_str = { - let mut formatted_str = String::new(); - - // preformat the field depending on biblatex value - let field = preformat_field(field_name, entry); - - // split at whitespaces, count fields and set counter for processed - // splits - let mut split_field = field.split_whitespace(); - let mut words_passed = 0; - let field_count = field.split_whitespace().count(); - let word_count = if let Some(val) = word_count - && val <= field_count - { - val - } else { - field_count - }; - - // loop over single parts of current field and add correct delimiter - 
loop { - // terminate loop for current field if its empty. If its also the - // last of the pattern vec, pop the trailing delimiter - if field.is_empty() { - if idx + 1 == fields { - let _ = new_citekey.pop(); - } - break; - } - - // process the single slices and add correct delimiter - if let Some(field_slice) = split_field.next() { - // Create word slice char by char. We need to loop over chars - // instead of a simple bytes index to also catch chars which - // consist of more than one byte (äöüøæ etc...) - let mut word_slice = String::new(); - let word_chars = field_slice.chars(); - let mut counter = 0; - for c in word_chars { - if let Some(len) = char_count - && counter == len - { - break; - } - // if a word slice contains a special char, skip it - if IGNORED_SPECIAL_CHARS.contains(&c) { - continue; - } - word_slice.push(c); - counter += 1; - } - // Don't count empty slices and don't add delimiter to those - if !word_slice.is_empty() { - formatted_str = formatted_str + &word_slice; - words_passed += 1; - if word_count == words_passed { - formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); - break; - } else { - formatted_str = formatted_str + inner_delimiter.unwrap_or(""); - } - } else { - continue; - } - } else { - formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); - break; - }; - } - formatted_str - }; - new_citekey = new_citekey + &formatted_field_str; - } - if let Some(case_format) = case { - match case_format { - CitekeyCase::Lower => new_citekey.to_lowercase(), - CitekeyCase::Upper => new_citekey.to_uppercase(), - } - } else { - new_citekey - } -} - -/// Preformat some fields which are very common to be used in citekeys -fn preformat_field(field: &str, entry: &Entry) -> String { - match field { - "title" => { - sanitize_single_string_fully(&entry.get_as::(field).unwrap_or("".into())) - } - "author" => { - if let Ok(authors) = entry.author() { - let mut last_names = String::new(); - for a in authors.iter() { - last_names = 
last_names + &a.name + " "; - } - last_names - } else { - "".to_string() - } - } - "year" => { - if let Ok(date) = entry.date() { - date.to_chunks().format_verbatim()[..4].to_string() - } else { - entry.get_as::(field).unwrap_or("".into()) - } - } - "subtitle" => { - sanitize_single_string_fully(&entry.get_as::(field).unwrap_or("".into())) - } - "editor" => { - if let Ok(editors) = entry.editors() { - let mut last_names = String::new(); - for editortypes in editors.iter() { - for e in editortypes.0.iter() { - last_names = last_names + &e.name + " "; - } - } - last_names - } else { - "".to_string() - } - } - "pubtype" | "entrytype" => entry.entry_type.to_string(), - _ => entry.get_as::(field).unwrap_or("".into()), - } -} - -/// Cut of word at char count index if its set -fn format_word(word: &str, count: Option) -> String { - // Since chars can consist of multiple bytes, we need this more complex - // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...) - // instead of simple byte indexing - let mut word_slice = String::new(); - let word_chars = word.chars(); - let mut counter = 0; - for c in word_chars { - if let Some(len) = count - && counter == len - { - break; - } - if IGNORED_SPECIAL_CHARS.contains(&c) { - continue; - } - word_slice.push(c); - counter += 1; - } - word_slice -} - -/// Split a formatting pattern of kind -/// `;;;;`, -/// e.g.: `title;3;3;_;:` will give `("title", 3, 3, "_", ":")` -fn split_formatting_pat( - pattern: &str, -) -> ( - &str, - Option, - Option, - Option<&str>, - Option<&str>, -) { - let mut splits = pattern.split(';'); - ( - splits - .next() - .expect("Need field value for formatting citekey"), - if let Some(next) = splits.next() - && next.len() > 0 - { - next.parse::().ok() - } else { - None - }, - if let Some(next) = splits.next() - && next.len() > 0 - { - next.parse::().ok() - } else { - None - }, - splits.next(), - splits.next(), - ) -} - #[cfg(test)] mod tests { use std::path::PathBuf; use biblatex::Bibliography; 
- use itertools::Itertools; - - use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting, split_formatting_pat}; - - #[test] - fn split_citekey_pattern() { - let pattern = "title;3;5;_;_"; - - assert_eq!( - split_formatting_pat(pattern), - ("title", Some(3), Some(5), Some("_"), Some("_")) - ); - let pattern = "year"; - - assert_eq!( - split_formatting_pat(pattern), - ("year", None, None, None, None) - ); - - let pattern = "author;1;;;_"; - assert_eq!( - split_formatting_pat(pattern), - ("author", Some(1), None, Some(""), Some("_")) - ); - } + use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting}; #[test] fn format_citekey_test() { diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs new file mode 100644 index 0000000..ee2c849 --- /dev/null +++ b/src/bibiman/citekeys/citekey_utils.rs @@ -0,0 +1,327 @@ +// bibiman - a TUI for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+///// + +use biblatex::{ChunksExt, Entry, Type}; +use indoc::formatdoc; +use owo_colors::{ + OwoColorize, + colors::{BrightBlue, Green, White}, +}; + +use crate::{ + bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully}, + config::IGNORED_SPECIAL_CHARS, +}; + +const IGNORE_WORDS: [&str; 20] = [ + "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine", + "eines", "des", "auf", "und", "für", "vor", +]; + +pub(super) fn formatting_help() { + let help = vec![ + formatdoc!( + "{} {}\n", + env!("CARGO_PKG_NAME").fg::().bold(), + env!("CARGO_PKG_VERSION") + ), + formatdoc!("{}", "USAGE".bold()), + formatdoc!( + "\t{} {} {} {}\n", + env!("CARGO_PKG_NAME").fg::().bold(), + "format-citekeys".bold(), + "--source=".bold(), + "--output=".bold() + ), + formatdoc!( + " + \tThis help describes the CLI usage for the citekey formatting + \tfunctionality of bibiman. The definition of patterns how the + \tcitekeys should be formatted must be set in the config file. + \tFor further informations how to use this patterns etc. see: + \t{} + ", + "https://codeberg.org/lukeflo/bibiman/src/branch/main#bibiman" + .italic() + .fg::() + ), + formatdoc!("{}", "OPTIONS".bold()), + formatdoc!( + " + \t{} + \tShow this help and exit + ", + "-h, --help".fg::().bold() + ), + formatdoc!( + " + \t{} + \tDon't apply any changes to the named files. Instead print all + \told citekeys and the formatted strings that would have been + \tapplied in the format: {} => {} + ", + "-d, --dry-run".fg::().bold(), + "old_key".italic(), + "new_key".bold() + ), + formatdoc! {" + \t{} + \tThe bibfile for which the citekey formatting should be processed. + \tTakes a path as argument. + ", "-s, -f, --source=, --file=".fg::().bold()}, + formatdoc!( + " + \t{} + \tThe bibfile to which the updated content should be written. + \tTakes a path as argument. If the file doesn't exist, it will be + \tcreated. + \tIf the argument isn't used, the original file will be {}! 
+ ", + "-t, -o, --target=, --output=".fg::().bold(), + "overwritten".italic(), + ), + ]; + let help = help.join("\n"); + println!("{}", help); +} + +/// Build the citekey from the patterns defined in the config file +pub(super) fn build_citekey( + entry: &Entry, + pattern_fields: &[String], + case: Option<&CitekeyCase>, +) -> String { + // mut string the citekey is built from + let mut new_citekey = String::new(); + + // trailing delimiter of previous field + let mut trailing_delimiter: Option<&str> = None; + + // loop over pattern fields process them + 'field_loop: for pattern in pattern_fields.iter() { + // parse single values from pattern field + let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) = + split_formatting_pat(pattern); + + // built the part of the citekey from the current pattern field + let formatted_field_str = { + let mut formatted_str = String::new(); + + // preformat the field depending on biblatex value + let field = preformat_field(field_name, entry); + + // split at whitespaces, count fields and set counter for processed + // splits + let mut split_field = field.split_whitespace(); + let mut words_passed = 0; + let field_count = field.split_whitespace().count(); + let word_count = if let Some(val) = word_count + && val <= field_count + { + val + } else { + field_count + }; + + // If there is a trailing delimiter from the previous field, push it + if let Some(del) = trailing_delimiter { + formatted_str = del.to_string(); + }; + + // If the current field isn't empty, set trailing delimiter for + // upcoming loop repitition. 
If it's empty, start next run of loop + // directly + if !field.is_empty() { + trailing_delimiter = cur_trailing_delimiter; + } else { + continue 'field_loop; + } + + // loop over single parts of current field and add correct delimiter + 'word_loop: loop { + // process the single slices and add correct delimiter + if let Some(field_slice) = split_field.next() { + // Create word slice char by char. We need to loop over chars + // instead of a simple bytes index to also catch chars which + // consist of more than one byte (äöüøæ etc...) + let mut word_slice = String::new(); + let word_chars = field_slice.chars(); + let mut counter = 0; + 'char_loop: for mut c in word_chars { + // If camelcase is set, force first char of word to uppercase + if counter == 0 && case == Some(&CitekeyCase::Camel) { + c = c.to_ascii_uppercase() + } + if let Some(len) = char_count + && counter == len + { + break 'char_loop; + } + // if a word slice contains a special char, skip it + if IGNORED_SPECIAL_CHARS.contains(&c) { + continue 'char_loop; + } + word_slice.push(c); + counter += 1; + } + // Don't count empty slices and don't add delimiter to those + if !word_slice.is_empty() { + formatted_str = formatted_str + &word_slice; + words_passed += 1; + if word_count == words_passed { + break 'word_loop; + } else { + formatted_str = formatted_str + inner_delimiter.unwrap_or(""); + } + } else { + continue 'word_loop; + } + } else { + break 'word_loop; + }; + } + formatted_str + }; + new_citekey = new_citekey + &formatted_field_str; + } + match case { + Some(CitekeyCase::Lower) => new_citekey.to_lowercase(), + Some(CitekeyCase::Upper) => new_citekey.to_uppercase(), + _ => new_citekey, + } +} + +/// Preformat some fields which are very common to be used in citekeys +pub(super) fn preformat_field(field: &str, entry: &Entry) -> String { + match field { + // Sanitize all macro code from string + "title" => { + sanitize_single_string_fully(&entry.get_as::(field).unwrap_or("".into())) + } + // Get 
author names. Fall back to editors before setting empty string + "author" => { + if let Ok(authors) = entry.author() { + let mut last_names = String::new(); + for a in authors.iter() { + last_names = last_names + &a.name + " "; + } + last_names + } else if let Ok(editors) = entry.editors() { + let mut last_names = String::new(); + for editortypes in editors.iter() { + for e in editortypes.0.iter() { + last_names = last_names + &e.name + " "; + } + } + last_names + } else { + "".to_string() + } + } + // Get year of date field, fallback to year field + "year" => { + if let Ok(date) = entry.date() { + date.to_chunks().format_verbatim()[..4].to_string() + } else { + entry.get_as::(field).unwrap_or("".into()) + } + } + // Sanitize all macro code from string + "subtitle" => { + sanitize_single_string_fully(&entry.get_as::(field).unwrap_or("".into())) + } + "editor" => { + if let Ok(editors) = entry.editors() { + let mut last_names = String::new(); + for editortypes in editors.iter() { + for e in editortypes.0.iter() { + last_names = last_names + &e.name + " "; + } + } + last_names + } else { + "".to_string() + } + } + "pubtype" | "entrytype" => entry.entry_type.to_string(), + _ => entry.get_as::(field).unwrap_or("".into()), + } +} + +/// Split a formatting pattern of kind +/// `;;;;`, +/// e.g.: `title;3;3;_;:` will give `("title", 3, 3, "_", ":")` +pub(super) fn split_formatting_pat( + pattern: &str, +) -> ( + &str, + Option, + Option, + Option<&str>, + Option<&str>, +) { + let mut splits = pattern.split(';'); + ( + splits + .next() + .expect("Need field value for formatting citekey"), + if let Some(next) = splits.next() + && next.len() > 0 + { + next.parse::().ok() + } else { + None + }, + if let Some(next) = splits.next() + && next.len() > 0 + { + next.parse::().ok() + } else { + None + }, + splits.next(), + splits.next(), + ) +} + +#[cfg(test)] +mod test { + use crate::bibiman::citekeys::citekey_utils::split_formatting_pat; + + #[test] + fn split_citekey_pattern() { 
+ let pattern = "title;3;5;_;_"; + + assert_eq!( + split_formatting_pat(pattern), + ("title", Some(3), Some(5), Some("_"), Some("_")) + ); + + let pattern = "year"; + + assert_eq!( + split_formatting_pat(pattern), + ("year", None, None, None, None) + ); + + let pattern = "author;1;;;_"; + assert_eq!( + split_formatting_pat(pattern), + ("author", Some(1), None, Some(""), Some("_")) + ); + } +} diff --git a/tests/test-config.toml b/tests/test-config.toml index 2c5ac96..d3e42c5 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,5 +61,10 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = [ "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] +fields = ["entrytype;;;;:", "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] +# fields = [ # CamelCase test +# "author;2;;;", +# "title;5;5;;", +# "year" +# ] case = "lowercase" -- cgit v1.2.3 From 467851007e1861834326deee3116aa88fe839f5a Mon Sep 17 00:00:00 2001 From: lukeflo Date: Mon, 13 Oct 2025 15:45:53 +0200 Subject: Working proof of concept of citekey formatting --- CITEKEYS.md | 215 +++++++++++++++ Cargo.lock | 7 + Cargo.toml | 1 + README.md | 20 ++ src/bibiman/citekeys.rs | 69 +++-- src/bibiman/citekeys/citekey_utils.rs | 105 ++++---- src/config.rs | 69 +++++ tests/biblatex-test-citekeys.bib | 476 ++++++++++++++++++++++++++++++++++ tests/test-config.toml | 5 +- 9 files changed, 889 insertions(+), 78 deletions(-) create mode 100644 CITEKEYS.md create mode 100644 tests/biblatex-test-citekeys.bib (limited to 'tests/test-config.toml') diff --git a/CITEKEYS.md b/CITEKEYS.md new file mode 100644 index 0000000..912326a --- /dev/null +++ b/CITEKEYS.md @@ -0,0 +1,215 @@ +# Formatting Citekeys + + + +- [Formatting Citekeys](#formatting-citekeys) + - [Settings](#settings) + - [Building Patterns](#building-patterns) + - [Ignore Lists and Char Case](#ignore-lists-and-char-case) + - [General Tipps](#general-tipps) + - [Examples](#examples) + + + +`bibiman` offers 
the possibility to create new citekeys from the fields of +BibLaTeX entries. This is done using an easy but powerful pattern-matching +syntax. + +## Settings + +All settings for the citekey generation have to be configured in the config +file in use. The regular path is `XDG_CONFIG_HOME/bibiman/bibiman.toml`. But it can be +set dynamically with the `-c`/`--config=` global option. + +The following values can be set through the config file. A detailed explanation of +all fields follows below: + +```toml +[citekey_formatter] +fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ] +case = "lowercase" +ascii_only = true +ignored_chars = [ + "?", "!", "\\", "'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", "\"", +] +ignored_words = [ + "the", + "a", + "an", + "of", + "for", + "in", + "at", + "to", + "and", + "der", + "die", + "das", + "ein", + "eine", + "eines", + "des", + "auf", + "und", + "für", + "vor", +] +``` + +## Building Patterns + +The main building blocks for generating citekeys are the field patterns. They can be set +through an array in the config file where every array-item represents a single +BibLaTeX field to be used for generating a part of the citekey. + +Every field pattern consists of the following five parts separated by +semicolons. The general pattern looks like this (every subfield is explained +below): + +*biblatex field name* **;** *max word count* **;** *max char count* **;** *inner delimiter* **;** *trailing delimiter* + +- **BibLaTeX field**: the first part represents the field name whose value + should be used to generate the content part of the citekey. Theoretically, any + BibLaTeX field can be selected by name. But there are some fields which are + much more common than others; e.g. `author`, `editor`, `title`, `year`/`date` + or `entrytype`.
Those very common fields are preprocessed; meaning that for + instance LaTeX macros are fully stripped from the strings, or that `editor` is + a fallback value for `author` if the latter is empty (however, setting + `editor` explicitly is still possible). Also using `year` will parse the + `date` field too, to ensure a year number. +- **Max Word**: Defines the maximum number of words to be used from the named + field. E.g. if the title consists of five words, and the max counter is set to + `3`, only the first three words will be used. +- **Max Chars/Word**: Defines how many chars, counting from the start, of each + word will be used to build the citekey. If for instance the value is set to + `5`, only the first five chars of any word will be used. Thus, "archaeology" + would be stripped down to "archa". +- **Inner Delimiter**: Sets the delimiter char used between words from the + currently named field; e.g. to separate the words of the `title` field. +- **Trailing Delimiter**: Sets the delimiter which separates the current field's + value from the following. This delimiter is only printed if the following + field has some content. + +For example, to use the `title` field, print at most three words and of those +only the first five chars, single words separated by underscore and the whole +field separated by equal sign, insert the following pattern field into the +`fields` array: + +`title;3;5;_;=` + +Except for the BibLaTeX field name, all other parts of the pattern can be left +blank. If the field name is the only value set, semicolon delimiters are also +not necessary. But if only one of the following parts should be set, all +delimiters need to be used. E.g. those are both valid: `title` or `title;;;_;=`. +The first would print all words of the title, no matter the length, not +separated by any char. The last would also print all words of the title, but +single words separated by underscores and the whole pattern value separated from +the following by an equal sign.
This is not valid: `title;;_` since `bibiman` +can't know if the underscore means a delimiter (and which) or the max char +count. + +The pattern array inside the config file takes multiple pattern fields like the +preceding one. This allows an elaborate citekey pattern which takes into account +multiple fields. + +## Ignore Lists and Char Case + +Besides the field patterns there are some other options to define how citekeys +should be built. + +`ascii_only=` +: If set to `true`, which is the default, non-ascii chars are mapped to their + ascii equivalent. For example, the German `ä` would be mapped to `a`. The + Turkish `ş` or Greek `σ`/`ς` would be mapped to `s`. If set to `false` all are + kept as they are. But this could lead to errors running LaTeX on the file. + +`case=` +: If used, sets the case of the chars in the citekey. Valid values are + `uppercase`, `lowercase` or `camelcase`. The first two should be clear; the + last means typical camel case also beginning the *first word* with an + uppercase letter; also referred to as upper camel case or Pascal case. + +`ignored_chars=` +: Defines chars which should be ignored during parsing (meaning they are not printed). + The default list contains 33 special chars and is part of the default config + file (in commented-out state). Be aware, setting this key will completely + overwrite the default list! + +`ignored_words=` +: A list of words which should be ignored when parsing field values. The default list + contains about 20 very commonly used words in English and German; like + articles, pronouns or connector words. Like with `ignored_chars` setting this + key will completely overwrite the default list! + +## General Tipps + +- Most importantly: *always use the **`--dry-run`** option first*! This will + print a list of old and new values for all citekeys in the file without + changing anything. +- After finding a good overall pattern, *use the `--output=` option* to create a + new file and don't overwrite your existing file.
Thus, your original file + isn't broken if the key formatter produces some unwanted output. +- Even though very long patterns are possible, they are not encouraged, since they bloat + the bibfiles. +- The same applies to *too short* patterns; if the pattern is too unspecific, + it bears the risk of producing duplicates (e.g. single author and year only). + But the citekey generator will not check for duplicates! +- It is possible to keep special chars and use them as delimiters. But this + might cause problems with other programs and CLI tools in particular, since many + special chars are reserved for shell operations. For instance, it will very + likely break the note file feature of `bibiman` which doesn't accept many + special chars. + +## Examples + +To make the process clearer, a few examples might help. The following bibfile is +assumed: + +```latex +@article{Bos2023, + title = {{LaTeX}, metadata, and publishing workflows}, + author = {Bos, Joppe W. and {McCurley}, Kevin S.}, + year = {2023}, + month = apr, + journal = {arXiv}, + number = {{arXiv}:2301.08277}, + doi = {10.48550/arXiv.2301.08277}, + url = {http://arxiv.org/abs/2301.08277}, + urldate = {2023-08-22}, + note = {type: article}, +} +@book{Bhambra2021, + title = {Colonialism and \textbf{Modern Social Theory}}, + author = {Bhambra, Gurminder K. and Holmwood, John}, + location = {Cambridge and Medford}, + publisher = {Polity Press}, + date = {2021}, +} +``` + +And the following values set in the config file: + +```toml +fields = [ + # Just print the whole entrytype and a colon as trailing delimiter + "entrytype;;;;:", + # Print all author names in full length, names separated by dash, + # the whole field by underscore + "author;;;-;_", + # Print first 4 words of title, first 3 chars of every word only. Title words + # separated by equal sign, the whole field by underscore + "title;4;3;=;_", + # Print all words of location, but only first 4 chars of every word.
Single words + # separated by colon, whole field by underscore + "location;;4;:;_", + # Just print the whole year + "year", +] +case = "lowercase" +ascii_only = true +``` + +The combination of these settings will produce the following citekeys: + +- **`article:bos-mccurley_lat=met=pub=wor_2023`** +- **`book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021`** diff --git a/Cargo.lock b/Cargo.lock index a27636e..0adb4e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -103,6 +103,7 @@ dependencies = [ "biblatex", "color-eyre", "crossterm", + "deunicode", "dirs", "editor-command", "figment", @@ -323,6 +324,12 @@ dependencies = [ "syn", ] +[[package]] +name = "deunicode" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04" + [[package]] name = "dirs" version = "5.0.1" diff --git a/Cargo.toml b/Cargo.toml index abf1eee..0c07c51 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ owo-colors = "4.2.2" logos = "0.15.1" phf = { version = "0.13.1", features = ["macros"] } indoc = "2.0.6" +deunicode = "1.6.2" [workspace.metadata.cross.target.aarch64-unknown-linux-gnu] # Install libssl-dev:arm64, see diff --git a/README.md b/README.md index 4929509..3fb81c8 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,11 @@ - [Ubuntu/Debian](#ubuntudebian) - [Void Linux](#void-linux) - [Usage](#usage) + - [CLI for citekey formatting](#cli-for-citekey-formatting) - [Configuration](#configuration) - [Location of Config File](#location-of-config-file) - [General Configuration](#general-configuration) + - [Citekey formatting](#citekey-formatting) - [Color Configuration](#color-configuration) - [Features](#features) - [Keybindings](#keybindings) @@ -196,6 +198,13 @@ bibman tests/multi-files/ bibiman tests/biblatex-test.bib tests/multi-files/ ``` +### CLI for citekey formatting + +Besides the TUI, `bibiman` can format and replace citekeys. 
To make use of this +feature run the program with the `format-citekeys` subcommand. For more +information on this use `bibiman format-citekeys --help` and the +[docs](./CITEKEYS.md). + ## Configuration ### Location of Config File @@ -268,6 +277,11 @@ note_symbol = "󰧮" ## Possible values are "journaltitle", "organization", "instituion", "publisher" ## and "pubtype" (which is the default) custom_column = "pubtype" + +[citekey_formatter] +fields = [] +ascii_only = true +case = "lowercase" ``` `bibfiles` @@ -326,6 +340,12 @@ custom_column = "pubtype" good advice to use a rather wide terminal window when using a value like `journaltitle`. +### Citekey formatting + +`bibiman` now also offers a citekey generating feature. This makes it possible to reformat +all citekeys based on an elaborate pattern matching syntax. For further +information and examples see the [docs](CITEKEYS.md). + ### Color Configuration Furthermore, it is now possible to customize the colors. The following values diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs index 2f56947..0cec28e 100644 --- a/src/bibiman/citekeys.rs +++ b/src/bibiman/citekeys.rs @@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize}; use crate::{ bibiman::citekeys::citekey_utils::{build_citekey, formatting_help}, - config::BibiConfig, + config::{BibiConfig, IGNORED_SPECIAL_CHARS, IGNORED_WORDS}, }; mod citekey_utils; @@ -60,6 +60,7 @@ pub(crate) struct CitekeyFormatting { case: Option, old_new_keys_map: Vec<(String, String)>, dry_run: bool, + ascii_only: bool, } impl CitekeyFormatting { @@ -69,14 +70,15 @@ impl CitekeyFormatting { ) -> color_eyre::Result<()> { let mut formatter = CitekeyFormatting::default(); - formatter.fields = cfg - .citekey_formatter - .fields - .clone() - .ok_or_eyre("Need to define fields correctly in config file")?; + formatter.fields = cfg.citekey_formatter.fields.clone().ok_or_eyre(format!( + "Need to define {} correctly in config file", + "citekey pattern fields".red() + ))?; formatter.case = 
cfg.citekey_formatter.case.clone(); + formatter.ascii_only = cfg.citekey_formatter.ascii_only; + if formatter.fields.is_empty() { return Err(eyre!( "To format all citekeys, you need to provide {} values in the config file", @@ -105,13 +107,26 @@ impl CitekeyFormatting { formatter.bib_entries = Bibliography::parse(&bibstring) .map_err(|e| eyre!("Couldn't parse bibfile due to {}", e.kind))?; + let ignored_chars = if let Some(chars) = &cfg.citekey_formatter.ignored_chars { + chars.as_slice() + } else { + IGNORED_SPECIAL_CHARS.as_slice() + }; + + let ignored_words = if let Some(words) = &cfg.citekey_formatter.ignored_words { + words.as_slice() + } else { + &*IGNORED_WORDS.as_slice() + }; + formatter - .do_formatting() + .do_formatting(ignored_chars, ignored_words) .rev_sort_new_keys_by_len() .update_file()?; Ok(()) } + /// Start Citekey formatting with building a new instance of `CitekeyFormatting` /// Formatting is processed file by file, because `bibman` can handle /// multi-file setups. @@ -144,16 +159,24 @@ impl CitekeyFormatting { case: cfg.citekey_formatter.case.clone(), old_new_keys_map: Vec::new(), dry_run: false, + ascii_only: cfg.citekey_formatter.ascii_only, }) } /// Process the actual formatting. The citekey of every entry will be updated. 
- pub fn do_formatting(&mut self) -> &mut Self { + pub fn do_formatting(&mut self, ignored_chars: &[char], ignored_words: &[String]) -> &mut Self { let mut old_new_keys: Vec<(String, String)> = Vec::new(); for entry in self.bib_entries.iter() { old_new_keys.push(( entry.key.clone(), - build_citekey(entry, &self.fields, self.case.as_ref()), + build_citekey( + entry, + &self.fields, + self.case.as_ref(), + self.ascii_only, + ignored_chars, + ignored_words, + ), )); } @@ -215,12 +238,15 @@ mod tests { use biblatex::Bibliography; - use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting}; + use crate::{ + bibiman::citekeys::{CitekeyCase, CitekeyFormatting}, + config::{IGNORED_SPECIAL_CHARS, IGNORED_WORDS}, + }; #[test] fn format_citekey_test() { let src = r" - @article{bos_latex_metadata_and_publishing_workflows_2023, + @article{Bos2023, title = {{LaTeX}, metadata, and publishing workflows}, author = {Bos, Joppe W. and {McCurley}, Kevin S.}, year = {2023}, @@ -232,7 +258,7 @@ mod tests { urldate = {2023-08-22}, note = {type: article}, } - @book{bhambra_colonialism_social_theory_2021, + @book{Bhambra2021, title = {Colonialism and \textbf{Modern Social Theory}}, author = {Bhambra, Gurminder K. 
and Holmwood, John}, location = {Cambridge and Medford}, @@ -247,29 +273,24 @@ mod tests { fields: vec![ "entrytype;;;;:".into(), "author;;;-;_".into(), - "title;4;3;_;_".into(), + "title;4;3;=;_".into(), "location;;4;:;_".into(), "year".into(), ], - case: None, + case: Some(CitekeyCase::Lower), old_new_keys_map: Vec::new(), dry_run: false, + ascii_only: true, }; - let _ = formatting_struct.do_formatting(); + let _ = formatting_struct + .do_formatting(IGNORED_SPECIAL_CHARS.as_slice(), &*IGNORED_WORDS.as_slice()); assert_eq!( formatting_struct.old_new_keys_map.get(0).unwrap().1, - "article:Bos-McCurley_LaT_met_and_pub_Empt_2023" + "article:bos-mccurley_lat=met=pub=wor_2023" ); assert_eq!( formatting_struct.old_new_keys_map.get(1).unwrap().1, - "book:Bhambra-Holmwood_Col_and_Mod_Soc_Camb:and:Medf_2021" - ); - formatting_struct.case = Some(CitekeyCase::Lower); - let _ = formatting_struct.do_formatting().rev_sort_new_keys_by_len(); - // now the longer citekey is processed first and its in lowercase! 
- assert_eq!( - formatting_struct.old_new_keys_map.get(0).unwrap().1, - "book:bhambra-holmwood_col_and_mod_soc_camb:and:medf_2021" + "book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021" ); } diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs index ee2c849..5f70224 100644 --- a/src/bibiman/citekeys/citekey_utils.rs +++ b/src/bibiman/citekeys/citekey_utils.rs @@ -16,21 +16,14 @@ ///// use biblatex::{ChunksExt, Entry, Type}; +use deunicode::deunicode; use indoc::formatdoc; use owo_colors::{ OwoColorize, colors::{BrightBlue, Green, White}, }; -use crate::{ - bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully}, - config::IGNORED_SPECIAL_CHARS, -}; - -const IGNORE_WORDS: [&str; 20] = [ - "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine", - "eines", "des", "auf", "und", "für", "vor", -]; +use crate::bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully}; pub(super) fn formatting_help() { let help = vec![ @@ -104,6 +97,9 @@ pub(super) fn build_citekey( entry: &Entry, pattern_fields: &[String], case: Option<&CitekeyCase>, + ascii_only: bool, + ignored_chars: &[char], + ignored_words: &[String], ) -> String { // mut string the citekey is built from let mut new_citekey = String::new(); @@ -114,7 +110,7 @@ pub(super) fn build_citekey( // loop over pattern fields process them 'field_loop: for pattern in pattern_fields.iter() { // parse single values from pattern field - let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) = + let (field_name, max_words, max_chars, inner_delimiter, cur_trailing_delimiter) = split_formatting_pat(pattern); // built the part of the citekey from the current pattern field @@ -126,16 +122,9 @@ pub(super) fn build_citekey( // split at whitespaces, count fields and set counter for processed // splits - let mut split_field = field.split_whitespace(); + let split_field = field.split_whitespace(); let mut 
words_passed = 0; let field_count = field.split_whitespace().count(); - let word_count = if let Some(val) = word_count - && val <= field_count - { - val - } else { - field_count - }; // If there is a trailing delimiter from the previous field, push it if let Some(del) = trailing_delimiter { @@ -152,47 +141,57 @@ pub(super) fn build_citekey( } // loop over single parts of current field and add correct delimiter - 'word_loop: loop { - // process the single slices and add correct delimiter - if let Some(field_slice) = split_field.next() { - // Create word slice char by char. We need to loop over chars - // instead of a simple bytes index to also catch chars which - // consist of more than one byte (äöüøæ etc...) - let mut word_slice = String::new(); - let word_chars = field_slice.chars(); - let mut counter = 0; - 'char_loop: for mut c in word_chars { - // If camelcase is set, force first char of word to uppercase - if counter == 0 && case == Some(&CitekeyCase::Camel) { - c = c.to_ascii_uppercase() - } - if let Some(len) = char_count - && counter == len - { - break 'char_loop; - } - // if a word slice contains a special char, skip it - if IGNORED_SPECIAL_CHARS.contains(&c) { - continue 'char_loop; - } + // process the single slices and add correct delimiter + 'word_loop: for (idx, field_slice) in split_field.enumerate() { + // if the current slice is a common word from the ignore list, + // skip it. + if ignored_words.contains(&field_slice.to_lowercase()) { + continue; + } + + // Create word slice char by char. We need to loop over chars + // instead of a simple bytes index to also catch chars which + // consist of more than one byte (äöüøæ etc...) 
+ let mut word_slice = String::new(); + let word_chars = field_slice.chars(); + let mut counter = 0; + 'char_loop: for mut c in word_chars { + // If camelcase is set, force first char of word to uppercase + if counter == 0 && case == Some(&CitekeyCase::Camel) { + c = c.to_ascii_uppercase() + } + if let Some(len) = max_chars + && counter >= len + { + break 'char_loop; + } + // if a word slice contains a special char, skip it + if ignored_chars.contains(&c) { + continue 'char_loop; + } + // if non-ascii chars should be mapped, check if needed and do it + if let Some(chars) = deunicode::deunicode_char(c) + && ascii_only + { + word_slice.push_str(chars); + counter += chars.len(); + } else { word_slice.push(c); counter += 1; } - // Don't count empty slices and don't add delimiter to those - if !word_slice.is_empty() { - formatted_str = formatted_str + &word_slice; - words_passed += 1; - if word_count == words_passed { - break 'word_loop; - } else { - formatted_str = formatted_str + inner_delimiter.unwrap_or(""); - } + } + // Don't count empty slices and don't add delimiter to those + if !word_slice.is_empty() { + formatted_str = formatted_str + &word_slice; + words_passed += 1; + if max_words.is_some_and(|max| max == words_passed) || idx + 1 == field_count { + break 'word_loop; } else { - continue 'word_loop; + formatted_str = formatted_str + inner_delimiter.unwrap_or(""); } } else { - break 'word_loop; - }; + continue 'word_loop; + } } formatted_str }; diff --git a/src/config.rs b/src/config.rs index b1c4b07..7c1a0f8 100644 --- a/src/config.rs +++ b/src/config.rs @@ -20,6 +20,7 @@ use std::{ io::{Write, stdin}, path::PathBuf, str::FromStr, + sync::LazyLock, }; use color_eyre::{eyre::Result, owo_colors::OwoColorize}; @@ -40,6 +41,31 @@ pub const IGNORED_SPECIAL_CHARS: [char; 33] = [ '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"', ]; +pub static IGNORED_WORDS: LazyLock> = LazyLock::new(|| { + vec![ + String::from("the"), + String::from("a"), + 
String::from("an"), + String::from("of"), + String::from("for"), + String::from("in"), + String::from("at"), + String::from("to"), + String::from("and"), + String::from("der"), + String::from("die"), + String::from("das"), + String::from("ein"), + String::from("eine"), + String::from("eines"), + String::from("des"), + String::from("auf"), + String::from("und"), + String::from("für"), + String::from("vor"), + ] +}); + const DEFAULT_CONFIG: &str = r##" # [general] ## Default files/dirs which are loaded on startup @@ -118,6 +144,40 @@ const DEFAULT_CONFIG: &str = r##" ## Convert chars to specified case. Possible values: ## "upper", "uppercase", "lower", "lowercase" # case = "lowercase" + +## Map all unicode chars to their pure ascii equivalent +# ascii_only = true + +## List of special chars that'll be ignored when building citekeys. +## A custom list will overwrite the default list +# ignored_chars = [ +# "?", "!", "\\", "\'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", """, +# ] + +## List of words that'll be ignored when building citekeys. +## A custom list will overwrite the default list +# ignored_words = [ +# "the", +# "a", +# "an", +# "of", +# "for", +# "in", +# "at", +# "to", +# "and", +# "der", +# "die", +# "das", +# "ein", +# "eine", +# "eines", +# "des", +# "auf", +# "und", +# "für", +# "vor", +# ] "##; /// Main struct of the config file. 
Contains substructs/headings in toml @@ -171,6 +231,9 @@ pub struct Colors { pub struct CitekeyFormatter { pub fields: Option>, pub case: Option, + pub ascii_only: bool, + pub ignored_chars: Option>, + pub ignored_words: Option>, } impl Default for BibiConfig { @@ -194,6 +257,9 @@ impl Default for BibiConfig { citekey_formatter: CitekeyFormatter { fields: None, case: None, + ascii_only: true, + ignored_chars: None, + ignored_words: None, }, } } @@ -224,6 +290,9 @@ impl BibiConfig { citekey_formatter: CitekeyFormatter { fields: None, case: None, + ascii_only: true, + ignored_chars: None, + ignored_words: None, }, } } diff --git a/tests/biblatex-test-citekeys.bib b/tests/biblatex-test-citekeys.bib new file mode 100644 index 0000000..9767f97 --- /dev/null +++ b/tests/biblatex-test-citekeys.bib @@ -0,0 +1,476 @@ +@set{set, + entryset = {article:herrmann-ofele_carboc=carben=as_2006,article:aksin-turkmen_effect=immobi=on_2006,article:yoon-ryu_pallad=pincer=comple_2006}, + annotation = {A \texttt{set} with three members.}, +} + +@set{set, + entryset = {article:glashow_partia=symmet=weak_1961,article:weinberg_model=lepton_1967,salam}, + annotation = {A \texttt{set} with three members discussing the standard + model of particle physics.}, +} + +@collection{collection:matuz-miller_contem=litera=critic_1990gale, + title = {Contemporary Literary Criticism}, + year = {1990}, + location = {Detroit}, + publisher = {Gale}, + volume = {61}, + pages = {204--208}, + editor = {Matuz, Roger and Miller, Helen}, + keywords = {narration}, + langid = {english}, + langidopts = {variant=american}, + annotation = {A \texttt{collection} entry providing the excerpt information + for the \texttt{article:doody_heming=style=jakes_1974} entry. 
Note the format of the \texttt{ + pages} field}, +} + +@article{article:aksin-turkmen_effect=immobi=on_2006, + title = {Effect of immobilization on catalytic characteristics of saturated + {Pd-N}-heterocyclic carbenes in {Mizoroki-Heck} reactions}, + author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok , Levent and + { \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\" u}y{ \"u}kg{\"u} + ng{ \" o}r, Orhan and {\"O}zkal, Erhan}, + volume = {691}, + number = {13}, + pages = {3027--3036}, + journaltitle = jomch, + date = {2006}, + indextitle = {Effect of immobilization on catalytic characteristics}, +} + +@article{article:angenendt_honore=salvat=vom_2002, + title = {In Honore Salvatoris~-- Vom Sinn und Unsinn der Patrozinienkunde}, + shorttitle = {In Honore Salvatoris}, + author = {Angenendt, Arnold}, + volume = {97}, + pages = {431--456, 791--823}, + journaltitle = {Revue d'Histoire Eccl{\'e}siastique}, + date = {2002}, + langid = {german}, + indextitle = {In Honore Salvatoris}, + annotation = {A German article in a French journal. Apart from that, a + typical \texttt{article} entry. Note the \texttt{indextitle} + field}, +} + +@book{book:aristotle_de=anima_1907cambr#unive#press, + title = {De Anima}, + author = {Aristotle}, + location = {Cambridge}, + publisher = cup, + date = {1907}, + editor = {Hicks, Robert Drew}, + keywords = {primary, ancient, philosophy, athens}, + langid = {english}, + langidopts = {variant=british}, + annotation = {A \texttt{book} entry with an \texttt{author} and an \texttt{ + editor}}, +} + +@book{book:aristotle_physic_1929g#p#putna, + title = {Physics}, + shorttitle = {Physics}, + author = {Aristotle}, + location = {New York}, + publisher = {G. P. Putnam}, + url = {https://www.infobooks.org/authors/classic/aristotle-books/#Physic}, + date = {1929}, + translator = {Wicksteed, P. H. and Cornford, F. 
M.}, + keywords = {primary, ancient, philosophy}, + langid = {english}, + langidopts = {variant=american}, + file = {~/Documents/coding/projects/bibiman/tests/book:aristotle_physic_1929g#p#putna.pdf}, + annotation = {A \texttt{book} entry with a \texttt{translator} field}, + abstract = {The Physics is a work by Aristotle dedicated to the study of + nature. Regarded by Heidegger as "the fundamental work of Western + philosophy", it presents the renowned distinction between the + four types of cause, as well as reflections on chance, motion, + infinity, and other fundamental concepts. It is here that + Aristotle sets out his celebrated paradox of time.}, +} + +@book{book:aristotle_poetic_1968clare#press, + title = {Poetics}, + shorttitle = {Poetics}, + author = {Aristotle}, + location = {Oxford}, + publisher = {Clarendon Press}, + series = {Clarendon {Aristotle}}, + date = {1968}, + editor = {Lucas, D. W.}, + keywords = {primary}, + langid = {english}, + langidopts = {variant=british}, + annotation = {A \texttt{book} entry with an \texttt{author} and an \texttt{ + editor} as well as a \texttt{series} field}, +} + +@mvbook{mvbook:aristotle_rhetor=aristo=with_1877cambr#unive#press, + title = {The \textbf{Rhetoric} of {Aristotle} with a commentary by the late {Edward + Meredith Cope}}, + shorttitle = {Rhetoric}, + author = {Aristotle}, + publisher = cup, + date = {1877}, + editor = {Cope, Edward Meredith}, + commentator = {Cope, Edward Meredith}, + volumes = {3}, + keywords = {primary}, + langid = {english}, + langidopts = {variant=british}, + sorttitle = {Rhetoric of Aristotle}, + indextitle = {Rhetoric of {Aristotle}, The}, + annotation = {A commented edition. 
Note the concatenation of the \texttt{ + editor} and \texttt{commentator} fields as well as the \texttt{ + volumes}, \texttt{sorttitle}, and \texttt{indextitle} fields}, +} + +@book{book:augustine_hetero=cataly=synthe_1995marce#dekke, + title = {Heterogeneous catalysis for the synthetic \textit{chemist}}, + shorttitle = {Heterogeneous catalysis}, + author = {Augustine, Robert L.}, + location = {New York}, + publisher = {Marcel Dekker}, + date = {1995}, + langid = {english}, + langidopts = {variant=american}, + annotation = {A plain \texttt{book} entry}, + keywords = {chemistry}, +} + +@book{book:averroes_epistl=on=possib_1982jewis#theol#semin#ameri, + title = {The Epistle on the Possibility of Conjunction with the Active + Intellect by {Ibn Rushd} with the Commentary of {Moses Narboni}}, + shorttitle = {Possibility of Conjunction}, + author = {Averroes}, + location = {New York}, + publisher = {Jewish Theological Seminary of America}, + series = {Moreshet: Studies in {Jewish} History, Literature and Thought}, + number = {7}, + date = {1982}, + editor = {Bland, Kalman P.}, + translator = {Bland, Kalman P.}, + keywords = {primary}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Epistle on the Possibility of Conjunction, The}, + annotation = {A \texttt{book} entry with a \texttt{series} and a \texttt{ + number}. Note the concatenation of the \texttt{editor} and + \texttt{translator} fields as well as the \texttt{indextitle} + field}, +} + +@article{article:baez-lauda_higher=algebr=v_2004, + title = {Higher-Dimensional Algebra {V}: 2-Groups}, + author = {Baez, John C. and Lauda, Aaron D.}, + volume = {12}, + pages = {423--491}, + journaltitle = {Theory and Applications of Categories}, + date = {2004}, + version = {3}, + eprint = {math/0307200v3}, + eprinttype = {arxiv}, + langid = {english}, + keywords = {math}, + langidopts = {variant=american}, + annotation = {An \texttt{article} with \texttt{eprint} and \texttt{ + eprinttype} fields. 
Note that the arXiv reference is + transformed into a clickable link if \texttt{hyperref} support + has been enabled. Compare \texttt{baez\slash online}, which is + the same item given as an \texttt{online} entry}, +} + +@article{article:bertram-wentworth_gromov=invari=holomo_1996, + title = {Gromov invariants for holomorphic maps on {Riemann} surfaces}, + shorttitle = {Gromov invariants}, + author = {Bertram, Aaron and Wentworth, Richard}, + volume = {9}, + number = {2}, + pages = {529--571}, + journaltitle = jams, + date = {1996}, + langid = {english}, + langidopts = {variant=american}, + annotation = {An \texttt{article} entry with a \texttt{volume} and a \texttt + {number} field}, +} + +@article{article:doody_heming=style=jakes_1974, + title = {Hemingway's Style and {Jake's} Narration}, + author = {Doody, Terrence}, + year = {1974}, + journal = {The Journal of Narrative Technique}, + volume = {4}, + number = {3}, + pages = {212--225}, + langid = {english}, + langidopts = {variant=american}, + related = {matuz:article:doody_heming=style=jakes_1974}, + relatedstring = {\autocap{e}xcerpt in}, + annotation = {An \texttt{article} entry cited as an excerpt from a \texttt{ + collection} entry. Note the format of the \texttt{related} and + \texttt{relatedstring} fields}, +} + +@article{article:gillies_herder=prepar=goethe_1933, + title = {Herder and the Preparation of {Goethe's} Idea of World Literature}, + author = {Gillies, Alexander}, + series = {newseries}, + volume = {9}, + pages = {46--67}, + journaltitle = {Publications of the English Goethe Society}, + date = {1933}, + langid = {english}, + langidopts = {variant=british}, + annotation = {An \texttt{article} entry with a \texttt{series} and a \texttt + {volume} field. 
Note that format of the \texttt{series} field + in the database file}, +} + +@article{article:glashow_partia=symmet=weak_1961, + title = {Partial Symmetries of Weak Interactions}, + author = {Glashow, Sheldon}, + volume = {22}, + pages = {579--588}, + journaltitle = {Nucl.~Phys.}, + date = {1961}, +} + +@article{article:herrmann-ofele_carboc=carben=as_2006, + title = {A carbocyclic carbene as an efficient catalyst ligand for {C--C} + coupling reactions}, + author = {Herrmann, Wolfgang A. and {\"O}fele, Karl and Schneider, Sabine K. + and Herdtweck, Eberhardt and Hoffmann, Stephan D.}, + volume = {45}, + number = {23}, + pages = {3859--3862}, + journaltitle = anch-ie, + date = {2006}, + indextitle = {Carbocyclic carbene as an efficient catalyst, A}, +} + +@article{article:hostetler-wingate_alkane=gold=cluste_1998, + title = {Alkanethiolate gold cluster molecules with core diameters from 1.5 + to 5.2~{nm}}, + shorttitle = {Alkanethiolate gold cluster molecules}, + author = {Hostetler, Michael J. and Wingate, Julia E. and Zhong, Chuan-Jian + and Harris, Jay E. and Vachet, Richard W. and Clark, Michael R. and + Londono, J. David and Green, Stephen J. and Stokes, Jennifer J. and + Wignall, George D. and Glish, Gary L. and Porter, Marc D. and Evans + , Neal D. and Murray, Royce W.}, + volume = {14}, + number = {1}, + pages = {17--30}, + journaltitle = {Langmuir}, + date = {1998}, + subtitle = {Core and monolayer properties as a function of core size}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Alkanethiolate gold cluster molecules}, + annotation = {An \texttt{article} entry with \arabic{author} authors. By + default, long author and editor lists are automatically + truncated. This is configurable}, +} + +@article{article:kastenholz-hunenberger_comput=method=ionic_2006, + title = {Computation of methodology\hyphen independent ionic solvation free + energies from molecular simulations}, + author = {Kastenholz, M. A. 
and H{\"u}nenberger, Philippe H.}, + volume = {124}, + doi = {10.1063/1.2172593}, + journaltitle = jchph, + date = {2006}, + subtitle = {{I}. {The} electrostatic potential in molecular liquids}, + eid = {124106}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Computation of ionic solvation free energies}, + annotation = {An \texttt{article} entry with an \texttt{eid} and a \texttt{ + doi} field. Note that the \textsc{doi} is transformed into a + clickable link if \texttt{hyperref} support has been enabled}, + abstract = {The computation of \texttt{ionic} solvation free energies from atomistic + simulations is a surprisingly difficult problem that has found no + satisfactory solution for more than 15 years. The reason is that + the charging free energies evaluated from such simulations are + affected by very large errors. One of these is related to the + choice of a specific convention for summing up the contributions + of solvent charges to the electrostatic potential in the ionic + cavity, namely, on the basis of point charges within entire + solvent molecules (M scheme) or on the basis of individual point + charges (P scheme). The use of an inappropriate convention may + lead to a charge-independent offset in the calculated potential, + which depends on the details of the summation scheme, on the + quadrupole-moment trace of the solvent molecule, and on the + approximate form used to represent electrostatic interactions in + the system. However, whether the M or P scheme (if any) + represents the appropriate convention is still a matter of + on-going debate. The goal of the present article is to settle + this long-standing controversy by carefully analyzing (both + analytically and numerically) the properties of the electrostatic + potential in molecular liquids (and inside cavities within them). 
+ }, +} + +@article{article:sarfraz-razzak_techni=sectio=algori_2002, + title = {Technical section: {An} algorithm for automatic capturing of the + font outlines}, + author = {M. Sarfraz and M. F. A. Razzak}, + year = {2002}, + journal = {Computers and Graphics}, + volume = {26}, + number = {5}, + pages = {795--804}, + issn = {0097-8493}, + annotation = {An \texttt{article} entry with an \texttt{issn} field}, +} + +@article{article:reese_georgi=anglos=diplom_1958, + title = {Georgia in {Anglo-Spanish} Diplomacy, 1736--1739}, + author = {Reese, Trevor R.}, + series = {3}, + volume = {15}, + pages = {168--190}, + journaltitle = {William and Mary Quarterly}, + date = {1958}, + langid = {english}, + langidopts = {variant=american}, + annotation = {An \texttt{article} entry with a \texttt{series} and a \texttt + {volume} field. Note the format of the series. If the value of + the \texttt{series} field is an integer, this number is printed + as an ordinal and the string \enquote*{series} is appended + automatically}, +} + +@article{article:shore_twiceb=once=concei_1991, + title = {Twice-Born, Once Conceived}, + author = {Shore, Bradd}, + series = {newseries}, + volume = {93}, + number = {1}, + pages = {9--27}, + journaltitle = {American Anthropologist}, + date = {1991-03}, + subtitle = {Meaning Construction and Cultural Cognition}, + annotation = {An \texttt{article} entry with \texttt{series}, \texttt{volume + }, and \texttt{number} fields. 
Note the format of the \texttt{ + series} which is a localization key}, +} + +@article{article:sigfridsson-ryde_compar=method=derivi_1998, + title = {Comparison of methods for deriving atomic charges from the + electrostatic potential and moments}, + author = {Sigfridsson, Emma and Ryde, Ulf}, + volume = {19}, + number = {4}, + pages = {377--395}, + doi = {10.1002/(SICI)1096-987X(199803)19:4<377::AID-JCC1>3.0.CO;2-P}, + journaltitle = {Journal of Computational Chemistry}, + date = {1998}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Methods for deriving atomic charges}, + annotation = {An \texttt{article} entry with \texttt{volume}, \texttt{number + }, and \texttt{doi} fields. Note that the \textsc{doi} is + transformed into a clickable link if \texttt{hyperref} support + has been enabled}, + abstract = {Four methods for deriving partial atomic charges from the + quantum chemical electrostatic potential (CHELP, CHELPG, + Merz-Kollman, and RESP) have been compared and critically + evaluated. It is shown that charges strongly depend on how and + where the potential points are selected. Two alternative methods + are suggested to avoid the arbitrariness in the point-selection + schemes and van der Waals exclusion radii: CHELP-BOW, which also + estimates the charges from the electrostatic potential, but with + potential points that are Boltzmann-weighted after their + occurrence in actual simulations using the energy function of the + program in which the charges will be used, and CHELMO, which + estimates the charges directly from the electrostatic multipole + moments. 
Different criteria for the quality of the charges are + discussed.}, +} + +@article{article:spiegelberg_intent=intent=schola_1969, + title = {\mkbibquote{Intention} und \mkbibquote{Intentionalit{\"a}t} in der + Scholastik, bei Brentano und Husserl}, + shorttitle = {Intention und Intentionalit{\"a}t}, + author = {Spiegelberg, Herbert}, + volume = {29}, + pages = {189--216}, + journaltitle = {Studia Philosophica}, + date = {1969}, + langid = {german}, + sorttitle = {Intention und Intentionalitat in der Scholastik, bei Brentano + und Husserl}, + indexsorttitle = {Intention und Intentionalitat in der Scholastik, bei + Brentano und Husserl}, + annotation = {An \texttt{article} entry. Note the \texttt{sorttitle} and + \texttt{indexsorttitle} fields and the markup of the quotes in + the database file}, +} + +@article{article:springer_mediae=pilgri=routes_1950, + title = {Mediaeval Pilgrim Routes from {Scandinavia} to {Rome}}, + shorttitle = {Mediaeval Pilgrim Routes}, + author = {Springer, Otto}, + volume = {12}, + pages = {92--122}, + journaltitle = {Mediaeval Studies}, + date = {1950}, + langid = {english}, + langidopts = {variant=british}, + annotation = {A plain \texttt{article} entry}, +} + +@article{article:weinberg_model=lepton_1967, + title = {A Model of Leptons}, + author = {Weinberg, Steven}, + volume = {19}, + pages = {1264--1266}, + journaltitle = {Phys.~Rev.~Lett.}, + date = {1967}, +} + +@string{anch-ie = {Angew.~Chem. Int.~Ed.}} + +@string{cup = {Cambridge University Press}} + +@string{dtv = {Deutscher Taschenbuch-Verlag}} + +@string{hup = {Harvard University Press}} + +@string{jams = {J.~Amer. Math. Soc.}} + +@string{jchph = {J.~Chem. Phys.}} + +@string{jomch = {J.~Organomet. 
Chem.}} + +@string{pup = {Princeton University Press}} + +@incollection{incollection:westfahl_true=fronti, + title = {The True Frontier}, + author = {Westfahl, Gary}, + pages = {55--65}, + subtitle = {Confronting and Avoiding the Realities of Space in {American} + Science Fiction Films}, + crossref = {westfahl:frontier}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {True Frontier, The}, + annotation = {A cross-referenced article from a \texttt{collection}. This is + an \texttt{incollection} entry with a \texttt{crossref} field. + Note the \texttt{subtitle} and \texttt{indextitle} fields}, +} + +@article{article:yoon-ryu_pallad=pincer=comple_2006, + title = {Palladium pincer complexes with reduced bond angle strain: + efficient catalysts for the {Heck} reaction}, + author = {Yoon, Myeong S. and Ryu, Dowook and Kim, Jeongryul and Ahn, Kyo + Han}, + volume = {25}, + number = {10}, + pages = {2409--2411}, + journaltitle = {Organometallics}, + date = {2006}, + indextitle = {Palladium pincer complexes}, +} diff --git a/tests/test-config.toml b/tests/test-config.toml index d3e42c5..8dd8014 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,10 +61,13 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = ["entrytype;;;;:", "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] +fields = ["shorthand;;;;+","entrytype;;;;:", "author;2;;-;_", "title;3;6;=;_", "year", "publisher;;5;#;" ] # fields = [ # CamelCase test # "author;2;;;", # "title;5;5;;", # "year" # ] case = "lowercase" +ascii_only = true +# ignored_words = ["the"] +# ignored_chars = ["?", "."] -- cgit v1.2.3 From 18fa9b8bcb02aa5653b976cad7ec9c3123d4f372 Mon Sep 17 00:00:00 2001 From: lukeflo Date: Tue, 14 Oct 2025 10:11:52 +0200 Subject: add dry-run example to citekeys doc --- CITEKEYS.md | 10 ++++++---- tests/test-config.toml | 9 ++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'tests/test-config.toml') diff --git 
a/CITEKEYS.md b/CITEKEYS.md index 912326a..165f58c 100644 --- a/CITEKEYS.md +++ b/CITEKEYS.md @@ -66,7 +66,8 @@ Every field pattern consists of the following five parts separated by semicolons. The general pattern looks like this (every subfield is explained below): -*biblatex field name* **;** *max word count* **;** *max char count* **;** *inner delimiter* **;** *trailing delimiter* +*biblatex field name* **;** *max word count* **;** *max char count* **;** *inner +delimiter* **;** *trailing delimiter* - **BibLaTeX field**: the first part represents the field name which value should be used to generate the content part of the citekey. Theoretically, any @@ -145,9 +146,10 @@ should be built. - Most importantly: *always use the **`--dry-run`** option first*! This will print a list of old and new values for all citekeys in the file without - changing anything. + changing anything. For the test file of this repo and using the pattern from + the [section below](#examples) `----dry-run` produces the following output: - After finding a good overall pattern, *use the `--output=` option* to create a - new file and don't overwrite your existent file. Thus, your original file + new file and don't overwrite your existing file. Thus, your original file isn't broken if the key formatter produces some unwanted output. - Even very long patterns are possible, they are not encouraged, since it bloats the bibfiles. @@ -155,7 +157,7 @@ should be built. it bares the risk of producing doublettes (e.g. single author and year only). But the citekey generator will not check for doublettes! - It is possible to keep special chars and use them as delimiters. But this - might cause problems other programs and CLI tools in particular, since many + might cause problems for other programs and CLI tools in particular, since many special chars are reserved for shell operations. For instance, it will very likely break the note file feature of `bibiman` which doesn't accept many special chars. 
diff --git a/tests/test-config.toml b/tests/test-config.toml index 8dd8014..704d8d8 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,7 +61,14 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = ["shorthand;;;;+","entrytype;;;;:", "author;2;;-;_", "title;3;6;=;_", "year", "publisher;;5;#;" ] +fields = [ + "shorthand;;;;+", + "entrytype;;;;:", + "author;2;;-;_", + "title;3;6;=;_", + "year", + "publisher;;5;#;" +] # fields = [ # CamelCase test # "author;2;;;", # "title;5;5;;", -- cgit v1.2.3