diff options
| author | lukeflo | 2025-10-12 23:01:17 +0200 |
|---|---|---|
| committer | lukeflo | 2025-10-12 23:01:27 +0200 |
| commit | 0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (patch) | |
| tree | 3009e5c32985690cc1b346f4688fa3e9e3da7fde | |
| parent | f112c4e13009e5ddfe3cf5c4cbe7f29f832b8553 (diff) | |
| download | bibiman-0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb.tar.gz bibiman-0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb.zip | |
Add an ignore list for words; the inner-delimiter problem for ignored words still needs to be solved
| -rw-r--r-- | src/bibiman/citekeys.rs | 317 | ||||
| -rw-r--r-- | src/bibiman/citekeys/citekey_utils.rs | 327 | ||||
| -rw-r--r-- | tests/test-config.toml | 7 |
3 files changed, 348 insertions, 303 deletions
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs index 9d17403..2f56947 100644 --- a/src/bibiman/citekeys.rs +++ b/src/bibiman/citekeys.rs @@ -21,27 +21,33 @@ use std::{ path::{Path, PathBuf}, }; -use biblatex::{Bibliography, ChunksExt, Entry, Type}; +use biblatex::Bibliography; use color_eyre::eyre::{OptionExt, eyre}; -use indoc::formatdoc; use lexopt::Arg::{Long, Short}; -use owo_colors::{ - OwoColorize, - colors::{BrightBlue, Green, White}, -}; +use owo_colors::OwoColorize; use serde::{Deserialize, Serialize}; use crate::{ - bibiman::sanitize::sanitize_single_string_fully, - config::{BibiConfig, IGNORED_SPECIAL_CHARS}, + bibiman::citekeys::citekey_utils::{build_citekey, formatting_help}, + config::BibiConfig, }; +mod citekey_utils; + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum CitekeyCase { #[serde(alias = "uppercase", alias = "upper")] Upper, #[serde(alias = "lowercase", alias = "lower")] Lower, + #[serde( + alias = "camel", + alias = "camelcase", + alias = "camel_case", + alias = "uppercamelcase", + alias = "upper_camel_case" + )] + Camel, } #[derive(Debug, Default, Clone)] @@ -203,306 +209,13 @@ impl CitekeyFormatting { } } -fn formatting_help() { - let help = vec![ - formatdoc!( - "{} {}\n", - env!("CARGO_PKG_NAME").fg::<Green>().bold(), - env!("CARGO_PKG_VERSION") - ), - formatdoc!("{}", "USAGE".bold()), - formatdoc!( - "\t{} {} {} {}\n", - env!("CARGO_PKG_NAME").fg::<White>().bold(), - "format-citekeys".bold(), - "--source=<SOURCE>".bold(), - "--output=<TARGET>".bold() - ), - formatdoc!( - " - \tThis help describes the CLI usage for the citekey formatting - \tfunctionality of bibiman. The definition of patterns how the - \tcitekeys should be formatted must be set in the config file. - \tFor further informations how to use this patterns etc. 
see: - \t{} - ", - "https://codeberg.org/lukeflo/bibiman/src/branch/main#bibiman" - .italic() - .fg::<BrightBlue>() - ), - formatdoc!("{}", "OPTIONS".bold()), - formatdoc!( - " - \t{} - \tShow this help and exit - ", - "-h, --help".fg::<White>().bold() - ), - formatdoc!( - " - \t{} - \tDon't apply any changes to the named files. Instead print all - \told citekeys and the formatted strings that would have been - \tapplied in the format: {} => {} - ", - "-d, --dry-run".fg::<White>().bold(), - "old_key".italic(), - "new_key".bold() - ), - formatdoc! {" - \t{} - \tThe bibfile for which the citekey formatting should be processed. - \tTakes a path as argument. - ", "-s, -f, --source=, --file=".fg::<White>().bold()}, - formatdoc!( - " - \t{} - \tThe bibfile to which the updated content should be written. - \tTakes a path as argument. If the file doesn't exist, it will be - \tcreated. - \tIf the argument isn't used, the original file will be {}! - ", - "-t, -o, --target=, --output=".fg::<White>().bold(), - "overwritten".italic(), - ), - ]; - let help = help.join("\n"); - println!("{}", help); -} - -/// Build the citekey from the patterns defined in the config file -fn build_citekey(entry: &Entry, pattern_fields: &[String], case: Option<&CitekeyCase>) -> String { - // mut string the citekey is built from - let mut new_citekey = String::new(); - - // count different fields of pattern vec - let fields = pattern_fields.len(); - - // loop over pattern fields process them - for (idx, pattern) in pattern_fields.iter().enumerate() { - // parse single values from pattern field - let (field_name, word_count, char_count, inner_delimiter, trailing_delimiter) = - split_formatting_pat(pattern); - - // built the part of the citekey from the current pattern field - let formatted_field_str = { - let mut formatted_str = String::new(); - - // preformat the field depending on biblatex value - let field = preformat_field(field_name, entry); - - // split at whitespaces, count fields and set 
counter for processed - // splits - let mut split_field = field.split_whitespace(); - let mut words_passed = 0; - let field_count = field.split_whitespace().count(); - let word_count = if let Some(val) = word_count - && val <= field_count - { - val - } else { - field_count - }; - - // loop over single parts of current field and add correct delimiter - loop { - // terminate loop for current field if its empty. If its also the - // last of the pattern vec, pop the trailing delimiter - if field.is_empty() { - if idx + 1 == fields { - let _ = new_citekey.pop(); - } - break; - } - - // process the single slices and add correct delimiter - if let Some(field_slice) = split_field.next() { - // Create word slice char by char. We need to loop over chars - // instead of a simple bytes index to also catch chars which - // consist of more than one byte (äöüøæ etc...) - let mut word_slice = String::new(); - let word_chars = field_slice.chars(); - let mut counter = 0; - for c in word_chars { - if let Some(len) = char_count - && counter == len - { - break; - } - // if a word slice contains a special char, skip it - if IGNORED_SPECIAL_CHARS.contains(&c) { - continue; - } - word_slice.push(c); - counter += 1; - } - // Don't count empty slices and don't add delimiter to those - if !word_slice.is_empty() { - formatted_str = formatted_str + &word_slice; - words_passed += 1; - if word_count == words_passed { - formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); - break; - } else { - formatted_str = formatted_str + inner_delimiter.unwrap_or(""); - } - } else { - continue; - } - } else { - formatted_str = formatted_str + trailing_delimiter.unwrap_or(""); - break; - }; - } - formatted_str - }; - new_citekey = new_citekey + &formatted_field_str; - } - if let Some(case_format) = case { - match case_format { - CitekeyCase::Lower => new_citekey.to_lowercase(), - CitekeyCase::Upper => new_citekey.to_uppercase(), - } - } else { - new_citekey - } -} - -/// Preformat some fields 
which are very common to be used in citekeys -fn preformat_field(field: &str, entry: &Entry) -> String { - match field { - "title" => { - sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into())) - } - "author" => { - if let Ok(authors) = entry.author() { - let mut last_names = String::new(); - for a in authors.iter() { - last_names = last_names + &a.name + " "; - } - last_names - } else { - "".to_string() - } - } - "year" => { - if let Ok(date) = entry.date() { - date.to_chunks().format_verbatim()[..4].to_string() - } else { - entry.get_as::<String>(field).unwrap_or("".into()) - } - } - "subtitle" => { - sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into())) - } - "editor" => { - if let Ok(editors) = entry.editors() { - let mut last_names = String::new(); - for editortypes in editors.iter() { - for e in editortypes.0.iter() { - last_names = last_names + &e.name + " "; - } - } - last_names - } else { - "".to_string() - } - } - "pubtype" | "entrytype" => entry.entry_type.to_string(), - _ => entry.get_as::<String>(field).unwrap_or("".into()), - } -} - -/// Cut of word at char count index if its set -fn format_word(word: &str, count: Option<usize>) -> String { - // Since chars can consist of multiple bytes, we need this more complex - // loop to collect a specified number of chars (e.g. ÄÖÜäöü¢æø etc...) 
- // instead of simple byte indexing - let mut word_slice = String::new(); - let word_chars = word.chars(); - let mut counter = 0; - for c in word_chars { - if let Some(len) = count - && counter == len - { - break; - } - if IGNORED_SPECIAL_CHARS.contains(&c) { - continue; - } - word_slice.push(c); - counter += 1; - } - word_slice -} - -/// Split a formatting pattern of kind -/// `<field>;<word count>;<char count>;<inside delimiter>;<trailing delimiter>`, -/// e.g.: `title;3;3;_;:` will give `("title", 3, 3, "_", ":")` -fn split_formatting_pat( - pattern: &str, -) -> ( - &str, - Option<usize>, - Option<usize>, - Option<&str>, - Option<&str>, -) { - let mut splits = pattern.split(';'); - ( - splits - .next() - .expect("Need field value for formatting citekey"), - if let Some(next) = splits.next() - && next.len() > 0 - { - next.parse::<usize>().ok() - } else { - None - }, - if let Some(next) = splits.next() - && next.len() > 0 - { - next.parse::<usize>().ok() - } else { - None - }, - splits.next(), - splits.next(), - ) -} - #[cfg(test)] mod tests { use std::path::PathBuf; use biblatex::Bibliography; - use itertools::Itertools; - - use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting, split_formatting_pat}; - - #[test] - fn split_citekey_pattern() { - let pattern = "title;3;5;_;_"; - - assert_eq!( - split_formatting_pat(pattern), - ("title", Some(3), Some(5), Some("_"), Some("_")) - ); - let pattern = "year"; - - assert_eq!( - split_formatting_pat(pattern), - ("year", None, None, None, None) - ); - - let pattern = "author;1;;;_"; - assert_eq!( - split_formatting_pat(pattern), - ("author", Some(1), None, Some(""), Some("_")) - ); - } + use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting}; #[test] fn format_citekey_test() { diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs new file mode 100644 index 0000000..ee2c849 --- /dev/null +++ b/src/bibiman/citekeys/citekey_utils.rs @@ -0,0 +1,327 @@ +// bibiman - a TUI 
for managing BibLaTeX databases +// Copyright (C) 2025 lukeflo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +///// + +use biblatex::{ChunksExt, Entry, Type}; +use indoc::formatdoc; +use owo_colors::{ + OwoColorize, + colors::{BrightBlue, Green, White}, +}; + +use crate::{ + bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully}, + config::IGNORED_SPECIAL_CHARS, +}; + +const IGNORE_WORDS: [&str; 20] = [ + "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine", + "eines", "des", "auf", "und", "für", "vor", +]; + +pub(super) fn formatting_help() { + let help = vec![ + formatdoc!( + "{} {}\n", + env!("CARGO_PKG_NAME").fg::<Green>().bold(), + env!("CARGO_PKG_VERSION") + ), + formatdoc!("{}", "USAGE".bold()), + formatdoc!( + "\t{} {} {} {}\n", + env!("CARGO_PKG_NAME").fg::<White>().bold(), + "format-citekeys".bold(), + "--source=<SOURCE>".bold(), + "--output=<TARGET>".bold() + ), + formatdoc!( + " + \tThis help describes the CLI usage for the citekey formatting + \tfunctionality of bibiman. The definition of patterns how the + \tcitekeys should be formatted must be set in the config file. + \tFor further informations how to use this patterns etc. 
see: + \t{} + ", + "https://codeberg.org/lukeflo/bibiman/src/branch/main#bibiman" + .italic() + .fg::<BrightBlue>() + ), + formatdoc!("{}", "OPTIONS".bold()), + formatdoc!( + " + \t{} + \tShow this help and exit + ", + "-h, --help".fg::<White>().bold() + ), + formatdoc!( + " + \t{} + \tDon't apply any changes to the named files. Instead print all + \told citekeys and the formatted strings that would have been + \tapplied in the format: {} => {} + ", + "-d, --dry-run".fg::<White>().bold(), + "old_key".italic(), + "new_key".bold() + ), + formatdoc! {" + \t{} + \tThe bibfile for which the citekey formatting should be processed. + \tTakes a path as argument. + ", "-s, -f, --source=, --file=".fg::<White>().bold()}, + formatdoc!( + " + \t{} + \tThe bibfile to which the updated content should be written. + \tTakes a path as argument. If the file doesn't exist, it will be + \tcreated. + \tIf the argument isn't used, the original file will be {}! + ", + "-t, -o, --target=, --output=".fg::<White>().bold(), + "overwritten".italic(), + ), + ]; + let help = help.join("\n"); + println!("{}", help); +} + +/// Build the citekey from the patterns defined in the config file +pub(super) fn build_citekey( + entry: &Entry, + pattern_fields: &[String], + case: Option<&CitekeyCase>, +) -> String { + // mut string the citekey is built from + let mut new_citekey = String::new(); + + // trailing delimiter of previous field + let mut trailing_delimiter: Option<&str> = None; + + // loop over pattern fields process them + 'field_loop: for pattern in pattern_fields.iter() { + // parse single values from pattern field + let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) = + split_formatting_pat(pattern); + + // built the part of the citekey from the current pattern field + let formatted_field_str = { + let mut formatted_str = String::new(); + + // preformat the field depending on biblatex value + let field = preformat_field(field_name, entry); + + // split at 
whitespaces, count fields and set counter for processed + // splits + let mut split_field = field.split_whitespace(); + let mut words_passed = 0; + let field_count = field.split_whitespace().count(); + let word_count = if let Some(val) = word_count + && val <= field_count + { + val + } else { + field_count + }; + + // If there is a trailing delimiter from the previous field, push it + if let Some(del) = trailing_delimiter { + formatted_str = del.to_string(); + }; + + // If the current field isn't empty, set trailing delimiter for + // upcoming loop repitition. If it's empty, start next run of loop + // directly + if !field.is_empty() { + trailing_delimiter = cur_trailing_delimiter; + } else { + continue 'field_loop; + } + + // loop over single parts of current field and add correct delimiter + 'word_loop: loop { + // process the single slices and add correct delimiter + if let Some(field_slice) = split_field.next() { + // Create word slice char by char. We need to loop over chars + // instead of a simple bytes index to also catch chars which + // consist of more than one byte (äöüøæ etc...) 
+ let mut word_slice = String::new(); + let word_chars = field_slice.chars(); + let mut counter = 0; + 'char_loop: for mut c in word_chars { + // If camelcase is set, force first char of word to uppercase + if counter == 0 && case == Some(&CitekeyCase::Camel) { + c = c.to_ascii_uppercase() + } + if let Some(len) = char_count + && counter == len + { + break 'char_loop; + } + // if a word slice contains a special char, skip it + if IGNORED_SPECIAL_CHARS.contains(&c) { + continue 'char_loop; + } + word_slice.push(c); + counter += 1; + } + // Don't count empty slices and don't add delimiter to those + if !word_slice.is_empty() { + formatted_str = formatted_str + &word_slice; + words_passed += 1; + if word_count == words_passed { + break 'word_loop; + } else { + formatted_str = formatted_str + inner_delimiter.unwrap_or(""); + } + } else { + continue 'word_loop; + } + } else { + break 'word_loop; + }; + } + formatted_str + }; + new_citekey = new_citekey + &formatted_field_str; + } + match case { + Some(CitekeyCase::Lower) => new_citekey.to_lowercase(), + Some(CitekeyCase::Upper) => new_citekey.to_uppercase(), + _ => new_citekey, + } +} + +/// Preformat some fields which are very common to be used in citekeys +pub(super) fn preformat_field(field: &str, entry: &Entry) -> String { + match field { + // Sanitize all macro code from string + "title" => { + sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into())) + } + // Get author names. 
Fall back to editors before setting empty string + "author" => { + if let Ok(authors) = entry.author() { + let mut last_names = String::new(); + for a in authors.iter() { + last_names = last_names + &a.name + " "; + } + last_names + } else if let Ok(editors) = entry.editors() { + let mut last_names = String::new(); + for editortypes in editors.iter() { + for e in editortypes.0.iter() { + last_names = last_names + &e.name + " "; + } + } + last_names + } else { + "".to_string() + } + } + // Get year of date field, fallback to year field + "year" => { + if let Ok(date) = entry.date() { + date.to_chunks().format_verbatim()[..4].to_string() + } else { + entry.get_as::<String>(field).unwrap_or("".into()) + } + } + // Sanitize all macro code from string + "subtitle" => { + sanitize_single_string_fully(&entry.get_as::<String>(field).unwrap_or("".into())) + } + "editor" => { + if let Ok(editors) = entry.editors() { + let mut last_names = String::new(); + for editortypes in editors.iter() { + for e in editortypes.0.iter() { + last_names = last_names + &e.name + " "; + } + } + last_names + } else { + "".to_string() + } + } + "pubtype" | "entrytype" => entry.entry_type.to_string(), + _ => entry.get_as::<String>(field).unwrap_or("".into()), + } +} + +/// Split a formatting pattern of kind +/// `<field>;<word count>;<char count>;<inside delimiter>;<trailing delimiter>`, +/// e.g.: `title;3;3;_;:` will give `("title", 3, 3, "_", ":")` +pub(super) fn split_formatting_pat( + pattern: &str, +) -> ( + &str, + Option<usize>, + Option<usize>, + Option<&str>, + Option<&str>, +) { + let mut splits = pattern.split(';'); + ( + splits + .next() + .expect("Need field value for formatting citekey"), + if let Some(next) = splits.next() + && next.len() > 0 + { + next.parse::<usize>().ok() + } else { + None + }, + if let Some(next) = splits.next() + && next.len() > 0 + { + next.parse::<usize>().ok() + } else { + None + }, + splits.next(), + splits.next(), + ) +} + +#[cfg(test)] +mod test { + use 
crate::bibiman::citekeys::citekey_utils::split_formatting_pat; + + #[test] + fn split_citekey_pattern() { + let pattern = "title;3;5;_;_"; + + assert_eq!( + split_formatting_pat(pattern), + ("title", Some(3), Some(5), Some("_"), Some("_")) + ); + + let pattern = "year"; + + assert_eq!( + split_formatting_pat(pattern), + ("year", None, None, None, None) + ); + + let pattern = "author;1;;;_"; + assert_eq!( + split_formatting_pat(pattern), + ("author", Some(1), None, Some(""), Some("_")) + ); + } +} diff --git a/tests/test-config.toml b/tests/test-config.toml index 2c5ac96..d3e42c5 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,5 +61,10 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = [ "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] +fields = ["entrytype;;;;:", "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] +# fields = [ # CamelCase test +# "author;2;;;", +# "title;5;5;;", +# "year" +# ] case = "lowercase" |
