aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorlukeflo2025-10-13 15:45:53 +0200
committerlukeflo2025-10-13 15:57:42 +0200
commit467851007e1861834326deee3116aa88fe839f5a (patch)
tree7e1cb113d99c32ad5b434f7e87d851cd9c9be382 /src
parent0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (diff)
downloadbibiman-467851007e1861834326deee3116aa88fe839f5a.tar.gz
bibiman-467851007e1861834326deee3116aa88fe839f5a.zip
Working proof of concept of citekey formatting
Diffstat (limited to 'src')
-rw-r--r--src/bibiman/citekeys.rs69
-rw-r--r--src/bibiman/citekeys/citekey_utils.rs105
-rw-r--r--src/config.rs69
3 files changed, 166 insertions, 77 deletions
diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs
index 2f56947..0cec28e 100644
--- a/src/bibiman/citekeys.rs
+++ b/src/bibiman/citekeys.rs
@@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize};
use crate::{
bibiman::citekeys::citekey_utils::{build_citekey, formatting_help},
- config::BibiConfig,
+ config::{BibiConfig, IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
};
mod citekey_utils;
@@ -60,6 +60,7 @@ pub(crate) struct CitekeyFormatting {
case: Option<CitekeyCase>,
old_new_keys_map: Vec<(String, String)>,
dry_run: bool,
+ ascii_only: bool,
}
impl CitekeyFormatting {
@@ -69,14 +70,15 @@ impl CitekeyFormatting {
) -> color_eyre::Result<()> {
let mut formatter = CitekeyFormatting::default();
- formatter.fields = cfg
- .citekey_formatter
- .fields
- .clone()
- .ok_or_eyre("Need to define fields correctly in config file")?;
+ formatter.fields = cfg.citekey_formatter.fields.clone().ok_or_eyre(format!(
+ "Need to define {} correctly in config file",
+ "citekey pattern fields".red()
+ ))?;
formatter.case = cfg.citekey_formatter.case.clone();
+ formatter.ascii_only = cfg.citekey_formatter.ascii_only;
+
if formatter.fields.is_empty() {
return Err(eyre!(
"To format all citekeys, you need to provide {} values in the config file",
@@ -105,13 +107,26 @@ impl CitekeyFormatting {
formatter.bib_entries = Bibliography::parse(&bibstring)
.map_err(|e| eyre!("Couldn't parse bibfile due to {}", e.kind))?;
+ let ignored_chars = if let Some(chars) = &cfg.citekey_formatter.ignored_chars {
+ chars.as_slice()
+ } else {
+ IGNORED_SPECIAL_CHARS.as_slice()
+ };
+
+ let ignored_words = if let Some(words) = &cfg.citekey_formatter.ignored_words {
+ words.as_slice()
+ } else {
+ &*IGNORED_WORDS.as_slice()
+ };
+
formatter
- .do_formatting()
+ .do_formatting(ignored_chars, ignored_words)
.rev_sort_new_keys_by_len()
.update_file()?;
Ok(())
}
+
/// Start Citekey formatting with building a new instance of `CitekeyFormatting`
/// Formatting is processed file by file, because `bibiman` can handle
/// multi-file setups.
@@ -144,16 +159,24 @@ impl CitekeyFormatting {
case: cfg.citekey_formatter.case.clone(),
old_new_keys_map: Vec::new(),
dry_run: false,
+ ascii_only: cfg.citekey_formatter.ascii_only,
})
}
/// Process the actual formatting. The citekey of every entry will be updated.
- pub fn do_formatting(&mut self) -> &mut Self {
+ pub fn do_formatting(&mut self, ignored_chars: &[char], ignored_words: &[String]) -> &mut Self {
let mut old_new_keys: Vec<(String, String)> = Vec::new();
for entry in self.bib_entries.iter() {
old_new_keys.push((
entry.key.clone(),
- build_citekey(entry, &self.fields, self.case.as_ref()),
+ build_citekey(
+ entry,
+ &self.fields,
+ self.case.as_ref(),
+ self.ascii_only,
+ ignored_chars,
+ ignored_words,
+ ),
));
}
@@ -215,12 +238,15 @@ mod tests {
use biblatex::Bibliography;
- use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting};
+ use crate::{
+ bibiman::citekeys::{CitekeyCase, CitekeyFormatting},
+ config::{IGNORED_SPECIAL_CHARS, IGNORED_WORDS},
+ };
#[test]
fn format_citekey_test() {
let src = r"
- @article{bos_latex_metadata_and_publishing_workflows_2023,
+ @article{Bos2023,
title = {{LaTeX}, metadata, and publishing workflows},
author = {Bos, Joppe W. and {McCurley}, Kevin S.},
year = {2023},
@@ -232,7 +258,7 @@ mod tests {
urldate = {2023-08-22},
note = {type: article},
}
- @book{bhambra_colonialism_social_theory_2021,
+ @book{Bhambra2021,
title = {Colonialism and \textbf{Modern Social Theory}},
author = {Bhambra, Gurminder K. and Holmwood, John},
location = {Cambridge and Medford},
@@ -247,29 +273,24 @@ mod tests {
fields: vec![
"entrytype;;;;:".into(),
"author;;;-;_".into(),
- "title;4;3;_;_".into(),
+ "title;4;3;=;_".into(),
"location;;4;:;_".into(),
"year".into(),
],
- case: None,
+ case: Some(CitekeyCase::Lower),
old_new_keys_map: Vec::new(),
dry_run: false,
+ ascii_only: true,
};
- let _ = formatting_struct.do_formatting();
+ let _ = formatting_struct
+ .do_formatting(IGNORED_SPECIAL_CHARS.as_slice(), &*IGNORED_WORDS.as_slice());
assert_eq!(
formatting_struct.old_new_keys_map.get(0).unwrap().1,
- "article:Bos-McCurley_LaT_met_and_pub_Empt_2023"
+ "article:bos-mccurley_lat=met=pub=wor_2023"
);
assert_eq!(
formatting_struct.old_new_keys_map.get(1).unwrap().1,
- "book:Bhambra-Holmwood_Col_and_Mod_Soc_Camb:and:Medf_2021"
- );
- formatting_struct.case = Some(CitekeyCase::Lower);
- let _ = formatting_struct.do_formatting().rev_sort_new_keys_by_len();
- // now the longer citekey is processed first and its in lowercase!
- assert_eq!(
- formatting_struct.old_new_keys_map.get(0).unwrap().1,
- "book:bhambra-holmwood_col_and_mod_soc_camb:and:medf_2021"
+ "book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021"
);
}
diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs
index ee2c849..5f70224 100644
--- a/src/bibiman/citekeys/citekey_utils.rs
+++ b/src/bibiman/citekeys/citekey_utils.rs
@@ -16,21 +16,14 @@
/////
use biblatex::{ChunksExt, Entry, Type};
+use deunicode::deunicode;
use indoc::formatdoc;
use owo_colors::{
OwoColorize,
colors::{BrightBlue, Green, White},
};
-use crate::{
- bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully},
- config::IGNORED_SPECIAL_CHARS,
-};
-
-const IGNORE_WORDS: [&str; 20] = [
- "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine",
- "eines", "des", "auf", "und", "für", "vor",
-];
+use crate::bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully};
pub(super) fn formatting_help() {
let help = vec![
@@ -104,6 +97,9 @@ pub(super) fn build_citekey(
entry: &Entry,
pattern_fields: &[String],
case: Option<&CitekeyCase>,
+ ascii_only: bool,
+ ignored_chars: &[char],
+ ignored_words: &[String],
) -> String {
// mut string the citekey is built from
let mut new_citekey = String::new();
@@ -114,7 +110,7 @@ pub(super) fn build_citekey(
// loop over pattern fields and process them
'field_loop: for pattern in pattern_fields.iter() {
// parse single values from pattern field
- let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) =
+ let (field_name, max_words, max_chars, inner_delimiter, cur_trailing_delimiter) =
split_formatting_pat(pattern);
// build the part of the citekey from the current pattern field
@@ -126,16 +122,9 @@ pub(super) fn build_citekey(
// split at whitespaces, count fields and set counter for processed
// splits
- let mut split_field = field.split_whitespace();
+ let split_field = field.split_whitespace();
let mut words_passed = 0;
let field_count = field.split_whitespace().count();
- let word_count = if let Some(val) = word_count
- && val <= field_count
- {
- val
- } else {
- field_count
- };
// If there is a trailing delimiter from the previous field, push it
if let Some(del) = trailing_delimiter {
@@ -152,47 +141,57 @@ pub(super) fn build_citekey(
}
// loop over single parts of current field and add correct delimiter
- 'word_loop: loop {
- // process the single slices and add correct delimiter
- if let Some(field_slice) = split_field.next() {
- // Create word slice char by char. We need to loop over chars
- // instead of a simple bytes index to also catch chars which
- // consist of more than one byte (äöüøæ etc...)
- let mut word_slice = String::new();
- let word_chars = field_slice.chars();
- let mut counter = 0;
- 'char_loop: for mut c in word_chars {
- // If camelcase is set, force first char of word to uppercase
- if counter == 0 && case == Some(&CitekeyCase::Camel) {
- c = c.to_ascii_uppercase()
- }
- if let Some(len) = char_count
- && counter == len
- {
- break 'char_loop;
- }
- // if a word slice contains a special char, skip it
- if IGNORED_SPECIAL_CHARS.contains(&c) {
- continue 'char_loop;
- }
+ // process the single slices and add correct delimiter
+ 'word_loop: for (idx, field_slice) in split_field.enumerate() {
+ // if the current slice is a common word from the ignore list,
+ // skip it.
+ if ignored_words.contains(&field_slice.to_lowercase()) {
+ continue;
+ }
+
+ // Create word slice char by char. We need to loop over chars
+ // instead of a simple bytes index to also catch chars which
+ // consist of more than one byte (äöüøæ etc...)
+ let mut word_slice = String::new();
+ let word_chars = field_slice.chars();
+ let mut counter = 0;
+ 'char_loop: for mut c in word_chars {
+ // If camelcase is set, force first char of word to uppercase
+ if counter == 0 && case == Some(&CitekeyCase::Camel) {
+ c = c.to_ascii_uppercase()
+ }
+ if let Some(len) = max_chars
+ && counter >= len
+ {
+ break 'char_loop;
+ }
+ // if a word slice contains a special char, skip it
+ if ignored_chars.contains(&c) {
+ continue 'char_loop;
+ }
+ // if non-ascii chars should be mapped, check if needed and do it
+ if let Some(chars) = deunicode::deunicode_char(c)
+ && ascii_only
+ {
+ word_slice.push_str(chars);
+ counter += chars.len();
+ } else {
word_slice.push(c);
counter += 1;
}
- // Don't count empty slices and don't add delimiter to those
- if !word_slice.is_empty() {
- formatted_str = formatted_str + &word_slice;
- words_passed += 1;
- if word_count == words_passed {
- break 'word_loop;
- } else {
- formatted_str = formatted_str + inner_delimiter.unwrap_or("");
- }
+ }
+ // Don't count empty slices and don't add delimiter to those
+ if !word_slice.is_empty() {
+ formatted_str = formatted_str + &word_slice;
+ words_passed += 1;
+ if max_words.is_some_and(|max| max == words_passed) || idx + 1 == field_count {
+ break 'word_loop;
} else {
- continue 'word_loop;
+ formatted_str = formatted_str + inner_delimiter.unwrap_or("");
}
} else {
- break 'word_loop;
- };
+ continue 'word_loop;
+ }
}
formatted_str
};
diff --git a/src/config.rs b/src/config.rs
index b1c4b07..7c1a0f8 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -20,6 +20,7 @@ use std::{
io::{Write, stdin},
path::PathBuf,
str::FromStr,
+ sync::LazyLock,
};
use color_eyre::{eyre::Result, owo_colors::OwoColorize};
@@ -40,6 +41,31 @@ pub const IGNORED_SPECIAL_CHARS: [char; 33] = [
'&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"',
];
+pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| {
+ vec![
+ String::from("the"),
+ String::from("a"),
+ String::from("an"),
+ String::from("of"),
+ String::from("for"),
+ String::from("in"),
+ String::from("at"),
+ String::from("to"),
+ String::from("and"),
+ String::from("der"),
+ String::from("die"),
+ String::from("das"),
+ String::from("ein"),
+ String::from("eine"),
+ String::from("eines"),
+ String::from("des"),
+ String::from("auf"),
+ String::from("und"),
+ String::from("für"),
+ String::from("vor"),
+ ]
+});
+
const DEFAULT_CONFIG: &str = r##"
# [general]
## Default files/dirs which are loaded on startup
@@ -118,6 +144,40 @@ const DEFAULT_CONFIG: &str = r##"
## Convert chars to specified case. Possible values:
## "upper", "uppercase", "lower", "lowercase"
# case = "lowercase"
+
+## Map all unicode chars to their pure ascii equivalent
+# ascii_only = true
+
+## List of special chars that'll be ignored when building citekeys.
+## A custom list will overwrite the default list
+# ignored_chars = [
+# "?", "!", "\\", "\'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", """,
+# ]
+
+## List of words that'll be ignored when building citekeys.
+## A custom list will overwrite the default list
+# ignored_words = [
+# "the",
+# "a",
+# "an",
+# "of",
+# "for",
+# "in",
+# "at",
+# "to",
+# "and",
+# "der",
+# "die",
+# "das",
+# "ein",
+# "eine",
+# "eines",
+# "des",
+# "auf",
+# "und",
+# "für",
+# "vor",
+# ]
"##;
/// Main struct of the config file. Contains substructs/headings in toml
@@ -171,6 +231,9 @@ pub struct Colors {
pub struct CitekeyFormatter {
pub fields: Option<Vec<String>>,
pub case: Option<CitekeyCase>,
+ pub ascii_only: bool,
+ pub ignored_chars: Option<Vec<char>>,
+ pub ignored_words: Option<Vec<String>>,
}
impl Default for BibiConfig {
@@ -194,6 +257,9 @@ impl Default for BibiConfig {
citekey_formatter: CitekeyFormatter {
fields: None,
case: None,
+ ascii_only: true,
+ ignored_chars: None,
+ ignored_words: None,
},
}
}
@@ -224,6 +290,9 @@ impl BibiConfig {
citekey_formatter: CitekeyFormatter {
fields: None,
case: None,
+ ascii_only: true,
+ ignored_chars: None,
+ ignored_words: None,
},
}
}