diff options
| author | lukeflo | 2025-10-13 15:45:53 +0200 |
|---|---|---|
| committer | lukeflo | 2025-10-13 15:57:42 +0200 |
| commit | 467851007e1861834326deee3116aa88fe839f5a (patch) | |
| tree | 7e1cb113d99c32ad5b434f7e87d851cd9c9be382 | |
| parent | 0a8805acfb6fbb3d3a8c22f4ccbaf692a73cddfb (diff) | |
| download | bibiman-467851007e1861834326deee3116aa88fe839f5a.tar.gz bibiman-467851007e1861834326deee3116aa88fe839f5a.zip | |
Working proof of concept of citekey formatting
| -rw-r--r-- | CITEKEYS.md | 215 | ||||
| -rw-r--r-- | Cargo.lock | 7 | ||||
| -rw-r--r-- | Cargo.toml | 1 | ||||
| -rw-r--r-- | README.md | 20 | ||||
| -rw-r--r-- | src/bibiman/citekeys.rs | 69 | ||||
| -rw-r--r-- | src/bibiman/citekeys/citekey_utils.rs | 105 | ||||
| -rw-r--r-- | src/config.rs | 69 | ||||
| -rw-r--r-- | tests/biblatex-test-citekeys.bib | 476 | ||||
| -rw-r--r-- | tests/test-config.toml | 5 |
9 files changed, 889 insertions, 78 deletions
diff --git a/CITEKEYS.md b/CITEKEYS.md new file mode 100644 index 0000000..912326a --- /dev/null +++ b/CITEKEYS.md @@ -0,0 +1,215 @@ +# Formatting Citekeys<a name="formatting-citekeys"></a> + +<!-- mdformat-toc start --slug=github --maxlevel=6 --minlevel=1 --> + +- [Formatting Citekeys](#formatting-citekeys) + - [Settings](#settings) + - [Building Patterns](#building-patterns) + - [Ignore Lists and Char Case](#ignore-lists-and-char-case) + - [General Tipps](#general-tipps) + - [Examples](#examples) + +<!-- mdformat-toc end --> + +`bibiman` offers the possibility to create new citekeys from the fields of +BibLaTeX entries. This is done using an easy but powerful pattern-matching +syntax. + +## Settings<a name="settings"></a> + +All settings for the citekey generation have to be configured in the used config +file. The regular path is `XDG_CONFIG_DIR/bibiman/bibiman.toml`. But it can be +set dynamically with the `-c`/`--config=` global option. + +Following values can be set through the config file. A detailed explanation for +all fields follows below: + +```toml +[citekey_formatter] +fields = [ "author;2;;-;_", "title;3;6;_;_", "year" ] +case = "lowercase" +ascii_only = true +ignored_chars = [ + "?", "!", "\\", "\'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", "\"", +] +ignored_words = [ + "the", + "a", + "an", + "of", + "for", + "in", + "at", + "to", + "and", + "der", + "die", + "das", + "ein", + "eine", + "eines", + "des", + "auf", + "und", + "für", + "vor", +] +``` + +## Building Patterns<a name="building-patterns"></a> + +The main aspect for generating citekeys are the field patterns. They can be set +through an array in the config file where every array-item represents a single +BibLaTeX field to be used for generating a part of the citekey. + +Every field pattern consists of the following five parts separated by +semicolons. 
The general pattern looks like this (every subfield is explained +below): + +*biblatex field name* **;** *max word count* **;** *max char count* **;** *inner delimiter* **;** *trailing delimiter* + +- **BibLaTeX field**: the first part represents the field name whose value + should be used to generate the content part of the citekey. Theoretically, any + BibLaTeX field can be selected by name. But there are some fields which are + much more common than others; e.g. `author`, `editor`, `title`, `year`/`date` + or `entrytype`. Those very common fields are preprocessed; meaning that for + instance LaTeX macros are fully stripped from the strings, or that `editor` is + a fallback value for `author` if the latter is empty (however, setting + `editor` explicitly is still possible). Also using `year` will parse the + `date` field too, to ensure a year number. +- **Max Word**: Defines how many words should be used at most from the named + field. E.g. if the title consists of five words, and the max counter is set to + `3` only the first three words will be used. +- **Max Chars/Word**: Defines how many chars, counting from the start, of each + word will be used to build the citekey. If for instance the value is set to + `5`, only the first five chars of any word will be used. Thus, "archaeology" + would be stripped down to "archa". +- **Inner Delimiter**: Sets the delimiter char used between words from the + currently named field; e.g. to separate the words of the `title` field. +- **Trailing Delimiter**: Sets the delimiter which separates the current field's + value from the following. This delimiter is only printed if the following + field has some content. 
+ +For example, to use the `title` field, print at most three words and of those +only the first five chars, single words separated by underscore and the whole +field separated by equal sign, insert the following pattern field into the +`fields` array: + +`title;3;5;_;=` + +Except for the BibLaTeX field name, all other parts of the pattern can be left +blank. If the field name is the only value set, semicolon delimiters are also +not necessary. But if only one of the following parts should be set, all +delimiters need to be used. E.g. those are both valid: `title` or `title;;;_;=`. +The first would print all words of the title, no matter the length, not +separated by any char. The last would also print all words of the title, but +single words separated by underscores and the whole pattern value separated from +the following by an equal sign. This is not valid: `title;;_` since `bibiman` +can't know if the underscore means a delimiter (and which) or the max char +count. + +The pattern array inside the config file takes multiple pattern fields like the +preceding one. This allows an elaborated citekey pattern which takes into account +multiple fields. + +## Ignore Lists and Char Case<a name="ignore-lists-and-char-case"></a> + +Beside the field patterns there are some other options to define how citekeys +should be built. + +`ascii_only=<BOOL>` +: If set to `true`, which is the default, non-ascii chars are mapped to their + ascii equivalent. For example, the German `ä` would be mapped to `a`. The + Turkish `ş` or Greek `σ`/`ς` would be mapped to `s`. If set to `false` all are + kept as they are. But this could lead to errors running LaTeX on the file. + +`case=<CASE>` +: If used, sets the case of the chars in the citekey. Valid values are + `uppercase`, `lowercase` or `camelcase`. The first two should be self-explanatory, the + latter means typical camel case also beginning the *first word* with an + uppercase letter; also referred to as upper camel case or Pascal case. 
+ +`ignored_chars=<ARRAY>` +: Defines chars which should be ignored during parsing (meaning they are not printed). + The default list contains 33 special chars and is part of the default config + file (commented out). Be aware, setting this key will completely + overwrite the default list! + +`ignored_words=<ARRAY>` +: A list of words which should be ignored when parsing field values. The default list + contains about 20 very commonly used words in English and German; like + articles, pronouns or connector words. Like with `ignored_chars` setting this + key will completely overwrite the default list! + +## General Tipps<a name="general-tipps"></a> + +- Most importantly: *always use the **`--dry-run`** option first*! This will + print a list of old and new values for all citekeys in the file without + changing anything. +- After finding a good overall pattern, *use the `--output=` option* to create a + new file and don't overwrite your existing file. Thus, your original file + isn't broken if the key formatter produces some unwanted output. +- Even though very long patterns are possible, they are not encouraged, since they bloat + the bibfiles. +- The same applies to *too short* patterns; if the pattern is too unspecific, + it bears the risk of producing duplicates (e.g. single author and year only). + But the citekey generator will not check for duplicates! +- It is possible to keep special chars and use them as delimiters. But this + might cause problems with other programs and CLI tools in particular, since many + special chars are reserved for shell operations. For instance, it will very + likely break the note file feature of `bibiman` which doesn't accept many + special chars. + +## Examples<a name="examples"></a> + +To make the process clearer, a few examples might help. The following bibfile is +assumed: + +```latex +@article{Bos2023, + title = {{LaTeX}, metadata, and publishing workflows}, + author = {Bos, Joppe W. 
and {McCurley}, Kevin S.}, + year = {2023}, + month = apr, + journal = {arXiv}, + number = {{arXiv}:2301.08277}, + doi = {10.48550/arXiv.2301.08277}, + url = {http://arxiv.org/abs/2301.08277}, + urldate = {2023-08-22}, + note = {type: article}, +} +@book{Bhambra2021, + title = {Colonialism and \textbf{Modern Social Theory}}, + author = {Bhambra, Gurminder K. and Holmwood, John}, + location = {Cambridge and Medford}, + publisher = {Polity Press}, + date = {2021}, + +``` + +And the following values set in the config file: + +```toml +fields = [ + # Just print the whole entrytype and a colon as trailing delimiter + "entrytype;;;;:", + # Print all author names in full length, names separated by dash, + # the whole field by underscore + "author;;;-;_", + # Print first 4 words of title, first 3 chars of every word only. Title words + # separated by equal sign, the whole field by underscore + "title;4;3;=;_", + # Print all words of location, but only first 4 chars of every word. Single words + # separated by colon, whole field by underscore + "location;;4;:;_", + # Just print the whole year + "year", +] +case = "lowercase" +ascii_only = true +``` + +The combination of those setting will produce the following citekeys: + +- **`article:bos-mccurley_lat=met=pub=wor_2023`** +- **`book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021`** @@ -103,6 +103,7 @@ dependencies = [ "biblatex", "color-eyre", "crossterm", + "deunicode", "dirs", "editor-command", "figment", @@ -324,6 +325,12 @@ dependencies = [ ] [[package]] +name = "deunicode" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04" + +[[package]] name = "dirs" version = "5.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -41,6 +41,7 @@ owo-colors = "4.2.2" logos = "0.15.1" phf = { version = "0.13.1", features = ["macros"] } indoc = "2.0.6" +deunicode = "1.6.2" 
[workspace.metadata.cross.target.aarch64-unknown-linux-gnu] # Install libssl-dev:arm64, see <https://github.com/cross-rs/cross/blob/main/docs/custom_images.md#adding-dependencies-to-existing-images> @@ -24,9 +24,11 @@ - [Ubuntu/Debian](#ubuntudebian) - [Void Linux](#void-linux) - [Usage](#usage) + - [CLI for citekey formatting](#cli-for-citekey-formatting) - [Configuration](#configuration) - [Location of Config File](#location-of-config-file) - [General Configuration](#general-configuration) + - [Citekey formatting](#citekey-formatting) - [Color Configuration](#color-configuration) - [Features](#features) - [Keybindings](#keybindings) @@ -196,6 +198,13 @@ bibman tests/multi-files/ bibiman tests/biblatex-test.bib tests/multi-files/ ``` +### CLI for citekey formatting<a name="cli-for-citekey-formatting"></a> + +Beside the TUI `bibiman` can format and replace citekeys. To make use of this +feature run the program with the `format-citekeys` subcommand. For more +information on this use `bibiman format-citekeys --help` and the +[docs](./CITEKEYS.md). + ## Configuration<a name="configuration"></a> ### Location of Config File<a name="location-of-config-file"></a> @@ -268,6 +277,11 @@ note_symbol = "" ## Possible values are "journaltitle", "organization", "instituion", "publisher" ## and "pubtype" (which is the default) custom_column = "pubtype" + +[citekey_formatter] +fields = [] +ascii_only = true +case = "lowercase" ``` `bibfiles` @@ -326,6 +340,12 @@ custom_column = "pubtype" good advice to use a rather wide terminal window when using a value like `journaltitle`. +### Citekey formatting<a name="citekey-formatting"></a> + +`bibiman` now also offers a citekey generating feature. This enables to reformat +all citekeys based on an elaborated pattern matching syntax. For furthter +information and examples see the [docs](CITEKEYS.md). + ### Color Configuration<a name="color-configuration"></a> Furthermore, it is now possible to customize the colors. 
The following values diff --git a/src/bibiman/citekeys.rs b/src/bibiman/citekeys.rs index 2f56947..0cec28e 100644 --- a/src/bibiman/citekeys.rs +++ b/src/bibiman/citekeys.rs @@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize}; use crate::{ bibiman::citekeys::citekey_utils::{build_citekey, formatting_help}, - config::BibiConfig, + config::{BibiConfig, IGNORED_SPECIAL_CHARS, IGNORED_WORDS}, }; mod citekey_utils; @@ -60,6 +60,7 @@ pub(crate) struct CitekeyFormatting { case: Option<CitekeyCase>, old_new_keys_map: Vec<(String, String)>, dry_run: bool, + ascii_only: bool, } impl CitekeyFormatting { @@ -69,14 +70,15 @@ impl CitekeyFormatting { ) -> color_eyre::Result<()> { let mut formatter = CitekeyFormatting::default(); - formatter.fields = cfg - .citekey_formatter - .fields - .clone() - .ok_or_eyre("Need to define fields correctly in config file")?; + formatter.fields = cfg.citekey_formatter.fields.clone().ok_or_eyre(format!( + "Need to define {} correctly in config file", + "citekey pattern fields".red() + ))?; formatter.case = cfg.citekey_formatter.case.clone(); + formatter.ascii_only = cfg.citekey_formatter.ascii_only; + if formatter.fields.is_empty() { return Err(eyre!( "To format all citekeys, you need to provide {} values in the config file", @@ -105,13 +107,26 @@ impl CitekeyFormatting { formatter.bib_entries = Bibliography::parse(&bibstring) .map_err(|e| eyre!("Couldn't parse bibfile due to {}", e.kind))?; + let ignored_chars = if let Some(chars) = &cfg.citekey_formatter.ignored_chars { + chars.as_slice() + } else { + IGNORED_SPECIAL_CHARS.as_slice() + }; + + let ignored_words = if let Some(words) = &cfg.citekey_formatter.ignored_words { + words.as_slice() + } else { + &*IGNORED_WORDS.as_slice() + }; + formatter - .do_formatting() + .do_formatting(ignored_chars, ignored_words) .rev_sort_new_keys_by_len() .update_file()?; Ok(()) } + /// Start Citekey formatting with building a new instance of `CitekeyFormatting` /// Formatting is processed file by file, 
because `bibman` can handle /// multi-file setups. @@ -144,16 +159,24 @@ impl CitekeyFormatting { case: cfg.citekey_formatter.case.clone(), old_new_keys_map: Vec::new(), dry_run: false, + ascii_only: cfg.citekey_formatter.ascii_only, }) } /// Process the actual formatting. The citekey of every entry will be updated. - pub fn do_formatting(&mut self) -> &mut Self { + pub fn do_formatting(&mut self, ignored_chars: &[char], ignored_words: &[String]) -> &mut Self { let mut old_new_keys: Vec<(String, String)> = Vec::new(); for entry in self.bib_entries.iter() { old_new_keys.push(( entry.key.clone(), - build_citekey(entry, &self.fields, self.case.as_ref()), + build_citekey( + entry, + &self.fields, + self.case.as_ref(), + self.ascii_only, + ignored_chars, + ignored_words, + ), )); } @@ -215,12 +238,15 @@ mod tests { use biblatex::Bibliography; - use crate::bibiman::citekeys::{CitekeyCase, CitekeyFormatting}; + use crate::{ + bibiman::citekeys::{CitekeyCase, CitekeyFormatting}, + config::{IGNORED_SPECIAL_CHARS, IGNORED_WORDS}, + }; #[test] fn format_citekey_test() { let src = r" - @article{bos_latex_metadata_and_publishing_workflows_2023, + @article{Bos2023, title = {{LaTeX}, metadata, and publishing workflows}, author = {Bos, Joppe W. and {McCurley}, Kevin S.}, year = {2023}, @@ -232,7 +258,7 @@ mod tests { urldate = {2023-08-22}, note = {type: article}, } - @book{bhambra_colonialism_social_theory_2021, + @book{Bhambra2021, title = {Colonialism and \textbf{Modern Social Theory}}, author = {Bhambra, Gurminder K. 
and Holmwood, John}, location = {Cambridge and Medford}, @@ -247,29 +273,24 @@ mod tests { fields: vec![ "entrytype;;;;:".into(), "author;;;-;_".into(), - "title;4;3;_;_".into(), + "title;4;3;=;_".into(), "location;;4;:;_".into(), "year".into(), ], - case: None, + case: Some(CitekeyCase::Lower), old_new_keys_map: Vec::new(), dry_run: false, + ascii_only: true, }; - let _ = formatting_struct.do_formatting(); + let _ = formatting_struct + .do_formatting(IGNORED_SPECIAL_CHARS.as_slice(), &*IGNORED_WORDS.as_slice()); assert_eq!( formatting_struct.old_new_keys_map.get(0).unwrap().1, - "article:Bos-McCurley_LaT_met_and_pub_Empt_2023" + "article:bos-mccurley_lat=met=pub=wor_2023" ); assert_eq!( formatting_struct.old_new_keys_map.get(1).unwrap().1, - "book:Bhambra-Holmwood_Col_and_Mod_Soc_Camb:and:Medf_2021" - ); - formatting_struct.case = Some(CitekeyCase::Lower); - let _ = formatting_struct.do_formatting().rev_sort_new_keys_by_len(); - // now the longer citekey is processed first and its in lowercase! 
- assert_eq!( - formatting_struct.old_new_keys_map.get(0).unwrap().1, - "book:bhambra-holmwood_col_and_mod_soc_camb:and:medf_2021" + "book:bhambra-holmwood_col=mod=soc=the_camb:medf_2021" ); } diff --git a/src/bibiman/citekeys/citekey_utils.rs b/src/bibiman/citekeys/citekey_utils.rs index ee2c849..5f70224 100644 --- a/src/bibiman/citekeys/citekey_utils.rs +++ b/src/bibiman/citekeys/citekey_utils.rs @@ -16,21 +16,14 @@ ///// use biblatex::{ChunksExt, Entry, Type}; +use deunicode::deunicode; use indoc::formatdoc; use owo_colors::{ OwoColorize, colors::{BrightBlue, Green, White}, }; -use crate::{ - bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully}, - config::IGNORED_SPECIAL_CHARS, -}; - -const IGNORE_WORDS: [&str; 20] = [ - "the", "a", "an", "of", "for", "in", "at", "to", "and", "der", "die", "das", "ein", "eine", - "eines", "des", "auf", "und", "für", "vor", -]; +use crate::bibiman::{citekeys::CitekeyCase, sanitize::sanitize_single_string_fully}; pub(super) fn formatting_help() { let help = vec![ @@ -104,6 +97,9 @@ pub(super) fn build_citekey( entry: &Entry, pattern_fields: &[String], case: Option<&CitekeyCase>, + ascii_only: bool, + ignored_chars: &[char], + ignored_words: &[String], ) -> String { // mut string the citekey is built from let mut new_citekey = String::new(); @@ -114,7 +110,7 @@ pub(super) fn build_citekey( // loop over pattern fields process them 'field_loop: for pattern in pattern_fields.iter() { // parse single values from pattern field - let (field_name, word_count, char_count, inner_delimiter, cur_trailing_delimiter) = + let (field_name, max_words, max_chars, inner_delimiter, cur_trailing_delimiter) = split_formatting_pat(pattern); // built the part of the citekey from the current pattern field @@ -126,16 +122,9 @@ pub(super) fn build_citekey( // split at whitespaces, count fields and set counter for processed // splits - let mut split_field = field.split_whitespace(); + let split_field = field.split_whitespace(); let mut 
words_passed = 0; let field_count = field.split_whitespace().count(); - let word_count = if let Some(val) = word_count - && val <= field_count - { - val - } else { - field_count - }; // If there is a trailing delimiter from the previous field, push it if let Some(del) = trailing_delimiter { @@ -152,47 +141,57 @@ pub(super) fn build_citekey( } // loop over single parts of current field and add correct delimiter - 'word_loop: loop { - // process the single slices and add correct delimiter - if let Some(field_slice) = split_field.next() { - // Create word slice char by char. We need to loop over chars - // instead of a simple bytes index to also catch chars which - // consist of more than one byte (äöüøæ etc...) - let mut word_slice = String::new(); - let word_chars = field_slice.chars(); - let mut counter = 0; - 'char_loop: for mut c in word_chars { - // If camelcase is set, force first char of word to uppercase - if counter == 0 && case == Some(&CitekeyCase::Camel) { - c = c.to_ascii_uppercase() - } - if let Some(len) = char_count - && counter == len - { - break 'char_loop; - } - // if a word slice contains a special char, skip it - if IGNORED_SPECIAL_CHARS.contains(&c) { - continue 'char_loop; - } + // process the single slices and add correct delimiter + 'word_loop: for (idx, field_slice) in split_field.enumerate() { + // if the current slice is a common word from the ignore list, + // skip it. + if ignored_words.contains(&field_slice.to_lowercase()) { + continue; + } + + // Create word slice char by char. We need to loop over chars + // instead of a simple bytes index to also catch chars which + // consist of more than one byte (äöüøæ etc...) 
+ let mut word_slice = String::new(); + let word_chars = field_slice.chars(); + let mut counter = 0; + 'char_loop: for mut c in word_chars { + // If camelcase is set, force first char of word to uppercase + if counter == 0 && case == Some(&CitekeyCase::Camel) { + c = c.to_ascii_uppercase() + } + if let Some(len) = max_chars + && counter >= len + { + break 'char_loop; + } + // if a word slice contains a special char, skip it + if ignored_chars.contains(&c) { + continue 'char_loop; + } + // if non-ascii chars should be mapped, check if needed and do it + if let Some(chars) = deunicode::deunicode_char(c) + && ascii_only + { + word_slice.push_str(chars); + counter += chars.len(); + } else { word_slice.push(c); counter += 1; } - // Don't count empty slices and don't add delimiter to those - if !word_slice.is_empty() { - formatted_str = formatted_str + &word_slice; - words_passed += 1; - if word_count == words_passed { - break 'word_loop; - } else { - formatted_str = formatted_str + inner_delimiter.unwrap_or(""); - } + } + // Don't count empty slices and don't add delimiter to those + if !word_slice.is_empty() { + formatted_str = formatted_str + &word_slice; + words_passed += 1; + if max_words.is_some_and(|max| max == words_passed) || idx + 1 == field_count { + break 'word_loop; } else { - continue 'word_loop; + formatted_str = formatted_str + inner_delimiter.unwrap_or(""); } } else { - break 'word_loop; - }; + continue 'word_loop; + } } formatted_str }; diff --git a/src/config.rs b/src/config.rs index b1c4b07..7c1a0f8 100644 --- a/src/config.rs +++ b/src/config.rs @@ -20,6 +20,7 @@ use std::{ io::{Write, stdin}, path::PathBuf, str::FromStr, + sync::LazyLock, }; use color_eyre::{eyre::Result, owo_colors::OwoColorize}; @@ -40,6 +41,31 @@ pub const IGNORED_SPECIAL_CHARS: [char; 33] = [ '&', '/', '`', '´', '#', '+', '*', '=', '|', '<', '>', '^', '°', '_', '"', ]; +pub static IGNORED_WORDS: LazyLock<Vec<String>> = LazyLock::new(|| { + vec![ + String::from("the"), + 
String::from("a"), + String::from("an"), + String::from("of"), + String::from("for"), + String::from("in"), + String::from("at"), + String::from("to"), + String::from("and"), + String::from("der"), + String::from("die"), + String::from("das"), + String::from("ein"), + String::from("eine"), + String::from("eines"), + String::from("des"), + String::from("auf"), + String::from("und"), + String::from("für"), + String::from("vor"), + ] +}); + const DEFAULT_CONFIG: &str = r##" # [general] ## Default files/dirs which are loaded on startup @@ -118,6 +144,40 @@ const DEFAULT_CONFIG: &str = r##" ## Convert chars to specified case. Possible values: ## "upper", "uppercase", "lower", "lowercase" # case = "lowercase" + +## Map all unicode chars to their pure ascii equivalent +# ascii_only = true + +## List of special chars that'll be ignored when building citekeys. +## A custom list will overwrite the default list +# ignored_chars = [ +# "?", "!", "\\", "\'", ".", "-", "–", ":", ",", "[", "]", "(", ")", "{", "}", "§", "$", "%", "&", "/", "`", "´", "#", "+", "*", "=", "|", "<", ">", "^", "°", "_", """, +# ] + +## List of words that'll be ignored when building citekeys. +## A custom list will overwrite the default list +# ignored_words = [ +# "the", +# "a", +# "an", +# "of", +# "for", +# "in", +# "at", +# "to", +# "and", +# "der", +# "die", +# "das", +# "ein", +# "eine", +# "eines", +# "des", +# "auf", +# "und", +# "für", +# "vor", +# ] "##; /// Main struct of the config file. 
Contains substructs/headings in toml @@ -171,6 +231,9 @@ pub struct Colors { pub struct CitekeyFormatter { pub fields: Option<Vec<String>>, pub case: Option<CitekeyCase>, + pub ascii_only: bool, + pub ignored_chars: Option<Vec<char>>, + pub ignored_words: Option<Vec<String>>, } impl Default for BibiConfig { @@ -194,6 +257,9 @@ impl Default for BibiConfig { citekey_formatter: CitekeyFormatter { fields: None, case: None, + ascii_only: true, + ignored_chars: None, + ignored_words: None, }, } } @@ -224,6 +290,9 @@ impl BibiConfig { citekey_formatter: CitekeyFormatter { fields: None, case: None, + ascii_only: true, + ignored_chars: None, + ignored_words: None, }, } } diff --git a/tests/biblatex-test-citekeys.bib b/tests/biblatex-test-citekeys.bib new file mode 100644 index 0000000..9767f97 --- /dev/null +++ b/tests/biblatex-test-citekeys.bib @@ -0,0 +1,476 @@ +@set{set, + entryset = {article:herrmann-ofele_carboc=carben=as_2006,article:aksin-turkmen_effect=immobi=on_2006,article:yoon-ryu_pallad=pincer=comple_2006}, + annotation = {A \texttt{set} with three members.}, +} + +@set{set, + entryset = {article:glashow_partia=symmet=weak_1961,article:weinberg_model=lepton_1967,salam}, + annotation = {A \texttt{set} with three members discussing the standard + model of particle physics.}, +} + +@collection{collection:matuz-miller_contem=litera=critic_1990gale, + title = {Contemporary Literary Criticism}, + year = {1990}, + location = {Detroit}, + publisher = {Gale}, + volume = {61}, + pages = {204--208}, + editor = {Matuz, Roger and Miller, Helen}, + keywords = {narration}, + langid = {english}, + langidopts = {variant=american}, + annotation = {A \texttt{collection} entry providing the excerpt information + for the \texttt{article:doody_heming=style=jakes_1974} entry. 
Note the format of the \texttt{ + pages} field}, +} + +@article{article:aksin-turkmen_effect=immobi=on_2006, + title = {Effect of immobilization on catalytic characteristics of saturated + {Pd-N}-heterocyclic carbenes in {Mizoroki-Heck} reactions}, + author = {Aks{\i}n, {\"O}zge and T{\"u}rkmen, Hayati and Artok , Levent and + { \c{C}}etinkaya, Bekir and Ni, Chaoying and B{\" u}y{ \"u}kg{\"u} + ng{ \" o}r, Orhan and {\"O}zkal, Erhan}, + volume = {691}, + number = {13}, + pages = {3027--3036}, + journaltitle = jomch, + date = {2006}, + indextitle = {Effect of immobilization on catalytic characteristics}, +} + +@article{article:angenendt_honore=salvat=vom_2002, + title = {In Honore Salvatoris~-- Vom Sinn und Unsinn der Patrozinienkunde}, + shorttitle = {In Honore Salvatoris}, + author = {Angenendt, Arnold}, + volume = {97}, + pages = {431--456, 791--823}, + journaltitle = {Revue d'Histoire Eccl{\'e}siastique}, + date = {2002}, + langid = {german}, + indextitle = {In Honore Salvatoris}, + annotation = {A German article in a French journal. Apart from that, a + typical \texttt{article} entry. Note the \texttt{indextitle} + field}, +} + +@book{book:aristotle_de=anima_1907cambr#unive#press, + title = {De Anima}, + author = {Aristotle}, + location = {Cambridge}, + publisher = cup, + date = {1907}, + editor = {Hicks, Robert Drew}, + keywords = {primary, ancient, philosophy, athens}, + langid = {english}, + langidopts = {variant=british}, + annotation = {A \texttt{book} entry with an \texttt{author} and an \texttt{ + editor}}, +} + +@book{book:aristotle_physic_1929g#p#putna, + title = {Physics}, + shorttitle = {Physics}, + author = {Aristotle}, + location = {New York}, + publisher = {G. P. Putnam}, + url = {https://www.infobooks.org/authors/classic/aristotle-books/#Physic}, + date = {1929}, + translator = {Wicksteed, P. H. and Cornford, F. 
M.}, + keywords = {primary, ancient, philosophy}, + langid = {english}, + langidopts = {variant=american}, + file = {~/Documents/coding/projects/bibiman/tests/book:aristotle_physic_1929g#p#putna.pdf}, + annotation = {A \texttt{book} entry with a \texttt{translator} field}, + abstract = {The Physics is a work by Aristotle dedicated to the study of + nature. Regarded by Heidegger as "the fundamental work of Western + philosophy", it presents the renowned distinction between the + four types of cause, as well as reflections on chance, motion, + infinity, and other fundamental concepts. It is here that + Aristotle sets out his celebrated paradox of time.}, +} + +@book{book:aristotle_poetic_1968clare#press, + title = {Poetics}, + shorttitle = {Poetics}, + author = {Aristotle}, + location = {Oxford}, + publisher = {Clarendon Press}, + series = {Clarendon {Aristotle}}, + date = {1968}, + editor = {Lucas, D. W.}, + keywords = {primary}, + langid = {english}, + langidopts = {variant=british}, + annotation = {A \texttt{book} entry with an \texttt{author} and an \texttt{ + editor} as well as a \texttt{series} field}, +} + +@mvbook{mvbook:aristotle_rhetor=aristo=with_1877cambr#unive#press, + title = {The \textbf{Rhetoric} of {Aristotle} with a commentary by the late {Edward + Meredith Cope}}, + shorttitle = {Rhetoric}, + author = {Aristotle}, + publisher = cup, + date = {1877}, + editor = {Cope, Edward Meredith}, + commentator = {Cope, Edward Meredith}, + volumes = {3}, + keywords = {primary}, + langid = {english}, + langidopts = {variant=british}, + sorttitle = {Rhetoric of Aristotle}, + indextitle = {Rhetoric of {Aristotle}, The}, + annotation = {A commented edition. 
Note the concatenation of the \texttt{ + editor} and \texttt{commentator} fields as well as the \texttt{ + volumes}, \texttt{sorttitle}, and \texttt{indextitle} fields}, +} + +@book{book:augustine_hetero=cataly=synthe_1995marce#dekke, + title = {Heterogeneous catalysis for the synthetic \textit{chemist}}, + shorttitle = {Heterogeneous catalysis}, + author = {Augustine, Robert L.}, + location = {New York}, + publisher = {Marcel Dekker}, + date = {1995}, + langid = {english}, + langidopts = {variant=american}, + annotation = {A plain \texttt{book} entry}, + keywords = {chemistry}, +} + +@book{book:averroes_epistl=on=possib_1982jewis#theol#semin#ameri, + title = {The Epistle on the Possibility of Conjunction with the Active + Intellect by {Ibn Rushd} with the Commentary of {Moses Narboni}}, + shorttitle = {Possibility of Conjunction}, + author = {Averroes}, + location = {New York}, + publisher = {Jewish Theological Seminary of America}, + series = {Moreshet: Studies in {Jewish} History, Literature and Thought}, + number = {7}, + date = {1982}, + editor = {Bland, Kalman P.}, + translator = {Bland, Kalman P.}, + keywords = {primary}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Epistle on the Possibility of Conjunction, The}, + annotation = {A \texttt{book} entry with a \texttt{series} and a \texttt{ + number}. Note the concatenation of the \texttt{editor} and + \texttt{translator} fields as well as the \texttt{indextitle} + field}, +} + +@article{article:baez-lauda_higher=algebr=v_2004, + title = {Higher-Dimensional Algebra {V}: 2-Groups}, + author = {Baez, John C. and Lauda, Aaron D.}, + volume = {12}, + pages = {423--491}, + journaltitle = {Theory and Applications of Categories}, + date = {2004}, + version = {3}, + eprint = {math/0307200v3}, + eprinttype = {arxiv}, + langid = {english}, + keywords = {math}, + langidopts = {variant=american}, + annotation = {An \texttt{article} with \texttt{eprint} and \texttt{ + eprinttype} fields. 
Note that the arXiv reference is + transformed into a clickable link if \texttt{hyperref} support + has been enabled. Compare \texttt{baez\slash online}, which is + the same item given as an \texttt{online} entry}, +} + +@article{article:bertram-wentworth_gromov=invari=holomo_1996, + title = {Gromov invariants for holomorphic maps on {Riemann} surfaces}, + shorttitle = {Gromov invariants}, + author = {Bertram, Aaron and Wentworth, Richard}, + volume = {9}, + number = {2}, + pages = {529--571}, + journaltitle = jams, + date = {1996}, + langid = {english}, + langidopts = {variant=american}, + annotation = {An \texttt{article} entry with a \texttt{volume} and a \texttt + {number} field}, +} + +@article{article:doody_heming=style=jakes_1974, + title = {Hemingway's Style and {Jake's} Narration}, + author = {Doody, Terrence}, + year = {1974}, + journal = {The Journal of Narrative Technique}, + volume = {4}, + number = {3}, + pages = {212--225}, + langid = {english}, + langidopts = {variant=american}, + related = {matuz:article:doody_heming=style=jakes_1974}, + relatedstring = {\autocap{e}xcerpt in}, + annotation = {An \texttt{article} entry cited as an excerpt from a \texttt{ + collection} entry. Note the format of the \texttt{related} and + \texttt{relatedstring} fields}, +} + +@article{article:gillies_herder=prepar=goethe_1933, + title = {Herder and the Preparation of {Goethe's} Idea of World Literature}, + author = {Gillies, Alexander}, + series = {newseries}, + volume = {9}, + pages = {46--67}, + journaltitle = {Publications of the English Goethe Society}, + date = {1933}, + langid = {english}, + langidopts = {variant=british}, + annotation = {An \texttt{article} entry with a \texttt{series} and a \texttt + {volume} field. 
Note that format of the \texttt{series} field + in the database file}, +} + +@article{article:glashow_partia=symmet=weak_1961, + title = {Partial Symmetries of Weak Interactions}, + author = {Glashow, Sheldon}, + volume = {22}, + pages = {579--588}, + journaltitle = {Nucl.~Phys.}, + date = {1961}, +} + +@article{article:herrmann-ofele_carboc=carben=as_2006, + title = {A carbocyclic carbene as an efficient catalyst ligand for {C--C} + coupling reactions}, + author = {Herrmann, Wolfgang A. and {\"O}fele, Karl and Schneider, Sabine K. + and Herdtweck, Eberhardt and Hoffmann, Stephan D.}, + volume = {45}, + number = {23}, + pages = {3859--3862}, + journaltitle = anch-ie, + date = {2006}, + indextitle = {Carbocyclic carbene as an efficient catalyst, A}, +} + +@article{article:hostetler-wingate_alkane=gold=cluste_1998, + title = {Alkanethiolate gold cluster molecules with core diameters from 1.5 + to 5.2~{nm}}, + shorttitle = {Alkanethiolate gold cluster molecules}, + author = {Hostetler, Michael J. and Wingate, Julia E. and Zhong, Chuan-Jian + and Harris, Jay E. and Vachet, Richard W. and Clark, Michael R. and + Londono, J. David and Green, Stephen J. and Stokes, Jennifer J. and + Wignall, George D. and Glish, Gary L. and Porter, Marc D. and Evans + , Neal D. and Murray, Royce W.}, + volume = {14}, + number = {1}, + pages = {17--30}, + journaltitle = {Langmuir}, + date = {1998}, + subtitle = {Core and monolayer properties as a function of core size}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Alkanethiolate gold cluster molecules}, + annotation = {An \texttt{article} entry with \arabic{author} authors. By + default, long author and editor lists are automatically + truncated. This is configurable}, +} + +@article{article:kastenholz-hunenberger_comput=method=ionic_2006, + title = {Computation of methodology\hyphen independent ionic solvation free + energies from molecular simulations}, + author = {Kastenholz, M. A. 
and H{\"u}nenberger, Philippe H.}, + volume = {124}, + doi = {10.1063/1.2172593}, + journaltitle = jchph, + date = {2006}, + subtitle = {{I}. {The} electrostatic potential in molecular liquids}, + eid = {124106}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Computation of ionic solvation free energies}, + annotation = {An \texttt{article} entry with an \texttt{eid} and a \texttt{ + doi} field. Note that the \textsc{doi} is transformed into a + clickable link if \texttt{hyperref} support has been enabled}, + abstract = {The computation of \texttt{ionic} solvation free energies from atomistic + simulations is a surprisingly difficult problem that has found no + satisfactory solution for more than 15 years. The reason is that + the charging free energies evaluated from such simulations are + affected by very large errors. One of these is related to the + choice of a specific convention for summing up the contributions + of solvent charges to the electrostatic potential in the ionic + cavity, namely, on the basis of point charges within entire + solvent molecules (M scheme) or on the basis of individual point + charges (P scheme). The use of an inappropriate convention may + lead to a charge-independent offset in the calculated potential, + which depends on the details of the summation scheme, on the + quadrupole-moment trace of the solvent molecule, and on the + approximate form used to represent electrostatic interactions in + the system. However, whether the M or P scheme (if any) + represents the appropriate convention is still a matter of + on-going debate. The goal of the present article is to settle + this long-standing controversy by carefully analyzing (both + analytically and numerically) the properties of the electrostatic + potential in molecular liquids (and inside cavities within them). 
+ }, +} + +@article{article:sarfraz-razzak_techni=sectio=algori_2002, + title = {Technical section: {An} algorithm for automatic capturing of the + font outlines}, + author = {M. Sarfraz and M. F. A. Razzak}, + year = {2002}, + journal = {Computers and Graphics}, + volume = {26}, + number = {5}, + pages = {795--804}, + issn = {0097-8493}, + annotation = {An \texttt{article} entry with an \texttt{issn} field}, +} + +@article{article:reese_georgi=anglos=diplom_1958, + title = {Georgia in {Anglo-Spanish} Diplomacy, 1736--1739}, + author = {Reese, Trevor R.}, + series = {3}, + volume = {15}, + pages = {168--190}, + journaltitle = {William and Mary Quarterly}, + date = {1958}, + langid = {english}, + langidopts = {variant=american}, + annotation = {An \texttt{article} entry with a \texttt{series} and a \texttt + {volume} field. Note the format of the series. If the value of + the \texttt{series} field is an integer, this number is printed + as an ordinal and the string \enquote*{series} is appended + automatically}, +} + +@article{article:shore_twiceb=once=concei_1991, + title = {Twice-Born, Once Conceived}, + author = {Shore, Bradd}, + series = {newseries}, + volume = {93}, + number = {1}, + pages = {9--27}, + journaltitle = {American Anthropologist}, + date = {1991-03}, + subtitle = {Meaning Construction and Cultural Cognition}, + annotation = {An \texttt{article} entry with \texttt{series}, \texttt{volume + }, and \texttt{number} fields. 
Note the format of the \texttt{ + series} which is a localization key}, +} + +@article{article:sigfridsson-ryde_compar=method=derivi_1998, + title = {Comparison of methods for deriving atomic charges from the + electrostatic potential and moments}, + author = {Sigfridsson, Emma and Ryde, Ulf}, + volume = {19}, + number = {4}, + pages = {377--395}, + doi = {10.1002/(SICI)1096-987X(199803)19:4<377::AID-JCC1>3.0.CO;2-P}, + journaltitle = {Journal of Computational Chemistry}, + date = {1998}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {Methods for deriving atomic charges}, + annotation = {An \texttt{article} entry with \texttt{volume}, \texttt{number + }, and \texttt{doi} fields. Note that the \textsc{doi} is + transformed into a clickable link if \texttt{hyperref} support + has been enabled}, + abstract = {Four methods for deriving partial atomic charges from the + quantum chemical electrostatic potential (CHELP, CHELPG, + Merz-Kollman, and RESP) have been compared and critically + evaluated. It is shown that charges strongly depend on how and + where the potential points are selected. Two alternative methods + are suggested to avoid the arbitrariness in the point-selection + schemes and van der Waals exclusion radii: CHELP-BOW, which also + estimates the charges from the electrostatic potential, but with + potential points that are Boltzmann-weighted after their + occurrence in actual simulations using the energy function of the + program in which the charges will be used, and CHELMO, which + estimates the charges directly from the electrostatic multipole + moments. 
Different criteria for the quality of the charges are + discussed.}, +} + +@article{article:spiegelberg_intent=intent=schola_1969, + title = {\mkbibquote{Intention} und \mkbibquote{Intentionalit{\"a}t} in der + Scholastik, bei Brentano und Husserl}, + shorttitle = {Intention und Intentionalit{\"a}t}, + author = {Spiegelberg, Herbert}, + volume = {29}, + pages = {189--216}, + journaltitle = {Studia Philosophica}, + date = {1969}, + langid = {german}, + sorttitle = {Intention und Intentionalitat in der Scholastik, bei Brentano + und Husserl}, + indexsorttitle = {Intention und Intentionalitat in der Scholastik, bei + Brentano und Husserl}, + annotation = {An \texttt{article} entry. Note the \texttt{sorttitle} and + \texttt{indexsorttitle} fields and the markup of the quotes in + the database file}, +} + +@article{article:springer_mediae=pilgri=routes_1950, + title = {Mediaeval Pilgrim Routes from {Scandinavia} to {Rome}}, + shorttitle = {Mediaeval Pilgrim Routes}, + author = {Springer, Otto}, + volume = {12}, + pages = {92--122}, + journaltitle = {Mediaeval Studies}, + date = {1950}, + langid = {english}, + langidopts = {variant=british}, + annotation = {A plain \texttt{article} entry}, +} + +@article{article:weinberg_model=lepton_1967, + title = {A Model of Leptons}, + author = {Weinberg, Steven}, + volume = {19}, + pages = {1264--1266}, + journaltitle = {Phys.~Rev.~Lett.}, + date = {1967}, +} + +@string{anch-ie = {Angew.~Chem. Int.~Ed.}} + +@string{cup = {Cambridge University Press}} + +@string{dtv = {Deutscher Taschenbuch-Verlag}} + +@string{hup = {Harvard University Press}} + +@string{jams = {J.~Amer. Math. Soc.}} + +@string{jchph = {J.~Chem. Phys.}} + +@string{jomch = {J.~Organomet. 
Chem.}} + +@string{pup = {Princeton University Press}} + +@incollection{incollection:westfahl_true=fronti, + title = {The True Frontier}, + author = {Westfahl, Gary}, + pages = {55--65}, + subtitle = {Confronting and Avoiding the Realities of Space in {American} + Science Fiction Films}, + crossref = {westfahl:frontier}, + langid = {english}, + langidopts = {variant=american}, + indextitle = {True Frontier, The}, + annotation = {A cross-referenced article from a \texttt{collection}. This is + an \texttt{incollection} entry with a \texttt{crossref} field. + Note the \texttt{subtitle} and \texttt{indextitle} fields}, +} + +@article{article:yoon-ryu_pallad=pincer=comple_2006, + title = {Palladium pincer complexes with reduced bond angle strain: + efficient catalysts for the {Heck} reaction}, + author = {Yoon, Myeong S. and Ryu, Dowook and Kim, Jeongryul and Ahn, Kyo + Han}, + volume = {25}, + number = {10}, + pages = {2409--2411}, + journaltitle = {Organometallics}, + date = {2006}, + indextitle = {Palladium pincer complexes}, +} diff --git a/tests/test-config.toml b/tests/test-config.toml index d3e42c5..8dd8014 100644 --- a/tests/test-config.toml +++ b/tests/test-config.toml @@ -61,10 +61,13 @@ custom_column = "series" # year_color = "135" [citekey_formatter] -fields = ["entrytype;;;;:", "author;2;;-;_", "title;3;6;_;_", "year", "publisher;;5;#;" ] +fields = ["shorthand;;;;+","entrytype;;;;:", "author;2;;-;_", "title;3;6;=;_", "year", "publisher;;5;#;" ] # fields = [ # CamelCase test # "author;2;;;", # "title;5;5;;", # "year" # ] case = "lowercase" +ascii_only = true +# ignored_words = ["the"] +# ignored_chars = ["?", "."] |
