diff options
| author | Ellie Huxtable <ellie@elliehuxtable.com> | 2026-03-16 15:22:49 -0700 |
|---|---|---|
| committer | Ellie Huxtable <ellie@elliehuxtable.com> | 2026-03-16 15:22:49 -0700 |
| commit | 280499651c8308555e287dea79aa6569a95d99f5 (patch) | |
| tree | 355648de82f0134ad6a62fb9f361279c0a8485b5 /crates/atuin-nucleo/matcher | |
| parent | specify version in all daemon atuin crates (diff) | |
| parent | Squashed 'crates/atuin-nucleo/' content from commit 4253de9f (diff) | |
| download | atuin-280499651c8308555e287dea79aa6569a95d99f5.zip | |
bring in base nucleo
Diffstat (limited to 'crates/atuin-nucleo/matcher')
24 files changed, 5623 insertions, 0 deletions
diff --git a/crates/atuin-nucleo/matcher/Cargo.toml b/crates/atuin-nucleo/matcher/Cargo.toml new file mode 100644 index 00000000..4b90ddbb --- /dev/null +++ b/crates/atuin-nucleo/matcher/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "nucleo-matcher" +description = "plug and play high performance fuzzy matcher" +authors = ["Pascal Kuthe <pascalkuthe@pm.me>"] +version = "0.3.1" +edition = "2021" +license = "MPL-2.0" +repository = "https://github.com/helix-editor/nucleo" +readme = "../README.md" + +[dependencies] +memchr = "2.5.0" +unicode-segmentation = { version = "1.10", optional = true } + +[features] +default = ["unicode-normalization", "unicode-casefold", "unicode-segmentation"] +unicode-normalization = [] +unicode-casefold = [] +unicode-segmentation = ["dep:unicode-segmentation"] diff --git a/crates/atuin-nucleo/matcher/LICENSE b/crates/atuin-nucleo/matcher/LICENSE new file mode 120000 index 00000000..ea5b6064 --- /dev/null +++ b/crates/atuin-nucleo/matcher/LICENSE @@ -0,0 +1 @@ +../LICENSE
\ No newline at end of file diff --git a/crates/atuin-nucleo/matcher/fuzz.sh b/crates/atuin-nucleo/matcher/fuzz.sh new file mode 100755 index 00000000..d3ffa2c9 --- /dev/null +++ b/crates/atuin-nucleo/matcher/fuzz.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +cargo +nightly fuzz "${1}" fuzz_target_1 "${@:2:99}" diff --git a/crates/atuin-nucleo/matcher/fuzz/.gitignore b/crates/atuin-nucleo/matcher/fuzz/.gitignore new file mode 100644 index 00000000..1a45eee7 --- /dev/null +++ b/crates/atuin-nucleo/matcher/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/crates/atuin-nucleo/matcher/fuzz/Cargo.toml b/crates/atuin-nucleo/matcher/fuzz/Cargo.toml new file mode 100644 index 00000000..1b9d8a7f --- /dev/null +++ b/crates/atuin-nucleo/matcher/fuzz/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "fzf_oxide-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +arbitrary = { version = "1", features = ["derive"] } + +[dependencies.fzf_oxide] +path = ".." + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[profile.release] +debug = 1 + +[[bin]] +name = "fuzz_target_1" +path = "fuzz_targets/fuzz_target_1.rs" +test = false +doc = false + diff --git a/crates/atuin-nucleo/matcher/fuzz/fuzz_targets/fuzz_target_1.rs b/crates/atuin-nucleo/matcher/fuzz/fuzz_targets/fuzz_target_1.rs new file mode 100644 index 00000000..d9df7d36 --- /dev/null +++ b/crates/atuin-nucleo/matcher/fuzz/fuzz_targets/fuzz_target_1.rs @@ -0,0 +1,78 @@ +#![no_main] + +use fzf_oxide::{chars, Matcher, MatcherConfig, Utf32Str}; +use libfuzzer_sys::arbitrary::Arbitrary; +use libfuzzer_sys::fuzz_target; + +#[derive(Arbitrary, Debug)] +pub struct Input<'a> { + haystack: &'a str, + needle: &'a str, + ignore_case: bool, + normalize: bool, +} + +fuzz_target!(|data: Input<'_>| { + let mut data = data; + let mut config = MatcherConfig::DEFAULT; + config.ignore_case = data.ignore_case; + config.normalize = data.normalize; + let mut matcher = Matcher::new(config); + let mut indices_optimal = Vec::new(); + let mut indices_greedy = Vec::new(); + let mut needle_buf = Vec::new(); + let mut haystack_buf = Vec::new(); + let normalize = |mut c: char| { + if config.normalize { + c = chars::normalize(c); + } + if config.ignore_case { + c = chars::to_lower_case(c); + } + c + }; + let needle: String = data.needle.chars().map(normalize).collect(); + let needle_chars: Vec<_> = needle.chars().collect(); + let needle = Utf32Str::new(&needle, &mut needle_buf); + let haystack = Utf32Str::new(data.haystack, &mut haystack_buf); + + let greedy_score = matcher.fuzzy_indices_greedy(haystack, needle, &mut indices_greedy); + if greedy_score.is_some() { + let match_chars: Vec<_> = indices_greedy + .iter() + .map(|&i| normalize(haystack.get(i))) + .collect(); + assert_eq!( + match_chars, needle_chars, + "failed match, found {indices_greedy:?} {match_chars:?} (greedy)" + ); + } + let optimal_score = matcher.fuzzy_indices(haystack, needle, &mut indices_optimal); + if optimal_score.is_some() { + let match_chars: Vec<_> = indices_optimal + .iter() + .map(|&i| normalize(haystack.get(i))) + .collect(); + assert_eq!( + match_chars, needle_chars, + "failed match, found {indices_optimal:?} {match_chars:?}" + ); + } + match (greedy_score, optimal_score) { + (None, Some(score)) => unreachable!("optimal matched {score} but greedy did not match"), + (Some(score), None) => unreachable!("greedy matched {score} but optimal did not match"), + (Some(greedy), Some(optimal)) => { + assert!( + greedy <= optimal, + "optimal score must be atleast the same as greedy score {greedy} {optimal}" + ); + if indices_greedy == indices_optimal { + assert_eq!( + greedy, optimal, + "if matching same char greedy and optimal score should be identical" + ) + } + } + (None, None) => (), + } +}); diff --git a/crates/atuin-nucleo/matcher/generate_case_fold_table.sh b/crates/atuin-nucleo/matcher/generate_case_fold_table.sh new file mode 100755 index 00000000..32a26697 --- /dev/null +++ b/crates/atuin-nucleo/matcher/generate_case_fold_table.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -e + +dir=$(pwd) +mkdir /tmp/ucd-15.0.0 +cd /tmp/ucd-15.0.0 +curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip +unzip UCD.zip + +cd "${dir}" +cargo install ucd-generate +ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/chars/case_fold.rs +rm -rf /tmp/ucd-15.0.0 diff --git a/crates/atuin-nucleo/matcher/src/chars.rs b/crates/atuin-nucleo/matcher/src/chars.rs new file mode 100644 index 00000000..d13a2466 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/chars.rs @@ -0,0 +1,207 @@ +//! Utilities for working with (unicode) characters/codepoints + +use std::fmt::{self, Debug, Display}; + +#[cfg(feature = "unicode-casefold")] +use crate::chars::case_fold::CASE_FOLDING_SIMPLE; +use crate::Config; + +//autogenerated by generate-ucd +#[allow(warnings)] +#[rustfmt::skip] +#[cfg(feature = "unicode-casefold")] +mod case_fold; +#[cfg(feature = "unicode-normalization")] +mod normalize; + +pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { + const ASCII: bool; + fn char_class(self, config: &Config) -> CharClass; + fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass); + fn normalize(self, config: &Config) -> Self; +} + +/// repr tansparent wrapper around u8 with better formatting and `PartialEq<char>` implementation +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub(crate) struct AsciiChar(pub u8); + +impl AsciiChar { + pub fn cast(bytes: &[u8]) -> &[AsciiChar] { + unsafe { &*(bytes as *const [u8] as *const [AsciiChar]) } + } +} + +impl fmt::Display for AsciiChar { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Display::fmt(&(self.0 as char), f) + } +} + +impl PartialEq<AsciiChar> for char { + fn eq(&self, other: &AsciiChar) -> bool { + other.0 as char == *self + } +} + +impl Char for AsciiChar { + const ASCII: bool = true; + #[inline] + fn char_class(self, config: &Config) -> CharClass { + let c = self.0; + // using manual if conditions instead optimizes better + if c >= b'a' && c <= b'z' { + CharClass::Lower + } else if c >= b'A' && c <= b'Z' { + CharClass::Upper + } else if c >= b'0' && c <= b'9' { + CharClass::Number + } else if c.is_ascii_whitespace() { + CharClass::Whitespace + } else if config.delimiter_chars.contains(&c) { + CharClass::Delimiter + } else { + CharClass::NonWord + } + } + + #[inline(always)] + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { + let char_class = self.char_class(config); + if config.ignore_case && char_class == CharClass::Upper { + self.0 += 32 + } + (self, char_class) + } + + #[inline(always)] + fn normalize(mut self, config: &Config) -> Self { + if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { + self.0 += 32 + } + self + } +} +fn char_class_non_ascii(c: char) -> CharClass { + if c.is_lowercase() { + CharClass::Lower + } else if is_upper_case(c) { + CharClass::Upper + } else if c.is_numeric() { + CharClass::Number + } else if c.is_alphabetic() { + CharClass::Letter + } else if c.is_whitespace() { + CharClass::Whitespace + } else { + CharClass::NonWord + } +} +impl Char for char { + const ASCII: bool = false; + #[inline(always)] + fn char_class(self, config: &Config) -> CharClass { + if self.is_ascii() { + return AsciiChar(self as u8).char_class(config); + } + char_class_non_ascii(self) + } + + #[inline(always)] + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { + if self.is_ascii() { + let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); + return (c.0 as char, class); + } + let char_class = char_class_non_ascii(self); + #[cfg(feature = "unicode-casefold")] + let mut case_fold = char_class == CharClass::Upper; + #[cfg(feature = "unicode-normalization")] + if config.normalize { + self = normalize::normalize(self); + case_fold = true + } + #[cfg(feature = "unicode-casefold")] + if case_fold && config.ignore_case { + self = CASE_FOLDING_SIMPLE + .binary_search_by_key(&self, |(upper, _)| *upper) + .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1) + } + (self, char_class) + } + + #[inline(always)] + fn normalize(mut self, config: &Config) -> Self { + #[cfg(feature = "unicode-normalization")] + if config.normalize { + self = normalize::normalize(self); + } + #[cfg(feature = "unicode-casefold")] + if config.ignore_case { + self = to_lower_case(self) + } + self + } +} + +#[cfg(feature = "unicode-normalization")] +pub use normalize::normalize; +#[cfg(feature = "unicode-segmentation")] +use unicode_segmentation::UnicodeSegmentation; + +/// Converts a character to lower case using simple unicode case folding +#[cfg(feature = "unicode-casefold")] +#[inline(always)] +pub fn to_lower_case(c: char) -> char { + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) +} + +/// Checks if a character is upper case according to simple unicode case folding. +/// if the `unicode-casefold` feature is disable the equivalent std function is used +#[inline(always)] +pub fn is_upper_case(c: char) -> bool { + #[cfg(feature = "unicode-casefold")] + let val = CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .is_ok(); + #[cfg(not(feature = "unicode-casefold"))] + let val = c.is_uppercase(); + val +} + +#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] +pub(crate) enum CharClass { + Whitespace, + NonWord, + Delimiter, + Lower, + Upper, + Letter, + Number, +} + +/// Nucleo cannot match graphemes as single units. To work around +/// that we only use the first codepoint of each grapheme. This +/// iterator returns the first character of each unicode grapheme +/// in a string and is used for constructing `Utf32Str(ing)`. +pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ { + #[cfg(feature = "unicode-segmentation")] + let res = text.graphemes(true).map(|grapheme| { + // we need to special-case this check since `\r\n` is a single grapheme and is + // therefore the exception to the rule that normalization of a grapheme should + // map to the first character. + if grapheme == "\r\n" { + '\n' + } else { + grapheme + .chars() + .next() + .expect("graphemes must be non-empty") + } + }); + #[cfg(not(feature = "unicode-segmentation"))] + let res = text.chars(); + res +} diff --git a/crates/atuin-nucleo/matcher/src/chars/case_fold.rs b/crates/atuin-nucleo/matcher/src/chars/case_fold.rs new file mode 100644 index 00000000..aacbe461 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/chars/case_fold.rs @@ -0,0 +1,347 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.3.0 is available on crates.io. + +pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[ + ('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), + ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'), + ('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), + ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'), + ('Y', 'y'), ('Z', 'z'), ('µ', 'μ'), ('À', 'à'), ('Á', 'á'), + ('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'), + ('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'), + ('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'), + ('Ñ', 'ñ'), ('Ò', 'ò'), ('Ó', 'ó'), ('Ô', 'ô'), ('Õ', 'õ'), + ('Ö', 'ö'), ('Ø', 'ø'), ('Ù', 'ù'), ('Ú', 'ú'), ('Û', 'û'), + ('Ü', 'ü'), ('Ý', 'ý'), ('Þ', 'þ'), ('Ā', 'ā'), ('Ă', 'ă'), + ('Ą', 'ą'), ('Ć', 'ć'), ('Ĉ', 'ĉ'), ('Ċ', 'ċ'), ('Č', 'č'), + ('Ď', 'ď'), ('Đ', 'đ'), ('Ē', 'ē'), ('Ĕ', 'ĕ'), ('Ė', 'ė'), + ('Ę', 'ę'), ('Ě', 'ě'), ('Ĝ', 'ĝ'), ('Ğ', 'ğ'), ('Ġ', 'ġ'), + ('Ģ', 'ģ'), ('Ĥ', 'ĥ'), ('Ħ', 'ħ'), ('Ĩ', 'ĩ'), ('Ī', 'ī'), + ('Ĭ', 'ĭ'), ('Į', 'į'), ('IJ', 'ij'), ('Ĵ', 'ĵ'), ('Ķ', 'ķ'), + ('Ĺ', 'ĺ'), ('Ļ', 'ļ'), ('Ľ', 'ľ'), ('Ŀ', 'ŀ'), ('Ł', 'ł'), + ('Ń', 'ń'), ('Ņ', 'ņ'), ('Ň', 'ň'), ('Ŋ', 'ŋ'), ('Ō', 'ō'), + ('Ŏ', 'ŏ'), ('Ő', 'ő'), ('Œ', 'œ'), ('Ŕ', 'ŕ'), ('Ŗ', 'ŗ'), + ('Ř', 'ř'), ('Ś', 'ś'), ('Ŝ', 'ŝ'), ('Ş', 'ş'), ('Š', 'š'), + ('Ţ', 'ţ'), ('Ť', 'ť'), ('Ŧ', 'ŧ'), ('Ũ', 'ũ'), ('Ū', 'ū'), + ('Ŭ', 'ŭ'), ('Ů', 'ů'), ('Ű', 'ű'), ('Ų', 'ų'), ('Ŵ', 'ŵ'), + ('Ŷ', 'ŷ'), ('Ÿ', 'ÿ'), ('Ź', 'ź'), ('Ż', 'ż'), ('Ž', 'ž'), + ('ſ', 's'), ('Ɓ', 'ɓ'), ('Ƃ', 'ƃ'), ('Ƅ', 'ƅ'), ('Ɔ', 'ɔ'), + ('Ƈ', 'ƈ'), ('Ɖ', 'ɖ'), ('Ɗ', 'ɗ'), ('Ƌ', 'ƌ'), ('Ǝ', 'ǝ'), + ('Ə', 'ə'), ('Ɛ', 'ɛ'), ('Ƒ', 'ƒ'), ('Ɠ', 'ɠ'), ('Ɣ', 'ɣ'), + ('Ɩ', 'ɩ'), ('Ɨ', 'ɨ'), ('Ƙ', 'ƙ'), ('Ɯ', 'ɯ'), ('Ɲ', 'ɲ'), + ('Ɵ', 'ɵ'), ('Ơ', 'ơ'), ('Ƣ', 'ƣ'), ('Ƥ', 'ƥ'), ('Ʀ', 'ʀ'), + ('Ƨ', 'ƨ'), ('Ʃ', 'ʃ'), ('Ƭ', 'ƭ'), ('Ʈ', 'ʈ'), ('Ư', 'ư'), + ('Ʊ', 'ʊ'), ('Ʋ', 'ʋ'), ('Ƴ', 'ƴ'), ('Ƶ', 'ƶ'), ('Ʒ', 'ʒ'), + ('Ƹ', 'ƹ'), ('Ƽ', 'ƽ'), ('DŽ', 'dž'), ('Dž', 'dž'), ('LJ', 'lj'), + ('Lj', 'lj'), ('NJ', 'nj'), ('Nj', 'nj'), ('Ǎ', 'ǎ'), ('Ǐ', 'ǐ'), + ('Ǒ', 'ǒ'), ('Ǔ', 'ǔ'), ('Ǖ', 'ǖ'), ('Ǘ', 'ǘ'), ('Ǚ', 'ǚ'), + ('Ǜ', 'ǜ'), ('Ǟ', 'ǟ'), ('Ǡ', 'ǡ'), ('Ǣ', 'ǣ'), ('Ǥ', 'ǥ'), + ('Ǧ', 'ǧ'), ('Ǩ', 'ǩ'), ('Ǫ', 'ǫ'), ('Ǭ', 'ǭ'), ('Ǯ', 'ǯ'), + ('DZ', 'dz'), ('Dz', 'dz'), ('Ǵ', 'ǵ'), ('Ƕ', 'ƕ'), ('Ƿ', 'ƿ'), + ('Ǹ', 'ǹ'), ('Ǻ', 'ǻ'), ('Ǽ', 'ǽ'), ('Ǿ', 'ǿ'), ('Ȁ', 'ȁ'), + ('Ȃ', 'ȃ'), ('Ȅ', 'ȅ'), ('Ȇ', 'ȇ'), ('Ȉ', 'ȉ'), ('Ȋ', 'ȋ'), + ('Ȍ', 'ȍ'), ('Ȏ', 'ȏ'), ('Ȑ', 'ȑ'), ('Ȓ', 'ȓ'), ('Ȕ', 'ȕ'), + ('Ȗ', 'ȗ'), ('Ș', 'ș'), ('Ț', 'ț'), ('Ȝ', 'ȝ'), ('Ȟ', 'ȟ'), + ('Ƞ', 'ƞ'), ('Ȣ', 'ȣ'), ('Ȥ', 'ȥ'), ('Ȧ', 'ȧ'), ('Ȩ', 'ȩ'), + ('Ȫ', 'ȫ'), ('Ȭ', 'ȭ'), ('Ȯ', 'ȯ'), ('Ȱ', 'ȱ'), ('Ȳ', 'ȳ'), + ('Ⱥ', 'ⱥ'), ('Ȼ', 'ȼ'), ('Ƚ', 'ƚ'), ('Ⱦ', 'ⱦ'), ('Ɂ', 'ɂ'), + ('Ƀ', 'ƀ'), ('Ʉ', 'ʉ'), ('Ʌ', 'ʌ'), ('Ɇ', 'ɇ'), ('Ɉ', 'ɉ'), + ('Ɋ', 'ɋ'), ('Ɍ', 'ɍ'), ('Ɏ', 'ɏ'), ('\u{345}', 'ι'), ('Ͱ', 'ͱ'), + ('Ͳ', 'ͳ'), ('Ͷ', 'ͷ'), ('Ϳ', 'ϳ'), ('Ά', 'ά'), ('Έ', 'έ'), + ('Ή', 'ή'), ('Ί', 'ί'), ('Ό', 'ό'), ('Ύ', 'ύ'), ('Ώ', 'ώ'), + ('Α', 'α'), ('Β', 'β'), ('Γ', 'γ'), ('Δ', 'δ'), ('Ε', 'ε'), + ('Ζ', 'ζ'), ('Η', 'η'), ('Θ', 'θ'), ('Ι', 'ι'), ('Κ', 'κ'), + ('Λ', 'λ'), ('Μ', 'μ'), ('Ν', 'ν'), ('Ξ', 'ξ'), ('Ο', 'ο'), + ('Π', 'π'), ('Ρ', 'ρ'), ('Σ', 'σ'), ('Τ', 'τ'), ('Υ', 'υ'), + ('Φ', 'φ'), ('Χ', 'χ'), ('Ψ', 'ψ'), ('Ω', 'ω'), ('Ϊ', 'ϊ'), + ('Ϋ', 'ϋ'), ('ς', 'σ'), ('Ϗ', 'ϗ'), ('ϐ', 'β'), ('ϑ', 'θ'), + ('ϕ', 'φ'), ('ϖ', 'π'), ('Ϙ', 'ϙ'), ('Ϛ', 'ϛ'), ('Ϝ', 'ϝ'), + ('Ϟ', 'ϟ'), ('Ϡ', 'ϡ'), ('Ϣ', 'ϣ'), ('Ϥ', 'ϥ'), ('Ϧ', 'ϧ'), + ('Ϩ', 'ϩ'), ('Ϫ', 'ϫ'), ('Ϭ', 'ϭ'), ('Ϯ', 'ϯ'), ('ϰ', 'κ'), + ('ϱ', 'ρ'), ('ϴ', 'θ'), ('ϵ', 'ε'), ('Ϸ', 'ϸ'), ('Ϲ', 'ϲ'), + ('Ϻ', 'ϻ'), ('Ͻ', 'ͻ'), ('Ͼ', 'ͼ'), ('Ͽ', 'ͽ'), ('Ѐ', 'ѐ'), + ('Ё', 'ё'), ('Ђ', 'ђ'), ('Ѓ', 'ѓ'), ('Є', 'є'), ('Ѕ', 'ѕ'), + ('І', 'і'), ('Ї', 'ї'), ('Ј', 'ј'), ('Љ', 'љ'), ('Њ', 'њ'), + ('Ћ', 'ћ'), ('Ќ', 'ќ'), ('Ѝ', 'ѝ'), ('Ў', 'ў'), ('Џ', 'џ'), + ('А', 'а'), ('Б', 'б'), ('В', 'в'), ('Г', 'г'), ('Д', 'д'), + ('Е', 'е'), ('Ж', 'ж'), ('З', 'з'), ('И', 'и'), ('Й', 'й'), + ('К', 'к'), ('Л', 'л'), ('М', 'м'), ('Н', 'н'), ('О', 'о'), + ('П', 'п'), ('Р', 'р'), ('С', 'с'), ('Т', 'т'), ('У', 'у'), + ('Ф', 'ф'), ('Х', 'х'), ('Ц', 'ц'), ('Ч', 'ч'), ('Ш', 'ш'), + ('Щ', 'щ'), ('Ъ', 'ъ'), ('Ы', 'ы'), ('Ь', 'ь'), ('Э', 'э'), + ('Ю', 'ю'), ('Я', 'я'), ('Ѡ', 'ѡ'), ('Ѣ', 'ѣ'), ('Ѥ', 'ѥ'), + ('Ѧ', 'ѧ'), ('Ѩ', 'ѩ'), ('Ѫ', 'ѫ'), ('Ѭ', 'ѭ'), ('Ѯ', 'ѯ'), + ('Ѱ', 'ѱ'), ('Ѳ', 'ѳ'), ('Ѵ', 'ѵ'), ('Ѷ', 'ѷ'), ('Ѹ', 'ѹ'), + ('Ѻ', 'ѻ'), ('Ѽ', 'ѽ'), ('Ѿ', 'ѿ'), ('Ҁ', 'ҁ'), ('Ҋ', 'ҋ'), + ('Ҍ', 'ҍ'), ('Ҏ', 'ҏ'), ('Ґ', 'ґ'), ('Ғ', 'ғ'), ('Ҕ', 'ҕ'), + ('Җ', 'җ'), ('Ҙ', 'ҙ'), ('Қ', 'қ'), ('Ҝ', 'ҝ'), ('Ҟ', 'ҟ'), + ('Ҡ', 'ҡ'), ('Ң', 'ң'), ('Ҥ', 'ҥ'), ('Ҧ', 'ҧ'), ('Ҩ', 'ҩ'), + ('Ҫ', 'ҫ'), ('Ҭ', 'ҭ'), ('Ү', 'ү'), ('Ұ', 'ұ'), ('Ҳ', 'ҳ'), + ('Ҵ', 'ҵ'), ('Ҷ', 'ҷ'), ('Ҹ', 'ҹ'), ('Һ', 'һ'), ('Ҽ', 'ҽ'), + ('Ҿ', 'ҿ'), ('Ӏ', 'ӏ'), ('Ӂ', 'ӂ'), ('Ӄ', 'ӄ'), ('Ӆ', 'ӆ'), + ('Ӈ', 'ӈ'), ('Ӊ', 'ӊ'), ('Ӌ', 'ӌ'), ('Ӎ', 'ӎ'), ('Ӑ', 'ӑ'), + ('Ӓ', 'ӓ'), ('Ӕ', 'ӕ'), ('Ӗ', 'ӗ'), ('Ә', 'ә'), ('Ӛ', 'ӛ'), + ('Ӝ', 'ӝ'), ('Ӟ', 'ӟ'), ('Ӡ', 'ӡ'), ('Ӣ', 'ӣ'), ('Ӥ', 'ӥ'), + ('Ӧ', 'ӧ'), ('Ө', 'ө'), ('Ӫ', 'ӫ'), ('Ӭ', 'ӭ'), ('Ӯ', 'ӯ'), + ('Ӱ', 'ӱ'), ('Ӳ', 'ӳ'), ('Ӵ', 'ӵ'), ('Ӷ', 'ӷ'), ('Ӹ', 'ӹ'), + ('Ӻ', 'ӻ'), ('Ӽ', 'ӽ'), ('Ӿ', 'ӿ'), ('Ԁ', 'ԁ'), ('Ԃ', 'ԃ'), + ('Ԅ', 'ԅ'), ('Ԇ', 'ԇ'), ('Ԉ', 'ԉ'), ('Ԋ', 'ԋ'), ('Ԍ', 'ԍ'), + ('Ԏ', 'ԏ'), ('Ԑ', 'ԑ'), ('Ԓ', 'ԓ'), ('Ԕ', 'ԕ'), ('Ԗ', 'ԗ'), + ('Ԙ', 'ԙ'), ('Ԛ', 'ԛ'), ('Ԝ', 'ԝ'), ('Ԟ', 'ԟ'), ('Ԡ', 'ԡ'), + ('Ԣ', 'ԣ'), ('Ԥ', 'ԥ'), ('Ԧ', 'ԧ'), ('Ԩ', 'ԩ'), ('Ԫ', 'ԫ'), + ('Ԭ', 'ԭ'), ('Ԯ', 'ԯ'), ('Ա', 'ա'), ('Բ', 'բ'), ('Գ', 'գ'), + ('Դ', 'դ'), ('Ե', 'ե'), ('Զ', 'զ'), ('Է', 'է'), ('Ը', 'ը'), + ('Թ', 'թ'), ('Ժ', 'ժ'), ('Ի', 'ի'), ('Լ', 'լ'), ('Խ', 'խ'), + ('Ծ', 'ծ'), ('Կ', 'կ'), ('Հ', 'հ'), ('Ձ', 'ձ'), ('Ղ', 'ղ'), + ('Ճ', 'ճ'), ('Մ', 'մ'), ('Յ', 'յ'), ('Ն', 'ն'), ('Շ', 'շ'), + ('Ո', 'ո'), ('Չ', 'չ'), ('Պ', 'պ'), ('Ջ', 'ջ'), ('Ռ', 'ռ'), + ('Ս', 'ս'), ('Վ', 'վ'), ('Տ', 'տ'), ('Ր', 'ր'), ('Ց', 'ց'), + ('Ւ', 'ւ'), ('Փ', 'փ'), ('Ք', 'ք'), ('Օ', 'օ'), ('Ֆ', 'ֆ'), + ('Ⴀ', 'ⴀ'), ('Ⴁ', 'ⴁ'), ('Ⴂ', 'ⴂ'), ('Ⴃ', 'ⴃ'), + ('Ⴄ', 'ⴄ'), ('Ⴅ', 'ⴅ'), ('Ⴆ', 'ⴆ'), ('Ⴇ', 'ⴇ'), + ('Ⴈ', 'ⴈ'), ('Ⴉ', 'ⴉ'), ('Ⴊ', 'ⴊ'), ('Ⴋ', 'ⴋ'), + ('Ⴌ', 'ⴌ'), ('Ⴍ', 'ⴍ'), ('Ⴎ', 'ⴎ'), ('Ⴏ', 'ⴏ'), + ('Ⴐ', 'ⴐ'), ('Ⴑ', 'ⴑ'), ('Ⴒ', 'ⴒ'), ('Ⴓ', 'ⴓ'), + ('Ⴔ', 'ⴔ'), ('Ⴕ', 'ⴕ'), ('Ⴖ', 'ⴖ'), ('Ⴗ', 'ⴗ'), + ('Ⴘ', 'ⴘ'), ('Ⴙ', 'ⴙ'), ('Ⴚ', 'ⴚ'), ('Ⴛ', 'ⴛ'), + ('Ⴜ', 'ⴜ'), ('Ⴝ', 'ⴝ'), ('Ⴞ', 'ⴞ'), ('Ⴟ', 'ⴟ'), + ('Ⴠ', 'ⴠ'), ('Ⴡ', 'ⴡ'), ('Ⴢ', 'ⴢ'), ('Ⴣ', 'ⴣ'), + ('Ⴤ', 'ⴤ'), ('Ⴥ', 'ⴥ'), ('Ⴧ', 'ⴧ'), ('Ⴭ', 'ⴭ'), + ('ᏸ', 'Ᏸ'), ('ᏹ', 'Ᏹ'), ('ᏺ', 'Ᏺ'), ('ᏻ', 'Ᏻ'), + ('ᏼ', 'Ᏼ'), ('ᏽ', 'Ᏽ'), ('ᲀ', 'в'), ('ᲁ', 'д'), ('ᲂ', 'о'), + ('ᲃ', 'с'), ('ᲄ', 'т'), ('ᲅ', 'т'), ('ᲆ', 'ъ'), ('ᲇ', 'ѣ'), + ('ᲈ', 'ꙋ'), ('Ა', 'ა'), ('Ბ', 'ბ'), ('Გ', 'გ'), + ('Დ', 'დ'), ('Ე', 'ე'), ('Ვ', 'ვ'), ('Ზ', 'ზ'), + ('Თ', 'თ'), ('Ი', 'ი'), ('Კ', 'კ'), ('Ლ', 'ლ'), + ('Მ', 'მ'), ('Ნ', 'ნ'), ('Ო', 'ო'), ('Პ', 'პ'), + ('Ჟ', 'ჟ'), ('Რ', 'რ'), ('Ს', 'ს'), ('Ტ', 'ტ'), + ('Უ', 'უ'), ('Ფ', 'ფ'), ('Ქ', 'ქ'), ('Ღ', 'ღ'), + ('Ყ', 'ყ'), ('Შ', 'შ'), ('Ჩ', 'ჩ'), ('Ც', 'ც'), + ('Ძ', 'ძ'), ('Წ', 'წ'), ('Ჭ', 'ჭ'), ('Ხ', 'ხ'), + ('Ჯ', 'ჯ'), ('Ჰ', 'ჰ'), ('Ჱ', 'ჱ'), ('Ჲ', 'ჲ'), + ('Ჳ', 'ჳ'), ('Ჴ', 'ჴ'), ('Ჵ', 'ჵ'), ('Ჶ', 'ჶ'), + ('Ჷ', 'ჷ'), ('Ჸ', 'ჸ'), ('Ჹ', 'ჹ'), ('Ჺ', 'ჺ'), + ('Ჽ', 'ჽ'), ('Ჾ', 'ჾ'), ('Ჿ', 'ჿ'), ('Ḁ', 'ḁ'), + ('Ḃ', 'ḃ'), ('Ḅ', 'ḅ'), ('Ḇ', 'ḇ'), ('Ḉ', 'ḉ'), + ('Ḋ', 'ḋ'), ('Ḍ', 'ḍ'), ('Ḏ', 'ḏ'), ('Ḑ', 'ḑ'), + ('Ḓ', 'ḓ'), ('Ḕ', 'ḕ'), ('Ḗ', 'ḗ'), ('Ḙ', 'ḙ'), + ('Ḛ', 'ḛ'), ('Ḝ', 'ḝ'), ('Ḟ', 'ḟ'), ('Ḡ', 'ḡ'), + ('Ḣ', 'ḣ'), ('Ḥ', 'ḥ'), ('Ḧ', 'ḧ'), ('Ḩ', 'ḩ'), + ('Ḫ', 'ḫ'), ('Ḭ', 'ḭ'), ('Ḯ', 'ḯ'), ('Ḱ', 'ḱ'), + ('Ḳ', 'ḳ'), ('Ḵ', 'ḵ'), ('Ḷ', 'ḷ'), ('Ḹ', 'ḹ'), + ('Ḻ', 'ḻ'), ('Ḽ', 'ḽ'), ('Ḿ', 'ḿ'), ('Ṁ', 'ṁ'), + ('Ṃ', 'ṃ'), ('Ṅ', 'ṅ'), ('Ṇ', 'ṇ'), ('Ṉ', 'ṉ'), + ('Ṋ', 'ṋ'), ('Ṍ', 'ṍ'), ('Ṏ', 'ṏ'), ('Ṑ', 'ṑ'), + ('Ṓ', 'ṓ'), ('Ṕ', 'ṕ'), ('Ṗ', 'ṗ'), ('Ṙ', 'ṙ'), + ('Ṛ', 'ṛ'), ('Ṝ', 'ṝ'), ('Ṟ', 'ṟ'), ('Ṡ', 'ṡ'), + ('Ṣ', 'ṣ'), ('Ṥ', 'ṥ'), ('Ṧ', 'ṧ'), ('Ṩ', 'ṩ'), + ('Ṫ', 'ṫ'), ('Ṭ', 'ṭ'), ('Ṯ', 'ṯ'), ('Ṱ', 'ṱ'), + ('Ṳ', 'ṳ'), ('Ṵ', 'ṵ'), ('Ṷ', 'ṷ'), ('Ṹ', 'ṹ'), + ('Ṻ', 'ṻ'), ('Ṽ', 'ṽ'), ('Ṿ', 'ṿ'), ('Ẁ', 'ẁ'), + ('Ẃ', 'ẃ'), ('Ẅ', 'ẅ'), ('Ẇ', 'ẇ'), ('Ẉ', 'ẉ'), + ('Ẋ', 'ẋ'), ('Ẍ', 'ẍ'), ('Ẏ', 'ẏ'), ('Ẑ', 'ẑ'), + ('Ẓ', 'ẓ'), ('Ẕ', 'ẕ'), ('ẛ', 'ṡ'), ('ẞ', 'ß'), + ('Ạ', 'ạ'), ('Ả', 'ả'), ('Ấ', 'ấ'), ('Ầ', 'ầ'), + ('Ẩ', 'ẩ'), ('Ẫ', 'ẫ'), ('Ậ', 'ậ'), ('Ắ', 'ắ'), + ('Ằ', 'ằ'), ('Ẳ', 'ẳ'), ('Ẵ', 'ẵ'), ('Ặ', 'ặ'), + ('Ẹ', 'ẹ'), ('Ẻ', 'ẻ'), ('Ẽ', 'ẽ'), ('Ế', 'ế'), + ('Ề', 'ề'), ('Ể', 'ể'), ('Ễ', 'ễ'), ('Ệ', 'ệ'), + ('Ỉ', 'ỉ'), ('Ị', 'ị'), ('Ọ', 'ọ'), ('Ỏ', 'ỏ'), + ('Ố', 'ố'), ('Ồ', 'ồ'), ('Ổ', 'ổ'), ('Ỗ', 'ỗ'), + ('Ộ', 'ộ'), ('Ớ', 'ớ'), ('Ờ', 'ờ'), ('Ở', 'ở'), + ('Ỡ', 'ỡ'), ('Ợ', 'ợ'), ('Ụ', 'ụ'), ('Ủ', 'ủ'), + ('Ứ', 'ứ'), ('Ừ', 'ừ'), ('Ử', 'ử'), ('Ữ', 'ữ'), + ('Ự', 'ự'), ('Ỳ', 'ỳ'), ('Ỵ', 'ỵ'), ('Ỷ', 'ỷ'), + ('Ỹ', 'ỹ'), ('Ỻ', 'ỻ'), ('Ỽ', 'ỽ'), ('Ỿ', 'ỿ'), + ('Ἀ', 'ἀ'), ('Ἁ', 'ἁ'), ('Ἂ', 'ἂ'), ('Ἃ', 'ἃ'), + ('Ἄ', 'ἄ'), ('Ἅ', 'ἅ'), ('Ἆ', 'ἆ'), ('Ἇ', 'ἇ'), + ('Ἐ', 'ἐ'), ('Ἑ', 'ἑ'), ('Ἒ', 'ἒ'), ('Ἓ', 'ἓ'), + ('Ἔ', 'ἔ'), ('Ἕ', 'ἕ'), ('Ἠ', 'ἠ'), ('Ἡ', 'ἡ'), + ('Ἢ', 'ἢ'), ('Ἣ', 'ἣ'), ('Ἤ', 'ἤ'), ('Ἥ', 'ἥ'), + ('Ἦ', 'ἦ'), ('Ἧ', 'ἧ'), ('Ἰ', 'ἰ'), ('Ἱ', 'ἱ'), + ('Ἲ', 'ἲ'), ('Ἳ', 'ἳ'), ('Ἴ', 'ἴ'), ('Ἵ', 'ἵ'), + ('Ἶ', 'ἶ'), ('Ἷ', 'ἷ'), ('Ὀ', 'ὀ'), ('Ὁ', 'ὁ'), + ('Ὂ', 'ὂ'), ('Ὃ', 'ὃ'), ('Ὄ', 'ὄ'), ('Ὅ', 'ὅ'), + ('Ὑ', 'ὑ'), ('Ὓ', 'ὓ'), ('Ὕ', 'ὕ'), ('Ὗ', 'ὗ'), + ('Ὠ', 'ὠ'), ('Ὡ', 'ὡ'), ('Ὢ', 'ὢ'), ('Ὣ', 'ὣ'), + ('Ὤ', 'ὤ'), ('Ὥ', 'ὥ'), ('Ὦ', 'ὦ'), ('Ὧ', 'ὧ'), + ('ᾈ', 'ᾀ'), ('ᾉ', 'ᾁ'), ('ᾊ', 'ᾂ'), ('ᾋ', 'ᾃ'), + ('ᾌ', 'ᾄ'), ('ᾍ', 'ᾅ'), ('ᾎ', 'ᾆ'), ('ᾏ', 'ᾇ'), + ('ᾘ', 'ᾐ'), ('ᾙ', 'ᾑ'), ('ᾚ', 'ᾒ'), ('ᾛ', 'ᾓ'), + ('ᾜ', 'ᾔ'), ('ᾝ', 'ᾕ'), ('ᾞ', 'ᾖ'), ('ᾟ', 'ᾗ'), + ('ᾨ', 'ᾠ'), ('ᾩ', 'ᾡ'), ('ᾪ', 'ᾢ'), ('ᾫ', 'ᾣ'), + ('ᾬ', 'ᾤ'), ('ᾭ', 'ᾥ'), ('ᾮ', 'ᾦ'), ('ᾯ', 'ᾧ'), + ('Ᾰ', 'ᾰ'), ('Ᾱ', 'ᾱ'), ('Ὰ', 'ὰ'), ('Ά', 'ά'), + ('ᾼ', 'ᾳ'), ('ι', 'ι'), ('Ὲ', 'ὲ'), ('Έ', 'έ'), + ('Ὴ', 'ὴ'), ('Ή', 'ή'), ('ῌ', 'ῃ'), ('Ῐ', 'ῐ'), + ('Ῑ', 'ῑ'), ('Ὶ', 'ὶ'), ('Ί', 'ί'), ('Ῠ', 'ῠ'), + ('Ῡ', 'ῡ'), ('Ὺ', 'ὺ'), ('Ύ', 'ύ'), ('Ῥ', 'ῥ'), + ('Ὸ', 'ὸ'), ('Ό', 'ό'), ('Ὼ', 'ὼ'), ('Ώ', 'ώ'), + ('ῼ', 'ῳ'), ('Ω', 'ω'), ('K', 'k'), ('Å', 'å'), ('Ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅰ'), ('Ⅱ', 'ⅱ'), ('Ⅲ', 'ⅲ'), ('Ⅳ', 'ⅳ'), + ('Ⅴ', 'ⅴ'), ('Ⅵ', 'ⅵ'), ('Ⅶ', 'ⅶ'), ('Ⅷ', 'ⅷ'), + ('Ⅸ', 'ⅸ'), ('Ⅹ', 'ⅹ'), ('Ⅺ', 'ⅺ'), ('Ⅻ', 'ⅻ'), + ('Ⅼ', 'ⅼ'), ('Ⅽ', 'ⅽ'), ('Ⅾ', 'ⅾ'), ('Ⅿ', 'ⅿ'), + ('Ↄ', 'ↄ'), ('Ⓐ', 'ⓐ'), ('Ⓑ', 'ⓑ'), ('Ⓒ', 'ⓒ'), + ('Ⓓ', 'ⓓ'), ('Ⓔ', 'ⓔ'), ('Ⓕ', 'ⓕ'), ('Ⓖ', 'ⓖ'), + ('Ⓗ', 'ⓗ'), ('Ⓘ', 'ⓘ'), ('Ⓙ', 'ⓙ'), ('Ⓚ', 'ⓚ'), + ('Ⓛ', 'ⓛ'), ('Ⓜ', 'ⓜ'), ('Ⓝ', 'ⓝ'), ('Ⓞ', 'ⓞ'), + ('Ⓟ', 'ⓟ'), ('Ⓠ', 'ⓠ'), ('Ⓡ', 'ⓡ'), ('Ⓢ', 'ⓢ'), + ('Ⓣ', 'ⓣ'), ('Ⓤ', 'ⓤ'), ('Ⓥ', 'ⓥ'), ('Ⓦ', 'ⓦ'), + ('Ⓧ', 'ⓧ'), ('Ⓨ', 'ⓨ'), ('Ⓩ', 'ⓩ'), ('Ⰰ', 'ⰰ'), + ('Ⰱ', 'ⰱ'), ('Ⰲ', 'ⰲ'), ('Ⰳ', 'ⰳ'), ('Ⰴ', 'ⰴ'), + ('Ⰵ', 'ⰵ'), ('Ⰶ', 'ⰶ'), ('Ⰷ', 'ⰷ'), ('Ⰸ', 'ⰸ'), + ('Ⰹ', 'ⰹ'), ('Ⰺ', 'ⰺ'), ('Ⰻ', 'ⰻ'), ('Ⰼ', 'ⰼ'), + ('Ⰽ', 'ⰽ'), ('Ⰾ', 'ⰾ'), ('Ⰿ', 'ⰿ'), ('Ⱀ', 'ⱀ'), + ('Ⱁ', 'ⱁ'), ('Ⱂ', 'ⱂ'), ('Ⱃ', 'ⱃ'), ('Ⱄ', 'ⱄ'), + ('Ⱅ', 'ⱅ'), ('Ⱆ', 'ⱆ'), ('Ⱇ', 'ⱇ'), ('Ⱈ', 'ⱈ'), + ('Ⱉ', 'ⱉ'), ('Ⱊ', 'ⱊ'), ('Ⱋ', 'ⱋ'), ('Ⱌ', 'ⱌ'), + ('Ⱍ', 'ⱍ'), ('Ⱎ', 'ⱎ'), ('Ⱏ', 'ⱏ'), ('Ⱐ', 'ⱐ'), + ('Ⱑ', 'ⱑ'), ('Ⱒ', 'ⱒ'), ('Ⱓ', 'ⱓ'), ('Ⱔ', 'ⱔ'), + ('Ⱕ', 'ⱕ'), ('Ⱖ', 'ⱖ'), ('Ⱗ', 'ⱗ'), ('Ⱘ', 'ⱘ'), + ('Ⱙ', 'ⱙ'), ('Ⱚ', 'ⱚ'), ('Ⱛ', 'ⱛ'), ('Ⱜ', 'ⱜ'), + ('Ⱝ', 'ⱝ'), ('Ⱞ', 'ⱞ'), ('Ⱟ', 'ⱟ'), ('Ⱡ', 'ⱡ'), + ('Ɫ', 'ɫ'), ('Ᵽ', 'ᵽ'), ('Ɽ', 'ɽ'), ('Ⱨ', 'ⱨ'), + ('Ⱪ', 'ⱪ'), ('Ⱬ', 'ⱬ'), ('Ɑ', 'ɑ'), ('Ɱ', 'ɱ'), ('Ɐ', 'ɐ'), + ('Ɒ', 'ɒ'), ('Ⱳ', 'ⱳ'), ('Ⱶ', 'ⱶ'), ('Ȿ', 'ȿ'), ('Ɀ', 'ɀ'), + ('Ⲁ', 'ⲁ'), ('Ⲃ', 'ⲃ'), ('Ⲅ', 'ⲅ'), ('Ⲇ', 'ⲇ'), + ('Ⲉ', 'ⲉ'), ('Ⲋ', 'ⲋ'), ('Ⲍ', 'ⲍ'), ('Ⲏ', 'ⲏ'), + ('Ⲑ', 'ⲑ'), ('Ⲓ', 'ⲓ'), ('Ⲕ', 'ⲕ'), ('Ⲗ', 'ⲗ'), + ('Ⲙ', 'ⲙ'), ('Ⲛ', 'ⲛ'), ('Ⲝ', 'ⲝ'), ('Ⲟ', 'ⲟ'), + ('Ⲡ', 'ⲡ'), ('Ⲣ', 'ⲣ'), ('Ⲥ', 'ⲥ'), ('Ⲧ', 'ⲧ'), + ('Ⲩ', 'ⲩ'), ('Ⲫ', 'ⲫ'), ('Ⲭ', 'ⲭ'), ('Ⲯ', 'ⲯ'), + ('Ⲱ', 'ⲱ'), ('Ⲳ', 'ⲳ'), ('Ⲵ', 'ⲵ'), ('Ⲷ', 'ⲷ'), + ('Ⲹ', 'ⲹ'), ('Ⲻ', 'ⲻ'), ('Ⲽ', 'ⲽ'), ('Ⲿ', 'ⲿ'), + ('Ⳁ', 'ⳁ'), ('Ⳃ', 'ⳃ'), ('Ⳅ', 'ⳅ'), ('Ⳇ', 'ⳇ'), + ('Ⳉ', 'ⳉ'), ('Ⳋ', 'ⳋ'), ('Ⳍ', 'ⳍ'), ('Ⳏ', 'ⳏ'), + ('Ⳑ', 'ⳑ'), ('Ⳓ', 'ⳓ'), ('Ⳕ', 'ⳕ'), ('Ⳗ', 'ⳗ'), + ('Ⳙ', 'ⳙ'), ('Ⳛ', 'ⳛ'), ('Ⳝ', 'ⳝ'), ('Ⳟ', 'ⳟ'), + ('Ⳡ', 'ⳡ'), ('Ⳣ', 'ⳣ'), ('Ⳬ', 'ⳬ'), ('Ⳮ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('Ꙁ', 'ꙁ'), ('Ꙃ', 'ꙃ'), ('Ꙅ', 'ꙅ'), + ('Ꙇ', 'ꙇ'), ('Ꙉ', 'ꙉ'), ('Ꙋ', 'ꙋ'), ('Ꙍ', 'ꙍ'), + ('Ꙏ', 'ꙏ'), ('Ꙑ', 'ꙑ'), ('Ꙓ', 'ꙓ'), ('Ꙕ', 'ꙕ'), + ('Ꙗ', 'ꙗ'), ('Ꙙ', 'ꙙ'), ('Ꙛ', 'ꙛ'), ('Ꙝ', 'ꙝ'), + ('Ꙟ', 'ꙟ'), ('Ꙡ', 'ꙡ'), ('Ꙣ', 'ꙣ'), ('Ꙥ', 'ꙥ'), + ('Ꙧ', 'ꙧ'), ('Ꙩ', 'ꙩ'), ('Ꙫ', 'ꙫ'), ('Ꙭ', 'ꙭ'), + ('Ꚁ', 'ꚁ'), ('Ꚃ', 'ꚃ'), ('Ꚅ', 'ꚅ'), ('Ꚇ', 'ꚇ'), + ('Ꚉ', 'ꚉ'), ('Ꚋ', 'ꚋ'), ('Ꚍ', 'ꚍ'), ('Ꚏ', 'ꚏ'), + ('Ꚑ', 'ꚑ'), ('Ꚓ', 'ꚓ'), ('Ꚕ', 'ꚕ'), ('Ꚗ', 'ꚗ'), + ('Ꚙ', 'ꚙ'), ('Ꚛ', 'ꚛ'), ('Ꜣ', 'ꜣ'), ('Ꜥ', 'ꜥ'), + ('Ꜧ', 'ꜧ'), ('Ꜩ', 'ꜩ'), ('Ꜫ', 'ꜫ'), ('Ꜭ', 'ꜭ'), + ('Ꜯ', 'ꜯ'), ('Ꜳ', 'ꜳ'), ('Ꜵ', 'ꜵ'), ('Ꜷ', 'ꜷ'), + ('Ꜹ', 'ꜹ'), ('Ꜻ', 'ꜻ'), ('Ꜽ', 'ꜽ'), ('Ꜿ', 'ꜿ'), + ('Ꝁ', 'ꝁ'), ('Ꝃ', 'ꝃ'), ('Ꝅ', 'ꝅ'), ('Ꝇ', 'ꝇ'), + ('Ꝉ', 'ꝉ'), ('Ꝋ', 'ꝋ'), ('Ꝍ', 'ꝍ'), ('Ꝏ', 'ꝏ'), + ('Ꝑ', 'ꝑ'), ('Ꝓ', 'ꝓ'), ('Ꝕ', 'ꝕ'), ('Ꝗ', 'ꝗ'), + ('Ꝙ', 'ꝙ'), ('Ꝛ', 'ꝛ'), ('Ꝝ', 'ꝝ'), ('Ꝟ', 'ꝟ'), + ('Ꝡ', 'ꝡ'), ('Ꝣ', 'ꝣ'), ('Ꝥ', 'ꝥ'), ('Ꝧ', 'ꝧ'), + ('Ꝩ', 'ꝩ'), ('Ꝫ', 'ꝫ'), ('Ꝭ', 'ꝭ'), ('Ꝯ', 'ꝯ'), + ('Ꝺ', 'ꝺ'), ('Ꝼ', 'ꝼ'), ('Ᵹ', 'ᵹ'), ('Ꝿ', 'ꝿ'), + ('Ꞁ', 'ꞁ'), ('Ꞃ', 'ꞃ'), ('Ꞅ', 'ꞅ'), ('Ꞇ', 'ꞇ'), + ('Ꞌ', 'ꞌ'), ('Ɥ', 'ɥ'), ('Ꞑ', 'ꞑ'), ('Ꞓ', 'ꞓ'), + ('Ꞗ', 'ꞗ'), ('Ꞙ', 'ꞙ'), ('Ꞛ', 'ꞛ'), ('Ꞝ', 'ꞝ'), + ('Ꞟ', 'ꞟ'), ('Ꞡ', 'ꞡ'), ('Ꞣ', 'ꞣ'), ('Ꞥ', 'ꞥ'), + ('Ꞧ', 'ꞧ'), ('Ꞩ', 'ꞩ'), ('Ɦ', 'ɦ'), ('Ɜ', 'ɜ'), ('Ɡ', 'ɡ'), + ('Ɬ', 'ɬ'), ('Ɪ', 'ɪ'), ('Ʞ', 'ʞ'), ('Ʇ', 'ʇ'), ('Ʝ', 'ʝ'), + ('Ꭓ', 'ꭓ'), ('Ꞵ', 'ꞵ'), ('Ꞷ', 'ꞷ'), ('Ꞹ', 'ꞹ'), + ('Ꞻ', 'ꞻ'), ('Ꞽ', 'ꞽ'), ('Ꞿ', 'ꞿ'), ('Ꟁ', 'ꟁ'), + ('Ꟃ', 'ꟃ'), ('Ꞔ', 'ꞔ'), ('Ʂ', 'ʂ'), ('Ᶎ', 'ᶎ'), + ('Ꟈ', 'ꟈ'), ('Ꟊ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('Ꟗ', 'ꟗ'), + ('Ꟙ', 'ꟙ'), ('Ꟶ', 'ꟶ'), ('ꭰ', 'Ꭰ'), ('ꭱ', 'Ꭱ'), + ('ꭲ', 'Ꭲ'), ('ꭳ', 'Ꭳ'), ('ꭴ', 'Ꭴ'), ('ꭵ', 'Ꭵ'), + ('ꭶ', 'Ꭶ'), ('ꭷ', 'Ꭷ'), ('ꭸ', 'Ꭸ'), ('ꭹ', 'Ꭹ'), + ('ꭺ', 'Ꭺ'), ('ꭻ', 'Ꭻ'), ('ꭼ', 'Ꭼ'), ('ꭽ', 'Ꭽ'), + ('ꭾ', 'Ꭾ'), ('ꭿ', 'Ꭿ'), ('ꮀ', 'Ꮀ'), ('ꮁ', 'Ꮁ'), + ('ꮂ', 'Ꮂ'), ('ꮃ', 'Ꮃ'), ('ꮄ', 'Ꮄ'), ('ꮅ', 'Ꮅ'), + ('ꮆ', 'Ꮆ'), ('ꮇ', 'Ꮇ'), ('ꮈ', 'Ꮈ'), ('ꮉ', 'Ꮉ'), + ('ꮊ', 'Ꮊ'), ('ꮋ', 'Ꮋ'), ('ꮌ', 'Ꮌ'), ('ꮍ', 'Ꮍ'), + ('ꮎ', 'Ꮎ'), ('ꮏ', 'Ꮏ'), ('ꮐ', 'Ꮐ'), ('ꮑ', 'Ꮑ'), + ('ꮒ', 'Ꮒ'), ('ꮓ', 'Ꮓ'), ('ꮔ', 'Ꮔ'), ('ꮕ', 'Ꮕ'), + ('ꮖ', 'Ꮖ'), ('ꮗ', 'Ꮗ'), ('ꮘ', 'Ꮘ'), ('ꮙ', 'Ꮙ'), + ('ꮚ', 'Ꮚ'), ('ꮛ', 'Ꮛ'), ('ꮜ', 'Ꮜ'), ('ꮝ', 'Ꮝ'), + ('ꮞ', 'Ꮞ'), ('ꮟ', 'Ꮟ'), ('ꮠ', 'Ꮠ'), ('ꮡ', 'Ꮡ'), + ('ꮢ', 'Ꮢ'), ('ꮣ', 'Ꮣ'), ('ꮤ', 'Ꮤ'), ('ꮥ', 'Ꮥ'), + ('ꮦ', 'Ꮦ'), ('ꮧ', 'Ꮧ'), ('ꮨ', 'Ꮨ'), ('ꮩ', 'Ꮩ'), + ('ꮪ', 'Ꮪ'), ('ꮫ', 'Ꮫ'), ('ꮬ', 'Ꮬ'), ('ꮭ', 'Ꮭ'), + ('ꮮ', 'Ꮮ'), ('ꮯ', 'Ꮯ'), ('ꮰ', 'Ꮰ'), ('ꮱ', 'Ꮱ'), + ('ꮲ', 'Ꮲ'), ('ꮳ', 'Ꮳ'), ('ꮴ', 'Ꮴ'), ('ꮵ', 'Ꮵ'), + ('ꮶ', 'Ꮶ'), ('ꮷ', 'Ꮷ'), ('ꮸ', 'Ꮸ'), ('ꮹ', 'Ꮹ'), + ('ꮺ', 'Ꮺ'), ('ꮻ', 'Ꮻ'), ('ꮼ', 'Ꮼ'), ('ꮽ', 'Ꮽ'), + ('ꮾ', 'Ꮾ'), ('ꮿ', 'Ꮿ'), ('A', 'a'), ('B', 'b'), + ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), + ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), + ('K', 'k'), ('L', 'l'), ('M', 'm'), ('N', 'n'), + ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), + ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), + ('W', 'w'), ('X', 'x'), ('Y', 'y'), ('Z', 'z'), + ('𐐀', '𐐨'), ('𐐁', '𐐩'), ('𐐂', '𐐪'), ('𐐃', '𐐫'), + ('𐐄', '𐐬'), ('𐐅', '𐐭'), ('𐐆', '𐐮'), ('𐐇', '𐐯'), + ('𐐈', '𐐰'), ('𐐉', '𐐱'), ('𐐊', '𐐲'), ('𐐋', '𐐳'), + ('𐐌', '𐐴'), ('𐐍', '𐐵'), ('𐐎', '𐐶'), ('𐐏', '𐐷'), + ('𐐐', '𐐸'), ('𐐑', '𐐹'), ('𐐒', '𐐺'), ('𐐓', '𐐻'), + ('𐐔', '𐐼'), ('𐐕', '𐐽'), ('𐐖', '𐐾'), ('𐐗', '𐐿'), + ('𐐘', '𐑀'), ('𐐙', '𐑁'), ('𐐚', '𐑂'), ('𐐛', '𐑃'), + ('𐐜', '𐑄'), ('𐐝', '𐑅'), ('𐐞', '𐑆'), ('𐐟', '𐑇'), + ('𐐠', '𐑈'), ('𐐡', '𐑉'), ('𐐢', '𐑊'), ('𐐣', '𐑋'), + ('𐐤', '𐑌'), ('𐐥', '𐑍'), ('𐐦', '𐑎'), ('𐐧', '𐑏'), + ('𐒰', '𐓘'), ('𐒱', '𐓙'), ('𐒲', '𐓚'), ('𐒳', '𐓛'), + ('𐒴', '𐓜'), ('𐒵', '𐓝'), ('𐒶', '𐓞'), ('𐒷', '𐓟'), + ('𐒸', '𐓠'), ('𐒹', '𐓡'), ('𐒺', '𐓢'), ('𐒻', '𐓣'), + ('𐒼', '𐓤'), ('𐒽', '𐓥'), ('𐒾', '𐓦'), ('𐒿', '𐓧'), + ('𐓀', '𐓨'), ('𐓁', '𐓩'), ('𐓂', '𐓪'), ('𐓃', '𐓫'), + ('𐓄', '𐓬'), ('𐓅', '𐓭'), ('𐓆', '𐓮'), ('𐓇', '𐓯'), + ('𐓈', '𐓰'), ('𐓉', '𐓱'), ('𐓊', '𐓲'), ('𐓋', '𐓳'), + ('𐓌', '𐓴'), ('𐓍', '𐓵'), ('𐓎', '𐓶'), ('𐓏', '𐓷'), + ('𐓐', '𐓸'), ('𐓑', '𐓹'), ('𐓒', '𐓺'), ('𐓓', '𐓻'), + ('𐕰', '𐖗'), ('𐕱', '𐖘'), ('𐕲', '𐖙'), ('𐕳', '𐖚'), + ('𐕴', '𐖛'), ('𐕵', '𐖜'), ('𐕶', '𐖝'), ('𐕷', '𐖞'), + ('𐕸', '𐖟'), ('𐕹', '𐖠'), ('𐕺', '𐖡'), ('𐕼', '𐖣'), + ('𐕽', '𐖤'), ('𐕾', '𐖥'), ('𐕿', '𐖦'), ('𐖀', '𐖧'), + ('𐖁', '𐖨'), ('𐖂', '𐖩'), ('𐖃', '𐖪'), ('𐖄', '𐖫'), + ('𐖅', '𐖬'), ('𐖆', '𐖭'), ('𐖇', '𐖮'), ('𐖈', '𐖯'), + ('𐖉', '𐖰'), ('𐖊', '𐖱'), ('𐖌', '𐖳'), ('𐖍', '𐖴'), + ('𐖎', '𐖵'), ('𐖏', '𐖶'), ('𐖐', '𐖷'), ('𐖑', '𐖸'), + ('𐖒', '𐖹'), ('𐖔', '𐖻'), ('𐖕', '𐖼'), ('𐲀', '𐳀'), + ('𐲁', '𐳁'), ('𐲂', '𐳂'), ('𐲃', '𐳃'), ('𐲄', '𐳄'), + ('𐲅', '𐳅'), ('𐲆', '𐳆'), ('𐲇', '𐳇'), ('𐲈', '𐳈'), + ('𐲉', '𐳉'), ('𐲊', '𐳊'), ('𐲋', '𐳋'), ('𐲌', '𐳌'), + ('𐲍', '𐳍'), ('𐲎', '𐳎'), ('𐲏', '𐳏'), ('𐲐', '𐳐'), + ('𐲑', '𐳑'), ('𐲒', '𐳒'), ('𐲓', '𐳓'), ('𐲔', '𐳔'), + ('𐲕', '𐳕'), ('𐲖', '𐳖'), ('𐲗', '𐳗'), ('𐲘', '𐳘'), + ('𐲙', '𐳙'), ('𐲚', '𐳚'), ('𐲛', '𐳛'), ('𐲜', '𐳜'), + ('𐲝', '𐳝'), ('𐲞', '𐳞'), ('𐲟', '𐳟'), ('𐲠', '𐳠'), + ('𐲡', '𐳡'), ('𐲢', '𐳢'), ('𐲣', '𐳣'), ('𐲤', '𐳤'), + ('𐲥', '𐳥'), ('𐲦', '𐳦'), ('𐲧', '𐳧'), ('𐲨', '𐳨'), + ('𐲩', '𐳩'), ('𐲪', '𐳪'), ('𐲫', '𐳫'), ('𐲬', '𐳬'), + ('𐲭', '𐳭'), ('𐲮', '𐳮'), ('𐲯', '𐳯'), ('𐲰', '𐳰'), + ('𐲱', '𐳱'), ('𐲲', '𐳲'), ('𑢠', '𑣀'), ('𑢡', '𑣁'), + ('𑢢', '𑣂'), ('𑢣', '𑣃'), ('𑢤', '𑣄'), ('𑢥', '𑣅'), + ('𑢦', '𑣆'), ('𑢧', '𑣇'), ('𑢨', '𑣈'), ('𑢩', '𑣉'), + ('𑢪', '𑣊'), ('𑢫', '𑣋'), ('𑢬', '𑣌'), ('𑢭', '𑣍'), + ('𑢮', '𑣎'), ('𑢯', '𑣏'), ('𑢰', '𑣐'), ('𑢱', '𑣑'), + ('𑢲', '𑣒'), ('𑢳', '𑣓'), ('𑢴', '𑣔'), ('𑢵', '𑣕'), + ('𑢶', '𑣖'), ('𑢷', '𑣗'), ('𑢸', '𑣘'), ('𑢹', '𑣙'), + ('𑢺', '𑣚'), ('𑢻', '𑣛'), ('𑢼', '𑣜'), ('𑢽', '𑣝'), + ('𑢾', '𑣞'), ('𑢿', '𑣟'), ('𖹀', '𖹠'), ('𖹁', '𖹡'), + ('𖹂', '𖹢'), ('𖹃', '𖹣'), ('𖹄', '𖹤'), ('𖹅', '𖹥'), + ('𖹆', '𖹦'), ('𖹇', '𖹧'), ('𖹈', '𖹨'), ('𖹉', '𖹩'), + ('𖹊', '𖹪'), ('𖹋', '𖹫'), ('𖹌', '𖹬'), ('𖹍', '𖹭'), + ('𖹎', '𖹮'), ('𖹏', '𖹯'), ('𖹐', '𖹰'), ('𖹑', '𖹱'), + ('𖹒', '𖹲'), ('𖹓', '𖹳'), ('𖹔', '𖹴'), ('𖹕', '𖹵'), + ('𖹖', '𖹶'), ('𖹗', '𖹷'), ('𖹘', '𖹸'), ('𖹙', '𖹹'), + ('𖹚', '𖹺'), ('𖹛', '𖹻'), ('𖹜', '𖹼'), ('𖹝', '𖹽'), + ('𖹞', '𖹾'), ('𖹟', '𖹿'), ('𞤀', '𞤢'), ('𞤁', '𞤣'), + ('𞤂', '𞤤'), ('𞤃', '𞤥'), ('𞤄', '𞤦'), ('𞤅', '𞤧'), + ('𞤆', '𞤨'), ('𞤇', '𞤩'), ('𞤈', '𞤪'), ('𞤉', '𞤫'), + ('𞤊', '𞤬'), ('𞤋', '𞤭'), ('𞤌', '𞤮'), ('𞤍', '𞤯'), + ('𞤎', '𞤰'), ('𞤏', '𞤱'), ('𞤐', '𞤲'), ('𞤑', '𞤳'), + ('𞤒', '𞤴'), ('𞤓', '𞤵'), ('𞤔', '𞤶'), ('𞤕', '𞤷'), + ('𞤖', '𞤸'), ('𞤗', '𞤹'), ('𞤘', '𞤺'), ('𞤙', '𞤻'), + ('𞤚', '𞤼'), ('𞤛', '𞤽'), ('𞤜', '𞤾'), ('𞤝', '𞤿'), + ('𞤞', '𞥀'), ('𞤟', '𞥁'), ('𞤠', '𞥂'), ('𞤡', '𞥃'), +]; diff --git a/crates/atuin-nucleo/matcher/src/chars/normalize.rs b/crates/atuin-nucleo/matcher/src/chars/normalize.rs new file mode 100644 index 00000000..3de501aa --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/chars/normalize.rs @@ -0,0 +1,972 @@ +/// Normalize a Unicode character by converting Latin characters which are variants +/// of ASCII characters to their latin equivalent. +/// +/// Note that this method acts on single `char`s: if you want to perform full normalization, you +/// should first split on graphemes, and then normalize each grapheme by normalizing the first +/// `char` in the grapheme. +/// +/// If a character does not normalize to a single ASCII character, no normalization is performed. +/// +/// This performs normalization within the following Unicode blocks: +/// +/// - [Latin-1 Supplement](https://en.wikipedia.org/wiki/Latin-1_Supplement) +/// - [Latin Extended-A](https://en.wikipedia.org/wiki/Latin_Extended-A) +/// - [Latin Extended-B](https://en.wikipedia.org/wiki/Latin_Extended-B) +/// - [Latin Extended Additional](https://en.wikipedia.org/wiki/Latin_Extended_Additional) +/// - [Superscripts and Subscripts](https://en.wikipedia.org/wiki/Superscripts_and_Subscripts) +/// +/// If the character does not fall in this block, it is not normalized. +/// +/// # Example +/// ``` +/// # use nucleo_matcher::chars::normalize; +/// assert_eq!(normalize('ä'), 'a'); +/// assert_eq!(normalize('Æ'), 'Æ'); +/// assert_eq!(normalize('ữ'), 'u'); +/// ``` +pub fn normalize(c: char) -> char { + // outside checked blocks + if c < '\u{a0}' || c >= '\u{20A0}' { + return c; + } + // Latin-1 Supplement, Extended-A, Extended-B + if c <= '\u{29f}' { + return LATIN_1AB[c as usize - '\u{a0}' as usize]; + } + // between blocks + if c < '\u{1e00}' { + return c; + } + // Latin Extended Additional + if c <= '\u{1eff}' { + return LATIN_EXTENDED_ADDITIONAL[c as usize - '\u{1e00}' as usize]; + } + // between blocks + if c < '\u{2070}' { + return c; + } + // Superscripts and subscripts + SUPERSCRIPTS_AND_SUBSCRIPTS[c as usize - '\u{2070}' as usize] +} + +/// A char array corresponding to the following contiguous Unicode blocks: +/// +/// - [Latin-1 Supplement](https://en.wikipedia.org/wiki/Latin-1_Supplement) +/// - [Latin Extended-A](https://en.wikipedia.org/wiki/Latin_Extended-A) +/// - [Latin Extended-B](https://en.wikipedia.org/wiki/Latin_Extended-B) +/// +/// This covers the range `'\u{a0}'..='\u{29f}'`. +static LATIN_1AB: [char; 512] = [ + '\u{a0}', // invisible NON BREAKING SPACE + '!', // '¡'; '\u{a1}' + '¢', // '¢'; '\u{a2}' + '£', // '£'; '\u{a3}' + '¤', // '¤'; '\u{a4}' + '¥', // '¥'; '\u{a5}' + '¦', // '¦'; '\u{a6}' + '§', // '§'; '\u{a7}' + '¨', // '¨'; '\u{a8}' + '©', // '©'; '\u{a9}' + 'a', // 'ª'; '\u{aa}' + '«', // '«'; '\u{ab}' + '¬', // '¬'; '\u{ac}' + '\u{ad}', // invisible SOFT HYPHEN + '®', // '®'; '\u{ae}' + '¯', // '¯'; '\u{af}' + '°', // '°'; '\u{b0}' + '±', // '±'; '\u{b1}' + '2', // '²'; '\u{b2}' + '3', // '³'; '\u{b3}' + '´', // '´'; '\u{b4}' + 'µ', // 'µ'; '\u{b5}' + '¶', // '¶'; '\u{b6}' + '·', // '·'; '\u{b7}' + '¸', // '¸'; '\u{b8}' + '1', // '¹'; '\u{b9}' + '0', // 'º'; '\u{ba}' + '»', // '»'; '\u{bb}' + '¼', // '¼'; '\u{bc}' + '½', // '½'; '\u{bd}' + '¾', // '¾'; '\u{be}' + '?', // '¿'; '\u{bf}' + 'A', // 'À'; '\u{c0}' + 'A', // 'Á'; '\u{c1}' + 'A', // 'Â'; '\u{c2}' + 'A', // 'Ã'; '\u{c3}' + 'A', // 'Ä'; '\u{c4}' + 'A', // 'Å'; '\u{c5}' + 'Æ', // 'Æ'; '\u{c6}' + 'C', // 'Ç'; '\u{c7}' + 'E', // 'È'; '\u{c8}' + 'E', // 'É'; '\u{c9}' + 'E', // 'Ê'; '\u{ca}' + 'E', // 'Ë'; '\u{cb}' + 'I', // 'Ì'; '\u{cc}' + 'I', // 'Í'; '\u{cd}' + 'I', // 'Î'; '\u{ce}' + 'I', // 'Ï'; '\u{cf}' + 'D', // 'Ð'; '\u{d0}' + 'N', // 'Ñ'; '\u{d1}' + 'O', // 'Ò'; '\u{d2}' + 'O', // 'Ó'; '\u{d3}' + 'O', // 'Ô'; '\u{d4}' + 'O', // 'Õ'; '\u{d5}' + 'O', // 'Ö'; '\u{d6}' + '×', // '×'; '\u{d7}' + 'O', // 'Ø'; '\u{d8}' + 'U', // 'Ù'; '\u{d9}' + 'U', // 'Ú'; '\u{da}' + 'U', // 'Û'; '\u{db}' + 'U', // 'Ü'; '\u{dc}' + 'Y', // 'Ý'; '\u{dd}' + 'Þ', // 'Þ'; '\u{de}' + 's', // 'ß'; '\u{df}' + 'a', // 'à'; '\u{e0}' + 'a', // 'á'; '\u{e1}' + 'a', // 'â'; '\u{e2}' + 'a', // 'ã'; '\u{e3}' + 'a', // 'ä'; '\u{e4}' + 'a', // 'å'; '\u{e5}' + 'æ', // 'æ'; '\u{e6}' + 'c', // 'ç'; '\u{e7}' + 'e', // 'è'; '\u{e8}' + 'e', // 'é'; '\u{e9}' + 'e', // 'ê'; '\u{ea}' + 'e', // 'ë'; '\u{eb}' + 'i', // 'ì'; '\u{ec}' + 'i', // 'í'; '\u{ed}' + 'i', // 'î'; '\u{ee}' + 'i', // 'ï'; '\u{ef}' + 'd', // 'ð'; '\u{f0}' + 'n', // 'ñ'; '\u{f1}' + 'o', // 'ò'; '\u{f2}' + 'o', // 'ó'; '\u{f3}' + 'o', // 'ô'; '\u{f4}' + 'o', // 'õ'; '\u{f5}' + 'o', // 'ö'; '\u{f6}' + '÷', // '÷'; '\u{f7}' + 'o', // 'ø'; '\u{f8}' + 'u', // 'ù'; '\u{f9}' + 'u', // 'ú'; '\u{fa}' + 'u', // 'û'; '\u{fb}' + 'u', // 'ü'; '\u{fc}' + 'y', // 'ý'; '\u{fd}' + 'þ', // 'þ'; '\u{fe}' + 'y', // 'ÿ'; '\u{ff}' + 'A', // 'Ā'; '\u{100}' + 'a', // 'ā'; '\u{101}' + 'A', // 'Ă'; '\u{102}' + 'a', // 'ă'; '\u{103}' + 'A', // 'Ą'; '\u{104}' + 'a', // 'ą'; '\u{105}' + 'C', // 'Ć'; '\u{106}' + 'c', // 'ć'; '\u{107}' + 'C', // 'Ĉ'; '\u{108}' + 'c', // 'ĉ'; '\u{109}' + 'C', // 'Ċ'; '\u{10a}' + 'c', // 'ċ'; '\u{10b}' + 'C', // 'Č'; '\u{10c}' + 'c', // 'č'; '\u{10d}' + 'D', // 'Ď'; '\u{10e}' + 'd', // 'ď'; '\u{10f}' + 'D', // 'Đ'; '\u{110}' + 'd', // 'đ'; '\u{111}' + 'E', // 'Ē'; '\u{112}' + 'e', // 'ē'; '\u{113}' + 'E', // 'Ĕ'; '\u{114}' + 'e', // 'ĕ'; '\u{115}' + 'E', // 'Ė'; '\u{116}' + 'e', // 'ė'; '\u{117}' + 'E', // 'Ę'; '\u{118}' + 'e', // 'ę'; '\u{119}' + 'E', // 'Ě'; '\u{11a}' + 'e', // 'ě'; '\u{11b}' + 'G', // 'Ĝ'; '\u{11c}' + 'g', // 'ĝ'; '\u{11d}' + 'G', // 'Ğ'; '\u{11e}' + 'g', // 'ğ'; '\u{11f}' + 'G', // 'Ġ'; '\u{120}' + 'g', // 'ġ'; '\u{121}' + 'G', // 'Ģ'; '\u{122}' + 'g', // 'ģ'; '\u{123}' + 'H', // 'Ĥ'; '\u{124}' + 'h', // 'ĥ'; '\u{125}' + 'H', // 'Ħ'; '\u{126}' + 'h', // 'ħ'; '\u{127}' + 'I', // 'Ĩ'; '\u{128}' + 'i', // 'ĩ'; '\u{129}' + 'I', // 'Ī'; '\u{12a}' + 'i', // 'ī'; '\u{12b}' + 'I', // 'Ĭ'; '\u{12c}' + 'i', // 'ĭ'; '\u{12d}' + 'I', // 'Į'; '\u{12e}' + 'i', // 'į'; '\u{12f}' + 'I', // 'İ'; '\u{130}' + 'i', // 'ı'; '\u{131}' + 'IJ', // 'IJ'; '\u{132}' + 'ij', // 'ij'; '\u{133}' + 'J', // 'Ĵ'; '\u{134}' + 'j', // 'ĵ'; '\u{135}' + 'K', // 'Ķ'; '\u{136}' + 'k', // 'ķ'; '\u{137}' + 'ĸ', // 'ĸ'; '\u{138}' + 'L', // 'Ĺ'; '\u{139}' + 'l', // 'ĺ'; '\u{13a}' + 'L', // 'Ļ'; '\u{13b}' + 'l', // 'ļ'; '\u{13c}' + 'L', // 'Ľ'; '\u{13d}' + 'l', // 'ľ'; '\u{13e}' + 'L', // 'Ŀ'; '\u{13f}' + 'l', // 'ŀ'; '\u{140}' + 'L', // 'Ł'; '\u{141}' + 'l', // 'ł'; '\u{142}' + 'N', // 'Ń'; '\u{143}' + 'n', // 'ń'; '\u{144}' + 'N', // 'Ņ'; '\u{145}' + 'n', // 'ņ'; '\u{146}' + 'N', // 'Ň'; '\u{147}' + 'n', // 'ň'; '\u{148}' + 'n', // 'ʼn'; '\u{149}' + 'N', // 'Ŋ'; '\u{14a}' + 'n', // 'ŋ'; '\u{14b}' + 'O', // 'Ō'; '\u{14c}' + 'o', // 'ō'; '\u{14d}' + 'O', // 'Ŏ'; '\u{14e}' + 'o', // 'ŏ'; '\u{14f}' + 'O', // 'Ő'; '\u{150}' + 'o', // 'ő'; '\u{151}' + 'Œ', // 'Œ'; '\u{152}' + 'œ', // 'œ'; '\u{153}' + 'R', // 'Ŕ'; '\u{154}' + 'r', // 'ŕ'; '\u{155}' + 'R', // 'Ŗ'; '\u{156}' + 'r', // 'ŗ'; '\u{157}' + 'R', // 'Ř'; '\u{158}' + 'r', // 'ř'; '\u{159}' + 'S', // 'Ś'; '\u{15a}' + 's', // 'ś'; '\u{15b}' + 'S', // 'Ŝ'; '\u{15c}' + 's', // 'ŝ'; '\u{15d}' + 'S', // 'Ş'; '\u{15e}' + 's', // 'ş'; '\u{15f}' + 'S', // 'Š'; '\u{160}' + 's', // 'š'; '\u{161}' + 'T', // 'Ţ'; '\u{162}' + 't', // 'ţ'; '\u{163}' + 'T', // 'Ť'; '\u{164}' + 't', // 'ť'; '\u{165}' + 'T', // 'Ŧ'; '\u{166}' + 't', // 'ŧ'; '\u{167}' + 'U', // 'Ũ'; '\u{168}' + 'u', // 'ũ'; '\u{169}' + 'U', // 'Ū'; '\u{16a}' + 'u', // 'ū'; '\u{16b}' + 'U', // 'Ŭ'; '\u{16c}' + 'u', // 'ŭ'; '\u{16d}' + 'U', // 'Ů'; '\u{16e}' + 'u', // 'ů'; '\u{16f}' + 'U', // 'Ű'; '\u{170}' + 'u', // 'ű'; '\u{171}' + 'U', // 'Ų'; '\u{172}' + 'u', // 'ų'; '\u{173}' + 'W', // 'Ŵ'; '\u{174}' + 'w', // 'ŵ'; '\u{175}' + 'Y', // 'Ŷ'; '\u{176}' + 'y', // 'ŷ'; '\u{177}' + 'Y', // 'Ÿ'; '\u{178}' + 'Z', // 'Ź'; '\u{179}' + 'z', // 'ź'; '\u{17a}' + 'Z', // 'Ż'; '\u{17b}' + 'z', // 'ż'; '\u{17c}' + 'Z', // 'Ž'; '\u{17d}' + 'z', // 'ž'; '\u{17e}' + 's', // 'ſ'; '\u{17f}' + 'b', // 'ƀ'; '\u{180}' + 'B', // 'Ɓ'; '\u{181}' + 'b', // 'Ƃ'; '\u{182}' + 'b', // 'ƃ'; '\u{183}' + 'b', // 'Ƅ'; '\u{184}' + 'ƅ', // 'ƅ'; '\u{185}' + 'O', // 'Ɔ'; '\u{186}' + 'C', // 'Ƈ'; '\u{187}' + 'c', // 'ƈ'; '\u{188}' + 'D', // 'Ɖ'; '\u{189}' + 'D', // 'Ɗ'; '\u{18a}' + 'd', // 'Ƌ'; '\u{18b}' + 'd', // 'ƌ'; '\u{18c}' + 'ƍ', // 'ƍ'; '\u{18d}' + 'E', // 'Ǝ'; '\u{18e}' + 'e', // 'Ə'; '\u{18f}' + 'E', // 'Ɛ'; '\u{190}' + 'F', // 'Ƒ'; '\u{191}' + 'f', // 'ƒ'; '\u{192}' + 'G', // 'Ɠ'; '\u{193}' + 'Ɣ', // 'Ɣ'; '\u{194}' + 'h', // 'ƕ'; '\u{195}' + 'I', // 'Ɩ'; '\u{196}' + 'I', // 'Ɨ'; '\u{197}' + 'Ƙ', // 'Ƙ'; '\u{198}' + 'k', // 'ƙ'; '\u{199}' + 'l', // 'ƚ'; '\u{19a}' + 'ƛ', // 'ƛ'; '\u{19b}' + 'M', // 'Ɯ'; '\u{19c}' + 'N', // 'Ɲ'; '\u{19d}' + 'n', // 'ƞ'; '\u{19e}' + 'O', // 'Ɵ'; '\u{19f}' + 'O', // 'Ơ'; '\u{1a0}' + 'o', // 'ơ'; '\u{1a1}' + 'Ƣ', // 'Ƣ'; '\u{1a2}' + 'ƣ', // 'ƣ'; '\u{1a3}' + 'P', // 'Ƥ'; '\u{1a4}' + 'p', // 'ƥ'; '\u{1a5}' + 'R', // 'Ʀ'; '\u{1a6}' + 'S', // 'Ƨ'; '\u{1a7}' + 's', // 'ƨ'; '\u{1a8}' + 'Ʃ', // 'Ʃ'; '\u{1a9}' + 'l', // 'ƪ'; '\u{1aa}' + 't', // 'ƫ'; '\u{1ab}' + 'T', // 'Ƭ'; '\u{1ac}' + 't', // 'ƭ'; '\u{1ad}' + 'T', // 'Ʈ'; '\u{1ae}' + 'U', // 'Ư'; '\u{1af}' + 'u', // 'ư'; '\u{1b0}' + 'Ʊ', // 'Ʊ'; '\u{1b1}' + 'V', // 'Ʋ'; '\u{1b2}' + 'Y', // 'Ƴ'; '\u{1b3}' + 'y', // 'ƴ'; '\u{1b4}' + 'Z', // 'Ƶ'; '\u{1b5}' + 'z', // 'ƶ'; '\u{1b6}' + 'Ʒ', // 'Ʒ'; '\u{1b7}' + 'Ƹ', // 'Ƹ'; '\u{1b8}' + 'ƹ', // 'ƹ'; '\u{1b9}' + 'ƺ', // 'ƺ'; '\u{1ba}' + 'ƻ', // 'ƻ'; '\u{1bb}' + 'Ƽ', // 'Ƽ'; '\u{1bc}' + 'ƽ', // 'ƽ'; '\u{1bd}' + 'ƾ', // 'ƾ'; '\u{1be}' + 'ƿ', // 'ƿ'; '\u{1bf}' + 'ǀ', // 'ǀ'; '\u{1c0}' + 'ǁ', // 'ǁ'; '\u{1c1}' + 'ǂ', // 'ǂ'; '\u{1c2}' + '!', // 'ǃ'; '\u{1c3}' + 'DŽ', // 'DŽ'; '\u{1c4}' + 'Dž', // 'Dž'; '\u{1c5}' + 'dž', // 'dž'; '\u{1c6}' + 'LJ', // 'LJ'; '\u{1c7}' + 'Lj', // 'Lj'; '\u{1c8}' + 'lj', // 'lj'; '\u{1c9}' + 'NJ', // 'NJ'; '\u{1ca}' + 'Nj', // 'Nj'; '\u{1cb}' + 'nj', // 'nj'; '\u{1cc}' + 'A', // 'Ǎ'; '\u{1cd}' + 'a', // 'ǎ'; '\u{1ce}' + 'I', // 'Ǐ'; '\u{1cf}' + 'i', // 'ǐ'; '\u{1d0}' + 'O', // 'Ǒ'; '\u{1d1}' + 'o', // 'ǒ'; '\u{1d2}' + 'U', // 'Ǔ'; '\u{1d3}' + 'u', // 'ǔ'; '\u{1d4}' + 'U', // 'Ǖ'; '\u{1d5}' + 'u', // 'ǖ'; '\u{1d6}' + 'U', // 'Ǘ'; '\u{1d7}' + 'u', // 'ǘ'; '\u{1d8}' + 'U', // 'Ǚ'; '\u{1d9}' + 'u', // 'ǚ'; '\u{1da}' + 'U', // 'Ǜ'; '\u{1db}' + 'u', // 'ǜ'; '\u{1dc}' + 'e', // 'ǝ'; '\u{1dd}' + 'A', // 'Ǟ'; '\u{1de}' + 'a', // 'ǟ'; '\u{1df}' + 'A', // 'Ǡ'; '\u{1e0}' + 'a', // 'ǡ'; '\u{1e1}' + 'Æ', // 'Ǣ'; '\u{1e2}' + 'æ', // 'ǣ'; '\u{1e3}' + 'G', // 'Ǥ'; '\u{1e4}' + 'g', // 'ǥ'; '\u{1e5}' + 'G', // 'Ǧ'; '\u{1e6}' + 'g', // 'ǧ'; '\u{1e7}' + 'K', // 'Ǩ'; '\u{1e8}' + 'k', // 'ǩ'; '\u{1e9}' + 'O', // 'Ǫ'; '\u{1ea}' + 'o', // 'ǫ'; '\u{1eb}' + 'O', // 'Ǭ'; '\u{1ec}' + 'o', // 'ǭ'; '\u{1ed}' + 'Ǯ', // 'Ǯ'; '\u{1ee}' + 'ǯ', // 'ǯ'; '\u{1ef}' + 'j', // 'ǰ'; '\u{1f0}' + 'DZ', // 'DZ'; '\u{1f1}' + 'Dz', // 'Dz'; '\u{1f2}' + 'dz', // 'dz'; '\u{1f3}' + 'G', // 'Ǵ'; '\u{1f4}' + 'g', // 'ǵ'; '\u{1f5}' + 'Ƕ', // 'Ƕ'; '\u{1f6}' + 'Ƿ', // 'Ƿ'; '\u{1f7}' + 'N', // 'Ǹ'; '\u{1f8}' + 'n', // 'ǹ'; '\u{1f9}' + 'A', // 'Ǻ'; '\u{1fa}' + 'a', // 'ǻ'; '\u{1fb}' + 'Æ', // 'Ǽ'; '\u{1fc}' + 'æ', // 'ǽ'; '\u{1fd}' + 'O', // 'Ǿ'; '\u{1fe}' + 'o', // 'ǿ'; '\u{1ff}' + 'A', // 'Ȁ'; '\u{200}' + 'a', // 'ȁ'; '\u{201}' + 'A', // 'Ȃ'; '\u{202}' + 'a', // 'ȃ'; '\u{203}' + 'E', // 'Ȅ'; '\u{204}' + 'e', // 'ȅ'; '\u{205}' + 'E', // 'Ȇ'; '\u{206}' + 'e', // 'ȇ'; '\u{207}' + 'I', // 'Ȉ'; '\u{208}' + 'i', // 'ȉ'; '\u{209}' + 'I', // 'Ȋ'; '\u{20a}' + 'i', // 'ȋ'; '\u{20b}' + 'O', // 'Ȍ'; '\u{20c}' + 'o', // 'ȍ'; '\u{20d}' + 'O', // 'Ȏ'; '\u{20e}' + 'o', // 'ȏ'; '\u{20f}' + 'R', // 'Ȑ'; '\u{210}' + 'r', // 'ȑ'; '\u{211}' + 'R', // 'Ȓ'; '\u{212}' + 'r', // 'ȓ'; '\u{213}' + 'U', // 'Ȕ'; '\u{214}' + 'u', // 'ȕ'; '\u{215}' + 'U', // 'Ȗ'; '\u{216}' + 'u', // 'ȗ'; '\u{217}' + 'S', // 'Ș'; '\u{218}' + 's', // 'ș'; '\u{219}' + 'T', // 'Ț'; '\u{21a}' + 't', // 'ț'; '\u{21b}' + 'Ȝ', // 'Ȝ'; '\u{21c}' + 'ȝ', // 'ȝ'; '\u{21d}' + 'H', // 'Ȟ'; '\u{21e}' + 'h', // 'ȟ'; '\u{21f}' + 'N', // 'Ƞ'; '\u{220}' + 'd', // 'ȡ'; '\u{221}' + 'Ȣ', // 'Ȣ'; '\u{222}' + 'ȣ', // 'ȣ'; '\u{223}' + 'Z', // 'Ȥ'; '\u{224}' + 'z', // 'ȥ'; '\u{225}' + 'A', // 'Ȧ'; '\u{226}' + 'a', // 'ȧ'; '\u{227}' + 'E', // 'Ȩ'; '\u{228}' + 'e', // 'ȩ'; '\u{229}' + 'O', // 'Ȫ'; '\u{22a}' + 'o', // 'ȫ'; '\u{22b}' + 'O', // 'Ȭ'; '\u{22c}' + 'o', // 'ȭ'; '\u{22d}' + 'O', // 'Ȯ'; '\u{22e}' + 'o', // 'ȯ'; '\u{22f}' + 'O', // 'Ȱ'; '\u{230}' + 'o', // 'ȱ'; '\u{231}' + 'Y', // 'Ȳ'; '\u{232}' + 'y', // 'ȳ'; '\u{233}' + 'l', // 'ȴ'; '\u{234}' + 'n', // 'ȵ'; '\u{235}' + 't', // 'ȶ'; '\u{236}' + 'j', // 'ȷ'; '\u{237}' + 'ȸ', // 'ȸ'; '\u{238}' + 'ȹ', // 'ȹ'; '\u{239}' + 'A', // 'Ⱥ'; '\u{23a}' + 'C', // 'Ȼ'; '\u{23b}' + 'c', // 'ȼ'; '\u{23c}' + 'L', // 'Ƚ'; '\u{23d}' + 'T', // 'Ⱦ'; '\u{23e}' + 's', // 'ȿ'; '\u{23f}' + 'z', // 'ɀ'; '\u{240}' + 'Ɂ', // 'Ɂ'; '\u{241}' + 'ɂ', // 'ɂ'; '\u{242}' + 'B', // 'Ƀ'; '\u{243}' + 'U', // 'Ʉ'; '\u{244}' + 'V', // 'Ʌ'; '\u{245}' + 'E', // 'Ɇ'; '\u{246}' + 'e', // 'ɇ'; '\u{247}' + 'J', // 'Ɉ'; '\u{248}' + 'j', // 'ɉ'; '\u{249}' + 'Q', // 'Ɋ'; '\u{24a}' + 'q', // 'ɋ'; '\u{24b}' + 'R', // 'Ɍ'; '\u{24c}' + 'r', // 'ɍ'; '\u{24d}' + 'Y', // 'Ɏ'; '\u{24e}' + 'y', // 'ɏ'; '\u{24f}' + 'a', // 'ɐ'; '\u{250}' + 'a', // 'ɑ'; '\u{251}' + 'a', // 'ɒ'; '\u{252}' + 'b', // 'ɓ'; '\u{253}' + 'c', // 'ɔ'; '\u{254}' + 'c', // 'ɕ'; '\u{255}' + 'd', // 'ɖ'; '\u{256}' + 'd', // 'ɗ'; '\u{257}' + 'e', // 'ɘ'; '\u{258}' + 'e', // 'ə'; '\u{259}' + 'e', // 'ɚ'; '\u{25a}' + 'e', // 'ɛ'; '\u{25b}' + 'e', // 'ɜ'; '\u{25c}' + 'e', // 'ɝ'; '\u{25d}' + 'e', // 'ɞ'; '\u{25e}' + 'j', // 'ɟ'; '\u{25f}' + 'g', // 'ɠ'; '\u{260}' + 'g', // 'ɡ'; '\u{261}' + 'G', // 'ɢ'; '\u{262}' + 'g', // 'ɣ'; '\u{263}' + 'u', // 'ɤ'; '\u{264}' + 'h', // 'ɥ'; '\u{265}' + 'h', // 'ɦ'; '\u{266}' + 'h', // 'ɧ'; '\u{267}' + 'i', // 'ɨ'; '\u{268}' + 'i', // 'ɩ'; '\u{269}' + 'I', // 'ɪ'; '\u{26a}' + 'l', // 'ɫ'; '\u{26b}' + 'l', // 'ɬ'; '\u{26c}' + 'l', // 'ɭ'; '\u{26d}' + 'ɮ', // 'ɮ'; '\u{26e}' + 'm', // 'ɯ'; '\u{26f}' + 'm', // 'ɰ'; '\u{270}' + 'm', // 'ɱ'; '\u{271}' + 'n', // 'ɲ'; '\u{272}' + 'n', // 'ɳ'; '\u{273}' + 'N', // 'ɴ'; '\u{274}' + 'o', // 'ɵ'; '\u{275}' + 'ɶ', // 'ɶ'; '\u{276}' + 'ɷ', // 'ɷ'; '\u{277}' + 'ɸ', // 'ɸ'; '\u{278}' + 'r', // 'ɹ'; '\u{279}' + 'r', // 'ɺ'; '\u{27a}' + 'r', // 'ɻ'; '\u{27b}' + 'r', // 'ɼ'; '\u{27c}' + 'r', // 'ɽ'; '\u{27d}' + 'r', // 'ɾ'; '\u{27e}' + 'r', // 'ɿ'; '\u{27f}' + 'R', // 'ʀ'; '\u{280}' + 'R', // 'ʁ'; '\u{281}' + 's', // 'ʂ'; '\u{282}' + 'ʃ', // 'ʃ'; '\u{283}' + 'ʄ', // 'ʄ'; '\u{284}' + 'ʅ', // 'ʅ'; '\u{285}' + 'ʆ', // 'ʆ'; '\u{286}' + 't', // 'ʇ'; '\u{287}' + 't', // 'ʈ'; '\u{288}' + 'u', // 'ʉ'; '\u{289}' + 'ʊ', // 'ʊ'; '\u{28a}' + 'v', // 'ʋ'; '\u{28b}' + 'v', // 'ʌ'; '\u{28c}' + 'w', // 'ʍ'; '\u{28d}' + 'y', // 'ʎ'; '\u{28e}' + 'Y', // 'ʏ'; '\u{28f}' + 'z', // 'ʐ'; '\u{290}' + 'z', // 'ʑ'; '\u{291}' + 'ʒ', // 'ʒ'; '\u{292}' + 'ʓ', // 'ʓ'; '\u{293}' + 'ʔ', // 'ʔ'; '\u{294}' + 'ʕ', // 'ʕ'; '\u{295}' + 'ʖ', // 'ʖ'; '\u{296}' + 'c', // 'ʗ'; '\u{297}' + 'ʘ', // 'ʘ'; '\u{298}' + 'B', // 'ʙ'; '\u{299}' + 'e', // 'ʚ'; '\u{29a}' + 'G', // 'ʛ'; '\u{29b}' + 'H', // 'ʜ'; '\u{29c}' + 'j', // 'ʝ'; '\u{29d}' + 'k', // 'ʞ'; '\u{29e}' + 'L', // 'ʟ'; '\u{29f}' +]; + +/// A char array corresponding to the following Unicode block: +/// +/// - [Latin Extended Additional](https://en.wikipedia.org/wiki/Latin_Extended_Additional) +/// +/// This covers the range `'\u{1e00}'..='\u{1eff}'`. +static LATIN_EXTENDED_ADDITIONAL: [char; 256] = [ + 'A', // 'Ḁ'; '\u{1e00}' + 'a', // 'ḁ'; '\u{1e01}' + 'B', // 'Ḃ'; '\u{1e02}' + 'b', // 'ḃ'; '\u{1e03}' + 'B', // 'Ḅ'; '\u{1e04}' + 'b', // 'ḅ'; '\u{1e05}' + 'B', // 'Ḇ'; '\u{1e06}' + 'b', // 'ḇ'; '\u{1e07}' + 'C', // 'Ḉ'; '\u{1e08}' + 'c', // 'ḉ'; '\u{1e09}' + 'D', // 'Ḋ'; '\u{1e0a}' + 'e', // 'ḋ'; '\u{1e0b}' + 'D', // 'Ḍ'; '\u{1e0c}' + 'd', // 'ḍ'; '\u{1e0d}' + 'D', // 'Ḏ'; '\u{1e0e}' + 'd', // 'ḏ'; '\u{1e0f}' + 'D', // 'Ḑ'; '\u{1e10}' + 'd', // 'ḑ'; '\u{1e11}' + 'D', // 'Ḓ'; '\u{1e12}' + 'd', // 'ḓ'; '\u{1e13}' + 'E', // 'Ḕ'; '\u{1e14}' + 'e', // 'ḕ'; '\u{1e15}' + 'E', // 'Ḗ'; '\u{1e16}' + 'e', // 'ḗ'; '\u{1e17}' + 'E', // 'Ḙ'; '\u{1e18}' + 'e', // 'ḙ'; '\u{1e19}' + 'E', // 'Ḛ'; '\u{1e1a}' + 'e', // 'ḛ'; '\u{1e1b}' + 'E', // 'Ḝ'; '\u{1e1c}' + 'e', // 'ḝ'; '\u{1e1d}' + 'F', // 'Ḟ'; '\u{1e1e}' + 'f', // 'ḟ'; '\u{1e1f}' + 'G', // 'Ḡ'; '\u{1e20}' + 'g', // 'ḡ'; '\u{1e21}' + 'H', // 'Ḣ'; '\u{1e22}' + 'g', // 'ḣ'; '\u{1e23}' + 'H', // 'Ḥ'; '\u{1e24}' + 'g', // 'ḥ'; '\u{1e25}' + 'H', // 'Ḧ'; '\u{1e26}' + 'g', // 'ḧ'; '\u{1e27}' + 'H', // 'Ḩ'; '\u{1e28}' + 'g', // 'ḩ'; '\u{1e29}' + 'H', // 'Ḫ'; '\u{1e2a}' + 'h', // 'ḫ'; '\u{1e2b}' + 'I', // 'Ḭ'; '\u{1e2c}' + 'i', // 'ḭ'; '\u{1e2d}' + 'I', // 'Ḯ'; '\u{1e2e}' + 'i', // 'ḯ'; '\u{1e2f}' + 'K', // 'Ḱ'; '\u{1e30}' + 'k', // 'ḱ'; '\u{1e31}' + 'K', // 'Ḳ'; '\u{1e32}' + 'k', // 'ḳ'; '\u{1e33}' + 'K', // 'Ḵ'; '\u{1e34}' + 'k', // 'ḵ'; '\u{1e35}' + 'L', // 'Ḷ'; '\u{1e36}' + 'l', // 'ḷ'; '\u{1e37}' + 'L', // 'Ḹ'; '\u{1e38}' + 'l', // 'ḹ'; '\u{1e39}' + 'L', // 'Ḻ'; '\u{1e3a}' + 'l', // 'ḻ'; '\u{1e3b}' + 'L', // 'Ḽ'; '\u{1e3c}' + 'l', // 'ḽ'; '\u{1e3d}' + 'M', // 'Ḿ'; '\u{1e3e}' + 'm', // 'ḿ'; '\u{1e3f}' + 'M', // 'Ṁ'; '\u{1e40}' + 'm', // 'ṁ'; '\u{1e41}' + 'M', // 'Ṃ'; '\u{1e42}' + 'm', // 'ṃ'; '\u{1e43}' + 'N', // 'Ṅ'; '\u{1e44}' + 'n', // 'ṅ'; '\u{1e45}' + 'N', // 'Ṇ'; '\u{1e46}' + 'n', // 'ṇ'; '\u{1e47}' + 'N', // 'Ṉ'; '\u{1e48}' + 'n', // 'ṉ'; '\u{1e49}' + 'N', // 'Ṋ'; '\u{1e4a}' + 'n', // 'ṋ'; '\u{1e4b}' + 'O', // 'Ṍ'; '\u{1e4c}' + 'o', // 'ṍ'; '\u{1e4d}' + 'O', // 'Ṏ'; '\u{1e4e}' + 'o', // 'ṏ'; '\u{1e4f}' + 'O', // 'Ṑ'; '\u{1e50}' + 'o', // 'ṑ'; '\u{1e51}' + 'O', // 'Ṓ'; '\u{1e52}' + 'o', // 'ṓ'; '\u{1e53}' + 'P', // 'Ṕ'; '\u{1e54}' + 'p', // 'ṕ'; '\u{1e55}' + 'P', // 'Ṗ'; '\u{1e56}' + 'p', // 'ṗ'; '\u{1e57}' + 'R', // 'Ṙ'; '\u{1e58}' + 'r', // 'ṙ'; '\u{1e59}' + 'R', // 'Ṛ'; '\u{1e5a}' + 'r', // 'ṛ'; '\u{1e5b}' + 'R', // 'Ṝ'; '\u{1e5c}' + 'r', // 'ṝ'; '\u{1e5d}' + 'R', // 'Ṟ'; '\u{1e5e}' + 'r', // 'ṟ'; '\u{1e5f}' + 'S', // 'Ṡ'; '\u{1e60}' + 's', // 'ṡ'; '\u{1e61}' + 'S', // 'Ṣ'; '\u{1e62}' + 's', // 'ṣ'; '\u{1e63}' + 'S', // 'Ṥ'; '\u{1e64}' + 's', // 'ṥ'; '\u{1e65}' + 'S', // 'Ṧ'; '\u{1e66}' + 's', // 'ṧ'; '\u{1e67}' + 'S', // 'Ṩ'; '\u{1e68}' + 's', // 'ṩ'; '\u{1e69}' + 'T', // 'Ṫ'; '\u{1e6a}' + 't', // 'ṫ'; '\u{1e6b}' + 'T', // 'Ṭ'; '\u{1e6c}' + 't', // 'ṭ'; '\u{1e6d}' + 'T', // 'Ṯ'; '\u{1e6e}' + 't', // 'ṯ'; '\u{1e6f}' + 'T', // 'Ṱ'; '\u{1e70}' + 't', // 'ṱ'; '\u{1e71}' + 'U', // 'Ṳ'; '\u{1e72}' + 'u', // 'ṳ'; '\u{1e73}' + 'U', // 'Ṵ'; '\u{1e74}' + 'u', // 'ṵ'; '\u{1e75}' + 'U', // 'Ṷ'; '\u{1e76}' + 'u', // 'ṷ'; '\u{1e77}' + 'U', // 'Ṹ'; '\u{1e78}' + 'u', // 'ṹ'; '\u{1e79}' + 'U', // 'Ṻ'; '\u{1e7a}' + 'u', // 'ṻ'; '\u{1e7b}' + 'V', // 'Ṽ'; '\u{1e7c}' + 'v', // 'ṽ'; '\u{1e7d}' + 'V', // 'Ṿ'; '\u{1e7e}' + 'v', // 'ṿ'; '\u{1e7f}' + 'W', // 'Ẁ'; '\u{1e80}' + 'w', // 'ẁ'; '\u{1e81}' + 'W', // 'Ẃ'; '\u{1e82}' + 'w', // 'ẃ'; '\u{1e83}' + 'W', // 'Ẅ'; '\u{1e84}' + 'w', // 'ẅ'; '\u{1e85}' + 'W', // 'Ẇ'; '\u{1e86}' + 'w', // 'ẇ'; '\u{1e87}' + 'W', // 'Ẉ'; '\u{1e88}' + 'j', // 'ẉ'; '\u{1e89}' + 'X', // 'Ẋ'; '\u{1e8a}' + 'x', // 'ẋ'; '\u{1e8b}' + 'X', // 'Ẍ'; '\u{1e8c}' + 'x', // 'ẍ'; '\u{1e8d}' + 'Y', // 'Ẏ'; '\u{1e8e}' + 'y', // 'ẏ'; '\u{1e8f}' + 'Z', // 'Ẑ'; '\u{1e90}' + 'z', // 'ẑ'; '\u{1e91}' + 'Z', // 'Ẓ'; '\u{1e92}' + 'z', // 'ẓ'; '\u{1e93}' + 'Z', // 'Ẕ'; '\u{1e94}' + 'z', // 'ẕ'; '\u{1e95}' + 'h', // 'ẖ'; '\u{1e96}' + 't', // 'ẗ'; '\u{1e97}' + 'w', // 'ẘ'; '\u{1e98}' + 'y', // 'ẙ'; '\u{1e99}' + 'a', // 'ẚ'; '\u{1e9a}' + 'i', // 'ẛ'; '\u{1e9b}' + 'f', // 'ẜ'; '\u{1e9c}' + 'f', // 'ẝ'; '\u{1e9d}' + 'ẞ', // 'ẞ'; '\u{1e9e}' + 'ẟ', // 'ẟ'; '\u{1e9f}' + 'A', // 'Ạ'; '\u{1ea0}' + 'a', // 'ạ'; '\u{1ea1}' + 'A', // 'Ả'; '\u{1ea2}' + 'a', // 'ả'; '\u{1ea3}' + 'A', // 'Ấ'; '\u{1ea4}' + 'a', // 'ấ'; '\u{1ea5}' + 'A', // 'Ầ'; '\u{1ea6}' + 'a', // 'ầ'; '\u{1ea7}' + 'A', // 'Ẩ'; '\u{1ea8}' + 'a', // 'ẩ'; '\u{1ea9}' + 'A', // 'Ẫ'; '\u{1eaa}' + 'a', // 'ẫ'; '\u{1eab}' + 'A', // 'Ậ'; '\u{1eac}' + 'a', // 'ậ'; '\u{1ead}' + 'A', // 'Ắ'; '\u{1eae}' + 'a', // 'ắ'; '\u{1eaf}' + 'A', // 'Ằ'; '\u{1eb0}' + 'a', // 'ằ'; '\u{1eb1}' + 'A', // 'Ẳ'; '\u{1eb2}' + 'a', // 'ẳ'; '\u{1eb3}' + 'A', // 'Ẵ'; '\u{1eb4}' + 'a', // 'ẵ'; '\u{1eb5}' + 'A', // 'Ặ'; '\u{1eb6}' + 'a', // 'ặ'; '\u{1eb7}' + 'E', // 'Ẹ'; '\u{1eb8}' + 'e', // 'ẹ'; '\u{1eb9}' + 'E', // 'Ẻ'; '\u{1eba}' + 'e', // 'ẻ'; '\u{1ebb}' + 'E', // 'Ẽ'; '\u{1ebc}' + 'e', // 'ẽ'; '\u{1ebd}' + 'E', // 'Ế'; '\u{1ebe}' + 'e', // 'ế'; '\u{1ebf}' + 'E', // 'Ề'; '\u{1ec0}' + 'e', // 'ề'; '\u{1ec1}' + 'E', // 'Ể'; '\u{1ec2}' + 'e', // 'ể'; '\u{1ec3}' + 'E', // 'Ễ'; '\u{1ec4}' + 'e', // 'ễ'; '\u{1ec5}' + 'E', // 'Ệ'; '\u{1ec6}' + 'e', // 'ệ'; '\u{1ec7}' + 'I', // 'Ỉ'; '\u{1ec8}' + 'i', // 'ỉ'; '\u{1ec9}' + 'I', // 'Ị'; '\u{1eca}' + 'i', // 'ị'; '\u{1ecb}' + 'O', // 'Ọ'; '\u{1ecc}' + 'o', // 'ọ'; '\u{1ecd}' + 'O', // 'Ỏ'; '\u{1ece}' + 'o', // 'ỏ'; '\u{1ecf}' + 'O', // 'Ố'; '\u{1ed0}' + 'o', // 'ố'; '\u{1ed1}' + 'O', // 'Ồ'; '\u{1ed2}' + 'o', // 'ồ'; '\u{1ed3}' + 'O', // 'Ổ'; '\u{1ed4}' + 'o', // 'ổ'; '\u{1ed5}' + 'O', // 'Ỗ'; '\u{1ed6}' + 'o', // 'ỗ'; '\u{1ed7}' + 'O', // 'Ộ'; '\u{1ed8}' + 'o', // 'ộ'; '\u{1ed9}' + 'O', // 'Ớ'; '\u{1eda}' + 'o', // 'ớ'; '\u{1edb}' + 'O', // 'Ờ'; '\u{1edc}' + 'o', // 'ờ'; '\u{1edd}' + 'O', // 'Ở'; '\u{1ede}' + 'o', // 'ở'; '\u{1edf}' + 'O', // 'Ỡ'; '\u{1ee0}' + 'o', // 'ỡ'; '\u{1ee1}' + 'O', // 'Ợ'; '\u{1ee2}' + 'o', // 'ợ'; '\u{1ee3}' + 'U', // 'Ụ'; '\u{1ee4}' + 'u', // 'ụ'; '\u{1ee5}' + 'U', // 'Ủ'; '\u{1ee6}' + 'u', // 'ủ'; '\u{1ee7}' + 'U', // 'Ứ'; '\u{1ee8}' + 'u', // 'ứ'; '\u{1ee9}' + 'U', // 'Ừ'; '\u{1eea}' + 'u', // 'ừ'; '\u{1eeb}' + 'U', // 'Ử'; '\u{1eec}' + 'u', // 'ử'; '\u{1eed}' + 'U', // 'Ữ'; '\u{1eee}' + 'u', // 'ữ'; '\u{1eef}' + 'U', // 'Ự'; '\u{1ef0}' + 'u', // 'ự'; '\u{1ef1}' + 'Y', // 'Ỳ'; '\u{1ef2}' + 'y', // 'ỳ'; '\u{1ef3}' + 'Y', // 'Ỵ'; '\u{1ef4}' + 'y', // 'ỵ'; '\u{1ef5}' + 'Y', // 'Ỷ'; '\u{1ef6}' + 'y', // 'ỷ'; '\u{1ef7}' + 'Y', // 'Ỹ'; '\u{1ef8}' + 'y', // 'ỹ'; '\u{1ef9}' + 'Ỻ', // 'Ỻ'; '\u{1efa}' + 'ỻ', // 'ỻ'; '\u{1efb}' + 'Ỽ', // 'Ỽ'; '\u{1efc}' + 'ỽ', // 'ỽ'; '\u{1efd}' + 'Ỿ', // 'Ỿ'; '\u{1efe}' + 'ỿ', // 'ỿ'; '\u{1eff}' +]; + +/// A char array corresponding to the following Unicode block: +/// +/// - [Superscripts and Subscripts](https://en.wikipedia.org/wiki/Superscripts_and_Subscripts) +/// +/// This covers the range `'\u{2070}'..='\u{209f}'`. +static SUPERSCRIPTS_AND_SUBSCRIPTS: [char; 48] = [ + '0', // '⁰'; '\u{2070}' + 'i', // 'ⁱ'; '\u{2071}' + '', // ''; '\u{2072}' + '', // ''; '\u{2073}' + '4', // '⁴'; '\u{2074}' + '5', // '⁵'; '\u{2075}' + '6', // '⁶'; '\u{2076}' + '7', // '⁷'; '\u{2077}' + '8', // '⁸'; '\u{2078}' + '0', // '⁹'; '\u{2079}' + '+', // '⁺'; '\u{207a}' + '-', // '⁻'; '\u{207b}' + '=', // '⁼'; '\u{207c}' + '(', // '⁽'; '\u{207d}' + ')', // '⁾'; '\u{207e}' + 'n', // 'ⁿ'; '\u{207f}' + '0', // '₀'; '\u{2080}' + '1', // '₁'; '\u{2081}' + '2', // '₂'; '\u{2082}' + '3', // '₃'; '\u{2083}' + '4', // '₄'; '\u{2084}' + '5', // '₅'; '\u{2085}' + '6', // '₆'; '\u{2086}' + '7', // '₇'; '\u{2087}' + '8', // '₈'; '\u{2088}' + '9', // '₉'; '\u{2089}' + '+', // '₊'; '\u{208a}' + '-', // '₋'; '\u{208b}' + '=', // '₌'; '\u{208c}' + '(', // '₍'; '\u{208d}' + ')', // '₎'; '\u{208e}' + '', // ''; '\u{208f}' + 'a', // 'ₐ'; '\u{2090}' + 'e', // 'ₑ'; '\u{2091}' + 'o', // 'ₒ'; '\u{2092}' + 'x', // 'ₓ'; '\u{2093}' + 'e', // 'ₔ'; '\u{2094}' + 'h', // 'ₕ'; '\u{2095}' + 'k', // 'ₖ'; '\u{2096}' + 'l', // 'ₗ'; '\u{2097}' + 'm', // 'ₘ'; '\u{2098}' + 'n', // 'ₙ'; '\u{2099}' + 'p', // 'ₚ'; '\u{209a}' + 's', // 'ₛ'; '\u{209b}' + 't', // 'ₜ'; '\u{209c}' + '', // ''; '\u{209d}' + '', // ''; '\u{209e}' + '', // ''; '\u{209f}' +]; + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function for test assertions. + fn check_conversions(pairs: &[(char, char)]) { + for (original, normalized) in pairs { + assert_eq!(normalize(*original), *normalized); + } + } + + /// General conversion checks + #[test] + fn general() { + check_conversions(&[ + ('ą', 'a'), + ('À', 'A'), + ('ć', 'c'), + ('ę', 'e'), + ('ł', 'l'), + ('ń', 'n'), + ('ó', 'o'), + ('ś', 's'), + ('ź', 'z'), + ('ż', 'z'), + ('Ą', 'A'), + ('Ć', 'C'), + ('Ę', 'E'), + ('ł', 'l'), + ('Ł', 'L'), + ('Ń', 'N'), + ('Ó', 'O'), + ('Ś', 'S'), + ('Ź', 'Z'), + ('Ż', 'Z'), + ('¡', '!'), + ]); + } + + /// Some checks for characters which are not visible. + #[test] + fn invisible_chars() { + check_conversions(&[('\u{a0}', '\u{a0}'), ('\u{ad}', '\u{ad}')]); + } + + /// Check boundary cases in case ranges are modified. + #[test] + fn boundary_cases() { + check_conversions(&[ + ('\u{9f}', '\u{9f}'), + ('\u{a0}', '\u{a0}'), + ('¡', '!'), + ('ʟ', 'L'), + ('\u{2a0}', '\u{2a0}'), + ('\u{1dff}', '\u{1dff}'), + ('Ḁ', 'A'), + ('ỹ', 'y'), + ('\u{1eff}', '\u{1eff}'), + ('\u{1f00}', '\u{1f00}'), + ('⁰', '0'), + ('\u{209c}', 't'), + ('\u{209f}', '\u{209f}'), + ('\u{20a0}', '\u{20a0}'), + ]); + } + + /// Check that conversions outside the blocks are unchanged. + #[test] + fn unchanged_outside_blocks() { + check_conversions(&[ + ('a', 'a'), + ('⟁', '⟁'), + ('┍', '┍'), + ('ω', 'ω'), + ('⁕', '⁕'), + ('ה', 'ה'), + ]); + } +} diff --git a/crates/atuin-nucleo/matcher/src/config.rs b/crates/atuin-nucleo/matcher/src/config.rs new file mode 100644 index 00000000..eca7ae38 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/config.rs @@ -0,0 +1,70 @@ +use crate::chars::CharClass; +use crate::score::BONUS_BOUNDARY; + +/// Configuration data that controls how a matcher behaves +#[non_exhaustive] +#[derive(PartialEq, Eq, Debug, Clone)] +pub struct Config { + /// Characters that act as delimiters and provide bonus + /// for matching the following char + pub(crate) delimiter_chars: &'static [u8], + /// Extra bonus for word boundary after whitespace character or beginning of the string + pub(crate) bonus_boundary_white: u16, + /// Extra bonus for word boundary after slash, colon, semi-colon, and comma + pub(crate) bonus_boundary_delimiter: u16, + pub(crate) initial_char_class: CharClass, + + /// Whether to normalize latin script characters to ASCII (enabled by default) + pub normalize: bool, + /// whether to ignore casing + pub ignore_case: bool, + /// Whether to provide a bonus to matches by their distance from the start + /// of the haystack. The bonus is fairly small compared to the normal gap + /// penalty to avoid messing with the normal score heuristic. This setting + /// is not turned on by default and only recommended for autocompletion + /// usecases where the expectation is that the user is typing the entire + /// match. For a full fzf-like fuzzy matcher/picker word segmentation and + /// explicit prefix literals should be used instead. + pub prefer_prefix: bool, +} + +impl Config { + /// The default config for nucleo, implemented as a constant since + /// Default::default can not be called in a const context + pub const DEFAULT: Self = { + Config { + delimiter_chars: b"/,:;|", + bonus_boundary_white: BONUS_BOUNDARY + 2, + bonus_boundary_delimiter: BONUS_BOUNDARY + 1, + initial_char_class: CharClass::Whitespace, + normalize: true, + ignore_case: true, + prefer_prefix: false, + } + }; +} + +impl Config { + /// Configures the matcher with bonuses appropriate for matching file paths. + pub fn set_match_paths(&mut self) { + if cfg!(windows) { + self.delimiter_chars = b"/:\\"; + } else { + self.delimiter_chars = b"/:"; + } + self.bonus_boundary_white = BONUS_BOUNDARY; + self.initial_char_class = CharClass::Delimiter; + } + + /// Configures the matcher with bonuses appropriate for matching file paths. + pub const fn match_paths(mut self) -> Self { + if cfg!(windows) { + self.delimiter_chars = b"/\\"; + } else { + self.delimiter_chars = b"/"; + } + self.bonus_boundary_white = BONUS_BOUNDARY; + self.initial_char_class = CharClass::Delimiter; + self + } +} diff --git a/crates/atuin-nucleo/matcher/src/debug.rs b/crates/atuin-nucleo/matcher/src/debug.rs new file mode 100644 index 00000000..b8369f32 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/debug.rs @@ -0,0 +1,14 @@ +use crate::matrix::{MatrixCell, ScoreCell}; +use std::fmt::{Debug, Formatter, Result}; + +impl Debug for ScoreCell { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "({}, {})", self.score, self.matched) + } +} + +impl Debug for MatrixCell { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "({}, {})", (self.0 & 1) != 0, (self.0 & 2) != 0) + } +} diff --git a/crates/atuin-nucleo/matcher/src/exact.rs b/crates/atuin-nucleo/matcher/src/exact.rs new file mode 100644 index 00000000..3cb3ceb2 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/exact.rs @@ -0,0 +1,275 @@ +use memchr::memmem; +use memchr::{Memchr, Memchr2}; + +use crate::chars::{AsciiChar, Char}; +use crate::score::{BONUS_FIRST_CHAR_MULTIPLIER, SCORE_MATCH}; +use crate::Matcher; + +impl Matcher { + pub(crate) fn substring_match_1_ascii<const INDICES: bool>( + &mut self, + haystack: &[u8], + c: u8, + indices: &mut Vec<u32>, + ) -> Option<u16> { + let mut max_score = 0; + let mut max_pos = 0; + if self.config.ignore_case && c >= b'a' && c <= b'z' { + for i in Memchr2::new(c, c - 32, haystack) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let char_class = AsciiChar(haystack[i]).char_class(&self.config); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if bonus >= self.config.bonus_boundary_white { + break; + } + } + } + } else { + let char_class = AsciiChar(c).char_class(&self.config); + for i in Memchr::new(c, haystack) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if bonus >= self.config.bonus_boundary_white { + break; + } + } + } + } + if max_score == 0 { + return None; + } + + if INDICES { + indices.push(max_pos); + } + Some(max_score) + } + + pub(crate) fn substring_match_ascii_with_prefilter( + &mut self, + haystack: &[u8], + needle: &[u8], + prefilter_len: usize, + prefilter: impl Iterator<Item = usize>, + ) -> (u16, usize) { + let needle_without_prefilter = &needle[prefilter_len..]; + let mut max_score = 0; + let mut max_pos = 0; + for i in prefilter { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let char_class = AsciiChar(haystack[i]).char_class(&self.config); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score + && haystack[i + prefilter_len..(i + needle.len()).min(haystack.len())] + .iter() + .map(|&c| AsciiChar(c).normalize(&self.config).0) + .eq(needle_without_prefilter.iter().copied()) + { + max_pos = i; + max_score = score; + // can't get better than this + if bonus >= self.config.bonus_boundary_white { + break; + } + } + } + (max_score, max_pos) + } + + pub(crate) fn substring_match_ascii<const INDICES: bool>( + &mut self, + haystack: &[u8], + needle: &[u8], + indices: &mut Vec<u32>, + ) -> Option<u16> { + let mut max_score = 0; + let mut max_pos = 0; + if self.config.ignore_case { + match needle.iter().position(|&c| c >= b'a' && c <= b'z') { + // start with char do case insensitive search + Some(0) => { + (max_score, max_pos) = self.substring_match_ascii_with_prefilter( + haystack, + needle, + 1, + Memchr2::new( + needle[0], + needle[0] - 32, + &haystack[..haystack.len() - needle.len() + 1], + ), + ); + if max_score == 0 { + return None; + } + } + Some(1) => { + (max_score, max_pos) = self.substring_match_ascii_with_prefilter( + haystack, + needle, + 1, + Memchr::new(needle[0], &haystack[..haystack.len() - needle.len() + 1]), + ); + if max_score == 0 { + return None; + } + } + Some(len) => { + (max_score, max_pos) = self.substring_match_ascii_with_prefilter( + haystack, + needle, + 1, + memmem::find_iter(&haystack[..haystack.len() - needle.len() + len], needle), + ); + if max_score == 0 { + return None; + } + } + // in case we don't have any letter in the needle + // we can treat the search as case sensitive and use memmem directly which is way faster + None => (), + } + } + + if max_score == 0 { + let char_class = AsciiChar(needle[0]).char_class(&self.config); + for i in memmem::find_iter(haystack, needle) { + let prev_char_class = i + .checked_sub(1) + .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let bonus = self.config.bonus_for(prev_char_class, char_class); + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i; + max_score = score; + // can't get better than this + if bonus >= self.config.bonus_boundary_white { + break; + } + } + } + if max_score == 0 { + return None; + } + } + let score = self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + max_pos, + max_pos + needle.len(), + indices, + ); + Some(score) + } + + pub(crate) fn substring_match_1_non_ascii<const INDICES: bool>( + &mut self, + haystack: &[char], + needle: char, + start: usize, + indices: &mut Vec<u32>, + ) -> u16 { + let mut max_score = 0; + let mut max_pos = 0; + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + for (i, &c) in haystack[start..].iter().enumerate() { + let (c, char_class) = c.char_class_and_normalize(&self.config); + if c != needle { + continue; + } + let bonus = self.config.bonus_for(prev_class, char_class); + prev_class = char_class; + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score { + max_pos = i as u32; + max_score = score; + // can't get better than this + if bonus >= self.config.bonus_boundary_white { + break; + } + } + } + + if INDICES { + indices.push(max_pos + start as u32); + } + max_score + } + + pub(crate) fn substring_match_non_ascii<const INDICES: bool, N>( + &mut self, + haystack: &[char], + needle: &[N], + start: usize, + indices: &mut Vec<u32>, + ) -> Option<u16> + where + N: Char, + char: PartialEq<N>, + { + let mut max_score = 0; + let mut max_pos = 0; + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let end = haystack.len() - needle.len(); + for (i, &c) in haystack[start..end].iter().enumerate() { + let (c, char_class) = c.char_class_and_normalize(&self.config); + if c != needle[0] { + continue; + } + let bonus = self.config.bonus_for(prev_class, char_class); + prev_class = char_class; + let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; + if score > max_score + && haystack[start + i + 1..start + i + needle.len()] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(needle[1..].iter().copied()) + { + max_pos = i; + max_score = score; + // can't get better than this + if bonus >= self.config.bonus_boundary_white { + break; + } + } + } + if max_score == 0 { + return None; + } + + let score = self.calculate_score::<INDICES, _, _>( + haystack, + needle, + start + max_pos, + start + max_pos + needle.len(), + indices, + ); + Some(score) + } +} diff --git a/crates/atuin-nucleo/matcher/src/fuzzy_greedy.rs b/crates/atuin-nucleo/matcher/src/fuzzy_greedy.rs new file mode 100644 index 00000000..8215bf31 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/fuzzy_greedy.rs @@ -0,0 +1,51 @@ +use crate::chars::Char; +use crate::Matcher; + +impl Matcher { + /// greedy fallback algorithm, much faster (linear time) but reported scores/indicies + /// might not be the best match + pub(crate) fn fuzzy_match_greedy_<const INDICES: bool, H: Char + PartialEq<N>, N: Char>( + &mut self, + haystack: &[H], + needle: &[N], + mut start: usize, + mut end: usize, + indices: &mut Vec<u32>, + ) -> Option<u16> { + let first_char_end = if H::ASCII && N::ASCII { start + 1 } else { end }; + 'nonascii: { + if !H::ASCII || !N::ASCII { + let mut needle_iter = needle[1..].iter().copied(); + if let Some(mut needle_char) = needle_iter.next() { + for (i, &c) in haystack[first_char_end..].iter().enumerate() { + if c.normalize(&self.config) == needle_char { + let Some(next_needle_char) = needle_iter.next() else { + // we found a match so we are now in the same state + // as the prefilter would produce + end = first_char_end + i + 1; + break 'nonascii; + }; + needle_char = next_needle_char; + } + } + // some needle chars were not matched bail out + return None; + } + } + } // minimize the greedly match by greedy matching in reverse + + let mut needle_iter = needle.iter().rev().copied(); + let mut needle_char = needle_iter.next().unwrap(); + for (i, &c) in haystack[start..end].iter().enumerate().rev() { + let c = c.normalize(&self.config); + if c == needle_char { + let Some(next_needle_char) = needle_iter.next() else { + start += i; + break; + }; + needle_char = next_needle_char; + } + } + Some(self.calculate_score::<INDICES, H, N>(haystack, needle, start, end, indices)) + } +} diff --git a/crates/atuin-nucleo/matcher/src/fuzzy_optimal.rs b/crates/atuin-nucleo/matcher/src/fuzzy_optimal.rs new file mode 100644 index 00000000..5d53ecfb --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/fuzzy_optimal.rs @@ -0,0 +1,348 @@ +use std::cmp::max; + +use crate::chars::{Char, CharClass}; +use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell}; +use crate::score::{ + BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS, + PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH, +}; +use crate::{Config, Matcher}; + +impl Matcher { + pub(crate) fn fuzzy_match_optimal<const INDICES: bool, H: Char + PartialEq<N>, N: Char>( + &mut self, + haystack: &[H], + needle: &[N], + start: usize, + greedy_end: usize, + end: usize, + indices: &mut Vec<u32>, + ) -> Option<u16> { + // construct a matrix (and copy the haystack), the matrix and haystack size are bounded + // to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows + // us to treat needle indices as u16 + let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else { + return self.fuzzy_match_greedy_::<INDICES, H, N>( + haystack, needle, start, greedy_end, indices, + ); + }; + + let prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32); + // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects + if !matched { + assert!( + !N::ASCII || !H::ASCII, + "Non-match should have been caught by prefilter. Maybe `needle` is not normalized?" + ); + return None; + } + + // populate the matrix and find the best score + let matrix_len = matrix.populate_matrix::<INDICES, _>(needle); + let last_row_off = matrix.row_offs[needle.len() - 1]; + let relative_last_row_off = last_row_off as usize + 1 - needle.len(); + let (match_end, match_score_cell) = matrix.current_row[relative_last_row_off..] + .iter() + .enumerate() + .max_by_key(|(_, cell)| cell.score) + .expect("there must be atleast one match"); + if INDICES { + matrix.reconstruct_optimal_path(match_end as u16, indices, matrix_len, start as u32); + } + Some(match_score_cell.score) + } +} + +const UNMATCHED: ScoreCell = ScoreCell { + score: 0, + // if matched is true then the consecutive bonus + // is always atleast BONUS_CONSECUTIVE so + // this constant can never occur naturally + consecutive_bonus: 0, + matched: true, +}; + +fn next_m_cell(p_score: u16, bonus: u16, m_cell: ScoreCell) -> ScoreCell { + if m_cell == UNMATCHED { + return ScoreCell { + score: p_score + bonus + SCORE_MATCH, + matched: false, + consecutive_bonus: bonus as u8, + }; + } + + let mut consecutive_bonus = max(m_cell.consecutive_bonus as u16, BONUS_CONSECUTIVE); + if bonus >= BONUS_BOUNDARY && bonus > consecutive_bonus { + consecutive_bonus = bonus + } + + let score_match = m_cell.score + max(consecutive_bonus, bonus); + let score_skip = p_score + bonus; + if score_match > score_skip { + ScoreCell { + score: score_match + SCORE_MATCH, + matched: true, + consecutive_bonus: consecutive_bonus as u8, + } + } else { + ScoreCell { + score: score_skip + SCORE_MATCH, + matched: false, + consecutive_bonus: bonus as u8, + } + } +} + +fn p_score(prev_p_score: u16, prev_m_score: u16) -> (u16, bool) { + let score_match = prev_m_score.saturating_sub(PENALTY_GAP_START); + let score_skip = prev_p_score.saturating_sub(PENALTY_GAP_EXTENSION); + if score_match > score_skip { + (score_match, true) + } else { + (score_skip, false) + } +} + +impl<H: Char> MatcherDataView<'_, H> { + fn setup<const INDICES: bool, N: Char>( + &mut self, + needle: &[N], + mut prev_class: CharClass, + config: &Config, + start: u32, + ) -> bool + where + H: PartialEq<N>, + { + let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut()); + let (mut needle_char, mut row_start) = row_iter.next().unwrap(); + + let col_iter = self + .haystack + .iter_mut() + .zip(self.bonus.iter_mut()) + .enumerate(); + + let mut matched = false; + for (i, (c_, bonus_)) in col_iter { + let (c, class) = c_.char_class_and_normalize(config); + *c_ = c; + + let bonus = config.bonus_for(prev_class, class); + // save bonus for later so we don't have to recompute it each time + *bonus_ = bonus as u8; + prev_class = class; + + let i = i as u16; + if c == needle_char { + // save the first idx of each char + if let Some(next) = row_iter.next() { + *row_start = i; + (needle_char, row_start) = next; + } else if !matched { + *row_start = i; + // we have atleast one match + matched = true; + } + } + } + if !matched { + return false; + } + debug_assert_eq!(self.row_offs[0], 0); + Self::score_row::<true, INDICES, _>( + self.current_row, + self.matrix_cells, + self.haystack, + self.bonus, + 0, + self.row_offs[1], + 0, + needle[0], + needle[1], + if config.prefer_prefix { + if start == 0 { + MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE + } else { + (MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub( + (start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION, + ) + } + } else { + 0 + }, + ); + true + } + + #[allow(clippy::too_many_arguments)] + fn score_row<const FIRST_ROW: bool, const INDICES: bool, N: Char>( + current_row: &mut [ScoreCell], + matrix_cells: &mut [MatrixCell], + haystack: &[H], + bonus: &[u8], + row_off: u16, + mut next_row_off: u16, + needle_idx: u16, + needle_char: N, + next_needle_char: N, + mut prefix_bonus: u16, + ) where + H: PartialEq<N>, + { + next_row_off -= 1; + let relative_row_off = row_off - needle_idx; + let next_relative_row_off = next_row_off - needle_idx; + let skipped_col_iter = haystack[row_off as usize..next_row_off as usize] + .iter() + .zip(bonus[row_off as usize..next_row_off as usize].iter()) + .zip(current_row[relative_row_off as usize..next_relative_row_off as usize].iter_mut()) + .zip(matrix_cells.iter_mut()); + let mut prev_p_score = 0; + let mut prev_m_score = 0; + for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter { + let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); + let m_cell = if FIRST_ROW { + let cell = if c == needle_char { + ScoreCell { + score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER + + SCORE_MATCH + + prefix_bonus / PREFIX_BONUS_SCALE, + matched: false, + consecutive_bonus: *bonus, + } + } else { + UNMATCHED + }; + prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); + cell + } else { + *score_cell + }; + if INDICES { + matrix_cell.set(p_matched, m_cell.matched); + } + prev_p_score = p_score; + prev_m_score = m_cell.score; + } + let col_iter = haystack[next_row_off as usize..] + .windows(2) + .zip(bonus[next_row_off as usize..].windows(2)) + .zip(current_row[next_relative_row_off as usize..].iter_mut()) + .zip(matrix_cells[(next_relative_row_off - relative_row_off) as usize..].iter_mut()); + for (((c, bonus), score_cell), matrix_cell) in col_iter { + let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); + let m_cell = if FIRST_ROW { + let cell = if c[0] == needle_char { + ScoreCell { + score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER + + SCORE_MATCH + + prefix_bonus / PREFIX_BONUS_SCALE, + matched: false, + consecutive_bonus: bonus[0], + } + } else { + UNMATCHED + }; + prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); + cell + } else { + *score_cell + }; + *score_cell = if c[1] == next_needle_char { + next_m_cell(p_score, bonus[1] as u16, m_cell) + } else { + UNMATCHED + }; + if INDICES { + matrix_cell.set(p_matched, m_cell.matched); + } + prev_p_score = p_score; + prev_m_score = m_cell.score; + } + } + + fn populate_matrix<const INDICES: bool, N: Char>(&mut self, needle: &[N]) -> usize + where + H: PartialEq<N>, + { + let mut matrix_cells = &mut self.matrix_cells[self.current_row.len()..]; + let mut row_iter = needle[1..] + .iter() + .copied() + .zip(self.row_offs[1..].iter().copied()) + .enumerate(); + let (mut needle_idx, (mut needle_char, mut row_off)) = row_iter.next().unwrap(); + for (next_needle_idx, (next_needle_char, next_row_off)) in row_iter { + Self::score_row::<false, INDICES, _>( + self.current_row, + matrix_cells, + self.haystack, + self.bonus, + row_off, + next_row_off, + needle_idx as u16 + 1, + needle_char, + next_needle_char, + 0, + ); + let len = self.current_row.len() + needle_idx + 1 - row_off as usize; + matrix_cells = &mut matrix_cells[len..]; + (needle_idx, needle_char, row_off) = (next_needle_idx, next_needle_char, next_row_off); + } + matrix_cells.as_ptr() as usize - self.matrix_cells.as_ptr() as usize + } + + fn reconstruct_optimal_path( + &self, + max_score_end: u16, + indices: &mut Vec<u32>, + matrix_len: usize, + start: u32, + ) { + let indices_start = indices.len(); + indices.resize(indices_start + self.row_offs.len(), 0); + let indices = &mut indices[indices_start..]; + let last_row_off = *self.row_offs.last().unwrap(); + indices[self.row_offs.len() - 1] = start + max_score_end as u32 + last_row_off as u32; + + let mut matrix_cells = &self.matrix_cells[..matrix_len]; + let width = self.current_row.len(); + let mut row_iter = self.row_offs[..self.row_offs.len() - 1] + .iter() + .copied() + .enumerate() + .rev() + .map(|(i, off)| { + let relative_off = off as usize - i; + let row; + (matrix_cells, row) = + matrix_cells.split_at(matrix_cells.len() - (width - relative_off)); + (i, off, row) + }); + let (mut row_idx, mut row_off, mut row) = row_iter.next().unwrap(); + let mut col = max_score_end; + let relative_last_row_off = last_row_off as usize + 1 - self.row_offs.len(); + let mut matched = self.current_row[col as usize + relative_last_row_off].matched; + col += last_row_off - row_off - 1; + loop { + if matched { + indices[row_idx] = start + col as u32 + row_off as u32; + } + let next_matched = row[col as usize].get(matched); + if matched { + let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else { + break; + }; + col += row_off - next_row_off; + (row_idx, row_off, row) = (next_row_idx, next_row_off, next_row) + } + col -= 1; + matched = next_matched; + } + } +} diff --git a/crates/atuin-nucleo/matcher/src/lib.rs b/crates/atuin-nucleo/matcher/src/lib.rs new file mode 100644 index 00000000..3e8874c5 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/lib.rs @@ -0,0 +1,780 @@ +/*! +`nucleo_matcher` is a low level crate that contains the matcher implementation +used by the high level `nucleo` crate. + +**NOTE**: If you are building an fzf-like interactive fuzzy finder that is +meant to match a reasonably large number of items (> 100) using the high level +`nucleo` crate is highly recommended. Using `nucleo-matcher` directly in you ui +loop will be very slow. Implementing this logic yourself is very complex. + +The matcher is hightly optimized and can significantly outperform `fzf` and +`skim` (the `fuzzy-matcher` crate). However some of these optimizations require +a slightly less convenient API. Be sure to carefully read the documentation of +the [`Matcher`] to avoid unexpected behaviour. +# Examples + +For almost all usecases the [`pattern`] API should be used instead of calling +the matcher methods directly. [`Pattern::parse`](pattern::Pattern::parse) will +construct a single Atom (a single match operation) for each word. The pattern +can contain special characters to control what kind of match is performed (see +[`AtomKind`](crate::pattern::AtomKind)). + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching}; +let paths = ["foo/bar", "bar/foo", "foobar"]; +let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); +let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); +let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]); +``` + +If the pattern should be matched literally (without this special parsing) +[`Pattern::new`](pattern::Pattern::new) can be used instead. + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization}; +let paths = ["foo/bar", "bar/foo", "foobar"]; +let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); +let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); +let paths = ["^foo/bar", "bar/^foo", "foobar"]; +let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]); +``` + +Word segmentation is performed automatically on any unescaped character for which [`is_whitespace`](char::is_whitespace) returns true. +This is relevant, for instance, with non-english keyboard input. + +``` +# use nucleo_matcher::pattern::{Atom, Pattern, Normalization, CaseMatching}; +assert_eq!( + // double-width 'Ideographic Space', i.e. `'\u{3000}'` + Pattern::parse("ほげ ふが", CaseMatching::Smart, Normalization::Smart).atoms, + vec![ + Atom::parse("ほげ", CaseMatching::Smart, Normalization::Smart), + Atom::parse("ふが", CaseMatching::Smart, Normalization::Smart), + ], +); +``` + +If word segmentation is also not desired, a single `Atom` can be constructed directly. + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind}; +let paths = ["foobar", "foo bar"]; +let mut matcher = Matcher::new(Config::DEFAULT); +let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo bar", 192)]); +``` + + +# Status + +Nucleo is used in the helix-editor and therefore has a large user base with lots or real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use, breaking changes should be very rare (a 1.0 release should not be far away). + +*/ + +// sadly ranges don't optmimzie well +#![allow(clippy::manual_range_contains)] +#![warn(missing_docs)] + +pub mod chars; +mod config; +#[cfg(test)] +mod debug; +mod exact; +mod fuzzy_greedy; +mod fuzzy_optimal; +mod matrix; +pub mod pattern; +mod prefilter; +mod score; +mod utf32_str; + +#[cfg(test)] +mod tests; + +pub use crate::config::Config; +pub use crate::utf32_str::{Utf32Str, Utf32String}; + +use crate::chars::{AsciiChar, Char}; +use crate::matrix::MatrixSlab; + +/// A matcher engine that can execute (fuzzy) matches. +/// +/// A matches contains **heap allocated** scratch memory that is reused during +/// matching. This scratch memory allows the matcher to guarantee that it will +/// **never allocate** during matching (with the exception of pushing to the +/// `indices` vector if there isn't enough capacity). However this scratch +/// memory is fairly large (around 135KB) so creating a matcher is expensive. +/// +/// All `.._match` functions will not compute the indices of the matched +/// characters. These should be used to prefilter to filter and rank all +/// matches. All `.._indices` functions will also compute the indices of the +/// matched characters but are slower compared to the `..match` variant. These +/// should be used when rendering the best N matches. Note that the `indices` +/// argument is **never cleared**. This allows running multiple different +/// matches on the same haystack and merging the indices by sorting and +/// deduplicating the vector. +/// +/// The `needle` argument for each function must always be normalized by the +/// caller (unicode normalization and case folding). Otherwise, the matcher +/// may fail to produce a match. The [`pattern`] modules provides utilities +/// to preprocess needles and **should usually be preferred over invoking the +/// matcher directly**. Additionally it's recommend to perform separate matches +/// for each word in the needle. Consider the folloling example: +/// +/// If `foo bar` is used as the needle it matches both `foo test baaar` and +/// `foo hello-world bar`. However, `foo test baaar` will receive a higher +/// score than `foo hello-world bar`. `baaar` contains a 2 character gap which +/// will receive a penalty and therefore the user will likely expect it to rank +/// lower. However, if `foo bar` is matched as a single query `hello-world` and +/// `test` are both considered gaps too. As `hello-world` is a much longer gap +/// then `test` the extra penalty for `baaar` is canceled out. If both words +/// are matched individually the interspersed words do not receive a penalty and +/// `foo hello-world bar` ranks higher. +/// +/// In general nucleo is a **substring matching tool** (except for the prefix/ +/// postfix matching modes) with no penalty assigned to matches that start +/// later within the same pattern (which enables matching words individually +/// as shown above). If patterns show a large variety in length and the syntax +/// described above is not used it may be preferable to give preference to +/// matches closer to the start of a haystack. To accommodate that usecase the +/// [`prefer_prefix`](Config::prefer_prefix) option can be set to true. +/// +/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than +/// that the matcher **will panic**. The caller must decide whether it wants to +/// filter out long haystacks or truncate them. +pub struct Matcher { + #[allow(missing_docs)] + pub config: Config, + slab: MatrixSlab, +} + +// this is just here for convenience not sure if we should implement this +impl Clone for Matcher { + fn clone(&self) -> Self { + Matcher { + config: self.config.clone(), + slab: MatrixSlab::new(), + } + } +} + +impl std::fmt::Debug for Matcher { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Matcher") + .field("config", &self.config) + .finish_non_exhaustive() + } +} + +impl Default for Matcher { + fn default() -> Self { + Matcher { + config: Config::DEFAULT, + slab: MatrixSlab::new(), + } + } +} + +impl Matcher { + /// Creates a new matcher instance, note that this will eagerly allocate a + /// fairly large chunk of heap memory (around 135KB currently but subject to + /// change) so matchers should be reused if called often (like in a loop). + pub fn new(config: Config) -> Self { + Self { + config, + slab: MatrixSlab::new(), + } + } + + /// Find the fuzzy match with the highest score in the `haystack`. + /// + /// This functions has `O(mn)` time complexity for short inputs. + /// To avoid slowdowns it automatically falls back to + /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large + /// needles and haystacks. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new()) + } + + /// Find the fuzzy match with the highest score in the `haystack` and + /// compute its indices. + /// + /// This functions has `O(mn)` time complexity for short inputs. To + /// avoid slowdowns it automatically falls back to + /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large needles + /// and haystacks + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::<true>(haystack, needle, indices) + } + + fn fuzzy_matcher_impl<const INDICES: bool>( + &mut self, + haystack_: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() > haystack_.len() { + return None; + } + if needle_.is_empty() { + return Some(0); + } + if needle_.len() == haystack_.len() { + return self.exact_match_impl::<INDICES>( + haystack_, + needle_, + 0, + haystack_.len(), + indices, + ); + } + assert!( + haystack_.len() <= u32::MAX as usize, + "fuzzy matching is only support for up to 2^32-1 codepoints" + ); + match (haystack_, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices); + } + let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; + if needle_.len() == end - start { + return Some(self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + )); + } + self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::<INDICES>( + haystack, + needle as char, + start, + indices, + ); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + if needle_.len() == end - start { + return self + .exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices); + } + self.fuzzy_match_optimal::<INDICES, char, AsciiChar>( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + end, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + if needle_.len() == end - start { + return self + .exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices); + } + self.fuzzy_match_optimal::<INDICES, char, char>( + haystack, + needle, + start, + start + 1, + end, + indices, + ) + } + } + } + + /// Greedly find a fuzzy match in the `haystack`. + /// + /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) + /// indices and scores. Usually [fuzzy_match](crate::Matcher::fuzzy_match) should + /// be preferred. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_match_greedy( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + ) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_match_greedy_impl::<false>(haystack, needle, &mut Vec::new()) + } + + /// Greedly find a fuzzy match in the `haystack` and compute its indices. + /// + /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) + /// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should + /// be preferred. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_indices_greedy( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_match_greedy_impl::<true>(haystack, needle, indices) + } + + fn fuzzy_match_greedy_impl<const INDICES: bool>( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() > haystack.len() { + return None; + } + if needle_.is_empty() { + return Some(0); + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; + if needle_.len() == greedy_end - start { + return Some(self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + )); + } + self.fuzzy_match_greedy_::<INDICES, AsciiChar, AsciiChar>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + self.fuzzy_match_greedy_::<INDICES, char, AsciiChar>( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + self.fuzzy_match_greedy_::<INDICES, char, char>( + haystack, + needle, + start, + start + 1, + indices, + ) + } + } + } + + /// Finds the substring match with the highest score in the `haystack`. + /// + /// This functions has `O(nm)` time complexity. However many cases can + /// be significantly accelerated using prefilters so it's usually very fast + /// in practice. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn substring_match( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + ) -> Option<u16> { + self.substring_match_impl::<false>(haystack, needle_, &mut Vec::new()) + } + + /// Finds the substring match with the highest score in the `haystack` and + /// compute its indices. + /// + /// This functions has `O(nm)` time complexity. However many cases can + /// be significantly accelerated using prefilters so it's usually fast + /// in practice. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn substring_indices( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + self.substring_match_impl::<true>(haystack, needle_, indices) + } + + fn substring_match_impl<const INDICES: bool>( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() > haystack.len() { + return None; + } + if needle_.is_empty() { + return Some(0); + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices); + } + self.substring_match_ascii::<INDICES>(haystack, needle, indices) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::<INDICES>( + haystack, + needle as char, + start, + indices, + ); + return Some(res); + } + let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.substring_match_non_ascii::<INDICES, _>( + haystack, + AsciiChar::cast(needle), + start, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices); + return Some(res); + } + let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.substring_match_non_ascii::<INDICES, _>(haystack, needle, start, indices) + } + } + } + + /// Checks whether needle and haystack match exactly. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + let mut trailing_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if !needle.last().is_whitespace() { + trailing_space = haystack.trailing_white_space() + } + // avoid wraparound in size check + if trailing_space == haystack.len() { + return None; + } + self.exact_match_impl::<false>( + haystack, + needle, + leading_space, + haystack.len() - trailing_space, + &mut Vec::new(), + ) + } + + /// Checks whether needle and haystack match exactly and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn exact_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + let mut trailing_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if !needle.last().is_whitespace() { + trailing_space = haystack.trailing_white_space() + } + // avoid wraparound in size check + if trailing_space == haystack.len() { + return None; + } + self.exact_match_impl::<true>( + haystack, + needle, + leading_space, + haystack.len() - trailing_space, + indices, + ) + } + + /// Checks whether needle is a prefix of the haystack. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if haystack.len() - leading_space < needle.len() { + None + } else { + self.exact_match_impl::<false>( + haystack, + needle, + leading_space, + needle.len() + leading_space, + &mut Vec::new(), + ) + } + } + + /// Checks whether needle is a prefix of the haystack and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn prefix_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if haystack.len() - leading_space < needle.len() { + None + } else { + self.exact_match_impl::<true>( + haystack, + needle, + leading_space, + needle.len() + leading_space, + indices, + ) + } + } + + /// Checks whether needle is a postfix of the haystack. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut trailing_spaces = 0; + if !needle.last().is_whitespace() { + trailing_spaces = haystack.trailing_white_space() + } + if haystack.len() - trailing_spaces < needle.len() { + None + } else { + self.exact_match_impl::<false>( + haystack, + needle, + haystack.len() - needle.len() - trailing_spaces, + haystack.len() - trailing_spaces, + &mut Vec::new(), + ) + } + } + + /// Checks whether needle is a postfix of the haystack and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn postfix_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut trailing_spaces = 0; + if !needle.last().is_whitespace() { + trailing_spaces = haystack.trailing_white_space() + } + if haystack.len() - trailing_spaces < needle.len() { + None + } else { + self.exact_match_impl::<true>( + haystack, + needle, + haystack.len() - needle.len() - trailing_spaces, + haystack.len() - trailing_spaces, + indices, + ) + } + } + + fn exact_match_impl<const INDICES: bool>( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + start: usize, + end: usize, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() != end - start { + return None; + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + let score = match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let matched = if self.config.ignore_case { + AsciiChar::cast(haystack)[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))) + } else { + &haystack[start..end] == needle + }; + if !matched { + return None; + } + self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + return None; + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + let matched = haystack[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))); + if !matched { + return None; + } + + self.calculate_score::<INDICES, _, _>( + haystack, + AsciiChar::cast(needle), + start, + end, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let matched = haystack[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(needle.iter().map(|c| c.normalize(&self.config))); + if !matched { + return None; + } + self.calculate_score::<INDICES, _, _>(haystack, needle, start, end, indices) + } + }; + Some(score) + } +} diff --git a/crates/atuin-nucleo/matcher/src/matrix.rs b/crates/atuin-nucleo/matcher/src/matrix.rs new file mode 100644 index 00000000..a91ed95f --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/matrix.rs @@ -0,0 +1,198 @@ +use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout}; +use std::marker::PhantomData; +use std::mem::size_of; +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::ptr::{slice_from_raw_parts_mut, NonNull}; + +use crate::chars::Char; + +const MAX_MATRIX_SIZE: usize = 100 * 1024; // 100*1024 = 100KB + +// these two aren't hard maxima, instead we simply allow whatever will fit into memory +const MAX_HAYSTACK_LEN: usize = 2048; // 64KB +const MAX_NEEDLE_LEN: usize = 2048; // 64KB + +struct MatrixLayout<C: Char> { + haystack_len: usize, + needle_len: usize, + layout: Layout, + haystack_off: usize, + bonus_off: usize, + rows_off: usize, + score_off: usize, + matrix_off: usize, + _phantom: PhantomData<C>, +} +impl<C: Char> MatrixLayout<C> { + fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout<C> { + assert!(haystack_len >= needle_len); + assert!(haystack_len <= u32::MAX as usize); + let mut layout = Layout::from_size_align(0, 1).unwrap(); + let haystack_layout = Layout::array::<C>(haystack_len).unwrap(); + let bonus_layout = Layout::array::<u8>(haystack_len).unwrap(); + let rows_layout = Layout::array::<u16>(needle_len).unwrap(); + let score_layout = Layout::array::<ScoreCell>(haystack_len + 1 - needle_len).unwrap(); + let matrix_layout = + Layout::array::<MatrixCell>((haystack_len + 1 - needle_len) * needle_len).unwrap(); + + let haystack_off; + (layout, haystack_off) = layout.extend(haystack_layout).unwrap(); + let bonus_off; + (layout, bonus_off) = layout.extend(bonus_layout).unwrap(); + let rows_off; + (layout, rows_off) = layout.extend(rows_layout).unwrap(); + let score_off; + (layout, score_off) = layout.extend(score_layout).unwrap(); + let matrix_off; + (layout, matrix_off) = layout.extend(matrix_layout).unwrap(); + MatrixLayout { + haystack_len, + needle_len, + layout, + haystack_off, + bonus_off, + rows_off, + score_off, + matrix_off, + _phantom: PhantomData, + } + } + /// # Safety + /// + /// `ptr` must point at an allocated with MARTIX_ALLOC_LAYOUT + #[allow(clippy::type_complexity)] + unsafe fn fieds_from_ptr( + &self, + ptr: NonNull<u8>, + ) -> ( + *mut [C], + *mut [u8], + *mut [u16], + *mut [ScoreCell], + *mut [MatrixCell], + ) { + let base = ptr.as_ptr(); + let haystack = base.add(self.haystack_off) as *mut C; + let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len); + let bonus = base.add(self.bonus_off); + let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len); + let rows = base.add(self.rows_off) as *mut u16; + let rows = slice_from_raw_parts_mut(rows, self.needle_len); + let cells = base.add(self.score_off) as *mut ScoreCell; + let cells = slice_from_raw_parts_mut(cells, self.haystack_len + 1 - self.needle_len); + let matrix = base.add(self.matrix_off) as *mut MatrixCell; + let matrix = slice_from_raw_parts_mut( + matrix, + (self.haystack_len + 1 - self.needle_len) * self.haystack_len, + ); + (haystack, bonus, rows, cells, matrix) + } +} + +const _SIZE_CHECK: () = { + if size_of::<ScoreCell>() != 8 { + panic!() + } +}; + +// make this act like a u64 +#[repr(align(8))] +#[derive(Clone, Copy, PartialEq, Eq)] +pub(crate) struct ScoreCell { + pub score: u16, + pub consecutive_bonus: u8, + pub matched: bool, +} + +pub(crate) struct MatcherDataView<'a, C: Char> { + pub haystack: &'a mut [C], + // stored as a separate array instead of struct + // to avoid padding since char is too large and u8 too small :/ + pub bonus: &'a mut [u8], + pub current_row: &'a mut [ScoreCell], + pub row_offs: &'a mut [u16], + pub matrix_cells: &'a mut [MatrixCell], +} +#[repr(transparent)] +pub struct MatrixCell(pub(crate) u8); + +impl MatrixCell { + pub fn set(&mut self, p_match: bool, m_match: bool) { + self.0 = p_match as u8 | ((m_match as u8) << 1); + } + + pub fn get(&self, m_matrix: bool) -> bool { + let mask = m_matrix as u8 + 1; + (self.0 & mask) != 0 + } +} + +// we only use this to construct the layout for the slab allocation +#[allow(unused)] +struct MatcherData { + haystack: [char; MAX_HAYSTACK_LEN], + bonus: [u8; MAX_HAYSTACK_LEN], + row_offs: [u16; MAX_NEEDLE_LEN], + scratch_space: [ScoreCell; MAX_HAYSTACK_LEN], + matrix: [u8; MAX_MATRIX_SIZE], +} + +pub(crate) struct MatrixSlab(NonNull<u8>); +unsafe impl Sync for MatrixSlab {} +unsafe impl Send for MatrixSlab {} +impl UnwindSafe for MatrixSlab {} +impl RefUnwindSafe for MatrixSlab {} + +impl MatrixSlab { + pub fn new() -> Self { + let layout = Layout::new::<MatcherData>(); + // safety: the matrix is never zero sized (hardcoded constants) + let ptr = unsafe { alloc_zeroed(layout) }; + let Some(ptr) = NonNull::new(ptr) else { + handle_alloc_error(layout) + }; + MatrixSlab(ptr.cast()) + } + + pub(crate) fn alloc<C: Char>( + &mut self, + haystack_: &[C], + needle_len: usize, + ) -> Option<MatcherDataView<'_, C>> { + let cells = haystack_.len() * needle_len; + if cells > MAX_MATRIX_SIZE + || haystack_.len() > u16::MAX as usize + // ensures that scores never overflow + || needle_len > MAX_NEEDLE_LEN + { + return None; + } + let matrix_layout = MatrixLayout::<C>::new(haystack_.len(), needle_len); + if matrix_layout.layout.size() > size_of::<MatcherData>() { + return None; + } + unsafe { + // safely: this allocation is valid for MATRIX_ALLOC_LAYOUT + let (haystack, bonus, rows, current_row, matrix_cells) = + matrix_layout.fieds_from_ptr(self.0); + // copy haystack before creating references to ensure we don't create + // references to invalid chars (which may or may not be UB) + haystack_ + .as_ptr() + .copy_to_nonoverlapping(haystack as *mut _, haystack_.len()); + Some(MatcherDataView { + haystack: &mut *haystack, + row_offs: &mut *rows, + bonus: &mut *bonus, + current_row: &mut *current_row, + matrix_cells: &mut *matrix_cells, + }) + } + } +} + +impl Drop for MatrixSlab { + fn drop(&mut self) { + unsafe { dealloc(self.0.as_ptr(), Layout::new::<MatcherData>()) }; + } +} diff --git a/crates/atuin-nucleo/matcher/src/pattern.rs b/crates/atuin-nucleo/matcher/src/pattern.rs new file mode 100644 index 00000000..495feede --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/pattern.rs @@ -0,0 +1,566 @@ +//! This module provides a slightly higher level API for matching strings. + +use std::cmp::Reverse; + +use crate::{chars, Matcher, Utf32Str}; + +#[cfg(test)] +mod tests; + +use crate::Utf32String; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[non_exhaustive] +/// How to treat a case mismatch between two characters. +pub enum CaseMatching { + /// Characters never match their case folded version (`a != A`). + #[cfg_attr(not(feature = "unicode-casefold"), default)] + Respect, + /// Characters always match their case folded version (`a == A`). + #[cfg(feature = "unicode-casefold")] + Ignore, + /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are + /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise. + #[default] + #[cfg(feature = "unicode-casefold")] + Smart, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[non_exhaustive] +/// How to handle unicode normalization, +pub enum Normalization { + /// Characters never match their normalized version (`a != ä`). + #[cfg_attr(not(feature = "unicode-normalization"), default)] + Never, + /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom + /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`). + #[default] + #[cfg(feature = "unicode-normalization")] + Smart, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[non_exhaustive] +/// The kind of matching algorithm to run for an atom. +pub enum AtomKind { + /// Fuzzy matching where the needle must match any haystack characters + /// (match can contain gaps). This atom kind is used by default if no + /// special syntax is used. There is no negated fuzzy matching (too + /// many false positives). + /// + /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match). + Fuzzy, + /// The needle must match a contiguous sequence of haystack characters + /// without gaps. This atom kind is parsed from the following syntax: + /// `'foo` and `!foo` (negated). + /// + /// See also [`Matcher::substring_match`](crate::Matcher::substring_match). + Substring, + /// The needle must match all leading haystack characters without gaps or + /// prefix. This atom kind is parsed from the following syntax: `^foo` and + /// `!^foo` (negated). + /// + /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match). + Prefix, + /// The needle must match all trailing haystack characters without gaps or + /// postfix. This atom kind is parsed from the following syntax: `foo$` and + /// `!foo$` (negated). + /// + /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match). + Postfix, + /// The needle must match all haystack characters without gaps or prefix. + /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$` + /// (negated). + /// + /// See also [`Matcher::exact_match`](crate::Matcher::exact_match). + Exact, +} + +/// A single pattern component that is matched with a single [`Matcher`] function +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Atom { + /// Whether this pattern atom is a negative match. + /// A negative pattern atom will prevent haystacks matching it from + /// being matchend. It does not contribute to scoring/indices + pub negative: bool, + /// The kind of match that this pattern performs + pub kind: AtomKind, + needle: Utf32String, + ignore_case: bool, + normalize: bool, +} + +impl Atom { + /// Creates a single [`Atom`] from a string by performing unicode + /// normalization and case folding (if necessary). Optionally `\ ` can + /// be escaped to ` `. + pub fn new( + needle: &str, + case: CaseMatching, + normalize: Normalization, + kind: AtomKind, + escape_whitespace: bool, + ) -> Atom { + Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false) + } + + fn new_inner( + needle: &str, + case: CaseMatching, + normalization: Normalization, + kind: AtomKind, + escape_whitespace: bool, + append_dollar: bool, + ) -> Atom { + let mut ignore_case; + let mut normalize; + #[cfg(feature = "unicode-normalization")] + { + normalize = matches!(normalization, Normalization::Smart); + } + #[cfg(not(feature = "unicode-normalization"))] + { + normalize = false; + } + let needle = if needle.is_ascii() { + let mut needle = if escape_whitespace { + if let Some((start, rem)) = needle.split_once("\\ ") { + let mut needle = start.to_owned(); + for rem in rem.split("\\ ") { + needle.push(' '); + needle.push_str(rem); + } + needle + } else { + needle.to_owned() + } + } else { + needle.to_owned() + }; + + match case { + #[cfg(feature = "unicode-casefold")] + CaseMatching::Ignore => { + ignore_case = true; + needle.make_ascii_lowercase() + } + #[cfg(feature = "unicode-casefold")] + CaseMatching::Smart => { + ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) + } + CaseMatching::Respect => ignore_case = false, + } + if append_dollar { + needle.push('$'); + } + Utf32String::Ascii(needle.into_boxed_str()) + } else { + let mut needle_ = Vec::with_capacity(needle.len()); + #[cfg(feature = "unicode-casefold")] + { + ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); + } + #[cfg(not(feature = "unicode-casefold"))] + { + ignore_case = false; + } + #[cfg(feature = "unicode-normalization")] + { + normalize = matches!(normalization, Normalization::Smart); + } + if escape_whitespace { + let mut saw_backslash = false; + for mut c in chars::graphemes(needle) { + if saw_backslash { + if c == ' ' { + needle_.push(' '); + saw_backslash = false; + continue; + } else { + needle_.push('\\'); + } + } + saw_backslash = c == '\\'; + match case { + #[cfg(feature = "unicode-casefold")] + CaseMatching::Ignore => c = chars::to_lower_case(c), + #[cfg(feature = "unicode-casefold")] + CaseMatching::Smart => { + ignore_case = ignore_case && !chars::is_upper_case(c) + } + CaseMatching::Respect => (), + } + match normalization { + #[cfg(feature = "unicode-normalization")] + Normalization::Smart => { + normalize = normalize && chars::normalize(c) == c; + } + Normalization::Never => (), + } + needle_.push(c); + } + } else { + let chars = chars::graphemes(needle).map(|mut c| { + match case { + #[cfg(feature = "unicode-casefold")] + CaseMatching::Ignore => c = chars::to_lower_case(c), + #[cfg(feature = "unicode-casefold")] + CaseMatching::Smart => { + ignore_case = ignore_case && !chars::is_upper_case(c); + } + CaseMatching::Respect => (), + } + match normalization { + #[cfg(feature = "unicode-normalization")] + Normalization::Smart => { + normalize = normalize && chars::normalize(c) == c; + } + Normalization::Never => (), + } + c + }); + needle_.extend(chars); + }; + if append_dollar { + needle_.push('$'); + } + Utf32String::Unicode(needle_.into_boxed_slice()) + }; + Atom { + kind, + needle, + negative: false, + ignore_case, + normalize, + } + } + + /// Parse a pattern atom from a string. Some special trailing and leading + /// characters can be used to control the atom kind. See [`AtomKind`] for + /// details. + pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom { + let mut atom = raw; + let invert = match atom.as_bytes() { + [b'!', ..] => { + atom = &atom[1..]; + true + } + [b'\\', b'!', ..] => { + atom = &atom[1..]; + false + } + _ => false, + }; + + let mut kind = match atom.as_bytes() { + [b'^', ..] => { + atom = &atom[1..]; + AtomKind::Prefix + } + [b'\'', ..] => { + atom = &atom[1..]; + AtomKind::Substring + } + [b'\\', b'^' | b'\'', ..] => { + atom = &atom[1..]; + AtomKind::Fuzzy + } + _ => AtomKind::Fuzzy, + }; + + let mut append_dollar = false; + match atom.as_bytes() { + [.., b'\\', b'$'] => { + append_dollar = true; + atom = &atom[..atom.len() - 2] + } + [.., b'$'] => { + kind = if kind == AtomKind::Fuzzy { + AtomKind::Postfix + } else { + AtomKind::Exact + }; + atom = &atom[..atom.len() - 1] + } + _ => (), + } + + if invert && kind == AtomKind::Fuzzy { + kind = AtomKind::Substring + } + + let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar); + pattern.negative = invert; + pattern + } + + /// Matches this pattern against `haystack` (using the allocation and configuration + /// from `matcher`) and calculates a ranking score. See the [`Matcher`]. + /// Documentation for more details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> { + matcher.config.ignore_case = self.ignore_case; + matcher.config.normalize = self.normalize; + let pattern_score = match self.kind { + AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), + AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), + AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), + AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), + AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), + }; + if self.negative { + if pattern_score.is_some() { + return None; + } + Some(0) + } else { + pattern_score + } + } + + /// Matches this pattern against `haystack` (using the allocation and + /// configuration from `matcher`), calculates a ranking score and the match + /// indices. See the [`Matcher`]. Documentation for more + /// details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + /// + /// *Note:* The `indices` vector is not cleared by this function. + pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec<u32>, + ) -> Option<u16> { + matcher.config.ignore_case = self.ignore_case; + matcher.config.normalize = self.normalize; + if self.negative { + let pattern_score = match self.kind { + AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), + AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), + AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), + AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), + AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), + }; + pattern_score.is_none().then_some(0) + } else { + match self.kind { + AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices), + AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices), + AtomKind::Substring => { + matcher.substring_indices(haystack, self.needle.slice(..), indices) + } + AtomKind::Prefix => { + matcher.prefix_indices(haystack, self.needle.slice(..), indices) + } + AtomKind::Postfix => { + matcher.postfix_indices(haystack, self.needle.slice(..), indices) + } + } + } + } + + /// Returns the needle text that is passed to the matcher. All indices + /// produced by the `indices` functions produce char indices used to index + /// this text + pub fn needle_text(&self) -> Utf32Str<'_> { + self.needle.slice(..) + } + /// Convenience function to easily match (and sort) a (relatively small) + /// list of inputs. + /// + /// *Note* This function is not recommended for building a full fuzzy + /// matching application that can match large numbers of matches (like all + /// files in a directory) as all matching is done on the current thread, + /// effectively blocking the UI. For such applications the high level + /// `nucleo` crate can be used instead. + pub fn match_list<T: AsRef<str>>( + &self, + items: impl IntoIterator<Item = T>, + matcher: &mut Matcher, + ) -> Vec<(T, u16)> { + if self.needle.is_empty() { + return items.into_iter().map(|item| (item, 0)).collect(); + } + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(_, score)| Reverse(*score)); + items + } +} + +fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ { + let mut saw_backslash = false; + pattern.split(move |c| { + saw_backslash = match c { + c if c.is_whitespace() && !saw_backslash => return true, + '\\' => true, + _ => false, + }; + false + }) +} + +#[derive(Debug, Default)] +/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom). +#[non_exhaustive] +pub struct Pattern { + /// The individual pattern (words) in this pattern + pub atoms: Vec<Atom>, +} + +impl Pattern { + /// Creates a pattern where each word is matched individually (whitespaces + /// can be escaped with `\`). Otherwise no parsing is performed (so `$`, `!`, + /// `'` and `^` don't receive special treatment). If you want to match the entire + /// pattern as a single needle use a single [`Atom`] instead. + pub fn new( + pattern: &str, + case_matching: CaseMatching, + normalize: Normalization, + kind: AtomKind, + ) -> Pattern { + let atoms = pattern_atoms(pattern) + .filter_map(|pat| { + let pat = Atom::new(pat, case_matching, normalize, kind, true); + (!pat.needle.is_empty()).then_some(pat) + }) + .collect(); + Pattern { atoms } + } + /// Creates a pattern where each word is matched individually (whitespaces + /// can be escaped with `\`). And `$`, `!`, `'` and `^` at word boundaries will + /// cause different matching behaviour (see [`AtomKind`]). These can be + /// escaped with backslash. + pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern { + let atoms = pattern_atoms(pattern) + .filter_map(|pat| { + let pat = Atom::parse(pat, case_matching, normalize); + (!pat.needle.is_empty()).then_some(pat) + }) + .collect(); + Pattern { atoms } + } + + /// Convenience function to easily match (and sort) a (relatively small) + /// list of inputs. + /// + /// *Note* This function is not recommended for building a full fuzzy + /// matching application that can match large numbers of matches (like all + /// files in a directory) as all matching is done on the current thread, + /// effectively blocking the UI. For such applications the high level + /// `nucleo` crate can be used instead. + pub fn match_list<T: AsRef<str>>( + &self, + items: impl IntoIterator<Item = T>, + matcher: &mut Matcher, + ) -> Vec<(T, u32)> { + if self.atoms.is_empty() { + return items.into_iter().map(|item| (item, 0)).collect(); + } + let mut buf = Vec::new(); + let mut items: Vec<_> = items + .into_iter() + .filter_map(|item| { + self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) + .map(|score| (item, score)) + }) + .collect(); + items.sort_by_key(|(_, score)| Reverse(*score)); + items + } + + /// Matches this pattern against `haystack` (using the allocation and configuration + /// from `matcher`) and calculates a ranking score. See the [`Matcher`] + /// documentation for more details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> { + if self.atoms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.atoms { + score += pattern.score(haystack, matcher)? as u32; + } + Some(score) + } + + /// Matches this pattern against `haystack` (using the allocation and + /// configuration from `matcher`), calculates a ranking score and the match + /// indices. See the [`Matcher`] documentation for more + /// details. + /// + /// *Note:* The `ignore_case` setting is overwritten to match the casing of + /// each pattern atom. + /// + /// *Note:* The indices for each pattern are calculated individually + /// and simply appended to the `indices` vector and not deduplicated/sorted. + /// This allows associating the match indices to their source pattern. If + /// required (like for highlighting) unique/sorted indices can be obtained + /// as follows: + /// + /// ``` + /// # let mut indices: Vec<u32> = Vec::new(); + /// indices.sort_unstable(); + /// indices.dedup(); + /// ``` + pub fn indices( + &self, + haystack: Utf32Str<'_>, + matcher: &mut Matcher, + indices: &mut Vec<u32>, + ) -> Option<u32> { + if self.atoms.is_empty() { + return Some(0); + } + let mut score = 0; + for pattern in &self.atoms { + score += pattern.indices(haystack, matcher, indices)? as u32; + } + Some(score) + } + + /// Refreshes this pattern by reparsing it from a string. This is mostly + /// equivalent to just constructing a new pattern using [`Pattern::parse`] + /// but is slightly more efficient by reusing some allocations + pub fn reparse( + &mut self, + pattern: &str, + case_matching: CaseMatching, + normalize: Normalization, + ) { + self.atoms.clear(); + let atoms = pattern_atoms(pattern).filter_map(|atom| { + let atom = Atom::parse(atom, case_matching, normalize); + if atom.needle.is_empty() { + return None; + } + Some(atom) + }); + self.atoms.extend(atoms); + } +} + +impl Clone for Pattern { + fn clone(&self) -> Self { + Self { + atoms: self.atoms.clone(), + } + } + + fn clone_from(&mut self, source: &Self) { + self.atoms.clone_from(&source.atoms); + } +} diff --git a/crates/atuin-nucleo/matcher/src/pattern/tests.rs b/crates/atuin-nucleo/matcher/src/pattern/tests.rs new file mode 100644 index 00000000..88880ba9 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/pattern/tests.rs @@ -0,0 +1,149 @@ +use crate::pattern::{Atom, AtomKind, CaseMatching, Normalization, Pattern}; + +#[test] +fn negative() { + let pat = Atom::parse("!foo", CaseMatching::Smart, Normalization::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!^foo", CaseMatching::Smart, Normalization::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!foo$", CaseMatching::Smart, Normalization::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("!^foo$", CaseMatching::Smart, Normalization::Smart); + assert!(pat.negative); + assert_eq!(pat.kind, AtomKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn pattern_kinds() { + let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Fuzzy); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("'foo", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Substring); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("^foo", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Prefix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("foo$", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Postfix); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("^foo$", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.negative); + assert_eq!(pat.kind, AtomKind::Exact); + assert_eq!(pat.needle.to_string(), "foo"); +} + +#[test] +fn case_matching() { + let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("Foo", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Foo", CaseMatching::Ignore, Normalization::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "foo"); + let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Foo"); + let pat = Atom::parse("Äxx", CaseMatching::Ignore, Normalization::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "äxx"); + let pat = Atom::parse("Äxx", CaseMatching::Respect, Normalization::Smart); + assert!(!pat.ignore_case); + let pat = Atom::parse("Axx", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Axx"); + let pat = Atom::parse("你xx", CaseMatching::Smart, Normalization::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = Atom::parse("你xx", CaseMatching::Ignore, Normalization::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "你xx"); + let pat = Atom::parse("Ⲽxx", CaseMatching::Smart, Normalization::Smart); + assert!(!pat.ignore_case); + assert_eq!(pat.needle.to_string(), "Ⲽxx"); + let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore, Normalization::Smart); + assert!(pat.ignore_case); + assert_eq!(pat.needle.to_string(), "ⲽxx"); +} + +#[test] +fn escape() { + let pat = Atom::parse("foo\\ bar", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "foo bar"); + let pat = Atom::parse("\\!foo", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "!foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\'foo", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "'foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\^foo", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "^foo"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("foo\\$", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("^foo\\$", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "foo$"); + assert_eq!(pat.kind, AtomKind::Prefix); + let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "!^foo$"); + assert_eq!(pat.kind, AtomKind::Fuzzy); + let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart, Normalization::Smart); + assert_eq!(pat.needle.to_string(), "^foo$"); + assert_eq!(pat.kind, AtomKind::Substring); +} + +#[test] +fn pattern_atoms() { + assert_eq!( + Pattern::parse("a b", CaseMatching::Ignore, Normalization::Smart).atoms, + vec![ + Atom::parse("a", CaseMatching::Ignore, Normalization::Smart), + Atom::parse("b", CaseMatching::Ignore, Normalization::Smart), + ] + ); + + assert_eq!( + Pattern::parse("a\n b", CaseMatching::Ignore, Normalization::Smart).atoms, + vec![ + Atom::parse("a", CaseMatching::Ignore, Normalization::Smart), + Atom::parse("b", CaseMatching::Ignore, Normalization::Smart), + ] + ); + + assert_eq!( + Pattern::parse(" a b\r\n", CaseMatching::Ignore, Normalization::Smart).atoms, + vec![ + Atom::parse("a", CaseMatching::Ignore, Normalization::Smart), + Atom::parse("b", CaseMatching::Ignore, Normalization::Smart), + ] + ); + + assert_eq!( + Pattern::parse("ほ げ", CaseMatching::Smart, Normalization::Smart).atoms, + vec![ + Atom::parse("ほ", CaseMatching::Smart, Normalization::Smart), + Atom::parse("げ", CaseMatching::Smart, Normalization::Smart), + ], + ) +} diff --git a/crates/atuin-nucleo/matcher/src/prefilter.rs b/crates/atuin-nucleo/matcher/src/prefilter.rs new file mode 100644 index 00000000..1e79cc2c --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/prefilter.rs @@ -0,0 +1,98 @@ +use ::memchr::{memchr, memchr2, memrchr, memrchr2}; + +use crate::chars::Char; +use crate::utf32_str::Utf32Str; +use crate::Matcher; + +#[inline(always)] +fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option<usize> { + if c >= b'a' && c <= b'z' { + memchr2(c, c - 32, haystack) + } else { + memchr(c, haystack) + } +} + +#[inline(always)] +fn find_ascii_ignore_case_rev(c: u8, haystack: &[u8]) -> Option<usize> { + if c >= b'a' && c <= b'z' { + memrchr2(c, c - 32, haystack) + } else { + memrchr(c, haystack) + } +} + +impl Matcher { + pub(crate) fn prefilter_ascii( + &self, + mut haystack: &[u8], + needle: &[u8], + only_greedy: bool, + ) -> Option<(usize, usize, usize)> { + if self.config.ignore_case { + let start = + find_ascii_ignore_case(needle[0], &haystack[..haystack.len() - needle.len() + 1])?; + let mut greedy_end = start + 1; + haystack = &haystack[greedy_end..]; + for &c in &needle[1..] { + let idx = find_ascii_ignore_case(c, haystack)? + 1; + greedy_end += idx; + haystack = &haystack[idx..]; + } + if only_greedy { + Some((start, greedy_end, greedy_end)) + } else { + let end = greedy_end + + find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack) + .map_or(0, |i| i + 1); + Some((start, greedy_end, end)) + } + } else { + let start = memchr(needle[0], &haystack[..haystack.len() - needle.len() + 1])?; + let mut greedy_end = start + 1; + haystack = &haystack[greedy_end..]; + for &c in &needle[1..] { + let idx = memchr(c, haystack)? + 1; + greedy_end += idx; + haystack = &haystack[idx..]; + } + if only_greedy { + Some((start, greedy_end, greedy_end)) + } else { + let end = + greedy_end + memrchr(*needle.last().unwrap(), haystack).map_or(0, |i| i + 1); + Some((start, greedy_end, end)) + } + } + } + + pub(crate) fn prefilter_non_ascii( + &self, + haystack: &[char], + needle: Utf32Str<'_>, + only_greedy: bool, + ) -> Option<(usize, usize)> { + let needle_char = needle.get(0); + let start = haystack[..haystack.len() - needle.len() + 1] + .iter() + .position(|c| c.normalize(&self.config) == needle_char)?; + let needle_char = needle.last(); + if only_greedy { + if haystack.len() - start < needle.len() { + return None; + } + Some((start, start + 1)) + } else { + let end = haystack.len() + - haystack[start + 1..] + .iter() + .rev() + .position(|c| c.normalize(&self.config) == needle_char)?; + if end - start < needle.len() { + return None; + } + + Some((start, end)) + } + } +} diff --git a/crates/atuin-nucleo/matcher/src/score.rs b/crates/atuin-nucleo/matcher/src/score.rs new file mode 100644 index 00000000..c934a8ef --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/score.rs @@ -0,0 +1,158 @@ +use std::cmp::max; + +use crate::chars::{Char, CharClass}; +use crate::{Config, Matcher}; + +pub(crate) const SCORE_MATCH: u16 = 16; +pub(crate) const PENALTY_GAP_START: u16 = 3; +pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; +/// If the prefer_prefix option is enabled we want to penalize +/// the initial gap. The prefix should not be too much +pub(crate) const PREFIX_BONUS_SCALE: u16 = 2; +pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY; + +// We prefer matches at the beginning of a word, but the bonus should not be +// too great to prevent the longer acronym matches from always winning over +// shorter fuzzy matches. The bonus point here was specifically chosen that +// the bonus is cancelled when the gap between the acronyms grows over +// 8 characters, which is approximately the average length of the words found +// in web2 dictionary and my file system. +pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; + +// Edge-triggered bonus for matches in camelCase words. +// Their value should be BONUS_BOUNDARY - PENALTY_GAP_EXTENSION = 7. +// However, this priporitzes camel case over non-camel case. +// In fzf/skim this is not a problem since they score off the max +// consecutive bonus. However, we don't do that (because its incorrect) +// so to avoids prioritizing camel we use a lower bonus. I think that's fine +// usually camel case is wekaer boundary than actual wourd boundaries anyway +// This also has the nice sideeffect of perfectly balancing out +// camel case, snake case and the consecutive version of the word +pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_START; + +/// Although bonus point for non-word characters is non-contextual, we need it +/// for computing bonus points for consecutive chunks starting with a non-word +/// character. +pub(crate) const BONUS_NON_WORD: u16 = BONUS_BOUNDARY; + +// Minimum bonus point given to characters in consecutive chunks. +// Note that bonus points for consecutive matches shouldn't have needed if we +// used fixed match score as in the original algorithm. +pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION; + +// The first character in the typed pattern usually has more significance +// than the rest so it's important that it appears at special positions where +// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". +// The amount of the extra bonus should be limited so that the gap penalty is +// still respected. +pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; + +impl Config { + #[inline] + pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { + if class > CharClass::Delimiter { + // transition from non word to word + match prev_class { + CharClass::Whitespace => return self.bonus_boundary_white, + CharClass::Delimiter => return self.bonus_boundary_delimiter, + CharClass::NonWord => return BONUS_BOUNDARY, + _ => (), + } + } + if prev_class == CharClass::Lower && class == CharClass::Upper + || prev_class != CharClass::Number && class == CharClass::Number + { + // camelCase letter123 + BONUS_CAMEL123 + } else if class == CharClass::Whitespace { + self.bonus_boundary_white + } else if class == CharClass::NonWord { + return BONUS_NON_WORD; + } else { + 0 + } + } +} +impl Matcher { + #[inline(always)] + pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { + self.config.bonus_for(prev_class, class) + } + + pub(crate) fn calculate_score<const INDICES: bool, H: Char + PartialEq<N>, N: Char>( + &mut self, + haystack: &[H], + needle: &[N], + start: usize, + end: usize, + indices: &mut Vec<u32>, + ) -> u16 { + if INDICES { + indices.reserve(needle.len()); + } + + let mut prev_class = start + .checked_sub(1) + .map(|i| haystack[i].char_class(&self.config)) + .unwrap_or(self.config.initial_char_class); + let mut needle_iter = needle.iter(); + let mut needle_char = *needle_iter.next().unwrap(); + + let mut in_gap = false; + let mut consecutive = 1; + + // unrolled the first iteration to make applying the first char multiplier less awkward + if INDICES { + indices.push(start as u32) + } + let class = haystack[start].char_class(&self.config); + let mut first_bonus = self.bonus_for(prev_class, class); + let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER; + prev_class = class; + needle_char = *needle_iter.next().unwrap_or(&needle_char); + + for (i, c) in haystack[start + 1..end].iter().enumerate() { + let (c, class) = c.char_class_and_normalize(&self.config); + if c == needle_char { + if INDICES { + indices.push(i as u32 + start as u32 + 1) + } + let mut bonus = self.bonus_for(prev_class, class); + if consecutive != 0 { + if bonus >= BONUS_BOUNDARY && bonus > first_bonus { + first_bonus = bonus + } + bonus = max(max(bonus, first_bonus), BONUS_CONSECUTIVE); + } else { + first_bonus = bonus; + } + score += SCORE_MATCH + bonus; + in_gap = false; + consecutive += 1; + if let Some(&next) = needle_iter.next() { + needle_char = next; + } + } else { + let penalty = if in_gap { + PENALTY_GAP_EXTENSION + } else { + PENALTY_GAP_START + }; + score = score.saturating_sub(penalty); + in_gap = true; + consecutive = 0; + } + prev_class = class; + } + if self.config.prefer_prefix { + if start != 0 { + let penalty = PENALTY_GAP_START + + PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16; + score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE); + } else { + score += MAX_PREFIX_BONUS; + } + } + score + } +} diff --git a/crates/atuin-nucleo/matcher/src/tests.rs b/crates/atuin-nucleo/matcher/src/tests.rs new file mode 100644 index 00000000..32a02403 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/tests.rs @@ -0,0 +1,771 @@ +use crate::chars::Char; +use crate::pattern::{CaseMatching, Normalization, Pattern}; +use crate::score::{ + BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, + MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, +}; +use crate::utf32_str::Utf32Str; +use crate::{Config, Matcher}; + +use Algorithm::*; + +#[derive(Debug)] +enum Algorithm { + FuzzyOptimal, + FuzzyGreedy, + Substring, + Prefix, + Postfix, + Exact, +} + +fn assert_matches( + algorithm: &[Algorithm], + normalize: bool, + case_sensitive: bool, + path: bool, + prefer_prefix: bool, + cases: &[(&str, &str, &[u32], u16)], +) { + let mut config = Config { + normalize, + ignore_case: !case_sensitive, + prefer_prefix, + ..Config::DEFAULT + }; + if path { + config.set_match_paths(); + } + let mut matcher = Matcher::new(config); + let mut matched_indices = Vec::new(); + let mut needle_buf = Vec::new(); + let mut haystack_buf = Vec::new(); + for &(haystack, needle, indices, mut score) in cases { + let needle = if !case_sensitive { + needle.to_lowercase() + } else { + needle.to_owned() + }; + let needle = Utf32Str::new(&needle, &mut needle_buf); + let haystack = Utf32Str::new(haystack, &mut haystack_buf); + score += needle.len() as u16 * SCORE_MATCH; + for algo in algorithm { + println!("xx {matched_indices:?} {algo:?}"); + matched_indices.clear(); + let res = match algo { + FuzzyOptimal => matcher.fuzzy_indices(haystack, needle, &mut matched_indices), + FuzzyGreedy => matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices), + Substring => matcher.substring_indices(haystack, needle, &mut matched_indices), + Prefix => matcher.prefix_indices(haystack, needle, &mut matched_indices), + Postfix => matcher.postfix_indices(haystack, needle, &mut matched_indices), + Exact => matcher.exact_indices(haystack, needle, &mut matched_indices), + }; + println!("{matched_indices:?}"); + let match_chars: Vec<_> = matched_indices + .iter() + .map(|&i| haystack.get(i).normalize(&matcher.config)) + .collect(); + let needle_chars: Vec<_> = needle.chars().collect(); + + assert_eq!( + res, + Some(score), + "{needle:?} did not match {haystack:?}: matched {match_chars:?} {matched_indices:?} {algo:?}" + ); + assert_eq!( + matched_indices, indices, + "{needle:?} match {haystack:?} {algo:?}" + ); + assert_eq!( + match_chars, needle_chars, + "{needle:?} match {haystack:?} indices are incorrect {matched_indices:?} {algo:?}" + ); + } + } +} + +fn assert_not_matches_with( + normalize: bool, + case_sensitive: bool, + algorithm: &[Algorithm], + cases: &[(&str, &str)], +) { + let config = Config { + normalize, + ignore_case: !case_sensitive, + ..Config::DEFAULT + }; + let mut matcher = Matcher::new(config); + let mut needle_buf = Vec::new(); + let mut haystack_buf = Vec::new(); + for &(haystack, needle) in cases { + let needle = if !case_sensitive { + needle.to_lowercase() + } else { + needle.to_owned() + }; + let needle = Utf32Str::new(&needle, &mut needle_buf); + let haystack = Utf32Str::new(haystack, &mut haystack_buf); + + for algo in algorithm { + let res = match algo { + FuzzyOptimal => matcher.fuzzy_match(haystack, needle), + FuzzyGreedy => matcher.fuzzy_match_greedy(haystack, needle), + Substring => matcher.substring_match(haystack, needle), + Prefix => matcher.prefix_match(haystack, needle), + Postfix => matcher.postfix_match(haystack, needle), + Exact => matcher.exact_match(haystack, needle), + }; + assert_eq!( + res, None, + "{needle:?} should not match {haystack:?} {algo:?}" + ); + } + } +} + +pub fn assert_not_matches(normalize: bool, case_sensitive: bool, cases: &[(&str, &str)]) { + assert_not_matches_with( + normalize, + case_sensitive, + &[FuzzyOptimal, FuzzyGreedy, Substring, Prefix, Postfix, Exact], + cases, + ) +} + +const BONUS_BOUNDARY_WHITE: u16 = Config::DEFAULT.bonus_boundary_white; +const BONUS_BOUNDARY_DELIMITER: u16 = Config::DEFAULT.bonus_boundary_delimiter; + +#[test] +fn test_fuzzy() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + false, + false, + false, + false, + &[ + ( + "fooBarbaz1", + "obr", + &[2, 3, 5], + BONUS_CAMEL123 - PENALTY_GAP_START, + ), + ( + "/usr/share/doc/at/ChangeLog", + "changelog", + &[18, 19, 20, 21, 22, 23, 24, 25, 26], + (BONUS_FIRST_CHAR_MULTIPLIER + 8) * BONUS_BOUNDARY_DELIMITER, + ), + ( + "fooBarbaz1", + "br", + &[3, 5], + BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, + ), + ( + "foo bar baz", + "fbb", + &[0, 4, 8], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "/AutomatorDocument.icns", + "rdoc", + &[9, 10, 11, 12], + BONUS_CAMEL123 + 2 * BONUS_CONSECUTIVE, + ), + ( + "/man1/zshcompctl.1", + "zshc", + &[6, 7, 8, 9], + BONUS_BOUNDARY_DELIMITER * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ( + "/.oh-my-zsh/cache", + "zshc", + &[8, 9, 10, 12], + BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - PENALTY_GAP_START + + BONUS_BOUNDARY_DELIMITER, + ), + ( + "ab0123 456", + "12356", + &[3, 4, 5, 8, 9], + BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION, + ), + ( + "abc123 456", + "12356", + &[3, 4, 5, 8, 9], + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 2) + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION + + BONUS_CONSECUTIVE, + ), + ( + "foo/bar/baz", + "fbb", + &[0, 4, 8], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "fooBarBaz", + "fbb", + &[0, 3, 6], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 + - 2 * PENALTY_GAP_START + - 2 * PENALTY_GAP_EXTENSION, + ), + ( + "foo barbaz", + "fbb", + &[0, 4, 7], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START * 2 + - PENALTY_GAP_EXTENSION * 3, + ), + ( + "fooBar Baz", + "foob", + &[0, 1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ( + "xFoo-Bar Baz", + "foo-b", + &[1, 2, 3, 4, 5], + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 2) + 2 * BONUS_NON_WORD, + ), + ], + ); +} + +#[test] +fn empty_needle() { + assert_matches( + &[Substring, Prefix, Postfix, FuzzyGreedy, FuzzyOptimal, Exact], + false, + false, + false, + false, + &[("foo bar baz", "", &[], 0)], + ); +} + +#[test] +fn test_substring() { + assert_matches( + &[Substring, Prefix], + false, + false, + false, + false, + &[ + ( + "foo bar baz", + "foo", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo bar baz", + "FOO", + &[1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo bar baz", + " FOO", + &[0, 1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ], + ); + assert_matches( + &[Substring, Postfix], + false, + false, + false, + false, + &[ + ( + "foo bar baz", + "baz", + &[8, 9, 10], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + "foo bar baz ", + "baz", + &[8, 9, 10], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + "foo bar baz ", + "baz ", + &[8, 9, 10, 11], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ], + ); + assert_matches( + &[Substring, Prefix, Postfix, Exact, FuzzyGreedy, FuzzyOptimal], + false, + false, + false, + false, + &[ + ( + "foo", + "foo", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo", + "foo", + &[1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + " foo", + " foo", + &[0, 1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ], + ); + assert_matches( + &[Substring], + false, + false, + false, + false, + &[ + ( + "fooBarbaz1", + "oba", + &[2, 3, 4], + BONUS_CAMEL123 + BONUS_CONSECUTIVE, + ), + ( + "/AutomatorDocument.icns", + "rdoc", + &[9, 10, 11, 12], + BONUS_CAMEL123 + 2 * BONUS_CONSECUTIVE, + ), + ( + "/man1/zshcompctl.1", + "zshc", + &[6, 7, 8, 9], + BONUS_BOUNDARY_DELIMITER * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ( + "/.oh-my-zsh/cache", + "zsh/c", + &[8, 9, 10, 11, 12], + BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 2) + + BONUS_NON_WORD + + BONUS_BOUNDARY_DELIMITER, + ), + ], + ); + assert_not_matches_with( + true, + false, + &[Prefix, Substring, Postfix, Exact], + &[( + "At the Road’s End - Seeming - SOL: A Self-Banishment Ritual", + "adi", + )], + ) +} + +#[test] +fn test_substring_case_sensitive() { + assert_matches( + &[Substring, Prefix], + false, + true, + false, + false, + &[ + ( + "Foo bar baz", + "Foo", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + "Fȫô bar baz", + "Fȫô", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ( + "Foo ฿ar baz", + "Foo", + &[0, 1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), + ), + ], + ); + assert_not_matches_with(false, true, &[Substring, Prefix], &[("foo bar baz", "Foo")]); +} + +#[test] +fn test_fuzzy_case_sensitive() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + false, + true, + false, + false, + &[ + ( + "fooBarbaz1", + "oBr", + &[2, 3, 5], + BONUS_CAMEL123 - PENALTY_GAP_START, + ), + ( + "Foo/Bar/Baz", + "FBB", + &[0, 4, 8], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 + - 2 * PENALTY_GAP_START + - 4 * PENALTY_GAP_EXTENSION, + ), + ( + "FooBarBaz", + "FBB", + &[0, 3, 6], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 + - 2 * PENALTY_GAP_START + - 2 * PENALTY_GAP_EXTENSION, + ), + ( + "FooBar Baz", + "FooB", + &[0, 1, 2, 3], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), + ), + ("foo-bar", "o-ba", &[2, 3, 4, 5], BONUS_NON_WORD * 3), + ], + ); +} + +#[test] +fn test_normalize() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + true, + false, + false, + false, + &[ + ( + "Só Danço Samba", + "So", + &[0, 1], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), + ), + ( + "Só Danço Samba", + "sodc", + &[0, 1, 3, 6], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) - PENALTY_GAP_START + + BONUS_BOUNDARY_WHITE + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ( + "Danço", + "danco", + &[0, 1, 2, 3, 4], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ( + "DanÇo", + "danco", + &[0, 1, 2, 3, 4], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ( + "xÇando", + "cando", + &[1, 2, 3, 4, 5], + BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4), + ), + ("ۂ(GCGɴCG", "n", &[5], 0), + ], + ) +} + +#[test] +fn test_unicode() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal, Substring], + true, + false, + false, + false, + &[ + ( + "你好世界", + "你好", + &[0, 1], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), + ), + ( + " 你好世界", + "你好", + &[1, 2], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), + ), + ], + ); + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + true, + false, + false, + false, + &[( + "你好世界", + "你世", + &[0, 2], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, + )], + ); + assert_not_matches( + false, + false, + &[("Flibbertigibbet / イタズラっ子たち", "lying")], + ); +} + +#[test] +fn test_long_str() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + false, + false, + false, + false, + &[( + &"x".repeat(u16::MAX as usize + 1), + "xx", + &[0, 1], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), + )], + ); +} + +#[test] +fn test_casing() { + assert_matches( + &[FuzzyGreedy, FuzzyOptimal], + false, + false, + false, + false, + &[ + // these two have the same score + ( + "fooBar", + "foobar", + &[0, 1, 2, 3, 4, 5], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 5), + ), + ( + "foobar", + "foobar", + &[0, 1, 2, 3, 4, 5], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 5), + ), + // these two have the same score (slightly lower than the other two: 60 instead of 70) + ( + "foo-bar", + "foobar", + &[0, 1, 2, 4, 5, 6], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - PENALTY_GAP_START + + BONUS_BOUNDARY * 3, + ), + ( + "foo_bar", + "foobar", + &[0, 1, 2, 4, 5, 6], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - PENALTY_GAP_START + + BONUS_BOUNDARY * 3, + ), + ], + ) +} + +#[test] +fn test_optimal() { + assert_matches( + &[FuzzyOptimal], + false, + false, + false, + false, + &[ + ( + "axxx xx ", + "xx", + &[5, 6], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), + ), + ( + "SS!H", + "S!", + &[0, 2], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START + + BONUS_NON_WORD, + ), + // this case is a cool example of why our algorithm is more than fzf + // we handle this corretly detect that it's better to match + // the second f instead of the third yielding a higher score + // (despite using the same scoring function!) + ( + "xf.foo", + "xfoo", + &[0, 3, 4, 5], + BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION + + BONUS_BOUNDARY * 3, + ), + ( + "xf fo", + "xfo", + &[0, 3, 4], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2) + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ], + ); +} + +#[test] +fn test_reject() { + assert_not_matches( + true, + false, + &[ + ("你好界", "abc"), + ("你好界", "a"), + ("你好世界", "富"), + ("Só Danço Samba", "sox"), + ("fooBarbaz", "fooBarbazz"), + ("fooBarbaz", "c"), + ], + ); + assert_not_matches( + true, + true, + &[ + ("你好界", "abc"), + ("abc", "你"), + ("abc", "A"), + ("abc", "d"), + ("你好世界", "富"), + ("Só Danço Samba", "sox"), + ("fooBarbaz", "oBZ"), + ("Foo Bar Baz", "fbb"), + ("fooBarbaz", "fooBarbazz"), + ], + ); + assert_not_matches( + false, + true, + &[ + ("Só Danço Samba", "sod"), + ("Só Danço Samba", "soc"), + ("Só Danç", "So"), + ], + ); + assert_not_matches(false, false, &[("ۂۂfoۂۂ", "foo")]); +} + +#[test] +fn test_prefer_prefix() { + assert_matches( + &[FuzzyOptimal, FuzzyGreedy], + false, + false, + false, + true, + &[ + ( + "Moby Dick", + "md", + &[0, 5], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS + - PENALTY_GAP_START + - 3 * PENALTY_GAP_EXTENSION, + ), + ( + "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage", + "md", + &[82, 85], + BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + - PENALTY_GAP_START + - PENALTY_GAP_EXTENSION, + ), + ], + ); +} + +#[test] +fn test_single_char_needle() { + assert_matches( + &[FuzzyOptimal], + false, + false, + false, + false, + &[( + "foO", + "o", + &[2], + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_CAMEL123, + )], + ); + assert_matches( + &[FuzzyOptimal], + false, + false, + false, + false, + &[( + "föÖ", + "ö", + &[2], + BONUS_FIRST_CHAR_MULTIPLIER * BONUS_CAMEL123, + )], + ); +} + +#[test] +fn umlaut() { + let paths = ["be", "bë"]; + let mut matcher = Matcher::new(Config::DEFAULT); + let matches = Pattern::parse("ë", CaseMatching::Ignore, Normalization::Smart) + .match_list(paths, &mut matcher); + assert_eq!(matches.len(), 1); + let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Never) + .match_list(paths, &mut matcher); + assert_eq!(matches.len(), 1); + let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Smart) + .match_list(paths, &mut matcher); + assert_eq!(matches.len(), 2); +} diff --git a/crates/atuin-nucleo/matcher/src/utf32_str.rs b/crates/atuin-nucleo/matcher/src/utf32_str.rs new file mode 100644 index 00000000..664dae7a --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/utf32_str.rs @@ -0,0 +1,428 @@ +#[cfg(test)] +mod tests; + +use std::borrow::Cow; +use std::ops::{Bound, RangeBounds}; +use std::{fmt, slice}; + +use memchr::memmem; + +use crate::chars; + +/// Check if a given string can be represented internally as the `Ascii` variant in a +/// [`Utf32String`] or a [`Utf32Str`]. +/// +/// This returns true if the string is ASCII and does not contain a windows-style newline +/// `'\r'`. +/// The additional carriage return check is required since even for strings consisting only +/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme. +#[inline] +fn has_ascii_graphemes(string: &str) -> bool { + string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none() +} + +/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching. +/// +/// This is mostly intended as an internal string type, but some methods are exposed for +/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string +/// using one of its `From<T>` constructors for string types `T` or from the +/// [`Utf32Str::new`] method. +/// +/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string +/// slice. +/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical +/// to the original string. +/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string. +/// +/// Since `Utf32Str(ing)`s variants may be constructed directly, you **must not** make these +/// assumptions when handling `Utf32Str(ing)`s of unknown origin. +/// +/// ## Caveats +/// Despite the name, this type is quite far from being a true string type. Here are some +/// examples demonstrating this. +/// +/// ### String conversions are not round-trip +/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u + +/// COMBINING_DIAERESIS`), the trailing codepoints are truncated. +/// ``` +/// # use nucleo_matcher::Utf32String; +/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u"); +/// ``` +/// +/// ### Indexing is done by grapheme +/// Indexing into a string is done by grapheme rather than by codepoint. +/// ``` +/// # use nucleo_matcher::Utf32String; +/// assert!(Utf32String::from("au\u{0308}").len() == 2); +/// ``` +/// +/// ### A `Unicode` variant may be produced by all-ASCII characters. +/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme, +/// strings containing `\r\n` will still result in a `Unicode` variant. +/// ``` +/// # use nucleo_matcher::Utf32String; +/// let s = Utf32String::from("\r\n"); +/// assert!(!s.slice(..).is_ascii()); +/// assert!(s.len() == 1); +/// assert!(s.slice(..).get(0) == '\n'); +/// ``` +/// +/// ## Design rationale +/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching +/// operates on codepoints (ideally, it should operate on graphemes but that's too +/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5 +/// times) during matching. +/// +/// Doing codepoint segmentation on the fly not only blows trough the cache +/// (lookup tables and I-cache) but also has nontrivial runtime compared to the +/// matching itself. Furthermore there are many extra optimizations available +/// for ASCII only text, but checking each match has too much overhead. +/// +/// Of course, this comes at extra memory cost as we usually still need the UTF-8 +/// encoded variant for rendering. In the (dominant) case of ASCII-only text +/// we don't require a copy. Furthermore fuzzy matching usually is applied while +/// the user is typing on the fly so the same item is potentially matched many +/// times (making the the up-front cost more worth it). That means that its +/// basically always worth it to pre-segment the string. +/// +/// For usecases that only match (a lot of) strings once its possible to keep +/// char buffer around that is filled with the presegmented chars. +/// +/// Another advantage of this approach is that the matcher will naturally +/// produce grapheme indices (instead of utf8 offsets) anyway. With a +/// codepoint basic representation like this the indices can be used +/// directly +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] +pub enum Utf32Str<'a> { + /// A string represented as ASCII encoded bytes. + /// Correctness invariant: must only contain valid ASCII (`<= 127`) + Ascii(&'a [u8]), + /// A string represented as an array of unicode codepoints (basically UTF-32). + Unicode(&'a [char]), +} + +impl<'a> Utf32Str<'a> { + /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str + pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self { + if has_ascii_graphemes(str) { + Utf32Str::Ascii(str.as_bytes()) + } else { + buf.clear(); + buf.extend(crate::chars::graphemes(str)); + Utf32Str::Unicode(buf) + } + } + + /// Returns the number of characters in this string. + #[inline] + pub fn len(self) -> usize { + match self { + Utf32Str::Unicode(codepoints) => codepoints.len(), + Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), + } + } + + /// Returns whether this string is empty. + #[inline] + pub fn is_empty(self) -> bool { + match self { + Utf32Str::Unicode(codepoints) => codepoints.is_empty(), + Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(), + } + } + + /// Creates a slice with a string that contains the characters in + /// the specified **character range**. + #[inline] + pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> { + let start = match range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.len(), + }; + match self { + Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]), + Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + + /// Returns the number of leading whitespaces in this string + #[inline] + pub(crate) fn leading_white_space(self) -> usize { + match self { + Utf32Str::Ascii(bytes) => bytes + .iter() + .position(|b| !b.is_ascii_whitespace()) + .unwrap_or(0), + Utf32Str::Unicode(codepoints) => codepoints + .iter() + .position(|c| !c.is_whitespace()) + .unwrap_or(0), + } + } + + /// Returns the number of trailing whitespaces in this string + #[inline] + pub(crate) fn trailing_white_space(self) -> usize { + match self { + Utf32Str::Ascii(bytes) => bytes + .iter() + .rev() + .position(|b| !b.is_ascii_whitespace()) + .unwrap_or(0), + Utf32Str::Unicode(codepoints) => codepoints + .iter() + .rev() + .position(|c| !c.is_whitespace()) + .unwrap_or(0), + } + } + + /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher. + #[inline] + pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> { + let start = match range.start_bound() { + Bound::Included(&start) => start as usize, + Bound::Excluded(&start) => start as usize + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end as usize + 1, + Bound::Excluded(&end) => end as usize, + Bound::Unbounded => self.len(), + }; + match self { + Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]), + Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + + /// Returns whether this string only contains graphemes which are single ASCII chars. + /// + /// This is almost equivalent to the string being ASCII, except with the additional requirement + /// that the string cannot contain a windows-style newline `\r\n` which is treated as a single + /// grapheme. + pub fn is_ascii(self) -> bool { + matches!(self, Utf32Str::Ascii(_)) + } + + /// Returns the `n`th character in this string, zero-indexed + pub fn get(self, n: u32) -> char { + match self { + Utf32Str::Ascii(bytes) => bytes[n as usize] as char, + Utf32Str::Unicode(codepoints) => codepoints[n as usize], + } + } + + /// Returns the last character in this string. + /// + /// Panics if the string is empty. + pub(crate) fn last(self) -> char { + match self { + Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, + Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], + } + } + + /// Returns the first character in this string. + /// + /// Panics if the string is empty. + pub(crate) fn first(self) -> char { + match self { + Utf32Str::Ascii(bytes) => bytes[0] as char, + Utf32Str::Unicode(codepoints) => codepoints[0], + } + } + + /// Returns an iterator over the characters in this string + pub fn chars(self) -> Chars<'a> { + match self { + Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), + Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), + } + } +} + +impl fmt::Debug for Utf32Str<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "\"")?; + for c in self.chars() { + for c in c.escape_debug() { + write!(f, "{c}")? + } + } + write!(f, "\"") + } +} + +impl fmt::Display for Utf32Str<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for c in self.chars() { + write!(f, "{c}")? + } + Ok(()) + } +} + +pub enum Chars<'a> { + Ascii(slice::Iter<'a, u8>), + Unicode(slice::Iter<'a, char>), +} + +impl Iterator for Chars<'_> { + type Item = char; + + fn next(&mut self) -> Option<Self::Item> { + match self { + Chars::Ascii(iter) => iter.next().map(|&c| c as char), + Chars::Unicode(iter) => iter.next().copied(), + } + } +} + +impl DoubleEndedIterator for Chars<'_> { + fn next_back(&mut self) -> Option<Self::Item> { + match self { + Chars::Ascii(iter) => iter.next_back().map(|&c| c as char), + Chars::Unicode(iter) => iter.next_back().copied(), + } + } +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] +/// An owned version of [`Utf32Str`]. +/// +/// See the API documentation for [`Utf32Str`] for more detail. +pub enum Utf32String { + /// A string represented as ASCII encoded bytes. + /// Correctness invariant: must only contain valid ASCII (<=127) + Ascii(Box<str>), + /// A string represented as an array of unicode codepoints (basically UTF-32). + Unicode(Box<[char]>), +} + +impl Default for Utf32String { + fn default() -> Self { + Self::Ascii(String::new().into_boxed_str()) + } +} + +impl Utf32String { + /// Returns the number of characters in this string. + #[inline] + pub fn len(&self) -> usize { + match self { + Utf32String::Unicode(codepoints) => codepoints.len(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), + } + } + + /// Returns whether this string is empty. + #[inline] + pub fn is_empty(&self) -> bool { + match self { + Utf32String::Unicode(codepoints) => codepoints.is_empty(), + Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(), + } + } + + /// Creates a slice with a string that contains the characters in + /// the specified **character range**. + #[inline] + pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.len(), + }; + match self { + Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]), + Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), + } + } + + /// Same as `slice` but accepts a u32 range for convenience since + /// those are the indices returned by the matcher. + #[inline] + pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str { + let start = match range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.len() as u32, + }; + match self { + Utf32String::Ascii(bytes) => { + Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize]) + } + Utf32String::Unicode(codepoints) => { + Utf32Str::Unicode(&codepoints[start as usize..end as usize]) + } + } + } +} + +impl From<&str> for Utf32String { + #[inline] + fn from(value: &str) -> Self { + if has_ascii_graphemes(value) { + Self::Ascii(value.to_owned().into_boxed_str()) + } else { + Self::Unicode(chars::graphemes(value).collect()) + } + } +} + +impl From<Box<str>> for Utf32String { + fn from(value: Box<str>) -> Self { + if has_ascii_graphemes(&value) { + Self::Ascii(value) + } else { + Self::Unicode(chars::graphemes(&value).collect()) + } + } +} + +impl From<String> for Utf32String { + #[inline] + fn from(value: String) -> Self { + value.into_boxed_str().into() + } +} + +impl<'a> From<Cow<'a, str>> for Utf32String { + #[inline] + fn from(value: Cow<'a, str>) -> Self { + match value { + Cow::Borrowed(value) => value.into(), + Cow::Owned(value) => value.into(), + } + } +} + +impl fmt::Debug for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", self.slice(..)) + } +} + +impl fmt::Display for Utf32String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.slice(..)) + } +} diff --git a/crates/atuin-nucleo/matcher/src/utf32_str/tests.rs b/crates/atuin-nucleo/matcher/src/utf32_str/tests.rs new file mode 100644 index 00000000..a38c8875 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/utf32_str/tests.rs @@ -0,0 +1,44 @@ +use crate::{Utf32Str, Utf32String}; + +#[test] +fn test_utf32str_ascii() { + /// Helper function for testing + fn expect_ascii(src: &str, is_ascii: bool) { + let mut buffer = Vec::new(); + assert!(Utf32Str::new(src, &mut buffer).is_ascii() == is_ascii); + assert!(Utf32String::from(src).slice(..).is_ascii() == is_ascii); + assert!(Utf32String::from(src.to_owned()).slice(..).is_ascii() == is_ascii); + } + + // ascii + expect_ascii("", true); + expect_ascii("a", true); + expect_ascii("a\nb", true); + expect_ascii("\n\r", true); + + // not ascii + expect_ascii("aü", false); + expect_ascii("au\u{0308}", false); + + // windows-style newline + expect_ascii("a\r\nb", false); + expect_ascii("ü\r\n", false); + expect_ascii("\r\n", false); +} + +#[test] +fn test_grapheme_truncation() { + // ascii is preserved + let s = Utf32String::from("ab"); + assert_eq!(s.slice(..).get(0), 'a'); + assert_eq!(s.slice(..).get(1), 'b'); + + // windows-style newline is truncated to '\n' + let s = Utf32String::from("\r\n"); + assert_eq!(s.slice(..).get(0), '\n'); + + // normal graphemes are truncated to the first character + let s = Utf32String::from("u\u{0308}\r\n"); + assert_eq!(s.slice(..).get(0), 'u'); + assert_eq!(s.slice(..).get(1), '\n'); +} |
