diff options
| author | Ellie Huxtable <ellie@elliehuxtable.com> | 2026-03-16 15:22:49 -0700 |
|---|---|---|
| committer | Ellie Huxtable <ellie@elliehuxtable.com> | 2026-03-16 15:22:49 -0700 |
| commit | 280499651c8308555e287dea79aa6569a95d99f5 (patch) | |
| tree | 355648de82f0134ad6a62fb9f361279c0a8485b5 /crates/atuin-nucleo/matcher/src/chars.rs | |
| parent | specify version in all daemon atuin crates (diff) | |
| parent | Squashed 'crates/atuin-nucleo/' content from commit 4253de9f (diff) | |
| download | atuin-280499651c8308555e287dea79aa6569a95d99f5.zip | |
bring in base nucleo
Diffstat (limited to 'crates/atuin-nucleo/matcher/src/chars.rs')
| -rw-r--r-- | crates/atuin-nucleo/matcher/src/chars.rs | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/crates/atuin-nucleo/matcher/src/chars.rs b/crates/atuin-nucleo/matcher/src/chars.rs new file mode 100644 index 00000000..d13a2466 --- /dev/null +++ b/crates/atuin-nucleo/matcher/src/chars.rs @@ -0,0 +1,207 @@ +//! Utilities for working with (unicode) characters/codepoints + +use std::fmt::{self, Debug, Display}; + +#[cfg(feature = "unicode-casefold")] +use crate::chars::case_fold::CASE_FOLDING_SIMPLE; +use crate::Config; + +//autogenerated by generate-ucd +#[allow(warnings)] +#[rustfmt::skip] +#[cfg(feature = "unicode-casefold")] +mod case_fold; +#[cfg(feature = "unicode-normalization")] +mod normalize; + +pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { + const ASCII: bool; + fn char_class(self, config: &Config) -> CharClass; + fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass); + fn normalize(self, config: &Config) -> Self; +} + +/// repr tansparent wrapper around u8 with better formatting and `PartialEq<char>` implementation +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub(crate) struct AsciiChar(pub u8); + +impl AsciiChar { + pub fn cast(bytes: &[u8]) -> &[AsciiChar] { + unsafe { &*(bytes as *const [u8] as *const [AsciiChar]) } + } +} + +impl fmt::Display for AsciiChar { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Display::fmt(&(self.0 as char), f) + } +} + +impl PartialEq<AsciiChar> for char { + fn eq(&self, other: &AsciiChar) -> bool { + other.0 as char == *self + } +} + +impl Char for AsciiChar { + const ASCII: bool = true; + #[inline] + fn char_class(self, config: &Config) -> CharClass { + let c = self.0; + // using manual if conditions instead optimizes better + if c >= b'a' && c <= b'z' { + CharClass::Lower + } else if c >= b'A' && c <= b'Z' { + CharClass::Upper + } else if c >= b'0' && c <= b'9' { + CharClass::Number + } else if c.is_ascii_whitespace() { + CharClass::Whitespace + } else if config.delimiter_chars.contains(&c) { + CharClass::Delimiter + } else { + CharClass::NonWord + } + } + + #[inline(always)] + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { + let char_class = self.char_class(config); + if config.ignore_case && char_class == CharClass::Upper { + self.0 += 32 + } + (self, char_class) + } + + #[inline(always)] + fn normalize(mut self, config: &Config) -> Self { + if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { + self.0 += 32 + } + self + } +} +fn char_class_non_ascii(c: char) -> CharClass { + if c.is_lowercase() { + CharClass::Lower + } else if is_upper_case(c) { + CharClass::Upper + } else if c.is_numeric() { + CharClass::Number + } else if c.is_alphabetic() { + CharClass::Letter + } else if c.is_whitespace() { + CharClass::Whitespace + } else { + CharClass::NonWord + } +} +impl Char for char { + const ASCII: bool = false; + #[inline(always)] + fn char_class(self, config: &Config) -> CharClass { + if self.is_ascii() { + return AsciiChar(self as u8).char_class(config); + } + char_class_non_ascii(self) + } + + #[inline(always)] + fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { + if self.is_ascii() { + let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); + return (c.0 as char, class); + } + let char_class = char_class_non_ascii(self); + #[cfg(feature = "unicode-casefold")] + let mut case_fold = char_class == CharClass::Upper; + #[cfg(feature = "unicode-normalization")] + if config.normalize { + self = normalize::normalize(self); + case_fold = true + } + #[cfg(feature = "unicode-casefold")] + if case_fold && config.ignore_case { + self = CASE_FOLDING_SIMPLE + .binary_search_by_key(&self, |(upper, _)| *upper) + .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1) + } + (self, char_class) + } + + #[inline(always)] + fn normalize(mut self, config: &Config) -> Self { + #[cfg(feature = "unicode-normalization")] + if config.normalize { + self = normalize::normalize(self); + } + #[cfg(feature = "unicode-casefold")] + if config.ignore_case { + self = to_lower_case(self) + } + self + } +} + +#[cfg(feature = "unicode-normalization")] +pub use normalize::normalize; +#[cfg(feature = "unicode-segmentation")] +use unicode_segmentation::UnicodeSegmentation; + +/// Converts a character to lower case using simple unicode case folding +#[cfg(feature = "unicode-casefold")] +#[inline(always)] +pub fn to_lower_case(c: char) -> char { + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) +} + +/// Checks if a character is upper case according to simple unicode case folding. +/// if the `unicode-casefold` feature is disable the equivalent std function is used +#[inline(always)] +pub fn is_upper_case(c: char) -> bool { + #[cfg(feature = "unicode-casefold")] + let val = CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |(upper, _)| *upper) + .is_ok(); + #[cfg(not(feature = "unicode-casefold"))] + let val = c.is_uppercase(); + val +} + +#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] +pub(crate) enum CharClass { + Whitespace, + NonWord, + Delimiter, + Lower, + Upper, + Letter, + Number, +} + +/// Nucleo cannot match graphemes as single units. To work around +/// that we only use the first codepoint of each grapheme. This +/// iterator returns the first character of each unicode grapheme +/// in a string and is used for constructing `Utf32Str(ing)`. +pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ { + #[cfg(feature = "unicode-segmentation")] + let res = text.graphemes(true).map(|grapheme| { + // we need to special-case this check since `\r\n` is a single grapheme and is + // therefore the exception to the rule that normalization of a grapheme should + // map to the first character. + if grapheme == "\r\n" { + '\n' + } else { + grapheme + .chars() + .next() + .expect("graphemes must be non-empty") + } + }); + #[cfg(not(feature = "unicode-segmentation"))] + let res = text.chars(); + res +} |
