bring in base nucleo

author: Ellie Huxtable <ellie@elliehuxtable.com> 2026-03-16 15:22:49 -0700
committer: Ellie Huxtable <ellie@elliehuxtable.com> 2026-03-16 15:22:49 -0700
commit: 280499651c8308555e287dea79aa6569a95d99f5 (patch)
tree: 355648de82f0134ad6a62fb9f361279c0a8485b5 /crates/atuin-nucleo/matcher/src/chars.rs
parent: specify version in all daemon atuin crates (diff)
parent: Squashed 'crates/atuin-nucleo/' content from commit 4253de9f (diff)
download: atuin-280499651c8308555e287dea79aa6569a95d99f5.zip
1 files changed, 207 insertions, 0 deletions
diff --git a/crates/atuin-nucleo/matcher/src/chars.rs b/crates/atuin-nucleo/matcher/src/chars.rs
new file mode 100644
index 00000000..d13a2466
--- /dev/null
+++ b/crates/atuin-nucleo/matcher/src/chars.rs
@@ -0,0 +1,207 @@
+//! Utilities for working with (unicode) characters/codepoints
+
+use std::fmt::{self, Debug, Display};
+
+#[cfg(feature = "unicode-casefold")]
+use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
+use crate::Config;
+
+//autogenerated by generate-ucd
+#[allow(warnings)]
+#[rustfmt::skip]
+#[cfg(feature = "unicode-casefold")]
+mod case_fold;
+#[cfg(feature = "unicode-normalization")]
+mod normalize;
+
+pub(crate) trait Char: Copy + Eq + Ord + fmt::Display {
+    const ASCII: bool;
+    fn char_class(self, config: &Config) -> CharClass;
+    fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass);
+    fn normalize(self, config: &Config) -> Self;
+}
+
+/// `repr(transparent)` wrapper around `u8` with better formatting and `char == AsciiChar` comparison
+#[repr(transparent)]
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+pub(crate) struct AsciiChar(pub u8);
+
+impl AsciiChar {
+    pub fn cast(bytes: &[u8]) -> &[AsciiChar] {
+        unsafe { &*(bytes as *const [u8] as *const [AsciiChar]) }
+    }
+}
+
+impl fmt::Display for AsciiChar {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        Display::fmt(&(self.0 as char), f)
+    }
+}
+
+impl PartialEq<AsciiChar> for char {
+    fn eq(&self, other: &AsciiChar) -> bool {
+        other.0 as char == *self
+    }
+}
+
+impl Char for AsciiChar {
+    const ASCII: bool = true;
+    #[inline]
+    fn char_class(self, config: &Config) -> CharClass {
+        let c = self.0;
+        // using manual if conditions instead optimizes better
+        if c >= b'a' && c <= b'z' {
+            CharClass::Lower
+        } else if c >= b'A' && c <= b'Z' {
+            CharClass::Upper
+        } else if c >= b'0' && c <= b'9' {
+            CharClass::Number
+        } else if c.is_ascii_whitespace() {
+            CharClass::Whitespace
+        } else if config.delimiter_chars.contains(&c) {
+            CharClass::Delimiter
+        } else {
+            CharClass::NonWord
+        }
+    }
+
+    #[inline(always)]
+    fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
+        let char_class = self.char_class(config);
+        if config.ignore_case && char_class == CharClass::Upper {
+            self.0 += 32
+        }
+        (self, char_class)
+    }
+
+    #[inline(always)]
+    fn normalize(mut self, config: &Config) -> Self {
+        if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' {
+            self.0 += 32
+        }
+        self
+    }
+}
+fn char_class_non_ascii(c: char) -> CharClass {
+    if c.is_lowercase() {
+        CharClass::Lower
+    } else if is_upper_case(c) {
+        CharClass::Upper
+    } else if c.is_numeric() {
+        CharClass::Number
+    } else if c.is_alphabetic() {
+        CharClass::Letter
+    } else if c.is_whitespace() {
+        CharClass::Whitespace
+    } else {
+        CharClass::NonWord
+    }
+}
+impl Char for char {
+    const ASCII: bool = false;
+    #[inline(always)]
+    fn char_class(self, config: &Config) -> CharClass {
+        if self.is_ascii() {
+            return AsciiChar(self as u8).char_class(config);
+        }
+        char_class_non_ascii(self)
+    }
+
+    #[inline(always)]
+    fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
+        if self.is_ascii() {
+            let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config);
+            return (c.0 as char, class);
+        }
+        let char_class = char_class_non_ascii(self);
+        #[cfg(feature = "unicode-casefold")]
+        let mut case_fold = char_class == CharClass::Upper;
+        #[cfg(feature = "unicode-normalization")]
+        if config.normalize {
+            self = normalize::normalize(self);
+            case_fold = true
+        }
+        #[cfg(feature = "unicode-casefold")]
+        if case_fold && config.ignore_case {
+            self = CASE_FOLDING_SIMPLE
+                .binary_search_by_key(&self, |(upper, _)| *upper)
+                .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
+        }
+        (self, char_class)
+    }
+
+    #[inline(always)]
+    fn normalize(mut self, config: &Config) -> Self {
+        #[cfg(feature = "unicode-normalization")]
+        if config.normalize {
+            self = normalize::normalize(self);
+        }
+        #[cfg(feature = "unicode-casefold")]
+        if config.ignore_case {
+            self = to_lower_case(self)
+        }
+        self
+    }
+}
+
+#[cfg(feature = "unicode-normalization")]
+pub use normalize::normalize;
+#[cfg(feature = "unicode-segmentation")]
+use unicode_segmentation::UnicodeSegmentation;
+
+/// Converts a character to lower case using simple unicode case folding
+#[cfg(feature = "unicode-casefold")]
+#[inline(always)]
+pub fn to_lower_case(c: char) -> char {
+    CASE_FOLDING_SIMPLE
+        .binary_search_by_key(&c, |(upper, _)| *upper)
+        .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1)
+}
+
+/// Checks if a character is upper case according to simple unicode case folding.
+/// If the `unicode-casefold` feature is disabled, the std equivalent (`char::is_uppercase`) is used.
+#[inline(always)]
+pub fn is_upper_case(c: char) -> bool {
+    #[cfg(feature = "unicode-casefold")]
+    let val = CASE_FOLDING_SIMPLE
+        .binary_search_by_key(&c, |(upper, _)| *upper)
+        .is_ok();
+    #[cfg(not(feature = "unicode-casefold"))]
+    let val = c.is_uppercase();
+    val
+}
+
+#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)]
+pub(crate) enum CharClass {
+    Whitespace,
+    NonWord,
+    Delimiter,
+    Lower,
+    Upper,
+    Letter,
+    Number,
+}
+
+/// Nucleo cannot match graphemes as single units. To work around
+/// that we only use the first codepoint of each grapheme. This
+/// iterator returns the first character of each unicode grapheme
+/// in a string and is used for constructing `Utf32Str(ing)`.
+pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
+    #[cfg(feature = "unicode-segmentation")]
+    let res = text.graphemes(true).map(|grapheme| {
+        // we need to special-case this check since `\r\n` is a single grapheme and is
+        // therefore the exception to the rule that normalization of a grapheme should
+        // map to the first character.
+        if grapheme == "\r\n" {
+            '\n'
+        } else {
+            grapheme
+                .chars()
+                .next()
+                .expect("graphemes must be non-empty")
+        }
+    });
+    #[cfg(not(feature = "unicode-segmentation"))]
+    let res = text.chars();
+    res
+}
author	Ellie Huxtable <ellie@elliehuxtable.com>	2026-03-16 15:22:49 -0700
committer	Ellie Huxtable <ellie@elliehuxtable.com>	2026-03-16 15:22:49 -0700
commit	280499651c8308555e287dea79aa6569a95d99f5 (patch)
tree	355648de82f0134ad6a62fb9f361279c0a8485b5 /crates/atuin-nucleo/matcher/src/chars.rs
parent	specify version in all daemon atuin crates (diff)
parent	Squashed 'crates/atuin-nucleo/' content from commit 4253de9f (diff)
download	atuin-280499651c8308555e287dea79aa6569a95d99f5.zip