aboutsummaryrefslogtreecommitdiffstats
path: root/crates/atuin-nucleo/matcher/src/chars.rs
diff options
context:
space:
mode:
authorEllie Huxtable <ellie@elliehuxtable.com>2026-03-16 15:22:49 -0700
committerEllie Huxtable <ellie@elliehuxtable.com>2026-03-16 15:22:49 -0700
commit280499651c8308555e287dea79aa6569a95d99f5 (patch)
tree355648de82f0134ad6a62fb9f361279c0a8485b5 /crates/atuin-nucleo/matcher/src/chars.rs
parentspecify version in all daemon atuin crates (diff)
parentSquashed 'crates/atuin-nucleo/' content from commit 4253de9f (diff)
downloadatuin-280499651c8308555e287dea79aa6569a95d99f5.zip
bring in base nucleo
Diffstat (limited to 'crates/atuin-nucleo/matcher/src/chars.rs')
-rw-r--r--crates/atuin-nucleo/matcher/src/chars.rs207
1 files changed, 207 insertions, 0 deletions
diff --git a/crates/atuin-nucleo/matcher/src/chars.rs b/crates/atuin-nucleo/matcher/src/chars.rs
new file mode 100644
index 00000000..d13a2466
--- /dev/null
+++ b/crates/atuin-nucleo/matcher/src/chars.rs
@@ -0,0 +1,207 @@
+//! Utilities for working with (unicode) characters/codepoints
+
+use std::fmt::{self, Debug, Display};
+
+#[cfg(feature = "unicode-casefold")]
+use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
+use crate::Config;
+
+//autogenerated by generate-ucd
+#[allow(warnings)]
+#[rustfmt::skip]
+#[cfg(feature = "unicode-casefold")]
+mod case_fold;
+#[cfg(feature = "unicode-normalization")]
+mod normalize;
+
+/// Common interface over the two character representations used in this
+/// module: [`AsciiChar`] (a single byte) and `char` (a unicode codepoint).
+pub(crate) trait Char: Copy + Eq + Ord + Display {
+    /// `true` for [`AsciiChar`], `false` for `char`.
+    const ASCII: bool;
+
+    /// Returns the [`CharClass`] of this character under `config`.
+    fn char_class(self, config: &Config) -> CharClass;
+
+    /// Applies the normalization steps enabled in `config` to this character.
+    fn normalize(self, config: &Config) -> Self;
+
+    /// Returns the normalized character together with the [`CharClass`] that
+    /// was determined for the character before it was normalized.
+    fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass);
+}
+
+/// `#[repr(transparent)]` wrapper around `u8` with better formatting.
+///
+/// A `char` can be compared against it directly (`some_char == ascii_char`).
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub(crate) struct AsciiChar(pub u8);
+
+impl AsciiChar {
+    /// Reinterprets a byte slice as a slice of `AsciiChar` without copying.
+    pub fn cast(bytes: &[u8]) -> &[AsciiChar] {
+        let ptr = bytes as *const [u8] as *const [AsciiChar];
+        // SAFETY: `AsciiChar` is `#[repr(transparent)]` over `u8`, so `[u8]` and
+        // `[AsciiChar]` share the same layout. The pointer comes from a valid
+        // shared reference and the result is bound to the lifetime of `bytes`.
+        unsafe { &*ptr }
+    }
+}
+
+impl Display for AsciiChar {
+    /// Formats the wrapped byte as the `char` with the same numeric value,
+    /// forwarding all formatter options.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let as_char = char::from(self.0);
+        Display::fmt(&as_char, f)
+    }
+}
+
+impl PartialEq<AsciiChar> for char {
+    /// A `char` equals an `AsciiChar` when the byte, widened to `char`, is the
+    /// same codepoint.
+    fn eq(&self, other: &AsciiChar) -> bool {
+        *self == char::from(other.0)
+    }
+}
+
+impl Char for AsciiChar {
+    const ASCII: bool = true;
+
+    /// Classifies a single byte.
+    ///
+    /// Checks run in this order: lowercase letter, uppercase letter, digit,
+    /// ASCII whitespace, membership in `config.delimiter_chars`; everything
+    /// else is [`CharClass::NonWord`].
+    #[inline]
+    fn char_class(self, config: &Config) -> CharClass {
+        let byte = self.0;
+        // The ranges are intentionally spelled out as plain comparisons inside
+        // `if`s: this form was noted to optimize better.
+        if b'a' <= byte && byte <= b'z' {
+            return CharClass::Lower;
+        }
+        if b'A' <= byte && byte <= b'Z' {
+            return CharClass::Upper;
+        }
+        if b'0' <= byte && byte <= b'9' {
+            return CharClass::Number;
+        }
+        if byte.is_ascii_whitespace() {
+            return CharClass::Whitespace;
+        }
+        if config.delimiter_chars.contains(&byte) {
+            CharClass::Delimiter
+        } else {
+            CharClass::NonWord
+        }
+    }
+
+    /// Returns the class of the byte and, when `config.ignore_case` is set and
+    /// the byte is an uppercase letter, its lowercase counterpart.
+    #[inline(always)]
+    fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass) {
+        let class = self.char_class(config);
+        let folded = if config.ignore_case && class == CharClass::Upper {
+            // `A`..=`Z` and `a`..=`z` are exactly 32 apart in ASCII.
+            AsciiChar(self.0 + 32)
+        } else {
+            self
+        };
+        (folded, class)
+    }
+
+    /// Lowercases `A`..=`Z` when `config.ignore_case` is set; every other byte
+    /// is returned unchanged.
+    #[inline(always)]
+    fn normalize(self, config: &Config) -> Self {
+        let is_upper = b'A' <= self.0 && self.0 <= b'Z';
+        if config.ignore_case && is_upper {
+            AsciiChar(self.0 + 32)
+        } else {
+            self
+        }
+    }
+}
+/// Classifies a codepoint using unicode properties.
+///
+/// The first matching test wins, in this order: lowercase, uppercase (as
+/// decided by [`is_upper_case`]), numeric, alphabetic, whitespace. Anything
+/// else is [`CharClass::NonWord`].
+fn char_class_non_ascii(c: char) -> CharClass {
+    match c {
+        _ if c.is_lowercase() => CharClass::Lower,
+        _ if is_upper_case(c) => CharClass::Upper,
+        _ if c.is_numeric() => CharClass::Number,
+        // alphabetic, but neither lower nor upper case
+        _ if c.is_alphabetic() => CharClass::Letter,
+        _ if c.is_whitespace() => CharClass::Whitespace,
+        _ => CharClass::NonWord,
+    }
+}
+impl Char for char {
+    const ASCII: bool = false;
+
+    /// Classifies the character: ASCII characters are delegated to
+    /// [`AsciiChar`], everything else is classified by unicode properties.
+    #[inline(always)]
+    fn char_class(self, config: &Config) -> CharClass {
+        if self.is_ascii() {
+            return AsciiChar(self as u8).char_class(config);
+        }
+        char_class_non_ascii(self)
+    }
+
+    /// Returns the normalized character together with the class of the
+    /// original (not yet normalized) character.
+    ///
+    /// ASCII characters are delegated to [`AsciiChar`]. For other characters
+    /// unicode normalization is applied when `config.normalize` is set (and the
+    /// `unicode-normalization` feature is enabled), and simple case folding is
+    /// applied when `config.ignore_case` is set (and the `unicode-casefold`
+    /// feature is enabled) to characters that are upper case or were just
+    /// normalized.
+    #[inline(always)]
+    fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
+        if self.is_ascii() {
+            let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config);
+            return (c.0 as char, class);
+        }
+        let char_class = char_class_non_ascii(self);
+        #[cfg(feature = "unicode-casefold")]
+        let case_fold = char_class == CharClass::Upper;
+        #[cfg(feature = "unicode-normalization")]
+        if config.normalize {
+            self = normalize::normalize(self);
+        }
+        // A normalized character has to be folded even if the original was not
+        // classified as upper case. This only exists when both features are
+        // enabled, so each feature also compiles on its own.
+        #[cfg(all(feature = "unicode-casefold", feature = "unicode-normalization"))]
+        let case_fold = case_fold || config.normalize;
+        #[cfg(feature = "unicode-casefold")]
+        if case_fold && config.ignore_case {
+            self = to_lower_case(self);
+        }
+        (self, char_class)
+    }
+
+    /// Applies the normalization steps enabled in `config`.
+    ///
+    /// ASCII characters are delegated to [`AsciiChar`] so that ASCII case
+    /// folding does not depend on the `unicode-casefold` feature, matching
+    /// `char_class_and_normalize`.
+    // NOTE(review): the ASCII fast path assumes `normalize::normalize` leaves
+    // ASCII characters unchanged, confirm against the normalization table.
+    #[inline(always)]
+    fn normalize(mut self, config: &Config) -> Self {
+        if self.is_ascii() {
+            return AsciiChar(self as u8).normalize(config).0 as char;
+        }
+        #[cfg(feature = "unicode-normalization")]
+        if config.normalize {
+            self = normalize::normalize(self);
+        }
+        #[cfg(feature = "unicode-casefold")]
+        if config.ignore_case {
+            self = to_lower_case(self)
+        }
+        self
+    }
+}
+
+#[cfg(feature = "unicode-normalization")]
+pub use normalize::normalize;
+#[cfg(feature = "unicode-segmentation")]
+use unicode_segmentation::UnicodeSegmentation;
+
+/// Converts a character to lower case using simple unicode case folding.
+///
+/// Characters without an entry in `CASE_FOLDING_SIMPLE` are returned unchanged.
+#[cfg(feature = "unicode-casefold")]
+#[inline(always)]
+pub fn to_lower_case(c: char) -> char {
+    match CASE_FOLDING_SIMPLE.binary_search_by_key(&c, |entry| entry.0) {
+        Ok(idx) => CASE_FOLDING_SIMPLE[idx].1,
+        Err(_) => c,
+    }
+}
+
+/// Checks if a character is upper case according to simple unicode case folding.
+///
+/// If the `unicode-casefold` feature is disabled, the equivalent std function
+/// (`char::is_uppercase`) is used instead.
+#[inline(always)]
+pub fn is_upper_case(c: char) -> bool {
+    #[cfg(feature = "unicode-casefold")]
+    {
+        // upper case means: has an entry in the case folding table
+        CASE_FOLDING_SIMPLE
+            .binary_search_by_key(&c, |entry| entry.0)
+            .is_ok()
+    }
+    #[cfg(not(feature = "unicode-casefold"))]
+    {
+        c.is_uppercase()
+    }
+}
+
+/// Coarse classification of a character as produced by `Char::char_class`.
+///
+/// The declaration order of the variants defines the derived ordering and must
+/// not be changed.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub(crate) enum CharClass {
+    /// Whitespace (ASCII or unicode).
+    Whitespace,
+    /// Anything that does not fall into one of the other classes.
+    NonWord,
+    /// ASCII byte contained in the configured delimiter characters.
+    Delimiter,
+    /// Lower case letter.
+    Lower,
+    /// Upper case letter.
+    Upper,
+    /// Non-ASCII alphabetic character that is neither lower nor upper case.
+    Letter,
+    /// ASCII digit or unicode numeric character.
+    Number,
+}
+
+/// Returns the first character of every unicode grapheme in `text`.
+///
+/// Nucleo cannot match graphemes as single units, so each grapheme is
+/// represented by its first codepoint only. This iterator is used for
+/// constructing `Utf32Str(ing)`. Without the `unicode-segmentation` feature the
+/// plain `char`s of `text` are yielded instead.
+pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
+    #[cfg(feature = "unicode-segmentation")]
+    let first_chars = text.graphemes(true).map(|cluster| match cluster {
+        // `\r\n` is a single grapheme and the one exception to the rule that a
+        // grapheme is represented by its first character: it maps to `\n`.
+        "\r\n" => '\n',
+        _ => cluster
+            .chars()
+            .next()
+            .expect("graphemes must be non-empty"),
+    });
+    #[cfg(not(feature = "unicode-segmentation"))]
+    let first_chars = text.chars();
+    first_chars
+}