//! Utilities for working with (unicode) characters/codepoints use std::fmt::{self, Debug, Display}; #[cfg(feature = "unicode-casefold")] use crate::chars::case_fold::CASE_FOLDING_SIMPLE; use crate::Config; //autogenerated by generate-ucd #[expect(warnings)] #[rustfmt::skip] #[cfg(feature = "unicode-casefold")] mod case_fold; #[cfg(feature = "unicode-normalization")] mod normalize; pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { const ASCII: bool; fn char_class(self, config: &Config) -> CharClass; fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass); fn normalize(self, config: &Config) -> Self; } /// repr tansparent wrapper around u8 with better formatting and `PartialEq` implementation #[repr(transparent)] #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct AsciiChar(pub u8); impl AsciiChar { pub fn cast(bytes: &[u8]) -> &[AsciiChar] { unsafe { &*(bytes as *const [u8] as *const [AsciiChar]) } } } impl fmt::Display for AsciiChar { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { Display::fmt(&(self.0 as char), f) } } impl PartialEq for char { fn eq(&self, other: &AsciiChar) -> bool { other.0 as char == *self } } impl Char for AsciiChar { const ASCII: bool = true; #[inline] fn char_class(self, config: &Config) -> CharClass { let c = self.0; // using manual if conditions instead optimizes better if c >= b'a' && c <= b'z' { CharClass::Lower } else if c >= b'A' && c <= b'Z' { CharClass::Upper } else if c >= b'0' && c <= b'9' { CharClass::Number } else if c.is_ascii_whitespace() { CharClass::Whitespace } else if config.delimiter_chars.contains(&c) { CharClass::Delimiter } else { CharClass::NonWord } } #[inline(always)] fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { let char_class = self.char_class(config); if config.ignore_case && char_class == CharClass::Upper { self.0 += 32 } (self, char_class) } #[inline(always)] fn normalize(mut self, config: &Config) -> Self { if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { self.0 += 32 } self } } fn char_class_non_ascii(c: char) -> CharClass { if c.is_lowercase() { CharClass::Lower } else if is_upper_case(c) { CharClass::Upper } else if c.is_numeric() { CharClass::Number } else if c.is_alphabetic() { CharClass::Letter } else if c.is_whitespace() { CharClass::Whitespace } else { CharClass::NonWord } } impl Char for char { const ASCII: bool = false; #[inline(always)] fn char_class(self, config: &Config) -> CharClass { if self.is_ascii() { return AsciiChar(self as u8).char_class(config); } char_class_non_ascii(self) } #[inline(always)] fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { if self.is_ascii() { let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); return (c.0 as char, class); } let char_class = char_class_non_ascii(self); #[cfg(feature = "unicode-casefold")] let mut case_fold = char_class == CharClass::Upper; #[cfg(feature = "unicode-normalization")] if config.normalize { self = normalize::normalize(self); case_fold = true } #[cfg(feature = "unicode-casefold")] if case_fold && config.ignore_case { self = CASE_FOLDING_SIMPLE .binary_search_by_key(&self, |(upper, _)| *upper) .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1) } (self, char_class) } #[inline(always)] fn normalize(mut self, config: &Config) -> Self { #[cfg(feature = "unicode-normalization")] if config.normalize { self = normalize::normalize(self); } #[cfg(feature = "unicode-casefold")] if config.ignore_case { self = to_lower_case(self) } self } } #[cfg(feature = "unicode-normalization")] pub use normalize::normalize; #[cfg(feature = "unicode-segmentation")] use unicode_segmentation::UnicodeSegmentation; /// Converts a character to lower case using simple unicode case folding #[cfg(feature = "unicode-casefold")] #[inline(always)] pub fn to_lower_case(c: char) -> char { CASE_FOLDING_SIMPLE .binary_search_by_key(&c, |(upper, _)| *upper) .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) } /// Checks if a character is upper case according to simple unicode case folding. /// if the `unicode-casefold` feature is disable the equivalent std function is used #[inline(always)] pub fn is_upper_case(c: char) -> bool { #[cfg(feature = "unicode-casefold")] let val = CASE_FOLDING_SIMPLE .binary_search_by_key(&c, |(upper, _)| *upper) .is_ok(); #[cfg(not(feature = "unicode-casefold"))] let val = c.is_uppercase(); val } #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] pub(crate) enum CharClass { Whitespace, NonWord, Delimiter, Lower, Upper, Letter, Number, } /// Nucleo cannot match graphemes as single units. To work around /// that we only use the first codepoint of each grapheme. This /// iterator returns the first character of each unicode grapheme /// in a string and is used for constructing `Utf32Str(ing)`. pub fn graphemes(text: &str) -> impl Iterator + '_ { #[cfg(feature = "unicode-segmentation")] let res = text.graphemes(true).map(|grapheme| { // we need to special-case this check since `\r\n` is a single grapheme and is // therefore the exception to the rule that normalization of a grapheme should // map to the first character. if grapheme == "\r\n" { '\n' } else { grapheme .chars() .next() .expect("graphemes must be non-empty") } }); #[cfg(not(feature = "unicode-segmentation"))] let res = text.chars(); res }