diff options
| author | Ellie Huxtable <ellie@elliehuxtable.com> | 2026-03-16 15:22:49 -0700 |
|---|---|---|
| committer | Ellie Huxtable <ellie@elliehuxtable.com> | 2026-03-16 15:22:49 -0700 |
| commit | 8f9777ce7aecfe1a163a915e3245466b9dd9ac2e (patch) | |
| tree | a878a0495e5f84266b7e36918a9e1a9432b0ddd8 /matcher/src/lib.rs | |
| download | atuin-8f9777ce7aecfe1a163a915e3245466b9dd9ac2e.zip | |
Squashed 'crates/atuin-nucleo/' content from commit 4253de9f
git-subtree-dir: crates/atuin-nucleo
git-subtree-split: 4253de9faabb4e5c6d81d946a5e35a90f87347ee
Diffstat (limited to 'matcher/src/lib.rs')
| -rw-r--r-- | matcher/src/lib.rs | 780 |
1 files changed, 780 insertions, 0 deletions
diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs new file mode 100644 index 00000000..3e8874c5 --- /dev/null +++ b/matcher/src/lib.rs @@ -0,0 +1,780 @@ +/*! +`nucleo_matcher` is a low level crate that contains the matcher implementation +used by the high level `nucleo` crate. + +**NOTE**: If you are building an fzf-like interactive fuzzy finder that is +meant to match a reasonably large number of items (> 100) using the high level +`nucleo` crate is highly recommended. Using `nucleo-matcher` directly in you ui +loop will be very slow. Implementing this logic yourself is very complex. + +The matcher is hightly optimized and can significantly outperform `fzf` and +`skim` (the `fuzzy-matcher` crate). However some of these optimizations require +a slightly less convenient API. Be sure to carefully read the documentation of +the [`Matcher`] to avoid unexpected behaviour. +# Examples + +For almost all usecases the [`pattern`] API should be used instead of calling +the matcher methods directly. [`Pattern::parse`](pattern::Pattern::parse) will +construct a single Atom (a single match operation) for each word. The pattern +can contain special characters to control what kind of match is performed (see +[`AtomKind`](crate::pattern::AtomKind)). + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching}; +let paths = ["foo/bar", "bar/foo", "foobar"]; +let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); +let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); +let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]); +``` + +If the pattern should be matched literally (without this special parsing) +[`Pattern::new`](pattern::Pattern::new) can be used instead. + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization}; +let paths = ["foo/bar", "bar/foo", "foobar"]; +let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); +let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); +let paths = ["^foo/bar", "bar/^foo", "foobar"]; +let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]); +``` + +Word segmentation is performed automatically on any unescaped character for which [`is_whitespace`](char::is_whitespace) returns true. +This is relevant, for instance, with non-english keyboard input. + +``` +# use nucleo_matcher::pattern::{Atom, Pattern, Normalization, CaseMatching}; +assert_eq!( + // double-width 'Ideographic Space', i.e. `'\u{3000}'` + Pattern::parse("ほげ ふが", CaseMatching::Smart, Normalization::Smart).atoms, + vec![ + Atom::parse("ほげ", CaseMatching::Smart, Normalization::Smart), + Atom::parse("ふが", CaseMatching::Smart, Normalization::Smart), + ], +); +``` + +If word segmentation is also not desired, a single `Atom` can be constructed directly. + +``` +# use nucleo_matcher::{Matcher, Config}; +# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind}; +let paths = ["foobar", "foo bar"]; +let mut matcher = Matcher::new(Config::DEFAULT); +let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher); +assert_eq!(matches, vec![("foo bar", 192)]); +``` + + +# Status + +Nucleo is used in the helix-editor and therefore has a large user base with lots or real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use, breaking changes should be very rare (a 1.0 release should not be far away). + +*/ + +// sadly ranges don't optmimzie well +#![allow(clippy::manual_range_contains)] +#![warn(missing_docs)] + +pub mod chars; +mod config; +#[cfg(test)] +mod debug; +mod exact; +mod fuzzy_greedy; +mod fuzzy_optimal; +mod matrix; +pub mod pattern; +mod prefilter; +mod score; +mod utf32_str; + +#[cfg(test)] +mod tests; + +pub use crate::config::Config; +pub use crate::utf32_str::{Utf32Str, Utf32String}; + +use crate::chars::{AsciiChar, Char}; +use crate::matrix::MatrixSlab; + +/// A matcher engine that can execute (fuzzy) matches. +/// +/// A matches contains **heap allocated** scratch memory that is reused during +/// matching. This scratch memory allows the matcher to guarantee that it will +/// **never allocate** during matching (with the exception of pushing to the +/// `indices` vector if there isn't enough capacity). However this scratch +/// memory is fairly large (around 135KB) so creating a matcher is expensive. +/// +/// All `.._match` functions will not compute the indices of the matched +/// characters. These should be used to prefilter to filter and rank all +/// matches. All `.._indices` functions will also compute the indices of the +/// matched characters but are slower compared to the `..match` variant. These +/// should be used when rendering the best N matches. Note that the `indices` +/// argument is **never cleared**. This allows running multiple different +/// matches on the same haystack and merging the indices by sorting and +/// deduplicating the vector. +/// +/// The `needle` argument for each function must always be normalized by the +/// caller (unicode normalization and case folding). Otherwise, the matcher +/// may fail to produce a match. The [`pattern`] modules provides utilities +/// to preprocess needles and **should usually be preferred over invoking the +/// matcher directly**. Additionally it's recommend to perform separate matches +/// for each word in the needle. Consider the folloling example: +/// +/// If `foo bar` is used as the needle it matches both `foo test baaar` and +/// `foo hello-world bar`. However, `foo test baaar` will receive a higher +/// score than `foo hello-world bar`. `baaar` contains a 2 character gap which +/// will receive a penalty and therefore the user will likely expect it to rank +/// lower. However, if `foo bar` is matched as a single query `hello-world` and +/// `test` are both considered gaps too. As `hello-world` is a much longer gap +/// then `test` the extra penalty for `baaar` is canceled out. If both words +/// are matched individually the interspersed words do not receive a penalty and +/// `foo hello-world bar` ranks higher. +/// +/// In general nucleo is a **substring matching tool** (except for the prefix/ +/// postfix matching modes) with no penalty assigned to matches that start +/// later within the same pattern (which enables matching words individually +/// as shown above). If patterns show a large variety in length and the syntax +/// described above is not used it may be preferable to give preference to +/// matches closer to the start of a haystack. To accommodate that usecase the +/// [`prefer_prefix`](Config::prefer_prefix) option can be set to true. +/// +/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than +/// that the matcher **will panic**. The caller must decide whether it wants to +/// filter out long haystacks or truncate them. +pub struct Matcher { + #[allow(missing_docs)] + pub config: Config, + slab: MatrixSlab, +} + +// this is just here for convenience not sure if we should implement this +impl Clone for Matcher { + fn clone(&self) -> Self { + Matcher { + config: self.config.clone(), + slab: MatrixSlab::new(), + } + } +} + +impl std::fmt::Debug for Matcher { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Matcher") + .field("config", &self.config) + .finish_non_exhaustive() + } +} + +impl Default for Matcher { + fn default() -> Self { + Matcher { + config: Config::DEFAULT, + slab: MatrixSlab::new(), + } + } +} + +impl Matcher { + /// Creates a new matcher instance, note that this will eagerly allocate a + /// fairly large chunk of heap memory (around 135KB currently but subject to + /// change) so matchers should be reused if called often (like in a loop). + pub fn new(config: Config) -> Self { + Self { + config, + slab: MatrixSlab::new(), + } + } + + /// Find the fuzzy match with the highest score in the `haystack`. + /// + /// This functions has `O(mn)` time complexity for short inputs. + /// To avoid slowdowns it automatically falls back to + /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large + /// needles and haystacks. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::<false>(haystack, needle, &mut Vec::new()) + } + + /// Find the fuzzy match with the highest score in the `haystack` and + /// compute its indices. + /// + /// This functions has `O(mn)` time complexity for short inputs. To + /// avoid slowdowns it automatically falls back to + /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large needles + /// and haystacks + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_matcher_impl::<true>(haystack, needle, indices) + } + + fn fuzzy_matcher_impl<const INDICES: bool>( + &mut self, + haystack_: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() > haystack_.len() { + return None; + } + if needle_.is_empty() { + return Some(0); + } + if needle_.len() == haystack_.len() { + return self.exact_match_impl::<INDICES>( + haystack_, + needle_, + 0, + haystack_.len(), + indices, + ); + } + assert!( + haystack_.len() <= u32::MAX as usize, + "fuzzy matching is only support for up to 2^32-1 codepoints" + ); + match (haystack_, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices); + } + let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; + if needle_.len() == end - start { + return Some(self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + )); + } + self.fuzzy_match_optimal::<INDICES, AsciiChar, AsciiChar>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::<INDICES>( + haystack, + needle as char, + start, + indices, + ); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + if needle_.len() == end - start { + return self + .exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices); + } + self.fuzzy_match_optimal::<INDICES, char, AsciiChar>( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + end, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices); + return Some(res); + } + let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; + if needle_.len() == end - start { + return self + .exact_match_impl::<INDICES>(haystack_, needle_, start, end, indices); + } + self.fuzzy_match_optimal::<INDICES, char, char>( + haystack, + needle, + start, + start + 1, + end, + indices, + ) + } + } + } + + /// Greedly find a fuzzy match in the `haystack`. + /// + /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) + /// indices and scores. Usually [fuzzy_match](crate::Matcher::fuzzy_match) should + /// be preferred. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_match_greedy( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + ) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_match_greedy_impl::<false>(haystack, needle, &mut Vec::new()) + } + + /// Greedly find a fuzzy match in the `haystack` and compute its indices. + /// + /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) + /// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should + /// be preferred. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn fuzzy_indices_greedy( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + assert!(haystack.len() <= u32::MAX as usize); + self.fuzzy_match_greedy_impl::<true>(haystack, needle, indices) + } + + fn fuzzy_match_greedy_impl<const INDICES: bool>( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() > haystack.len() { + return None; + } + if needle_.is_empty() { + return Some(0); + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; + if needle_.len() == greedy_end - start { + return Some(self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + )); + } + self.fuzzy_match_greedy_::<INDICES, AsciiChar, AsciiChar>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + greedy_end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + self.fuzzy_match_greedy_::<INDICES, char, AsciiChar>( + haystack, + AsciiChar::cast(needle), + start, + start + 1, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + self.fuzzy_match_greedy_::<INDICES, char, char>( + haystack, + needle, + start, + start + 1, + indices, + ) + } + } + } + + /// Finds the substring match with the highest score in the `haystack`. + /// + /// This functions has `O(nm)` time complexity. However many cases can + /// be significantly accelerated using prefilters so it's usually very fast + /// in practice. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn substring_match( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + ) -> Option<u16> { + self.substring_match_impl::<false>(haystack, needle_, &mut Vec::new()) + } + + /// Finds the substring match with the highest score in the `haystack` and + /// compute its indices. + /// + /// This functions has `O(nm)` time complexity. However many cases can + /// be significantly accelerated using prefilters so it's usually fast + /// in practice. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn substring_indices( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + self.substring_match_impl::<true>(haystack, needle_, indices) + } + + fn substring_match_impl<const INDICES: bool>( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() > haystack.len() { + return None; + } + if needle_.is_empty() { + return Some(0); + } + if needle_.len() == haystack.len() { + return self.exact_match_impl::<INDICES>(haystack, needle_, 0, haystack.len(), indices); + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + return self.substring_match_1_ascii::<INDICES>(haystack, needle, indices); + } + self.substring_match_ascii::<INDICES>(haystack, needle, indices) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + None + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self.substring_match_1_non_ascii::<INDICES>( + haystack, + needle as char, + start, + indices, + ); + return Some(res); + } + let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.substring_match_non_ascii::<INDICES, _>( + haystack, + AsciiChar::cast(needle), + start, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + if let &[needle] = needle { + let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; + let res = self + .substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices); + return Some(res); + } + let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; + self.substring_match_non_ascii::<INDICES, _>(haystack, needle, start, indices) + } + } + } + + /// Checks whether needle and haystack match exactly. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + let mut trailing_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if !needle.last().is_whitespace() { + trailing_space = haystack.trailing_white_space() + } + // avoid wraparound in size check + if trailing_space == haystack.len() { + return None; + } + self.exact_match_impl::<false>( + haystack, + needle, + leading_space, + haystack.len() - trailing_space, + &mut Vec::new(), + ) + } + + /// Checks whether needle and haystack match exactly and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn exact_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + let mut trailing_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if !needle.last().is_whitespace() { + trailing_space = haystack.trailing_white_space() + } + // avoid wraparound in size check + if trailing_space == haystack.len() { + return None; + } + self.exact_match_impl::<true>( + haystack, + needle, + leading_space, + haystack.len() - trailing_space, + indices, + ) + } + + /// Checks whether needle is a prefix of the haystack. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if haystack.len() - leading_space < needle.len() { + None + } else { + self.exact_match_impl::<false>( + haystack, + needle, + leading_space, + needle.len() + leading_space, + &mut Vec::new(), + ) + } + } + + /// Checks whether needle is a prefix of the haystack and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn prefix_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut leading_space = 0; + if !needle.first().is_whitespace() { + leading_space = haystack.leading_white_space() + } + if haystack.len() - leading_space < needle.len() { + None + } else { + self.exact_match_impl::<true>( + haystack, + needle, + leading_space, + needle.len() + leading_space, + indices, + ) + } + } + + /// Checks whether needle is a postfix of the haystack. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut trailing_spaces = 0; + if !needle.last().is_whitespace() { + trailing_spaces = haystack.trailing_white_space() + } + if haystack.len() - trailing_spaces < needle.len() { + None + } else { + self.exact_match_impl::<false>( + haystack, + needle, + haystack.len() - needle.len() - trailing_spaces, + haystack.len() - trailing_spaces, + &mut Vec::new(), + ) + } + } + + /// Checks whether needle is a postfix of the haystack and compute the matches indices. + /// + /// This functions has `O(n)` time complexity. + /// + /// See the [matcher documentation](crate::Matcher) for more details. + pub fn postfix_indices( + &mut self, + haystack: Utf32Str<'_>, + needle: Utf32Str<'_>, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle.is_empty() { + return Some(0); + } + let mut trailing_spaces = 0; + if !needle.last().is_whitespace() { + trailing_spaces = haystack.trailing_white_space() + } + if haystack.len() - trailing_spaces < needle.len() { + None + } else { + self.exact_match_impl::<true>( + haystack, + needle, + haystack.len() - needle.len() - trailing_spaces, + haystack.len() - trailing_spaces, + indices, + ) + } + } + + fn exact_match_impl<const INDICES: bool>( + &mut self, + haystack: Utf32Str<'_>, + needle_: Utf32Str<'_>, + start: usize, + end: usize, + indices: &mut Vec<u32>, + ) -> Option<u16> { + if needle_.len() != end - start { + return None; + } + assert!( + haystack.len() <= u32::MAX as usize, + "matching is only support for up to 2^32-1 codepoints" + ); + let score = match (haystack, needle_) { + (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { + let matched = if self.config.ignore_case { + AsciiChar::cast(haystack)[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))) + } else { + &haystack[start..end] == needle + }; + if !matched { + return None; + } + self.calculate_score::<INDICES, _, _>( + AsciiChar::cast(haystack), + AsciiChar::cast(needle), + start, + end, + indices, + ) + } + (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { + // a purely ascii haystack can never be transformed to match + // a needle that contains non-ascii chars since we don't allow gaps + return None; + } + (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { + let matched = haystack[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(AsciiChar::cast(needle) + .iter() + .map(|c| c.normalize(&self.config))); + if !matched { + return None; + } + + self.calculate_score::<INDICES, _, _>( + haystack, + AsciiChar::cast(needle), + start, + end, + indices, + ) + } + (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { + let matched = haystack[start..end] + .iter() + .map(|c| c.normalize(&self.config)) + .eq(needle.iter().map(|c| c.normalize(&self.config))); + if !matched { + return None; + } + self.calculate_score::<INDICES, _, _>(haystack, needle, start, end, indices) + } + }; + Some(score) + } +} |
