aboutsummaryrefslogtreecommitdiffstats
path: root/matcher/src/pattern.rs
diff options
context:
space:
mode:
Diffstat (limited to 'matcher/src/pattern.rs')
-rw-r--r--matcher/src/pattern.rs566
1 files changed, 566 insertions, 0 deletions
diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs
new file mode 100644
index 00000000..495feede
--- /dev/null
+++ b/matcher/src/pattern.rs
@@ -0,0 +1,566 @@
+//! This module provides a slightly higher level API for matching strings.
+
+use std::cmp::Reverse;
+
+use crate::{chars, Matcher, Utf32Str};
+
+#[cfg(test)]
+mod tests;
+
+use crate::Utf32String;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[non_exhaustive]
+/// How to treat a case mismatch between two characters.
+pub enum CaseMatching {
+ /// Characters never match their case folded version (`a != A`).
+ #[cfg_attr(not(feature = "unicode-casefold"), default)]
+ Respect,
+ /// Characters always match their case folded version (`a == A`).
+ #[cfg(feature = "unicode-casefold")]
+ Ignore,
+ /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
+ /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
+ #[default]
+ #[cfg(feature = "unicode-casefold")]
+ Smart,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[non_exhaustive]
+/// How to handle unicode normalization,
+pub enum Normalization {
+ /// Characters never match their normalized version (`a != ä`).
+ #[cfg_attr(not(feature = "unicode-normalization"), default)]
+ Never,
+ /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
+ /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
+ #[default]
+ #[cfg(feature = "unicode-normalization")]
+ Smart,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[non_exhaustive]
+/// The kind of matching algorithm to run for an atom.
+pub enum AtomKind {
+ /// Fuzzy matching where the needle must match any haystack characters
+ /// (match can contain gaps). This atom kind is used by default if no
+ /// special syntax is used. There is no negated fuzzy matching (too
+ /// many false positives).
+ ///
+ /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
+ Fuzzy,
+ /// The needle must match a contiguous sequence of haystack characters
+ /// without gaps. This atom kind is parsed from the following syntax:
+ /// `'foo` and `!foo` (negated).
+ ///
+ /// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
+ Substring,
+ /// The needle must match all leading haystack characters without gaps or
+ /// prefix. This atom kind is parsed from the following syntax: `^foo` and
+ /// `!^foo` (negated).
+ ///
+ /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
+ Prefix,
+ /// The needle must match all trailing haystack characters without gaps or
+ /// postfix. This atom kind is parsed from the following syntax: `foo$` and
+ /// `!foo$` (negated).
+ ///
+ /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
+ Postfix,
+ /// The needle must match all haystack characters without gaps or prefix.
+ /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
+ /// (negated).
+ ///
+ /// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
+ Exact,
+}
+
+/// A single pattern component that is matched with a single [`Matcher`] function
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct Atom {
+ /// Whether this pattern atom is a negative match.
+ /// A negative pattern atom will prevent haystacks matching it from
+ /// being matchend. It does not contribute to scoring/indices
+ pub negative: bool,
+ /// The kind of match that this pattern performs
+ pub kind: AtomKind,
+ needle: Utf32String,
+ ignore_case: bool,
+ normalize: bool,
+}
+
+impl Atom {
+ /// Creates a single [`Atom`] from a string by performing unicode
+ /// normalization and case folding (if necessary). Optionally `\ ` can
+ /// be escaped to ` `.
+ pub fn new(
+ needle: &str,
+ case: CaseMatching,
+ normalize: Normalization,
+ kind: AtomKind,
+ escape_whitespace: bool,
+ ) -> Atom {
+ Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
+ }
+
+ fn new_inner(
+ needle: &str,
+ case: CaseMatching,
+ normalization: Normalization,
+ kind: AtomKind,
+ escape_whitespace: bool,
+ append_dollar: bool,
+ ) -> Atom {
+ let mut ignore_case;
+ let mut normalize;
+ #[cfg(feature = "unicode-normalization")]
+ {
+ normalize = matches!(normalization, Normalization::Smart);
+ }
+ #[cfg(not(feature = "unicode-normalization"))]
+ {
+ normalize = false;
+ }
+ let needle = if needle.is_ascii() {
+ let mut needle = if escape_whitespace {
+ if let Some((start, rem)) = needle.split_once("\\ ") {
+ let mut needle = start.to_owned();
+ for rem in rem.split("\\ ") {
+ needle.push(' ');
+ needle.push_str(rem);
+ }
+ needle
+ } else {
+ needle.to_owned()
+ }
+ } else {
+ needle.to_owned()
+ };
+
+ match case {
+ #[cfg(feature = "unicode-casefold")]
+ CaseMatching::Ignore => {
+ ignore_case = true;
+ needle.make_ascii_lowercase()
+ }
+ #[cfg(feature = "unicode-casefold")]
+ CaseMatching::Smart => {
+ ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
+ }
+ CaseMatching::Respect => ignore_case = false,
+ }
+ if append_dollar {
+ needle.push('$');
+ }
+ Utf32String::Ascii(needle.into_boxed_str())
+ } else {
+ let mut needle_ = Vec::with_capacity(needle.len());
+ #[cfg(feature = "unicode-casefold")]
+ {
+ ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
+ }
+ #[cfg(not(feature = "unicode-casefold"))]
+ {
+ ignore_case = false;
+ }
+ #[cfg(feature = "unicode-normalization")]
+ {
+ normalize = matches!(normalization, Normalization::Smart);
+ }
+ if escape_whitespace {
+ let mut saw_backslash = false;
+ for mut c in chars::graphemes(needle) {
+ if saw_backslash {
+ if c == ' ' {
+ needle_.push(' ');
+ saw_backslash = false;
+ continue;
+ } else {
+ needle_.push('\\');
+ }
+ }
+ saw_backslash = c == '\\';
+ match case {
+ #[cfg(feature = "unicode-casefold")]
+ CaseMatching::Ignore => c = chars::to_lower_case(c),
+ #[cfg(feature = "unicode-casefold")]
+ CaseMatching::Smart => {
+ ignore_case = ignore_case && !chars::is_upper_case(c)
+ }
+ CaseMatching::Respect => (),
+ }
+ match normalization {
+ #[cfg(feature = "unicode-normalization")]
+ Normalization::Smart => {
+ normalize = normalize && chars::normalize(c) == c;
+ }
+ Normalization::Never => (),
+ }
+ needle_.push(c);
+ }
+ } else {
+ let chars = chars::graphemes(needle).map(|mut c| {
+ match case {
+ #[cfg(feature = "unicode-casefold")]
+ CaseMatching::Ignore => c = chars::to_lower_case(c),
+ #[cfg(feature = "unicode-casefold")]
+ CaseMatching::Smart => {
+ ignore_case = ignore_case && !chars::is_upper_case(c);
+ }
+ CaseMatching::Respect => (),
+ }
+ match normalization {
+ #[cfg(feature = "unicode-normalization")]
+ Normalization::Smart => {
+ normalize = normalize && chars::normalize(c) == c;
+ }
+ Normalization::Never => (),
+ }
+ c
+ });
+ needle_.extend(chars);
+ };
+ if append_dollar {
+ needle_.push('$');
+ }
+ Utf32String::Unicode(needle_.into_boxed_slice())
+ };
+ Atom {
+ kind,
+ needle,
+ negative: false,
+ ignore_case,
+ normalize,
+ }
+ }
+
+ /// Parse a pattern atom from a string. Some special trailing and leading
+ /// characters can be used to control the atom kind. See [`AtomKind`] for
+ /// details.
+ pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
+ let mut atom = raw;
+ let invert = match atom.as_bytes() {
+ [b'!', ..] => {
+ atom = &atom[1..];
+ true
+ }
+ [b'\\', b'!', ..] => {
+ atom = &atom[1..];
+ false
+ }
+ _ => false,
+ };
+
+ let mut kind = match atom.as_bytes() {
+ [b'^', ..] => {
+ atom = &atom[1..];
+ AtomKind::Prefix
+ }
+ [b'\'', ..] => {
+ atom = &atom[1..];
+ AtomKind::Substring
+ }
+ [b'\\', b'^' | b'\'', ..] => {
+ atom = &atom[1..];
+ AtomKind::Fuzzy
+ }
+ _ => AtomKind::Fuzzy,
+ };
+
+ let mut append_dollar = false;
+ match atom.as_bytes() {
+ [.., b'\\', b'$'] => {
+ append_dollar = true;
+ atom = &atom[..atom.len() - 2]
+ }
+ [.., b'$'] => {
+ kind = if kind == AtomKind::Fuzzy {
+ AtomKind::Postfix
+ } else {
+ AtomKind::Exact
+ };
+ atom = &atom[..atom.len() - 1]
+ }
+ _ => (),
+ }
+
+ if invert && kind == AtomKind::Fuzzy {
+ kind = AtomKind::Substring
+ }
+
+ let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
+ pattern.negative = invert;
+ pattern
+ }
+
+ /// Matches this pattern against `haystack` (using the allocation and configuration
+ /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
+ /// Documentation for more details.
+ ///
+ /// *Note:* The `ignore_case` setting is overwritten to match the casing of
+ /// each pattern atom.
+ pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
+ matcher.config.ignore_case = self.ignore_case;
+ matcher.config.normalize = self.normalize;
+ let pattern_score = match self.kind {
+ AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
+ AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
+ AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
+ AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
+ AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
+ };
+ if self.negative {
+ if pattern_score.is_some() {
+ return None;
+ }
+ Some(0)
+ } else {
+ pattern_score
+ }
+ }
+
+ /// Matches this pattern against `haystack` (using the allocation and
+ /// configuration from `matcher`), calculates a ranking score and the match
+ /// indices. See the [`Matcher`]. Documentation for more
+ /// details.
+ ///
+ /// *Note:* The `ignore_case` setting is overwritten to match the casing of
+ /// each pattern atom.
+ ///
+ /// *Note:* The `indices` vector is not cleared by this function.
+ pub fn indices(
+ &self,
+ haystack: Utf32Str<'_>,
+ matcher: &mut Matcher,
+ indices: &mut Vec<u32>,
+ ) -> Option<u16> {
+ matcher.config.ignore_case = self.ignore_case;
+ matcher.config.normalize = self.normalize;
+ if self.negative {
+ let pattern_score = match self.kind {
+ AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
+ AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
+ AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
+ AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
+ AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
+ };
+ pattern_score.is_none().then_some(0)
+ } else {
+ match self.kind {
+ AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices),
+ AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices),
+ AtomKind::Substring => {
+ matcher.substring_indices(haystack, self.needle.slice(..), indices)
+ }
+ AtomKind::Prefix => {
+ matcher.prefix_indices(haystack, self.needle.slice(..), indices)
+ }
+ AtomKind::Postfix => {
+ matcher.postfix_indices(haystack, self.needle.slice(..), indices)
+ }
+ }
+ }
+ }
+
+ /// Returns the needle text that is passed to the matcher. All indices
+ /// produced by the `indices` functions produce char indices used to index
+ /// this text
+ pub fn needle_text(&self) -> Utf32Str<'_> {
+ self.needle.slice(..)
+ }
+ /// Convenience function to easily match (and sort) a (relatively small)
+ /// list of inputs.
+ ///
+ /// *Note* This function is not recommended for building a full fuzzy
+ /// matching application that can match large numbers of matches (like all
+ /// files in a directory) as all matching is done on the current thread,
+ /// effectively blocking the UI. For such applications the high level
+ /// `nucleo` crate can be used instead.
+ pub fn match_list<T: AsRef<str>>(
+ &self,
+ items: impl IntoIterator<Item = T>,
+ matcher: &mut Matcher,
+ ) -> Vec<(T, u16)> {
+ if self.needle.is_empty() {
+ return items.into_iter().map(|item| (item, 0)).collect();
+ }
+ let mut buf = Vec::new();
+ let mut items: Vec<_> = items
+ .into_iter()
+ .filter_map(|item| {
+ self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
+ .map(|score| (item, score))
+ })
+ .collect();
+ items.sort_by_key(|(_, score)| Reverse(*score));
+ items
+ }
+}
+
+fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
+ let mut saw_backslash = false;
+ pattern.split(move |c| {
+ saw_backslash = match c {
+ c if c.is_whitespace() && !saw_backslash => return true,
+ '\\' => true,
+ _ => false,
+ };
+ false
+ })
+}
+
+#[derive(Debug, Default)]
+/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
+#[non_exhaustive]
+pub struct Pattern {
+ /// The individual pattern (words) in this pattern
+ pub atoms: Vec<Atom>,
+}
+
+impl Pattern {
+ /// Creates a pattern where each word is matched individually (whitespaces
+ /// can be escaped with `\`). Otherwise no parsing is performed (so `$`, `!`,
+ /// `'` and `^` don't receive special treatment). If you want to match the entire
+ /// pattern as a single needle use a single [`Atom`] instead.
+ pub fn new(
+ pattern: &str,
+ case_matching: CaseMatching,
+ normalize: Normalization,
+ kind: AtomKind,
+ ) -> Pattern {
+ let atoms = pattern_atoms(pattern)
+ .filter_map(|pat| {
+ let pat = Atom::new(pat, case_matching, normalize, kind, true);
+ (!pat.needle.is_empty()).then_some(pat)
+ })
+ .collect();
+ Pattern { atoms }
+ }
+ /// Creates a pattern where each word is matched individually (whitespaces
+ /// can be escaped with `\`). And `$`, `!`, `'` and `^` at word boundaries will
+ /// cause different matching behaviour (see [`AtomKind`]). These can be
+ /// escaped with backslash.
+ pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
+ let atoms = pattern_atoms(pattern)
+ .filter_map(|pat| {
+ let pat = Atom::parse(pat, case_matching, normalize);
+ (!pat.needle.is_empty()).then_some(pat)
+ })
+ .collect();
+ Pattern { atoms }
+ }
+
+ /// Convenience function to easily match (and sort) a (relatively small)
+ /// list of inputs.
+ ///
+ /// *Note* This function is not recommended for building a full fuzzy
+ /// matching application that can match large numbers of matches (like all
+ /// files in a directory) as all matching is done on the current thread,
+ /// effectively blocking the UI. For such applications the high level
+ /// `nucleo` crate can be used instead.
+ pub fn match_list<T: AsRef<str>>(
+ &self,
+ items: impl IntoIterator<Item = T>,
+ matcher: &mut Matcher,
+ ) -> Vec<(T, u32)> {
+ if self.atoms.is_empty() {
+ return items.into_iter().map(|item| (item, 0)).collect();
+ }
+ let mut buf = Vec::new();
+ let mut items: Vec<_> = items
+ .into_iter()
+ .filter_map(|item| {
+ self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
+ .map(|score| (item, score))
+ })
+ .collect();
+ items.sort_by_key(|(_, score)| Reverse(*score));
+ items
+ }
+
+ /// Matches this pattern against `haystack` (using the allocation and configuration
+ /// from `matcher`) and calculates a ranking score. See the [`Matcher`]
+ /// documentation for more details.
+ ///
+ /// *Note:* The `ignore_case` setting is overwritten to match the casing of
+ /// each pattern atom.
+ pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
+ if self.atoms.is_empty() {
+ return Some(0);
+ }
+ let mut score = 0;
+ for pattern in &self.atoms {
+ score += pattern.score(haystack, matcher)? as u32;
+ }
+ Some(score)
+ }
+
+ /// Matches this pattern against `haystack` (using the allocation and
+ /// configuration from `matcher`), calculates a ranking score and the match
+ /// indices. See the [`Matcher`] documentation for more
+ /// details.
+ ///
+ /// *Note:* The `ignore_case` setting is overwritten to match the casing of
+ /// each pattern atom.
+ ///
+ /// *Note:* The indices for each pattern are calculated individually
+ /// and simply appended to the `indices` vector and not deduplicated/sorted.
+ /// This allows associating the match indices to their source pattern. If
+ /// required (like for highlighting) unique/sorted indices can be obtained
+ /// as follows:
+ ///
+ /// ```
+ /// # let mut indices: Vec<u32> = Vec::new();
+ /// indices.sort_unstable();
+ /// indices.dedup();
+ /// ```
+ pub fn indices(
+ &self,
+ haystack: Utf32Str<'_>,
+ matcher: &mut Matcher,
+ indices: &mut Vec<u32>,
+ ) -> Option<u32> {
+ if self.atoms.is_empty() {
+ return Some(0);
+ }
+ let mut score = 0;
+ for pattern in &self.atoms {
+ score += pattern.indices(haystack, matcher, indices)? as u32;
+ }
+ Some(score)
+ }
+
+ /// Refreshes this pattern by reparsing it from a string. This is mostly
+ /// equivalent to just constructing a new pattern using [`Pattern::parse`]
+ /// but is slightly more efficient by reusing some allocations
+ pub fn reparse(
+ &mut self,
+ pattern: &str,
+ case_matching: CaseMatching,
+ normalize: Normalization,
+ ) {
+ self.atoms.clear();
+ let atoms = pattern_atoms(pattern).filter_map(|atom| {
+ let atom = Atom::parse(atom, case_matching, normalize);
+ if atom.needle.is_empty() {
+ return None;
+ }
+ Some(atom)
+ });
+ self.atoms.extend(atoms);
+ }
+}
+
+impl Clone for Pattern {
+ fn clone(&self) -> Self {
+ Self {
+ atoms: self.atoms.clone(),
+ }
+ }
+
+ fn clone_from(&mut self, source: &Self) {
+ self.atoms.clone_from(&source.atoms);
+ }
+}