1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
|
use std::cmp::max;
use crate::chars::{Char, CharClass};
use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell};
use crate::score::{
BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS,
PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH,
};
use crate::{Config, Matcher};
impl Matcher {
pub(crate) fn fuzzy_match_optimal<const INDICES: bool, H: Char + PartialEq<N>, N: Char>(
&mut self,
haystack: &[H],
needle: &[N],
start: usize,
greedy_end: usize,
end: usize,
indices: &mut Vec<u32>,
) -> Option<u16> {
// construct a matrix (and copy the haystack), the matrix and haystack size are bounded
// to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows
// us to treat needle indices as u16
let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else {
return self.fuzzy_match_greedy_::<INDICES, H, N>(
haystack, needle, start, greedy_end, indices,
);
};
let prev_class = start
.checked_sub(1)
.map(|i| haystack[i].char_class(&self.config))
.unwrap_or(self.config.initial_char_class);
let matched = matrix.setup::<INDICES, _>(needle, prev_class, &self.config, start as u32);
// this only happened with unicode haystacks, for ASCII the prefilter handles all rejects
if !matched {
assert!(
!N::ASCII || !H::ASCII,
"Non-match should have been caught by prefilter. Maybe `needle` is not normalized?"
);
return None;
}
// populate the matrix and find the best score
let matrix_len = matrix.populate_matrix::<INDICES, _>(needle);
let last_row_off = matrix.row_offs[needle.len() - 1];
let relative_last_row_off = last_row_off as usize + 1 - needle.len();
let (match_end, match_score_cell) = matrix.current_row[relative_last_row_off..]
.iter()
.enumerate()
.max_by_key(|(_, cell)| cell.score)
.expect("there must be at least one match");
if INDICES {
matrix.reconstruct_optimal_path(match_end as u16, indices, matrix_len, start as u32);
}
Some(match_score_cell.score)
}
}
const UNMATCHED: ScoreCell = ScoreCell {
score: 0,
// if matched is true then the consecutive bonus
// is always at least BONUS_CONSECUTIVE so
// this constant can never occur naturally
consecutive_bonus: 0,
matched: true,
};
fn next_m_cell(p_score: u16, bonus: u16, m_cell: ScoreCell) -> ScoreCell {
if m_cell == UNMATCHED {
return ScoreCell {
score: p_score + bonus + SCORE_MATCH,
matched: false,
consecutive_bonus: bonus as u8,
};
}
let mut consecutive_bonus = max(m_cell.consecutive_bonus as u16, BONUS_CONSECUTIVE);
if bonus >= BONUS_BOUNDARY && bonus > consecutive_bonus {
consecutive_bonus = bonus
}
let score_match = m_cell.score + max(consecutive_bonus, bonus);
let score_skip = p_score + bonus;
if score_match > score_skip {
ScoreCell {
score: score_match + SCORE_MATCH,
matched: true,
consecutive_bonus: consecutive_bonus as u8,
}
} else {
ScoreCell {
score: score_skip + SCORE_MATCH,
matched: false,
consecutive_bonus: bonus as u8,
}
}
}
fn p_score(prev_p_score: u16, prev_m_score: u16) -> (u16, bool) {
let score_match = prev_m_score.saturating_sub(PENALTY_GAP_START);
let score_skip = prev_p_score.saturating_sub(PENALTY_GAP_EXTENSION);
if score_match > score_skip {
(score_match, true)
} else {
(score_skip, false)
}
}
impl<H: Char> MatcherDataView<'_, H> {
fn setup<const INDICES: bool, N: Char>(
&mut self,
needle: &[N],
mut prev_class: CharClass,
config: &Config,
start: u32,
) -> bool
where
H: PartialEq<N>,
{
let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut());
let (mut needle_char, mut row_start) = row_iter.next().unwrap();
let col_iter = self
.haystack
.iter_mut()
.zip(self.bonus.iter_mut())
.enumerate();
let mut matched = false;
for (i, (c_, bonus_)) in col_iter {
let (c, class) = c_.char_class_and_normalize(config);
*c_ = c;
let bonus = config.bonus_for(prev_class, class);
// save bonus for later so we don't have to recompute it each time
*bonus_ = bonus as u8;
prev_class = class;
let i = i as u16;
if c == needle_char {
// save the first idx of each char
if let Some(next) = row_iter.next() {
*row_start = i;
(needle_char, row_start) = next;
} else if !matched {
*row_start = i;
// we have at least one match
matched = true;
}
}
}
if !matched {
return false;
}
debug_assert_eq!(self.row_offs[0], 0);
Self::score_row::<true, INDICES, _>(
self.current_row,
self.matrix_cells,
self.haystack,
self.bonus,
0,
self.row_offs[1],
0,
needle[0],
needle[1],
if config.prefer_prefix {
if start == 0 {
MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE
} else {
(MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub(
(start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION,
)
}
} else {
0
},
);
true
}
#[expect(clippy::too_many_arguments)]
fn score_row<const FIRST_ROW: bool, const INDICES: bool, N: Char>(
current_row: &mut [ScoreCell],
matrix_cells: &mut [MatrixCell],
haystack: &[H],
bonus: &[u8],
row_off: u16,
mut next_row_off: u16,
needle_idx: u16,
needle_char: N,
next_needle_char: N,
mut prefix_bonus: u16,
) where
H: PartialEq<N>,
{
next_row_off -= 1;
let relative_row_off = row_off - needle_idx;
let next_relative_row_off = next_row_off - needle_idx;
let skipped_col_iter = haystack[row_off as usize..next_row_off as usize]
.iter()
.zip(bonus[row_off as usize..next_row_off as usize].iter())
.zip(current_row[relative_row_off as usize..next_relative_row_off as usize].iter_mut())
.zip(matrix_cells.iter_mut());
let mut prev_p_score = 0;
let mut prev_m_score = 0;
for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW {
let cell = if c == needle_char {
ScoreCell {
score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false,
consecutive_bonus: *bonus,
}
} else {
UNMATCHED
};
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else {
*score_cell
};
if INDICES {
matrix_cell.set(p_matched, m_cell.matched);
}
prev_p_score = p_score;
prev_m_score = m_cell.score;
}
let col_iter = haystack[next_row_off as usize..]
.windows(2)
.zip(bonus[next_row_off as usize..].windows(2))
.zip(current_row[next_relative_row_off as usize..].iter_mut())
.zip(matrix_cells[(next_relative_row_off - relative_row_off) as usize..].iter_mut());
for (((c, bonus), score_cell), matrix_cell) in col_iter {
let (p_score, p_matched) = p_score(prev_p_score, prev_m_score);
let m_cell = if FIRST_ROW {
let cell = if c[0] == needle_char {
ScoreCell {
score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER
+ SCORE_MATCH
+ prefix_bonus / PREFIX_BONUS_SCALE,
matched: false,
consecutive_bonus: bonus[0],
}
} else {
UNMATCHED
};
prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION);
cell
} else {
*score_cell
};
*score_cell = if c[1] == next_needle_char {
next_m_cell(p_score, bonus[1] as u16, m_cell)
} else {
UNMATCHED
};
if INDICES {
matrix_cell.set(p_matched, m_cell.matched);
}
prev_p_score = p_score;
prev_m_score = m_cell.score;
}
}
fn populate_matrix<const INDICES: bool, N: Char>(&mut self, needle: &[N]) -> usize
where
H: PartialEq<N>,
{
let mut matrix_cells = &mut self.matrix_cells[self.current_row.len()..];
let mut row_iter = needle[1..]
.iter()
.copied()
.zip(self.row_offs[1..].iter().copied())
.enumerate();
let (mut needle_idx, (mut needle_char, mut row_off)) = row_iter.next().unwrap();
for (next_needle_idx, (next_needle_char, next_row_off)) in row_iter {
Self::score_row::<false, INDICES, _>(
self.current_row,
matrix_cells,
self.haystack,
self.bonus,
row_off,
next_row_off,
needle_idx as u16 + 1,
needle_char,
next_needle_char,
0,
);
let len = self.current_row.len() + needle_idx + 1 - row_off as usize;
matrix_cells = &mut matrix_cells[len..];
(needle_idx, needle_char, row_off) = (next_needle_idx, next_needle_char, next_row_off);
}
matrix_cells.as_ptr() as usize - self.matrix_cells.as_ptr() as usize
}
fn reconstruct_optimal_path(
&self,
max_score_end: u16,
indices: &mut Vec<u32>,
matrix_len: usize,
start: u32,
) {
let indices_start = indices.len();
indices.resize(indices_start + self.row_offs.len(), 0);
let indices = &mut indices[indices_start..];
let last_row_off = *self.row_offs.last().unwrap();
indices[self.row_offs.len() - 1] = start + max_score_end as u32 + last_row_off as u32;
let mut matrix_cells = &self.matrix_cells[..matrix_len];
let width = self.current_row.len();
let mut row_iter = self.row_offs[..self.row_offs.len() - 1]
.iter()
.copied()
.enumerate()
.rev()
.map(|(i, off)| {
let relative_off = off as usize - i;
let row;
(matrix_cells, row) =
matrix_cells.split_at(matrix_cells.len() - (width - relative_off));
(i, off, row)
});
let (mut row_idx, mut row_off, mut row) = row_iter.next().unwrap();
let mut col = max_score_end;
let relative_last_row_off = last_row_off as usize + 1 - self.row_offs.len();
let mut matched = self.current_row[col as usize + relative_last_row_off].matched;
col += last_row_off - row_off - 1;
loop {
if matched {
indices[row_idx] = start + col as u32 + row_off as u32;
}
let next_matched = row[col as usize].get(matched);
if matched {
let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else {
break;
};
col += row_off - next_row_off;
(row_idx, row_off, row) = (next_row_idx, next_row_off, next_row)
}
col -= 1;
matched = next_matched;
}
}
}
|