perf(search): benchmark smart sort (#2202)

author: Ellie Huxtable <ellie@elliehuxtable.com> 2024-06-26 12:40:17 +0100
committer: GitHub <noreply@github.com> 2024-06-26 12:40:17 +0100
commit: 1201caee5c7b3020e1c879e527feb934fd1d8023 (patch)
tree: 5af6ecc78075401e459c97b65f74689e141d5137 /crates/atuin-history
parent: feat: add several other GitHub access token patterns (#2200) (diff)
download: atuin-1201caee5c7b3020e1c879e527feb934fd1d8023.zip
4 files changed, 91 insertions, 0 deletions
diff --git a/crates/atuin-history/Cargo.toml b/crates/atuin-history/Cargo.toml
index 4836cf74..2ed0e1a5 100644
--- a/crates/atuin-history/Cargo.toml
+++ b/crates/atuin-history/Cargo.toml
@@ -39,3 +39,12 @@ tracing = "0.1"
 uuid = { workspace = true }
 unicode-segmentation = "1.11.0"
 sysinfo = "0.30.7"
+
+[dev-dependencies]
+tracing-tree = "0.3"
+divan = "0.1.14"
+rand = { workspace = true }
+
+[[bench]]
+name = "smart_sort"
+harness = false
diff --git a/crates/atuin-history/benches/smart_sort.rs b/crates/atuin-history/benches/smart_sort.rs
new file mode 100644
index 00000000..a78064de
--- /dev/null
+++ b/crates/atuin-history/benches/smart_sort.rs
@@ -0,0 +1,35 @@
+use atuin_client::history::History;
+use atuin_history::sort::sort;
+
+use rand::Rng;
+
+fn main() {
+    // Run registered benchmarks.
+    divan::main();
+}
+
+// Smart sort usually runs on 200 entries, test on a few sizes
+#[divan::bench(args=[100, 200, 400, 800, 1600, 10000])]
+fn smart_sort(lines: usize) {
+    // benchmark a few different sizes of "history"
+    // first we need to generate some history. This will use a whole bunch of memory, sorry
+    let mut rng = rand::thread_rng();
+    let now = time::OffsetDateTime::now_utc().unix_timestamp();
+
+    let possible_commands = ["echo", "ls", "cd", "grep", "atuin", "curl"];
+    let mut commands = Vec::<History>::with_capacity(lines);
+
+    for _ in 0..lines {
+        let command = possible_commands[rng.gen_range(0..possible_commands.len())];
+
+        let command = History::import()
+            .command(command)
+            .timestamp(time::OffsetDateTime::from_unix_timestamp(rng.gen_range(0..now)).unwrap())
+            .build()
+            .into();
+
+        commands.push(command);
+    }
+
+    let _ = sort("curl", commands);
+}
diff --git a/crates/atuin-history/src/lib.rs b/crates/atuin-history/src/lib.rs
index 9d34677f..e7b33916 100644
--- a/crates/atuin-history/src/lib.rs
+++ b/crates/atuin-history/src/lib.rs
@@ -1 +1,2 @@
+pub mod sort;
 pub mod stats;
diff --git a/crates/atuin-history/src/sort.rs b/crates/atuin-history/src/sort.rs
new file mode 100644
index 00000000..4465a142
--- /dev/null
+++ b/crates/atuin-history/src/sort.rs
@@ -0,0 +1,46 @@
+use atuin_client::history::History;
+
+type ScoredHistory = (f64, History);
+
+// Fuzzy search already comes sorted by minspan
+// This sorting should be applicable to all search modes, and solve the more "obvious" issues
+// first.
+// Later on, we can pass in context and do some boosts there too.
+pub fn sort(query: &str, input: Vec<History>) -> Vec<History> {
+    // This can totally be extended. We need to be _careful_ that it's not slow.
+    // We also need to balance sorting db-side with sorting here. SQLite can do a lot,
+    // but some things are just much easier/more doable in Rust.
+
+    let mut scored = input
+        .into_iter()
+        .map(|h| {
+            // If history is _prefixed_ with the query, score it more highly
+            let score = if h.command.starts_with(query) {
+                2.0
+            } else if h.command.contains(query) {
+                1.75
+            } else {
+                1.0
+            };
+
+            // calculate how long ago the history was, in seconds
+            let now = time::OffsetDateTime::now_utc().unix_timestamp();
+            let time = h.timestamp.unix_timestamp();
+            let diff = std::cmp::max(1, now - time); // no /0 please
+
+            // prefer newer history, but not hugely so as to offset the other scoring
+            // the numbers will get super small over time, but I don't want time to overpower other
+            // scoring
+            #[allow(clippy::cast_precision_loss)]
+            let time_score = 1.0 + (1.0 / diff as f64);
+            let score = score * time_score;
+
+            (score, h)
+        })
+        .collect::<Vec<ScoredHistory>>();
+
+    scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap().reverse());
+
+    // Remove the scores and return the history
+    scored.into_iter().map(|(_, h)| h).collect::<Vec<History>>()
+}
author	Ellie Huxtable <ellie@elliehuxtable.com>	2024-06-26 12:40:17 +0100
committer	GitHub <noreply@github.com>	2024-06-26 12:40:17 +0100
commit	1201caee5c7b3020e1c879e527feb934fd1d8023 (patch)
tree	5af6ecc78075401e459c97b65f74689e141d5137 /crates/atuin-history
parent	feat: add several other GitHub access token patterns (#2200) (diff)
download	atuin-1201caee5c7b3020e1c879e527feb934fd1d8023.zip