From 53f49fa9279ac86944dfdc48f80a5783430632bf Mon Sep 17 00:00:00 2001 From: Benedikt Peetz Date: Thu, 22 Aug 2024 14:01:22 +0200 Subject: perf(raw_update.py)!: Don't fetch entries that are already in the database Testing has resulted in a speed-up of circa 3400% (updates of 1 subscription (which is already fully stored) are now ca. 1 sec from previously 34 sec). BREAKING CHANGE: The extractor hash is now calculated from the `id` and not the `webpage_url` field requiring a complete re-fetch of all stored videos. ```bash $# export your subscriptions: $ yt subs list --urls > subs.txt $# remove the old database $ mv ~/.local/share/yt/videos.sqlite{,.old} $# reimport the subscriptions $ yt subs import subs.txt $# refetch all videos $ yt update ``` --- src/update/mod.rs | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'src/update') diff --git a/src/update/mod.rs b/src/update/mod.rs index bdd6c27..119c53c 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -53,9 +53,15 @@ pub async fn update( } } + // We can get away with not having to re-fetch the hashes every time, as the returned video + // should not contain duplicates. + let hashes = get_all_hashes(app).await?; + let mut child = Command::new("raw_update.py") .arg(max_backlog.to_string()) + .arg(urls.len().to_string()) .args(&urls) + .args(&hashes.iter().map(|haz| haz.to_string()).collect::<Vec<_>>()) .stdout(Stdio::piped()) .stderr(Stdio::null()) .stdin(Stdio::null()) @@ -70,10 +76,6 @@ pub async fn update( ) .lines(); - // We can get away with not having to re-fetch the hashes every time, as the returned video - // should not contain duplicates. - let hashes = get_all_hashes(app).await?; - while let Some(line) = out.next_line().await? 
{ // use tokio::{fs::File, io::AsyncWriteExt}; // let mut output = File::create("output.json").await?; @@ -93,7 +95,7 @@ pub async fn update( let out = child.wait().await?; if out.success() { - error!("A yt update-once invokation failed for all subscriptions.") + error!("The update_raw.py invokation failed for all subscriptions.") } Ok(()) @@ -174,16 +176,11 @@ async fn process_subscription( unsmuggle_url(smug_url)? }; - let extractor_hash = blake3::hash(url.as_str().as_bytes()); + let extractor_hash = blake3::hash(unwrap_option!(entry.id).as_bytes()); if hashes.contains(&extractor_hash) { // We already stored the video information - println!( - "(Ignoring duplicated video from: '{}' -> '{}')", - sub.name, - unwrap_option!(entry.title) - ); - return Ok(()); + unreachable!("The python update script should have never provided us a duplicated video"); } else { let video = Video { cache_path: None, @@ -203,7 +200,6 @@ async fn process_subscription( println!("{}", video.to_color_display()); add_video(app, video).await?; + Ok(()) } - - Ok(()) } -- cgit 1.4.1