diff options
author | Benedikt Peetz <benedikt.peetz@b-peetz.de> | 2024-08-22 14:01:22 +0200 |
---|---|---|
committer | Benedikt Peetz <benedikt.peetz@b-peetz.de> | 2024-08-22 14:01:22 +0200 |
commit | 53f49fa9279ac86944dfdc48f80a5783430632bf (patch) | |
tree | 6732d9b1f167d4cfe091b30378e1d5c6a8f4a4cb | |
parent | build(package): Include python `blake3` dependency (diff) | |
download | yt-53f49fa9279ac86944dfdc48f80a5783430632bf.zip |
perf(raw_update.py)!: Don't fetch entries that are already in the database
Testing has resulted in a speed-up of circa 3400% (updates of 1 subscription (which is already fully stored) are now ca. 1 sec from previously 34 sec).

BREAKING CHANGE: The extractor hash is now calculated from the `id` and not the `webpage_url` field, requiring a complete re-fetch of all stored videos.

```bash
$# export your subscriptions:
$ yt subs list --urls > subs.txt
$# remove the old database
$ mv ~/.local/share/yt/videos.sqlite{,.old}
$# reimport the subscriptions
$ yt subs import subs.txt
$# refetch all videos
$ yt update
```
-rwxr-xr-x | python_update/raw_update.py | 32 | ||||
-rw-r--r-- | src/update/mod.rs | 24 |
2 files changed, 31 insertions, 25 deletions
diff --git a/python_update/raw_update.py b/python_update/raw_update.py index 82be0a1..6f5b78d 100755 --- a/python_update/raw_update.py +++ b/python_update/raw_update.py @@ -13,14 +13,15 @@ # This has been take from the `ytcc` updater code (at `8893bc98428cb78d458a9cf3ded03f519d86a46b`). # Source URL: https://github.com/woefe/ytcc/commit/8893bc98428cb78d458a9cf3ded03f519d86a46b +from blake3 import blake3 +from dataclasses import dataclass +from functools import partial +from typing import Any, Iterable, Optional, Tuple, TypeVar import asyncio import itertools import json import logging import sys -from dataclasses import dataclass -from functools import partial -from typing import Any, Iterable, Optional, Tuple, TypeVar import yt_dlp @@ -85,7 +86,9 @@ class Fetcher: "extractor_args": {"youtubetab": {"approximate_date": [""]}}, } - async def get_unprocessed_entries(self, url: str) -> Iterable[Tuple[str, Any]]: + async def get_unprocessed_entries( + self, url: str, hashes: Iterable[str] + ) -> Iterable[Tuple[str, str, Any]]: result = [] with yt_dlp.YoutubeDL(self.ydl_opts) as ydl: logger.info("Checking playlist '%s'...", url) @@ -104,7 +107,10 @@ class Fetcher: else: entries = info.get("entries", []) for entry in take(self.max_items, entries): - result.append((url, entry)) + id = str.encode(yt_dlp.utils.unsmuggle_url(entry["id"])[0]) + ehash = blake3(id).hexdigest() + if ehash not in hashes: + result.append((url, entry)) return result def _process_ie(self, entry): @@ -135,10 +141,11 @@ class Updater: def __init__(self, max_backlog=20): self.max_items = max_backlog self.fetcher = Fetcher(max_backlog) + self.hashes = None async def update_url(self, url: str): print(f"Updating {url}...", file=sys.stderr) - new_entries = await self.fetcher.get_unprocessed_entries(url) + new_entries = await self.fetcher.get_unprocessed_entries(url, self.hashes) await asyncio.gather( *itertools.starmap(self.fetcher.process_entry, new_entries) @@ -147,14 +154,17 @@ class Updater: async def 
do_update(self, urls: Iterable[str]): await asyncio.gather(*map(self.update_url, urls)) - def update(self, urls: Iterable[str]): + def update(self, urls: Iterable[str], hashes: Iterable[str]): + self.hashes = hashes asyncio.run(self.do_update(urls)) -def update(max_backlog: int): +def update(): + max_backlog = int(sys.argv[1]) + subscriptions_number = int(sys.argv[2]) u = Updater(max_backlog=max_backlog) - u.update(sys.argv[2:]) + u.update(sys.argv[3:(3 + subscriptions_number)], sys.argv[(3 + subscriptions_number):]) -max_backlog = int(sys.argv[1]) -update(max_backlog) +print(sys.argv, file=sys.stderr) +update() diff --git a/src/update/mod.rs b/src/update/mod.rs index bdd6c27..119c53c 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -53,9 +53,15 @@ pub async fn update( } } + // We can get away with not having to re-fetch the hashes every time, as the returned video + // should not contain duplicates. + let hashes = get_all_hashes(app).await?; + let mut child = Command::new("raw_update.py") .arg(max_backlog.to_string()) + .arg(urls.len().to_string()) .args(&urls) + .args(&hashes.iter().map(|haz| haz.to_string()).collect::<Vec<_>>()) .stdout(Stdio::piped()) .stderr(Stdio::null()) .stdin(Stdio::null()) @@ -70,10 +76,6 @@ pub async fn update( ) .lines(); - // We can get away with not having to re-fetch the hashes every time, as the returned video - // should not contain duplicates. - let hashes = get_all_hashes(app).await?; - while let Some(line) = out.next_line().await? { // use tokio::{fs::File, io::AsyncWriteExt}; // let mut output = File::create("output.json").await?; @@ -93,7 +95,7 @@ pub async fn update( let out = child.wait().await?; if out.success() { - error!("A yt update-once invokation failed for all subscriptions.") + error!("The update_raw.py invokation failed for all subscriptions.") } Ok(()) @@ -174,16 +176,11 @@ async fn process_subscription( unsmuggle_url(smug_url)? 
}; - let extractor_hash = blake3::hash(url.as_str().as_bytes()); + let extractor_hash = blake3::hash(unwrap_option!(entry.id).as_bytes()); if hashes.contains(&extractor_hash) { // We already stored the video information - println!( - "(Ignoring duplicated video from: '{}' -> '{}')", - sub.name, - unwrap_option!(entry.title) - ); - return Ok(()); + unreachable!("The python update script should have never provided us a duplicated video"); } else { let video = Video { cache_path: None, @@ -203,7 +200,6 @@ async fn process_subscription( println!("{}", video.to_color_display()); add_video(app, video).await?; + Ok(()) } - - Ok(()) } |