aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBenedikt Peetz <benedikt.peetz@b-peetz.de>2024-08-22 14:01:22 +0200
committerBenedikt Peetz <benedikt.peetz@b-peetz.de>2024-08-22 14:01:22 +0200
commit53f49fa9279ac86944dfdc48f80a5783430632bf (patch)
tree6732d9b1f167d4cfe091b30378e1d5c6a8f4a4cb
parentbuild(package): Include python `blake3` dependency (diff)
downloadyt-53f49fa9279ac86944dfdc48f80a5783430632bf.zip
perf(raw_update.py)!: Don't fetch entries that are already in the database
Testing has resulted in a speed-up of circa 3400% (updates of 1 subscription (which is already fully stored) are now ca. 1 sec from previously 34 sec). BREAKING CHANGE: The extractor hash is now calculated from the `id` and not the `webpage_url` field requiring a complete re-fetch of all stored videos. ```bash $# export your subscriptions: $ yt subs list --urls > subs.txt $# remove the old database $ mv ~/.local/share/yt/videos.sqlite{,.old} $# reimport the subsciptions $ yt subs import subs.txt $# refetch all videos $ yt upadate ```
-rwxr-xr-xpython_update/raw_update.py32
-rw-r--r--src/update/mod.rs24
2 files changed, 31 insertions, 25 deletions
diff --git a/python_update/raw_update.py b/python_update/raw_update.py
index 82be0a1..6f5b78d 100755
--- a/python_update/raw_update.py
+++ b/python_update/raw_update.py
@@ -13,14 +13,15 @@
# This has been take from the `ytcc` updater code (at `8893bc98428cb78d458a9cf3ded03f519d86a46b`).
# Source URL: https://github.com/woefe/ytcc/commit/8893bc98428cb78d458a9cf3ded03f519d86a46b
+from blake3 import blake3
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Iterable, Optional, Tuple, TypeVar
import asyncio
import itertools
import json
import logging
import sys
-from dataclasses import dataclass
-from functools import partial
-from typing import Any, Iterable, Optional, Tuple, TypeVar
import yt_dlp
@@ -85,7 +86,9 @@ class Fetcher:
"extractor_args": {"youtubetab": {"approximate_date": [""]}},
}
- async def get_unprocessed_entries(self, url: str) -> Iterable[Tuple[str, Any]]:
+ async def get_unprocessed_entries(
+ self, url: str, hashes: Iterable[str]
+ ) -> Iterable[Tuple[str, str, Any]]:
result = []
with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
logger.info("Checking playlist '%s'...", url)
@@ -104,7 +107,10 @@ class Fetcher:
else:
entries = info.get("entries", [])
for entry in take(self.max_items, entries):
- result.append((url, entry))
+ id = str.encode(yt_dlp.utils.unsmuggle_url(entry["id"])[0])
+ ehash = blake3(id).hexdigest()
+ if ehash not in hashes:
+ result.append((url, entry))
return result
def _process_ie(self, entry):
@@ -135,10 +141,11 @@ class Updater:
def __init__(self, max_backlog=20):
self.max_items = max_backlog
self.fetcher = Fetcher(max_backlog)
+ self.hashes = None
async def update_url(self, url: str):
print(f"Updating {url}...", file=sys.stderr)
- new_entries = await self.fetcher.get_unprocessed_entries(url)
+ new_entries = await self.fetcher.get_unprocessed_entries(url, self.hashes)
await asyncio.gather(
*itertools.starmap(self.fetcher.process_entry, new_entries)
@@ -147,14 +154,17 @@ class Updater:
async def do_update(self, urls: Iterable[str]):
await asyncio.gather(*map(self.update_url, urls))
- def update(self, urls: Iterable[str]):
+ def update(self, urls: Iterable[str], hashes: Iterable[str]):
+ self.hashes = hashes
asyncio.run(self.do_update(urls))
-def update(max_backlog: int):
+def update():
+ max_backlog = int(sys.argv[1])
+ subscriptions_number = int(sys.argv[2])
u = Updater(max_backlog=max_backlog)
- u.update(sys.argv[2:])
+ u.update(sys.argv[3:(3 + subscriptions_number)], sys.argv[(3 + subscriptions_number):])
-max_backlog = int(sys.argv[1])
-update(max_backlog)
+print(sys.argv, file=sys.stderr)
+update()
diff --git a/src/update/mod.rs b/src/update/mod.rs
index bdd6c27..119c53c 100644
--- a/src/update/mod.rs
+++ b/src/update/mod.rs
@@ -53,9 +53,15 @@ pub async fn update(
}
}
+ // We can get away with not having to re-fetch the hashes every time, as the returned video
+ // should not contain duplicates.
+ let hashes = get_all_hashes(app).await?;
+
let mut child = Command::new("raw_update.py")
.arg(max_backlog.to_string())
+ .arg(urls.len().to_string())
.args(&urls)
+ .args(&hashes.iter().map(|haz| haz.to_string()).collect::<Vec<_>>())
.stdout(Stdio::piped())
.stderr(Stdio::null())
.stdin(Stdio::null())
@@ -70,10 +76,6 @@ pub async fn update(
)
.lines();
- // We can get away with not having to re-fetch the hashes every time, as the returned video
- // should not contain duplicates.
- let hashes = get_all_hashes(app).await?;
-
while let Some(line) = out.next_line().await? {
// use tokio::{fs::File, io::AsyncWriteExt};
// let mut output = File::create("output.json").await?;
@@ -93,7 +95,7 @@ pub async fn update(
let out = child.wait().await?;
if out.success() {
- error!("A yt update-once invokation failed for all subscriptions.")
+ error!("The update_raw.py invokation failed for all subscriptions.")
}
Ok(())
@@ -174,16 +176,11 @@ async fn process_subscription(
unsmuggle_url(smug_url)?
};
- let extractor_hash = blake3::hash(url.as_str().as_bytes());
+ let extractor_hash = blake3::hash(unwrap_option!(entry.id).as_bytes());
if hashes.contains(&extractor_hash) {
// We already stored the video information
- println!(
- "(Ignoring duplicated video from: '{}' -> '{}')",
- sub.name,
- unwrap_option!(entry.title)
- );
- return Ok(());
+ unreachable!("The python update script should have never provided us a duplicated video");
} else {
let video = Video {
cache_path: None,
@@ -203,7 +200,6 @@ async fn process_subscription(
println!("{}", video.to_color_display());
add_video(app, video).await?;
+ Ok(())
}
-
- Ok(())
}