about summary refs log tree commit diff stats
path: root/python_update
diff options
context:
space:
mode:
authorBenedikt Peetz <benedikt.peetz@b-peetz.de>2024-08-22 14:01:22 +0200
committerBenedikt Peetz <benedikt.peetz@b-peetz.de>2024-08-22 14:01:22 +0200
commit53f49fa9279ac86944dfdc48f80a5783430632bf (patch)
tree6732d9b1f167d4cfe091b30378e1d5c6a8f4a4cb /python_update
parentbuild(package): Include python `blake3` dependency (diff)
downloadyt-53f49fa9279ac86944dfdc48f80a5783430632bf.zip
perf(raw_update.py)!: Don't fetch entries that are already in the database
Testing has resulted in a speed-up of circa 3400% (updates of 1
subscription (which is already fully stored) are now ca. 1 sec from
previously 34 sec).

BREAKING CHANGE: The extractor hash is now calculated from the `id` and
not the `webpage_url` field requiring a complete re-fetch of all stored
videos.
```bash
$# export your subscriptions:
$ yt subs list --urls > subs.txt
$# remove the old database
$ mv ~/.local/share/yt/videos.sqlite{,.old}
$# reimport the subsciptions
$ yt subs import subs.txt
$# refetch all videos
$ yt upadate
```
Diffstat (limited to '')
-rwxr-xr-xpython_update/raw_update.py32
1 files changed, 21 insertions, 11 deletions
diff --git a/python_update/raw_update.py b/python_update/raw_update.py
index 82be0a1..6f5b78d 100755
--- a/python_update/raw_update.py
+++ b/python_update/raw_update.py
@@ -13,14 +13,15 @@
 # This has been take from the `ytcc` updater code (at `8893bc98428cb78d458a9cf3ded03f519d86a46b`).
 # Source URL: https://github.com/woefe/ytcc/commit/8893bc98428cb78d458a9cf3ded03f519d86a46b
 
+from blake3 import blake3
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Iterable, Optional, Tuple, TypeVar
 import asyncio
 import itertools
 import json
 import logging
 import sys
-from dataclasses import dataclass
-from functools import partial
-from typing import Any, Iterable, Optional, Tuple, TypeVar
 
 import yt_dlp
 
@@ -85,7 +86,9 @@ class Fetcher:
             "extractor_args": {"youtubetab": {"approximate_date": [""]}},
         }
 
-    async def get_unprocessed_entries(self, url: str) -> Iterable[Tuple[str, Any]]:
+    async def get_unprocessed_entries(
+        self, url: str, hashes: Iterable[str]
+    ) -> Iterable[Tuple[str, str, Any]]:
         result = []
         with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
             logger.info("Checking playlist '%s'...", url)
@@ -104,7 +107,10 @@ class Fetcher:
             else:
                 entries = info.get("entries", [])
                 for entry in take(self.max_items, entries):
-                    result.append((url, entry))
+                    id = str.encode(yt_dlp.utils.unsmuggle_url(entry["id"])[0])
+                    ehash = blake3(id).hexdigest()
+                    if ehash not in hashes:
+                        result.append((url, entry))
         return result
 
     def _process_ie(self, entry):
@@ -135,10 +141,11 @@ class Updater:
     def __init__(self, max_backlog=20):
         self.max_items = max_backlog
         self.fetcher = Fetcher(max_backlog)
+        self.hashes = None
 
     async def update_url(self, url: str):
         print(f"Updating {url}...", file=sys.stderr)
-        new_entries = await self.fetcher.get_unprocessed_entries(url)
+        new_entries = await self.fetcher.get_unprocessed_entries(url, self.hashes)
 
         await asyncio.gather(
             *itertools.starmap(self.fetcher.process_entry, new_entries)
@@ -147,14 +154,17 @@ class Updater:
     async def do_update(self, urls: Iterable[str]):
         await asyncio.gather(*map(self.update_url, urls))
 
-    def update(self, urls: Iterable[str]):
+    def update(self, urls: Iterable[str], hashes: Iterable[str]):
+        self.hashes = hashes
         asyncio.run(self.do_update(urls))
 
 
-def update(max_backlog: int):
+def update():
+    max_backlog = int(sys.argv[1])
+    subscriptions_number = int(sys.argv[2])
     u = Updater(max_backlog=max_backlog)
-    u.update(sys.argv[2:])
+    u.update(sys.argv[3:(3 + subscriptions_number)], sys.argv[(3 + subscriptions_number):])
 
 
-max_backlog = int(sys.argv[1])
-update(max_backlog)
+print(sys.argv, file=sys.stderr)
+update()