1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
#!/usr/bin/env python3
"""
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT
Copyright © 2021 pukkandan.ytdlp@gmail.com
Copyright © 2024 soispha@vhack.eu
* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote
* Change FIELDS according to your needs
The output is printed to stdout in the format:
[{
'text': 'comment 1',
...
'replies': [{
'text': 'reply 1',
...
'replies': [...],
}, ...],
}, ...]
"""
import argparse
import json
import sys
from datetime import datetime
from datetime import timezone
def eprint(*args, **kwargs):
    """Print to standard error.

    Takes the same positional and keyword arguments as the built-in
    ``print``, except ``file``: the destination is always ``sys.stderr``.
    """
    print(*args, **kwargs, file=sys.stderr)
def get_fields(dct):
    """Lazily yield ``(name, value)`` pairs for one comment dict.

    Every extractor in ``FIELDS`` is called as ``extractor(dct, name)``, in
    the order ``FIELDS`` lists them; pairs whose value is ``None`` are
    skipped.
    """
    extracted = ((key, extract(dct, key)) for key, extract in FIELDS.items())
    yield from (pair for pair in extracted if pair[1] is not None)
def filter_func(comments):
    """Return a list with one dict per comment, built from ``get_fields``.

    The input comments are not modified; each output dict holds only the
    pairs that ``get_fields`` yields for the corresponding comment.
    """
    filtered = []
    for comment in comments:
        filtered.append(dict(get_fields(comment)))
    return filtered
def _format_timestamp(dct, name):
    """Return the Unix timestamp stored under *name* as a UTC ``YYYY-MM-DD`` string.

    Returns ``None`` when the field is missing or null, so the caller can
    leave the field out of the output.
    """
    timestamp = dct.get(name)
    if timestamp is None:
        return None
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC datetime formats to the same calendar date.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d")


def _is_edited(dct, name):
    """Return whether the comment's ``_time_text`` contains ``"(edited)"``.

    A missing or null ``_time_text`` counts as "not edited" rather than
    raising ``TypeError``. *name* is unused; it is accepted so the function
    has the same call signature as the other extractors.
    """
    return "(edited)" in (dct.get("_time_text") or "")


# Maps each output field name to an extractor called as ``fn(comment, name)``.
# ``dict.get`` copies the value through unchanged.
FIELDS = {
    "text": dict.get,
    "author": dict.get,
    "timestamp": _format_timestamp,
    "edited": _is_edited,
    "author_is_uploader": dict.get,
    "is_favorited": dict.get,
    # Add more fields here
    # Recurse into the replies; an empty (or null) reply list becomes None.
    "replies": lambda dct, name: filter_func(dct.get(name) or []) or None,
}
def nest_comments(comments, progress=None):
    """Arrange a flat collection of comment dicts into a reply tree.

    Comments are visited in ascending ``"timestamp"`` order (a missing or
    null timestamp sorts as ``0``). Each comment is appended to the
    ``"replies"`` list of the comment named by its ``"parent"`` key, or to
    the returned top-level list when the parent is ``"root"``. The comment
    dicts are modified in place, not copied.

    Args:
        comments: Iterable of dicts that each have an ``"id"`` and a
            ``"parent"`` key.
        progress: Optional callable invoked as ``progress(index, total)``
            before each comment is placed; ``index`` starts at 1 and
            ``total`` is the number of unique comment ids.

    Returns:
        The list of top-level comments.
    """
    by_id = {
        comment["id"]: comment
        for comment in sorted(comments, key=lambda c: c.get("timestamp") or 0)
    }
    total = len(by_id)
    roots = []
    for index, comment in enumerate(by_id.values(), 1):
        if progress is not None:
            progress(index, total)
        parent_id = comment["parent"]
        parent = None if parent_id == "root" else by_id.get(parent_id)
        if parent is None:
            # Either a genuine top-level comment, or a reply whose parent is
            # not in the data; the latter is kept at the top level instead of
            # aborting the whole run with a KeyError.
            roots.append(comment)
        else:
            parent.setdefault("replies", []).append(comment)
    return roots


def main(argv=None):
    """Read an info.json file and print its comments as nested JSON.

    Progress messages go to stderr; the resulting JSON goes to stdout.

    Args:
        argv: Command-line arguments to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, or when the input file has no
            ``"comments"`` entry.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "inputfile",
        metavar="FILE",
        help="File to read video metadata from (info.json)",
    )
    args = parser.parse_args(argv)

    eprint("Reading file")
    with open(args.inputfile, encoding="utf-8") as f:
        info_dict = json.load(f)

    comments = info_dict.get("comments")
    # Only the comments are needed; drop the rest of the metadata early.
    del info_dict
    if comments is None:
        sys.exit(f"error: {args.inputfile} contains no comments")

    def report_progress(i, count):
        # "\r" keeps the counter on a single terminal line.
        eprint(f"Processing comment {i}/{count}", end="\r")

    nested_comments = nest_comments(comments, progress=report_progress)
    del comments
    eprint("")
    nested_comments = filter_func(nested_comments)

    eprint("Converting to json")
    out = json.dumps(nested_comments, indent=4, ensure_ascii=False)
    del nested_comments

    eprint("Writing file")
    print(out)
    eprint("Done")


if __name__ == "__main__":
    main()
|