hm/soispha/pkgs/scripts/specific/ytcc/nest_comments.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

#!/usr/bin/env python3

"""
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT
Copyright © 2021 pukkandan.ytdlp@gmail.com
Copyright © 2024 soispha@vhack.eu


* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote
* Change FIELDS according to your needs

The output file will be in the format:
[{
  'text': 'comment 1',
  ...
  'replies': [{
    'text': 'reply 1',
    ...
    'replies': [...],
  }, ...],
}, ...]
"""

import argparse
import json
import sys
from datetime import datetime, timezone

def eprint(*args, **kwargs):
    """Print to standard error; accepts the same arguments as print().

    Keeps status messages off stdout, which carries the JSON result.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)

def get_fields(dct):
    """Yield ``(name, value)`` pairs for every entry of FIELDS.

    Each extractor in FIELDS is called as ``fn(dct, name)``; entries whose
    extractor returns None are skipped, so they never reach the output.
    """
    for field_name, extractor in FIELDS.items():
        extracted = extractor(dct, field_name)
        if extracted is None:
            continue
        yield field_name, extracted


def filter_func(comments):
    """Return a list with one reduced dict per comment.

    Each dict is built from the pairs that get_fields() yields for that
    comment.
    """
    reduced = []
    for comment in comments:
        reduced.append(dict(get_fields(comment)))
    return reduced


def _format_timestamp(dct, name):
    """Return the Unix timestamp stored under *name* as a UTC ``YYYY-MM-DD`` string.

    Returns None when the value is missing, so the field is omitted from
    the output. A timestamp of 0 is a valid value and is formatted.
    """
    timestamp = dct.get(name)
    if timestamp is None:
        return None
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d")


def _is_edited(dct, name):
    """Return whether the comment's ``_time_text`` contains ``"(edited)"``.

    Returns None when ``_time_text`` is missing, so the field is omitted
    instead of the lookup failing on None.
    """
    time_text = dct.get("_time_text")
    if time_text is None:
        return None
    return "(edited)" in time_text


def _nested_replies(dct, name):
    """Return the filtered replies of a comment, or None when it has none.

    A missing or null value is treated as "no replies"; returning None
    keeps empty ``replies`` lists out of the output.
    """
    return filter_func(dct.get(name) or []) or None


# Maps each output field name to an extractor called as fn(comment, name).
# The insertion order here is the order of the keys in every output object;
# an extractor that returns None drops its field for that comment.
FIELDS = {
    "text": dict.get,
    "author": dict.get,
    "timestamp": _format_timestamp,
    "edited": _is_edited,
    "author_is_uploader": dict.get,
    "is_favorited": dict.get,
    # Add more fields here
    "replies": _nested_replies,
}


# Command-line interface: one positional argument naming the info.json file.
parser = argparse.ArgumentParser()
parser.add_argument(
    "inputfile",
    metavar="FILE",
    help="File to read video metadata from (info.json)",
)
args = parser.parse_args()


eprint("Reading file")
with open(args.inputfile, encoding="utf-8") as f:
    info_dict = json.load(f)

# Index the comments by id in ascending timestamp order (comments without a
# timestamp sort first). A missing or null "comments" entry is treated as an
# empty list, so the script then prints "[]".
comment_data = {
    c["id"]: c
    for c in sorted(
        info_dict.get("comments") or [],
        key=lambda c: c.get("timestamp") or 0,
    )
}
# Count the indexed comments so the progress total matches the loop below.
count = len(comment_data)
# Only the comments are needed from here on; release the rest of the metadata.
del info_dict

nested_comments = []
for i, c in enumerate(comment_data.values(), 1):
    eprint(f"Processing comment {i}/{count}", end="\r")
    if c["parent"] == "root":
        # Top-level comment.
        siblings = nested_comments
    else:
        # Reply: attach it to its parent's "replies" list, creating the list
        # on first use.
        siblings = comment_data[c["parent"]].setdefault("replies", [])
    siblings.append(c)


# Finish the progress line that was being rewritten with "\r".
eprint("")
nested_comments = filter_func(nested_comments)
# The filtered tree is made of new dicts; the raw comments can be released.
del comment_data


eprint("Converting to json")
out = json.dumps(nested_comments, indent=4, ensure_ascii=False)


del nested_comments
eprint("Writing file")
print(out)
eprint("Done")