diff options
| -rw-r--r-- | crates/yt/Cargo.toml | 1 | ||||
| -rw-r--r-- | crates/yt/src/storage/db/video/comments/display.rs | 8 | ||||
| -rw-r--r-- | crates/yt/src/storage/db/video/comments/mod.rs | 259 | ||||
| -rw-r--r-- | crates/yt/src/storage/db/video/comments/raw.rs | 9 | ||||
| -rw-r--r-- | crates/yt/src/storage/db/video/comments/tests.rs | 37 |
5 files changed, 162 insertions, 152 deletions
diff --git a/crates/yt/Cargo.toml b/crates/yt/Cargo.toml index 71335d9..91d9204 100644 --- a/crates/yt/Cargo.toml +++ b/crates/yt/Cargo.toml @@ -35,7 +35,6 @@ futures = "0.3.32" libmpv2.workspace = true log.workspace = true notify = { version = "8.2.0", default-features = false } -regex = "1.12.3" serde.workspace = true serde_json.workspace = true shlex = "2.0.1" diff --git a/crates/yt/src/storage/db/video/comments/display.rs b/crates/yt/src/storage/db/video/comments/display.rs index c372603..d0c400d 100644 --- a/crates/yt/src/storage/db/video/comments/display.rs +++ b/crates/yt/src/storage/db/video/comments/display.rs @@ -33,7 +33,7 @@ impl Comments { color: bool, ) -> std::fmt::Result { let ident = &(0..ident_count).map(|_| " ").collect::<String>(); - let value = &comment.value; + let value = &comment.raw; f.write_str(ident)?; @@ -79,7 +79,7 @@ impl Comments { write!( f, " [{}]", - comment.value.like_count.bold().red().render(color) + comment.raw.like_count.bold().red().render(color) )?; f.write_str(":\n")?; @@ -102,7 +102,7 @@ impl Comments { f.write_str("\n")?; } else { let mut children = comment.replies.clone(); - children.sort_by(|a, b| a.value.timestamp.cmp(&b.value.timestamp)); + children.sort_by(|a, b| a.raw.timestamp.cmp(&b.raw.timestamp)); for child in children { format(&child, f, ident_count + 4, color)?; @@ -116,7 +116,7 @@ impl Comments { if !&self.inner.is_empty() { let mut children = self.inner.clone(); - children.sort_by(|a, b| b.value.like_count.cmp(&a.value.like_count)); + children.sort_by(|a, b| b.raw.like_count.cmp(&a.raw.like_count)); for child in children { format(&child, &mut f, 0, use_color)?; diff --git a/crates/yt/src/storage/db/video/comments/mod.rs b/crates/yt/src/storage/db/video/comments/mod.rs index 41a03be..b199346 100644 --- a/crates/yt/src/storage/db/video/comments/mod.rs +++ b/crates/yt/src/storage/db/video/comments/mod.rs @@ -8,11 +8,10 @@ // You should have received a copy of the License along with this program. // If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. -use std::mem; +use log::debug; +use url::Url; -use regex::{Captures, Regex}; - -use crate::storage::db::video::comments::raw::{Parent, RawComment}; +use crate::storage::db::video::comments::raw::{Id, RawComment}; pub(crate) mod display; pub(crate) mod raw; @@ -20,182 +19,168 @@ pub(crate) mod raw; #[cfg(test)] mod tests; -#[derive(Debug, Clone, PartialEq)] -pub(crate) struct Comment { - value: RawComment, - replies: Vec<Self>, -} - #[derive(Debug, Default, PartialEq)] pub(crate) struct Comments { inner: Vec<Comment>, } +#[derive(Debug, Clone, PartialEq)] +pub(crate) struct Comment { + raw: RawComment, + replies: Vec<Self>, +} + impl Comments { - pub(crate) fn from_raw(raw: Vec<RawComment>) -> Self { - let mut me = Self::default(); + pub(crate) fn from_raw(mut raw: Vec<RawComment>) -> Self { + let mut me = Self { inner: vec![] }; + + raw.iter_mut().enumerate().for_each(|(index, raw_comment)| { + raw_comment.original_order = index; + }); + + raw.sort_by_key(|raw| match &raw.parent { + raw::Parent::Root => 0, + raw::Parent::Id(id) => id.split('.').count(), + }); - // Apply the parent -> child mapping yt provides us with. for raw_comment in raw { - if let Parent::Id(id) = &raw_comment.parent { - me.insert(&(id.clone()), Comment::from(raw_comment)); - } else { - me.inner.push(Comment::from(raw_comment)); + match raw_comment.parent.clone() { + raw::Parent::Root => me.add_toplevel(raw_comment), + raw::Parent::Id(id) => { + let ids: Vec<_> = id.split('.').collect(); + me.add_reply(&ids, raw_comment); + } } } { // Sort the final comments chronologically. - // This ensures that replies are matched with the comment they actually replied to and - // not a later comment from the same author. - for comment in &mut me.inner { - comment - .replies - .sort_by_key(|comment| comment.value.timestamp); + // This reverses our sort we did before for ids. + me.sort_replies(); + } - for reply in &comment.replies { - assert!(reply.replies.is_empty()); - } + me + } + + fn sort_replies(&mut self) { + self.inner.sort_by_key(|comment| comment.raw.original_order); + + self.inner.iter_mut().for_each(Comment::sort_replies); + } + + fn add_toplevel(&mut self, comment: RawComment) { + self.inner.push(comment.into()); + } + + fn get_id(&mut self, id: &str) -> &mut Comment { + for comment in &mut self.inner { + if comment.raw.id.id == id { + return comment; } } - { - let find_reply_indicator = - Regex::new(r"\u{200b}?(@[^\t\s]+)\u{200b}?").expect("This is hardcoded"); + unreachable!("We cannot add a comment, that is a reply to an not-yet added one.") + } - // Try to re-construct the replies for the reply comments. - for comment in &mut me.inner { - let previous_replies = mem::take(&mut comment.replies); + fn add_reply(&mut self, ids: &[&str], mut raw_comment: RawComment) { + fn first_line(text: &str) -> &str { + let end = text + .chars() + .take_while(|ch| *ch != '\n' && *ch != '.') + .map(char::len_utf8) + .sum(); - let mut reply_tree = Comments::default(); + &text[..end] + } - for reply in previous_replies { - // We try to reconstruct the parent child relation ship by looking (naively) - // for a reply indicator. Currently, this is just the `@<some_name>`, as yt - // seems to insert that by default if you press `reply-to` in their clients. - // - // This follows these steps: - // - Does this reply have a “reply indicator”? - // - If yes, try to resolve the indicator. - // - If it is resolvable, add this reply to the [`Comment`] it resolved to. - // - If not, keep the comment as reply. + debug!("**Searching for parent id: `{}`", ids.join("-")); - if let Some(reply_indicator_matches) = - find_reply_indicator.captures(&reply.value.text.clone()) - { - // We found a reply indicator. - // First we traverse the current `reply_tree` in reversed order to find a - // match, than we check if the reply indicator matches the reply tree root - // and afterward we declare it unmatching and add it as toplevel. + let first = ids + .first() + .expect("We cannot have a comment reply, without also having it's parent id encoded"); + let mut reply = self.get_id(first); + debug!(" -> {}: `{}`", first, first_line(&reply.raw.text)); - let reply_target_author = reply_indicator_matches - .get(1) - .expect("This should also exist") - .as_str(); + for id in &ids[1..] { + debug!(" **Searching for id: `{id}`"); - if let Some(parent) = reply_tree.find_author_mut(reply_target_author) { - parent - .replies - .push(comment_from_reply(reply, &reply_indicator_matches)); - } else if comment.value.author == reply_target_author { - reply_tree - .add_toplevel(comment_from_reply(reply, &reply_indicator_matches)); - } else { - eprintln!( - "Failed to find a parent for ('{}') both directly \ - and via replies! The reply text was:\n'{}'\n", - reply_target_author, reply.value.text - ); - reply_tree.add_toplevel(reply); - } - } else { - // The comment text did not contain a reply indicator, so add it as - // toplevel. - reply_tree.add_toplevel(reply); - } - } + reply = reply.get_id(id); - comment.replies = reply_tree.inner; - } + debug!(" -> {}: `{}`", id, first_line(&reply.raw.text)); } - me - } - - fn add_toplevel(&mut self, value: Comment) { - self.inner.push(value); + raw_comment.text = raw_comment + .text + .trim() + .trim_start_matches(&reply.raw.author) + .trim() + .to_owned(); + reply.replies.push(raw_comment.into()); } +} - fn insert(&mut self, id: &str, value: Comment) { - let parent = self - .inner +impl Comment { + fn maybe_get_id(&mut self, id: &str) -> Option<&mut Self> { + self.replies .iter_mut() - .find(|c| c.value.id.id == id) - .expect("One of these should exist"); - - parent.replies.push(value); + .find(|comment| comment.raw.id.id == id) } - fn find_author_mut(&mut self, reply_target_author: &str) -> Option<&mut Comment> { - fn perform_check<'a>( - comment: &'a mut Comment, - reply_target_author: &str, - ) -> Option<&'a mut Comment> { - // TODO(@bpeetz): This is a workaround until rust has lexiographic lifetime support. <2025-07-18> - fn find_in_replies<'a>( - comment: &'a mut Comment, - reply_target_author: &str, - ) -> Option<&'a mut Comment> { - comment - .replies - .iter_mut() - .rev() - .find_map(|reply: &mut Comment| perform_check(reply, reply_target_author)) + fn get_id(&mut self, id: &str) -> &mut Self { + // TODO: This `if` is a work-around, until lexicographic lifetimes are added. <2026-05-26> + if self.maybe_get_id(id).is_none() { + macro_rules! from_last { + ($field:ident, $self:expr) => { + $self + .replies + .last() + .map_or(self.raw.$field, |last| last.raw.$field) + }; } - let comment_author_matches_target = comment.value.author == reply_target_author; - match find_in_replies(comment, reply_target_author) { - Some(_) => Some( - // PERFORMANCE(@bpeetz): We should not need to run this code twice. <2025-07-18> - find_in_replies(comment, reply_target_author) - .expect("We already had a Some result for this."), - ), - None if comment_author_matches_target => Some(comment), - None => None, - } - } + debug!( + "Failed to find an id for a reply (the parent id did not exist). Assuming deleted comment" + ); - for comment in self.inner.iter_mut().rev() { - if let Some(output) = perform_check(comment, reply_target_author) { - return Some(output); - } - } + self.replies.push(Comment { + raw: RawComment { + original_order: from_last!(original_order, self) + 1, + id: Id { id: id.to_owned() }, + text: "<Deleted comment>".to_owned(), + like_count: 0, + is_pinned: false, + author_id: "@ghost".to_owned(), + author: "@ghost".to_owned(), + author_is_verified: false, + author_thumbnail: Url::parse("https://example.org/@ghost").expect("hard-coded"), + parent: raw::Parent::Id(self.raw.id.id.clone()), + edited: false, + timestamp: from_last!(timestamp, self), + author_url: None, + author_is_uploader: false, + is_favorited: false, + }, + replies: vec![], + }); - None + self.replies.last_mut().expect("We just added it") + } else { + self.maybe_get_id(id).expect("It's some") + } } -} -fn comment_from_reply(reply: Comment, reply_indicator_matches: &Captures<'_>) -> Comment { - Comment::from(RawComment { - text: { - // Remove the `@<some_name>` for the comment text. - let full_match = reply_indicator_matches - .get(0) - .expect("This will always exist"); - let text = reply.value.text[0..full_match.start()].to_owned() - + &reply.value.text[full_match.end()..]; + fn sort_replies(&mut self) { + self.replies + .sort_by_key(|comment| comment.raw.original_order); - text.trim_matches(|c: char| c == '\u{200b}' || c == '\u{2060}' || c.is_whitespace()) - .to_owned() - }, - ..reply.value - }) + self.replies.iter_mut().for_each(Comment::sort_replies); + } } impl From<RawComment> for Comment { fn from(value: RawComment) -> Self { Self { - value, + raw: value, replies: vec![], } } diff --git a/crates/yt/src/storage/db/video/comments/raw.rs b/crates/yt/src/storage/db/video/comments/raw.rs index 3b7f40f..e27eedd 100644 --- a/crates/yt/src/storage/db/video/comments/raw.rs +++ b/crates/yt/src/storage/db/video/comments/raw.rs @@ -47,10 +47,15 @@ impl From<String> for Parent { #[derive(Debug, Deserialize, Clone, Eq, PartialEq, PartialOrd, Ord)] #[allow(clippy::struct_excessive_bools)] pub(crate) struct RawComment { + /// This field is used to encode the original order of the comments in the raw vector, returned + /// by yt-dlp. + #[serde(default = "zero")] + pub(crate) original_order: usize, + pub(crate) id: Id, pub(crate) text: String, #[serde(default = "zero")] - pub(crate) like_count: u32, + pub(crate) like_count: usize, pub(crate) is_pinned: bool, pub(crate) author_id: String, #[serde(default = "unknown")] @@ -71,7 +76,7 @@ pub(crate) struct RawComment { fn unknown() -> String { "<Unknown>".to_string() } -fn zero() -> u32 { +fn zero() -> usize { 0 } fn edited_from_time_text<'de, D>(d: D) -> Result<bool, D::Error> diff --git a/crates/yt/src/storage/db/video/comments/tests.rs b/crates/yt/src/storage/db/video/comments/tests.rs index 03e3597..8cb1a9a 100644 --- a/crates/yt/src/storage/db/video/comments/tests.rs +++ b/crates/yt/src/storage/db/video/comments/tests.rs @@ -38,6 +38,11 @@ macro_rules! mk_comments { ) )+ ) => {{ + use std::sync::atomic::{AtomicUsize, Ordering}; + + static INDEX_INPUT: AtomicUsize = AtomicUsize::new(0); + static INDEX_EXPECTED: AtomicUsize = AtomicUsize::new(0); + let (nested_input, _) = mk_comments!( $( $( @@ -49,7 +54,7 @@ macro_rules! mk_comments { let mut input: Vec<RawComment> = vec![ $( - mk_comments!(@to_raw input $name $comment $parent, $actual_parent) + mk_comments!(@to_raw input $name INDEX_INPUT.fetch_add(1, Ordering::Relaxed), $comment $parent, $actual_parent) ),+ ]; input.extend(nested_input); @@ -58,7 +63,7 @@ macro_rules! mk_comments { inner: vec![ $( Comment { - value: mk_comments!(@to_raw expected $name $comment $parent, $actual_parent), + raw: mk_comments!(@to_raw expected $name INDEX_EXPECTED.fetch_add(1, Ordering::Relaxed), $comment $parent, $actual_parent), replies: { let (_, nested_expected) = mk_comments!( $( @@ -86,6 +91,11 @@ macro_rules! mk_comments { ) )+ ) => {{ + use std::sync::atomic::{AtomicUsize, Ordering}; + + static INDEX_INPUT: AtomicUsize = AtomicUsize::new(0); + static INDEX_EXPECTED: AtomicUsize = AtomicUsize::new(0); + let (nested_input, _) = mk_comments!( $( $( @@ -97,7 +107,7 @@ macro_rules! mk_comments { let mut input: Vec<RawComment> = vec![ $( - mk_comments!(@to_raw input $name $comment) + mk_comments!(@to_raw input $name INDEX_INPUT.fetch_add(1, Ordering::Relaxed), $comment) ),+ ]; input.extend(nested_input); @@ -106,7 +116,7 @@ macro_rules! mk_comments { inner: vec![ $( Comment { - value: mk_comments!(@to_raw expected $name $comment), + raw: mk_comments!(@to_raw expected $name INDEX_EXPECTED.fetch_add(1, Ordering::Relaxed), $comment), replies: { let (_, nested_expected) = mk_comments!( $( @@ -125,19 +135,30 @@ macro_rules! mk_comments { (input, expected) }}; - (@mk_id $name:ident $comment:literal) => {{ + (@mk_id $name:ident $comment:literal $($parent:expr)?) => {{ use std::hash::{Hash, Hasher}; let input = format!("{}{}", stringify!($name), $comment); let mut digest = std::hash::DefaultHasher::new(); input.hash(&mut digest); - Id { id: digest.finish().to_string() } + + #[allow(unused_mut, unused_assignments)] + { + let mut parent_id = ".".to_owned(); + + $( + parent_id = format!("{}.", $parent.id); + )? + + Id { id: format!("{parent_id}{}", digest.finish().to_string()) } + } }}; - (@to_raw $state:ident $name:ident $comment:literal $($parent:expr, $actual_parent:ident)?) => { + (@to_raw $state:ident $name:ident $index:expr, $comment:literal $($parent:expr, $actual_parent:ident)?) => { RawComment { - id: mk_comments!(@mk_id $name $comment), + original_order: $index, + id: mk_comments!(@mk_id $name $comment $($parent)?), text: mk_comments!(@mk_text $state $comment $(, $actual_parent)?), like_count: 0, is_pinned: false, |
