// yt - A fully featured command line YouTube client // // Copyright (C) 2025 Benedikt Peetz // Copyright (C) 2025 uutils developers // SPDX-License-Identifier: MIT // // This file is part of Yt. // // You should have received a copy of the License along with this program. // If not, see . // This file is part of the uutils coreutils package. // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. use std::iter::Peekable; use std::slice::Iter; use unicode_width::UnicodeWidthChar; use crate::FmtOptions; fn char_width(c: char) -> usize { if (c as usize) < 0xA0 { // if it is ASCII, call it exactly 1 wide (including control chars) // calling control chars' widths 1 is consistent with OpenBSD fmt 1 } else { // otherwise, get the unicode width // note that we shouldn't actually get None here because only c < 0xA0 // can return None, but for safety and future-proofing we do it this way UnicodeWidthChar::width(c).unwrap_or(1) } } // lines with PSKIP, lacking PREFIX, or which are entirely blank are // NoFormatLines; otherwise, they are FormatLines #[derive(Debug)] pub(super) enum Line { FormatLine(FileLine), NoFormatLine(String, bool), } impl Line { // when we know that it's a FormatLine, as in the ParagraphStream iterator fn get_formatline(self) -> FileLine { match self { Self::FormatLine(fl) => fl, Self::NoFormatLine(..) => panic!("Found NoFormatLine when expecting FormatLine"), } } // when we know that it's a NoFormatLine, as in the ParagraphStream iterator fn get_noformatline(self) -> (String, bool) { match self { Self::NoFormatLine(s, b) => (s, b), Self::FormatLine(..) => panic!("Found FormatLine when expecting NoFormatLine"), } } } /// Each line's prefix has to be considered to know whether to merge it with /// the next line or not #[derive(Debug)] pub(super) struct FileLine { line: String, /// The end of the indent, always the start of the text indent_end: usize, /// The end of the PREFIX's indent, that is, the spaces before the prefix prefix_indent_end: usize, /// Display length of indent taking into account tabs indent_len: usize, /// PREFIX indent length taking into account tabs prefix_len: usize, } /// Iterator that produces a stream of Lines from a file pub(super) struct FileLines<'a> { opts: &'a FmtOptions, lines: std::str::Lines<'a>, } impl FileLines<'_> { fn new<'b>(opts: &'b FmtOptions, lines: std::str::Lines<'b>) -> FileLines<'b> { FileLines { opts, lines } } /// returns true if this line should be formatted fn match_prefix(&self, line: &str) -> (bool, usize) { let Some(prefix) = &self.opts.prefix else { return (true, 0); }; FileLines::match_prefix_generic(prefix, line, self.opts.exact_prefix) } /// returns true if this line should be formatted fn match_anti_prefix(&self, line: &str) -> bool { let Some(anti_prefix) = &self.opts.anti_prefix else { return true; }; match FileLines::match_prefix_generic(anti_prefix, line, self.opts.exact_anti_prefix) { (true, _) => false, (_, _) => true, } } fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, usize) { if line.starts_with(pfx) { return (true, 0); } if !exact { // we do it this way rather than byte indexing to support unicode whitespace chars for (i, char) in line.char_indices() { if line[i..].starts_with(pfx) { return (true, i); } else if !char.is_whitespace() { break; } } } (false, 0) } fn compute_indent(&self, string: &str, prefix_end: usize) -> (usize, usize, usize) { let mut prefix_len = 0; let mut indent_len = 0; let mut indent_end = 0; for (os, c) in string.char_indices() { if os == prefix_end { // we found the end of the prefix, so this is the printed length of the prefix here prefix_len = indent_len; } if (os >= prefix_end) && !c.is_whitespace() { // found first non-whitespace after prefix, this is indent_end indent_end = os; break; } else if c == '\t' { // compute tab length indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; } else { // non-tab character indent_len += char_width(c); } } (indent_end, prefix_len, indent_len) } } impl Iterator for FileLines<'_> { type Item = Line; fn next(&mut self) -> Option { let n = self.lines.next()?; // if this line is entirely whitespace, // emit a blank line // Err(true) indicates that this was a linebreak, // which is important to know when detecting mail headers if n.chars().all(char::is_whitespace) { return Some(Line::NoFormatLine(String::new(), true)); } let (pmatch, poffset) = self.match_prefix(n); // if this line does not match the prefix, // emit the line unprocessed and iterate again if !pmatch { return Some(Line::NoFormatLine(n.to_owned(), false)); } // if the line matches the prefix, but is blank after, // don't allow lines to be combined through it (that is, // treat it like a blank line, except that since it's // not truly blank we will not allow mail headers on the // following line) if pmatch && n[poffset + self.opts.prefix.as_ref().map_or(0, String::len)..] .chars() .all(char::is_whitespace) { return Some(Line::NoFormatLine(n.to_owned(), false)); } // skip if this line matches the anti_prefix // (NOTE definition of match_anti_prefix is TRUE if we should process) if !self.match_anti_prefix(n) { return Some(Line::NoFormatLine(n.to_owned(), false)); } // figure out the indent, prefix, and prefixindent ending points let prefix_end = poffset + self.opts.prefix.as_ref().map_or(0, String::len); let (indent_end, prefix_len, indent_len) = self.compute_indent(n, prefix_end); Some(Line::FormatLine(FileLine { line: n.to_owned(), indent_end, prefix_indent_end: poffset, indent_len, prefix_len, })) } } /// A paragraph : a collection of [`FileLines`] that are to be formatted /// plus info about the paragraph's indentation /// /// We only retain the String from the [`FileLine`]; the other info /// is only there to help us in deciding how to merge lines into Paragraphs #[derive(Debug)] pub(super) struct Paragraph { /// the lines of the file lines: Vec, /// string representing the init, that is, the first line's indent pub init_str: String, /// printable length of the init string considering TABWIDTH pub init_len: usize, /// byte location of end of init in first line String init_end: usize, /// string representing indent pub indent_str: String, /// length of above pub indent_len: usize, /// byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward) indent_end: usize, /// we need to know if this is a mail header because we do word splitting differently in that case pub mail_header: bool, } /// An iterator producing a stream of paragraphs from a stream of lines /// given a set of options. pub(super) struct ParagraphStream<'a> { lines: Peekable>, next_mail: bool, opts: &'a FmtOptions, } impl ParagraphStream<'_> { pub(super) fn new<'b>(opts: &'b FmtOptions, text: &'b str) -> ParagraphStream<'b> { let lines = FileLines::new(opts, text.lines()).peekable(); // at the beginning of the file, we might find mail headers ParagraphStream { lines, next_mail: true, opts, } } /// Detect RFC822 mail header fn is_mail_header(line: &FileLine) -> bool { // a mail header begins with either "From " (envelope sender line) // or with a sequence of printable ASCII chars (33 to 126, inclusive, // except colon) followed by a colon. if line.indent_end > 0 { false } else { let l_slice = &line.line[..]; if l_slice.starts_with("From ") { true } else { let Some(colon_posn) = l_slice.find(':') else { return false; }; // header field must be nonzero length if colon_posn == 0 { return false; } l_slice[..colon_posn] .chars() .all(|x| !matches!(x as usize, y if !(33..=126).contains(&y))) } } } } impl Iterator for ParagraphStream<'_> { type Item = Result; #[allow(clippy::cognitive_complexity)] fn next(&mut self) -> Option> { // return a NoFormatLine in an Err; it should immediately be output let noformat = match self.lines.peek()? { Line::FormatLine(_) => false, Line::NoFormatLine(_, _) => true, }; // found a NoFormatLine, immediately dump it out if noformat { let (s, nm) = self.lines.next().unwrap().get_noformatline(); self.next_mail = nm; return Some(Err(s)); } // found a FormatLine, now build a paragraph let mut init_str = String::new(); let mut init_end = 0; let mut init_len = 0; let mut indent_str = String::new(); let mut indent_end = 0; let mut indent_len = 0; let mut prefix_len = 0; let mut prefix_indent_end = 0; let mut p_lines = Vec::new(); let mut in_mail = false; let mut second_done = false; // for when we use crown or tagged mode loop { // peek ahead // need to explicitly force fl out of scope before we can call self.lines.next() let Some(Line::FormatLine(fl)) = self.lines.peek() else { break; }; if p_lines.is_empty() { // first time through the loop, get things set up // detect mail header if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) { in_mail = true; // there can't be any indent or prefixindent because otherwise is_mail_header // would fail since there cannot be any whitespace before the colon in a // valid header field indent_str.push_str(" "); indent_len = 2; } else { if self.opts.crown_margin || self.opts.tagged_paragraph { init_str.push_str(&fl.line[..fl.indent_end]); init_len = fl.indent_len; init_end = fl.indent_end; } else { second_done = true; } // these will be overwritten in the 2nd line of crown or tagged mode, but // we are not guaranteed to get to the 2nd line, e.g., if the next line // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around indent_str.push_str(&fl.line[..fl.indent_end]); indent_len = fl.indent_len; indent_end = fl.indent_end; // save these to check for matching lines prefix_len = fl.prefix_len; prefix_indent_end = fl.prefix_indent_end; // in tagged mode, add 4 spaces of additional indenting by default // (gnu fmt's behavior is different: it seems to find the closest column to // indent_end that is divisible by 3. But honestly that behavior seems // pretty arbitrary. // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big. if self.opts.tagged_paragraph { indent_str.push_str(" "); indent_len += 4; } } } else if in_mail { // lines following mail headers must begin with spaces if fl.indent_end == 0 || (self.opts.prefix.is_some() && fl.prefix_indent_end == 0) { break; // this line does not begin with spaces } } else if !second_done { // now we have enough info to handle crown margin and tagged mode // in both crown and tagged modes we require that prefix_len is the same if prefix_len != fl.prefix_len || prefix_indent_end != fl.prefix_indent_end { break; } // in tagged mode, indent has to be *different* on following lines if self.opts.tagged_paragraph && indent_len - 4 == fl.indent_len && indent_end == fl.indent_end { break; } // this is part of the same paragraph, get the indent info from this line indent_str.clear(); indent_str.push_str(&fl.line[..fl.indent_end]); indent_len = fl.indent_len; indent_end = fl.indent_end; second_done = true; } else { // detect mismatch if indent_end != fl.indent_end || prefix_indent_end != fl.prefix_indent_end || indent_len != fl.indent_len || prefix_len != fl.prefix_len { break; } } p_lines.push(self.lines.next().unwrap().get_formatline().line); // when we're in split-only mode, we never join lines, so stop here if self.opts.split_only { break; } } // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot. // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank // NoFormatLine. self.next_mail = in_mail; Some(Ok(Paragraph { lines: p_lines, init_str, init_len, init_end, indent_str, indent_len, indent_end, mail_header: in_mail, })) } } pub(super) struct ParaWords<'a> { opts: &'a FmtOptions, para: &'a Paragraph, words: Vec>, } impl<'a> ParaWords<'a> { pub(super) fn new(opts: &'a FmtOptions, para: &'a Paragraph) -> Self { let mut pw = ParaWords { opts, para, words: Vec::new(), }; pw.create_words(); pw } fn create_words(&mut self) { if self.para.mail_header { // no extra spacing for mail headers; always exactly 1 space // safe to trim_start on every line of a mail header, since the // first line is guaranteed not to have any spaces self.words.extend( self.para .lines .iter() .flat_map(|x| x.split_whitespace()) .map(|x| WordInfo { word: x, word_start: 0, word_nchars: x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped) before_tab: None, after_tab: 0, sentence_start: false, ends_punct: false, new_line: false, }), ); } else { // first line self.words .extend(if self.opts.crown_margin || self.opts.tagged_paragraph { // crown and tagged mode has the "init" in the first line, so slice from there WordSplit::new(self.opts, &self.para.lines[0][self.para.init_end..]) } else { // otherwise we slice from the indent WordSplit::new(self.opts, &self.para.lines[0][self.para.indent_end..]) }); if self.para.lines.len() > 1 { let indent_end = self.para.indent_end; let opts = self.opts; self.words.extend( self.para .lines .iter() .skip(1) .flat_map(|x| WordSplit::new(opts, &x[indent_end..])), ); } } } pub(super) fn words(&'a self) -> Iter<'a, WordInfo<'a>> { self.words.iter() } } struct WordSplit<'a> { opts: &'a FmtOptions, string: &'a str, length: usize, position: usize, prev_punct: bool, } impl WordSplit<'_> { fn analyze_tabs(&self, string: &str) -> (Option, usize, Option) { // given a string, determine (length before tab) and (printed length after first tab) // if there are no tabs, beforetab = -1 and aftertab is the printed length let mut beforetab = None; let mut aftertab = 0; let mut word_start = None; for (os, c) in string.char_indices() { if !c.is_whitespace() { word_start = Some(os); break; } else if c == '\t' { if beforetab.is_none() { beforetab = Some(aftertab); aftertab = 0; } else { aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth; } } else { aftertab += 1; } } (beforetab, aftertab, word_start) } } impl WordSplit<'_> { fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> { // wordsplits *must* start at a non-whitespace character let trim_string = string.trim_start(); WordSplit { opts, string: trim_string, length: string.len(), position: 0, prev_punct: false, } } fn is_punctuation(c: char) -> bool { matches!(c, '!' | '.' | '?') } } pub(super) struct WordInfo<'a> { pub word: &'a str, pub word_start: usize, pub word_nchars: usize, pub before_tab: Option, pub after_tab: usize, pub sentence_start: bool, pub ends_punct: bool, pub new_line: bool, } // returns (&str, is_start_of_sentence) impl<'a> Iterator for WordSplit<'a> { type Item = WordInfo<'a>; fn next(&mut self) -> Option> { if self.position >= self.length { return None; } let old_position = self.position; let new_line = old_position == 0; // find the start of the next word, and record if we find a tab character let (before_tab, after_tab, word_start) = if let (b, a, Some(s)) = self.analyze_tabs(&self.string[old_position..]) { (b, a, s + old_position) } else { self.position = self.length; return None; }; // find the beginning of the next whitespace // note that this preserves the invariant that self.position // points to whitespace character OR end of string let mut word_nchars = 0; self.position = match self.string[word_start..].find(|x: char| { if x.is_whitespace() { true } else { word_nchars += char_width(x); false } }) { None => self.length, Some(s) => s + word_start, }; let word_start_relative = word_start - old_position; // if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence. let is_start_of_sentence = self.prev_punct && (before_tab.is_some() || word_start_relative > 1); // now record whether this word ends in punctuation self.prev_punct = match self.string[..self.position].chars().next_back() { Some(ch) => WordSplit::is_punctuation(ch), _ => panic!("fatal: expected word not to be empty"), }; let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform { (&self.string[word_start..self.position], 0, None, 0) } else { ( &self.string[old_position..self.position], word_start_relative, before_tab, after_tab, ) }; Some(WordInfo { word, word_start: word_start_relative, word_nchars, before_tab, after_tab, sentence_start: is_start_of_sentence, ends_punct: self.prev_punct, new_line, }) } }