refactor(crates/fmt): Init forked `uu_fmt` library

author: Benedikt Peetz <benedikt.peetz@b-peetz.de> 2025-02-16 18:06:15 +0100
committer: Benedikt Peetz <benedikt.peetz@b-peetz.de> 2025-02-16 18:08:10 +0100
commit: 7cfa6939deb5496a07313a2a34632da1a3fb1b89 (patch)
tree: 2eb43066ea6e8d14079be1da10f9c5856282e676 /crates/fmt/src/parasplit.rs
parent: chore(crates/termsize): Vendor (diff)
download: yt-7cfa6939deb5496a07313a2a34632da1a3fb1b89.zip
1 files changed, 629 insertions, 0 deletions
diff --git a/crates/fmt/src/parasplit.rs b/crates/fmt/src/parasplit.rs
new file mode 100644
index 0000000..d4723cb
--- /dev/null
+++ b/crates/fmt/src/parasplit.rs
@@ -0,0 +1,629 @@
+// yt - A fully featured command line YouTube client
+//
+// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
+// Copyright (C) 2025 uutils developers
+// SPDX-License-Identifier: MIT
+//
+// This file is part of Yt.
+//
+// You should have received a copy of the License along with this program.
+// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.
+
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+use std::iter::Peekable;
+use std::slice::Iter;
+use unicode_width::UnicodeWidthChar;
+
+use crate::FmtOptions;
+
+fn char_width(c: char) -> usize {
+    if (c as usize) < 0xA0 {
+        // if it is ASCII, call it exactly 1 wide (including control chars)
+        // calling control chars' widths 1 is consistent with OpenBSD fmt
+        1
+    } else {
+        // otherwise, get the unicode width
+        // note that we shouldn't actually get None here because only c < 0xA0
+        // can return None, but for safety and future-proofing we do it this way
+        UnicodeWidthChar::width(c).unwrap_or(1)
+    }
+}
+
+// lines with PSKIP, lacking PREFIX, or which are entirely blank are
+// NoFormatLines; otherwise, they are FormatLines
+#[derive(Debug)]
+pub(super) enum Line {
+    FormatLine(FileLine),
+    NoFormatLine(String, bool),
+}
+
+impl Line {
+    // when we know that it's a FormatLine, as in the ParagraphStream iterator
+    fn get_formatline(self) -> FileLine {
+        match self {
+            Self::FormatLine(fl) => fl,
+            Self::NoFormatLine(..) => panic!("Found NoFormatLine when expecting FormatLine"),
+        }
+    }
+
+    // when we know that it's a NoFormatLine, as in the ParagraphStream iterator
+    fn get_noformatline(self) -> (String, bool) {
+        match self {
+            Self::NoFormatLine(s, b) => (s, b),
+            Self::FormatLine(..) => panic!("Found FormatLine when expecting NoFormatLine"),
+        }
+    }
+}
+
+/// Each line's prefix has to be considered to know whether to merge it with
+/// the next line or not
+#[derive(Debug)]
+pub(super) struct FileLine {
+    line: String,
+    /// The end of the indent, always the start of the text
+    indent_end: usize,
+
+    /// The end of the PREFIX's indent, that is, the spaces before the prefix
+    prefix_indent_end: usize,
+
+    /// Display length of indent taking into account tabs
+    indent_len: usize,
+
+    /// PREFIX indent length taking into account tabs
+    prefix_len: usize,
+}
+
+/// Iterator that produces a stream of Lines from a file
+pub(super) struct FileLines<'a> {
+    opts: &'a FmtOptions,
+    lines: std::str::Lines<'a>,
+}
+
+impl FileLines<'_> {
+    fn new<'b>(opts: &'b FmtOptions, lines: std::str::Lines<'b>) -> FileLines<'b> {
+        FileLines { opts, lines }
+    }
+
+    /// returns true if this line should be formatted
+    fn match_prefix(&self, line: &str) -> (bool, usize) {
+        let Some(prefix) = &self.opts.prefix else {
+            return (true, 0);
+        };
+
+        FileLines::match_prefix_generic(prefix, line, self.opts.exact_prefix)
+    }
+
+    /// returns true if this line should be formatted
+    fn match_anti_prefix(&self, line: &str) -> bool {
+        let Some(anti_prefix) = &self.opts.anti_prefix else {
+            return true;
+        };
+
+        match FileLines::match_prefix_generic(anti_prefix, line, self.opts.exact_anti_prefix) {
+            (true, _) => false,
+            (_, _) => true,
+        }
+    }
+
+    fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, usize) {
+        if line.starts_with(pfx) {
+            return (true, 0);
+        }
+
+        if !exact {
+            // we do it this way rather than byte indexing to support unicode whitespace chars
+            for (i, char) in line.char_indices() {
+                if line[i..].starts_with(pfx) {
+                    return (true, i);
+                } else if !char.is_whitespace() {
+                    break;
+                }
+            }
+        }
+
+        (false, 0)
+    }
+
+    fn compute_indent(&self, string: &str, prefix_end: usize) -> (usize, usize, usize) {
+        let mut prefix_len = 0;
+        let mut indent_len = 0;
+        let mut indent_end = 0;
+        for (os, c) in string.char_indices() {
+            if os == prefix_end {
+                // we found the end of the prefix, so this is the printed length of the prefix here
+                prefix_len = indent_len;
+            }
+
+            if (os >= prefix_end) && !c.is_whitespace() {
+                // found first non-whitespace after prefix, this is indent_end
+                indent_end = os;
+                break;
+            } else if c == '\t' {
+                // compute tab length
+                indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
+            } else {
+                // non-tab character
+                indent_len += char_width(c);
+            }
+        }
+        (indent_end, prefix_len, indent_len)
+    }
+}
+
+impl Iterator for FileLines<'_> {
+    type Item = Line;
+
+    fn next(&mut self) -> Option<Line> {
+        let n = self.lines.next()?;
+
+        // if this line is entirely whitespace,
+        // emit a blank line
+        // Err(true) indicates that this was a linebreak,
+        // which is important to know when detecting mail headers
+        if n.chars().all(char::is_whitespace) {
+            return Some(Line::NoFormatLine(String::new(), true));
+        }
+
+        let (pmatch, poffset) = self.match_prefix(n);
+
+        // if this line does not match the prefix,
+        // emit the line unprocessed and iterate again
+        if !pmatch {
+            return Some(Line::NoFormatLine(n.to_owned(), false));
+        }
+
+        // if the line matches the prefix, but is blank after,
+        // don't allow lines to be combined through it (that is,
+        // treat it like a blank line, except that since it's
+        // not truly blank we will not allow mail headers on the
+        // following line)
+        if pmatch
+            && n[poffset + self.opts.prefix.as_ref().map_or(0, String::len)..]
+                .chars()
+                .all(char::is_whitespace)
+        {
+            return Some(Line::NoFormatLine(n.to_owned(), false));
+        }
+
+        // skip if this line matches the anti_prefix
+        // (NOTE definition of match_anti_prefix is TRUE if we should process)
+        if !self.match_anti_prefix(n) {
+            return Some(Line::NoFormatLine(n.to_owned(), false));
+        }
+
+        // figure out the indent, prefix, and prefixindent ending points
+        let prefix_end = poffset + self.opts.prefix.as_ref().map_or(0, String::len);
+        let (indent_end, prefix_len, indent_len) = self.compute_indent(n, prefix_end);
+
+        Some(Line::FormatLine(FileLine {
+            line: n.to_owned(),
+            indent_end,
+            prefix_indent_end: poffset,
+            indent_len,
+            prefix_len,
+        }))
+    }
+}
+
+/// A paragraph : a collection of [`FileLines`] that are to be formatted
+/// plus info about the paragraph's indentation
+///
+/// We only retain the String from the [`FileLine`]; the other info
+/// is only there to help us in deciding how to merge lines into Paragraphs
+#[derive(Debug)]
+pub(super) struct Paragraph {
+    /// the lines of the file
+    lines: Vec<String>,
+    /// string representing the init, that is, the first line's indent
+    pub init_str: String,
+    /// printable length of the init string considering TABWIDTH
+    pub init_len: usize,
+    /// byte location of end of init in first line String
+    init_end: usize,
+    /// string representing indent
+    pub indent_str: String,
+    /// length of above
+    pub indent_len: usize,
+    /// byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
+    indent_end: usize,
+    /// we need to know if this is a mail header because we do word splitting differently in that case
+    pub mail_header: bool,
+}
+
+/// An iterator producing a stream of paragraphs from a stream of lines
+/// given a set of options.
+pub(super) struct ParagraphStream<'a> {
+    lines: Peekable<FileLines<'a>>,
+    next_mail: bool,
+    opts: &'a FmtOptions,
+}
+
+impl ParagraphStream<'_> {
+    pub(super) fn new<'b>(opts: &'b FmtOptions, text: &'b str) -> ParagraphStream<'b> {
+        let lines = FileLines::new(opts, text.lines()).peekable();
+        // at the beginning of the file, we might find mail headers
+        ParagraphStream {
+            lines,
+            next_mail: true,
+            opts,
+        }
+    }
+
+    /// Detect RFC822 mail header
+    fn is_mail_header(line: &FileLine) -> bool {
+        // a mail header begins with either "From " (envelope sender line)
+        // or with a sequence of printable ASCII chars (33 to 126, inclusive,
+        // except colon) followed by a colon.
+        if line.indent_end > 0 {
+            false
+        } else {
+            let l_slice = &line.line[..];
+            if l_slice.starts_with("From ") {
+                true
+            } else {
+                let Some(colon_posn) = l_slice.find(':') else {
+                    return false;
+                };
+
+                // header field must be nonzero length
+                if colon_posn == 0 {
+                    return false;
+                }
+
+                l_slice[..colon_posn]
+                    .chars()
+                    .all(|x| !matches!(x as usize, y if !(33..=126).contains(&y)))
+            }
+        }
+    }
+}
+
+impl Iterator for ParagraphStream<'_> {
+    type Item = Result<Paragraph, String>;
+
+    #[allow(clippy::cognitive_complexity)]
+    fn next(&mut self) -> Option<Result<Paragraph, String>> {
+        // return a NoFormatLine in an Err; it should immediately be output
+        let noformat = match self.lines.peek()? {
+            Line::FormatLine(_) => false,
+            Line::NoFormatLine(_, _) => true,
+        };
+
+        // found a NoFormatLine, immediately dump it out
+        if noformat {
+            let (s, nm) = self.lines.next().unwrap().get_noformatline();
+            self.next_mail = nm;
+            return Some(Err(s));
+        }
+
+        // found a FormatLine, now build a paragraph
+        let mut init_str = String::new();
+        let mut init_end = 0;
+        let mut init_len = 0;
+        let mut indent_str = String::new();
+        let mut indent_end = 0;
+        let mut indent_len = 0;
+        let mut prefix_len = 0;
+        let mut prefix_indent_end = 0;
+        let mut p_lines = Vec::new();
+
+        let mut in_mail = false;
+        let mut second_done = false; // for when we use crown or tagged mode
+        loop {
+            // peek ahead
+            // need to explicitly force fl out of scope before we can call self.lines.next()
+            let Some(Line::FormatLine(fl)) = self.lines.peek() else {
+                break;
+            };
+
+            if p_lines.is_empty() {
+                // first time through the loop, get things set up
+                // detect mail header
+                if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
+                    in_mail = true;
+                    // there can't be any indent or prefixindent because otherwise is_mail_header
+                    // would fail since there cannot be any whitespace before the colon in a
+                    // valid header field
+                    indent_str.push_str("  ");
+                    indent_len = 2;
+                } else {
+                    if self.opts.crown_margin || self.opts.tagged_paragraph {
+                        init_str.push_str(&fl.line[..fl.indent_end]);
+                        init_len = fl.indent_len;
+                        init_end = fl.indent_end;
+                    } else {
+                        second_done = true;
+                    }
+
+                    // these will be overwritten in the 2nd line of crown or tagged mode, but
+                    // we are not guaranteed to get to the 2nd line, e.g., if the next line
+                    // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
+                    indent_str.push_str(&fl.line[..fl.indent_end]);
+                    indent_len = fl.indent_len;
+                    indent_end = fl.indent_end;
+
+                    // save these to check for matching lines
+                    prefix_len = fl.prefix_len;
+                    prefix_indent_end = fl.prefix_indent_end;
+
+                    // in tagged mode, add 4 spaces of additional indenting by default
+                    // (gnu fmt's behavior is different: it seems to find the closest column to
+                    // indent_end that is divisible by 3. But honestly that behavior seems
+                    // pretty arbitrary.
+                    // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.
+                    if self.opts.tagged_paragraph {
+                        indent_str.push_str("    ");
+                        indent_len += 4;
+                    }
+                }
+            } else if in_mail {
+                // lines following mail headers must begin with spaces
+                if fl.indent_end == 0 || (self.opts.prefix.is_some() && fl.prefix_indent_end == 0) {
+                    break; // this line does not begin with spaces
+                }
+            } else if !second_done {
+                // now we have enough info to handle crown margin and tagged mode
+
+                // in both crown and tagged modes we require that prefix_len is the same
+                if prefix_len != fl.prefix_len || prefix_indent_end != fl.prefix_indent_end {
+                    break;
+                }
+
+                // in tagged mode, indent has to be *different* on following lines
+                if self.opts.tagged_paragraph
+                    && indent_len - 4 == fl.indent_len
+                    && indent_end == fl.indent_end
+                {
+                    break;
+                }
+
+                // this is part of the same paragraph, get the indent info from this line
+                indent_str.clear();
+                indent_str.push_str(&fl.line[..fl.indent_end]);
+                indent_len = fl.indent_len;
+                indent_end = fl.indent_end;
+
+                second_done = true;
+            } else {
+                // detect mismatch
+                if indent_end != fl.indent_end
+                    || prefix_indent_end != fl.prefix_indent_end
+                    || indent_len != fl.indent_len
+                    || prefix_len != fl.prefix_len
+                {
+                    break;
+                }
+            }
+
+            p_lines.push(self.lines.next().unwrap().get_formatline().line);
+
+            // when we're in split-only mode, we never join lines, so stop here
+            if self.opts.split_only {
+                break;
+            }
+        }
+
+        // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
+        // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
+        // NoFormatLine.
+        self.next_mail = in_mail;
+
+        Some(Ok(Paragraph {
+            lines: p_lines,
+            init_str,
+            init_len,
+            init_end,
+            indent_str,
+            indent_len,
+            indent_end,
+            mail_header: in_mail,
+        }))
+    }
+}
+
+pub(super) struct ParaWords<'a> {
+    opts: &'a FmtOptions,
+    para: &'a Paragraph,
+    words: Vec<WordInfo<'a>>,
+}
+
+impl<'a> ParaWords<'a> {
+    pub(super) fn new(opts: &'a FmtOptions, para: &'a Paragraph) -> Self {
+        let mut pw = ParaWords {
+            opts,
+            para,
+            words: Vec::new(),
+        };
+        pw.create_words();
+        pw
+    }
+
+    fn create_words(&mut self) {
+        if self.para.mail_header {
+            // no extra spacing for mail headers; always exactly 1 space
+            // safe to trim_start on every line of a mail header, since the
+            // first line is guaranteed not to have any spaces
+            self.words.extend(
+                self.para
+                    .lines
+                    .iter()
+                    .flat_map(|x| x.split_whitespace())
+                    .map(|x| WordInfo {
+                        word: x,
+                        word_start: 0,
+                        word_nchars: x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped)
+                        before_tab: None,
+                        after_tab: 0,
+                        sentence_start: false,
+                        ends_punct: false,
+                        new_line: false,
+                    }),
+            );
+        } else {
+            // first line
+            self.words
+                .extend(if self.opts.crown_margin || self.opts.tagged_paragraph {
+                    // crown and tagged mode has the "init" in the first line, so slice from there
+                    WordSplit::new(self.opts, &self.para.lines[0][self.para.init_end..])
+                } else {
+                    // otherwise we slice from the indent
+                    WordSplit::new(self.opts, &self.para.lines[0][self.para.indent_end..])
+                });
+
+            if self.para.lines.len() > 1 {
+                let indent_end = self.para.indent_end;
+                let opts = self.opts;
+                self.words.extend(
+                    self.para
+                        .lines
+                        .iter()
+                        .skip(1)
+                        .flat_map(|x| WordSplit::new(opts, &x[indent_end..])),
+                );
+            }
+        }
+    }
+
+    pub(super) fn words(&'a self) -> Iter<'a, WordInfo<'a>> {
+        self.words.iter()
+    }
+}
+
+struct WordSplit<'a> {
+    opts: &'a FmtOptions,
+    string: &'a str,
+    length: usize,
+    position: usize,
+    prev_punct: bool,
+}
+
+impl WordSplit<'_> {
+    fn analyze_tabs(&self, string: &str) -> (Option<usize>, usize, Option<usize>) {
+        // given a string, determine (length before tab) and (printed length after first tab)
+        // if there are no tabs, beforetab = -1 and aftertab is the printed length
+        let mut beforetab = None;
+        let mut aftertab = 0;
+        let mut word_start = None;
+        for (os, c) in string.char_indices() {
+            if !c.is_whitespace() {
+                word_start = Some(os);
+                break;
+            } else if c == '\t' {
+                if beforetab.is_none() {
+                    beforetab = Some(aftertab);
+                    aftertab = 0;
+                } else {
+                    aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth;
+                }
+            } else {
+                aftertab += 1;
+            }
+        }
+        (beforetab, aftertab, word_start)
+    }
+}
+
+impl WordSplit<'_> {
+    fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
+        // wordsplits *must* start at a non-whitespace character
+        let trim_string = string.trim_start();
+        WordSplit {
+            opts,
+            string: trim_string,
+            length: string.len(),
+            position: 0,
+            prev_punct: false,
+        }
+    }
+
+    fn is_punctuation(c: char) -> bool {
+        matches!(c, '!' | '.' | '?')
+    }
+}
+
+pub(super) struct WordInfo<'a> {
+    pub word: &'a str,
+    pub word_start: usize,
+    pub word_nchars: usize,
+    pub before_tab: Option<usize>,
+    pub after_tab: usize,
+    pub sentence_start: bool,
+    pub ends_punct: bool,
+    pub new_line: bool,
+}
+
+// returns (&str, is_start_of_sentence)
+impl<'a> Iterator for WordSplit<'a> {
+    type Item = WordInfo<'a>;
+
+    fn next(&mut self) -> Option<WordInfo<'a>> {
+        if self.position >= self.length {
+            return None;
+        }
+
+        let old_position = self.position;
+        let new_line = old_position == 0;
+
+        // find the start of the next word, and record if we find a tab character
+        let (before_tab, after_tab, word_start) =
+            if let (b, a, Some(s)) = self.analyze_tabs(&self.string[old_position..]) {
+                (b, a, s + old_position)
+            } else {
+                self.position = self.length;
+                return None;
+            };
+
+        // find the beginning of the next whitespace
+        // note that this preserves the invariant that self.position
+        // points to whitespace character OR end of string
+        let mut word_nchars = 0;
+        self.position = match self.string[word_start..].find(|x: char| {
+            if x.is_whitespace() {
+                true
+            } else {
+                word_nchars += char_width(x);
+                false
+            }
+        }) {
+            None => self.length,
+            Some(s) => s + word_start,
+        };
+
+        let word_start_relative = word_start - old_position;
+        // if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence.
+        let is_start_of_sentence =
+            self.prev_punct && (before_tab.is_some() || word_start_relative > 1);
+
+        // now record whether this word ends in punctuation
+        self.prev_punct = match self.string[..self.position].chars().next_back() {
+            Some(ch) => WordSplit::is_punctuation(ch),
+            _ => panic!("fatal: expected word not to be empty"),
+        };
+
+        let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform {
+            (&self.string[word_start..self.position], 0, None, 0)
+        } else {
+            (
+                &self.string[old_position..self.position],
+                word_start_relative,
+                before_tab,
+                after_tab,
+            )
+        };
+
+        Some(WordInfo {
+            word,
+            word_start: word_start_relative,
+            word_nchars,
+            before_tab,
+            after_tab,
+            sentence_start: is_start_of_sentence,
+            ends_punct: self.prev_punct,
+            new_line,
+        })
+    }
+}
author	Benedikt Peetz <benedikt.peetz@b-peetz.de>	2025-02-16 18:06:15 +0100
committer	Benedikt Peetz <benedikt.peetz@b-peetz.de>	2025-02-16 18:08:10 +0100
commit	7cfa6939deb5496a07313a2a34632da1a3fb1b89 (patch)
tree	2eb43066ea6e8d14079be1da10f9c5856282e676 /crates/fmt/src/parasplit.rs
parent	chore(crates/termsize): Vendor (diff)
download	yt-7cfa6939deb5496a07313a2a34632da1a3fb1b89.zip