aboutsummaryrefslogtreecommitdiffstats
path: root/crates
diff options
context:
space:
mode:
Diffstat (limited to 'crates')
-rw-r--r--crates/fmt/Cargo.toml30
-rw-r--r--crates/fmt/LICENSE18
-rw-r--r--crates/fmt/LICENSE.license10
-rw-r--r--crates/fmt/src/fmt.rs137
-rw-r--r--crates/fmt/src/linebreak.rs520
-rw-r--r--crates/fmt/src/parasplit.rs629
6 files changed, 1344 insertions, 0 deletions
diff --git a/crates/fmt/Cargo.toml b/crates/fmt/Cargo.toml
new file mode 100644
index 0000000..7f82a09
--- /dev/null
+++ b/crates/fmt/Cargo.toml
@@ -0,0 +1,30 @@
+# yt - A fully featured command line YouTube client
+#
+# Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
+# Copyright (C) 2025 uutils developers
+# SPDX-License-Identifier: MIT
+#
+# This file is part of Yt.
+#
+# You should have received a copy of the License along with this program.
+# If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.
+
+[package]
+name = "uu_fmt"
+authors = ["uutils developers", "Benedikt Peetz <benedikt.peetz@b-peetz.de>"]
+license = "MIT"
+description = "A fork of the uutils fmt tool. This fork is a library instead of a binary."
+version.workspace = true
+edition.workspace = true
+repository.workspace = true
+rust-version.workspace = true
+publish = false
+
+[lib]
+path = "src/fmt.rs"
+
+[dependencies]
+unicode-width = "0.2.0"
+
+[lints]
+workspace = true
diff --git a/crates/fmt/LICENSE b/crates/fmt/LICENSE
new file mode 100644
index 0000000..21bd444
--- /dev/null
+++ b/crates/fmt/LICENSE
@@ -0,0 +1,18 @@
+Copyright (c) uutils developers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/crates/fmt/LICENSE.license b/crates/fmt/LICENSE.license
new file mode 100644
index 0000000..6cee99d
--- /dev/null
+++ b/crates/fmt/LICENSE.license
@@ -0,0 +1,10 @@
+yt - A fully featured command line YouTube client
+
+Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
+Copyright (C) 2025 uutils developers
+SPDX-License-Identifier: MIT
+
+This file is part of Yt.
+
+You should have received a copy of the License along with this program.
+If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.
diff --git a/crates/fmt/src/fmt.rs b/crates/fmt/src/fmt.rs
new file mode 100644
index 0000000..3067bea
--- /dev/null
+++ b/crates/fmt/src/fmt.rs
@@ -0,0 +1,137 @@
+// yt - A fully featured command line YouTube client
+//
+// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
+// Copyright (C) 2025 uutils developers
+// SPDX-License-Identifier: MIT
+//
+// This file is part of Yt.
+//
+// You should have received a copy of the License along with this program.
+// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.
+
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+use std::fmt::Write;
+
+use linebreak::break_lines;
+use parasplit::ParagraphStream;
+
+mod linebreak;
+mod parasplit;
+
+#[derive(Debug)]
+#[allow(clippy::struct_excessive_bools)]
+pub struct FmtOptions {
+ /// First and second line of paragraph
+ /// may have different indentations, in which
+ /// case the first line's indentation is preserved,
+ /// and each subsequent line's indentation matches the second line.
+ pub crown_margin: bool,
+
+ /// Like the [`crown_margin`], except that the first and second line of a paragraph *must*
+ /// have different indentation or they are treated as separate paragraphs.
+ pub tagged_paragraph: bool,
+
+ /// Attempt to detect and preserve mail headers in the input.
+ /// Be careful when combining this with [`prefix`].
+ pub mail: bool,
+
+ /// Split lines only, do not reflow.
+ pub split_only: bool,
+
+ /// Insert exactly one space between words, and two between sentences.
+ /// Sentence breaks in the input are detected as [?!.] followed by two spaces or a newline;
+ /// other punctuation is not interpreted as a sentence break.
+ pub uniform: bool,
+
+ /// Reformat only lines beginning with PREFIX, reattaching PREFIX to reformatted lines.
+ /// Unless [`exact_prefix`] is specified, leading whitespace will be ignored when matching PREFIX.
+ pub prefix: Option<String>,
+
+ /// Do not reformat lines beginning with ``ANTI_PREFIX``.
+ /// Unless [`exact_anti_prefix`] is specified, leading whitespace will be ignored when matching ``ANTI_PREFIX``.
+ pub anti_prefix: Option<String>,
+
+ /// [`prefix`] must match at the beginning of the line with no preceding whitespace.
+ pub exact_prefix: bool,
+
+ /// [`anti_prefix`] must match at the beginning of the line with no preceding whitespace.
+ pub exact_anti_prefix: bool,
+
+ /// Fill output lines up to a maximum of WIDTH columns, default 75.
+ pub width: usize,
+
+ /// Goal width, default of 93% of WIDTH.
+ /// Must be less than or equal to WIDTH.
+ pub goal: usize,
+
+ /// Break lines more quickly at the expense of a potentially more ragged appearance.
+ pub quick: bool,
+
+ /// Treat tabs as TABWIDTH spaces for determining line length, default 8.
+ /// Note that this is used only for calculating line lengths; tabs are preserved in the output.
+ pub tabwidth: usize,
+}
+
+impl FmtOptions {
+ #[must_use]
+ #[allow(clippy::cast_sign_loss)]
+ #[allow(clippy::cast_possible_truncation)]
+ #[allow(clippy::cast_precision_loss)]
+ pub fn new(width: Option<usize>, goal: Option<usize>, tabwidth: Option<usize>) -> Self {
+ // by default, goal is 93% of width
+ const DEFAULT_GOAL_TO_WIDTH_RATIO: f64 = 0.93;
+ const DEFAULT_WIDTH: usize = 75;
+
+ FmtOptions {
+ crown_margin: false,
+ tagged_paragraph: false,
+ mail: false,
+ split_only: false,
+ uniform: false,
+ prefix: None,
+ anti_prefix: None,
+ exact_prefix: false,
+ exact_anti_prefix: false,
+ width: width.unwrap_or(DEFAULT_WIDTH),
+ goal: goal.unwrap_or(
+ ((width.unwrap_or(DEFAULT_WIDTH) as f64) * DEFAULT_GOAL_TO_WIDTH_RATIO).floor()
+ as usize,
+ ),
+ quick: false,
+ tabwidth: tabwidth.unwrap_or(8),
+ }
+ }
+}
+
+/// Process text and format it according to the provided options.
+///
+/// # Arguments
+///
+/// * `text` - The text to process.
+/// * `fmt_opts` - A reference to a [`FmtOptions`] structure containing the formatting options.
+///
+/// # Returns
+///
+/// The formatted [`String`].
+#[must_use]
+pub fn process_text(text: &str, fmt_opts: &FmtOptions) -> String {
+ let mut output = String::new();
+
+ let p_stream = ParagraphStream::new(fmt_opts, text);
+ for para_result in p_stream {
+ match para_result {
+ Err(s) => {
+ output.push_str(&s);
+ output.push('\n');
+ }
+ Ok(para) => write!(output, "{}", break_lines(&para, fmt_opts))
+ .expect("This is in-memory. It should not fail"),
+ }
+ }
+
+ output
+}
diff --git a/crates/fmt/src/linebreak.rs b/crates/fmt/src/linebreak.rs
new file mode 100644
index 0000000..b1dc6fa
--- /dev/null
+++ b/crates/fmt/src/linebreak.rs
@@ -0,0 +1,520 @@
+// yt - A fully featured command line YouTube client
+//
+// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
+// Copyright (C) 2025 uutils developers
+// SPDX-License-Identifier: MIT
+//
+// This file is part of Yt.
+//
+// You should have received a copy of the License along with this program.
+// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.
+
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+use std::fmt::Write;
+use std::{cmp, mem};
+
+use crate::FmtOptions;
+use crate::parasplit::{ParaWords, Paragraph, WordInfo};
+
+struct BreakArgs<'a> {
+ opts: &'a FmtOptions,
+ init_len: usize,
+ indent_str: &'a str,
+ indent_len: usize,
+ uniform: bool,
+ output: String,
+}
+
+impl BreakArgs<'_> {
+ fn compute_width(&self, winfo: &WordInfo<'_>, position_n: usize, fresh: bool) -> usize {
+ if fresh {
+ 0
+ } else {
+ let post = winfo.after_tab;
+ match winfo.before_tab {
+ None => post,
+ Some(pre) => {
+ post + ((pre + position_n) / self.opts.tabwidth + 1) * self.opts.tabwidth
+ - position_n
+ }
+ }
+ }
+ }
+}
+
+pub(super) fn break_lines(para: &Paragraph, opts: &FmtOptions) -> String {
+ let mut output = String::new();
+
+ // indent
+ let p_indent = &para.indent_str;
+ let p_indent_len = para.indent_len;
+
+ // words
+ let p_words = ParaWords::new(opts, para);
+ let mut p_words_words = p_words.words();
+
+ // the first word will *always* appear on the first line
+ // make sure of this here
+ let Some(winfo) = p_words_words.next() else {
+ return "\n".to_owned();
+ };
+
+ // print the init, if it exists, and get its length
+ let p_init_len = winfo.word_nchars
+ + if opts.crown_margin || opts.tagged_paragraph {
+ // handle "init" portion
+ output.push_str(&para.init_str);
+ para.init_len
+ } else if !para.mail_header {
+ // for non-(crown, tagged) that's the same as a normal indent
+ output.push_str(p_indent);
+ p_indent_len
+ } else {
+ // except that mail headers get no indent at all
+ 0
+ };
+
+ // write first word after writing init
+ write!(output, "{}", winfo.word).expect("Works");
+
+ // does this paragraph require uniform spacing?
+ let uniform = para.mail_header || opts.uniform;
+
+ let mut break_args = BreakArgs {
+ opts,
+ init_len: p_init_len,
+ indent_str: p_indent,
+ indent_len: p_indent_len,
+ uniform,
+ output,
+ };
+
+ if opts.quick || para.mail_header {
+ break_simple(p_words_words, &mut break_args);
+ } else {
+ break_knuth_plass(p_words_words, &mut break_args);
+ };
+
+ break_args.output
+}
+
+// break_simple implements a "greedy" breaking algorithm: print words until
+// maxlength would be exceeded, then print a linebreak and indent and continue.
+fn break_simple<'a, T: Iterator<Item = &'a WordInfo<'a>>>(iter: T, args: &mut BreakArgs<'a>) {
+ iter.fold((args.init_len, false), |(l, prev_punct), winfo| {
+ accum_words_simple(args, l, prev_punct, winfo)
+ });
+ args.output.push('\n');
+}
+
+fn accum_words_simple<'a>(
+ args: &mut BreakArgs<'a>,
+ l: usize,
+ prev_punct: bool,
+ winfo: &'a WordInfo<'a>,
+) -> (usize, bool) {
+ // compute the length of this word, considering how tabs will expand at this position on the line
+ let wlen = winfo.word_nchars + args.compute_width(winfo, l, false);
+
+ let slen = compute_slen(
+ args.uniform,
+ winfo.new_line,
+ winfo.sentence_start,
+ prev_punct,
+ );
+
+ if l + wlen + slen > args.opts.width {
+ write_newline(args.indent_str, &mut args.output);
+ write_with_spaces(&winfo.word[winfo.word_start..], 0, &mut args.output);
+ (args.indent_len + winfo.word_nchars, winfo.ends_punct)
+ } else {
+ write_with_spaces(winfo.word, slen, &mut args.output);
+ (l + wlen + slen, winfo.ends_punct)
+ }
+}
+
+// break_knuth_plass implements an "optimal" breaking algorithm in the style of
+// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software,
+// Practice and Experience. Vol. 11, No. 11, November 1981.
+// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf
+#[allow(trivial_casts)]
+fn break_knuth_plass<'a, T: Clone + Iterator<Item = &'a WordInfo<'a>>>(
+ mut iter: T,
+ args: &mut BreakArgs<'a>,
+) {
+ // run the algorithm to get the breakpoints
+ let breakpoints = find_kp_breakpoints(iter.clone(), args);
+
+ // iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it
+ let result: (bool, bool) = breakpoints.iter().rev().fold(
+ (false, false),
+ |(mut prev_punct, mut fresh), &(next_break, break_before)| {
+ if fresh {
+ write_newline(args.indent_str, &mut args.output);
+ }
+ // at each breakpoint, keep emitting words until we find the word matching this breakpoint
+ for winfo in &mut iter {
+ let (slen, word) = slice_if_fresh(
+ fresh,
+ winfo.word,
+ winfo.word_start,
+ args.uniform,
+ winfo.new_line,
+ winfo.sentence_start,
+ prev_punct,
+ );
+ fresh = false;
+ prev_punct = winfo.ends_punct;
+
+ // We find identical breakpoints here by comparing addresses of the references.
+ // This is OK because the backing vector is not mutating once we are linebreaking.
+ let winfo_ptr = winfo as *const _;
+ let next_break_ptr = next_break as *const _;
+ if winfo_ptr == next_break_ptr {
+ // OK, we found the matching word
+ if break_before {
+ write_newline(args.indent_str, &mut args.output);
+ write_with_spaces(&winfo.word[winfo.word_start..], 0, &mut args.output);
+ } else {
+ // breaking after this word, so that means "fresh" is true for the next iteration
+ write_with_spaces(word, slen, &mut args.output);
+ fresh = true;
+ }
+ break;
+ }
+ write_with_spaces(word, slen, &mut args.output);
+ }
+ (prev_punct, fresh)
+ },
+ );
+ let (mut prev_punct, mut fresh) = result;
+
+ // after the last linebreak, write out the rest of the final line.
+ for winfo in iter {
+ if fresh {
+ write_newline(args.indent_str, &mut args.output);
+ }
+ let (slen, word) = slice_if_fresh(
+ fresh,
+ winfo.word,
+ winfo.word_start,
+ args.uniform,
+ winfo.new_line,
+ winfo.sentence_start,
+ prev_punct,
+ );
+ prev_punct = winfo.ends_punct;
+ fresh = false;
+ write_with_spaces(word, slen, &mut args.output);
+ }
+
+ args.output.push('\n');
+}
+
+struct LineBreak<'a> {
+ prev: usize,
+ linebreak: Option<&'a WordInfo<'a>>,
+ break_before: bool,
+ demerits: i64,
+ prev_rat: f32,
+ length: usize,
+ fresh: bool,
+}
+
+#[allow(clippy::cognitive_complexity)]
+#[allow(clippy::cast_possible_wrap)]
+fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>(
+ iter: T,
+ args: &BreakArgs<'a>,
+) -> Vec<(&'a WordInfo<'a>, bool)> {
+ let mut iter = iter.peekable();
+ // set up the initial null linebreak
+ let mut linebreaks = vec![LineBreak {
+ prev: 0,
+ linebreak: None,
+ break_before: false,
+ demerits: 0,
+ prev_rat: 0.0,
+ length: args.init_len,
+ fresh: false,
+ }];
+ // this vec holds the current active linebreaks; next_ holds the breaks that will be active for
+ // the next word
+ let mut active_breaks = vec![0];
+ let mut next_active_breaks = vec![];
+
+ let stretch = args.opts.width - args.opts.goal;
+ let minlength = args.opts.goal.max(stretch + 1) - stretch;
+ let mut new_linebreaks = vec![];
+ let mut is_sentence_start = false;
+ let mut least_demerits = 0;
+ loop {
+ let Some(w) = iter.next() else { break };
+
+ // if this is the last word, we don't add additional demerits for this break
+ let (is_last_word, is_sentence_end) = match iter.peek() {
+ None => (true, true),
+ Some(&&WordInfo {
+ sentence_start: st,
+ new_line: nl,
+ ..
+ }) => (false, st || (nl && w.ends_punct)),
+ };
+
+ // should we be adding extra space at the beginning of the next sentence?
+ let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false);
+
+ let mut ld_new = i64::MAX;
+ let mut ld_next = i64::MAX;
+ let mut ld_idx = 0;
+ new_linebreaks.clear();
+ next_active_breaks.clear();
+ // go through each active break, extending it and possibly adding a new active
+ // break if we are above the minimum required length
+ #[allow(clippy::explicit_iter_loop)]
+ for &i in active_breaks.iter() {
+ let active = &mut linebreaks[i];
+ // normalize demerits to avoid overflow, and record if this is the least
+ active.demerits -= least_demerits;
+ if active.demerits < ld_next {
+ ld_next = active.demerits;
+ ld_idx = i;
+ }
+
+ // get the new length
+ let tlen = w.word_nchars
+ + args.compute_width(w, active.length, active.fresh)
+ + slen
+ + active.length;
+
+ // if tlen is longer than args.opts.width, we drop this break from the active list
+ // otherwise, we extend the break, and possibly add a new break at this point
+ if tlen <= args.opts.width {
+ // this break will still be active next time
+ next_active_breaks.push(i);
+ // we can put this word on this line
+ active.fresh = false;
+ active.length = tlen;
+
+ // if we're above the minlength, we can also consider breaking here
+ if tlen >= minlength {
+ let (new_demerits, new_ratio) = if is_last_word {
+ // there is no penalty for the final line's length
+ (0, 0.0)
+ } else {
+ compute_demerits(
+ args.opts.goal as isize - tlen as isize,
+ stretch,
+ w.word_nchars,
+ active.prev_rat,
+ )
+ };
+
+ // do not even consider adding a line that has too many demerits
+ // also, try to detect overflow by checking signum
+ let total_demerits = new_demerits + active.demerits;
+ if new_demerits < BAD_INFTY_SQ
+ && total_demerits < ld_new
+ && active.demerits.signum() <= new_demerits.signum()
+ {
+ ld_new = total_demerits;
+ new_linebreaks.push(LineBreak {
+ prev: i,
+ linebreak: Some(w),
+ break_before: false,
+ demerits: total_demerits,
+ prev_rat: new_ratio,
+ length: args.indent_len,
+ fresh: true,
+ });
+ }
+ }
+ }
+ }
+
+ // if we generated any new linebreaks, add the last one to the list
+ // the last one is always the best because we don't add to new_linebreaks unless
+ // it's better than the best one so far
+ match new_linebreaks.pop() {
+ None => (),
+ Some(lb) => {
+ next_active_breaks.push(linebreaks.len());
+ linebreaks.push(lb);
+ }
+ }
+
+ if next_active_breaks.is_empty() {
+ // every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx
+ let new_break =
+ restart_active_breaks(args, &linebreaks[ld_idx], ld_idx, w, slen, minlength);
+ next_active_breaks.push(linebreaks.len());
+ linebreaks.push(new_break);
+ least_demerits = 0;
+ } else {
+ // next time around, normalize out the demerits fields
+ // on active linebreaks to make overflow less likely
+ least_demerits = cmp::max(ld_next, 0);
+ }
+ // swap in new list of active breaks
+ mem::swap(&mut active_breaks, &mut next_active_breaks);
+ // If this was the last word in a sentence, the next one must be the first in the next.
+ is_sentence_start = is_sentence_end;
+ }
+
+ // return the best path
+ build_best_path(&linebreaks, &active_breaks)
+}
+
+fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a WordInfo<'a>, bool)> {
+ // of the active paths, we select the one with the fewest demerits
+ active
+ .iter()
+ .min_by_key(|&&a| paths[a].demerits)
+ .map(|&(mut best_idx)| {
+ let mut breakwords = vec![];
+ // now, chase the pointers back through the break list, recording
+ // the words at which we should break
+ loop {
+ let next_best = &paths[best_idx];
+ match next_best.linebreak {
+ None => return breakwords,
+ Some(prev) => {
+ breakwords.push((prev, next_best.break_before));
+ best_idx = next_best.prev;
+ }
+ }
+ }
+ })
+ .unwrap_or_default()
+}
+
+// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed
+const BAD_INFTY: i64 = 10_000_000;
+const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
+// badness = BAD_MULT * abs(r) ^ 3
+const BAD_MULT: f32 = 100.0;
+// DR_MULT is multiplier for delta-R between lines
+const DR_MULT: f32 = 600.0;
+// DL_MULT is penalty multiplier for short words at end of line
+const DL_MULT: f32 = 300.0;
+
+#[allow(clippy::cast_precision_loss)]
+#[allow(clippy::cast_possible_truncation)]
+fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
+ // how much stretch are we using?
+ let ratio = if delta_len == 0 {
+ 0.0f32
+ } else {
+ delta_len as f32 / stretch as f32
+ };
+
+ // compute badness given the stretch ratio
+ let bad_linelen = if ratio.abs() > 1.0f32 {
+ BAD_INFTY
+ } else {
+ (BAD_MULT * ratio.powi(3).abs()) as i64
+ };
+
+ // we penalize lines ending in really short words
+ let bad_wordlen = if wlen >= stretch {
+ 0
+ } else {
+ (DL_MULT
+ * ((stretch - wlen) as f32 / (stretch - 1) as f32)
+ .powi(3)
+ .abs()) as i64
+ };
+
+ // we penalize lines that have very different ratios from previous lines
+ let bad_delta_r = (DR_MULT * ((ratio - prev_rat) / 2.0).powi(3).abs()) as i64;
+
+ let demerits = i64::pow(1 + bad_linelen + bad_wordlen + bad_delta_r, 2);
+
+ (demerits, ratio)
+}
+
+#[allow(clippy::cast_possible_wrap)]
+fn restart_active_breaks<'a>(
+ args: &BreakArgs<'a>,
+ active: &LineBreak<'a>,
+ act_idx: usize,
+ w: &'a WordInfo<'a>,
+ slen: usize,
+ min: usize,
+) -> LineBreak<'a> {
+ let (break_before, line_length) = if active.fresh {
+ // never break before a word if that word would be the first on a line
+ (false, args.indent_len)
+ } else {
+ // choose the lesser evil: breaking too early, or breaking too late
+ let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh);
+ let underlen = min as isize - active.length as isize;
+ let overlen = (wlen + slen + active.length) as isize - args.opts.width as isize;
+ if overlen > underlen {
+ // break early, put this word on the next line
+ (true, args.indent_len + w.word_nchars)
+ } else {
+ (false, args.indent_len)
+ }
+ };
+
+ // restart the linebreak. This will be our only active path.
+ LineBreak {
+ prev: act_idx,
+ linebreak: Some(w),
+ break_before,
+ demerits: 0, // this is the only active break, so we can reset the demerit count
+ prev_rat: if break_before { 1.0 } else { -1.0 },
+ length: line_length,
+ fresh: !break_before,
+ }
+}
+
+// Number of spaces to add before a word, based on mode, newline, sentence start.
+#[allow(clippy::fn_params_excessive_bools)]
+fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> usize {
+ if uniform || newline {
+ if start || (newline && punct) { 2 } else { 1 }
+ } else {
+ 0
+ }
+}
+
+// If we're on a fresh line, slen=0 and we slice off leading whitespace.
+// Otherwise, compute slen and leave whitespace alone.
+#[allow(clippy::fn_params_excessive_bools)]
+fn slice_if_fresh(
+ fresh: bool,
+ word: &str,
+ start: usize,
+ uniform: bool,
+ newline: bool,
+ second_start: bool,
+ punct: bool,
+) -> (usize, &str) {
+ if fresh {
+ (0, &word[start..])
+ } else {
+ (compute_slen(uniform, newline, second_start, punct), word)
+ }
+}
+
+// Write a newline and add the indent.
+fn write_newline(indent: &str, output: &mut String) {
+ output.push('\n');
+ output.push_str(indent);
+}
+
+// Write the word, along with slen spaces.
+fn write_with_spaces(word: &str, slen: usize, output: &mut String) {
+ if slen == 2 {
+ output.push_str(" ");
+ } else if slen == 1 {
+ output.push(' ');
+ }
+ output.push_str(word);
+}
diff --git a/crates/fmt/src/parasplit.rs b/crates/fmt/src/parasplit.rs
new file mode 100644
index 0000000..d4723cb
--- /dev/null
+++ b/crates/fmt/src/parasplit.rs
@@ -0,0 +1,629 @@
+// yt - A fully featured command line YouTube client
+//
+// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
+// Copyright (C) 2025 uutils developers
+// SPDX-License-Identifier: MIT
+//
+// This file is part of Yt.
+//
+// You should have received a copy of the License along with this program.
+// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.
+
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+use std::iter::Peekable;
+use std::slice::Iter;
+use unicode_width::UnicodeWidthChar;
+
+use crate::FmtOptions;
+
+fn char_width(c: char) -> usize {
+ if (c as usize) < 0xA0 {
+ // if it is ASCII, call it exactly 1 wide (including control chars)
+ // calling control chars' widths 1 is consistent with OpenBSD fmt
+ 1
+ } else {
+ // otherwise, get the unicode width
+ // note that we shouldn't actually get None here because only c < 0xA0
+ // can return None, but for safety and future-proofing we do it this way
+ UnicodeWidthChar::width(c).unwrap_or(1)
+ }
+}
+
+// lines with PSKIP, lacking PREFIX, or which are entirely blank are
+// NoFormatLines; otherwise, they are FormatLines
+#[derive(Debug)]
+pub(super) enum Line {
+ FormatLine(FileLine),
+ NoFormatLine(String, bool),
+}
+
+impl Line {
+ // when we know that it's a FormatLine, as in the ParagraphStream iterator
+ fn get_formatline(self) -> FileLine {
+ match self {
+ Self::FormatLine(fl) => fl,
+ Self::NoFormatLine(..) => panic!("Found NoFormatLine when expecting FormatLine"),
+ }
+ }
+
+ // when we know that it's a NoFormatLine, as in the ParagraphStream iterator
+ fn get_noformatline(self) -> (String, bool) {
+ match self {
+ Self::NoFormatLine(s, b) => (s, b),
+ Self::FormatLine(..) => panic!("Found FormatLine when expecting NoFormatLine"),
+ }
+ }
+}
+
+/// Each line's prefix has to be considered to know whether to merge it with
+/// the next line or not
+#[derive(Debug)]
+pub(super) struct FileLine {
+ line: String,
+ /// The end of the indent, always the start of the text
+ indent_end: usize,
+
+ /// The end of the PREFIX's indent, that is, the spaces before the prefix
+ prefix_indent_end: usize,
+
+ /// Display length of indent taking into account tabs
+ indent_len: usize,
+
+ /// PREFIX indent length taking into account tabs
+ prefix_len: usize,
+}
+
+/// Iterator that produces a stream of Lines from a file
+pub(super) struct FileLines<'a> {
+ opts: &'a FmtOptions,
+ lines: std::str::Lines<'a>,
+}
+
+impl FileLines<'_> {
+ fn new<'b>(opts: &'b FmtOptions, lines: std::str::Lines<'b>) -> FileLines<'b> {
+ FileLines { opts, lines }
+ }
+
+ /// returns true if this line should be formatted
+ fn match_prefix(&self, line: &str) -> (bool, usize) {
+ let Some(prefix) = &self.opts.prefix else {
+ return (true, 0);
+ };
+
+ FileLines::match_prefix_generic(prefix, line, self.opts.exact_prefix)
+ }
+
+ /// returns true if this line should be formatted
+ fn match_anti_prefix(&self, line: &str) -> bool {
+ let Some(anti_prefix) = &self.opts.anti_prefix else {
+ return true;
+ };
+
+ match FileLines::match_prefix_generic(anti_prefix, line, self.opts.exact_anti_prefix) {
+ (true, _) => false,
+ (_, _) => true,
+ }
+ }
+
+ fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, usize) {
+ if line.starts_with(pfx) {
+ return (true, 0);
+ }
+
+ if !exact {
+ // we do it this way rather than byte indexing to support unicode whitespace chars
+ for (i, char) in line.char_indices() {
+ if line[i..].starts_with(pfx) {
+ return (true, i);
+ } else if !char.is_whitespace() {
+ break;
+ }
+ }
+ }
+
+ (false, 0)
+ }
+
+ fn compute_indent(&self, string: &str, prefix_end: usize) -> (usize, usize, usize) {
+ let mut prefix_len = 0;
+ let mut indent_len = 0;
+ let mut indent_end = 0;
+ for (os, c) in string.char_indices() {
+ if os == prefix_end {
+ // we found the end of the prefix, so this is the printed length of the prefix here
+ prefix_len = indent_len;
+ }
+
+ if (os >= prefix_end) && !c.is_whitespace() {
+ // found first non-whitespace after prefix, this is indent_end
+ indent_end = os;
+ break;
+ } else if c == '\t' {
+ // compute tab length
+ indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
+ } else {
+ // non-tab character
+ indent_len += char_width(c);
+ }
+ }
+ (indent_end, prefix_len, indent_len)
+ }
+}
+
+impl Iterator for FileLines<'_> {
+ type Item = Line;
+
+ fn next(&mut self) -> Option<Line> {
+ let n = self.lines.next()?;
+
+ // if this line is entirely whitespace,
+ // emit a blank line
+ // Err(true) indicates that this was a linebreak,
+ // which is important to know when detecting mail headers
+ if n.chars().all(char::is_whitespace) {
+ return Some(Line::NoFormatLine(String::new(), true));
+ }
+
+ let (pmatch, poffset) = self.match_prefix(n);
+
+ // if this line does not match the prefix,
+ // emit the line unprocessed and iterate again
+ if !pmatch {
+ return Some(Line::NoFormatLine(n.to_owned(), false));
+ }
+
+ // if the line matches the prefix, but is blank after,
+ // don't allow lines to be combined through it (that is,
+ // treat it like a blank line, except that since it's
+ // not truly blank we will not allow mail headers on the
+ // following line)
+ if pmatch
+ && n[poffset + self.opts.prefix.as_ref().map_or(0, String::len)..]
+ .chars()
+ .all(char::is_whitespace)
+ {
+ return Some(Line::NoFormatLine(n.to_owned(), false));
+ }
+
+ // skip if this line matches the anti_prefix
+ // (NOTE definition of match_anti_prefix is TRUE if we should process)
+ if !self.match_anti_prefix(n) {
+ return Some(Line::NoFormatLine(n.to_owned(), false));
+ }
+
+ // figure out the indent, prefix, and prefixindent ending points
+ let prefix_end = poffset + self.opts.prefix.as_ref().map_or(0, String::len);
+ let (indent_end, prefix_len, indent_len) = self.compute_indent(n, prefix_end);
+
+ Some(Line::FormatLine(FileLine {
+ line: n.to_owned(),
+ indent_end,
+ prefix_indent_end: poffset,
+ indent_len,
+ prefix_len,
+ }))
+ }
+}
+
+/// A paragraph : a collection of [`FileLines`] that are to be formatted
+/// plus info about the paragraph's indentation
+///
+/// We only retain the String from the [`FileLine`]; the other info
+/// is only there to help us in deciding how to merge lines into Paragraphs
+#[derive(Debug)]
+pub(super) struct Paragraph {
+ /// the lines of the file
+ lines: Vec<String>,
+ /// string representing the init, that is, the first line's indent
+ pub init_str: String,
+ /// printable length of the init string considering TABWIDTH
+ pub init_len: usize,
+ /// byte location of end of init in first line String
+ init_end: usize,
+ /// string representing indent
+ pub indent_str: String,
+ /// length of above
+ pub indent_len: usize,
+ /// byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
+ indent_end: usize,
+ /// we need to know if this is a mail header because we do word splitting differently in that case
+ pub mail_header: bool,
+}
+
+/// An iterator producing a stream of paragraphs from a stream of lines
+/// given a set of options.
+pub(super) struct ParagraphStream<'a> {
+ lines: Peekable<FileLines<'a>>,
+ next_mail: bool,
+ opts: &'a FmtOptions,
+}
+
+impl ParagraphStream<'_> {
+ pub(super) fn new<'b>(opts: &'b FmtOptions, text: &'b str) -> ParagraphStream<'b> {
+ let lines = FileLines::new(opts, text.lines()).peekable();
+ // at the beginning of the file, we might find mail headers
+ ParagraphStream {
+ lines,
+ next_mail: true,
+ opts,
+ }
+ }
+
+ /// Detect RFC822 mail header
+ fn is_mail_header(line: &FileLine) -> bool {
+ // a mail header begins with either "From " (envelope sender line)
+ // or with a sequence of printable ASCII chars (33 to 126, inclusive,
+ // except colon) followed by a colon.
+ if line.indent_end > 0 {
+ false
+ } else {
+ let l_slice = &line.line[..];
+ if l_slice.starts_with("From ") {
+ true
+ } else {
+ let Some(colon_posn) = l_slice.find(':') else {
+ return false;
+ };
+
+ // header field must be nonzero length
+ if colon_posn == 0 {
+ return false;
+ }
+
+ l_slice[..colon_posn]
+ .chars()
+ .all(|x| !matches!(x as usize, y if !(33..=126).contains(&y)))
+ }
+ }
+ }
+}
+
+impl Iterator for ParagraphStream<'_> {
+ type Item = Result<Paragraph, String>;
+
+ #[allow(clippy::cognitive_complexity)]
+ fn next(&mut self) -> Option<Result<Paragraph, String>> {
+ // return a NoFormatLine in an Err; it should immediately be output
+ let noformat = match self.lines.peek()? {
+ Line::FormatLine(_) => false,
+ Line::NoFormatLine(_, _) => true,
+ };
+
+ // found a NoFormatLine, immediately dump it out
+ if noformat {
+ let (s, nm) = self.lines.next().unwrap().get_noformatline();
+ self.next_mail = nm;
+ return Some(Err(s));
+ }
+
+ // found a FormatLine, now build a paragraph
+ let mut init_str = String::new();
+ let mut init_end = 0;
+ let mut init_len = 0;
+ let mut indent_str = String::new();
+ let mut indent_end = 0;
+ let mut indent_len = 0;
+ let mut prefix_len = 0;
+ let mut prefix_indent_end = 0;
+ let mut p_lines = Vec::new();
+
+ let mut in_mail = false;
+ let mut second_done = false; // for when we use crown or tagged mode
+ loop {
+ // peek ahead
+ // need to explicitly force fl out of scope before we can call self.lines.next()
+ let Some(Line::FormatLine(fl)) = self.lines.peek() else {
+ break;
+ };
+
+ if p_lines.is_empty() {
+ // first time through the loop, get things set up
+ // detect mail header
+ if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
+ in_mail = true;
+ // there can't be any indent or prefixindent because otherwise is_mail_header
+ // would fail since there cannot be any whitespace before the colon in a
+ // valid header field
+ indent_str.push_str(" ");
+ indent_len = 2;
+ } else {
+ if self.opts.crown_margin || self.opts.tagged_paragraph {
+ init_str.push_str(&fl.line[..fl.indent_end]);
+ init_len = fl.indent_len;
+ init_end = fl.indent_end;
+ } else {
+ second_done = true;
+ }
+
+ // these will be overwritten in the 2nd line of crown or tagged mode, but
+ // we are not guaranteed to get to the 2nd line, e.g., if the next line
+ // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
+ indent_str.push_str(&fl.line[..fl.indent_end]);
+ indent_len = fl.indent_len;
+ indent_end = fl.indent_end;
+
+ // save these to check for matching lines
+ prefix_len = fl.prefix_len;
+ prefix_indent_end = fl.prefix_indent_end;
+
+ // in tagged mode, add 4 spaces of additional indenting by default
+ // (gnu fmt's behavior is different: it seems to find the closest column to
+ // indent_end that is divisible by 3. But honestly that behavior seems
+ // pretty arbitrary.
+ // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.
+ if self.opts.tagged_paragraph {
+ indent_str.push_str(" ");
+ indent_len += 4;
+ }
+ }
+ } else if in_mail {
+ // lines following mail headers must begin with spaces
+ if fl.indent_end == 0 || (self.opts.prefix.is_some() && fl.prefix_indent_end == 0) {
+ break; // this line does not begin with spaces
+ }
+ } else if !second_done {
+ // now we have enough info to handle crown margin and tagged mode
+
+ // in both crown and tagged modes we require that prefix_len is the same
+ if prefix_len != fl.prefix_len || prefix_indent_end != fl.prefix_indent_end {
+ break;
+ }
+
+ // in tagged mode, indent has to be *different* on following lines
+ if self.opts.tagged_paragraph
+ && indent_len - 4 == fl.indent_len
+ && indent_end == fl.indent_end
+ {
+ break;
+ }
+
+ // this is part of the same paragraph, get the indent info from this line
+ indent_str.clear();
+ indent_str.push_str(&fl.line[..fl.indent_end]);
+ indent_len = fl.indent_len;
+ indent_end = fl.indent_end;
+
+ second_done = true;
+ } else {
+ // detect mismatch
+ if indent_end != fl.indent_end
+ || prefix_indent_end != fl.prefix_indent_end
+ || indent_len != fl.indent_len
+ || prefix_len != fl.prefix_len
+ {
+ break;
+ }
+ }
+
+ p_lines.push(self.lines.next().unwrap().get_formatline().line);
+
+ // when we're in split-only mode, we never join lines, so stop here
+ if self.opts.split_only {
+ break;
+ }
+ }
+
+ // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
+ // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
+ // NoFormatLine.
+ self.next_mail = in_mail;
+
+ Some(Ok(Paragraph {
+ lines: p_lines,
+ init_str,
+ init_len,
+ init_end,
+ indent_str,
+ indent_len,
+ indent_end,
+ mail_header: in_mail,
+ }))
+ }
+}
+
+pub(super) struct ParaWords<'a> {
+ opts: &'a FmtOptions,
+ para: &'a Paragraph,
+ words: Vec<WordInfo<'a>>,
+}
+
+impl<'a> ParaWords<'a> {
+ pub(super) fn new(opts: &'a FmtOptions, para: &'a Paragraph) -> Self {
+ let mut pw = ParaWords {
+ opts,
+ para,
+ words: Vec::new(),
+ };
+ pw.create_words();
+ pw
+ }
+
+ fn create_words(&mut self) {
+ if self.para.mail_header {
+ // no extra spacing for mail headers; always exactly 1 space
+ // safe to trim_start on every line of a mail header, since the
+ // first line is guaranteed not to have any spaces
+ self.words.extend(
+ self.para
+ .lines
+ .iter()
+ .flat_map(|x| x.split_whitespace())
+ .map(|x| WordInfo {
+ word: x,
+ word_start: 0,
+ word_nchars: x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped)
+ before_tab: None,
+ after_tab: 0,
+ sentence_start: false,
+ ends_punct: false,
+ new_line: false,
+ }),
+ );
+ } else {
+ // first line
+ self.words
+ .extend(if self.opts.crown_margin || self.opts.tagged_paragraph {
+ // crown and tagged mode has the "init" in the first line, so slice from there
+ WordSplit::new(self.opts, &self.para.lines[0][self.para.init_end..])
+ } else {
+ // otherwise we slice from the indent
+ WordSplit::new(self.opts, &self.para.lines[0][self.para.indent_end..])
+ });
+
+ if self.para.lines.len() > 1 {
+ let indent_end = self.para.indent_end;
+ let opts = self.opts;
+ self.words.extend(
+ self.para
+ .lines
+ .iter()
+ .skip(1)
+ .flat_map(|x| WordSplit::new(opts, &x[indent_end..])),
+ );
+ }
+ }
+ }
+
+ pub(super) fn words(&'a self) -> Iter<'a, WordInfo<'a>> {
+ self.words.iter()
+ }
+}
+
+struct WordSplit<'a> {
+ opts: &'a FmtOptions,
+ string: &'a str,
+ length: usize,
+ position: usize,
+ prev_punct: bool,
+}
+
+impl WordSplit<'_> {
+ fn analyze_tabs(&self, string: &str) -> (Option<usize>, usize, Option<usize>) {
+ // given a string, determine (length before tab) and (printed length after first tab)
+ // if there are no tabs, beforetab = -1 and aftertab is the printed length
+ let mut beforetab = None;
+ let mut aftertab = 0;
+ let mut word_start = None;
+ for (os, c) in string.char_indices() {
+ if !c.is_whitespace() {
+ word_start = Some(os);
+ break;
+ } else if c == '\t' {
+ if beforetab.is_none() {
+ beforetab = Some(aftertab);
+ aftertab = 0;
+ } else {
+ aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth;
+ }
+ } else {
+ aftertab += 1;
+ }
+ }
+ (beforetab, aftertab, word_start)
+ }
+}
+
+impl WordSplit<'_> {
+ fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
+ // wordsplits *must* start at a non-whitespace character
+ let trim_string = string.trim_start();
+ WordSplit {
+ opts,
+ string: trim_string,
+ length: string.len(),
+ position: 0,
+ prev_punct: false,
+ }
+ }
+
+ fn is_punctuation(c: char) -> bool {
+ matches!(c, '!' | '.' | '?')
+ }
+}
+
+pub(super) struct WordInfo<'a> {
+ pub word: &'a str,
+ pub word_start: usize,
+ pub word_nchars: usize,
+ pub before_tab: Option<usize>,
+ pub after_tab: usize,
+ pub sentence_start: bool,
+ pub ends_punct: bool,
+ pub new_line: bool,
+}
+
+// returns (&str, is_start_of_sentence)
+impl<'a> Iterator for WordSplit<'a> {
+ type Item = WordInfo<'a>;
+
+ fn next(&mut self) -> Option<WordInfo<'a>> {
+ if self.position >= self.length {
+ return None;
+ }
+
+ let old_position = self.position;
+ let new_line = old_position == 0;
+
+ // find the start of the next word, and record if we find a tab character
+ let (before_tab, after_tab, word_start) =
+ if let (b, a, Some(s)) = self.analyze_tabs(&self.string[old_position..]) {
+ (b, a, s + old_position)
+ } else {
+ self.position = self.length;
+ return None;
+ };
+
+ // find the beginning of the next whitespace
+ // note that this preserves the invariant that self.position
+ // points to whitespace character OR end of string
+ let mut word_nchars = 0;
+ self.position = match self.string[word_start..].find(|x: char| {
+ if x.is_whitespace() {
+ true
+ } else {
+ word_nchars += char_width(x);
+ false
+ }
+ }) {
+ None => self.length,
+ Some(s) => s + word_start,
+ };
+
+ let word_start_relative = word_start - old_position;
+ // if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence.
+ let is_start_of_sentence =
+ self.prev_punct && (before_tab.is_some() || word_start_relative > 1);
+
+ // now record whether this word ends in punctuation
+ self.prev_punct = match self.string[..self.position].chars().next_back() {
+ Some(ch) => WordSplit::is_punctuation(ch),
+ _ => panic!("fatal: expected word not to be empty"),
+ };
+
+ let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform {
+ (&self.string[word_start..self.position], 0, None, 0)
+ } else {
+ (
+ &self.string[old_position..self.position],
+ word_start_relative,
+ before_tab,
+ after_tab,
+ )
+ };
+
+ Some(WordInfo {
+ word,
+ word_start: word_start_relative,
+ word_nchars,
+ before_tab,
+ after_tab,
+ sentence_start: is_start_of_sentence,
+ ends_punct: self.prev_punct,
+ new_line,
+ })
+ }
+}