diff options
Diffstat (limited to 'crates/fmt/src/linebreak.rs')
-rw-r--r-- | crates/fmt/src/linebreak.rs | 520 |
1 files changed, 520 insertions, 0 deletions
diff --git a/crates/fmt/src/linebreak.rs b/crates/fmt/src/linebreak.rs new file mode 100644 index 0000000..b1dc6fa --- /dev/null +++ b/crates/fmt/src/linebreak.rs @@ -0,0 +1,520 @@ +// yt - A fully featured command line YouTube client +// +// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de> +// Copyright (C) 2025 uutils developers +// SPDX-License-Identifier: MIT +// +// This file is part of Yt. +// +// You should have received a copy of the License along with this program. +// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. + +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::fmt::Write; +use std::{cmp, mem}; + +use crate::FmtOptions; +use crate::parasplit::{ParaWords, Paragraph, WordInfo}; + +struct BreakArgs<'a> { + opts: &'a FmtOptions, + init_len: usize, + indent_str: &'a str, + indent_len: usize, + uniform: bool, + output: String, +} + +impl BreakArgs<'_> { + fn compute_width(&self, winfo: &WordInfo<'_>, position_n: usize, fresh: bool) -> usize { + if fresh { + 0 + } else { + let post = winfo.after_tab; + match winfo.before_tab { + None => post, + Some(pre) => { + post + ((pre + position_n) / self.opts.tabwidth + 1) * self.opts.tabwidth + - position_n + } + } + } + } +} + +pub(super) fn break_lines(para: &Paragraph, opts: &FmtOptions) -> String { + let mut output = String::new(); + + // indent + let p_indent = ¶.indent_str; + let p_indent_len = para.indent_len; + + // words + let p_words = ParaWords::new(opts, para); + let mut p_words_words = p_words.words(); + + // the first word will *always* appear on the first line + // make sure of this here + let Some(winfo) = p_words_words.next() else { + return "\n".to_owned(); + }; + + // print the init, if it exists, and get its length + let p_init_len = winfo.word_nchars + + if opts.crown_margin || opts.tagged_paragraph { + // handle "init" portion + output.push_str(¶.init_str); + para.init_len + } else if !para.mail_header { + // for non-(crown, tagged) that's the same as a normal indent + output.push_str(p_indent); + p_indent_len + } else { + // except that mail headers get no indent at all + 0 + }; + + // write first word after writing init + write!(output, "{}", winfo.word).expect("Works"); + + // does this paragraph require uniform spacing? + let uniform = para.mail_header || opts.uniform; + + let mut break_args = BreakArgs { + opts, + init_len: p_init_len, + indent_str: p_indent, + indent_len: p_indent_len, + uniform, + output, + }; + + if opts.quick || para.mail_header { + break_simple(p_words_words, &mut break_args); + } else { + break_knuth_plass(p_words_words, &mut break_args); + }; + + break_args.output +} + +// break_simple implements a "greedy" breaking algorithm: print words until +// maxlength would be exceeded, then print a linebreak and indent and continue. +fn break_simple<'a, T: Iterator<Item = &'a WordInfo<'a>>>(iter: T, args: &mut BreakArgs<'a>) { + iter.fold((args.init_len, false), |(l, prev_punct), winfo| { + accum_words_simple(args, l, prev_punct, winfo) + }); + args.output.push('\n'); +} + +fn accum_words_simple<'a>( + args: &mut BreakArgs<'a>, + l: usize, + prev_punct: bool, + winfo: &'a WordInfo<'a>, +) -> (usize, bool) { + // compute the length of this word, considering how tabs will expand at this position on the line + let wlen = winfo.word_nchars + args.compute_width(winfo, l, false); + + let slen = compute_slen( + args.uniform, + winfo.new_line, + winfo.sentence_start, + prev_punct, + ); + + if l + wlen + slen > args.opts.width { + write_newline(args.indent_str, &mut args.output); + write_with_spaces(&winfo.word[winfo.word_start..], 0, &mut args.output); + (args.indent_len + winfo.word_nchars, winfo.ends_punct) + } else { + write_with_spaces(winfo.word, slen, &mut args.output); + (l + wlen + slen, winfo.ends_punct) + } +} + +// break_knuth_plass implements an "optimal" breaking algorithm in the style of +// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software, +// Practice and Experience. Vol. 11, No. 11, November 1981. +// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf +#[allow(trivial_casts)] +fn break_knuth_plass<'a, T: Clone + Iterator<Item = &'a WordInfo<'a>>>( + mut iter: T, + args: &mut BreakArgs<'a>, +) { + // run the algorithm to get the breakpoints + let breakpoints = find_kp_breakpoints(iter.clone(), args); + + // iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it + let result: (bool, bool) = breakpoints.iter().rev().fold( + (false, false), + |(mut prev_punct, mut fresh), &(next_break, break_before)| { + if fresh { + write_newline(args.indent_str, &mut args.output); + } + // at each breakpoint, keep emitting words until we find the word matching this breakpoint + for winfo in &mut iter { + let (slen, word) = slice_if_fresh( + fresh, + winfo.word, + winfo.word_start, + args.uniform, + winfo.new_line, + winfo.sentence_start, + prev_punct, + ); + fresh = false; + prev_punct = winfo.ends_punct; + + // We find identical breakpoints here by comparing addresses of the references. + // This is OK because the backing vector is not mutating once we are linebreaking. + let winfo_ptr = winfo as *const _; + let next_break_ptr = next_break as *const _; + if winfo_ptr == next_break_ptr { + // OK, we found the matching word + if break_before { + write_newline(args.indent_str, &mut args.output); + write_with_spaces(&winfo.word[winfo.word_start..], 0, &mut args.output); + } else { + // breaking after this word, so that means "fresh" is true for the next iteration + write_with_spaces(word, slen, &mut args.output); + fresh = true; + } + break; + } + write_with_spaces(word, slen, &mut args.output); + } + (prev_punct, fresh) + }, + ); + let (mut prev_punct, mut fresh) = result; + + // after the last linebreak, write out the rest of the final line. + for winfo in iter { + if fresh { + write_newline(args.indent_str, &mut args.output); + } + let (slen, word) = slice_if_fresh( + fresh, + winfo.word, + winfo.word_start, + args.uniform, + winfo.new_line, + winfo.sentence_start, + prev_punct, + ); + prev_punct = winfo.ends_punct; + fresh = false; + write_with_spaces(word, slen, &mut args.output); + } + + args.output.push('\n'); +} + +struct LineBreak<'a> { + prev: usize, + linebreak: Option<&'a WordInfo<'a>>, + break_before: bool, + demerits: i64, + prev_rat: f32, + length: usize, + fresh: bool, +} + +#[allow(clippy::cognitive_complexity)] +#[allow(clippy::cast_possible_wrap)] +fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>( + iter: T, + args: &BreakArgs<'a>, +) -> Vec<(&'a WordInfo<'a>, bool)> { + let mut iter = iter.peekable(); + // set up the initial null linebreak + let mut linebreaks = vec![LineBreak { + prev: 0, + linebreak: None, + break_before: false, + demerits: 0, + prev_rat: 0.0, + length: args.init_len, + fresh: false, + }]; + // this vec holds the current active linebreaks; next_ holds the breaks that will be active for + // the next word + let mut active_breaks = vec![0]; + let mut next_active_breaks = vec![]; + + let stretch = args.opts.width - args.opts.goal; + let minlength = args.opts.goal.max(stretch + 1) - stretch; + let mut new_linebreaks = vec![]; + let mut is_sentence_start = false; + let mut least_demerits = 0; + loop { + let Some(w) = iter.next() else { break }; + + // if this is the last word, we don't add additional demerits for this break + let (is_last_word, is_sentence_end) = match iter.peek() { + None => (true, true), + Some(&&WordInfo { + sentence_start: st, + new_line: nl, + .. + }) => (false, st || (nl && w.ends_punct)), + }; + + // should we be adding extra space at the beginning of the next sentence? + let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false); + + let mut ld_new = i64::MAX; + let mut ld_next = i64::MAX; + let mut ld_idx = 0; + new_linebreaks.clear(); + next_active_breaks.clear(); + // go through each active break, extending it and possibly adding a new active + // break if we are above the minimum required length + #[allow(clippy::explicit_iter_loop)] + for &i in active_breaks.iter() { + let active = &mut linebreaks[i]; + // normalize demerits to avoid overflow, and record if this is the least + active.demerits -= least_demerits; + if active.demerits < ld_next { + ld_next = active.demerits; + ld_idx = i; + } + + // get the new length + let tlen = w.word_nchars + + args.compute_width(w, active.length, active.fresh) + + slen + + active.length; + + // if tlen is longer than args.opts.width, we drop this break from the active list + // otherwise, we extend the break, and possibly add a new break at this point + if tlen <= args.opts.width { + // this break will still be active next time + next_active_breaks.push(i); + // we can put this word on this line + active.fresh = false; + active.length = tlen; + + // if we're above the minlength, we can also consider breaking here + if tlen >= minlength { + let (new_demerits, new_ratio) = if is_last_word { + // there is no penalty for the final line's length + (0, 0.0) + } else { + compute_demerits( + args.opts.goal as isize - tlen as isize, + stretch, + w.word_nchars, + active.prev_rat, + ) + }; + + // do not even consider adding a line that has too many demerits + // also, try to detect overflow by checking signum + let total_demerits = new_demerits + active.demerits; + if new_demerits < BAD_INFTY_SQ + && total_demerits < ld_new + && active.demerits.signum() <= new_demerits.signum() + { + ld_new = total_demerits; + new_linebreaks.push(LineBreak { + prev: i, + linebreak: Some(w), + break_before: false, + demerits: total_demerits, + prev_rat: new_ratio, + length: args.indent_len, + fresh: true, + }); + } + } + } + } + + // if we generated any new linebreaks, add the last one to the list + // the last one is always the best because we don't add to new_linebreaks unless + // it's better than the best one so far + match new_linebreaks.pop() { + None => (), + Some(lb) => { + next_active_breaks.push(linebreaks.len()); + linebreaks.push(lb); + } + } + + if next_active_breaks.is_empty() { + // every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx + let new_break = + restart_active_breaks(args, &linebreaks[ld_idx], ld_idx, w, slen, minlength); + next_active_breaks.push(linebreaks.len()); + linebreaks.push(new_break); + least_demerits = 0; + } else { + // next time around, normalize out the demerits fields + // on active linebreaks to make overflow less likely + least_demerits = cmp::max(ld_next, 0); + } + // swap in new list of active breaks + mem::swap(&mut active_breaks, &mut next_active_breaks); + // If this was the last word in a sentence, the next one must be the first in the next. + is_sentence_start = is_sentence_end; + } + + // return the best path + build_best_path(&linebreaks, &active_breaks) +} + +fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a WordInfo<'a>, bool)> { + // of the active paths, we select the one with the fewest demerits + active + .iter() + .min_by_key(|&&a| paths[a].demerits) + .map(|&(mut best_idx)| { + let mut breakwords = vec![]; + // now, chase the pointers back through the break list, recording + // the words at which we should break + loop { + let next_best = &paths[best_idx]; + match next_best.linebreak { + None => return breakwords, + Some(prev) => { + breakwords.push((prev, next_best.break_before)); + best_idx = next_best.prev; + } + } + } + }) + .unwrap_or_default() +} + +// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed +const BAD_INFTY: i64 = 10_000_000; +const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY; +// badness = BAD_MULT * abs(r) ^ 3 +const BAD_MULT: f32 = 100.0; +// DR_MULT is multiplier for delta-R between lines +const DR_MULT: f32 = 600.0; +// DL_MULT is penalty multiplier for short words at end of line +const DL_MULT: f32 = 300.0; + +#[allow(clippy::cast_precision_loss)] +#[allow(clippy::cast_possible_truncation)] +fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) { + // how much stretch are we using? + let ratio = if delta_len == 0 { + 0.0f32 + } else { + delta_len as f32 / stretch as f32 + }; + + // compute badness given the stretch ratio + let bad_linelen = if ratio.abs() > 1.0f32 { + BAD_INFTY + } else { + (BAD_MULT * ratio.powi(3).abs()) as i64 + }; + + // we penalize lines ending in really short words + let bad_wordlen = if wlen >= stretch { + 0 + } else { + (DL_MULT + * ((stretch - wlen) as f32 / (stretch - 1) as f32) + .powi(3) + .abs()) as i64 + }; + + // we penalize lines that have very different ratios from previous lines + let bad_delta_r = (DR_MULT * ((ratio - prev_rat) / 2.0).powi(3).abs()) as i64; + + let demerits = i64::pow(1 + bad_linelen + bad_wordlen + bad_delta_r, 2); + + (demerits, ratio) +} + +#[allow(clippy::cast_possible_wrap)] +fn restart_active_breaks<'a>( + args: &BreakArgs<'a>, + active: &LineBreak<'a>, + act_idx: usize, + w: &'a WordInfo<'a>, + slen: usize, + min: usize, +) -> LineBreak<'a> { + let (break_before, line_length) = if active.fresh { + // never break before a word if that word would be the first on a line + (false, args.indent_len) + } else { + // choose the lesser evil: breaking too early, or breaking too late + let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh); + let underlen = min as isize - active.length as isize; + let overlen = (wlen + slen + active.length) as isize - args.opts.width as isize; + if overlen > underlen { + // break early, put this word on the next line + (true, args.indent_len + w.word_nchars) + } else { + (false, args.indent_len) + } + }; + + // restart the linebreak. This will be our only active path. + LineBreak { + prev: act_idx, + linebreak: Some(w), + break_before, + demerits: 0, // this is the only active break, so we can reset the demerit count + prev_rat: if break_before { 1.0 } else { -1.0 }, + length: line_length, + fresh: !break_before, + } +} + +// Number of spaces to add before a word, based on mode, newline, sentence start. +#[allow(clippy::fn_params_excessive_bools)] +fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> usize { + if uniform || newline { + if start || (newline && punct) { 2 } else { 1 } + } else { + 0 + } +} + +// If we're on a fresh line, slen=0 and we slice off leading whitespace. +// Otherwise, compute slen and leave whitespace alone. +#[allow(clippy::fn_params_excessive_bools)] +fn slice_if_fresh( + fresh: bool, + word: &str, + start: usize, + uniform: bool, + newline: bool, + second_start: bool, + punct: bool, +) -> (usize, &str) { + if fresh { + (0, &word[start..]) + } else { + (compute_slen(uniform, newline, second_start, punct), word) + } +} + +// Write a newline and add the indent. +fn write_newline(indent: &str, output: &mut String) { + output.push('\n'); + output.push_str(indent); +} + +// Write the word, along with slen spaces. +fn write_with_spaces(word: &str, slen: usize, output: &mut String) { + if slen == 2 { + output.push_str(" "); + } else if slen == 1 { + output.push(' '); + } + output.push_str(word); +} |