diff options
Diffstat (limited to 'crates/fmt')
-rw-r--r-- | crates/fmt/Cargo.toml | 30 | ||||
-rw-r--r-- | crates/fmt/LICENSE | 18 | ||||
-rw-r--r-- | crates/fmt/LICENSE.license | 10 | ||||
-rw-r--r-- | crates/fmt/src/fmt.rs | 137 | ||||
-rw-r--r-- | crates/fmt/src/linebreak.rs | 520 | ||||
-rw-r--r-- | crates/fmt/src/parasplit.rs | 629 |
6 files changed, 1344 insertions, 0 deletions
diff --git a/crates/fmt/Cargo.toml b/crates/fmt/Cargo.toml new file mode 100644 index 0000000..7f82a09 --- /dev/null +++ b/crates/fmt/Cargo.toml @@ -0,0 +1,30 @@ +# yt - A fully featured command line YouTube client +# +# Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de> +# Copyright (C) 2025 uutils developers +# SPDX-License-Identifier: MIT +# +# This file is part of Yt. +# +# You should have received a copy of the License along with this program. +# If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. + +[package] +name = "uu_fmt" +authors = ["uutils developers", "Benedikt Peetz <benedikt.peetz@b-peetz.de>"] +license = "MIT" +description = "A fork of the uutils fmt tool. This fork is a library instead of a binary." +version.workspace = true +edition.workspace = true +repository.workspace = true +rust-version.workspace = true +publish = false + +[lib] +path = "src/fmt.rs" + +[dependencies] +unicode-width = "0.2.0" + +[lints] +workspace = true diff --git a/crates/fmt/LICENSE b/crates/fmt/LICENSE new file mode 100644 index 0000000..21bd444 --- /dev/null +++ b/crates/fmt/LICENSE @@ -0,0 +1,18 @@ +Copyright (c) uutils developers + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/crates/fmt/LICENSE.license b/crates/fmt/LICENSE.license new file mode 100644 index 0000000..6cee99d --- /dev/null +++ b/crates/fmt/LICENSE.license @@ -0,0 +1,10 @@ +yt - A fully featured command line YouTube client + +Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de> +Copyright (C) 2025 uutils developers +SPDX-License-Identifier: MIT + +This file is part of Yt. + +You should have received a copy of the License along with this program. +If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. diff --git a/crates/fmt/src/fmt.rs b/crates/fmt/src/fmt.rs new file mode 100644 index 0000000..3067bea --- /dev/null +++ b/crates/fmt/src/fmt.rs @@ -0,0 +1,137 @@ +// yt - A fully featured command line YouTube client +// +// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de> +// Copyright (C) 2025 uutils developers +// SPDX-License-Identifier: MIT +// +// This file is part of Yt. +// +// You should have received a copy of the License along with this program. +// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. + +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::fmt::Write; + +use linebreak::break_lines; +use parasplit::ParagraphStream; + +mod linebreak; +mod parasplit; + +#[derive(Debug)] +#[allow(clippy::struct_excessive_bools)] +pub struct FmtOptions { + /// First and second line of paragraph + /// may have different indentations, in which + /// case the first line's indentation is preserved, + /// and each subsequent line's indentation matches the second line. + pub crown_margin: bool, + + /// Like the [`crown_margin`], except that the first and second line of a paragraph *must* + /// have different indentation or they are treated as separate paragraphs. + pub tagged_paragraph: bool, + + /// Attempt to detect and preserve mail headers in the input. + /// Be careful when combining this with [`prefix`]. + pub mail: bool, + + /// Split lines only, do not reflow. + pub split_only: bool, + + /// Insert exactly one space between words, and two between sentences. + /// Sentence breaks in the input are detected as [?!.] followed by two spaces or a newline; + /// other punctuation is not interpreted as a sentence break. + pub uniform: bool, + + /// Reformat only lines beginning with PREFIX, reattaching PREFIX to reformatted lines. + /// Unless [`exact_prefix`] is specified, leading whitespace will be ignored when matching PREFIX. + pub prefix: Option<String>, + + /// Do not reformat lines beginning with ``ANTI_PREFIX``. + /// Unless [`exact_anti_prefix`] is specified, leading whitespace will be ignored when matching ``ANTI_PREFIX``. + pub anti_prefix: Option<String>, + + /// [`prefix`] must match at the beginning of the line with no preceding whitespace. + pub exact_prefix: bool, + + /// [`anti_prefix`] must match at the beginning of the line with no preceding whitespace. + pub exact_anti_prefix: bool, + + /// Fill output lines up to a maximum of WIDTH columns, default 75. + pub width: usize, + + /// Goal width, default of 93% of WIDTH. + /// Must be less than or equal to WIDTH. + pub goal: usize, + + /// Break lines more quickly at the expense of a potentially more ragged appearance. + pub quick: bool, + + /// Treat tabs as TABWIDTH spaces for determining line length, default 8. + /// Note that this is used only for calculating line lengths; tabs are preserved in the output. + pub tabwidth: usize, +} + +impl FmtOptions { + #[must_use] + #[allow(clippy::cast_sign_loss)] + #[allow(clippy::cast_possible_truncation)] + #[allow(clippy::cast_precision_loss)] + pub fn new(width: Option<usize>, goal: Option<usize>, tabwidth: Option<usize>) -> Self { + // by default, goal is 93% of width + const DEFAULT_GOAL_TO_WIDTH_RATIO: f64 = 0.93; + const DEFAULT_WIDTH: usize = 75; + + FmtOptions { + crown_margin: false, + tagged_paragraph: false, + mail: false, + split_only: false, + uniform: false, + prefix: None, + anti_prefix: None, + exact_prefix: false, + exact_anti_prefix: false, + width: width.unwrap_or(DEFAULT_WIDTH), + goal: goal.unwrap_or( + ((width.unwrap_or(DEFAULT_WIDTH) as f64) * DEFAULT_GOAL_TO_WIDTH_RATIO).floor() + as usize, + ), + quick: false, + tabwidth: tabwidth.unwrap_or(8), + } + } +} + +/// Process text and format it according to the provided options. +/// +/// # Arguments +/// +/// * `text` - The text to process. +/// * `fmt_opts` - A reference to a [`FmtOptions`] structure containing the formatting options. +/// +/// # Returns +/// +/// The formatted [`String`]. +#[must_use] +pub fn process_text(text: &str, fmt_opts: &FmtOptions) -> String { + let mut output = String::new(); + + let p_stream = ParagraphStream::new(fmt_opts, text); + for para_result in p_stream { + match para_result { + Err(s) => { + output.push_str(&s); + output.push('\n'); + } + Ok(para) => write!(output, "{}", break_lines(¶, fmt_opts)) + .expect("This is in-memory. It should not fail"), + } + } + + output +} diff --git a/crates/fmt/src/linebreak.rs b/crates/fmt/src/linebreak.rs new file mode 100644 index 0000000..b1dc6fa --- /dev/null +++ b/crates/fmt/src/linebreak.rs @@ -0,0 +1,520 @@ +// yt - A fully featured command line YouTube client +// +// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de> +// Copyright (C) 2025 uutils developers +// SPDX-License-Identifier: MIT +// +// This file is part of Yt. +// +// You should have received a copy of the License along with this program. +// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. + +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::fmt::Write; +use std::{cmp, mem}; + +use crate::FmtOptions; +use crate::parasplit::{ParaWords, Paragraph, WordInfo}; + +struct BreakArgs<'a> { + opts: &'a FmtOptions, + init_len: usize, + indent_str: &'a str, + indent_len: usize, + uniform: bool, + output: String, +} + +impl BreakArgs<'_> { + fn compute_width(&self, winfo: &WordInfo<'_>, position_n: usize, fresh: bool) -> usize { + if fresh { + 0 + } else { + let post = winfo.after_tab; + match winfo.before_tab { + None => post, + Some(pre) => { + post + ((pre + position_n) / self.opts.tabwidth + 1) * self.opts.tabwidth + - position_n + } + } + } + } +} + +pub(super) fn break_lines(para: &Paragraph, opts: &FmtOptions) -> String { + let mut output = String::new(); + + // indent + let p_indent = ¶.indent_str; + let p_indent_len = para.indent_len; + + // words + let p_words = ParaWords::new(opts, para); + let mut p_words_words = p_words.words(); + + // the first word will *always* appear on the first line + // make sure of this here + let Some(winfo) = p_words_words.next() else { + return "\n".to_owned(); + }; + + // print the init, if it exists, and get its length + let p_init_len = winfo.word_nchars + + if opts.crown_margin || opts.tagged_paragraph { + // handle "init" portion + output.push_str(¶.init_str); + para.init_len + } else if !para.mail_header { + // for non-(crown, tagged) that's the same as a normal indent + output.push_str(p_indent); + p_indent_len + } else { + // except that mail headers get no indent at all + 0 + }; + + // write first word after writing init + write!(output, "{}", winfo.word).expect("Works"); + + // does this paragraph require uniform spacing? + let uniform = para.mail_header || opts.uniform; + + let mut break_args = BreakArgs { + opts, + init_len: p_init_len, + indent_str: p_indent, + indent_len: p_indent_len, + uniform, + output, + }; + + if opts.quick || para.mail_header { + break_simple(p_words_words, &mut break_args); + } else { + break_knuth_plass(p_words_words, &mut break_args); + }; + + break_args.output +} + +// break_simple implements a "greedy" breaking algorithm: print words until +// maxlength would be exceeded, then print a linebreak and indent and continue. +fn break_simple<'a, T: Iterator<Item = &'a WordInfo<'a>>>(iter: T, args: &mut BreakArgs<'a>) { + iter.fold((args.init_len, false), |(l, prev_punct), winfo| { + accum_words_simple(args, l, prev_punct, winfo) + }); + args.output.push('\n'); +} + +fn accum_words_simple<'a>( + args: &mut BreakArgs<'a>, + l: usize, + prev_punct: bool, + winfo: &'a WordInfo<'a>, +) -> (usize, bool) { + // compute the length of this word, considering how tabs will expand at this position on the line + let wlen = winfo.word_nchars + args.compute_width(winfo, l, false); + + let slen = compute_slen( + args.uniform, + winfo.new_line, + winfo.sentence_start, + prev_punct, + ); + + if l + wlen + slen > args.opts.width { + write_newline(args.indent_str, &mut args.output); + write_with_spaces(&winfo.word[winfo.word_start..], 0, &mut args.output); + (args.indent_len + winfo.word_nchars, winfo.ends_punct) + } else { + write_with_spaces(winfo.word, slen, &mut args.output); + (l + wlen + slen, winfo.ends_punct) + } +} + +// break_knuth_plass implements an "optimal" breaking algorithm in the style of +// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software, +// Practice and Experience. Vol. 11, No. 11, November 1981. +// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf +#[allow(trivial_casts)] +fn break_knuth_plass<'a, T: Clone + Iterator<Item = &'a WordInfo<'a>>>( + mut iter: T, + args: &mut BreakArgs<'a>, +) { + // run the algorithm to get the breakpoints + let breakpoints = find_kp_breakpoints(iter.clone(), args); + + // iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it + let result: (bool, bool) = breakpoints.iter().rev().fold( + (false, false), + |(mut prev_punct, mut fresh), &(next_break, break_before)| { + if fresh { + write_newline(args.indent_str, &mut args.output); + } + // at each breakpoint, keep emitting words until we find the word matching this breakpoint + for winfo in &mut iter { + let (slen, word) = slice_if_fresh( + fresh, + winfo.word, + winfo.word_start, + args.uniform, + winfo.new_line, + winfo.sentence_start, + prev_punct, + ); + fresh = false; + prev_punct = winfo.ends_punct; + + // We find identical breakpoints here by comparing addresses of the references. + // This is OK because the backing vector is not mutating once we are linebreaking. + let winfo_ptr = winfo as *const _; + let next_break_ptr = next_break as *const _; + if winfo_ptr == next_break_ptr { + // OK, we found the matching word + if break_before { + write_newline(args.indent_str, &mut args.output); + write_with_spaces(&winfo.word[winfo.word_start..], 0, &mut args.output); + } else { + // breaking after this word, so that means "fresh" is true for the next iteration + write_with_spaces(word, slen, &mut args.output); + fresh = true; + } + break; + } + write_with_spaces(word, slen, &mut args.output); + } + (prev_punct, fresh) + }, + ); + let (mut prev_punct, mut fresh) = result; + + // after the last linebreak, write out the rest of the final line. + for winfo in iter { + if fresh { + write_newline(args.indent_str, &mut args.output); + } + let (slen, word) = slice_if_fresh( + fresh, + winfo.word, + winfo.word_start, + args.uniform, + winfo.new_line, + winfo.sentence_start, + prev_punct, + ); + prev_punct = winfo.ends_punct; + fresh = false; + write_with_spaces(word, slen, &mut args.output); + } + + args.output.push('\n'); +} + +struct LineBreak<'a> { + prev: usize, + linebreak: Option<&'a WordInfo<'a>>, + break_before: bool, + demerits: i64, + prev_rat: f32, + length: usize, + fresh: bool, +} + +#[allow(clippy::cognitive_complexity)] +#[allow(clippy::cast_possible_wrap)] +fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>( + iter: T, + args: &BreakArgs<'a>, +) -> Vec<(&'a WordInfo<'a>, bool)> { + let mut iter = iter.peekable(); + // set up the initial null linebreak + let mut linebreaks = vec![LineBreak { + prev: 0, + linebreak: None, + break_before: false, + demerits: 0, + prev_rat: 0.0, + length: args.init_len, + fresh: false, + }]; + // this vec holds the current active linebreaks; next_ holds the breaks that will be active for + // the next word + let mut active_breaks = vec![0]; + let mut next_active_breaks = vec![]; + + let stretch = args.opts.width - args.opts.goal; + let minlength = args.opts.goal.max(stretch + 1) - stretch; + let mut new_linebreaks = vec![]; + let mut is_sentence_start = false; + let mut least_demerits = 0; + loop { + let Some(w) = iter.next() else { break }; + + // if this is the last word, we don't add additional demerits for this break + let (is_last_word, is_sentence_end) = match iter.peek() { + None => (true, true), + Some(&&WordInfo { + sentence_start: st, + new_line: nl, + .. + }) => (false, st || (nl && w.ends_punct)), + }; + + // should we be adding extra space at the beginning of the next sentence? + let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false); + + let mut ld_new = i64::MAX; + let mut ld_next = i64::MAX; + let mut ld_idx = 0; + new_linebreaks.clear(); + next_active_breaks.clear(); + // go through each active break, extending it and possibly adding a new active + // break if we are above the minimum required length + #[allow(clippy::explicit_iter_loop)] + for &i in active_breaks.iter() { + let active = &mut linebreaks[i]; + // normalize demerits to avoid overflow, and record if this is the least + active.demerits -= least_demerits; + if active.demerits < ld_next { + ld_next = active.demerits; + ld_idx = i; + } + + // get the new length + let tlen = w.word_nchars + + args.compute_width(w, active.length, active.fresh) + + slen + + active.length; + + // if tlen is longer than args.opts.width, we drop this break from the active list + // otherwise, we extend the break, and possibly add a new break at this point + if tlen <= args.opts.width { + // this break will still be active next time + next_active_breaks.push(i); + // we can put this word on this line + active.fresh = false; + active.length = tlen; + + // if we're above the minlength, we can also consider breaking here + if tlen >= minlength { + let (new_demerits, new_ratio) = if is_last_word { + // there is no penalty for the final line's length + (0, 0.0) + } else { + compute_demerits( + args.opts.goal as isize - tlen as isize, + stretch, + w.word_nchars, + active.prev_rat, + ) + }; + + // do not even consider adding a line that has too many demerits + // also, try to detect overflow by checking signum + let total_demerits = new_demerits + active.demerits; + if new_demerits < BAD_INFTY_SQ + && total_demerits < ld_new + && active.demerits.signum() <= new_demerits.signum() + { + ld_new = total_demerits; + new_linebreaks.push(LineBreak { + prev: i, + linebreak: Some(w), + break_before: false, + demerits: total_demerits, + prev_rat: new_ratio, + length: args.indent_len, + fresh: true, + }); + } + } + } + } + + // if we generated any new linebreaks, add the last one to the list + // the last one is always the best because we don't add to new_linebreaks unless + // it's better than the best one so far + match new_linebreaks.pop() { + None => (), + Some(lb) => { + next_active_breaks.push(linebreaks.len()); + linebreaks.push(lb); + } + } + + if next_active_breaks.is_empty() { + // every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx + let new_break = + restart_active_breaks(args, &linebreaks[ld_idx], ld_idx, w, slen, minlength); + next_active_breaks.push(linebreaks.len()); + linebreaks.push(new_break); + least_demerits = 0; + } else { + // next time around, normalize out the demerits fields + // on active linebreaks to make overflow less likely + least_demerits = cmp::max(ld_next, 0); + } + // swap in new list of active breaks + mem::swap(&mut active_breaks, &mut next_active_breaks); + // If this was the last word in a sentence, the next one must be the first in the next. + is_sentence_start = is_sentence_end; + } + + // return the best path + build_best_path(&linebreaks, &active_breaks) +} + +fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a WordInfo<'a>, bool)> { + // of the active paths, we select the one with the fewest demerits + active + .iter() + .min_by_key(|&&a| paths[a].demerits) + .map(|&(mut best_idx)| { + let mut breakwords = vec![]; + // now, chase the pointers back through the break list, recording + // the words at which we should break + loop { + let next_best = &paths[best_idx]; + match next_best.linebreak { + None => return breakwords, + Some(prev) => { + breakwords.push((prev, next_best.break_before)); + best_idx = next_best.prev; + } + } + } + }) + .unwrap_or_default() +} + +// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed +const BAD_INFTY: i64 = 10_000_000; +const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY; +// badness = BAD_MULT * abs(r) ^ 3 +const BAD_MULT: f32 = 100.0; +// DR_MULT is multiplier for delta-R between lines +const DR_MULT: f32 = 600.0; +// DL_MULT is penalty multiplier for short words at end of line +const DL_MULT: f32 = 300.0; + +#[allow(clippy::cast_precision_loss)] +#[allow(clippy::cast_possible_truncation)] +fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) { + // how much stretch are we using? + let ratio = if delta_len == 0 { + 0.0f32 + } else { + delta_len as f32 / stretch as f32 + }; + + // compute badness given the stretch ratio + let bad_linelen = if ratio.abs() > 1.0f32 { + BAD_INFTY + } else { + (BAD_MULT * ratio.powi(3).abs()) as i64 + }; + + // we penalize lines ending in really short words + let bad_wordlen = if wlen >= stretch { + 0 + } else { + (DL_MULT + * ((stretch - wlen) as f32 / (stretch - 1) as f32) + .powi(3) + .abs()) as i64 + }; + + // we penalize lines that have very different ratios from previous lines + let bad_delta_r = (DR_MULT * ((ratio - prev_rat) / 2.0).powi(3).abs()) as i64; + + let demerits = i64::pow(1 + bad_linelen + bad_wordlen + bad_delta_r, 2); + + (demerits, ratio) +} + +#[allow(clippy::cast_possible_wrap)] +fn restart_active_breaks<'a>( + args: &BreakArgs<'a>, + active: &LineBreak<'a>, + act_idx: usize, + w: &'a WordInfo<'a>, + slen: usize, + min: usize, +) -> LineBreak<'a> { + let (break_before, line_length) = if active.fresh { + // never break before a word if that word would be the first on a line + (false, args.indent_len) + } else { + // choose the lesser evil: breaking too early, or breaking too late + let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh); + let underlen = min as isize - active.length as isize; + let overlen = (wlen + slen + active.length) as isize - args.opts.width as isize; + if overlen > underlen { + // break early, put this word on the next line + (true, args.indent_len + w.word_nchars) + } else { + (false, args.indent_len) + } + }; + + // restart the linebreak. This will be our only active path. + LineBreak { + prev: act_idx, + linebreak: Some(w), + break_before, + demerits: 0, // this is the only active break, so we can reset the demerit count + prev_rat: if break_before { 1.0 } else { -1.0 }, + length: line_length, + fresh: !break_before, + } +} + +// Number of spaces to add before a word, based on mode, newline, sentence start. +#[allow(clippy::fn_params_excessive_bools)] +fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> usize { + if uniform || newline { + if start || (newline && punct) { 2 } else { 1 } + } else { + 0 + } +} + +// If we're on a fresh line, slen=0 and we slice off leading whitespace. +// Otherwise, compute slen and leave whitespace alone. +#[allow(clippy::fn_params_excessive_bools)] +fn slice_if_fresh( + fresh: bool, + word: &str, + start: usize, + uniform: bool, + newline: bool, + second_start: bool, + punct: bool, +) -> (usize, &str) { + if fresh { + (0, &word[start..]) + } else { + (compute_slen(uniform, newline, second_start, punct), word) + } +} + +// Write a newline and add the indent. +fn write_newline(indent: &str, output: &mut String) { + output.push('\n'); + output.push_str(indent); +} + +// Write the word, along with slen spaces. +fn write_with_spaces(word: &str, slen: usize, output: &mut String) { + if slen == 2 { + output.push_str(" "); + } else if slen == 1 { + output.push(' '); + } + output.push_str(word); +} diff --git a/crates/fmt/src/parasplit.rs b/crates/fmt/src/parasplit.rs new file mode 100644 index 0000000..d4723cb --- /dev/null +++ b/crates/fmt/src/parasplit.rs @@ -0,0 +1,629 @@ +// yt - A fully featured command line YouTube client +// +// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de> +// Copyright (C) 2025 uutils developers +// SPDX-License-Identifier: MIT +// +// This file is part of Yt. +// +// You should have received a copy of the License along with this program. +// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>. + +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::iter::Peekable; +use std::slice::Iter; +use unicode_width::UnicodeWidthChar; + +use crate::FmtOptions; + +fn char_width(c: char) -> usize { + if (c as usize) < 0xA0 { + // if it is ASCII, call it exactly 1 wide (including control chars) + // calling control chars' widths 1 is consistent with OpenBSD fmt + 1 + } else { + // otherwise, get the unicode width + // note that we shouldn't actually get None here because only c < 0xA0 + // can return None, but for safety and future-proofing we do it this way + UnicodeWidthChar::width(c).unwrap_or(1) + } +} + +// lines with PSKIP, lacking PREFIX, or which are entirely blank are +// NoFormatLines; otherwise, they are FormatLines +#[derive(Debug)] +pub(super) enum Line { + FormatLine(FileLine), + NoFormatLine(String, bool), +} + +impl Line { + // when we know that it's a FormatLine, as in the ParagraphStream iterator + fn get_formatline(self) -> FileLine { + match self { + Self::FormatLine(fl) => fl, + Self::NoFormatLine(..) => panic!("Found NoFormatLine when expecting FormatLine"), + } + } + + // when we know that it's a NoFormatLine, as in the ParagraphStream iterator + fn get_noformatline(self) -> (String, bool) { + match self { + Self::NoFormatLine(s, b) => (s, b), + Self::FormatLine(..) => panic!("Found FormatLine when expecting NoFormatLine"), + } + } +} + +/// Each line's prefix has to be considered to know whether to merge it with +/// the next line or not +#[derive(Debug)] +pub(super) struct FileLine { + line: String, + /// The end of the indent, always the start of the text + indent_end: usize, + + /// The end of the PREFIX's indent, that is, the spaces before the prefix + prefix_indent_end: usize, + + /// Display length of indent taking into account tabs + indent_len: usize, + + /// PREFIX indent length taking into account tabs + prefix_len: usize, +} + +/// Iterator that produces a stream of Lines from a file +pub(super) struct FileLines<'a> { + opts: &'a FmtOptions, + lines: std::str::Lines<'a>, +} + +impl FileLines<'_> { + fn new<'b>(opts: &'b FmtOptions, lines: std::str::Lines<'b>) -> FileLines<'b> { + FileLines { opts, lines } + } + + /// returns true if this line should be formatted + fn match_prefix(&self, line: &str) -> (bool, usize) { + let Some(prefix) = &self.opts.prefix else { + return (true, 0); + }; + + FileLines::match_prefix_generic(prefix, line, self.opts.exact_prefix) + } + + /// returns true if this line should be formatted + fn match_anti_prefix(&self, line: &str) -> bool { + let Some(anti_prefix) = &self.opts.anti_prefix else { + return true; + }; + + match FileLines::match_prefix_generic(anti_prefix, line, self.opts.exact_anti_prefix) { + (true, _) => false, + (_, _) => true, + } + } + + fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, usize) { + if line.starts_with(pfx) { + return (true, 0); + } + + if !exact { + // we do it this way rather than byte indexing to support unicode whitespace chars + for (i, char) in line.char_indices() { + if line[i..].starts_with(pfx) { + return (true, i); + } else if !char.is_whitespace() { + break; + } + } + } + + (false, 0) + } + + fn compute_indent(&self, string: &str, prefix_end: usize) -> (usize, usize, usize) { + let mut prefix_len = 0; + let mut indent_len = 0; + let mut indent_end = 0; + for (os, c) in string.char_indices() { + if os == prefix_end { + // we found the end of the prefix, so this is the printed length of the prefix here + prefix_len = indent_len; + } + + if (os >= prefix_end) && !c.is_whitespace() { + // found first non-whitespace after prefix, this is indent_end + indent_end = os; + break; + } else if c == '\t' { + // compute tab length + indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; + } else { + // non-tab character + indent_len += char_width(c); + } + } + (indent_end, prefix_len, indent_len) + } +} + +impl Iterator for FileLines<'_> { + type Item = Line; + + fn next(&mut self) -> Option<Line> { + let n = self.lines.next()?; + + // if this line is entirely whitespace, + // emit a blank line + // Err(true) indicates that this was a linebreak, + // which is important to know when detecting mail headers + if n.chars().all(char::is_whitespace) { + return Some(Line::NoFormatLine(String::new(), true)); + } + + let (pmatch, poffset) = self.match_prefix(n); + + // if this line does not match the prefix, + // emit the line unprocessed and iterate again + if !pmatch { + return Some(Line::NoFormatLine(n.to_owned(), false)); + } + + // if the line matches the prefix, but is blank after, + // don't allow lines to be combined through it (that is, + // treat it like a blank line, except that since it's + // not truly blank we will not allow mail headers on the + // following line) + if pmatch + && n[poffset + self.opts.prefix.as_ref().map_or(0, String::len)..] + .chars() + .all(char::is_whitespace) + { + return Some(Line::NoFormatLine(n.to_owned(), false)); + } + + // skip if this line matches the anti_prefix + // (NOTE definition of match_anti_prefix is TRUE if we should process) + if !self.match_anti_prefix(n) { + return Some(Line::NoFormatLine(n.to_owned(), false)); + } + + // figure out the indent, prefix, and prefixindent ending points + let prefix_end = poffset + self.opts.prefix.as_ref().map_or(0, String::len); + let (indent_end, prefix_len, indent_len) = self.compute_indent(n, prefix_end); + + Some(Line::FormatLine(FileLine { + line: n.to_owned(), + indent_end, + prefix_indent_end: poffset, + indent_len, + prefix_len, + })) + } +} + +/// A paragraph : a collection of [`FileLines`] that are to be formatted +/// plus info about the paragraph's indentation +/// +/// We only retain the String from the [`FileLine`]; the other info +/// is only there to help us in deciding how to merge lines into Paragraphs +#[derive(Debug)] +pub(super) struct Paragraph { + /// the lines of the file + lines: Vec<String>, + /// string representing the init, that is, the first line's indent + pub init_str: String, + /// printable length of the init string considering TABWIDTH + pub init_len: usize, + /// byte location of end of init in first line String + init_end: usize, + /// string representing indent + pub indent_str: String, + /// length of above + pub indent_len: usize, + /// byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward) + indent_end: usize, + /// we need to know if this is a mail header because we do word splitting differently in that case + pub mail_header: bool, +} + +/// An iterator producing a stream of paragraphs from a stream of lines +/// given a set of options. +pub(super) struct ParagraphStream<'a> { + lines: Peekable<FileLines<'a>>, + next_mail: bool, + opts: &'a FmtOptions, +} + +impl ParagraphStream<'_> { + pub(super) fn new<'b>(opts: &'b FmtOptions, text: &'b str) -> ParagraphStream<'b> { + let lines = FileLines::new(opts, text.lines()).peekable(); + // at the beginning of the file, we might find mail headers + ParagraphStream { + lines, + next_mail: true, + opts, + } + } + + /// Detect RFC822 mail header + fn is_mail_header(line: &FileLine) -> bool { + // a mail header begins with either "From " (envelope sender line) + // or with a sequence of printable ASCII chars (33 to 126, inclusive, + // except colon) followed by a colon. + if line.indent_end > 0 { + false + } else { + let l_slice = &line.line[..]; + if l_slice.starts_with("From ") { + true + } else { + let Some(colon_posn) = l_slice.find(':') else { + return false; + }; + + // header field must be nonzero length + if colon_posn == 0 { + return false; + } + + l_slice[..colon_posn] + .chars() + .all(|x| !matches!(x as usize, y if !(33..=126).contains(&y))) + } + } + } +} + +impl Iterator for ParagraphStream<'_> { + type Item = Result<Paragraph, String>; + + #[allow(clippy::cognitive_complexity)] + fn next(&mut self) -> Option<Result<Paragraph, String>> { + // return a NoFormatLine in an Err; it should immediately be output + let noformat = match self.lines.peek()? { + Line::FormatLine(_) => false, + Line::NoFormatLine(_, _) => true, + }; + + // found a NoFormatLine, immediately dump it out + if noformat { + let (s, nm) = self.lines.next().unwrap().get_noformatline(); + self.next_mail = nm; + return Some(Err(s)); + } + + // found a FormatLine, now build a paragraph + let mut init_str = String::new(); + let mut init_end = 0; + let mut init_len = 0; + let mut indent_str = String::new(); + let mut indent_end = 0; + let mut indent_len = 0; + let mut prefix_len = 0; + let mut prefix_indent_end = 0; + let mut p_lines = Vec::new(); + + let mut in_mail = false; + let mut second_done = false; // for when we use crown or tagged mode + loop { + // peek ahead + // need to explicitly force fl out of scope before we can call self.lines.next() + let Some(Line::FormatLine(fl)) = self.lines.peek() else { + break; + }; + + if p_lines.is_empty() { + // first time through the loop, get things set up + // detect mail header + if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) { + in_mail = true; + // there can't be any indent or prefixindent because otherwise is_mail_header + // would fail since there cannot be any whitespace before the colon in a + // valid header field + indent_str.push_str(" "); + indent_len = 2; + } else { + if self.opts.crown_margin || self.opts.tagged_paragraph { + init_str.push_str(&fl.line[..fl.indent_end]); + init_len = fl.indent_len; + init_end = fl.indent_end; + } else { + second_done = true; + } + + // these will be overwritten in the 2nd line of crown or tagged mode, but + // we are not guaranteed to get to the 2nd line, e.g., if the next line + // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around + indent_str.push_str(&fl.line[..fl.indent_end]); + indent_len = fl.indent_len; + indent_end = fl.indent_end; + + // save these to check for matching lines + prefix_len = fl.prefix_len; + prefix_indent_end = fl.prefix_indent_end; + + // in tagged mode, add 4 spaces of additional indenting by default + // (gnu fmt's behavior is different: it seems to find the closest column to + // indent_end that is divisible by 3. But honestly that behavior seems + // pretty arbitrary. + // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big. + if self.opts.tagged_paragraph { + indent_str.push_str(" "); + indent_len += 4; + } + } + } else if in_mail { + // lines following mail headers must begin with spaces + if fl.indent_end == 0 || (self.opts.prefix.is_some() && fl.prefix_indent_end == 0) { + break; // this line does not begin with spaces + } + } else if !second_done { + // now we have enough info to handle crown margin and tagged mode + + // in both crown and tagged modes we require that prefix_len is the same + if prefix_len != fl.prefix_len || prefix_indent_end != fl.prefix_indent_end { + break; + } + + // in tagged mode, indent has to be *different* on following lines + if self.opts.tagged_paragraph + && indent_len - 4 == fl.indent_len + && indent_end == fl.indent_end + { + break; + } + + // this is part of the same paragraph, get the indent info from this line + indent_str.clear(); + indent_str.push_str(&fl.line[..fl.indent_end]); + indent_len = fl.indent_len; + indent_end = fl.indent_end; + + second_done = true; + } else { + // detect mismatch + if indent_end != fl.indent_end + || prefix_indent_end != fl.prefix_indent_end + || indent_len != fl.indent_len + || prefix_len != fl.prefix_len + { + break; + } + } + + p_lines.push(self.lines.next().unwrap().get_formatline().line); + + // when we're in split-only mode, we never join lines, so stop here + if self.opts.split_only { + break; + } + } + + // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot. + // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank + // NoFormatLine. + self.next_mail = in_mail; + + Some(Ok(Paragraph { + lines: p_lines, + init_str, + init_len, + init_end, + indent_str, + indent_len, + indent_end, + mail_header: in_mail, + })) + } +} + +pub(super) struct ParaWords<'a> { + opts: &'a FmtOptions, + para: &'a Paragraph, + words: Vec<WordInfo<'a>>, +} + +impl<'a> ParaWords<'a> { + pub(super) fn new(opts: &'a FmtOptions, para: &'a Paragraph) -> Self { + let mut pw = ParaWords { + opts, + para, + words: Vec::new(), + }; + pw.create_words(); + pw + } + + fn create_words(&mut self) { + if self.para.mail_header { + // no extra spacing for mail headers; always exactly 1 space + // safe to trim_start on every line of a mail header, since the + // first line is guaranteed not to have any spaces + self.words.extend( + self.para + .lines + .iter() + .flat_map(|x| x.split_whitespace()) + .map(|x| WordInfo { + word: x, + word_start: 0, + word_nchars: x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped) + before_tab: None, + after_tab: 0, + sentence_start: false, + ends_punct: false, + new_line: false, + }), + ); + } else { + // first line + self.words + .extend(if self.opts.crown_margin || self.opts.tagged_paragraph { + // crown and tagged mode has the "init" in the first line, so slice from there + WordSplit::new(self.opts, &self.para.lines[0][self.para.init_end..]) + } else { + // otherwise we slice from the indent + WordSplit::new(self.opts, &self.para.lines[0][self.para.indent_end..]) + }); + + if self.para.lines.len() > 1 { + let indent_end = self.para.indent_end; + let opts = self.opts; + self.words.extend( + self.para + .lines + .iter() + .skip(1) + .flat_map(|x| WordSplit::new(opts, &x[indent_end..])), + ); + } + } + } + + pub(super) fn words(&'a self) -> Iter<'a, WordInfo<'a>> { + self.words.iter() + } +} + +struct WordSplit<'a> { + opts: &'a FmtOptions, + string: &'a str, + length: usize, + position: usize, + prev_punct: bool, +} + +impl WordSplit<'_> { + fn analyze_tabs(&self, string: &str) -> (Option<usize>, usize, Option<usize>) { + // given a string, determine (length before tab) and (printed length after first tab) + // if there are no tabs, beforetab = -1 and aftertab is the printed length + let mut beforetab = None; + let mut aftertab = 0; + let mut word_start = None; + for (os, c) in string.char_indices() { + if !c.is_whitespace() { + word_start = Some(os); + break; + } else if c == '\t' { + if beforetab.is_none() { + beforetab = Some(aftertab); + aftertab = 0; + } else { + aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth; + } + } else { + aftertab += 1; + } + } + (beforetab, aftertab, word_start) + } +} + +impl WordSplit<'_> { + fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> { + // wordsplits *must* start at a non-whitespace character + let trim_string = string.trim_start(); + WordSplit { + opts, + string: trim_string, + length: string.len(), + position: 0, + prev_punct: false, + } + } + + fn is_punctuation(c: char) -> bool { + matches!(c, '!' | '.' | '?') + } +} + +pub(super) struct WordInfo<'a> { + pub word: &'a str, + pub word_start: usize, + pub word_nchars: usize, + pub before_tab: Option<usize>, + pub after_tab: usize, + pub sentence_start: bool, + pub ends_punct: bool, + pub new_line: bool, +} + +// returns (&str, is_start_of_sentence) +impl<'a> Iterator for WordSplit<'a> { + type Item = WordInfo<'a>; + + fn next(&mut self) -> Option<WordInfo<'a>> { + if self.position >= self.length { + return None; + } + + let old_position = self.position; + let new_line = old_position == 0; + + // find the start of the next word, and record if we find a tab character + let (before_tab, after_tab, word_start) = + if let (b, a, Some(s)) = self.analyze_tabs(&self.string[old_position..]) { + (b, a, s + old_position) + } else { + self.position = self.length; + return None; + }; + + // find the beginning of the next whitespace + // note that this preserves the invariant that self.position + // points to whitespace character OR end of string + let mut word_nchars = 0; + self.position = match self.string[word_start..].find(|x: char| { + if x.is_whitespace() { + true + } else { + word_nchars += char_width(x); + false + } + }) { + None => self.length, + Some(s) => s + word_start, + }; + + let word_start_relative = word_start - old_position; + // if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence. + let is_start_of_sentence = + self.prev_punct && (before_tab.is_some() || word_start_relative > 1); + + // now record whether this word ends in punctuation + self.prev_punct = match self.string[..self.position].chars().next_back() { + Some(ch) => WordSplit::is_punctuation(ch), + _ => panic!("fatal: expected word not to be empty"), + }; + + let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform { + (&self.string[word_start..self.position], 0, None, 0) + } else { + ( + &self.string[old_position..self.position], + word_start_relative, + before_tab, + after_tab, + ) + }; + + Some(WordInfo { + word, + word_start: word_start_relative, + word_nchars, + before_tab, + after_tab, + sentence_start: is_start_of_sentence, + ends_punct: self.prev_punct, + new_line, + }) + } +} |