blob: 3d15d69ebf4058f147434700c63c46977b37085b [file] [log] [blame]
/* `L' command implementation for GNU sed, based on GNU fmt 1.22.
Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
/* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>. */
#include "sed.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <limits.h>
/* The following parameters represent the program's idea of what is
"best". Adjust to taste, subject to the caveats given. */
/* Prefer lines to be LEEWAY % shorter than the maximum width, giving
room for optimization. */
#define LEEWAY 7
/* Costs and bonuses are expressed as the equivalent departure from the
optimal line length, multiplied by 10. e.g. assigning something a
cost of 50 means that it is as bad as a line 5 characters too short
or too long. The definition of SHORT_COST(n) should not be changed.
However, EQUIV(n) may need tuning. */
typedef long COST;
#define MAXCOST (~(((unsigned long) 1) << (8 * sizeof (COST) -1)))
#define SQR(n) ((n) * (n))
#define EQUIV(n) SQR ((COST) (n))
/* Cost of a filled line n chars longer or shorter than best_width. */
#define SHORT_COST(n) EQUIV ((n) * 10)
/* Cost of the difference between adjacent filled lines. */
#define RAGGED_COST(n) (SHORT_COST (n) / 2)
/* Basic cost per line. */
#define LINE_COST EQUIV (70)
/* Cost of breaking a line after the first word of a sentence, where
the length of the word is N. */
#define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
/* Cost of breaking a line before the last word of a sentence, where
the length of the word is N. */
#define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
/* Bonus for breaking a line at the end of a sentence. */
#define SENTENCE_BONUS EQUIV (50)
/* Cost of breaking a line after a period not marking end of a sentence.
With the definition of sentence we are using (borrowed from emacs, see
get_line()) such a break would then look like a sentence break. Hence
we assign a very high cost -- it should be avoided unless things are
really bad. */
#define NOBREAK_COST EQUIV (600)
/* Bonus for breaking a line before open parenthesis. */
#define PAREN_BONUS EQUIV (40)
/* Bonus for breaking a line after other punctuation. */
#define PUNCT_BONUS EQUIV(40)
/* Credit for breaking a long paragraph one line later. */
#define LINE_CREDIT EQUIV(3)
/* Size of paragraph buffer in words. Longer paragraphs are handled
neatly (cf. flush_paragraph()), so there's little to gain by making
these larger. */
#define MAXWORDS 1000
#define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++)
/* Extra ctype(3)-style macros. */
#define isopen(c) (strchr ("([`'\"", (c)) != NULL)
#define isclose(c) (strchr (")]'\"", (c)) != NULL)
#define isperiod(c) (strchr (".?!", (c)) != NULL)
/* Size of a tab stop, for expansion on input and re-introduction on
output. */
#define TABWIDTH 8
/* Word descriptor structure. */
typedef struct Word WORD;
struct Word
{
/* Static attributes determined during input. */
const char *text; /* the text of the word */
short length; /* length of this word */
short space; /* the size of the following space */
unsigned paren:1; /* starts with open paren */
unsigned period:1; /* ends in [.?!])* */
unsigned punct:1; /* ends in punctuation */
unsigned final:1; /* end of sentence */
/* The remaining fields are computed during the optimization. */
short line_length; /* length of the best line starting here */
COST best_cost; /* cost of best paragraph starting here */
WORD *next_break; /* break which achieves best_cost */
};
/* Forward declarations. */
static bool get_paragraph (void);
static int get_line (int c);
static int get_space (int c);
static int copy_rest (int c);
static bool same_para (int c);
static void flush_paragraph (void);
static void fmt_paragraph (void);
static void check_punctuation (WORD *w);
static COST base_cost (WORD *this);
static COST line_cost (WORD *next, int len);
static void put_paragraph (WORD *finish);
static void put_line (WORD *w, int indent);
static void put_word (WORD *w);
static void put_space (int space);
/* Option values. */
/* User-supplied maximum line width (default WIDTH). The only output
lines
longer than this will each comprise a single word. */
static int max_width;
/* Space for the paragraph text. */
static const char *parabuf;
/* End of space for the paragraph text. */
static const char *end_of_parabuf;
/* The file on which we output */
static FILE *outfile;
/* Values derived from the option values. */
/* The preferred width of text lines, set to LEEWAY % less than max_width. */
static int best_width;
/* Dynamic variables. */
/* Start column of the character most recently read from the input file. */
static int in_column;
/* Start column of the next character to be written to stdout. */
static int out_column;
/* The words of a paragraph -- longer paragraphs are handled neatly
(cf. flush_paragraph()). */
static WORD words[MAXWORDS];
/* A pointer into the above word array, indicating the first position
after the last complete word. Sometimes it will point at an incomplete
word. */
static WORD *word_limit;
/* Indentation of the first line of the current paragraph. */
static int first_indent;
/* Indentation of other lines of the current paragraph */
static int other_indent;
/* The last character read from the input file. */
static int next_char;
/* If nonzero, the length of the last line output in the current
paragraph, used to charge for raggedness at the split point for long
paragraphs chosen by fmt_paragraph(). */
static int last_line_length;
/* read file F and send formatted output to stdout. */
void
fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
{
parabuf = line;
end_of_parabuf = line_end;
outfile = output_file;
max_width = max_length;
best_width = max_width * (201 - 2 * LEEWAY) / 200;
in_column = 0;
other_indent = 0;
next_char = GETC();
while (get_paragraph ())
{
fmt_paragraph ();
put_paragraph (word_limit);
}
}
/* Read a paragraph from input file F. A paragraph consists of a
maximal number of non-blank (excluding any prefix) lines
with the same indent.
Return false if end-of-file was encountered before the start of a
paragraph, else true. */
static bool
get_paragraph ()
{
register int c;
last_line_length = 0;
c = next_char;
/* Scan (and copy) blank lines, and lines not introduced by the prefix. */
while (c == '\n' || c == EOF)
{
c = copy_rest (c);
if (c == EOF)
{
next_char = EOF;
return false;
}
putc ('\n', outfile);
c = GETC();
}
/* Got a suitable first line for a paragraph. */
first_indent = in_column;
word_limit = words;
c = get_line (c);
/* Read rest of paragraph. */
other_indent = in_column;
while (same_para (c) && in_column == other_indent)
c = get_line (c);
(word_limit - 1)->period = (word_limit - 1)->final = true;
next_char = c;
return true;
}
/* Copy to the output a blank line. In the latter, C is \n or EOF.
Return the character (\n or EOF) ending the line. */
static int
copy_rest (register int c)
{
out_column = 0;
while (c != '\n' && c != EOF)
{
putc (c, outfile);
c = GETC();
}
return c;
}
/* Return true if a line whose first non-blank character after the
prefix (if any) is C could belong to the current paragraph,
otherwise false. */
static bool
same_para (register int c)
{
return (c != '\n' && c != EOF);
}
/* Read a line from the input data given first non-blank character C
after the prefix, and the following indent, and break it into words.
A word is a maximal non-empty string of non-white characters. A word
ending in [.?!]["')\]]* and followed by end-of-line or at least two
spaces ends a sentence, as in emacs.
Return the first non-blank character of the next line. */
static int
get_line (register int c)
{
int start;
register WORD *end_of_word;
end_of_word = &words[MAXWORDS - 2];
do
{ /* for each word in a line */
/* Scan word. */
word_limit->text = parabuf - 1;
do
c = GETC();
while (c != EOF && !ISSPACE (c));
word_limit->length = parabuf - word_limit->text - (c != EOF);
in_column += word_limit->length;
check_punctuation (word_limit);
/* Scan inter-word space. */
start = in_column;
c = get_space (c);
word_limit->space = in_column - start;
word_limit->final = (c == EOF
|| (word_limit->period
&& (c == '\n' || word_limit->space > 1)));
if (c == '\n' || c == EOF)
word_limit->space = word_limit->final ? 2 : 1;
if (word_limit == end_of_word)
flush_paragraph ();
word_limit++;
if (c == EOF)
{
in_column = first_indent;
return EOF;
}
}
while (c != '\n');
in_column = 0;
c = GETC();
return get_space (c);
}
/* Read blank characters from the input data, starting with C, and keeping
in_column up-to-date. Return first non-blank character. */
static int
get_space (register int c)
{
for (;;)
{
if (c == ' ')
in_column++;
else if (c == '\t')
in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
else
return c;
c = GETC();
}
}
/* Set extra fields in word W describing any attached punctuation. */
static void
check_punctuation (register WORD *w)
{
register const char *start, *finish;
start = w->text;
finish = start + (w->length - 1);
w->paren = isopen (*start);
w->punct = ISPUNCT (*finish);
while (isclose (*finish) && finish > start)
finish--;
w->period = isperiod (*finish);
}
/* Flush part of the paragraph to make room. This function is called on
hitting the limit on the number of words or characters. */
static void
flush_paragraph (void)
{
WORD *split_point;
register WORD *w;
COST best_break;
/* - format what you have so far as a paragraph,
- find a low-cost line break near the end,
- output to there,
- make that the start of the paragraph. */
fmt_paragraph ();
/* Choose a good split point. */
split_point = word_limit;
best_break = MAXCOST;
for (w = words->next_break; w != word_limit; w = w->next_break)
{
if (w->best_cost - w->next_break->best_cost < best_break)
{
split_point = w;
best_break = w->best_cost - w->next_break->best_cost;
}
if (best_break <= MAXCOST - LINE_CREDIT)
best_break += LINE_CREDIT;
}
put_paragraph (split_point);
/* Copy words from split_point down to word -- we use memmove because
the source and target may overlap. */
memmove ((char *) words, (char *) split_point,
(word_limit - split_point + 1) * sizeof (WORD));
word_limit -= split_point - words;
}
/* Compute the optimal formatting for the whole paragraph by computing
and remembering the optimal formatting for each suffix from the empty
one to the whole paragraph. */
static void
fmt_paragraph (void)
{
register WORD *start, *w;
register int len;
register COST wcost, best;
int saved_length;
word_limit->best_cost = 0;
saved_length = word_limit->length;
word_limit->length = max_width; /* sentinel */
for (start = word_limit - 1; start >= words; start--)
{
best = MAXCOST;
len = start == words ? first_indent : other_indent;
/* At least one word, however long, in the line. */
w = start;
len += w->length;
do
{
w++;
/* Consider breaking before w. */
wcost = line_cost (w, len) + w->best_cost;
if (start == words && last_line_length > 0)
wcost += RAGGED_COST (len - last_line_length);
if (wcost < best)
{
best = wcost;
start->next_break = w;
start->line_length = len;
}
len += (w - 1)->space + w->length; /* w > start >= words */
}
while (len < max_width);
start->best_cost = best + base_cost (start);
}
word_limit->length = saved_length;
}
/* Return the constant component of the cost of breaking before the
word THIS. */
static COST
base_cost (register WORD *this)
{
register COST cost;
cost = LINE_COST;
if (this > words)
{
if ((this - 1)->period)
{
if ((this - 1)->final)
cost -= SENTENCE_BONUS;
else
cost += NOBREAK_COST;
}
else if ((this - 1)->punct)
cost -= PUNCT_BONUS;
else if (this > words + 1 && (this - 2)->final)
cost += WIDOW_COST ((this - 1)->length);
}
if (this->paren)
cost -= PAREN_BONUS;
else if (this->final)
cost += ORPHAN_COST (this->length);
return cost;
}
/* Return the component of the cost of breaking before word NEXT that
depends on LEN, the length of the line beginning there. */
static COST
line_cost (register WORD *next, register int len)
{
register int n;
register COST cost;
if (next == word_limit)
return 0;
n = best_width - len;
cost = SHORT_COST (n);
if (next->next_break != word_limit)
{
n = len - next->line_length;
cost += RAGGED_COST (n);
}
return cost;
}
/* Output to stdout a paragraph from word up to (but not including)
FINISH, which must be in the next_break chain from word. */
static void
put_paragraph (register WORD *finish)
{
register WORD *w;
put_line (words, first_indent);
for (w = words->next_break; w != finish; w = w->next_break)
put_line (w, other_indent);
}
/* Output to stdout the line beginning with word W, beginning in column
INDENT, including the prefix (if any). */
static void
put_line (register WORD *w, int indent)
{
register WORD *endline;
out_column = 0;
put_space (indent);
endline = w->next_break - 1;
for (; w != endline; w++)
{
put_word (w);
put_space (w->space);
}
put_word (w);
last_line_length = out_column;
putc ('\n', outfile);
}
/* Output to stdout the word W. */
static void
put_word (register WORD *w)
{
register const char *s;
register int n;
s = w->text;
for (n = w->length; n != 0; n--)
putc (*s++, outfile);
out_column += w->length;
}
/* Output to stdout SPACE spaces, or equivalent tabs. */
static void
put_space (int space)
{
out_column += space;
while (space--)
putc (' ', outfile);
}