blob: 8ce9faad34907f30df7b9c16ad59ade9789345c6 [file] [log] [blame]
//! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
//!
//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logoColor=white&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K
//!
//! <br>
//!
//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
//!
//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
//! The diff implementation is based on [Myers' diff algorithm] but includes
//! some [semantic cleanups] to increase human readability by factoring out
//! commonalities which are likely to be coincidental.
//!
//! Diff Match Patch was originally built in 2006 to power Google Docs.
//!
//! # Interface
//!
//! Here is the entire API of the Rust implementation. It operates on borrowed
//! strings and the return value of the diff algorithm is a vector of chunks
//! pointing into slices of those input strings.
//!
//! ```
//! pub enum Chunk<'a> {
//! Equal(&'a str),
//! Delete(&'a str),
//! Insert(&'a str),
//! }
//!
//! # const IGNORE: &str = stringify! {
//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
//! # };
//! ```
//!
//! [Diff Match Patch]: https://github.com/google/diff-match-patch
//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.4")]
#![allow(
clippy::blocks_in_if_conditions,
clippy::cast_possible_wrap,
clippy::cast_sign_loss,
clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
clippy::collapsible_else_if,
clippy::comparison_chain,
clippy::match_same_arms,
clippy::module_name_repetitions,
clippy::must_use_candidate,
clippy::new_without_default,
clippy::octal_escapes,
clippy::shadow_unrelated,
clippy::similar_names,
clippy::too_many_lines,
clippy::unseparated_literal_suffix,
unused_parens, // false positive on Some(&(mut diff)) pattern
)]
mod find;
mod range;
#[cfg(test)]
mod tests;
use crate::range::{bytes, str, Range};
use std::cmp;
use std::collections::VecDeque;
use std::fmt::{self, Debug};
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum Chunk<'a> {
Equal(&'a str),
Delete(&'a str),
Insert(&'a str),
}
#[derive(Copy, Clone)]
enum Diff<'a, 'b> {
Equal(Range<'a>, Range<'b>),
Delete(Range<'a>),
Insert(Range<'b>),
}
impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
fn text(&self) -> Range<'tmp> {
match *self {
Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
}
}
fn grow_left(&mut self, increment: usize) {
self.for_each(|range| {
range.offset -= increment;
range.len += increment;
});
}
fn grow_right(&mut self, increment: usize) {
self.for_each(|range| range.len += increment);
}
fn shift_left(&mut self, increment: usize) {
self.for_each(|range| range.offset -= increment);
}
fn shift_right(&mut self, increment: usize) {
self.for_each(|range| range.offset += increment);
}
fn for_each(&mut self, f: impl Fn(&mut Range)) {
match self {
Diff::Equal(range1, range2) => {
f(range1);
f(range2);
}
Diff::Delete(range) => f(range),
Diff::Insert(range) => f(range),
}
}
}
pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
let text1 = Range::new(text1, ..);
let text2 = Range::new(text2, ..);
let mut solution = main(text1, text2);
cleanup_char_boundary(&mut solution);
cleanup_semantic(&mut solution);
cleanup_merge(&mut solution);
solution.diffs.into_iter().map(Chunk::from).collect()
}
struct Solution<'a, 'b> {
text1: Range<'a>,
text2: Range<'b>,
diffs: Vec<Diff<'a, 'b>>,
utf8: bool,
}
fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
let whole1 = text1;
let whole2 = text2;
// Trim off common prefix.
let common_prefix_len = common_prefix_bytes(text1, text2);
let common_prefix = Diff::Equal(
text1.substring(..common_prefix_len),
text2.substring(..common_prefix_len),
);
text1 = text1.substring(common_prefix_len..);
text2 = text2.substring(common_prefix_len..);
// Trim off common suffix.
let common_suffix_len = common_suffix_bytes(text1, text2);
let common_suffix = Diff::Equal(
text1.substring(text1.len - common_suffix_len..),
text2.substring(text2.len - common_suffix_len..),
);
text1 = text1.substring(..text1.len - common_suffix_len);
text2 = text2.substring(..text2.len - common_suffix_len);
// Compute the diff on the middle block.
let mut solution = Solution {
text1: whole1,
text2: whole2,
diffs: compute(text1, text2),
utf8: false,
};
// Restore the prefix and suffix.
if common_prefix_len > 0 {
solution.diffs.insert(0, common_prefix);
}
if common_suffix_len > 0 {
solution.diffs.push(common_suffix);
}
cleanup_merge(&mut solution);
solution
}
// Find the differences between two texts. Assumes that the texts do not have
// any common prefix or suffix.
fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
match (text1.is_empty(), text2.is_empty()) {
(true, true) => return Vec::new(),
(true, false) => return vec![Diff::Insert(text2)],
(false, true) => return vec![Diff::Delete(text1)],
(false, false) => {}
}
// Check for entire shorter text inside the longer text.
if text1.len > text2.len {
if let Some(i) = text1.find(text2) {
return vec![
Diff::Delete(text1.substring(..i)),
Diff::Equal(text1.substring(i..i + text2.len), text2),
Diff::Delete(text1.substring(i + text2.len..)),
];
}
} else {
if let Some(i) = text2.find(text1) {
return vec![
Diff::Insert(text2.substring(..i)),
Diff::Equal(text1, text2.substring(i..i + text1.len)),
Diff::Insert(text2.substring(i + text1.len..)),
];
}
}
if text1.len == 1 || text2.len == 1 {
// Single character string.
// After the previous check, the character can't be an equality.
return vec![Diff::Delete(text1), Diff::Insert(text2)];
}
bisect(text1, text2)
}
// Find the 'middle snake' of a diff, split the problem in two and return the
// recursively constructed diff.
//
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
let max_d = (text1.len + text2.len + 1) / 2;
let v_offset = max_d;
let v_len = 2 * max_d;
let mut v1 = vec![-1isize; v_len];
let mut v2 = vec![-1isize; v_len];
v1[v_offset + 1] = 0;
v2[v_offset + 1] = 0;
let delta = text1.len as isize - text2.len as isize;
// If the total number of characters is odd, then the front path will
// collide with the reverse path.
let front = delta % 2 != 0;
// Offsets for start and end of k loop.
// Prevents mapping of space beyond the grid.
let mut k1start = 0;
let mut k1end = 0;
let mut k2start = 0;
let mut k2end = 0;
for d in 0..max_d as isize {
// Walk the front path one step.
let mut k1 = -d + k1start;
while k1 <= d - k1end {
let k1_offset = (v_offset as isize + k1) as usize;
let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
v1[k1_offset + 1]
} else {
v1[k1_offset - 1] + 1
} as usize;
let mut y1 = (x1 as isize - k1) as usize;
if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
let advance = common_prefix_bytes(s1, s2);
x1 += advance;
y1 += advance;
}
v1[k1_offset] = x1 as isize;
if x1 > text1.len {
// Ran off the right of the graph.
k1end += 2;
} else if y1 > text2.len {
// Ran off the bottom of the graph.
k1start += 2;
} else if front {
let k2_offset = v_offset as isize + delta - k1;
if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
// Mirror x2 onto top-left coordinate system.
let x2 = text1.len as isize - v2[k2_offset as usize];
if x1 as isize >= x2 {
// Overlap detected.
return bisect_split(text1, text2, x1, y1);
}
}
}
k1 += 2;
}
// Walk the reverse path one step.
let mut k2 = -d + k2start;
while k2 <= d - k2end {
let k2_offset = (v_offset as isize + k2) as usize;
let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
v2[k2_offset + 1]
} else {
v2[k2_offset - 1] + 1
} as usize;
let mut y2 = (x2 as isize - k2) as usize;
if x2 < text1.len && y2 < text2.len {
let advance = common_suffix_bytes(
text1.substring(..text1.len - x2),
text2.substring(..text2.len - y2),
);
x2 += advance;
y2 += advance;
}
v2[k2_offset] = x2 as isize;
if x2 > text1.len {
// Ran off the left of the graph.
k2end += 2;
} else if y2 > text2.len {
// Ran off the top of the graph.
k2start += 2;
} else if !front {
let k1_offset = v_offset as isize + delta - k2;
if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
let x1 = v1[k1_offset as usize] as usize;
let y1 = v_offset + x1 - k1_offset as usize;
// Mirror x2 onto top-left coordinate system.
x2 = text1.len - x2;
if x1 >= x2 {
// Overlap detected.
return bisect_split(text1, text2, x1, y1);
}
}
}
k2 += 2;
}
}
// Number of diffs equals number of characters, no commonality at all.
vec![Diff::Delete(text1), Diff::Insert(text2)]
}
// Given the location of the 'middle snake', split the diff in two parts and
// recurse.
fn bisect_split<'a, 'b>(
text1: Range<'a>,
text2: Range<'b>,
x: usize,
y: usize,
) -> Vec<Diff<'a, 'b>> {
let (text1a, text1b) = text1.split_at(x);
let (text2a, text2b) = text2.split_at(y);
// Compute both diffs serially.
let mut diffs = main(text1a, text2a).diffs;
diffs.extend(main(text1b, text2b).diffs);
diffs
}
// Determine the length of the common prefix of two strings.
fn common_prefix(text1: Range, text2: Range) -> usize {
for ((i, ch1), ch2) in text1.char_indices().zip(text2.chars()) {
if ch1 != ch2 {
return i;
}
}
cmp::min(text1.len, text2.len)
}
// Determine the length of the common suffix of two strings.
fn common_suffix(text1: Range, text2: Range) -> usize {
for ((i, ch1), ch2) in text1.char_indices().rev().zip(text2.chars().rev()) {
if ch1 != ch2 {
return text1.len - i - ch1.len_utf8();
}
}
cmp::min(text1.len, text2.len)
}
fn common_prefix_bytes(text1: Range, text2: Range) -> usize {
for (i, (b1, b2)) in text1.bytes().zip(text2.bytes()).enumerate() {
if b1 != b2 {
return i;
}
}
cmp::min(text1.len, text2.len)
}
fn common_suffix_bytes(text1: Range, text2: Range) -> usize {
for (i, (b1, b2)) in text1.bytes().rev().zip(text2.bytes().rev()).enumerate() {
if b1 != b2 {
return i;
}
}
cmp::min(text1.len, text2.len)
}
// Determine if the suffix of one string is the prefix of another.
//
// Returns the number of characters common to the end of the first string and
// the start of the second string.
fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
// Eliminate the null case.
if text1.is_empty() || text2.is_empty() {
return 0;
}
// Truncate the longer string.
if text1.len > text2.len {
text1 = text1.substring(text1.len - text2.len..);
} else if text1.len < text2.len {
text2 = text2.substring(..text1.len);
}
// Quick check for the worst case.
if bytes(text1) == bytes(text2) {
return text1.len;
}
// Start by looking for a single character match
// and increase length until no match is found.
// Performance analysis: https://neil.fraser.name/news/2010/11/04/
let mut best = 0;
let mut length = 1;
loop {
let pattern = text1.substring(text1.len - length..);
let found = match text2.find(pattern) {
Some(found) => found,
None => return best,
};
length += found;
if found == 0
|| bytes(text1.substring(text1.len - length..)) == bytes(text2.substring(..length))
{
best = length;
length += 1;
}
}
}
fn cleanup_char_boundary(solution: &mut Solution) {
fn boundary_down(doc: &str, pos: usize) -> usize {
let mut adjust = 0;
while !doc.is_char_boundary(pos - adjust) {
adjust += 1;
}
adjust
}
fn boundary_up(doc: &str, pos: usize) -> usize {
let mut adjust = 0;
while !doc.is_char_boundary(pos + adjust) {
adjust += 1;
}
adjust
}
fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
let prev_end = prev.offset + prev.len;
if prev_end > range.offset {
let delta = cmp::min(prev_end - range.offset, range.len);
range.offset += delta;
range.len -= delta;
}
}
let mut read = 0;
let mut retain = 0;
let mut last_delete = Range::empty();
let mut last_insert = Range::empty();
while let Some(&(mut diff)) = solution.diffs.get(read) {
read += 1;
match &mut diff {
Diff::Equal(range1, range2) => {
let adjust = boundary_up(range1.doc, range1.offset);
// If the whole range is sub-character, skip it.
if range1.len <= adjust {
continue;
}
range1.offset += adjust;
range1.len -= adjust;
range2.offset += adjust;
range2.len -= adjust;
let adjust = boundary_down(range1.doc, range1.offset + range1.len);
range1.len -= adjust;
range2.len -= adjust;
last_delete = Range::empty();
last_insert = Range::empty();
}
Diff::Delete(range) => {
skip_overlap(&last_delete, range);
if range.len == 0 {
continue;
}
let adjust = boundary_down(range.doc, range.offset);
range.offset -= adjust;
range.len += adjust;
let adjust = boundary_up(range.doc, range.offset + range.len);
range.len += adjust;
last_delete = *range;
}
Diff::Insert(range) => {
skip_overlap(&last_insert, range);
if range.len == 0 {
continue;
}
let adjust = boundary_down(range.doc, range.offset);
range.offset -= adjust;
range.len += adjust;
let adjust = boundary_up(range.doc, range.offset + range.len);
range.len += adjust;
last_insert = *range;
}
}
solution.diffs[retain] = diff;
retain += 1;
}
solution.diffs.truncate(retain);
solution.utf8 = true;
}
// Reduce the number of edits by eliminating semantically trivial equalities.
fn cleanup_semantic(solution: &mut Solution) {
let mut diffs = &mut solution.diffs;
if diffs.is_empty() {
return;
}
let mut changes = false;
let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
let mut last_equality = None; // Always equal to equalities.peek().text
let mut pointer = 0;
// Number of characters that changed prior to the equality.
let mut len_insertions1 = 0;
let mut len_deletions1 = 0;
// Number of characters that changed after the equality.
let mut len_insertions2 = 0;
let mut len_deletions2 = 0;
while let Some(&this_diff) = diffs.get(pointer) {
match this_diff {
Diff::Equal(text1, text2) => {
equalities.push_back(pointer);
len_insertions1 = len_insertions2;
len_deletions1 = len_deletions2;
len_insertions2 = 0;
len_deletions2 = 0;
last_equality = Some((text1, text2));
pointer += 1;
continue;
}
Diff::Delete(text) => len_deletions2 += text.len,
Diff::Insert(text) => len_insertions2 += text.len,
}
// Eliminate an equality that is smaller or equal to the edits on both
// sides of it.
if last_equality.map_or(false, |(last_equality, _)| {
last_equality.len <= cmp::max(len_insertions1, len_deletions1)
&& last_equality.len <= cmp::max(len_insertions2, len_deletions2)
}) {
// Jump back to offending equality.
pointer = equalities.pop_back().unwrap();
// Replace equality with a delete.
diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
// Insert a corresponding insert.
diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
len_insertions1 = 0; // Reset the counters.
len_insertions2 = 0;
len_deletions1 = 0;
len_deletions2 = 0;
last_equality = None;
changes = true;
// Throw away the previous equality (it needs to be reevaluated).
equalities.pop_back();
if let Some(back) = equalities.back() {
// There is a safe equality we can fall back to.
pointer = *back;
} else {
// There are no previous equalities, jump back to the start.
pointer = 0;
continue;
}
}
pointer += 1;
}
// Normalize the diff.
if changes {
cleanup_merge(solution);
}
cleanup_semantic_lossless(solution);
diffs = &mut solution.diffs;
// Find any overlaps between deletions and insertions.
// e.g: <del>abcxxx</del><ins>xxxdef</ins>
// -> <del>abc</del>xxx<ins>def</ins>
// e.g: <del>xxxabc</del><ins>defxxx</ins>
// -> <ins>def</ins>xxx<del>abc</del>
// Only extract an overlap if it is as big as the edit ahead or behind it.
let mut pointer = 1;
while let Some(&this_diff) = diffs.get(pointer) {
let prev_diff = diffs[pointer - 1];
if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
let overlap_len1 = common_overlap(deletion, insertion);
let overlap_len2 = common_overlap(insertion, deletion);
let overlap_min = cmp::min(deletion.len, insertion.len);
if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
// Overlap found. Insert an equality and trim the surrounding edits.
diffs.insert(
pointer,
Diff::Equal(
deletion.substring(deletion.len - overlap_len1..deletion.len),
insertion.substring(..overlap_len1),
),
);
diffs[pointer - 1] =
Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
} else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
// Reverse overlap found.
// Insert an equality and swap and trim the surrounding edits.
diffs.insert(
pointer,
Diff::Equal(
deletion.substring(..overlap_len2),
insertion.substring(insertion.len - overlap_len2..insertion.len),
),
);
diffs[pointer - 1] =
Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
}
pointer += 1;
}
pointer += 1;
}
}
// Look for single edits surrounded on both sides by equalities which can be
// shifted sideways to align the edit to a word boundary.
//
// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
fn cleanup_semantic_lossless(solution: &mut Solution) {
let diffs = &mut solution.diffs;
let mut pointer = 1;
while let Some(&next_diff) = diffs.get(pointer + 1) {
let prev_diff = diffs[pointer - 1];
if let (
Diff::Equal(mut prev_equal1, mut prev_equal2),
Diff::Equal(mut next_equal1, mut next_equal2),
) = (prev_diff, next_diff)
{
// This is a single edit surrounded by equalities.
let mut edit = diffs[pointer];
// First, shift the edit as far left as possible.
let common_offset = common_suffix(prev_equal1, edit.text());
let original_prev_len = prev_equal1.len;
prev_equal1.len -= common_offset;
prev_equal2.len -= common_offset;
edit.shift_left(common_offset);
next_equal1.offset -= common_offset;
next_equal1.len += common_offset;
next_equal2.offset -= common_offset;
next_equal2.len += common_offset;
// Second, step character by character right, looking for the best fit.
let mut best_prev_equal = (prev_equal1, prev_equal2);
let mut best_edit = edit;
let mut best_next_equal = (next_equal1, next_equal2);
let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
+ cleanup_semantic_score(edit.text(), next_equal1);
while !edit.text().is_empty()
&& !next_equal1.is_empty()
&& edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
{
let increment = edit.text().chars().next().unwrap().len_utf8();
prev_equal1.len += increment;
prev_equal2.len += increment;
edit.shift_right(increment);
next_equal1.offset += increment;
next_equal1.len -= increment;
next_equal2.offset += increment;
next_equal2.len -= increment;
let score = cleanup_semantic_score(prev_equal1, edit.text())
+ cleanup_semantic_score(edit.text(), next_equal1);
// The >= encourages trailing rather than leading whitespace on edits.
if score >= best_score {
best_score = score;
best_prev_equal = (prev_equal1, prev_equal2);
best_edit = edit;
best_next_equal = (next_equal1, next_equal2);
}
}
if original_prev_len != best_prev_equal.0.len {
// We have an improvement, save it back to the diff.
if best_next_equal.0.is_empty() {
diffs.remove(pointer + 1);
} else {
diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
}
diffs[pointer] = best_edit;
if best_prev_equal.0.is_empty() {
diffs.remove(pointer - 1);
pointer -= 1;
} else {
diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
}
}
}
pointer += 1;
}
}
// Given two strings, compute a score representing whether the internal boundary
// falls on logical boundaries.
//
// Scores range from 6 (best) to 0 (worst).
fn cleanup_semantic_score(one: Range, two: Range) -> usize {
if one.is_empty() || two.is_empty() {
// Edges are the best.
return 6;
}
// Each port of this function behaves slightly differently due to subtle
// differences in each language's definition of things like 'whitespace'.
// Since this function's purpose is largely cosmetic, the choice has been
// made to use each language's native features rather than force total
// conformity.
let char1 = one.chars().next_back().unwrap();
let char2 = two.chars().next().unwrap();
let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
let line_break1 = whitespace1 && char1.is_control();
let line_break2 = whitespace2 && char2.is_control();
let blank_line1 = line_break1 && (one.ends_with("\n\n") || one.ends_with("\n\r\n"));
let blank_line2 = line_break2 && (two.starts_with("\n\n") || two.starts_with("\r\n\r\n"));
if blank_line1 || blank_line2 {
// Five points for blank lines.
5
} else if line_break1 || line_break2 {
// Four points for line breaks.
4
} else if non_alphanumeric1 && !whitespace1 && whitespace2 {
// Three points for end of sentences.
3
} else if whitespace1 || whitespace2 {
// Two points for whitespace.
2
} else if non_alphanumeric1 || non_alphanumeric2 {
// One point for non-alphanumeric.
1
} else {
0
}
}
// Reorder and merge like edit sections. Merge equalities. Any edit section can
// move as long as it doesn't cross an equality.
fn cleanup_merge(solution: &mut Solution) {
let diffs = &mut solution.diffs;
let common_prefix = if solution.utf8 {
common_prefix
} else {
common_prefix_bytes
};
let common_suffix = if solution.utf8 {
common_suffix
} else {
common_suffix_bytes
};
loop {
if diffs.is_empty() {
return;
}
diffs.push(Diff::Equal(
solution.text1.substring(solution.text1.len..),
solution.text2.substring(solution.text2.len..),
)); // Add a dummy entry at the end.
let mut pointer = 0;
let mut count_delete = 0;
let mut count_insert = 0;
let mut text_delete = Range::empty();
let mut text_insert = Range::empty();
while let Some(&this_diff) = diffs.get(pointer) {
match this_diff {
Diff::Insert(text) => {
count_insert += 1;
if text_insert.is_empty() {
text_insert = text;
} else {
text_insert.len += text.len;
}
}
Diff::Delete(text) => {
count_delete += 1;
if text_delete.is_empty() {
text_delete = text;
} else {
text_delete.len += text.len;
}
}
Diff::Equal(text, _) => {
let count_both = count_delete + count_insert;
if count_both > 1 {
let both_types = count_delete != 0 && count_insert != 0;
// Delete the offending records.
diffs.splice(pointer - count_both..pointer, None);
pointer -= count_both;
if both_types {
// Factor out any common prefix.
let common_length = common_prefix(text_insert, text_delete);
if common_length != 0 {
if pointer > 0 {
match &mut diffs[pointer - 1] {
Diff::Equal(this_diff1, this_diff2) => {
this_diff1.len += common_length;
this_diff2.len += common_length;
}
_ => unreachable!(
"previous diff should have been an equality"
),
}
} else {
diffs.insert(
pointer,
Diff::Equal(
text_delete.substring(..common_length),
text_insert.substring(..common_length),
),
);
pointer += 1;
}
text_insert = text_insert.substring(common_length..);
text_delete = text_delete.substring(common_length..);
}
// Factor out any common suffix.
let common_length = common_suffix(text_insert, text_delete);
if common_length != 0 {
diffs[pointer].grow_left(common_length);
text_insert.len -= common_length;
text_delete.len -= common_length;
}
}
// Insert the merged records.
if !text_delete.is_empty() {
diffs.insert(pointer, Diff::Delete(text_delete));
pointer += 1;
}
if !text_insert.is_empty() {
diffs.insert(pointer, Diff::Insert(text_insert));
pointer += 1;
}
} else if pointer > 0 {
if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
diffs.get_mut(pointer - 1)
{
// Merge this equality with the previous one.
prev_equal1.len += text.len;
prev_equal2.len += text.len;
diffs.remove(pointer);
pointer -= 1;
}
}
count_insert = 0;
count_delete = 0;
text_delete = Range::empty();
text_insert = Range::empty();
}
}
pointer += 1;
}
if diffs.last().unwrap().text().is_empty() {
diffs.pop(); // Remove the dummy entry at the end.
}
// Second pass: look for single edits surrounded on both sides by equalities
// which can be shifted sideways to eliminate an equality.
// e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
let mut changes = false;
let mut pointer = 1;
// Intentionally ignore the first and last element (don't need checking).
while let Some(&next_diff) = diffs.get(pointer + 1) {
let prev_diff = diffs[pointer - 1];
let this_diff = diffs[pointer];
if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
// This is a single edit surrounded by equalities.
if this_diff.text().ends_with(prev_diff) {
// Shift the edit over the previous equality.
diffs[pointer].shift_left(prev_diff.len);
diffs[pointer + 1].grow_left(prev_diff.len);
diffs.remove(pointer - 1); // Delete prev_diff.
changes = true;
} else if this_diff.text().starts_with(next_diff) {
// Shift the edit over the next equality.
diffs[pointer - 1].grow_right(next_diff.len);
diffs[pointer].shift_right(next_diff.len);
diffs.remove(pointer + 1); // Delete next_diff.
changes = true;
}
}
pointer += 1;
}
// If shifts were made, the diff needs reordering and another shift sweep.
if !changes {
return;
}
}
}
impl Debug for Chunk<'_> {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
let (name, text) = match *self {
Chunk::Equal(text) => ("Equal", text),
Chunk::Delete(text) => ("Delete", text),
Chunk::Insert(text) => ("Insert", text),
};
write!(formatter, "{}({:?})", name, text)
}
}
impl Debug for Diff<'_, '_> {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
let (name, bytes) = match *self {
Diff::Equal(range, _) => ("Equal", bytes(range)),
Diff::Delete(range) => ("Delete", bytes(range)),
Diff::Insert(range) => ("Insert", bytes(range)),
};
let text = String::from_utf8_lossy(bytes);
write!(formatter, "{}({:?})", name, text)
}
}
impl<'a> From<Diff<'a, 'a>> for Chunk<'a> {
fn from(diff: Diff<'a, 'a>) -> Self {
match diff {
Diff::Equal(range, _) => Chunk::Equal(str(range)),
Diff::Delete(range) => Chunk::Delete(str(range)),
Diff::Insert(range) => Chunk::Insert(str(range)),
}
}
}