blob: 43e7898519a5809c130a4e5a52cc66264a94a49b [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.archivepatcher.generator.bsdiff;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
// TODO(andrewhayden) clean up the various generatePatch(...) methods, there are too many.
/**
* A helper class that handles the main BsDiff I/O and patch generation, by calling into the main
* algorithm implementation in {@link BsDiff}
*/
public class BsDiffPatchWriter {
static final int DEFAULT_MINIMUM_MATCH_LENGTH = 16;
/**
* Write a patch entry.
*
* @param newData
* @param oldData
* @param newPosition the first byte in |newData| to write either raw or as a diff against
* |oldData|.
* @param oldPosition the first byte in |oldData| to diff against |newData|. Ignored if
* |diffLength| is empty.
* @param diffLength the number of bytes to diff between |newData| and |oldData| starting at
* |newPosition| and |oldPosition| respectively.
* @param extraLength the number of bytes from |newData| to write starting at |newPosition +
* diffLength|.
* @param oldPositionOffsetForNextEntry the offset between |oldPosition| for the next entry and
* |oldPosition| + |diffLength| for this entry.
* @param outputStream the output stream to write the patch entry to.
* @throws IOException if unable to read or write data
*/
private static void writeEntry(
RandomAccessObject newData,
RandomAccessObject oldData,
int newPosition,
int oldPosition,
int diffLength,
int extraLength,
int oldPositionOffsetForNextEntry,
OutputStream outputStream)
throws IOException {
// Write control data
BsUtil.writeFormattedLong(diffLength, outputStream);
BsUtil.writeFormattedLong(extraLength, outputStream);
BsUtil.writeFormattedLong(oldPositionOffsetForNextEntry, outputStream);
newData.seek(newPosition);
oldData.seek(oldPosition);
// Write diff data
for (int i = 0; i < diffLength; ++i) {
// TODO(hartmanng): test using a small buffer to insulate read() calls (and write() for that
// matter).
outputStream.write(newData.readUnsignedByte() - oldData.readUnsignedByte());
}
if (extraLength > 0) {
// This seek will throw an IOException sometimes, if we try to seek to the byte after
// the end of the RandomAccessObject.
newData.seek(newPosition + diffLength);
// Write extra data
for (int i = 0; i < extraLength; ++i) {
// TODO(hartmanng): same as above - test buffering readByte().
outputStream.write(newData.readByte());
}
}
}
/**
* Generate a BsDiff patch given a Matcher.
*
* @param oldData the old blob
* @param newData the new blob
* @param matcher a Matcher to find binary matches between oldData and newData
* @param outputStream the outputStream for the new generated patch
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
// Visible for testing only
static void generatePatchWithMatcher(
RandomAccessObject oldData,
RandomAccessObject newData,
Matcher matcher,
OutputStream outputStream)
throws IOException, InterruptedException {
// Compute the differences, writing ctrl as we go
int lastNewPosition = 0;
int lastOldPosition = 0;
int newPosition = 0;
int oldPosition = 0;
while (newPosition < newData.length()) {
if (Thread.interrupted()) {
throw new InterruptedException();
}
Matcher.NextMatch nextMatch = matcher.next();
if (nextMatch.didFindMatch) {
newPosition = nextMatch.newPosition;
oldPosition = nextMatch.oldPosition;
} else {
newPosition = (int) newData.length();
}
// Extend the current match (|newPosition|, |oldPosition|) backward such that 50% of the bytes
// match. We have written diff / extra data up till |lastNewPosition| so we cannot extend
// further back than |lastNewPosition|.
int backwardExtension = 0;
if (newPosition < newData.length()) {
int score = 0;
int bestScore = 0;
for (int i = 1; newPosition - i >= lastNewPosition && oldPosition >= i; ++i) {
oldData.seek(oldPosition - i);
newData.seek(newPosition - i);
if (oldData.readByte() == newData.readByte()) {
++score;
} else {
--score;
}
if (score > bestScore) {
bestScore = score;
backwardExtension = i;
}
}
}
// Extend the previous match (|lastNewPosition|, |lastOldPosition|) forward such that 50% of
// the bytes match. (|lastNewPosition|, |lastOldPosition|) were extended backward in the
// previous iteration of the loop.
int forwardExtension = 0;
{
int score = 0;
int bestScore = 0;
oldData.seek(lastOldPosition);
newData.seek(lastNewPosition);
for (int i = 0;
lastNewPosition + i < newPosition && lastOldPosition + i < oldData.length();
++i) {
if (oldData.readByte() == newData.readByte()) {
++score;
} else {
--score;
}
if (score > bestScore) {
bestScore = score;
forwardExtension = i + 1;
}
}
}
// Adjust |backwardExtension| and |forwardExtension| such that the extended matches do
// not intersect in |newData|. They can intersect in |oldData|.
int overlap = (lastNewPosition + forwardExtension) - (newPosition - backwardExtension);
if (overlap > 0) {
int score = 0;
int bestScore = 0;
int backwardExtensionDecrement = 0;
for (int i = 0; i < overlap; ++i) {
newData.seek(lastNewPosition + forwardExtension - overlap + i);
oldData.seek(lastOldPosition + forwardExtension - overlap + i);
if (newData.readByte() == oldData.readByte()) {
++score;
}
newData.seek(newPosition - backwardExtension + i);
oldData.seek(oldPosition - backwardExtension + i);
if (newData.readByte() == oldData.readByte()) {
--score;
}
if (score > bestScore) {
bestScore = score;
backwardExtensionDecrement = i + 1;
}
}
forwardExtension -= overlap - backwardExtensionDecrement;
backwardExtension -= backwardExtensionDecrement;
}
// Write an entry with:
// - The diff between |newData| and |oldData| for the previous extended match:
// oldData[lastOldPosition ... lastOldPosition + forwardExtension - 1] and
// newData[lastNewPosition ... lastNewPosition + forwardExtension - 1].
// - The bytes in |newData| between |lastNewPosition| and |newPosition| which are part of
// neither the previous extended match or the new extended match:
// newData[lastNewPosition + forwardExtension ... newPosition - backwardExtension - 1]
int oldPositionOffset = 0;
if (newPosition < newData.length()) {
// The offset from the byte after the last byte of the previous match in |newData| to the
// first byte of the new match in |oldData|.
oldPositionOffset =
(oldPosition - backwardExtension) - (lastOldPosition + forwardExtension);
}
// The number of bytes in |newData| between |lastNewPosition| and |newPosition| which are part
// of neither the previous extended match or the new extended match.
int newNoMatchLength =
(newPosition - backwardExtension) - (lastNewPosition + forwardExtension);
writeEntry(
newData,
oldData,
lastNewPosition,
lastOldPosition,
forwardExtension,
newNoMatchLength,
oldPositionOffset,
outputStream);
lastNewPosition = newPosition - backwardExtension;
lastOldPosition = oldPosition - backwardExtension;
}
}
/**
* Generate a diff between the old data and the new, writing to the specified stream. Uses {@link
* #DEFAULT_MINIMUM_MATCH_LENGTH} as the match length.
*
* @param oldData the old data
* @param newData the new data
* @param outputStream where output should be written
* @param randomAccessObjectFactory factory to create auxiliary storage during BsDiff
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
public static void generatePatch(
final RandomAccessObject oldData,
final RandomAccessObject newData,
final OutputStream outputStream,
final RandomAccessObjectFactory randomAccessObjectFactory)
throws IOException, InterruptedException {
generatePatch(
oldData, newData, outputStream, randomAccessObjectFactory, DEFAULT_MINIMUM_MATCH_LENGTH);
}
/**
* Generate a diff between the old data and the new, writing to the specified stream. Uses
* in-memory byte array storage for ancillary allocations and {@link
* #DEFAULT_MINIMUM_MATCH_LENGTH} as the match length.
*
* @param oldData the old data
* @param newData the new data
* @param outputStream where output should be written
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
public static void generatePatch(
final byte[] oldData, final byte[] newData, final OutputStream outputStream)
throws IOException, InterruptedException {
generatePatch(oldData, newData, outputStream, DEFAULT_MINIMUM_MATCH_LENGTH);
}
/**
* Generate a diff between the old data and the new, writing to the specified stream. Uses
* in-memory byte array storage for ancillary allocations.
*
* @param oldData the old data
* @param newData the new data
* @param outputStream where output should be written
* @param minimumMatchLength the minimum "match" (in bytes) for BsDiff to consider between the
* oldData and newData. This can have a significant effect on both the generated patch size
* and the amount of time and memory required to apply the patch.
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
public static void generatePatch(
final byte[] oldData,
final byte[] newData,
final OutputStream outputStream,
final int minimumMatchLength)
throws IOException, InterruptedException {
try (RandomAccessObject oldDataRAO =
new RandomAccessObject.RandomAccessByteArrayObject(oldData);
RandomAccessObject newDataRAO =
new RandomAccessObject.RandomAccessByteArrayObject(newData); ) {
generatePatch(
oldDataRAO,
newDataRAO,
outputStream,
new RandomAccessObjectFactory.RandomAccessByteArrayObjectFactory(),
minimumMatchLength);
}
}
/**
* Generate a diff between the old data and the new, writing to the specified stream. Uses
* file-based storage for ancillary operations and {@link #DEFAULT_MINIMUM_MATCH_LENGTH} as the
* match length.
*
* @param oldData a file containing the old data
* @param newData a file containing the new data
* @param outputStream where output should be written
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
public static void generatePatch(
final File oldData, final File newData, final OutputStream outputStream)
throws IOException, InterruptedException {
generatePatch(oldData, newData, outputStream, DEFAULT_MINIMUM_MATCH_LENGTH);
}
/**
* Generate a diff between the old data and the new, writing to the specified stream. Uses
* file-based storage for ancillary allocations.
*
* @param oldData a file containing the old data
* @param newData a file containing the new data
* @param outputStream where output should be written
* @param minimumMatchLength the minimum "match" (in bytes) for BsDiff to consider between the
* oldData and newData. This can have a significant effect on both the generated patch size
* and the amount of time and memory required to apply the patch.
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
public static void generatePatch(
final File oldData,
final File newData,
final OutputStream outputStream,
final int minimumMatchLength)
throws IOException, InterruptedException {
try (RandomAccessFile oldDataRAF = new RandomAccessFile(oldData, "r");
RandomAccessFile newDataRAF = new RandomAccessFile(newData, "r");
RandomAccessObject oldDataRAO =
new RandomAccessObject.RandomAccessMmapObject(oldDataRAF, "r");
RandomAccessObject newDataRAO =
new RandomAccessObject.RandomAccessMmapObject(newDataRAF, "r"); ) {
generatePatch(
oldDataRAO,
newDataRAO,
outputStream,
new RandomAccessObjectFactory.RandomAccessMmapObjectFactory("rw"),
minimumMatchLength);
}
// Due to a bug in the JVM (http://bugs.java.com/view_bug.do?bug_id=6417205), we need to call
// gc() and runFinalization() explicitly to get rid of any MappedByteBuffers we may have used
// during patch generation.
System.gc();
System.runFinalization();
}
/**
* Generate a diff between the old data and the new, writing to the specified stream.
*
* @param oldData the old data
* @param newData the new data
* @param outputStream where output should be written
* @param randomAccessObjectFactory factory to create auxiliary storage during BsDiff
* @param minimumMatchLength the minimum "match" (in bytes) for BsDiff to consider between the
* oldData and newData. This can have a significant effect on both the generated patch size
* and the amount of time and memory required to apply the patch.
* @throws IOException if unable to read or write data
* @throws InterruptedException if any thread interrupts this thread
*/
public static void generatePatch(
final RandomAccessObject oldData,
final RandomAccessObject newData,
final OutputStream outputStream,
final RandomAccessObjectFactory randomAccessObjectFactory,
final int minimumMatchLength)
throws IOException, InterruptedException {
// Write header (signature + new file length)
outputStream.write("ENDSLEY/BSDIFF43".getBytes(StandardCharsets.US_ASCII));
BsUtil.writeFormattedLong(newData.length(), outputStream);
// Do the suffix search.
try (final RandomAccessObject groupArray =
new DivSuffixSorter(randomAccessObjectFactory).suffixSort(oldData)) {
BsDiffMatcher matcher = new BsDiffMatcher(oldData, newData, groupArray, minimumMatchLength);
generatePatchWithMatcher(oldData, newData, matcher, outputStream);
}
}
}