src/google/vcencoder.h - platform/external/open-vcdiff - Git at Google

 // Copyright 2007 Google Inc.
 // Author: Lincoln Smith
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef OPEN_VCDIFF_VCENCODER_H_
 #define OPEN_VCDIFF_VCENCODER_H_

 #include <stddef.h>  // size_t
 #include "google/format_extension_flags.h"
 #include "google/output_string.h"

 namespace open_vcdiff {

 class VCDiffEngine;
 class VCDiffStreamingEncoderImpl;

 // A HashedDictionary must be constructed from the dictionary data
 // in order to use VCDiffStreamingEncoder.  If the same dictionary will
 // be used to perform several encoding operations, then the caller should
 // create the HashedDictionary once and cache it for reuse.  This object
 // is thread-safe: the same const HashedDictionary can be used
 // by several threads simultaneously, each with its own VCDiffStreamingEncoder.
 //
 // dictionary_contents is copied into the HashedDictionary, so the
 // caller may free that string, if desired, after the constructor returns.
 //
 class HashedDictionary {
  public:
   // Constructs the dictionary from the dictionary_size bytes that start at
   // dictionary_contents.  Init() must still be called, and must succeed,
   // before the object is used for anything other than destruction.
   HashedDictionary(const char* dictionary_contents,
                    size_t dictionary_size);
   // Defined out of line.
   // NOTE(review): presumably releases the object behind engine_ -- confirm
   // against the implementation file.
   ~HashedDictionary();

   // Init() must be called before using the HashedDictionary as an argument
   // to the VCDiffStreamingEncoder, or for any other purpose except
   // destruction.  It returns true if initialization succeeded, or false
   // if an error occurred, in which case the caller should destroy the object
   // without using it.
   bool Init();

   // Returns the engine pointer stored in this object.  The pointee is const,
   // so callers cannot modify the engine through the returned pointer.
   const VCDiffEngine* engine() const { return engine_; }

  private:
   // The engine exposed through engine().
   // NOTE(review): presumably created by Init() from the dictionary data and
   // owned by this object -- neither is visible in this header; confirm.
   const VCDiffEngine* engine_;

   // Make the copy constructor and assignment operator private
   // so that they don't inadvertently get used.
   HashedDictionary(const HashedDictionary&);  // NOLINT
   void operator=(const HashedDictionary&);
 };

 // The standard streaming interface to the VCDIFF (RFC 3284) encoder.
 // "Streaming" in this context means that, even though the entire set of
 // input data to be encoded may not be available at once, the encoder
 // can produce partial output based on what is available.  Of course,
 // the caller should try to maximize the sizes of the data chunks passed
 // to the encoder.
 class VCDiffStreamingEncoder {
  public:
   // The HashedDictionary object passed to the constructor must remain valid,
   // without being deleted, for the lifetime of the VCDiffStreamingEncoder
   // object.
   //
   // format_extensions allows certain open-vcdiff extensions to the VCDIFF
   // format to be included in the encoded output.  These extensions are not
   // part of the RFC 3284 draft standard, so specifying any extension flags
   // will make the output compatible only with open-vcdiff, or with other
   // VCDIFF implementations that accept these extensions.  See above for an
   // explanation of each possible flag value.
   //
   // *** look_for_target_matches:
   // The VCDIFF format allows COPY instruction addresses to reference data from
   // the source (dictionary), or from previously encoded target data.
   //
   // If look_for_target_matches is false, then the encoder will only
   // produce COPY instructions that reference source data from the dictionary,
   // never from previously encoded target data.  This will speed up the encoding
   // process, but the encoded data will not be as compact.
   //
   // If this value is true, then the encoder will produce COPY instructions
   // that reference either source data or target data.  A COPY instruction from
   // the previously encoded target data may even extend into the range of the
   // data being produced by that same COPY instruction; for example, if the
   // previously encoded target data is "LA", then a single COPY instruction of
   // length 10 can produce the additional target data "LALALALALA".
   //
   // There is a third type of COPY instruction that starts within
   // the source data and extends from the end of the source data
   // into the beginning of the target data.  This VCDIFF encoder will never
   // produce a COPY instruction of this third type (regardless of the value of
   // look_for_target_matches) because the cost of checking for matches
   // across the source-target boundary would not justify its benefits.
   //
   VCDiffStreamingEncoder(const HashedDictionary* dictionary,
                          VCDiffFormatExtensionFlags format_extensions,
                          bool look_for_target_matches);
   ~VCDiffStreamingEncoder();

   // The client should use these routines as follows:
   //    HashedDictionary hd(dictionary, dictionary_size);
   //    if (!hd.Init()) {
   //      HandleError();
   //      return;
   //    }
   //    string output_string;
   //    VCDiffStreamingEncoder v(hd, false, false);
   //    if (!v.StartEncoding(&output_string)) {
   //      HandleError();
   //      return;  // No need to call FinishEncoding()
   //    }
   //    Process(output_string.data(), output_string.size());
   //    output_string.clear();
   //    while (get data_buf) {
   //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
   //        HandleError();
   //        return;  // No need to call FinishEncoding()
   //      }
   //      // The encoding is appended to output_string at each call,
   //      // so clear output_string once its contents have been processed.
   //      Process(output_string.data(), output_string.size());
   //      output_string.clear();
   //    }
   //    if (!v.FinishEncoding(&output_string)) {
   //      HandleError();
   //      return;
   //    }
   //    Process(output_string.data(), output_string.size());
   //    output_string.clear();
   //
   // I.e., the allowed pattern of calls is
   //    StartEncoding EncodeChunk* FinishEncoding
   //
   // The size of the encoded output depends on the sizes of the chunks
   // passed in (i.e. the chunking boundary affects compression).
   // However the decoded output is independent of chunk boundaries.

   // Sets up the data structures for encoding.
   // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
   // to *output_string.
   //
   // Note: we *append*, so the old contents of *output_string stick around.
   // This convention differs from the non-streaming Encode/Decode
   // interfaces in VCDiffEncoder.
   //
   // If an error occurs, this function returns false; otherwise it returns true.
   // If this function returns false, the caller does not need to call
   // FinishEncoding or to do any cleanup except destroying the
   // VCDiffStreamingEncoder object.
   template<class OutputType>
   bool StartEncoding(OutputType* output) {
     OutputString<OutputType> output_string(output);
     return StartEncodingToInterface(&output_string);
   }

   bool StartEncodingToInterface(OutputStringInterface* output_string);

   // Appends compressed encoding for "data" (one complete VCDIFF delta window)
   // to *output_string.
   // If an error occurs (for example, if StartEncoding was not called
   // earlier or StartEncoding returned false), this function returns false;
   // otherwise it returns true.  The caller does not need to call FinishEncoding
   // or do any cleanup except destroying the VCDiffStreamingEncoder
   // if this function returns false.
   template<class OutputType>
   bool EncodeChunk(const char* data, size_t len, OutputType* output) {
     OutputString<OutputType> output_string(output);
     return EncodeChunkToInterface(data, len, &output_string);
   }

   bool EncodeChunkToInterface(const char* data, size_t len,
                               OutputStringInterface* output_string);

   // Finishes encoding and appends any leftover encoded data to *output_string.
   // If an error occurs (for example, if StartEncoding was not called
   // earlier or StartEncoding returned false), this function returns false;
   // otherwise it returns true.  The caller does not need to
   // do any cleanup except destroying the VCDiffStreamingEncoder
   // if this function returns false.
   template<class OutputType>
   bool FinishEncoding(OutputType* output) {
     OutputString<OutputType> output_string(output);
     return FinishEncodingToInterface(&output_string);
   }

   bool FinishEncodingToInterface(OutputStringInterface* output_string);

  private:
   VCDiffStreamingEncoderImpl* const impl_;

   // Make the copy constructor and assignment operator private
   // so that they don't inadvertently get used.
   VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
   void operator=(const VCDiffStreamingEncoder&);
 };

 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used
 // if the entire target data string is available.
 //
 class VCDiffEncoder {
  public:
   VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
       : dictionary_(dictionary_contents, dictionary_size),
         encoder_(NULL),
         flags_(VCD_STANDARD_FORMAT),
         look_for_target_matches_(true) { }

   ~VCDiffEncoder() {
     delete encoder_;
   }

   // By default, VCDiffEncoder uses standard VCDIFF format.  This function
   // can be used before calling Encode(), to specify that interleaved format
   // and/or checksum format should be used.
   void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }

   // By default, VCDiffEncoder looks for matches in the dictionary and also in
   // the previously encoded target data.  This function can be used before
   // calling Encode(), to specify whether or not target matching should be
   // enabled.
   void SetTargetMatching(bool look_for_target_matches) {
     look_for_target_matches_ = look_for_target_matches;
   }

   // Replaces old contents of output_string with the encoded form of
   // target_data.
   template<class OutputType>
   bool Encode(const char* target_data,
               size_t target_len,
               OutputType* output) {
     OutputString<OutputType> output_string(output);
     return EncodeToInterface(target_data, target_len, &output_string);
   }

  private:
   bool EncodeToInterface(const char* target_data,
                          size_t target_len,
                          OutputStringInterface* output_string);

   HashedDictionary dictionary_;
   VCDiffStreamingEncoder* encoder_;
   VCDiffFormatExtensionFlags flags_;
   bool look_for_target_matches_;

   // Make the copy constructor and assignment operator private
   // so that they don't inadvertently get used.
   VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
   void operator=(const VCDiffEncoder&);
 };

 }  // namespace open_vcdiff

 #endif  // OPEN_VCDIFF_VCENCODER_H_
	// Copyright 2007 Google Inc.
	// Author: Lincoln Smith
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef OPEN_VCDIFF_VCENCODER_H_
	#define OPEN_VCDIFF_VCENCODER_H_

	#include <stddef.h> // size_t
	#include "google/format_extension_flags.h"
	#include "google/output_string.h"

	namespace open_vcdiff {

	class VCDiffEngine;
	class VCDiffStreamingEncoderImpl;

	// A HashedDictionary must be constructed from the dictionary data
	// in order to use VCDiffStreamingEncoder. If the same dictionary will
	// be used to perform several encoding operations, then the caller should
	// create the HashedDictionary once and cache it for reuse. This object
	// is thread-safe: the same const HashedDictionary can be used
	// by several threads simultaneously, each with its own VCDiffStreamingEncoder.
	//
	// dictionary_contents is copied into the HashedDictionary, so the
	// caller may free that string, if desired, after the constructor returns.
	//
	class HashedDictionary {
	public:
	// Constructs the dictionary from the dictionary_size bytes that start at
	// dictionary_contents. Init() must still be called, and must succeed,
	// before the object is used for anything other than destruction.
	HashedDictionary(const char* dictionary_contents,
	size_t dictionary_size);
	// Defined out of line.
	// NOTE(review): presumably releases the object behind engine_ -- confirm
	// against the implementation file.
	~HashedDictionary();

	// Init() must be called before using the HashedDictionary as an argument
	// to the VCDiffStreamingEncoder, or for any other purpose except
	// destruction. It returns true if initialization succeeded, or false
	// if an error occurred, in which case the caller should destroy the object
	// without using it.
	bool Init();

	// Returns the engine pointer stored in this object. The pointee is const,
	// so callers cannot modify the engine through the returned pointer.
	const VCDiffEngine* engine() const { return engine_; }

	private:
	// The engine exposed through engine().
	// NOTE(review): presumably created by Init() from the dictionary data and
	// owned by this object -- neither is visible in this header; confirm.
	const VCDiffEngine* engine_;

	// Make the copy constructor and assignment operator private
	// so that they don't inadvertently get used.
	HashedDictionary(const HashedDictionary&); // NOLINT
	void operator=(const HashedDictionary&);
	};

	// The standard streaming interface to the VCDIFF (RFC 3284) encoder.
	// "Streaming" in this context means that, even though the entire set of
	// input data to be encoded may not be available at once, the encoder
	// can produce partial output based on what is available. Of course,
	// the caller should try to maximize the sizes of the data chunks passed
	// to the encoder.
	class VCDiffStreamingEncoder {
	public:
	// The HashedDictionary object passed to the constructor must remain valid,
	// without being deleted, for the lifetime of the VCDiffStreamingEncoder
	// object.
	//
	// format_extensions allows certain open-vcdiff extensions to the VCDIFF
	// format to be included in the encoded output. These extensions are not
	// part of the RFC 3284 draft standard, so specifying any extension flags
	// will make the output compatible only with open-vcdiff, or with other
	// VCDIFF implementations that accept these extensions. See above for an
	// explanation of each possible flag value.
	//
	// *** look_for_target_matches:
	// The VCDIFF format allows COPY instruction addresses to reference data from
	// the source (dictionary), or from previously encoded target data.
	//
	// If look_for_target_matches is false, then the encoder will only
	// produce COPY instructions that reference source data from the dictionary,
	// never from previously encoded target data. This will speed up the encoding
	// process, but the encoded data will not be as compact.
	//
	// If this value is true, then the encoder will produce COPY instructions
	// that reference either source data or target data. A COPY instruction from
	// the previously encoded target data may even extend into the range of the
	// data being produced by that same COPY instruction; for example, if the
	// previously encoded target data is "LA", then a single COPY instruction of
	// length 10 can produce the additional target data "LALALALALA".
	//
	// There is a third type of COPY instruction that starts within
	// the source data and extends from the end of the source data
	// into the beginning of the target data. This VCDIFF encoder will never
	// produce a COPY instruction of this third type (regardless of the value of
	// look_for_target_matches) because the cost of checking for matches
	// across the source-target boundary would not justify its benefits.
	//
	VCDiffStreamingEncoder(const HashedDictionary* dictionary,
	VCDiffFormatExtensionFlags format_extensions,
	bool look_for_target_matches);
	~VCDiffStreamingEncoder();

	// The client should use these routines as follows:
	// HashedDictionary hd(dictionary, dictionary_size);
	// if (!hd.Init()) {
	// HandleError();
	// return;
	// }
	// string output_string;
	// VCDiffStreamingEncoder v(hd, false, false);
	// if (!v.StartEncoding(&output_string)) {
	// HandleError();
	// return; // No need to call FinishEncoding()
	// }
	// Process(output_string.data(), output_string.size());
	// output_string.clear();
	// while (get data_buf) {
	// if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
	// HandleError();
	// return; // No need to call FinishEncoding()
	// }
	// // The encoding is appended to output_string at each call,
	// // so clear output_string once its contents have been processed.
	// Process(output_string.data(), output_string.size());
	// output_string.clear();
	// }
	// if (!v.FinishEncoding(&output_string)) {
	// HandleError();
	// return;
	// }
	// Process(output_string.data(), output_string.size());
	// output_string.clear();
	//
	// I.e., the allowed pattern of calls is
	// StartEncoding EncodeChunk* FinishEncoding
	//
	// The size of the encoded output depends on the sizes of the chunks
	// passed in (i.e. the chunking boundary affects compression).
	// However the decoded output is independent of chunk boundaries.

	// Sets up the data structures for encoding.
	// Writes a VCDIFF delta file header (as defined in RFC section 4.1)
	// to *output_string.
	//
	// Note: we append, so the old contents of *output_string stick around.
	// This convention differs from the non-streaming Encode/Decode
	// interfaces in VCDiffEncoder.
	//
	// If an error occurs, this function returns false; otherwise it returns true.
	// If this function returns false, the caller does not need to call
	// FinishEncoding or to do any cleanup except destroying the
	// VCDiffStreamingEncoder object.
	template<class OutputType>
	bool StartEncoding(OutputType* output) {
	OutputString<OutputType> output_string(output);
	return StartEncodingToInterface(&output_string);
	}

	bool StartEncodingToInterface(OutputStringInterface* output_string);

	// Appends compressed encoding for "data" (one complete VCDIFF delta window)
	// to *output_string.
	// If an error occurs (for example, if StartEncoding was not called
	// earlier or StartEncoding returned false), this function returns false;
	// otherwise it returns true. The caller does not need to call FinishEncoding
	// or do any cleanup except destroying the VCDiffStreamingEncoder
	// if this function returns false.
	template<class OutputType>
	bool EncodeChunk(const char* data, size_t len, OutputType* output) {
	OutputString<OutputType> output_string(output);
	return EncodeChunkToInterface(data, len, &output_string);
	}

	bool EncodeChunkToInterface(const char* data, size_t len,
	OutputStringInterface* output_string);

	// Finishes encoding and appends any leftover encoded data to *output_string.
	// If an error occurs (for example, if StartEncoding was not called
	// earlier or StartEncoding returned false), this function returns false;
	// otherwise it returns true. The caller does not need to
	// do any cleanup except destroying the VCDiffStreamingEncoder
	// if this function returns false.
	template<class OutputType>
	bool FinishEncoding(OutputType* output) {
	OutputString<OutputType> output_string(output);
	return FinishEncodingToInterface(&output_string);
	}

	bool FinishEncodingToInterface(OutputStringInterface* output_string);

	private:
	VCDiffStreamingEncoderImpl* const impl_;

	// Make the copy constructor and assignment operator private
	// so that they don't inadvertently get used.
	VCDiffStreamingEncoder(const VCDiffStreamingEncoder&); // NOLINT
	void operator=(const VCDiffStreamingEncoder&);
	};

	// A simpler (non-streaming) interface to the VCDIFF encoder that can be used
	// if the entire target data string is available.
	//
	class VCDiffEncoder {
	public:
	VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
	: dictionary_(dictionary_contents, dictionary_size),
	encoder_(NULL),
	flags_(VCD_STANDARD_FORMAT),
	look_for_target_matches_(true) { }

	~VCDiffEncoder() {
	delete encoder_;
	}

	// By default, VCDiffEncoder uses standard VCDIFF format. This function
	// can be used before calling Encode(), to specify that interleaved format
	// and/or checksum format should be used.
	void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }

	// By default, VCDiffEncoder looks for matches in the dictionary and also in
	// the previously encoded target data. This function can be used before
	// calling Encode(), to specify whether or not target matching should be
	// enabled.
	void SetTargetMatching(bool look_for_target_matches) {
	look_for_target_matches_ = look_for_target_matches;
	}

	// Replaces old contents of output_string with the encoded form of
	// target_data.
	template<class OutputType>
	bool Encode(const char* target_data,
	size_t target_len,
	OutputType* output) {
	OutputString<OutputType> output_string(output);
	return EncodeToInterface(target_data, target_len, &output_string);
	}

	private:
	bool EncodeToInterface(const char* target_data,
	size_t target_len,
	OutputStringInterface* output_string);

	HashedDictionary dictionary_;
	VCDiffStreamingEncoder* encoder_;
	VCDiffFormatExtensionFlags flags_;
	bool look_for_target_matches_;

	// Make the copy constructor and assignment operator private
	// so that they don't inadvertently get used.
	VCDiffEncoder(const VCDiffEncoder&); // NOLINT
	void operator=(const VCDiffEncoder&);
	};

	} // namespace open_vcdiff

	#endif // OPEN_VCDIFF_VCENCODER_H_