| // Copyright 2017 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "puffin/src/include/puffin/utils.h" |
| |
| #include <inttypes.h> |
| |
| #include <algorithm> |
| #include <set> |
| #include <string> |
| #include <vector> |
| |
| #include "puffin/src/bit_reader.h" |
| #include "puffin/src/file_stream.h" |
| #include "puffin/src/include/puffin/common.h" |
| #include "puffin/src/include/puffin/puffer.h" |
| #include "puffin/src/logging.h" |
| #include "puffin/src/memory_stream.h" |
| #include "puffin/src/puff_writer.h" |
| |
| using std::set; |
| using std::string; |
| using std::vector; |
| |
| namespace { |
| // Use memcpy to access the unaligned data of type |T|. |
| template <typename T> |
| inline T get_unaligned(const void* address) { |
| T result; |
| memcpy(&result, address, sizeof(T)); |
| return result; |
| } |
| |
| struct ExtentData { |
| puffin::BitExtent extent; |
| uint64_t byte_offset; |
| uint64_t byte_length; |
| const puffin::Buffer& data; |
| |
| ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data) |
| : extent(in_extent), data(in_data) { |
| // Round start offset up and end offset down to exclude bits not in this |
| // extent. We simply ignore the bits at start and end that's not on byte |
| // boundary because as long as the majority of the bytes are the same, |
| // bsdiff will be able to reference it. |
| byte_offset = (extent.offset + 7) / 8; |
| uint64_t byte_end_offset = (extent.offset + extent.length) / 8; |
| CHECK(byte_end_offset <= data.size()); |
| if (byte_end_offset > byte_offset) { |
| byte_length = byte_end_offset - byte_offset; |
| } else { |
| byte_length = 0; |
| } |
| } |
| |
| int Compare(const ExtentData& other) const { |
| if (extent.length != other.extent.length) { |
| return extent.length < other.extent.length ? -1 : 1; |
| } |
| return memcmp(data.data() + byte_offset, |
| other.data.data() + other.byte_offset, |
| std::min(byte_length, other.byte_length)); |
| } |
| bool operator<(const ExtentData& other) const { return Compare(other) < 0; } |
| bool operator==(const ExtentData& other) const { return Compare(other) == 0; } |
| }; |
| |
| } // namespace |
| |
| namespace puffin { |
| |
| bool LocateDeflatesInDeflateStream(const uint8_t* data, |
| uint64_t size, |
| uint64_t virtual_offset, |
| vector<BitExtent>* deflates, |
| uint64_t* compressed_size) { |
| Puffer puffer; |
| BufferBitReader bit_reader(data, size); |
| BufferPuffWriter puff_writer(nullptr, 0); |
| vector<BitExtent> sub_deflates; |
| TEST_AND_RETURN_FALSE( |
| puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates)); |
| for (const auto& deflate : sub_deflates) { |
| deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length); |
| } |
| if (compressed_size) { |
| *compressed_size = bit_reader.Offset(); |
| } |
| return true; |
| } |
| |
| // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the |
| // definition of a zlib stream. For finding the deflate blocks, we relying on |
| // the proper size of the zlib stream in |data|. Basically the size of the zlib |
| // stream should be known before hand. Otherwise we need to parse the stream and |
| // find the location of compressed blocks using CalculateSizeOfDeflateBlock(). |
| bool LocateDeflatesInZlib(const Buffer& data, |
| std::vector<BitExtent>* deflates) { |
| // A zlib stream has the following format: |
| // 0 1 compression method and flag |
| // 1 1 flag |
| // 2 4 preset dictionary (optional) |
| // 2 or 6 n compressed data |
| // n+(2 or 6) 4 Adler-32 checksum |
| TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer |
| uint16_t cmf = data[0]; |
| auto compression_method = cmf & 0x0F; |
| // For deflate compression_method should be 8. |
| TEST_AND_RETURN_FALSE(compression_method == 8); |
| |
| auto cinfo = (cmf & 0xF0) >> 4; |
| // Value greater than 7 is not allowed in deflate. |
| TEST_AND_RETURN_FALSE(cinfo <= 7); |
| |
| auto flag = data[1]; |
| TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0); |
| |
| uint64_t header_len = 2; |
| if (flag & 0x20) { |
| header_len += 4; // 4 bytes for the preset dictionary. |
| } |
| |
| // 4 is for ADLER32. |
| TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream( |
| data.data() + header_len, data.size() - header_len - 4, header_len, |
| deflates, nullptr)); |
| return true; |
| } |
| |
| bool FindDeflateSubBlocks(const UniqueStreamPtr& src, |
| const vector<ByteExtent>& deflates, |
| vector<BitExtent>* subblock_deflates) { |
| Puffer puffer; |
| Buffer deflate_buffer; |
| for (const auto& deflate : deflates) { |
| TEST_AND_RETURN_FALSE(src->Seek(deflate.offset)); |
| // Read from src into deflate_buffer. |
| deflate_buffer.resize(deflate.length); |
| TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length)); |
| |
| // Find all the subblocks. |
| BufferBitReader bit_reader(deflate_buffer.data(), deflate.length); |
| // The uncompressed blocks will be ignored since we are passing a null |
| // buffered puff writer and a valid deflate locations output array. This |
| // should not happen in the puffdiff or anywhere else by default. |
| BufferPuffWriter puff_writer(nullptr, 0); |
| vector<BitExtent> subblocks; |
| TEST_AND_RETURN_FALSE( |
| puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks)); |
| TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset()); |
| for (const auto& subblock : subblocks) { |
| subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8, |
| subblock.length); |
| } |
| } |
| return true; |
| } |
| |
| bool LocateDeflatesInZlibBlocks(const string& file_path, |
| const vector<ByteExtent>& zlibs, |
| vector<BitExtent>* deflates) { |
| auto src = FileStream::Open(file_path, true, false); |
| TEST_AND_RETURN_FALSE(src); |
| |
| Buffer buffer; |
| for (const auto& zlib : zlibs) { |
| buffer.resize(zlib.length); |
| TEST_AND_RETURN_FALSE(src->Seek(zlib.offset)); |
| TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size())); |
| vector<BitExtent> tmp_deflates; |
| TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates)); |
| for (const auto& deflate : tmp_deflates) { |
| deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length); |
| } |
| } |
| return true; |
| } |
| |
| // For more information about gzip format, refer to RFC 1952 located at: |
| // https://www.ietf.org/rfc/rfc1952.txt |
| bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) { |
| uint64_t member_start = 0; |
| while (member_start + 10 <= data.size() && data[member_start + 0] == 0x1F && |
| data[member_start + 1] == 0x8B && data[member_start + 2] == 8) { |
| // Each member entry has the following format |
| // 0 1 0x1F |
| // 1 1 0x8B |
| // 2 1 compression method (8 denotes deflate) |
| // 3 1 set of flags |
| // 4 4 modification time |
| // 8 1 extra flags |
| // 9 1 operating system |
| |
| uint64_t offset = member_start + 10; |
| int flag = data[member_start + 3]; |
| // Extra field |
| if (flag & 4) { |
| TEST_AND_RETURN_FALSE(offset + 2 <= data.size()); |
| uint16_t extra_length = data[offset++]; |
| extra_length |= static_cast<uint16_t>(data[offset++]) << 8; |
| TEST_AND_RETURN_FALSE(offset + extra_length <= data.size()); |
| offset += extra_length; |
| } |
| // File name field |
| if (flag & 8) { |
| while (true) { |
| TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); |
| if (data[offset++] == 0) { |
| break; |
| } |
| } |
| } |
| // File comment field |
| if (flag & 16) { |
| while (true) { |
| TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); |
| if (data[offset++] == 0) { |
| break; |
| } |
| } |
| } |
| // CRC16 field |
| if (flag & 2) { |
| offset += 2; |
| } |
| |
| uint64_t compressed_size = 0; |
| TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream( |
| data.data() + offset, data.size() - offset, offset, deflates, |
| &compressed_size)); |
| offset += compressed_size; |
| |
| // Ignore CRC32 and uncompressed size. |
| TEST_AND_RETURN_FALSE(offset + 8 <= data.size()); |
| offset += 8; |
| member_start = offset; |
| } |
| // Return true if we've successfully parsed at least one gzip. |
| return member_start != 0; |
| } |
| |
| // For more information about the zip format, refer to |
| // https://support.pkware.com/display/PKZIP/APPNOTE |
| bool LocateDeflatesInZipArchive(const Buffer& data, |
| vector<BitExtent>* deflates) { |
| uint64_t pos = 0; |
| while (pos <= data.size() - 30) { |
| // TODO(xunchang) add support for big endian system when searching for |
| // magic numbers. |
| if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) { |
| pos++; |
| continue; |
| } |
| |
| // local file header format |
| // 0 4 0x04034b50 |
| // 4 2 minimum version needed to extract |
| // 6 2 general purpose bit flag |
| // 8 2 compression method |
| // 10 4 file last modification date & time |
| // 14 4 CRC-32 |
| // 18 4 compressed size |
| // 22 4 uncompressed size |
| // 26 2 file name length |
| // 28 2 extra field length |
| // 30 n file name |
| // 30+n m extra field |
| auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8); |
| if (compression_method != 8) { // non-deflate type |
| pos += 4; |
| continue; |
| } |
| |
| auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18); |
| auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26); |
| auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28); |
| uint64_t header_size = 30 + file_name_length + extra_field_length; |
| |
| // sanity check |
| if (static_cast<uint64_t>(header_size) + compressed_size > data.size() || |
| pos > data.size() - header_size - compressed_size) { |
| pos += 4; |
| continue; |
| } |
| |
| vector<BitExtent> tmp_deflates; |
| uint64_t offset = pos + header_size; |
| uint64_t calculated_compressed_size = 0; |
| if (!LocateDeflatesInDeflateStream( |
| data.data() + offset, data.size() - offset, offset, &tmp_deflates, |
| &calculated_compressed_size)) { |
| LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos |
| << ", skip adding deflates for this entry."; |
| pos += 4; |
| continue; |
| } |
| |
| // Double check the compressed size if it is available in the file header. |
| if (compressed_size > 0 && compressed_size != calculated_compressed_size) { |
| LOG(WARNING) << "Compressed size in the file header: " << compressed_size |
| << " doesn't equal the real size: " |
| << calculated_compressed_size; |
| } |
| |
| deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end()); |
| pos += header_size + calculated_compressed_size; |
| } |
| |
| return true; |
| } |
| |
| bool LocateDeflateSubBlocksInZipArchive(const Buffer& data, |
| vector<BitExtent>* deflates) { |
| return LocateDeflatesInZipArchive(data, deflates); |
| } |
| |
| bool FindPuffLocations(const UniqueStreamPtr& src, |
| const vector<BitExtent>& deflates, |
| vector<ByteExtent>* puffs, |
| uint64_t* out_puff_size) { |
| Puffer puffer; |
| Buffer deflate_buffer; |
| |
| // Here accumulate the size difference between each corresponding deflate and |
| // puff. At the end we add this cummulative size difference to the size of the |
| // deflate stream to get the size of the puff stream. We use signed size |
| // because puff size could be smaller than deflate size. |
| int64_t total_size_difference = 0; |
| for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) { |
| // Read from src into deflate_buffer. |
| auto start_byte = deflate->offset / 8; |
| auto end_byte = (deflate->offset + deflate->length + 7) / 8; |
| deflate_buffer.resize(end_byte - start_byte); |
| TEST_AND_RETURN_FALSE(src->Seek(start_byte)); |
| TEST_AND_RETURN_FALSE( |
| src->Read(deflate_buffer.data(), deflate_buffer.size())); |
| // Find the size of the puff. |
| BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size()); |
| uint64_t bits_to_skip = deflate->offset % 8; |
| TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip)); |
| bit_reader.DropBits(bits_to_skip); |
| |
| BufferPuffWriter puff_writer(nullptr, 0); |
| TEST_AND_RETURN_FALSE( |
| puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr)); |
| TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset()); |
| |
| // 1 if a deflate ends at the same byte that the next deflate starts and |
| // there is a few bits gap between them. In practice this may never happen, |
| // but it is a good idea to support it anyways. If there is a gap, the value |
| // of the gap will be saved as an integer byte to the puff stream. The parts |
| // of the byte that belogs to the deflates are shifted out. |
| int gap = 0; |
| if (deflate != deflates.begin()) { |
| auto prev_deflate = std::prev(deflate); |
| if ((prev_deflate->offset + prev_deflate->length == deflate->offset) |
| // If deflates are on byte boundary the gap will not be counted later, |
| // so we won't worry about it. |
| && (deflate->offset % 8 != 0)) { |
| gap = 1; |
| } |
| } |
| |
| start_byte = ((deflate->offset + 7) / 8); |
| end_byte = (deflate->offset + deflate->length) / 8; |
| int64_t deflate_length_in_bytes = end_byte - start_byte; |
| |
| // If there was no gap bits between the current and previous deflates, there |
| // will be no extra gap byte, so the offset will be shifted one byte back. |
| auto puff_offset = start_byte - gap + total_size_difference; |
| auto puff_size = puff_writer.Size(); |
| // Add the location into puff. |
| puffs->emplace_back(puff_offset, puff_size); |
| total_size_difference += |
| static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap; |
| } |
| |
| uint64_t src_size; |
| TEST_AND_RETURN_FALSE(src->GetSize(&src_size)); |
| auto final_size = static_cast<int64_t>(src_size) + total_size_difference; |
| TEST_AND_RETURN_FALSE(final_size >= 0); |
| *out_puff_size = final_size; |
| return true; |
| } |
| |
| void RemoveEqualBitExtents(const Buffer& data1, |
| const Buffer& data2, |
| std::vector<BitExtent>* extents1, |
| std::vector<BitExtent>* extents2) { |
| set<ExtentData> extent1_set, equal_extents; |
| for (const BitExtent& ext : *extents1) { |
| extent1_set.emplace(ext, data1); |
| } |
| |
| auto new_extents2_end = extents2->begin(); |
| for (const BitExtent& ext : *extents2) { |
| ExtentData extent_data(ext, data2); |
| if (extent1_set.find(extent_data) != extent1_set.end()) { |
| equal_extents.insert(extent_data); |
| } else { |
| *new_extents2_end++ = ext; |
| } |
| } |
| extents2->erase(new_extents2_end, extents2->end()); |
| extents1->erase( |
| std::remove_if(extents1->begin(), extents1->end(), |
| [&equal_extents, &data1](const BitExtent& ext) { |
| return equal_extents.find(ExtentData(ext, data1)) != |
| equal_extents.end(); |
| }), |
| extents1->end()); |
| } |
| } // namespace puffin |