blob: ada816cceb809e861d9d8facd2a83bd2f1e697ef [file] [log] [blame]
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "puffin/src/include/puffin/utils.h"
#include <inttypes.h>
#include <string>
#include <vector>
#include <zlib.h>
#include "puffin/src/bit_reader.h"
#include "puffin/src/file_stream.h"
#include "puffin/src/include/puffin/common.h"
#include "puffin/src/include/puffin/puffer.h"
#include "puffin/src/logging.h"
#include "puffin/src/memory_stream.h"
#include "puffin/src/puff_writer.h"
using std::string;
using std::vector;
namespace {
// Use memcpy to access the unaligned data of type |T|.
template <typename T>
inline T get_unaligned(const void* address) {
T result;
memcpy(&result, address, sizeof(T));
return result;
}
// Calculate both the compressed size and uncompressed size of the deflate
// block that starts from the offset |start| of buffer |data|.
bool CalculateSizeOfDeflateBlock(const puffin::Buffer& data,
uint64_t start,
uint64_t* compressed_size,
uint64_t* uncompressed_size) {
TEST_AND_RETURN_FALSE(compressed_size != nullptr &&
uncompressed_size != nullptr);
TEST_AND_RETURN_FALSE(start < data.size());
z_stream strm = {};
strm.avail_in = data.size() - start;
strm.next_in = data.data() + start;
// -15 means we are decoding a 'raw' stream without zlib headers.
if (inflateInit2(&strm, -15)) {
LOG(ERROR) << "Failed to initialize inflate: " << strm.msg;
return false;
}
const unsigned int kBufferSize = 32768;
std::vector<uint8_t> uncompressed_data(kBufferSize);
*uncompressed_size = 0;
int status = Z_OK;
do {
// Overwrite the same buffer since we don't need the uncompressed data.
strm.avail_out = kBufferSize;
strm.next_out = uncompressed_data.data();
status = inflate(&strm, Z_NO_FLUSH);
if (status < 0) {
LOG(ERROR) << "Inflate failed: " << strm.msg << ", has decompressed "
<< *uncompressed_size << " bytes.";
return false;
}
*uncompressed_size += kBufferSize - strm.avail_out;
} while (status != Z_STREAM_END);
*compressed_size = data.size() - start - strm.avail_in;
TEST_AND_RETURN_FALSE(inflateEnd(&strm) == Z_OK);
return true;
}
} // namespace
namespace puffin {
// This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
// definition of a zlib stream. For finding the deflate blocks, we relying on
// the proper size of the zlib stream in |data|. Basically the size of the zlib
// stream should be known before hand. Otherwise we need to parse the stream and
// find the location of compressed blocks using CalculateSizeOfDeflateBlock().
bool LocateDeflatesInZlib(const Buffer& data,
std::vector<ByteExtent>* deflate_blocks) {
// A zlib stream has the following format:
// 0 1 compression method and flag
// 1 1 flag
// 2 4 preset dictionary (optional)
// 2 or 6 n compressed data
// n+(2 or 6) 4 Adler-32 checksum
TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer
uint16_t cmf = data[0];
auto compression_method = cmf & 0x0F;
// For deflate compression_method should be 8.
TEST_AND_RETURN_FALSE(compression_method == 8);
auto cinfo = (cmf & 0xF0) >> 4;
// Value greater than 7 is not allowed in deflate.
TEST_AND_RETURN_FALSE(cinfo <= 7);
auto flag = data[1];
TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
uint64_t header_len = 2;
if (flag & 0x20) {
header_len += 4; // 4 bytes for the preset dictionary.
}
// 4 is for ADLER32.
deflate_blocks->emplace_back(header_len, data.size() - header_len - 4);
return true;
}
bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
const vector<ByteExtent>& deflates,
vector<BitExtent>* subblock_deflates) {
Puffer puffer;
Buffer deflate_buffer;
for (const auto& deflate : deflates) {
TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
// Read from src into deflate_buffer.
deflate_buffer.resize(deflate.length);
TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
// Find all the subblocks.
BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
BufferPuffWriter puff_writer(nullptr, 0);
vector<BitExtent> subblocks;
TEST_AND_RETURN_FALSE(
puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
for (const auto& subblock : subblocks) {
subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
subblock.length);
}
}
return true;
}
bool LocateDeflatesInZlibBlocks(const string& file_path,
const vector<ByteExtent>& zlibs,
vector<BitExtent>* deflates) {
auto src = FileStream::Open(file_path, true, false);
TEST_AND_RETURN_FALSE(src);
Buffer buffer;
for (auto& zlib : zlibs) {
buffer.resize(zlib.length);
TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
vector<ByteExtent> deflate_blocks;
TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &deflate_blocks));
vector<BitExtent> deflate_subblocks;
auto zlib_blc_src = MemoryStream::CreateForRead(buffer);
TEST_AND_RETURN_FALSE(
FindDeflateSubBlocks(zlib_blc_src, deflate_blocks, &deflate_subblocks));
// Relocated based on the offset of the zlib.
for (const auto& def : deflate_subblocks) {
deflates->emplace_back(zlib.offset * 8 + def.offset, def.length);
}
}
return true;
}
// For more information about gzip format, refer to RFC 1952 located at:
// https://www.ietf.org/rfc/rfc1952.txt
bool LocateDeflatesInGzip(const Buffer& data,
vector<ByteExtent>* deflate_blocks) {
uint64_t member_start = 0;
while (member_start < data.size()) {
// Each member entry has the following format
// 0 1 0x1F
// 1 1 0x8B
// 2 1 compression method (8 denotes deflate)
// 3 1 set of flags
// 4 4 modification time
// 8 1 extra flags
// 9 1 operating system
TEST_AND_RETURN_FALSE(member_start + 10 <= data.size());
TEST_AND_RETURN_FALSE(data[member_start + 0] == 0x1F);
TEST_AND_RETURN_FALSE(data[member_start + 1] == 0x8B);
TEST_AND_RETURN_FALSE(data[member_start + 2] == 8);
uint64_t offset = member_start + 10;
int flag = data[member_start + 3];
// Extra field
if (flag & 4) {
TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
uint16_t extra_length = data[offset++];
extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
offset += extra_length;
}
// File name field
if (flag & 8) {
while (true) {
TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
if (data[offset++] == 0) {
break;
}
}
}
// File comment field
if (flag & 16) {
while (true) {
TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
if (data[offset++] == 0) {
break;
}
}
}
// CRC16 field
if (flag & 2) {
offset += 2;
}
uint64_t compressed_size, uncompressed_size;
TEST_AND_RETURN_FALSE(CalculateSizeOfDeflateBlock(
data, offset, &compressed_size, &uncompressed_size));
TEST_AND_RETURN_FALSE(offset + compressed_size <= data.size());
deflate_blocks->push_back(ByteExtent(offset, compressed_size));
offset += compressed_size;
// Ignore CRC32;
TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
offset += 4;
uint32_t u_size = 0;
for (size_t i = 0; i < 4; i++) {
u_size |= static_cast<uint32_t>(data[offset++]) << (i * 8);
}
TEST_AND_RETURN_FALSE(uncompressed_size % (1 << 31) == u_size);
member_start = offset;
}
return true;
}
// For more information about the zip format, refer to
// https://support.pkware.com/display/PKZIP/APPNOTE
bool LocateDeflatesInZipArchive(const Buffer& data,
vector<ByteExtent>* deflate_blocks) {
uint64_t pos = 0;
while (pos <= data.size() - 30) {
// TODO(xunchang) add support for big endian system when searching for
// magic numbers.
if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
pos++;
continue;
}
// local file header format
// 0 4 0x04034b50
// 4 2 minimum version needed to extract
// 6 2 general purpose bit flag
// 8 2 compression method
// 10 4 file last modification date & time
// 14 4 CRC-32
// 18 4 compressed size
// 22 4 uncompressed size
// 26 2 file name length
// 28 2 extra field length
// 30 n file name
// 30+n m extra field
auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
if (compression_method != 8) { // non-deflate type
pos += 4;
continue;
}
auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
auto uncompressed_size = get_unaligned<uint32_t>(data.data() + pos + 22);
auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
uint64_t header_size = 30 + file_name_length + extra_field_length;
// sanity check
if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
pos > data.size() - header_size - compressed_size) {
pos += 4;
continue;
}
uint64_t calculated_compressed_size;
uint64_t calculated_uncompressed_size;
if (!CalculateSizeOfDeflateBlock(data, pos + header_size,
&calculated_compressed_size,
&calculated_uncompressed_size)) {
LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
<< ", skip adding deflates for this entry.";
pos += 4;
continue;
}
// Double check the compressed size and uncompressed size if they are
// available in the file header.
if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
LOG(WARNING) << "Compressed size in the file header: " << compressed_size
<< " doesn't equal the real size: "
<< calculated_compressed_size;
}
if (uncompressed_size > 0 &&
uncompressed_size != calculated_uncompressed_size) {
LOG(WARNING) << "Uncompressed size in the file header: "
<< uncompressed_size << " doesn't equal the real size: "
<< calculated_uncompressed_size;
}
deflate_blocks->emplace_back(pos + header_size, calculated_compressed_size);
pos += header_size + calculated_compressed_size;
}
return true;
}
bool LocateDeflateSubBlocksInZipArchive(const Buffer& data,
vector<BitExtent>* deflates) {
vector<ByteExtent> deflate_blocks;
if (!LocateDeflatesInZipArchive(data, &deflate_blocks)) {
return false;
}
auto src = MemoryStream::CreateForRead(data);
return FindDeflateSubBlocks(src, deflate_blocks, deflates);
}
bool FindPuffLocations(const UniqueStreamPtr& src,
const vector<BitExtent>& deflates,
vector<ByteExtent>* puffs,
uint64_t* out_puff_size) {
Puffer puffer;
Buffer deflate_buffer;
// Here accumulate the size difference between each corresponding deflate and
// puff. At the end we add this cummulative size difference to the size of the
// deflate stream to get the size of the puff stream. We use signed size
// because puff size could be smaller than deflate size.
int64_t total_size_difference = 0;
for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
// Read from src into deflate_buffer.
auto start_byte = deflate->offset / 8;
auto end_byte = (deflate->offset + deflate->length + 7) / 8;
deflate_buffer.resize(end_byte - start_byte);
TEST_AND_RETURN_FALSE(src->Seek(start_byte));
TEST_AND_RETURN_FALSE(
src->Read(deflate_buffer.data(), deflate_buffer.size()));
// Find the size of the puff.
BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
uint64_t bits_to_skip = deflate->offset % 8;
TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
bit_reader.DropBits(bits_to_skip);
BufferPuffWriter puff_writer(nullptr, 0);
TEST_AND_RETURN_FALSE(
puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
// 1 if a deflate ends at the same byte that the next deflate starts and
// there is a few bits gap between them. In practice this may never happen,
// but it is a good idea to support it anyways. If there is a gap, the value
// of the gap will be saved as an integer byte to the puff stream. The parts
// of the byte that belogs to the deflates are shifted out.
int gap = 0;
if (deflate != deflates.begin()) {
auto prev_deflate = std::prev(deflate);
if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
// If deflates are on byte boundary the gap will not be counted later,
// so we won't worry about it.
&& (deflate->offset % 8 != 0)) {
gap = 1;
}
}
start_byte = ((deflate->offset + 7) / 8);
end_byte = (deflate->offset + deflate->length) / 8;
int64_t deflate_length_in_bytes = end_byte - start_byte;
// If there was no gap bits between the current and previous deflates, there
// will be no extra gap byte, so the offset will be shifted one byte back.
auto puff_offset = start_byte - gap + total_size_difference;
auto puff_size = puff_writer.Size();
// Add the location into puff.
puffs->emplace_back(puff_offset, puff_size);
total_size_difference +=
static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
}
uint64_t src_size;
TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
TEST_AND_RETURN_FALSE(final_size >= 0);
*out_puff_size = final_size;
return true;
}
} // namespace puffin