#pragma once
#include <cstdio>
#include <cstring>
#include <cstdint>
#include <cstdlib>
#include <cerrno>
#include <istream>
#include <ostream>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <vector>
#include <ATen/ATen.h> // at::DataPtr, at::kCPU
#include "torch/csrc/jit/assertions.h" // JIT_ASSERT (assumed location of the JIT assertion macros)
namespace torch { namespace jit {
// This file defines an on-disk format used for PyTorch model serialization.
// All integer values are serialized as little-endian. Everything in this format
// is aligned to 64-byte boundaries to allow for direct memory mapping and use
// with, for example, AVX-512 instructions.
// The format is as follows:
//
// -- File header --
// [8 bytes] Magic number - little endian integer that spells 'PYTORCH1' in ASCII
// [8 bytes] Version number - The version of the file format this file was written
//           with. This lets us revise and extend the format over time.
// [48 bytes] Padding/reserved
//
// After the file header come N records of the form:
// [8 bytes] Tag - A tag identifying the type of this record. The values are
//           defined in the RecordTags enum below.
// [8 bytes] Size - Size in bytes of the payload of this record
// [48 bytes] Pad/reserved - Pads out the record header so the payload starts on a
//            64-byte boundary.
// [size bytes] Payload - The actual raw data for the object serialized in this record
// [64 - (size % 64) bytes] Pad/reserved - Pads out this record so the next one is
//            aligned to 64 bytes. A full 64 bytes of padding is written when the
//            payload size is already a multiple of 64.
//
// Following those records is a special footer:
// [8 bytes] Tag - This tag field should contain the value for RecordTags::FOOTER
// to correctly identify the footer
// [8 bytes] Offset of last record - The last record in this format is used
// as an index into the rest of the file, so
// a reader can use this offset to seek to
// the last record and read the index.
// [48 bytes] Pad/reserved - Pad out the footer so that the whole file's size is a
//            multiple of 64 bytes.
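//
// As a concrete example, a file holding a single record with a 100-byte payload
// is laid out as follows (byte offsets given as half-open ranges):
//
//   [0, 8)     magic number 'PYTORCH1'
//   [8, 16)    version number (1)
//   [16, 64)   padding
//   [64, 72)   record tag (RecordTags::STORAGE)
//   [72, 80)   record size (100)
//   [80, 128)  padding
//   [128, 228) payload
//   [228, 256) padding (64 - (100 % 64) = 28 bytes)
//   [256, 264) footer tag (RecordTags::FOOTER)
//   [264, 272) offset of the last record (64)
//   [272, 320) padding
//
// for a total file size of 320 bytes, a multiple of 64.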
//
//
// When developing this format we want to pay particular attention to the
// following use cases:
//
// -- Reading --
// 1) Reading with full random access
// a) Reading with file APIs such as fread()
// b) mmap-ing the file and jumping around the mapped region
// 2) Reading with 1-pass sequential access
// -> A reader will need to build up a data structure of the records it has
//    parsed as it reads
//
// -- Writing --
// 1) Writing with full random access
// 2) Writing with 1-pass sequential access
// -> We must take care not to require updating values that have already
//    been written. We place the variable-length index at the end and do
//    not put any indices into the header to fulfill this constraint.
namespace {
struct RecordTags {
enum {
STORAGE = 1,
FOOTER = 2,
};
};
// Common constants
static constexpr uint64_t kFileMagicNumber = 0x314843524f545950L; // PYTORCH1
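// Compile-time sanity check (illustrative): the magic constant above is just the
// ASCII bytes 'P','Y','T','O','R','C','H','1' packed little-endian, with 'P' in
// the lowest-order byte.
static_assert(
    kFileMagicNumber ==
        (uint64_t('P') | uint64_t('Y') << 8 | uint64_t('T') << 16 |
         uint64_t('O') << 24 | uint64_t('R') << 32 | uint64_t('C') << 40 |
         uint64_t('H') << 48 | uint64_t('1') << 56),
    "magic number must spell PYTORCH1 in little-endian ASCII");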
static constexpr uint64_t kFieldAlignment = 64L; // 64 byte alignment supports up to AVX512 for mmap
// Reader-specific constants
static constexpr uint64_t kMaxSupportedFileFormatVersion = 0x1L;
// Writer-specific constants
static constexpr uint64_t kFileFormatVersion = 0x1L;
static constexpr uint8_t kPadValue = 0xEF;
} // namespace
class PyTorchStreamReader {
public:
PyTorchStreamReader(std::istream& in_) : in(in_) {
// Store the file size up front so we know when we have reached the end, since
// the stream APIs do not report it directly.
in.seekg(0L, in.end);
file_size = in.tellg();
in.seekg(0L);
readAndValidateFileHeader();
// Do this check now, since the header gives us reasonable confidence that this
// is actually a PyTorch file.
if (file_size % kFieldAlignment != 0) {
throw std::runtime_error("File length is not a multiple of the alignment"
" size. Is this a valid PyTorch file?");
}
readAndValidateFileFooter();
}
std::tuple<at::DataPtr, size_t> getLastRecord() {
return getRecordWithKey(last_record_offset);
}
std::tuple<at::DataPtr, size_t> getRecordWithKey(uint64_t key) {
    if (key + kFieldAlignment > file_size) {
      throw std::runtime_error("Provided key is too close to the end of the file"
                               " to contain a record.");
    }
if (key % kFieldAlignment != 0) {
throw std::runtime_error("Provided key is not divisible by the alignment size.");
}
// Seek to the provided offset
cursor = key;
in.seekg(cursor);
auto tag = read64BitIntegerLittleEndian();
if (tag != RecordTags::STORAGE) {
throw std::runtime_error("Attempted to read a record of non-storage type");
}
auto size = read64BitIntegerLittleEndian();
seekToNextAlignmentBoundary();
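    // The stream is now positioned at the payload, which starts on a 64-byte
    // boundary because the 16-byte record header is padded out to kFieldAlignment.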
    auto ptr = malloc(size);
    at::DataPtr retval(ptr, ptr, free, at::kCPU);
    in.read(static_cast<char*>(ptr), size);
    if (static_cast<size_t>(in.gcount()) != size) {
      throw std::runtime_error("Truncated read while loading record payload.");
    }
    cursor += size;
    seekToNextAlignmentBoundary();
    return std::tuple<at::DataPtr, size_t>(std::move(retval), size);
}
~PyTorchStreamReader() {
}
private:
std::istream& in;
size_t cursor = 0;
size_t file_size;
size_t last_record_offset;
// Utility functions
uint64_t read64BitIntegerLittleEndian() {
uint64_t retval;
// TODO endian swap on platforms that need it?
in.read(reinterpret_cast<char *>(&retval), 8);
std::streamsize read_bytes = in.gcount();
if (read_bytes != 8) {
std::ostringstream errmsg;
errmsg << "Expected to read 8 bytes but got " << read_bytes;
throw std::runtime_error(errmsg.str());
}
cursor += read_bytes;
return retval;
}
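  // Advance the read cursor to the next multiple of kFieldAlignment. Note that
  // when the cursor is already aligned this still skips a full kFieldAlignment
  // bytes, mirroring the writer's padToNextAlignmentBoundary() so that reader and
  // writer agree on record boundaries.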
void seekToNextAlignmentBoundary() {
size_t next_offset = (cursor + kFieldAlignment) - (cursor % kFieldAlignment);
size_t pad_amount = next_offset - cursor;
cursor += pad_amount;
in.seekg(cursor);
}
// File format deserialization functions
void readAndValidateFileHeader() {
// Validate magic number
uint64_t magic = read64BitIntegerLittleEndian();
if (magic != kFileMagicNumber) {
throw std::runtime_error("Magic number mismatch in PyTorch file. File may"
" be corrupted or is not actually a PyTorch file.");
}
uint64_t file_format_version = read64BitIntegerLittleEndian();
if (file_format_version > kMaxSupportedFileFormatVersion) {
std::ostringstream errmsg;
errmsg << "Attempted to read a PyTorch file with version " << file_format_version
<< " but the maximum supported version for reading is " << kMaxSupportedFileFormatVersion
<< ". Your PyTorch installation may be too old.";
throw std::runtime_error(errmsg.str());
}
seekToNextAlignmentBoundary();
}
void readAndValidateFileFooter() {
// Seek to location of file footer. We've already validated that the file
// length is a multiple of the alignment size
cursor = file_size - kFieldAlignment;
in.seekg(cursor);
auto tag = read64BitIntegerLittleEndian();
if (tag != RecordTags::FOOTER) {
throw std::runtime_error("File footer has wrong record type. Is this"
" file corrupted?");
}
last_record_offset = read64BitIntegerLittleEndian();
if (last_record_offset > file_size) {
throw std::runtime_error("Offset of last record is higher than the size"
" of the file! Is this file corrupted?");
}
}
};
class PyTorchStreamWriter {
public:
PyTorchStreamWriter(std::ostream& out_) : out(out_) {
writeFileHeader();
// In the case that we do not write any records into this file, the last
// record index written into the footer will point to the footer itself.
last_record_idx = cursor;
}
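  // Write one record and return its offset from the start of the file. The
  // returned offset is the key that PyTorchStreamReader::getRecordWithKey()
  // expects.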
uint64_t writeRecord(const char* data, size_t size) {
JIT_ASSERT(!finalized);
uint64_t record_offset = cursor;
last_record_idx = record_offset;
write64BitIntegerLittleEndian(RecordTags::STORAGE);
write64BitIntegerLittleEndian(size);
padToNextAlignmentBoundary();
writeBuffer(data, size);
padToNextAlignmentBoundary();
return record_offset;
}
void writeEndOfFile() {
JIT_ASSERT(!finalized);
writeFileFooter();
finalized = true;
}
~PyTorchStreamWriter() {
if (!finalized) {
writeEndOfFile();
}
}
private:
std::ostream& out;
size_t cursor = 0;
bool finalized = false;
size_t last_record_idx = 0;
// Utility functions
void write64BitIntegerLittleEndian(const uint64_t value) {
// TODO endian swap on platforms that need it?
out.write(reinterpret_cast<const char *>(&value), 8);
cursor += 8u;
}
  void writePad(const size_t num_bytes) {
    static std::vector<char> pad_buffer(kFieldAlignment, static_cast<char>(kPadValue));
out.write(pad_buffer.data(), num_bytes);
cursor += num_bytes;
}
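  // Pad the output up to the next multiple of kFieldAlignment. As in the reader,
  // a full kFieldAlignment bytes of padding are emitted when the cursor is already
  // aligned, so the two sides stay consistent.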
void padToNextAlignmentBoundary() {
size_t next_offset = (cursor + kFieldAlignment) - (cursor % kFieldAlignment);
size_t pad_amount = next_offset - cursor;
writePad(pad_amount);
}
void writeBuffer(const char* data, size_t size) {
out.write(data, size);
cursor += size;
}
// File format write functions
void writeFileHeader() {
write64BitIntegerLittleEndian(kFileMagicNumber);
write64BitIntegerLittleEndian(kFileFormatVersion);
padToNextAlignmentBoundary();
}
void writeFileFooter() {
write64BitIntegerLittleEndian(RecordTags::FOOTER);
write64BitIntegerLittleEndian(last_record_idx);
padToNextAlignmentBoundary();
}
};
class PyTorchFileReader {
public:
PyTorchFileReader(const std::string& filename) :
in(filename, std::ios_base::binary),
stream_reader(in) {}
std::tuple<at::DataPtr, size_t> getLastRecord() {
return stream_reader.getLastRecord();
}
std::tuple<at::DataPtr, size_t> getRecordWithKey(uint64_t key) {
return stream_reader.getRecordWithKey(key);
}
private:
std::ifstream in;
PyTorchStreamReader stream_reader;
};
class PyTorchFileWriter {
public:
PyTorchFileWriter(const std::string& filename) :
out(filename, std::ios_base::binary),
stream_writer(out) {}
uint64_t writeRecord(const char* data, size_t size) {
return stream_writer.writeRecord(data, size);
}
void writeEndOfFile() {
stream_writer.writeEndOfFile();
out.close();
}
private:
std::ofstream out;
PyTorchStreamWriter stream_writer;
};
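// A minimal usage sketch (the file name and payload helper are hypothetical;
// error handling omitted):
//
//   std::vector<char> payload = getSomeBytes();  // hypothetical helper
//   PyTorchFileWriter writer("model.pt");
//   uint64_t key = writer.writeRecord(payload.data(), payload.size());
//   writer.writeEndOfFile();
//
//   PyTorchFileReader reader("model.pt");
//   at::DataPtr data;
//   size_t size;
//   std::tie(data, size) = reader.getRecordWithKey(key);
//   // Alternatively, reader.getLastRecord() returns the record whose offset is
//   // stored in the footer.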
}} // namespace torch::jit