blob: 02505fa5d381d46bdfe272dcde58e7516d454302 [file] [log] [blame]
//! The n2 database stores information about previous builds for determining
//! which files are up to date.
use crate::file_pool::FilePool;
use crate::graph;
use crate::{
densemap, densemap::DenseMap, graph::BuildId, graph::Graph, graph::Hashes, hash::BuildHash,
};
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use std::collections::HashMap;
use std::convert::TryInto;
use std::fmt::Display;
use std::fs::File;
use std::io::{BufWriter, Seek, Write};
use std::path::Path;
use std::sync::atomic::AtomicUsize;
use std::sync::Arc;
/// On-disk format version; bump whenever the record layout changes so old
/// databases are discarded rather than misparsed.
const VERSION: u32 = 2;
/// Size of the file header: 4 magic bytes ("n2db") + 4-byte LE version.
const DB_HEADER_SIZE: usize = 8;
/// Granularity used to split the file for parallel reading; the writer pads
/// with zeros so that no record ever straddles a chunk boundary.
const DB_CHUNK_SIZE: usize = 1024 * 1024; // 1 MB
/// Files are identified by integers that are stable across n2 executions.
#[derive(Debug, Clone, Copy)]
pub struct Id(u32);
impl densemap::Index for Id {
    fn index(&self) -> usize {
        self.0 as usize
    }
}
impl From<usize> for Id {
    fn from(u: usize) -> Id {
        // NOTE(review): `as` silently truncates for u >= 2^32; in practice ids
        // stay far below that because write_id panics well before 2^24.
        Id(u as u32)
    }
}
/// The loaded state of a database, as needed to make updates to the stored
/// state. Other state is directly loaded into the build graph.
#[derive(Default)]
pub struct IdMap {
    /// Maps db::Id to FileId.
    fileids: DenseMap<Id, Arc<graph::File>>,
    /// Maps FileId to db::Id. Keyed by pointer identity of the Arc'd file;
    /// presumably graph::File instances are interned (see id_from_canonical),
    /// so the address uniquely identifies a file — TODO confirm.
    db_ids: HashMap<*const graph::File, Id>,
}
// SAFETY: send is not automatically implemented for IdMap because it has
// a raw pointer in it. But we're only using those pointers as map keys,
// and never dereferencing them, so they're essentially integers.
unsafe impl Send for IdMap {}
/// Returns how many bytes remain between `offset` and the end of the chunk
/// containing it. The first `header_size` bytes are not part of any chunk;
/// offsets inside the header report the distance to the end of the first
/// chunk. An offset sitting exactly on a chunk boundary reports a full
/// `chunk_size`, since the boundary byte begins a fresh chunk.
fn distance_to_end_of_chunk(offset: usize, header_size: usize, chunk_size: usize) -> usize {
    if offset < header_size {
        // Inside the header: the first chunk ends at header + chunk.
        return header_size + chunk_size - offset;
    }
    let within = (offset - header_size) % chunk_size;
    // within == 0 means we are on a boundary, i.e. a full chunk remains.
    chunk_size - within
}
#[cfg(test)]
#[test]
fn test_distance_to_end_of_chunk() {
    // (offset, expected distance) with header_size = 8, chunk_size = 10.
    // Covers the header, both sides of two chunk boundaries, and interior
    // offsets.
    let cases = [
        (0, 18),
        (1, 17),
        (8, 10),
        (9, 9),
        (17, 1),
        (18, 10),
        (27, 1),
        (28, 10),
        (29, 9),
    ];
    for (offset, want) in cases {
        assert_eq!(distance_to_end_of_chunk(offset, 8, 10), want, "offset={offset}");
    }
}
/// An opened database, ready for writes.
pub struct Writer {
    /// Mapping between db ids and graph files; tracks which paths have
    /// already been written to the file.
    ids: IdMap,
    /// Buffered handle to the database file.
    w: BufWriter<File>,
    /// Current byte offset in the file; drives chunk-padding decisions.
    offset: usize,
}
impl Writer {
    /// Creates a fresh database file at `path` and writes the header.
    fn create(path: &Path) -> std::io::Result<Self> {
        let f = std::fs::File::create(path)?;
        let mut w = Self::from_opened(IdMap::default(), f)?;
        assert_eq!(w.offset, 0);
        w.write_signature()?;
        assert_eq!(w.offset, DB_HEADER_SIZE);
        Ok(w)
    }

    /// Wraps an already-opened database file, positioned to append at its end.
    fn from_opened(ids: IdMap, mut w: File) -> std::io::Result<Self> {
        let offset = w.seek(std::io::SeekFrom::End(0))?.try_into().unwrap();
        Ok(Writer {
            ids,
            w: BufWriter::new(w),
            offset,
        })
    }

    /// Writes raw bytes and advances the tracked file offset.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<()> {
        self.offset += buf.len();
        self.w.write_all(buf)
    }

    fn write_u16(&mut self, n: u16) -> std::io::Result<()> {
        self.write(&n.to_le_bytes())
    }

    /// Writes the low three bytes of `n` in little-endian order.
    /// Callers must guarantee `n < 1 << 24`.
    fn write_u24(&mut self, n: u32) -> std::io::Result<()> {
        self.write(&n.to_le_bytes()[..3])
    }

    fn write_u64(&mut self, n: u64) -> std::io::Result<()> {
        self.write(&n.to_le_bytes())
    }

    /// Writes a u16-length-prefixed string. A zero length is reserved as
    /// chunk padding by the reader, so empty strings are rejected here.
    fn write_str(&mut self, s: &str) -> std::io::Result<()> {
        assert!(!s.is_empty());
        self.write_u16(s.len() as u16)?;
        self.write(s.as_bytes())
    }

    fn write_id(&mut self, id: Id) -> std::io::Result<()> {
        // u24 holds values up to (1 << 24) - 1, so 1 << 24 itself must be
        // rejected too; a plain `>` here would let it through and write_u24
        // would silently truncate it to 0.
        if id.0 >= (1 << 24) {
            panic!("too many fileids");
        }
        self.write_u24(id.0)
    }

    /// Writes the "n2db" magic plus format version, flushing immediately.
    fn write_signature(&mut self) -> std::io::Result<()> {
        self.write("n2db".as_bytes())?;
        self.write(&u32::to_le_bytes(VERSION))?;
        // Must flush this because the writer's destructor may not be called.
        // (we use std::mem::forget for performance reasons)
        self.w.flush()?;
        Ok(())
    }

    /// Pads with zeros up to the next chunk boundary if a record of
    /// `record_size` bytes would otherwise straddle it. The parallel reader
    /// relies on records never crossing chunk boundaries.
    fn ensure_space_for_record(&mut self, record_size: usize) -> std::io::Result<()> {
        if record_size > DB_CHUNK_SIZE {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "Record was bigger than the database chunk size",
            ));
        }
        let d2eoc = distance_to_end_of_chunk(self.offset, DB_HEADER_SIZE, DB_CHUNK_SIZE);
        if record_size > d2eoc {
            // `static` keeps the 1 MB zero buffer in read-only data; a
            // `const` could be re-materialized (on the stack) at each use
            // (clippy::large_const_arrays).
            static ZEROS: [u8; DB_CHUNK_SIZE] = [0; DB_CHUNK_SIZE];
            self.write(&ZEROS[..d2eoc])?;
        }
        Ok(())
    }

    /// Writes a path record: u16 length (high bit clear) + the name bytes.
    fn write_path(&mut self, name: &str) -> std::io::Result<()> {
        // The high bit of the length prefix marks build records, so path
        // lengths are limited to 15 bits.
        if name.len() >= 0b1000_0000_0000_0000 {
            panic!("filename too long");
        }
        self.ensure_space_for_record(2 + name.len())?;
        self.write_str(name)
        // We don't need to flush paths because they're always written as
        // part of a build, and the build will flush.
    }

    /// Returns the db id for `file`, assigning a fresh id (and writing a
    /// path record) the first time a file is seen.
    fn ensure_id(&mut self, file: &Arc<graph::File>) -> std::io::Result<Id> {
        let id = match self.ids.db_ids.get(&(file.as_ref() as *const graph::File)) {
            Some(&id) => id,
            None => {
                let id = self.ids.fileids.push(file.clone());
                self.ids
                    .db_ids
                    .insert(file.as_ref() as *const graph::File, id);
                self.write_path(&file.name)?;
                id
            }
        };
        Ok(id)
    }

    /// Appends a build record: u16 mark (output count with the high bit
    /// set), 3-byte output ids, u16 discovered-dep count, 3-byte dep ids,
    /// and the 8-byte build hash.
    pub fn write_build(
        &mut self,
        graph: &Graph,
        id: BuildId,
        hash: BuildHash,
    ) -> std::io::Result<()> {
        let build = &graph.builds[id];
        // Run ensure_id on all the ids we will use first, because ensure_id
        // may write a filepath to the output file, and we don't want that
        // write to occur in the middle of a build record.
        let outs = build.outs();
        let deps = build.discovered_ins();
        for out in outs {
            self.ensure_id(out)?;
        }
        for dep in deps {
            self.ensure_id(dep)?;
        }
        // Guard the u16 casts below: silently truncating either count would
        // corrupt the record framing for every later record in the file.
        assert!(outs.len() < (1 << 15), "too many outputs in one build");
        assert!(deps.len() <= u16::MAX as usize, "too many discovered deps");
        let record_size = 2 + 3 * outs.len() + 2 + 3 * deps.len() + 8;
        self.ensure_space_for_record(record_size)?;
        let initial_offset = self.offset;
        // The high bit distinguishes build records from path records.
        let mark = (outs.len() as u16) | 0b1000_0000_0000_0000;
        self.write_u16(mark)?;
        for out in outs {
            let id = self.ensure_id(out)?;
            self.write_id(id)?;
        }
        self.write_u16(deps.len() as u16)?;
        for dep in deps {
            let id = self.ensure_id(dep)?;
            self.write_id(id)?;
        }
        self.write_u64(hash.0)?;
        assert_eq!(initial_offset + record_size, self.offset);
        // Flush after writing builds to try and ensure there's a valid
        // database file even if n2 is killed while running.
        self.w.flush()?;
        Ok(())
    }
}
/// Little-endian decoding of database records from a byte source; each read
/// consumes the bytes it parsed.
trait ByteReader {
    fn read_u16(&mut self) -> std::io::Result<u16>;
    /// Reads 3 bytes as a little-endian u32 (high byte zero).
    fn read_u24(&mut self) -> std::io::Result<u32>;
    fn read_u64(&mut self) -> std::io::Result<u64>;
    /// Reads exactly `len` bytes as a string.
    fn read_str(&mut self, len: usize) -> std::io::Result<String>;
    /// File ids are stored on disk as 24-bit values.
    fn read_id(&mut self) -> std::io::Result<Id> {
        self.read_u24().map(Id)
    }
}
/// Implementation over a byte slice: each successful read shrinks the slice
/// from the front, so `self` always points at the unconsumed tail.
impl ByteReader for &[u8] {
    fn read_u16(&mut self) -> std::io::Result<u16> {
        let result = match self.first_chunk::<2>() {
            Some(c) => u16::from_le_bytes(*c),
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "EOF",
                ))
            }
        };
        *self = &self[2..];
        Ok(result)
    }
    fn read_u24(&mut self) -> std::io::Result<u32> {
        let result = match self.first_chunk::<3>() {
            Some(c) => {
                // Widen to 4 bytes with a zero high byte.
                let buf: [u8; 4] = [c[0], c[1], c[2], 0];
                u32::from_le_bytes(buf)
            }
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "EOF",
                ))
            }
        };
        *self = &self[3..];
        Ok(result)
    }
    fn read_u64(&mut self) -> std::io::Result<u64> {
        let result = match self.first_chunk::<8>() {
            Some(c) => u64::from_le_bytes(*c),
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "EOF",
                ))
            }
        };
        *self = &self[8..];
        Ok(result)
    }
    fn read_str(&mut self, len: usize) -> std::io::Result<String> {
        if self.len() < len {
            return Err(std::io::Error::new(
                std::io::ErrorKind::UnexpectedEof,
                "EOF",
            ));
        }
        let (bytes, rest) = self.split_at(len);
        // Validate the UTF-8 instead of using from_utf8_unchecked: the
        // database file may be corrupt or truncated mid-record, and feeding
        // invalid bytes to from_utf8_unchecked is undefined behavior.
        // Validation is cheap relative to reading the file itself.
        let s = std::str::from_utf8(bytes)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?
            .to_owned();
        *self = rest;
        Ok(s)
    }
}
/// Errors that can occur while loading the on-disk database.
#[derive(Debug)]
enum ReadError {
    /// The file is not an n2 database (bad magic bytes or truncated header).
    InvalidSignature,
    /// The database was written by a different format version.
    InvalidVersion,
    /// An underlying I/O failure while reading.
    IoError(std::io::Error),
}
impl Display for ReadError {
    /// Human-readable form, used when surfacing database errors to the user.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ReadError::InvalidSignature => f.write_str("invalid database header"),
            ReadError::InvalidVersion => f.write_str("invalid database version"),
            ReadError::IoError(err) => Display::fmt(err, f),
        }
    }
}
impl std::error::Error for ReadError {}
// Lets `?` convert io::Results into ReadError inside read().
impl From<std::io::Error> for ReadError {
    fn from(value: std::io::Error) -> Self {
        ReadError::IoError(value)
    }
}
/// A single record parsed from the database file.
enum Record {
    /// A path record; assigns the next sequential db Id to this file.
    File(Arc<graph::File>),
    /// A build record: its outputs, discovered deps, and build hash.
    Build {
        outs: Vec<Id>,
        discovered_deps: Vec<Id>,
        hash: BuildHash,
    },
}
/// Parses a path record: `len` bytes of file name, interned into the graph.
fn read_path(data: &mut &[u8], len: usize, graph: &Graph) -> std::io::Result<Arc<graph::File>> {
    let name = data.read_str(len)?;
    // No canonicalization needed, paths were written canonicalized.
    Ok(graph.files.id_from_canonical(name))
}
/// Parses a build record. `len` is the number of output ids, taken from the
/// record's mark word (the caller has already stripped the high bit).
fn read_build(data: &mut &[u8], len: usize) -> std::io::Result<Record> {
    // This record logs a build. We expect all the outputs to be
    // outputs of the same build id; if not, that means the graph has
    // changed since this log, in which case we just ignore it.
    //
    // It's possible we log a build that generates files A B, then
    // change the build file such that it only generates file A; this
    // logic will still attach the old dependencies to A, but it
    // shouldn't matter because the changed command line will cause us
    // to rebuild A regardless, and these dependencies are only used
    // to affect dirty checking, not build order.
    //
    // Fallible collects short-circuit on the first read error; the exact
    // size_hint of the ranges preserves the with_capacity behavior.
    let outs = (0..len)
        .map(|_| data.read_id())
        .collect::<std::io::Result<Vec<_>>>()?;
    let num_deps = data.read_u16()? as usize;
    let discovered_deps = (0..num_deps)
        .map(|_| data.read_id())
        .collect::<std::io::Result<Vec<_>>>()?;
    Ok(Record::Build {
        outs,
        discovered_deps,
        hash: BuildHash(data.read_u64()?),
    })
}
/// Validates the 8-byte header: "n2db" magic followed by a little-endian
/// u32 format version that must match VERSION exactly.
fn read_signature(data: &[u8]) -> Result<(), ReadError> {
    let header = data.get(..8).ok_or(ReadError::InvalidSignature)?;
    let (magic, version_bytes) = header.split_at(4);
    if magic != b"n2db" {
        return Err(ReadError::InvalidSignature);
    }
    let version = u32::from_le_bytes(version_bytes.try_into().unwrap());
    if version == VERSION {
        Ok(())
    } else {
        Err(ReadError::InvalidVersion)
    }
}
/// Reads an on-disk database, loading its state into the provided Graph/Hashes.
///
/// Returns the id mappings the Writer needs to append further records.
fn read(path: &Path, graph: &mut Graph, hashes: &mut Hashes) -> Result<IdMap, ReadError> {
    let file_pool = FilePool::new();
    let data = file_pool.read_file(path)?;
    read_signature(data)?;
    // First, collect all the records from the file. We'll do this in parallel,
    // splitting the database into fixed size chunks. The size of the chunks is
    // hardcoded, and the writing code needs to take care to pad to the chunk
    // size so that this split is valid.
    let num_file_records = AtomicUsize::new(0);
    let records = data[DB_HEADER_SIZE..]
        .chunks(DB_CHUNK_SIZE)
        .collect::<Vec<_>>()
        .par_iter()
        .flat_map(|chunk: &&[u8]| -> Vec<Result<Record, std::io::Error>> {
            let mut chunk = *chunk;
            let mut result = Vec::new();
            loop {
                // Each record starts with a u16 whose high bit distinguishes
                // build records (set) from path records (clear).
                let mut len = match chunk.read_u16() {
                    Ok(r) => r,
                    // EOF here just means this chunk is fully consumed.
                    Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => break,
                    Err(err) => {
                        result.push(Err(err));
                        break;
                    }
                };
                let mask = 0b1000_0000_0000_0000;
                if len == 0 {
                    // do nothing, this is the padding at the end of the chunk
                    break;
                } else if len & mask == 0 {
                    num_file_records.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                    result
                        .push(read_path(&mut chunk, len as usize, graph).map(|x| Record::File(x)));
                } else {
                    len &= !mask;
                    result.push(read_build(&mut chunk, len as usize));
                }
            }
            result
        })
        // Collecting into Result short-circuits on the first I/O error.
        .collect::<Result<Vec<Record>, std::io::Error>>()?;
    // Drop the file_pool in parallel because munmap can be slow
    rayon::spawn(|| drop(file_pool));
    // Then, loop over all the records in serial, populating the id maps
    // and filling out the hashes and discovered deps in the builds.
    // The only reason we do this in serial is so that the file records are
    // added in order, we could do those first and then the builds in parallel
    // after if needed.
    let num_file_records = num_file_records.load(std::sync::atomic::Ordering::Relaxed);
    let mut ids = IdMap::default();
    ids.fileids.reserve(num_file_records);
    ids.db_ids.reserve(num_file_records);
    for record in records.into_iter() {
        match record {
            Record::File(f) => {
                // Path records assign ids in file order, reproducing the ids
                // the writer handed out when the records were written.
                let ptr = f.as_ref() as *const graph::File;
                let id = ids.fileids.push(f);
                ids.db_ids.insert(ptr, id);
            }
            Record::Build {
                outs,
                discovered_deps,
                hash,
            } => {
                // Only apply this record if all of its outputs still belong
                // to one and the same build in the current graph; otherwise
                // the graph has changed since the log and we ignore it.
                let mut unique_bid = None;
                for out in outs {
                    match *ids.fileids[out].input.lock().unwrap() {
                        Some(bid) => match unique_bid {
                            None => unique_bid = Some(bid),
                            Some(x) if x == bid => {
                                // Ok, matches the existing id.
                            }
                            Some(_) => {
                                // mismatch, ignore this build
                                unique_bid = None;
                                break;
                            }
                        },
                        None => {
                            unique_bid = None;
                            break;
                        }
                    }
                }
                if let Some(id) = unique_bid {
                    let deps = discovered_deps
                        .into_iter()
                        .map(|d| ids.fileids[d].clone())
                        .collect();
                    // Common case: only one associated build.
                    graph.builds[id].set_discovered_ins(deps);
                    hashes.set(id, hash);
                }
            }
        }
    }
    Ok(ids)
}
/// Opens or creates an on-disk database, loading its state into the provided Graph.
pub fn open(path: &Path, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Result<Writer> {
    // Try to load the existing database. A missing file or a database from
    // an old format version both mean we start over with a fresh file; any
    // other failure is surfaced to the caller.
    let ids = match read(path, graph, hashes) {
        Ok(ids) => ids,
        Err(ReadError::InvalidVersion) => return Ok(Writer::create(path)?),
        Err(ReadError::IoError(err)) if err.kind() == std::io::ErrorKind::NotFound => {
            return Ok(Writer::create(path)?);
        }
        Err(err) => return Err(err.into()),
    };
    // Loaded successfully: reopen for appending new records.
    let file = std::fs::OpenOptions::new().append(true).open(path)?;
    let writer = Writer::from_opened(ids, file)?;
    assert!(writer.offset > 0);
    Ok(writer)
}