| //! The n2 database stores information about previous builds for determining |
| //! which files are up to date. |
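//!
//! On-disk layout, as implemented by the reader and writer below: an 8-byte
//! header (the signature "n2db" and a little-endian u32 version), followed by
//! records. Each record starts with a u16; if the high bit is set it is a
//! build record (output ids, discovered-dep ids, and a build hash), if it is
//! clear and nonzero it is a path record of that many bytes, and zero is
//! padding. Records never straddle a DB_CHUNK_SIZE boundary (counted from the
//! end of the header), which lets the reader parse chunks in parallel.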
| |
| use crate::file_pool::FilePool; |
| use crate::graph; |
| use crate::{ |
| densemap, densemap::DenseMap, graph::BuildId, graph::Graph, graph::Hashes, hash::BuildHash, |
| }; |
| use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; |
| use std::collections::HashMap; |
| use std::convert::TryInto; |
| use std::fmt::Display; |
| use std::fs::File; |
| use std::io::{BufWriter, Seek, Write}; |
| use std::path::Path; |
| use std::sync::atomic::AtomicUsize; |
| use std::sync::Arc; |
| |
| const VERSION: u32 = 2; |
| const DB_HEADER_SIZE: usize = 8; |
| const DB_CHUNK_SIZE: usize = 1024 * 1024; // 1 MB |
| |
| /// Files are identified by integers that are stable across n2 executions. |
| #[derive(Debug, Clone, Copy)] |
| pub struct Id(u32); |
| impl densemap::Index for Id { |
| fn index(&self) -> usize { |
| self.0 as usize |
| } |
| } |
| impl From<usize> for Id { |
| fn from(u: usize) -> Id { |
| Id(u as u32) |
| } |
| } |
| |
| /// The loaded state of a database, as needed to make updates to the stored |
| /// state. Other state is directly loaded into the build graph. |
| #[derive(Default)] |
| pub struct IdMap { |
    /// Maps db::Id to a file in the graph.
| fileids: DenseMap<Id, Arc<graph::File>>, |
    /// Maps a graph file (by pointer identity) back to its db::Id.
| db_ids: HashMap<*const graph::File, Id>, |
| } |
| |
// SAFETY: `Send` is not automatically implemented for IdMap because it
// contains a raw pointer. But we're only using those pointers as map keys,
// and never dereferencing them, so they're essentially integers.
| unsafe impl Send for IdMap {} |
| |
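/// Returns the number of bytes from `offset` to the end of the chunk that
/// contains it. The file is laid out as a header followed by fixed-size
/// chunks; offsets inside the header count toward the first chunk, and an
/// offset exactly on a chunk boundary has a full chunk ahead of it.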
| fn distance_to_end_of_chunk(offset: usize, header_size: usize, chunk_size: usize) -> usize { |
| let chunk_end = if offset < header_size { |
| header_size + chunk_size |
| } else if (offset - header_size) % chunk_size == 0 { |
| (offset - header_size + 1).next_multiple_of(chunk_size) + header_size |
| } else { |
| (offset - header_size).next_multiple_of(chunk_size) + header_size |
| }; |
| chunk_end - offset |
| } |
| |
| #[cfg(test)] |
| #[test] |
| fn test_distance_to_end_of_chunk() { |
| assert_eq!(distance_to_end_of_chunk(0, 8, 10), 18); |
| assert_eq!(distance_to_end_of_chunk(1, 8, 10), 17); |
| assert_eq!(distance_to_end_of_chunk(8, 8, 10), 10); |
| assert_eq!(distance_to_end_of_chunk(9, 8, 10), 9); |
| assert_eq!(distance_to_end_of_chunk(17, 8, 10), 1); |
| assert_eq!(distance_to_end_of_chunk(18, 8, 10), 10); |
| assert_eq!(distance_to_end_of_chunk(27, 8, 10), 1); |
| assert_eq!(distance_to_end_of_chunk(28, 8, 10), 10); |
| assert_eq!(distance_to_end_of_chunk(29, 8, 10), 9); |
| } |
| |
| /// An opened database, ready for writes. |
| pub struct Writer { |
| ids: IdMap, |
| w: BufWriter<File>, |
| offset: usize, |
| } |
| |
| impl Writer { |
| fn create(path: &Path) -> std::io::Result<Self> { |
| let f = std::fs::File::create(path)?; |
| let mut w = Self::from_opened(IdMap::default(), f)?; |
| assert_eq!(w.offset, 0); |
| w.write_signature()?; |
| assert_eq!(w.offset, DB_HEADER_SIZE); |
| Ok(w) |
| } |
| |
| fn from_opened(ids: IdMap, mut w: File) -> std::io::Result<Self> { |
| let offset = w.seek(std::io::SeekFrom::End(0))?.try_into().unwrap(); |
| Ok(Writer { |
| ids, |
| w: BufWriter::new(w), |
| offset, |
| }) |
| } |
| |
| fn write(&mut self, buf: &[u8]) -> std::io::Result<()> { |
| self.offset += buf.len(); |
| self.w.write_all(buf) |
| } |
| |
| fn write_u16(&mut self, n: u16) -> std::io::Result<()> { |
| self.write(&n.to_le_bytes()) |
| } |
| |
| fn write_u24(&mut self, n: u32) -> std::io::Result<()> { |
| self.write(&n.to_le_bytes()[..3]) |
| } |
| |
| fn write_u64(&mut self, n: u64) -> std::io::Result<()> { |
| self.write(&n.to_le_bytes()) |
| } |
| |
| fn write_str(&mut self, s: &str) -> std::io::Result<()> { |
        assert!(!s.is_empty());
| self.write_u16(s.len() as u16)?; |
| self.write(s.as_bytes()) |
| } |
| |
| fn write_id(&mut self, id: Id) -> std::io::Result<()> { |
        if id.0 >= (1 << 24) {
| panic!("too many fileids"); |
| } |
| self.write_u24(id.0) |
| } |
| |
| fn write_signature(&mut self) -> std::io::Result<()> { |
        self.write(b"n2db")?;
| self.write(&u32::to_le_bytes(VERSION))?; |
        // Must flush this because the writer's destructor may not be called
        // (we use std::mem::forget for performance reasons).
| self.w.flush()?; |
| Ok(()) |
| } |
| |
| fn ensure_space_for_record(&mut self, record_size: usize) -> std::io::Result<()> { |
| if record_size > DB_CHUNK_SIZE { |
| return Err(std::io::Error::new( |
| std::io::ErrorKind::Other, |
| "Record was bigger than the database chunk size", |
| )); |
| } |
| let d2eoc = distance_to_end_of_chunk(self.offset, DB_HEADER_SIZE, DB_CHUNK_SIZE); |
| if record_size > d2eoc { |
| const ZEROS: [u8; DB_CHUNK_SIZE] = [0; DB_CHUNK_SIZE]; |
| self.write(&ZEROS[..d2eoc])?; |
| } |
| Ok(()) |
| } |
| |
| fn write_path(&mut self, name: &str) -> std::io::Result<()> { |
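        // The high bit of the u16 length prefix marks build records (see
        // write_build), so a path's length must fit in 15 bits.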
| if name.len() >= 0b1000_0000_0000_0000 { |
| panic!("filename too long"); |
| } |
| self.ensure_space_for_record(2 + name.len())?; |
        self.write_str(name)
| // We don't need to flush paths because they're always written as |
| // part of a build, and the build will flush. |
| } |
| |
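    /// Returns the database id for a file, assigning a new id and writing a
    /// path record the first time the file is seen.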
| fn ensure_id(&mut self, file: &Arc<graph::File>) -> std::io::Result<Id> { |
| let id = match self.ids.db_ids.get(&(file.as_ref() as *const graph::File)) { |
| Some(&id) => id, |
| None => { |
| let id = self.ids.fileids.push(file.clone()); |
| self.ids |
| .db_ids |
| .insert(file.as_ref() as *const graph::File, id); |
| self.write_path(&file.name)?; |
| id |
| } |
| }; |
| Ok(id) |
| } |
| |
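    /// Appends a build record for `id`: a high-bit-marked u16 count of
    /// outputs followed by their ids, a u16 count of discovered deps followed
    /// by their ids, and finally the u64 build hash.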
| pub fn write_build( |
| &mut self, |
| graph: &Graph, |
| id: BuildId, |
| hash: BuildHash, |
| ) -> std::io::Result<()> { |
| let build = &graph.builds[id]; |
| |
        // Run ensure_id on all the ids we will use first, because ensure_id
        // may write a filepath to the output file, and we don't want that
        // write to occur in the middle of a build record.
| let outs = build.outs(); |
| let deps = build.discovered_ins(); |
| for out in outs { |
| self.ensure_id(&out)?; |
| } |
| for dep in deps { |
| self.ensure_id(&dep)?; |
| } |
| |
        let record_size = 2 + 3 * outs.len() + 2 + 3 * deps.len() + 8;
| self.ensure_space_for_record(record_size)?; |
| let initial_offset = self.offset; |
| |
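        // Set the high bit of the output count to mark this as a build
        // record rather than a path record.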
| let mark = (outs.len() as u16) | 0b1000_0000_0000_0000; |
| self.write_u16(mark)?; |
| for out in outs { |
| let id = self.ensure_id(&out)?; |
| self.write_id(id)?; |
| } |
| |
| self.write_u16(deps.len() as u16)?; |
| for dep in deps { |
| let id = self.ensure_id(&dep)?; |
| self.write_id(id)?; |
| } |
| |
| self.write_u64(hash.0)?; |
| assert_eq!(initial_offset + record_size, self.offset); |
        // Flush after writing builds to try to ensure there's a valid
        // database file even if n2 is killed while running.
| self.w.flush()?; |
| Ok(()) |
| } |
| } |
| |
| trait ByteReader { |
| fn read_u16(&mut self) -> std::io::Result<u16>; |
| fn read_u24(&mut self) -> std::io::Result<u32>; |
| fn read_u64(&mut self) -> std::io::Result<u64>; |
| fn read_str(&mut self, len: usize) -> std::io::Result<String>; |
| fn read_id(&mut self) -> std::io::Result<Id> { |
| self.read_u24().map(Id) |
| } |
| } |
| |
| impl ByteReader for &[u8] { |
| fn read_u16(&mut self) -> std::io::Result<u16> { |
| let result = match self.first_chunk::<2>() { |
| Some(c) => u16::from_le_bytes(*c), |
| None => { |
| return Err(std::io::Error::new( |
| std::io::ErrorKind::UnexpectedEof, |
| "EOF", |
| )) |
| } |
| }; |
| *self = &self[2..]; |
| Ok(result) |
| } |
| |
| fn read_u24(&mut self) -> std::io::Result<u32> { |
| let result = match self.first_chunk::<3>() { |
| Some(c) => { |
| let buf: [u8; 4] = [c[0], c[1], c[2], 0]; |
| u32::from_le_bytes(buf) |
| } |
| None => { |
| return Err(std::io::Error::new( |
| std::io::ErrorKind::UnexpectedEof, |
| "EOF", |
| )) |
| } |
| }; |
| *self = &self[3..]; |
| Ok(result) |
| } |
| |
| fn read_u64(&mut self) -> std::io::Result<u64> { |
| let result = match self.first_chunk::<8>() { |
| Some(c) => u64::from_le_bytes(*c), |
| None => { |
| return Err(std::io::Error::new( |
| std::io::ErrorKind::UnexpectedEof, |
| "EOF", |
| )) |
| } |
| }; |
| *self = &self[8..]; |
| Ok(result) |
| } |
| |
| fn read_str(&mut self, len: usize) -> std::io::Result<String> { |
| if self.len() < len { |
| return Err(std::io::Error::new( |
| std::io::ErrorKind::UnexpectedEof, |
| "EOF", |
| )); |
| } |
        let buf = self[..len].to_vec();
| *self = &self[len..]; |
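        // SAFETY: paths were written from valid `str`s, so the bytes are
        // assumed to be valid UTF-8; a corrupted database would break this
        // assumption.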
| Ok(unsafe { String::from_utf8_unchecked(buf) }) |
| } |
| } |
| |
| #[derive(Debug)] |
| enum ReadError { |
| InvalidSignature, |
| InvalidVersion, |
| IoError(std::io::Error), |
| } |
| |
| impl Display for ReadError { |
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| match self { |
| ReadError::InvalidSignature => write!(f, "invalid database header"), |
| ReadError::InvalidVersion => write!(f, "invalid database version"), |
| ReadError::IoError(e) => e.fmt(f), |
| } |
| } |
| } |
| |
| impl std::error::Error for ReadError {} |
| |
| impl From<std::io::Error> for ReadError { |
| fn from(value: std::io::Error) -> Self { |
| ReadError::IoError(value) |
| } |
| } |
| |
| enum Record { |
| File(Arc<graph::File>), |
| Build { |
| outs: Vec<Id>, |
| discovered_deps: Vec<Id>, |
| hash: BuildHash, |
| }, |
| } |
| |
| fn read_path(data: &mut &[u8], len: usize, graph: &Graph) -> std::io::Result<Arc<graph::File>> { |
| let name = data.read_str(len)?; |
| // No canonicalization needed, paths were written canonicalized. |
| Ok(graph.files.id_from_canonical(name)) |
| } |
| |
| fn read_build(data: &mut &[u8], len: usize) -> std::io::Result<Record> { |
| // This record logs a build. We expect all the outputs to be |
| // outputs of the same build id; if not, that means the graph has |
| // changed since this log, in which case we just ignore it. |
| // |
| // It's possible we log a build that generates files A B, then |
| // change the build file such that it only generates file A; this |
| // logic will still attach the old dependencies to A, but it |
| // shouldn't matter because the changed command line will cause us |
| // to rebuild A regardless, and these dependencies are only used |
| // to affect dirty checking, not build order. |
| |
| let mut outs = Vec::with_capacity(len); |
| for _ in 0..len { |
| outs.push(data.read_id()?); |
| } |
| |
| let len = data.read_u16()?; |
| let mut deps = Vec::with_capacity(len as usize); |
| for _ in 0..len { |
| deps.push(data.read_id()?); |
| } |
| |
| Ok(Record::Build { |
        outs,
| discovered_deps: deps, |
| hash: BuildHash(data.read_u64()?), |
| }) |
| } |
| |
| fn read_signature(data: &[u8]) -> Result<(), ReadError> { |
| if data.len() < 8 { |
| return Err(ReadError::InvalidSignature); |
| } |
    if &data[..4] != b"n2db" {
| return Err(ReadError::InvalidSignature); |
| } |
| let version = u32::from_le_bytes(data[4..8].try_into().unwrap()); |
| if version != VERSION { |
| return Err(ReadError::InvalidVersion); |
| } |
| Ok(()) |
| } |
| |
| /// Reads an on-disk database, loading its state into the provided Graph/Hashes. |
| fn read(path: &Path, graph: &mut Graph, hashes: &mut Hashes) -> Result<IdMap, ReadError> { |
| let file_pool = FilePool::new(); |
| let data = file_pool.read_file(path)?; |
| |
| read_signature(data)?; |
| |
| // First, collect all the records from the file. We'll do this in parallel, |
| // splitting the database into fixed size chunks. The size of the chunks is |
| // hardcoded, and the writing code needs to take care to pad to the chunk |
| // size so that this split is valid. |
| let num_file_records = AtomicUsize::new(0); |
| let records = data[DB_HEADER_SIZE..] |
| .chunks(DB_CHUNK_SIZE) |
| .collect::<Vec<_>>() |
| .par_iter() |
| .flat_map(|chunk: &&[u8]| -> Vec<Result<Record, std::io::Error>> { |
| let mut chunk = *chunk; |
| let mut result = Vec::new(); |
| loop { |
| let mut len = match chunk.read_u16() { |
| Ok(r) => r, |
| Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => break, |
| Err(err) => { |
| result.push(Err(err)); |
| break; |
| } |
| }; |
| let mask = 0b1000_0000_0000_0000; |
                if len == 0 {
                    // A zero length is the padding at the end of the chunk;
                    // there are no more records here.
                    break;
                } else if len & mask == 0 {
| num_file_records.fetch_add(1, std::sync::atomic::Ordering::Relaxed); |
                    result.push(read_path(&mut chunk, len as usize, graph).map(Record::File));
| } else { |
| len &= !mask; |
| result.push(read_build(&mut chunk, len as usize)); |
| } |
| } |
| result |
| }) |
| .collect::<Result<Vec<Record>, std::io::Error>>()?; |
| |
| // Drop the file_pool in parallel because munmap can be slow |
| rayon::spawn(|| drop(file_pool)); |
| |
    // Then, loop over all the records in serial, populating the id maps
    // and filling out the hashes and discovered deps in the builds.
    // The only reason we do this in serial is so that the file records are
    // added in order; we could add the files first and then process the
    // builds in parallel afterwards if needed.
| let num_file_records = num_file_records.load(std::sync::atomic::Ordering::Relaxed); |
| let mut ids = IdMap::default(); |
| ids.fileids.reserve(num_file_records); |
| ids.db_ids.reserve(num_file_records); |
| for record in records.into_iter() { |
| match record { |
| Record::File(f) => { |
| let ptr = f.as_ref() as *const graph::File; |
| let id = ids.fileids.push(f); |
| ids.db_ids.insert(ptr, id); |
| } |
| Record::Build { |
| outs, |
| discovered_deps, |
| hash, |
| } => { |
| let mut unique_bid = None; |
| for out in outs { |
| match *ids.fileids[out].input.lock().unwrap() { |
| Some(bid) => match unique_bid { |
| None => unique_bid = Some(bid), |
| Some(x) if x == bid => { |
| // Ok, matches the existing id. |
| } |
| Some(_) => { |
| // mismatch, ignore this build |
| unique_bid = None; |
| break; |
| } |
| }, |
| None => { |
| unique_bid = None; |
| break; |
| } |
| } |
| } |
| if let Some(id) = unique_bid { |
| let deps = discovered_deps |
| .into_iter() |
| .map(|d| ids.fileids[d].clone()) |
| .collect(); |
| // Common case: only one associated build. |
| graph.builds[id].set_discovered_ins(deps); |
| hashes.set(id, hash); |
| } |
| } |
| } |
| } |
| |
| Ok(ids) |
| } |
| |
/// Opens or creates an on-disk database, loading its state into the provided
/// Graph and Hashes.
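/// A missing database, or one written with a different format version, is
/// recreated from scratch.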
| pub fn open(path: &Path, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Result<Writer> { |
| match read(path, graph, hashes) { |
| Ok(ids) => { |
| let f = std::fs::OpenOptions::new().append(true).open(path)?; |
| let w = Writer::from_opened(ids, f)?; |
| assert!(w.offset > 0); |
| Ok(w) |
| } |
| Err(ReadError::InvalidVersion) => Ok(Writer::create(path)?), |
| Err(ReadError::IoError(err)) if err.kind() == std::io::ErrorKind::NotFound => { |
| Ok(Writer::create(path)?) |
| } |
| Err(e) => Err(e)?, |
| } |
| } |