blob: 02505fa5d381d46bdfe272dcde58e7516d454302 [file] [log] [blame]
//! The n2 database stores information about previous builds for determining
//! which files are up to date.
use crate::file_pool::FilePool;
use crate::graph;
use crate::{
densemap, densemap::DenseMap, graph::BuildId, graph::Graph, graph::Hashes, hash::BuildHash,
};
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use std::collections::HashMap;
use std::convert::TryInto;
use std::fmt::Display;
use std::fs::File;
use std::io::{BufWriter, Seek, Write};
use std::path::Path;
use std::sync::atomic::AtomicUsize;
use std::sync::Arc;
/// On-disk format version; bump whenever the record layout changes so old
/// databases are discarded rather than misparsed.
const VERSION: u32 = 2;
/// Size of the file header: 4 magic bytes ("n2db") + 4-byte LE version.
const DB_HEADER_SIZE: usize = 8;
/// Granularity used to split the file for parallel reading; the writer pads
/// with zeros so that no record ever straddles a chunk boundary.
const DB_CHUNK_SIZE: usize = 1024 * 1024; // 1 MB
/// Files are identified by integers that are stable across n2 executions.
#[derive(Debug, Clone, Copy)]
pub struct Id(u32);
impl densemap::Index for Id {
    fn index(&self) -> usize {
        self.0 as usize
    }
}
impl From<usize> for Id {
    fn from(u: usize) -> Id {
        // NOTE(review): `as` silently truncates for u >= 2^32; in practice ids
        // stay far below that because write_id panics well before 2^24.
        Id(u as u32)
    }
}
/// The loaded state of a database, as needed to make updates to the stored
/// state. Other state is directly loaded into the build graph.
#[derive(Default)]
pub struct IdMap {
    /// Maps db::Id to FileId.
    fileids: DenseMap<Id, Arc<graph::File>>,
    /// Maps FileId to db::Id. Keyed by pointer identity of the Arc'd file;
    /// presumably graph::File instances are interned (see id_from_canonical),
    /// so the address uniquely identifies a file — TODO confirm.
    db_ids: HashMap<*const graph::File, Id>,
}
// SAFETY: send is not automatically implemented for IdMap because it has
// a raw pointer in it. But we're only using those pointers as map keys,
// and never dereferencing them, so they're essentially integers.
unsafe impl Send for IdMap {}
/// Returns how many bytes remain between `offset` and the end of the chunk
/// containing it. The first `header_size` bytes are not part of any chunk;
/// offsets inside the header report the distance to the end of the first
/// chunk. An offset sitting exactly on a chunk boundary reports a full
/// `chunk_size`, since the boundary byte begins a fresh chunk.
fn distance_to_end_of_chunk(offset: usize, header_size: usize, chunk_size: usize) -> usize {
    if offset < header_size {
        // Inside the header: the first chunk ends at header + chunk.
        return header_size + chunk_size - offset;
    }
    let within = (offset - header_size) % chunk_size;
    // within == 0 means we are on a boundary, i.e. a full chunk remains.
    chunk_size - within
}
#[cfg(test)]
#[test]
fn test_distance_to_end_of_chunk() {
    // (offset, expected distance) with header_size = 8, chunk_size = 10.
    // Covers the header, both sides of two chunk boundaries, and interior
    // offsets.
    let cases = [
        (0, 18),
        (1, 17),
        (8, 10),
        (9, 9),
        (17, 1),
        (18, 10),
        (27, 1),
        (28, 10),
        (29, 9),
    ];
    for (offset, want) in cases {
        assert_eq!(distance_to_end_of_chunk(offset, 8, 10), want, "offset={offset}");
    }
}
/// An opened database, ready for writes.
pub struct Writer {
    /// Mapping between db ids and graph files; tracks which paths have
    /// already been written to the file.
    ids: IdMap,
    /// Buffered handle to the database file.
    w: BufWriter<File>,
    /// Current byte offset in the file; drives chunk-padding decisions.
    offset: usize,
}
impl Writer {
    /// Creates a fresh database file at `path` and writes the header.
    fn create(path: &Path) -> std::io::Result<Self> {
        let f = std::fs::File::create(path)?;
        let mut w = Self::from_opened(IdMap::default(), f)?;
        assert_eq!(w.offset, 0);
        w.write_signature()?;
        assert_eq!(w.offset, DB_HEADER_SIZE);
        Ok(w)
    }

    /// Wraps an already-opened database file, positioned to append at its end.
    fn from_opened(ids: IdMap, mut w: File) -> std::io::Result<Self> {
        let offset = w.seek(std::io::SeekFrom::End(0))?.try_into().unwrap();
        Ok(Writer {
            ids,
            w: BufWriter::new(w),
            offset,
        })
    }

    /// Writes raw bytes and advances the tracked file offset.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<()> {
        self.offset += buf.len();
        self.w.write_all(buf)
    }

    fn write_u16(&mut self, n: u16) -> std::io::Result<()> {
        self.write(&n.to_le_bytes())
    }

    /// Writes the low three bytes of `n` in little-endian order.
    /// Callers must guarantee `n < 1 << 24`.
    fn write_u24(&mut self, n: u32) -> std::io::Result<()> {
        self.write(&n.to_le_bytes()[..3])
    }

    fn write_u64(&mut self, n: u64) -> std::io::Result<()> {
        self.write(&n.to_le_bytes())
    }

    /// Writes a u16-length-prefixed string. A zero length is reserved as
    /// chunk padding by the reader, so empty strings are rejected here.
    fn write_str(&mut self, s: &str) -> std::io::Result<()> {
        assert!(!s.is_empty());
        self.write_u16(s.len() as u16)?;
        self.write(s.as_bytes())
    }

    fn write_id(&mut self, id: Id) -> std::io::Result<()> {
        // u24 holds values up to (1 << 24) - 1, so 1 << 24 itself must be
        // rejected too; a plain `>` here would let it through and write_u24
        // would silently truncate it to 0.
        if id.0 >= (1 << 24) {
            panic!("too many fileids");
        }
        self.write_u24(id.0)
    }

    /// Writes the "n2db" magic plus format version, flushing immediately.
    fn write_signature(&mut self) -> std::io::Result<()> {
        self.write("n2db".as_bytes())?;
        self.write(&u32::to_le_bytes(VERSION))?;
        // Must flush this because the writer's destructor may not be called.
        // (we use std::mem::forget for performance reasons)
        self.w.flush()?;
        Ok(())
    }

    /// Pads with zeros up to the next chunk boundary if a record of
    /// `record_size` bytes would otherwise straddle it. The parallel reader
    /// relies on records never crossing chunk boundaries.
    fn ensure_space_for_record(&mut self, record_size: usize) -> std::io::Result<()> {
        if record_size > DB_CHUNK_SIZE {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "Record was bigger than the database chunk size",
            ));
        }
        let d2eoc = distance_to_end_of_chunk(self.offset, DB_HEADER_SIZE, DB_CHUNK_SIZE);
        if record_size > d2eoc {
            // `static` keeps the 1 MB zero buffer in read-only data; a
            // `const` could be re-materialized (on the stack) at each use
            // (clippy::large_const_arrays).
            static ZEROS: [u8; DB_CHUNK_SIZE] = [0; DB_CHUNK_SIZE];
            self.write(&ZEROS[..d2eoc])?;
        }
        Ok(())
    }

    /// Writes a path record: u16 length (high bit clear) + the name bytes.
    fn write_path(&mut self, name: &str) -> std::io::Result<()> {
        // The high bit of the length prefix marks build records, so path
        // lengths are limited to 15 bits.
        if name.len() >= 0b1000_0000_0000_0000 {
            panic!("filename too long");
        }
        self.ensure_space_for_record(2 + name.len())?;
        self.write_str(name)
        // We don't need to flush paths because they're always written as
        // part of a build, and the build will flush.
    }

    /// Returns the db id for `file`, assigning a fresh id (and writing a
    /// path record) the first time a file is seen.
    fn ensure_id(&mut self, file: &Arc<graph::File>) -> std::io::Result<Id> {
        let id = match self.ids.db_ids.get(&(file.as_ref() as *const graph::File)) {
            Some(&id) => id,
            None => {
                let id = self.ids.fileids.push(file.clone());
                self.ids
                    .db_ids
                    .insert(file.as_ref() as *const graph::File, id);
                self.write_path(&file.name)?;
                id
            }
        };
        Ok(id)
    }

    /// Appends a build record: u16 mark (output count with the high bit
    /// set), 3-byte output ids, u16 discovered-dep count, 3-byte dep ids,
    /// and the 8-byte build hash.
    pub fn write_build(
        &mut self,
        graph: &Graph,
        id: BuildId,
        hash: BuildHash,
    ) -> std::io::Result<()> {
        let build = &graph.builds[id];
        // Run ensure_id on all the ids we will use first, because ensure_id
        // may write a filepath to the output file, and we don't want that
        // write to occur in the middle of a build record.
        let outs = build.outs();
        let deps = build.discovered_ins();
        for out in outs {
            self.ensure_id(out)?;
        }
        for dep in deps {
            self.ensure_id(dep)?;
        }
        // Guard the u16 casts below: silently truncating either count would
        // corrupt the record framing for every later record in the file.
        assert!(outs.len() < (1 << 15), "too many outputs in one build");
        assert!(deps.len() <= u16::MAX as usize, "too many discovered deps");
        let record_size = 2 + 3 * outs.len() + 2 + 3 * deps.len() + 8;
        self.ensure_space_for_record(record_size)?;
        let initial_offset = self.offset;
        // The high bit distinguishes build records from path records.
        let mark = (outs.len() as u16) | 0b1000_0000_0000_0000;
        self.write_u16(mark)?;
        for out in outs {
            let id = self.ensure_id(out)?;
            self.write_id(id)?;
        }
        self.write_u16(deps.len() as u16)?;
        for dep in deps {
            let id = self.ensure_id(dep)?;
            self.write_id(id)?;
        }
        self.write_u64(hash.0)?;
        assert_eq!(initial_offset + record_size, self.offset);
        // Flush after writing builds to try and ensure there's a valid
        // database file even if n2 is killed while running.
        self.w.flush()?;
        Ok(())
    }
}
/// Little-endian decoding of database records from a byte source; each read
/// consumes the bytes it parsed.
trait ByteReader {
    fn read_u16(&mut self) -> std::io::Result<u16>;
    /// Reads 3 bytes as a little-endian u32 (high byte zero).
    fn read_u24(&mut self) -> std::io::Result<u32>;
    fn read_u64(&mut self) -> std::io::Result<u64>;
    /// Reads exactly `len` bytes as a string.
    fn read_str(&mut self, len: usize) -> std::io::Result<String>;
    /// File ids are stored on disk as 24-bit values.
    fn read_id(&mut self) -> std::io::Result<Id> {
        self.read_u24().map(Id)
    }
}
/// Implementation over a byte slice: each successful read shrinks the slice
/// from the front, so `self` always points at the unconsumed tail.
impl ByteReader for &[u8] {
    fn read_u16(&mut self) -> std::io::Result<u16> {
        let result = match self.first_chunk::<2>() {
            Some(c) => u16::from_le_bytes(*c),
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "EOF",
                ))
            }
        };
        *self = &self[2..];
        Ok(result)
    }
    fn read_u24(&mut self) -> std::io::Result<u32> {
        let result = match self.first_chunk::<3>() {
            Some(c) => {
                // Widen to 4 bytes with a zero high byte.
                let buf: [u8; 4] = [c[0], c[1], c[2], 0];
                u32::from_le_bytes(buf)
            }
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "EOF",
                ))
            }
        };
        *self = &self[3..];
        Ok(result)
    }
    fn read_u64(&mut self) -> std::io::Result<u64> {
        let result = match self.first_chunk::<8>() {
            Some(c) => u64::from_le_bytes(*c),
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "EOF",
                ))
            }
        };
        *self = &self[8..];
        Ok(result)
    }
    fn read_str(&mut self, len: usize) -> std::io::Result<String> {
        if self.len() < len {
            return Err(std::io::Error::new(
                std::io::ErrorKind::UnexpectedEof,
                "EOF",
            ));
        }
        let (bytes, rest) = self.split_at(len);
        // Validate the UTF-8 instead of using from_utf8_unchecked: the
        // database file may be corrupt or truncated mid-record, and feeding
        // invalid bytes to from_utf8_unchecked is undefined behavior.
        // Validation is cheap relative to reading the file itself.
        let s = std::str::from_utf8(bytes)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?
            .to_owned();
        *self = rest;
        Ok(s)
    }
}
/// Errors that can occur while loading the on-disk database.
#[derive(Debug)]
enum ReadError {
    /// The file is not an n2 database (bad magic bytes or truncated header).
    InvalidSignature,
    /// The database was written by a different format version.
    InvalidVersion,
    /// An underlying I/O failure while reading.
    IoError(std::io::Error),
}
impl Display for ReadError {
    /// Human-readable form, used when surfacing database errors to the user.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ReadError::InvalidSignature => f.write_str("invalid database header"),
            ReadError::InvalidVersion => f.write_str("invalid database version"),
            ReadError::IoError(err) => Display::fmt(err, f),
        }
    }
}
impl std::error::Error for ReadError {}
// Lets `?` convert io::Results into ReadError inside read().
impl From<std::io::Error> for ReadError {
    fn from(value: std::io::Error) -> Self {
        ReadError::IoError(value)
    }
}
/// A single record parsed from the database file.
enum Record {
    /// A path record; assigns the next sequential db Id to this file.
    File(Arc<graph::File>),
    /// A build record: its outputs, discovered deps, and build hash.
    Build {
        outs: Vec<Id>,
        discovered_deps: Vec<Id>,
        hash: BuildHash,
    },
}
/// Parses a path record: `len` bytes of file name, interned into the graph.
fn read_path(data: &mut &[u8], len: usize, graph: &Graph) -> std::io::Result<Arc<graph::File>> {
    let name = data.read_str(len)?;
    // No canonicalization needed, paths were written canonicalized.
    Ok(graph.files.id_from_canonical(name))
}
/// Parses a build record. `len` is the number of output ids, taken from the
/// record's mark word (the caller has already stripped the high bit).
fn read_build(data: &mut &[u8], len: usize) -> std::io::Result<Record> {
    // This record logs a build. We expect all the outputs to be
    // outputs of the same build id; if not, that means the graph has
    // changed since this log, in which case we just ignore it.
    //
    // It's possible we log a build that generates files A B, then
    // change the build file such that it only generates file A; this
    // logic will still attach the old dependencies to A, but it
    // shouldn't matter because the changed command line will cause us
    // to rebuild A regardless, and these dependencies are only used
    // to affect dirty checking, not build order.
    //
    // Fallible collects short-circuit on the first read error; the exact
    // size_hint of the ranges preserves the with_capacity behavior.
    let outs = (0..len)
        .map(|_| data.read_id())
        .collect::<std::io::Result<Vec<_>>>()?;
    let num_deps = data.read_u16()? as usize;
    let discovered_deps = (0..num_deps)
        .map(|_| data.read_id())
        .collect::<std::io::Result<Vec<_>>>()?;
    Ok(Record::Build {
        outs,
        discovered_deps,
        hash: BuildHash(data.read_u64()?),
    })
}
/// Validates the 8-byte header: "n2db" magic followed by a little-endian
/// u32 format version that must match VERSION exactly.
fn read_signature(data: &[u8]) -> Result<(), ReadError> {
    let header = data.get(..8).ok_or(ReadError::InvalidSignature)?;
    let (magic, version_bytes) = header.split_at(4);
    if magic != b"n2db" {
        return Err(ReadError::InvalidSignature);
    }
    let version = u32::from_le_bytes(version_bytes.try_into().unwrap());
    if version == VERSION {
        Ok(())
    } else {
        Err(ReadError::InvalidVersion)
    }
}
/// Reads an on-disk database, loading its state into the provided Graph/Hashes.
///
/// Returns the id mappings the Writer needs to append further records.
fn read(path: &Path, graph: &mut Graph, hashes: &mut Hashes) -> Result<IdMap, ReadError> {
    let file_pool = FilePool::new();
    let data = file_pool.read_file(path)?;
    read_signature(data)?;
    // First, collect all the records from the file. We'll do this in parallel,
    // splitting the database into fixed size chunks. The size of the chunks is
    // hardcoded, and the writing code needs to take care to pad to the chunk
    // size so that this split is valid.
    let num_file_records = AtomicUsize::new(0);
    let records = data[DB_HEADER_SIZE..]
        .chunks(DB_CHUNK_SIZE)
        .collect::<Vec<_>>()
        .par_iter()
        .flat_map(|chunk: &&[u8]| -> Vec<Result<Record, std::io::Error>> {
            let mut chunk = *chunk;
            let mut result = Vec::new();
            loop {
                // Each record starts with a u16 whose high bit distinguishes
                // build records (set) from path records (clear).
                let mut len = match chunk.read_u16() {
                    Ok(r) => r,
                    // EOF here just means this chunk is fully consumed.
                    Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => break,
                    Err(err) => {
                        result.push(Err(err));
                        break;
                    }
                };
                let mask = 0b1000_0000_0000_0000;
                if len == 0 {
                    // do nothing, this is the padding at the end of the chunk
                    break;
                } else if len & mask == 0 {
                    num_file_records.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                    result
                        .push(read_path(&mut chunk, len as usize, graph).map(|x| Record::File(x)));
                } else {
                    len &= !mask;
                    result.push(read_build(&mut chunk, len as usize));
                }
            }
            result
        })
        // Collecting into Result short-circuits on the first I/O error.
        .collect::<Result<Vec<Record>, std::io::Error>>()?;
    // Drop the file_pool in parallel because munmap can be slow
    rayon::spawn(|| drop(file_pool));
    // Then, loop over all the records in serial, populating the id maps
    // and filling out the hashes and discovered deps in the builds.
    // The only reason we do this in serial is so that the file records are
    // added in order, we could do those first and then the builds in parallel
    // after if needed.
    let num_file_records = num_file_records.load(std::sync::atomic::Ordering::Relaxed);
    let mut ids = IdMap::default();
    ids.fileids.reserve(num_file_records);
    ids.db_ids.reserve(num_file_records);
    for record in records.into_iter() {
        match record {
            Record::File(f) => {
                // Path records assign ids in file order, reproducing the ids
                // the writer handed out when the records were written.
                let ptr = f.as_ref() as *const graph::File;
                let id = ids.fileids.push(f);
                ids.db_ids.insert(ptr, id);
            }
            Record::Build {
                outs,
                discovered_deps,
                hash,
            } => {
                // Only apply this record if all of its outputs still belong
                // to one and the same build in the current graph; otherwise
                // the graph has changed since the log and we ignore it.
                let mut unique_bid = None;
                for out in outs {
                    match *ids.fileids[out].input.lock().unwrap() {
                        Some(bid) => match unique_bid {
                            None => unique_bid = Some(bid),
                            Some(x) if x == bid => {
                                // Ok, matches the existing id.
                            }
                            Some(_) => {
                                // mismatch, ignore this build
                                unique_bid = None;
                                break;
                            }
                        },
                        None => {
                            unique_bid = None;
                            break;
                        }
                    }
                }
                if let Some(id) = unique_bid {
                    let deps = discovered_deps
                        .into_iter()
                        .map(|d| ids.fileids[d].clone())
                        .collect();
                    // Common case: only one associated build.
                    graph.builds[id].set_discovered_ins(deps);
                    hashes.set(id, hash);
                }
            }
        }
    }
    Ok(ids)
}
/// Opens or creates an on-disk database, loading its state into the provided Graph.
pub fn open(path: &Path, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Result<Writer> {
    // Try to load the existing database. A missing file or a database from
    // an old format version both mean we start over with a fresh file; any
    // other failure is surfaced to the caller.
    let ids = match read(path, graph, hashes) {
        Ok(ids) => ids,
        Err(ReadError::InvalidVersion) => return Ok(Writer::create(path)?),
        Err(ReadError::IoError(err)) if err.kind() == std::io::ErrorKind::NotFound => {
            return Ok(Writer::create(path)?);
        }
        Err(err) => return Err(err.into()),
    };
    // Loaded successfully: reopen for appending new records.
    let file = std::fs::OpenOptions::new().append(true).open(path)?;
    let writer = Writer::from_opened(ids, file)?;
    assert!(writer.offset > 0);
    Ok(writer)
}