Recover FS if superblock rewrite fails
Failing a superblock rewrite, either because the initial cleared
superblock failed to commit or because a forced rewrite of the current
state failed to commit, would previously cause the FS to go into a
read-only state until re-initialization. This change adds the ability to
recover from this state by resetting the transaction and trying the same
write again.
Test: build.py qemu-generic-arm64-test-debug --test storage-tp-clear-test
Bug: 202792882
Bug: 198638362
Bug: 194313068
Change-Id: I06016f786b228fd30d7923b7303bc1e0e960a52e
diff --git a/block_cache.c b/block_cache.c
index 1fe48df..9b51eee 100644
--- a/block_cache.c
+++ b/block_cache.c
@@ -213,8 +213,17 @@
pr_err("write block %" PRIu64 " failed, fail transaction\n",
entry->block);
transaction_fail(entry->dirty_tr);
+
+ /*
+ * Failing the transaction must not clear the block number, as we rely
+ * on the block number + pinned flag to reserve and reuse the block
+ * cache entry when reinitializing a special transaction.
+ */
+ assert(block == entry->block);
+ } else {
+ entry->dirty_tr = NULL;
+ entry->pinned = false;
}
- entry->dirty_tr = NULL;
}
/**
@@ -363,6 +372,11 @@
* that entry belongs to must also fail.
*/
if (entry->dirty_tr->fs->initial_super_block_tr) {
+ /*
+ * transaction_initial_super_block_complete() always reinitializes
+ * initial_super_block_tr if the write failed.
+ */
+ assert(!entry->dirty_tr->fs->initial_super_block_tr->failed);
transaction_fail(entry->dirty_tr);
assert(!entry->dirty);
return;
@@ -401,6 +415,8 @@
entry->block = DATA_BLOCK_INVALID;
entry->dirty = false;
entry->dirty_tr = NULL;
+ /* We have to unpin here because we're clearing the block number */
+ entry->pinned = false;
entry->dirty_mac = false;
}
@@ -460,7 +476,19 @@
stats_timer_stop(STATS_CACHE_LOOKUP_FOUND);
goto done;
}
- if (!block_cache_entry_has_refs(entry)) {
+ /*
+ * Do not select any cache entries that have active references as they
+ * aren't ready to flush, and do not select any pinned entries. Pinned
+ * entries can only be flushed by
+ * transaction_initial_super_block_complete() and may not be flushed by
+ * another transaction. We need to keep special superblock writes pinned
+ * in the cache because otherwise we might fill the cache up with other
+ * data, flushing the special superblock, which might fail to write. In
+ * this case we would leave no room to recreate the write later, since
+ * the cache is full of data which can't be flushed until the initial
+ * superblock write is completed.
+ */
+ if (!block_cache_entry_has_refs(entry) && !entry->pinned) {
score = block_cache_entry_score(entry, available);
available++;
if (score >= unused_entry_score) {
@@ -475,6 +503,13 @@
entry->block);
}
} else {
+ /*
+ * Pinned entries must have a valid block number so they can be
+ * reused.
+ */
+ if (entry->pinned) {
+ assert(entry->block != DATA_BLOCK_INVALID);
+ }
if (print_cache_lookup_verbose) {
printf("%s: block %" PRIu64
", cache entry %zd in use for %" PRIu64 "\n",
@@ -722,6 +757,7 @@
block_cache_entries[i].dirty = false;
block_cache_entries[i].dirty_ref = false;
block_cache_entries[i].dirty_mac = false;
+ block_cache_entries[i].pinned = false;
block_cache_entries[i].dirty_tr = NULL;
block_cache_entries[i].io_op = BLOCK_CACHE_IO_OP_NONE;
obj_init(&block_cache_entries[i].obj, &ref);
@@ -1222,15 +1258,34 @@
* @tr: Transaction
* @block: Block number
* @ref: Pointer to store reference in.
+ * @pinned: Pin this block in the cache until it is successfully written
*
* Return: Block data pointer.
*/
void* block_get_cleared_super(struct transaction* tr,
data_block_t block,
- struct obj_ref* ref) {
+ struct obj_ref* ref,
+ bool pinned) {
void* data_rw;
const void* data_ro = block_cache_get_data(tr->fs, tr->fs->super_dev, block,
false, NULL, 0, ref);
+
+ /*
+ * We should never end up in a situation where there is a dirty copy of a
+ * super block in the cache while we are trying to rewrite that super block.
+ * If a super block entry was created via write_current_super_block(), it
+ * must be flushed before the necessary data writes go through to write new
+ * root nodes. If we are trying to commit an empty transaction (i.e. no data
+ * blocks changed), we skip the super block update in
+ * transaction_complete(). The only other way to write a new super block,
+ * write_current_super_block(), will be a no-op if there is already a
+ * pending super block rewrite.
+ */
+ assert(data_ro);
+ struct block_cache_entry* entry = data_to_block_cache_entry(data_ro);
+ assert(!entry->dirty);
+ entry->pinned = pinned;
+
data_rw = block_dirty(tr, data_ro, false);
assert(tr->fs->super_dev->block_size <= MAX_BLOCK_SIZE);
memset(data_rw, 0, tr->fs->super_dev->block_size);
diff --git a/block_cache.h b/block_cache.h
index 7894474..074fc49 100644
--- a/block_cache.h
+++ b/block_cache.h
@@ -98,7 +98,8 @@
void* block_get_cleared_super(struct transaction* tr,
data_block_t block,
- struct obj_ref* ref);
+ struct obj_ref* ref,
+ bool pinned);
void* block_move(struct transaction* tr,
const void* data,
diff --git a/block_cache_priv.h b/block_cache_priv.h
index 350fa9d..7e7d8f5 100644
--- a/block_cache_priv.h
+++ b/block_cache_priv.h
@@ -56,6 +56,7 @@
* after encrypting block.
* @dirty_tmp: Data can be discarded by
* block_cache_discard_transaction.
+ * @pinned: Block cannot be reused if it fails to write.
* @dirty_tr: Transaction that modified block.
* @obj: Reference tracking struct.
* @lru_node: List node for tracking least recently used cache
@@ -80,6 +81,7 @@
bool dirty_ref;
bool dirty_mac;
bool dirty_tmp;
+ bool pinned;
struct transaction* dirty_tr;
struct obj obj;
diff --git a/fs.h b/fs.h
index 2f955a4..7e83b5f 100644
--- a/fs.h
+++ b/fs.h
@@ -93,5 +93,6 @@
bool clear);
void fs_unknown_super_block_state_all(void);
+void write_current_super_block(struct fs* fs, bool reinitialize);
void fs_destroy(struct fs* fs);
diff --git a/super.c b/super.c
index 3f57721..4c00ad5 100644
--- a/super.c
+++ b/super.c
@@ -34,6 +34,7 @@
#include "block_set.h"
#include "debug.h"
#include "file.h"
+#include "fs.h"
#include "transaction.h"
#define SUPER_BLOCK_MAGIC (0x0073797473757274ULL) /* trustys */
@@ -95,17 +96,20 @@
static struct list_node fs_list = LIST_INITIAL_VALUE(fs_list);
/**
- * update_super_block - Generate and write superblock
+ * update_super_block_internal - Generate and write superblock
* @tr: Transaction object.
* @free: New free root.
* @files: New files root.
+ * @pinned: New block should not be reused in the block cache until
+ * it is successfully written.
*
* Return: %true if super block was updated (in cache), %false if transaction
* failed before super block was updated.
*/
-bool update_super_block(struct transaction* tr,
- const struct block_mac* free,
- const struct block_mac* files) {
+static bool update_super_block_internal(struct transaction* tr,
+ const struct block_mac* free,
+ const struct block_mac* files,
+ bool pinned) {
struct super_block* super_rw;
struct obj_ref super_ref = OBJ_REF_INITIAL_VALUE(super_ref);
unsigned int ver;
@@ -135,8 +139,8 @@
pr_write("write super block %" PRIu64 ", ver %d\n",
tr->fs->super_block[index], ver);
- super_rw =
- block_get_cleared_super(tr, tr->fs->super_block[index], &super_ref);
+ super_rw = block_get_cleared_super(tr, tr->fs->super_block[index],
+ &super_ref, pinned);
if (tr->failed) {
block_put_dirty_discard(super_rw, &super_ref);
return false;
@@ -165,6 +169,21 @@
}
/**
+ * update_super_block - Generate and write superblock
+ * @tr: Transaction object.
+ * @free: New free root.
+ * @files: New files root.
+ *
+ * Return: %true if super block was updated (in cache), %false if transaction
+ * failed before super block was updated.
+ */
+bool update_super_block(struct transaction* tr,
+ const struct block_mac* free,
+ const struct block_mac* files) {
+ return update_super_block_internal(tr, free, files, false);
+}
+
+/**
* write_initial_super_block - Write initial superblock to internal transaction
* @fs: File system state object.
*
@@ -184,44 +203,89 @@
fs->initial_super_block_tr = tr;
transaction_init(tr, fs, true);
- return update_super_block(tr, NULL, NULL);
+ return update_super_block_internal(tr, NULL, NULL, true);
}
/**
* write_current_super_block - Write current superblock to internal transaction
- * @fs: File system state object.
+ * @fs: File system state object.
+ * @reinitialize: Allow the special transaction to be reinitialized if it has
+ * failed
*
* Write the current state of the super block to an internal transaction that
* will be written before any other block. This can be used to re-sync the
* in-memory fs-state with the on-disk state after detecting a write failure
* where no longer know the on-disk super block state.
*/
-static void write_current_super_block(struct fs* fs) {
+void write_current_super_block(struct fs* fs, bool reinitialize) {
bool super_block_updated;
struct transaction* tr;
if (fs->initial_super_block_tr) {
/*
- * If initial_super_block_tr is already set there is no need to allocate
- * a new one so return early.
+ * If initial_super_block_tr is already pending and not failed there is
+ * no need to allocate a new one so return early.
*
- * Currently initial_super_block_tr can point to a failed transaction.
- * If that is the case @fs will never be write-able again.
- * TODO: Make sure initial_super_block_tr does not stay in a failed
- * state.
+ * If the special transaction has failed, we need to re-initialize it so
+ * that we can attempt to recover to a good state.
+ *
+ * We are only allowed to reinitialize if the @reinitialize parameter is
+ * true. We don't want to allow reinitialization while cleaning blocks
+ * (i.e. via fs_unknown_super_block_state_all()), as this would reset
+ * the special transaction to non-failed state and create a situation
+ * where transaction_initial_super_block_complete() cannot know if it
+ * successfully flushed the special transaction to disk. Therefore we
+ * only allow transaction_initial_super_block_complete() to reinitialize
+ * a failed special transaction after it attempts and fails to write the
+ * block to disk.
+ *
+ * Since we pin special superblock entries in the block cache and
+ * therefore cannot evict them with normal transactions,
+ * transaction_initial_super_block_complete() is the only place we can
+ * attempt a special transaction write, and if it fails the transaction
+ * is immediately reinitialized. Therefore we should only ever be in a
+ * failed state if reinitialize is true (i.e. we are being called from
+ * transaction_initial_super_block_complete()).
*/
- return;
- }
- tr = calloc(1, sizeof(*tr));
- if (!tr) {
- /* Not safe to proceed. TODO: add flag to defer this allocation? */
- abort();
- }
- fs->initial_super_block_tr = tr;
- transaction_init(tr, fs, true);
- super_block_updated =
- update_super_block(tr, &fs->free.block_tree.root, &fs->files.root);
+ assert(reinitialize || !fs->initial_super_block_tr->failed);
+ if (!fs->initial_super_block_tr->failed || !reinitialize) {
+ return;
+ }
+
+ tr = fs->initial_super_block_tr;
+ transaction_activate(tr);
+ } else {
+ tr = calloc(1, sizeof(*tr));
+ if (!tr) {
+ /* Not safe to proceed. TODO: add flag to defer this allocation? */
+ abort();
+ }
+ transaction_init(tr, fs, true);
+ fs->initial_super_block_tr = tr;
+ }
+
+ /*
+ * Until the filesystem contains committed data, fs->free.block_tree.root
+ * will be zero, i.e. an invalid block mac. fs->free.block_tree.root is only
+ * updated in transaction_complete() after successfully writing a new
+ * superblock. If the filesystem is empty, we need to emit a cleared
+ * superblock with a special flag to prevent the superblock state from
+ * getting out of sync with the filesystem data if a reboot occurs before
+ * committing a superblock with data.
+ *
+ * We can't use fs->files.root here because it may be invalid if there are
+ * no files in the filesystem. If the free node is zero, then the files node
+ * must be as well, so we assert this.
+ */
+ bool fs_is_cleared = !block_mac_valid(tr, &fs->free.block_tree.root);
+ if (fs_is_cleared) {
+ assert(!block_mac_valid(tr, &fs->files.root));
+ super_block_updated = update_super_block_internal(tr, NULL, NULL, true);
+ } else {
+ super_block_updated = update_super_block_internal(
+ tr, &fs->free.block_tree.root, &fs->files.root, true);
+ }
if (!super_block_updated) {
/* Not safe to proceed. TODO: add flag to try again? */
abort();
@@ -561,6 +625,16 @@
struct fs* fs;
list_for_every_entry(&fs_list, fs, struct fs, node) {
/* TODO: filter out filesystems that are not affected? */
- write_current_super_block(fs);
+ /*
+ * We can't reinitialize an existing, failed special transaction here.
+ * If an initial superblock write failed and triggered
+ * fs_unknown_super_block_state_all() we need to leave that superblock
+ * transaction in a failed state so that the transaction that
+ * triggered the failing write can also be failed further up the call
+ * chain. If a special transaction already exists we are guaranteed that
+ * it will be reinitialized and flushed to disk before any new writes to
+ * that FS, so we don't need to reinitialize it here.
+ */
+ write_current_super_block(fs, false /* reinitialize */);
}
}
diff --git a/transaction.c b/transaction.c
index 9e88714..3596cae 100644
--- a/transaction.c
+++ b/transaction.c
@@ -399,18 +399,26 @@
* @tr: Transaction object. Must match initial_super_block_tr in fs.
*
* Flush the initial superblock in @tr to disk. If the block could not be
- * written return and leave @tr in a failed state. Otherwise clear
- * @tr->fs->initial_super_block_tr and free @tr.
+ * written re-initialize @tr and leave it in place for another attempt.
+ * Otherwise clear @tr->fs->initial_super_block_tr and free @tr.
+ *
+ * The initial superblock can only be flushed from the block cache by the
+ * block_cache_clean_transaction() call here, as we do not allow initial
+ * superblocks to be flushed to make room for other data. This ensures that we
+ * don't run out of room to recreate the superblock write in case it fails.
*/
void transaction_initial_super_block_complete(struct transaction* tr) {
assert(tr == tr->fs->initial_super_block_tr);
block_cache_clean_transaction(tr);
if (tr->failed) {
/*
- * If we failed to write the superblock we leave the failed
- * initial_super_block_tr transaction in place so all future write
- * transactions to this filesystems will also fail.
+ * If we failed to write the superblock we re-initialize a new attempt
+ * to write that superblock before the next time we write to this
+ * filesystem.
*/
+ pr_err("%s: failed to write initial superblock, version %d.\n",
+ __func__, tr->fs->written_super_block_version);
+ write_current_super_block(tr->fs, true /* reinitialize */);
return;
}
printf("%s: write initial superblock, version %d -> %d\n", __func__,