Recover FS if superblock rewrite fails

Failing a superblock rewrite, either because the initial cleared
superblock failed to commit or because a forced rewrite of the current
state failed to commit, would previously cause the FS to go into a
read-only state until re-initialization. This change adds the ability to
recover from this state by resetting the transaction and trying the same
write again.

Test: build.py qemu-generic-arm64-test-debug --test storage-tp-clear-test
Bug: 202792882
Bug: 198638362
Bug: 194313068
Change-Id: I06016f786b228fd30d7923b7303bc1e0e960a52e
diff --git a/block_cache.c b/block_cache.c
index 1fe48df..9b51eee 100644
--- a/block_cache.c
+++ b/block_cache.c
@@ -213,8 +213,17 @@
         pr_err("write block %" PRIu64 " failed, fail transaction\n",
                entry->block);
         transaction_fail(entry->dirty_tr);
+
+        /*
+         * Failing the transaction must not clear the block number, as we rely
+         * on the block number + pinned flag to reserve and reuse the block
+         * cache entry when reinitializing a special transaction.
+         */
+        assert(block == entry->block);
+    } else {
+        entry->dirty_tr = NULL;
+        entry->pinned = false;
     }
-    entry->dirty_tr = NULL;
 }
 
 /**
@@ -363,6 +372,11 @@
          * that entry belongs to must also fail.
          */
         if (entry->dirty_tr->fs->initial_super_block_tr) {
+            /*
+             * transaction_initial_super_block_complete() always reinitializes
+             * initial_super_block_tr if the write failed.
+             */
+            assert(!entry->dirty_tr->fs->initial_super_block_tr->failed);
             transaction_fail(entry->dirty_tr);
             assert(!entry->dirty);
             return;
@@ -401,6 +415,8 @@
     entry->block = DATA_BLOCK_INVALID;
     entry->dirty = false;
     entry->dirty_tr = NULL;
+    /* We have to unpin here because we're clearing the block number */
+    entry->pinned = false;
 
     entry->dirty_mac = false;
 }
@@ -460,7 +476,19 @@
             stats_timer_stop(STATS_CACHE_LOOKUP_FOUND);
             goto done;
         }
-        if (!block_cache_entry_has_refs(entry)) {
+        /*
+         * Do not select any cache entries that have active references as they
+         * aren't ready to flush, and do not select any pinned entries. Pinned
+         * entries can only be flushed by
+         * transaction_initial_super_block_complete() and may not be flushed by
+         * another transaction. We need to keep special superblock writes pinned
+         * in the cache because otherwise we might fill the cache up with other
+         * data, flushing the special superblock, which might fail to write. In
+         * this case we would leave no room to recreate the write later, since
+         * the cache is full of data which can't be flushed until the initial
+         * superblock write is completed.
+         */
+        if (!block_cache_entry_has_refs(entry) && !entry->pinned) {
             score = block_cache_entry_score(entry, available);
             available++;
             if (score >= unused_entry_score) {
@@ -475,6 +503,13 @@
                        entry->block);
             }
         } else {
+            /*
+             * Pinned entries must have a valid block number so they can be
+             * reused.
+             */
+            if (entry->pinned) {
+                assert(entry->block != DATA_BLOCK_INVALID);
+            }
             if (print_cache_lookup_verbose) {
                 printf("%s: block %" PRIu64
                        ", cache entry %zd in use for %" PRIu64 "\n",
@@ -722,6 +757,7 @@
         block_cache_entries[i].dirty = false;
         block_cache_entries[i].dirty_ref = false;
         block_cache_entries[i].dirty_mac = false;
+        block_cache_entries[i].pinned = false;
         block_cache_entries[i].dirty_tr = NULL;
         block_cache_entries[i].io_op = BLOCK_CACHE_IO_OP_NONE;
         obj_init(&block_cache_entries[i].obj, &ref);
@@ -1222,15 +1258,34 @@
  * @tr:         Transaction
  * @block:      Block number
  * @ref:        Pointer to store reference in.
+ * @pinned:     Pin this block in the cache until it is successfully written
  *
  * Return: Block data pointer.
  */
 void* block_get_cleared_super(struct transaction* tr,
                               data_block_t block,
-                              struct obj_ref* ref) {
+                              struct obj_ref* ref,
+                              bool pinned) {
     void* data_rw;
     const void* data_ro = block_cache_get_data(tr->fs, tr->fs->super_dev, block,
                                                false, NULL, 0, ref);
+
+    /*
+     * We should never end up in a situation where there is a dirty copy of a
+     * super block in the cache while we are trying to rewrite that super block.
+     * If a super block entry was created via write_current_super_block(), it
+     * must be flushed before the necessary data writes go through to write new
+     * root nodes. If we are trying to commit an empty transaction (i.e. no data
+     * blocks changed), we skip the super block update in
+     * transaction_complete(). The only other way to write a new super block,
+     * write_current_super_block(), will be a no-op if there is already a
+     * pending super block rewrite.
+     */
+    assert(data_ro);
+    struct block_cache_entry* entry = data_to_block_cache_entry(data_ro);
+    assert(!entry->dirty);
+    entry->pinned = pinned;
+
     data_rw = block_dirty(tr, data_ro, false);
     assert(tr->fs->super_dev->block_size <= MAX_BLOCK_SIZE);
     memset(data_rw, 0, tr->fs->super_dev->block_size);
diff --git a/block_cache.h b/block_cache.h
index 7894474..074fc49 100644
--- a/block_cache.h
+++ b/block_cache.h
@@ -98,7 +98,8 @@
 
 void* block_get_cleared_super(struct transaction* tr,
                               data_block_t block,
-                              struct obj_ref* ref);
+                              struct obj_ref* ref,
+                              bool pinned);
 
 void* block_move(struct transaction* tr,
                  const void* data,
diff --git a/block_cache_priv.h b/block_cache_priv.h
index 350fa9d..7e7d8f5 100644
--- a/block_cache_priv.h
+++ b/block_cache_priv.h
@@ -56,6 +56,7 @@
  *                          after encrypting block.
  * @dirty_tmp:              Data can be discarded by
  *                          block_cache_discard_transaction.
+ * @pinned:                 Block cannot be reused if it fails to write.
  * @dirty_tr:               Transaction that modified block.
  * @obj:                    Reference tracking struct.
  * @lru_node:               List node for tracking least recently used cache
@@ -80,6 +81,7 @@
     bool dirty_ref;
     bool dirty_mac;
     bool dirty_tmp;
+    bool pinned;
     struct transaction* dirty_tr;
 
     struct obj obj;
diff --git a/fs.h b/fs.h
index 2f955a4..7e83b5f 100644
--- a/fs.h
+++ b/fs.h
@@ -93,5 +93,6 @@
             bool clear);
 
 void fs_unknown_super_block_state_all(void);
+void write_current_super_block(struct fs* fs, bool reinitialize);
 
 void fs_destroy(struct fs* fs);
diff --git a/super.c b/super.c
index 3f57721..4c00ad5 100644
--- a/super.c
+++ b/super.c
@@ -34,6 +34,7 @@
 #include "block_set.h"
 #include "debug.h"
 #include "file.h"
+#include "fs.h"
 #include "transaction.h"
 
 #define SUPER_BLOCK_MAGIC (0x0073797473757274ULL) /* trustys */
@@ -95,17 +96,20 @@
 static struct list_node fs_list = LIST_INITIAL_VALUE(fs_list);
 
 /**
- * update_super_block - Generate and write superblock
+ * update_super_block_internal - Generate and write superblock
  * @tr:         Transaction object.
  * @free:       New free root.
  * @files:      New files root.
+ * @pinned:     New block should not be reused in the block cache until
+ *              it is successfully written.
  *
  * Return: %true if super block was updated (in cache), %false if transaction
  * failed before super block was updated.
  */
-bool update_super_block(struct transaction* tr,
-                        const struct block_mac* free,
-                        const struct block_mac* files) {
+static bool update_super_block_internal(struct transaction* tr,
+                                        const struct block_mac* free,
+                                        const struct block_mac* files,
+                                        bool pinned) {
     struct super_block* super_rw;
     struct obj_ref super_ref = OBJ_REF_INITIAL_VALUE(super_ref);
     unsigned int ver;
@@ -135,8 +139,8 @@
     pr_write("write super block %" PRIu64 ", ver %d\n",
              tr->fs->super_block[index], ver);
 
-    super_rw =
-            block_get_cleared_super(tr, tr->fs->super_block[index], &super_ref);
+    super_rw = block_get_cleared_super(tr, tr->fs->super_block[index],
+                                       &super_ref, pinned);
     if (tr->failed) {
         block_put_dirty_discard(super_rw, &super_ref);
         return false;
@@ -165,6 +169,21 @@
 }
 
 /**
+ * update_super_block - Generate and write superblock (not pinned in cache)
+ * @tr:         Transaction object.
+ * @free:       New free root.
+ * @files:      New files root.
+ *
+ * Return: %true if super block was updated (in cache), %false if transaction
+ * failed before super block was updated.
+ */
+bool update_super_block(struct transaction* tr,
+                        const struct block_mac* free,
+                        const struct block_mac* files) {
+    return update_super_block_internal(tr, free, files, false /* pinned */);
+}
+
+/**
  * write_initial_super_block - Write initial superblock to internal transaction
  * @fs:         File system state object.
  *
@@ -184,44 +203,89 @@
     fs->initial_super_block_tr = tr;
 
     transaction_init(tr, fs, true);
-    return update_super_block(tr, NULL, NULL);
+    return update_super_block_internal(tr, NULL, NULL, true);
 }
 
 /**
  * write_current_super_block - Write current superblock to internal transaction
- * @fs:         File system state object.
+ * @fs:           File system state object.
+ * @reinitialize: Allow the special transaction to be reinitialized if it has
+ *                failed
  *
  * Write the current state of the super block to an internal transaction that
  * will be written before any other block. This can be used to re-sync the
  * in-memory fs-state with the on-disk state after detecting a write failure
  * where no longer know the on-disk super block state.
  */
-static void write_current_super_block(struct fs* fs) {
+void write_current_super_block(struct fs* fs, bool reinitialize) {
     bool super_block_updated;
     struct transaction* tr;
 
     if (fs->initial_super_block_tr) {
         /*
-         * If initial_super_block_tr is already set there is no need to allocate
-         * a new one so return early.
+         * If initial_super_block_tr is already pending and not failed there is
+         * no need to allocate a new one so return early.
          *
-         * Currently initial_super_block_tr can point to a failed transaction.
-         * If that is the case @fs will never be write-able again.
-         * TODO: Make sure initial_super_block_tr does not stay in a failed
-         * state.
+         * If the special transaction has failed, we need to re-initialize it so
+         * that we can attempt to recover to a good state.
+         *
+         * We are only allowed to reinitialize if the @reinitialize parameter is
+         * true. We don't want to allow reinitialization while cleaning blocks
+         * (i.e. via fs_unknown_super_block_state_all()), as this would reset
+         * the special transaction to non-failed state and create a situation
+         * where transaction_initial_super_block_complete() cannot know if it
+         * successfully flushed the special transaction to disk. Therefore we
+         * only allow transaction_initial_super_block_complete() to reinitialize
+         * a failed special transaction after it attempts and fails to write the
+         * block to disk.
+         *
+         * Since we pin special superblock entries in the block cache and
+         * therefore cannot evict them with normal transactions,
+         * transaction_initial_super_block_complete() is the only place we can
+         * attempt a special transaction write, and if it fails the transaction
+         * is immediately reinitialized. Therefore we should only ever be in a
+         * failed state if reinitialize is true (i.e. we are being called from
+         * transaction_initial_super_block_complete()).
          */
-        return;
-    }
-    tr = calloc(1, sizeof(*tr));
-    if (!tr) {
-        /* Not safe to proceed. TODO: add flag to defer this allocation? */
-        abort();
-    }
-    fs->initial_super_block_tr = tr;
 
-    transaction_init(tr, fs, true);
-    super_block_updated =
-            update_super_block(tr, &fs->free.block_tree.root, &fs->files.root);
+        assert(reinitialize || !fs->initial_super_block_tr->failed);
+        if (!fs->initial_super_block_tr->failed || !reinitialize) {
+            return;
+        }
+
+        tr = fs->initial_super_block_tr;
+        transaction_activate(tr);
+    } else {
+        tr = calloc(1, sizeof(*tr));
+        if (!tr) {
+            /* Not safe to proceed. TODO: add flag to defer this allocation? */
+            abort();
+        }
+        transaction_init(tr, fs, true);
+        fs->initial_super_block_tr = tr;
+    }
+
+    /*
+     * Until the filesystem contains committed data, fs->free.block_tree.root
+     * will be zero, i.e. an invalid block mac. fs->free.block_tree.root is only
+     * updated in transaction_complete() after successfully writing a new
+     * superblock. If the filesystem is empty, we need to emit a cleared
+     * superblock with a special flag to prevent the superblock state from
+     * getting out of sync with the filesystem data if a reboot occurs before
+     * committing a superblock with data.
+     *
+     * We can't use fs->files.root here because it may be invalid if there are
+     * no files in the filesystem. If the free node is zero, then the files node
+     * must be as well, so we assert this.
+     */
+    bool fs_is_cleared = !block_mac_valid(tr, &fs->free.block_tree.root);
+    if (fs_is_cleared) {
+        assert(!block_mac_valid(tr, &fs->files.root));
+        super_block_updated = update_super_block_internal(tr, NULL, NULL, true);
+    } else {
+        super_block_updated = update_super_block_internal(
+                tr, &fs->free.block_tree.root, &fs->files.root, true);
+    }
     if (!super_block_updated) {
         /* Not safe to proceed. TODO: add flag to try again? */
         abort();
@@ -561,6 +625,16 @@
     struct fs* fs;
     list_for_every_entry(&fs_list, fs, struct fs, node) {
         /* TODO: filter out filesystems that are not affected? */
-        write_current_super_block(fs);
+        /*
+         * We can't reinitialize an existing, failed special transaction here.
+         * If an initial superblock write failed and triggered
+         * fs_unknown_super_block_state_all() we need to leave that superblock
+         * transaction in a failed state so that the transaction that
+         * triggered the failing write can also be failed further up the call
+         * chain. If a special transaction already exists we are guaranteed that
+         * it will be reinitialized and flushed to disk before any new writes to
+         * that FS, so we don't need to reinitialize it here.
+         */
+        write_current_super_block(fs, false /* reinitialize */);
     }
 }
diff --git a/transaction.c b/transaction.c
index 9e88714..3596cae 100644
--- a/transaction.c
+++ b/transaction.c
@@ -399,18 +399,26 @@
  * @tr:         Transaction object. Must match initial_super_block_tr in fs.
  *
  * Flush the initial superblock in @tr to disk. If the block could not be
- * written return and leave @tr in a failed state. Otherwise clear
- * @tr->fs->initial_super_block_tr and free @tr.
+ * written re-initialize @tr and leave it in place for another attempt.
+ * Otherwise clear @tr->fs->initial_super_block_tr and free @tr.
+ *
+ * The initial superblock can only be flushed from the block cache by the
+ * block_cache_clean_transaction() call here, as we do not allow initial
+ * superblocks to be flushed to make room for other data. This ensures that we
+ * don't run out of room to recreate the superblock write in case it fails.
  */
 void transaction_initial_super_block_complete(struct transaction* tr) {
     assert(tr == tr->fs->initial_super_block_tr);
     block_cache_clean_transaction(tr);
     if (tr->failed) {
         /*
-         * If we failed to write the superblock we leave the failed
-         * initial_super_block_tr transaction in place so all future write
-         * transactions to this filesystems will also fail.
+         * If we failed to write the superblock we re-initialize a new attempt
+         * to write that superblock before the next time we write to this
+         * filesystem.
          */
+        pr_err("%s: failed to write initial superblock, version %d.\n",
+               __func__, tr->fs->written_super_block_version);
+        write_current_super_block(tr->fs, true /* reinitialize */);
         return;
     }
     printf("%s: write initial superblock, version %d -> %d\n", __func__,