blob: 913a16386a0480d4791df2956b435478bd294717 [file] [log] [blame]
/*
* Copyright (C) 2015-2016 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include <inttypes.h>
#include <lk/compiler.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "array.h"
#include "block_allocator.h"
#include "block_set.h"
#include "checkpoint.h"
#include "debug.h"
#include "file.h"
#include "transaction.h"
/*
 * Debug flag: when set, transaction_merge_free_sets prints each add/delete
 * range it processes. Off by default (zero-initialized file-scope variable).
 */
bool print_merge_free;
/**
 * transaction_check_free - Check that a free set contains enough blocks
 * @tr:       Transaction object.
 * @set:      Free set to check.
 * @min_free: Number of blocks that must be in @set.
 *
 * Walks the ranges of @set in order, accumulating their sizes until the
 * requested count is reached or the set is exhausted.
 *
 * Return: %true if @set contains @min_free or more blocks, %false otherwise.
 */
static bool transaction_check_free(struct transaction* tr,
                                   struct block_set* set,
                                   data_block_t min_free) {
    data_block_t remaining = min_free;
    data_block_t pos = 0;
    struct block_range range;
    data_block_t range_size;

    for (;;) {
        range = block_set_find_next_range(tr, set, pos);
        if (block_range_empty(range)) {
            /* Set exhausted before reaching the requested count. */
            return false;
        }
        range_size = range.end - range.start;
        if (range_size >= remaining) {
            return true;
        }
        remaining -= range_size;
        pos = range.end;
    }
}
/**
 * transaction_merge_free_sets
 * @tr:      Transaction object.
 * @new_set: Output set.
 * @set_i:   Initial set.
 * @set_d:   Set of blocks to delete.
 * @set_a:   Set of blocks to add.
 *
 * Helper function to update the free_set when committing a transaction.
 * @new_set = @set_i - @set_d - @new_set-blocks + @set_a + set_[ida]-blocks
 * @new_set must start empty and will be initialized and filled in this
 * function.
 *
 * The delete and add sets are walked in a single ascending pass; at each step
 * the earlier of the two next ranges is applied to @new_set. Tree-node
 * allocations triggered by these set updates are queued while a range is
 * applied and processed afterwards. Returns early (without the final
 * consistency check) if @tr fails at any point.
 */
static void transaction_merge_free_sets(struct transaction* tr,
                                        struct block_set* new_set,
                                        struct block_set* set_i,
                                        struct block_set* set_d,
                                        struct block_set* set_a) {
    data_block_t next_block;
    struct block_range delete_range = BLOCK_RANGE_INITIAL_VALUE(delete_range);
    struct block_range add_range = BLOCK_RANGE_INITIAL_VALUE(add_range);
    /* new_set should start empty. */
    assert(block_set_find_next_block(tr, new_set, 1, true) == 0);
    block_set_copy(tr, new_set, set_i);
    full_assert(block_set_check(tr, set_i));
    full_assert(block_set_check(tr, set_d));
    full_assert(block_set_check(tr, set_a));
    /*
     * set_i's root block, if it exists, must not be a block this transaction
     * already allocated (it must still need a copy-on-write copy).
     */
    assert(!block_mac_valid(tr, &set_i->block_tree.root) ||
           transaction_block_need_copy(
                   tr, block_mac_to_block(tr, &set_i->block_tree.root)));
    if (print_merge_free) {
        printf("%s\n", __func__);
    }
    /* TODO: don't walk the whole tree each time */
    next_block = 1;
    while (next_block != 0) {
        /* min_free_block tracks merge progress for the block allocator. */
        tr->min_free_block = next_block;
        delete_range = block_set_find_next_range(tr, set_d, next_block);
        add_range = block_set_find_next_range(tr, set_a, next_block);
        if (print_merge_free) {
            printf("%s: add %" PRIu64 "-%" PRIu64 " or delete %" PRIu64
                   "-%" PRIu64 "\n",
                   __func__, add_range.start, add_range.end - 1,
                   delete_range.start, delete_range.end - 1);
        }
        /* A block cannot be both allocated and freed in one transaction. */
        assert(!block_range_overlap(delete_range, add_range));
        if (block_range_before(delete_range, add_range)) {
            /* The delete range comes first; remove it from new_set. */
            assert(delete_range.start >= next_block);
            tr->min_free_block = delete_range.end;
            block_allocator_suspend_set_updates(tr);
            block_set_remove_range(tr, new_set, delete_range);
            block_allocator_process_queue(tr);
            next_block = delete_range.end;
        } else if (!block_range_empty(add_range)) {
            /* The add range comes first; add it to new_set. */
            assert(add_range.start >= next_block);
            tr->min_free_block = add_range.end;
            block_allocator_suspend_set_updates(tr);
            block_set_add_range(tr, new_set, add_range);
            block_allocator_process_queue(tr);
            next_block = add_range.end;
        } else {
            /* Both sets exhausted; terminate the merge loop. */
            assert(block_range_empty(delete_range));
            assert(block_range_empty(add_range));
            next_block = 0;
        }
        if (tr->failed) {
            pr_warn("transaction failed, abort\n");
            return;
        }
    }
    full_assert(block_set_check(tr, new_set));
}
/** transaction_rebuild_free_set - Rebuild free set from referenced file blocks
 * @tr: Transaction object.
 * @new_free_set: Output free set.
 * @new_files: Root block and mac of the files tree.
 * @new_checkpoint: Checkpoint metadata block and mac.
 *
 * Rebuilds the file system free set by walking the current files tree and
 * ensuring that all referenced blocks are marked as not free. @new_free_set
 * will be initialized to contain all blocks not referenced from the files root.
 * The @new_checkpoint metadata block will also be removed from the free set,
 * but its children (checkpoint files tree and free set) will not be checked.
 * The blocks in the checkpoint (beside the metadata block) are tracked as
 * free/allocated by the checkpoint free set rather than the active file system
 * free set.
 *
 * We ignore tr->freed and tr->fs->free here because we are reconstructing the
 * entire free set. All blocks that were freed in this transaction will not be
 * referenced by @new_files.
 */
static void transaction_rebuild_free_set(struct transaction* tr,
                                         struct block_set* new_free_set,
                                         struct block_mac* new_files,
                                         struct block_mac* new_checkpoint) {
    /* Range covering every block the file system may allocate. */
    struct block_range init_range = {
            .start = tr->fs->min_block_num,
            .end = tr->fs->dev->block_count,
    };
    struct block_range range;
    struct block_set previously_allocated =
            BLOCK_SET_INITIAL_VALUE(previously_allocated);
    /*
     * Copy and save tr->allocated so that we can keep track of the blocks
     * already allocated for the current transaction when performing allocations
     * for the new free set tree nodes. We then reset tr->allocated so that it
     * will only hold new blocks allocated for new_free_set. All blocks
     * allocated for files will already be referenced in new_files, so we'll
     * already be removing them from new_free_set.
     */
    assert(list_in_list(&tr->allocated.node));
    list_delete(&tr->allocated.node);
    block_set_copy_ro(tr, &previously_allocated, &tr->allocated);
    list_add_tail(&tr->fs->allocated, &previously_allocated.node);
    block_set_init(tr->fs, &tr->allocated);
    list_add_tail(&tr->fs->allocated, &tr->allocated.node);
    /* Start with every block marked free, then subtract referenced blocks. */
    block_set_init(tr->fs, new_free_set);
    new_free_set->block_tree.copy_on_write = true;
    new_free_set->block_tree.allow_copy_on_write = true;
    block_set_add_range(tr, new_free_set, init_range);
    /* Only the checkpoint metadata block is removed; not its children. */
    if (block_mac_valid(tr, new_checkpoint)) {
        block_set_remove_block(tr, new_free_set,
                               block_mac_to_block(tr, new_checkpoint));
    }
    if (tr->failed) {
        pr_warn("transaction failed, abort\n");
        return;
    }
    /* Remove every block referenced from the files tree root. */
    if (block_mac_valid(tr, new_files)) {
        files_rebuild_free_set(tr, new_free_set, new_files);
        if (tr->failed) {
            pr_warn("transaction failed, abort\n");
            return;
        }
    }
    /*
     * Remove the blocks allocated for new_free_set itself; tr->allocated was
     * reset above, so it now only contains those tree-node blocks.
     */
    for (range = block_set_find_next_range(tr, &tr->allocated, 1);
         !block_range_empty(range);
         range = block_set_find_next_range(tr, &tr->allocated, range.end)) {
        tr->min_free_block = range.end;
        block_allocator_suspend_set_updates(tr);
        block_set_remove_range(tr, new_free_set, range);
        block_allocator_process_queue(tr);
    }
    /*
     * Copy the rest of the allocated blocks back to tr->allocated to maintain a
     * consistent state. We don't actually need to do this with the current code
     * calling this function, but this restores the transaction state to what
     * would be expected if it were to be used in the future.
     */
    for (range = block_set_find_next_range(tr, &previously_allocated, 1);
         !block_range_empty(range);
         range = block_set_find_next_range(tr, &previously_allocated,
                                           range.end)) {
        block_set_add_range(tr, &tr->allocated, range);
    }
    list_delete(&previously_allocated.node);
    full_assert(block_set_check(tr, new_free_set));
}
/**
 * transaction_block_need_copy - Check if block needs copy
 * @tr:    Transaction object.
 * @block: Block number to check.
 *
 * A block must not be a tmp allocation and must not have a queued tmp
 * allocation; both are assertion failures here.
 *
 * Return: %true if block has not been allocated as a non-tmp block for @tr,
 * %false otherwise.
 */
bool transaction_block_need_copy(struct transaction* tr, data_block_t block) {
    assert(block);
    assert(!block_set_block_in_set(tr, &tr->tmp_allocated, block));
    assert(!block_allocator_allocation_queued(tr, block, true));

    if (block_set_block_in_set(tr, &tr->allocated, block)) {
        /* Already allocated by this transaction; no copy needed. */
        return false;
    }
    /* Not in the allocated set; needs a copy unless an allocation is queued. */
    return !block_allocator_allocation_queued(tr, block, false);
}
/**
 * transaction_delete_active - Remove transaction from active lists (internal)
 * @tr: Transaction object.
 *
 * Both per-transaction block sets must currently be linked into the
 * file system's allocated list.
 */
static void transaction_delete_active(struct transaction* tr) {
    assert(list_in_list(&tr->allocated.node));
    assert(list_in_list(&tr->tmp_allocated.node));
    list_delete(&tr->tmp_allocated.node);
    list_delete(&tr->allocated.node);
}
/**
 * transaction_fail - Fail transaction
 * @tr: Transaction object.
 *
 * Marks transaction as failed. For transactions that are not yet complete,
 * also removes the transaction from the active list, discards its dirty
 * cache entries and restores open files to the last committed state.
 */
void transaction_fail(struct transaction* tr) {
    assert(!tr->failed);
    tr->failed = true;

    if (!tr->complete) {
        /* Tear down active-transaction state. */
        block_cache_discard_transaction(tr, true);
        transaction_delete_active(tr);
        file_transaction_failed(tr);
    }
}
/**
 * transaction_free - Free transaction
 * @tr: Transaction object.
 *
 * Prepare @tr for free. @tr must not be active and all open files must already
 * be closed. Unlinks @tr from the file system's transaction list; releasing
 * the memory backing @tr is the caller's responsibility.
 */
void transaction_free(struct transaction* tr) {
    assert(!transaction_is_active(tr));
    assert(list_is_empty(&tr->open_files));
    assert(list_in_list(&tr->node));
    list_delete(&tr->node);
}
/**
 * check_free_tree - Check tree of free set (internal)
 * @tr:   Transaction object.
 * @free: Set object.
 *
 * Check that the blocks used by the tree for a free set are not in the same
 * set. Walks every tree path and asserts that no node block on the path is a
 * member of @free.
 */
static void check_free_tree(struct transaction* tr, struct block_set* free) {
    struct block_tree_path path;
    unsigned int depth;

    block_tree_walk(tr, &free->block_tree, 0, true, &path);
    for (; block_tree_path_get_key(&path); block_tree_path_next(&path)) {
        /* Every node on the current path must be absent from @free. */
        for (depth = 0; depth < path.count; depth++) {
            assert(!block_set_block_in_set(
                    tr, free,
                    block_mac_to_block(tr, &path.entry[depth].block_mac)));
        }
    }
}
/**
 * transaction_complete_etc - Complete transaction, optionally updating
 *                            checkpoint
 * @tr: Transaction object.
 * @update_checkpoint: If true, update checkpoint with the new file-system
 *                     state.
 */
void transaction_complete_etc(struct transaction* tr, bool update_checkpoint) {
    struct block_mac new_files;
    struct transaction* tmp_tr;
    struct transaction* other_tr;
    struct block_set new_free_set = BLOCK_SET_INITIAL_VALUE(new_free_set);
    struct checkpoint* new_checkpoint = NULL;
    struct block_mac new_checkpoint_mac;
    struct obj_ref new_checkpoint_ref =
            OBJ_REF_INITIAL_VALUE(new_checkpoint_ref);
    bool super_block_updated;
    assert(tr->fs);
    assert(!tr->complete);
    /* Commit any previously requested checkpoint before this transaction. */
    if (tr->fs->checkpoint_required) {
        tr->fs->checkpoint_required = false;
        if (!checkpoint_commit(tr->fs)) {
            /*
             * checkpoint creation failed, so we need to try again before we
             * commit the next transaction
             */
            tr->fs->checkpoint_required = true;
            transaction_fail(tr);
            pr_warn("auto-checkpoint failed, abort\n");
            goto err_transaction_failed;
        }
    }
    // printf("%s: %" PRIu64 "\n", __func__, tr->version);
    /* Default to the current checkpoint mac unless update_checkpoint is set. */
    block_mac_copy(tr, &new_checkpoint_mac, &tr->fs->checkpoint);
    if (tr->failed) {
        pr_warn("transaction failed, abort\n");
        goto err_transaction_failed;
    }
    assert(transaction_is_active(tr));
    /* Write out file changes and obtain the new files-tree root. */
    file_transaction_complete(tr, &new_files);
    if (tr->failed) {
        pr_warn("transaction failed, abort\n");
        goto err_transaction_failed;
    }
    if (update_checkpoint) {
        /* Acquire a new checkpoint metadata block; released on error paths. */
        new_checkpoint = checkpoint_get_new_block(tr, &new_checkpoint_ref,
                                                  &new_checkpoint_mac);
        if (tr->failed) {
            pr_warn("transaction failed, abort\n");
            goto err_transaction_failed;
        }
        assert(new_checkpoint);
    }
    tr->new_free_set = &new_free_set;
    /* Build the post-commit free set, from scratch or incrementally. */
    if (tr->rebuild_free_set) {
        transaction_rebuild_free_set(tr, &new_free_set, &new_files,
                                     &new_checkpoint_mac);
    } else {
        transaction_merge_free_sets(tr, &new_free_set, &tr->fs->free,
                                    &tr->allocated, &tr->freed);
    }
    if (tr->failed) {
        pr_warn("transaction failed, abort\n");
        goto err_transaction_failed;
    }
    /* Refuse commits that would leave less than the reserved block count. */
    if (!transaction_check_free(tr, &new_free_set, tr->fs->reserved_count)) {
        if (!tr->failed) {
            transaction_fail(tr);
        }
        pr_warn("transaction would leave fs too full, abort\n");
        goto err_transaction_failed;
    }
    if (tr->fs->alternate_data && tr->repaired) {
        if (!tr->failed) {
            transaction_fail(tr);
        }
        pr_warn("transaction cannot repair alternate fs, abort\n");
        goto err_transaction_failed;
    }
    /* Debug dump of all block sets; disabled by default. */
    if (0) {
        printf("%s: old free:\n", __func__);
        block_set_print(tr, &tr->fs->free);
        printf("%s: tmp_allocated:\n", __func__);
        block_set_print(tr, &tr->tmp_allocated);
        printf("%s: allocated:\n", __func__);
        block_set_print(tr, &tr->allocated);
        printf("%s: freed:\n", __func__);
        block_set_print(tr, &tr->freed);
        printf("%s: new free:\n", __func__);
        block_set_print(tr, &new_free_set);
    }
    if (tr->failed) {
        pr_warn("transaction failed, abort\n");
        goto err_transaction_failed;
    }
    if (update_checkpoint) {
        checkpoint_update_roots(tr, new_checkpoint, &new_files,
                                &new_free_set.block_tree.root);
        block_put_dirty(tr, new_checkpoint, &new_checkpoint_ref,
                        &new_checkpoint_mac, NULL);
        /*
         * We have now released the block reference new_checkpoint_ref, so make
         * sure we don't release it again in err_transaction_failed
         */
        new_checkpoint = NULL;
    }
    /* Flush all dirty blocks belonging to this transaction. */
    block_cache_clean_transaction(tr);
    if (tr->failed) {
        pr_warn("transaction failed, abort\n");
        goto err_transaction_failed;
    }
    assert(block_range_empty(new_free_set.initial_range));
    check_free_tree(tr, &new_free_set);
    if (block_mac_same_block(tr, &tr->fs->free.block_tree.root,
                             &new_free_set.block_tree.root)) {
        /*
         * If the root block of the free tree did not move, there can be no
         * other changes to the filesystem.
         */
        assert(block_mac_eq(tr, &tr->fs->free.block_tree.root,
                            &new_free_set.block_tree.root));
        assert(block_mac_eq(tr, &tr->fs->files.root, &new_files));
        /*
         * Skip super block write if there are no changes to the filesystem.
         * This is needed in case a previous write error has triggered a request
         * to write another copy of the old super block. There can only be one
         * copy of each block in the cache. If we try to write a new super block
         * here before cleaning the pending one, we get a conflict. If there
         * were changes to the filesystem, the pending super block has already
         * been cleaned at this point.
         */
        goto complete_nop_transaction;
    }
    super_block_updated = update_super_block(tr, &new_free_set.block_tree.root,
                                             &new_files, &new_checkpoint_mac);
    if (!super_block_updated) {
        assert(tr->failed);
        pr_warn("failed to update super block, abort\n");
        goto err_transaction_failed;
    }
    block_cache_clean_transaction(tr);
    /*
     * If an error was detected writing the super block, it is not safe to
     * continue as we do not know if the write completed. We need to rewrite a
     * known state over the unknown super block to avoid an inconsistent view of
     * the filesystem.
     *
     * At this point block_cache_complete_write has been called by the block
     * device, so the current superblock slot in the block cache is free and not
     * associated with the pending transaction.
     */
    if (tr->failed) {
        pr_warn("failed to write super block, notify fs and abort the transaction\n");
        /*
         * Superblock could have been written or not. Make sure no other blocks
         * are written to the filesystem before writing another copy of the
         * superblock with the existing file and free trees.
         *
         * TODO: Don't trigger a superblock write on unaffected filesystems.
         * We update all for now to simplify testing.
         */
        fs_unknown_super_block_state_all();
        goto err_transaction_failed;
    }
    /* Commit succeeded; publish the new roots to the in-memory fs state. */
    tr->fs->free.block_tree.root = new_free_set.block_tree.root;
    block_range_clear(
            &tr->fs->free
                     .initial_range); /* clear for initial file-system state */
    tr->fs->files.root = new_files;
    tr->fs->super_block_version = tr->fs->written_super_block_version;
    tr->fs->checkpoint = new_checkpoint_mac;
    if (tr->repaired) {
        assert(!tr->fs->alternate_data);
        tr->fs->main_repaired = true;
    }
    if (update_checkpoint) {
        /* The new checkpoint shares its free tree root with the active fs. */
        tr->fs->checkpoint_free.block_tree.root = new_free_set.block_tree.root;
        block_range_clear(&tr->fs->checkpoint_free.initial_range);
    }
complete_nop_transaction:
    transaction_delete_active(tr);
    tr->complete = true;
    file_transaction_success(tr);
    assert(!tr->failed);
    check_free_tree(tr, &tr->fs->free);
    /* Fail other active transactions that conflict with this commit. */
    list_for_every_entry_safe(&tr->fs->transactions, other_tr, tmp_tr,
                              struct transaction, node) {
        if (tr->failed) {
            break;
        }
        if (!transaction_is_active(other_tr)) {
            continue;
        }
        if (tr->rebuild_free_set) {
            /*
             * TODO: only fail actually conflicting transactions when rebuilding
             * the free set. When rebuilding, tr->freed does not contain all
             * freed blocks if tree nodes were dropped. We could rebuild a free
             * set delta by subtracting the new free set from the old one and
             * then compare this delta against other transactions.
             */
            pr_warn("Rebuilding free set requires failing all pending transactions\n");
            transaction_fail(other_tr);
        } else if (block_set_overlap(tr, &tr->freed, &other_tr->freed)) {
            pr_warn("fail conflicting transaction\n");
            transaction_fail(other_tr);
        }
    }
    if (tr->failed) {
        /* Failing a conflict marked @tr failed; retry against all of them. */
        pr_warn("transaction failed while failing conflicting transactions\n");
        tr->failed = false;
        list_for_every_entry_safe(&tr->fs->transactions, other_tr, tmp_tr,
                                  struct transaction, node) {
            if (!transaction_is_active(other_tr)) {
                continue;
            }
            pr_warn("fail possibly conflicting transaction\n");
            transaction_fail(other_tr);
        }
    }
    assert(!tr->failed);
    block_cache_discard_transaction(tr, false);
err_transaction_failed:
    /* Release the checkpoint block reference if it was not consumed above. */
    if (new_checkpoint) {
        block_put_dirty_discard(new_checkpoint, &new_checkpoint_ref);
    }
    if (tr->failed) {
        file_transaction_complete_failed(tr);
    }
    assert(!block_cache_debug_get_ref_block_count());
}
/**
 * transaction_initial_super_block_complete - Complete special transaction
 * @tr: Transaction object. Must match initial_super_block_tr in fs.
 *
 * Flush the initial superblock in @tr to disk. If the block could not be
 * written re-initialize @tr and leave it in place for another attempt.
 * Otherwise clear @tr->fs->initial_super_block_tr and free @tr.
 *
 * The initial superblock can only be flushed from the block cache by the
 * block_cache_clean_transaction() call here, as we do not allow initial
 * superblocks to be flushed to make room for other data. This ensures that we
 * don't run out of room to recreate the superblock write in case it fails.
 */
void transaction_initial_super_block_complete(struct transaction* tr) {
    assert(tr == tr->fs->initial_super_block_tr);
    block_cache_clean_transaction(tr);
    if (tr->failed) {
        /*
         * If we failed to write the superblock we re-initialize a new attempt
         * to write that superblock before the next time we write to this
         * filesystem.
         */
        pr_err("%s: failed to write initial superblock, version %d.\n",
               __func__, tr->fs->written_super_block_version);
        write_current_super_block(tr->fs, true /* reinitialize */);
        return;
    }
    printf("%s: write initial superblock, version %d -> %d\n", __func__,
           tr->fs->super_block_version, tr->fs->written_super_block_version);
    assert(tr == tr->fs->initial_super_block_tr);
    tr->fs->super_block_version = tr->fs->written_super_block_version;
    tr->fs->initial_super_block_tr = NULL;
    /* not a real transaction, discard the state so it can be freed */
    transaction_fail(tr);
    transaction_free(tr);
    /*
     * NOTE(review): @tr is presumably heap-allocated by whoever created the
     * initial superblock transaction — confirm ownership; also this file does
     * not directly include <stdlib.h> for free(), which one of the project
     * headers must provide.
     */
    free(tr);
}
/**
 * transaction_activate - Activate transaction
 * @tr: Transaction object.
 *
 * Resets all per-transaction state and links the transaction's block sets
 * into the file system's allocated list, making it active.
 */
void transaction_activate(struct transaction* tr) {
    assert(tr->fs);
    assert(!transaction_is_active(tr));

    /* Clear status flags from any previous use of this object. */
    tr->failed = false;
    tr->invalid_block_found = false;
    tr->complete = false;
    tr->rebuild_free_set = false;
    tr->repaired = false;

    /* Reset allocator cursors and the pending free-set pointer. */
    tr->min_free_block = 0;
    tr->last_free_block = 0;
    tr->last_tmp_free_block = 0;
    tr->new_free_set = NULL;

    /* Start with empty block sets and file trees. */
    block_set_init(tr->fs, &tr->tmp_allocated);
    block_set_init(tr->fs, &tr->allocated);
    block_set_init(tr->fs, &tr->freed);
    fs_file_tree_init(tr->fs, &tr->files_added);
    fs_file_tree_init(tr->fs, &tr->files_updated);
    fs_file_tree_init(tr->fs, &tr->files_removed);

    /* Link the per-transaction sets into the fs-wide allocated list. */
    list_add_tail(&tr->fs->allocated, &tr->allocated.node);
    list_add_tail(&tr->fs->allocated, &tr->tmp_allocated.node);
}
/**
 * transaction_init - Initialize new transaction object
 * @tr:       Transaction object to initialize (contents are overwritten).
 * @fs:       File system state object.
 * @activate: If true, also activate the transaction immediately.
 *
 * Zeroes @tr, binds it to @fs and adds it to the file system's transaction
 * list. The open-files list starts empty.
 */
void transaction_init(struct transaction* tr, struct fs* fs, bool activate) {
    assert(fs);
    assert(fs->dev);

    memset(tr, 0, sizeof(*tr));
    tr->fs = fs;
    list_initialize(&tr->open_files);
    list_add_tail(&fs->transactions, &tr->node);

    if (activate) {
        transaction_activate(tr);
    }
}