// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "print-tree.h"
#include "tree-checker.h"

#define MAX_CONFLICT_INODES 10

/*
 * Values for the inode_only argument of btrfs_log_inode(), selecting how much
 * of an inode gets logged.
 */
enum {
	/* Log everything. */
	LOG_INODE_ALL,
	/* Log just enough to recreate the inode during log replay. */
	LOG_INODE_EXISTS,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * Stages for the tree walking.
 *
 * The first stage (0) is to only pin down the blocks we find. The second
 * stage (1) is to make sure that all the inodes we find in the log are
 * created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics.
 */
enum {
	/* Stage 0: only pin down the blocks we find. */
	LOG_WALK_PIN_ONLY,
	/* Stage 1: make sure all inodes found in the log exist in the subvolume. */
	LOG_WALK_REPLAY_INODES,
	/*
	 * NOTE(review): this stage is not covered by the description above;
	 * presumably it replays directory index items - confirm.
	 */
	LOG_WALK_REPLAY_DIR_INDEX,
	/* Last stage: directories, links, extents and everything else. */
	LOG_WALK_REPLAY_ALL,
};

/*
 * The walk control struct is used to pass state down the chain when processing
 * the log tree. The stage field tells us which part of the log tree processing
 * we are currently doing (one of the LOG_WALK_* values).
 */
struct walk_control {
	/*
	 * Signal that we are freeing the metadata extents of a log tree.
	 * This is used at transaction commit time while freeing a log tree.
	 */
	bool free;

	/*
	 * Signal that we are pinning the metadata extents of a log tree and the
	 * data extents its leaves point to (if using mixed block groups).
	 * This happens in the first stage of log replay to ensure that during
	 * replay, while we are modifying subvolume trees, we don't overwrite
	 * the metadata extents of log trees.
	 */
	bool pin;

	/* What stage of the replay code we're currently in. */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY.
	 */
	bool ignore_cur_inode;

	/*
	 * The root we are currently replaying to. This is NULL for the replay
	 * stage LOG_WALK_PIN_ONLY.
	 */
	struct btrfs_root *root;

	/* The log tree we are currently processing (not NULL for any stage). */
	struct btrfs_root *log;

	/* The transaction handle used for replaying all log trees. */
	struct btrfs_trans_handle *trans;

	/*
	 * The function that gets used to process blocks we find in the tree.
	 * Note the extent_buffer might not be up to date when it is passed in,
	 * and it must be checked or read if you need the data inside it.
	 *
	 * The gen and level arguments are the generation and tree level
	 * expected for eb (process_one_buffer() uses them to verify the block
	 * when it has to read it). Returns 0 on success or a negative errno.
	 */
	int (*process_func)(struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);

	/*
	 * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
	 * and by the replay_one_buffer() callback.
	 */

	/* The current log leaf being processed. */
	struct extent_buffer *log_leaf;
	/* The key being processed of the current log leaf. */
	struct btrfs_key log_key;
	/* The slot being processed of the current log leaf. */
	int log_slot;

	/*
	 * A path used for searches and modifications to subvolume trees. It is
	 * shared as a scratch path by the replay helpers.
	 */
	struct btrfs_path *subvol_path;
};

static void do_abort_log_replay(struct walk_control *wc, const char *function,
				unsigned int line, int error, const char *fmt, ...)
{
	struct btrfs_fs_info *fs_info = wc->trans->fs_info;
	struct va_format vaf;
	va_list args;

	/*
	 * Do nothing if we already aborted, to avoid dumping leaves again which
	 * can be verbose. Further more, only the first call is useful since it
	 * is where we have a problem. Note that we do not use the flag
	 * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
	 * are outside of tree-log.c that can abort transactions (such as
	 * btrfs_add_link() for example), so if that happens we still want to
	 * dump all log replay specific information below.
	 */
	if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
		return;

	btrfs_abort_transaction(wc->trans, error);

	if (wc->subvol_path && wc->subvol_path->nodes[0]) {
		btrfs_crit(fs_info,
			   "subvolume (root %llu) leaf currently being processed:",
			   btrfs_root_id(wc->root));
		btrfs_print_leaf(wc->subvol_path->nodes[0]);
	}

	if (wc->log_leaf) {
		btrfs_crit(fs_info,
	  "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):",
			   btrfs_root_id(wc->root), wc->log_slot,
			   wc->log_key.objectid, wc->log_key.type, wc->log_key.offset);
		btrfs_print_leaf(wc->log_leaf);
	}

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	btrfs_crit(fs_info,
	   "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
		   function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);

	va_end(args);
}

/*
 * Abort a transaction during log replay, recording the calling function and
 * line. Use this while down the call chain of replay_one_buffer(), so that we
 * get a lot more useful information for debugging issues when compared to a
 * plain call to btrfs_abort_transaction().
 */
#define btrfs_abort_log_replay(wc, error, fmt, ...)			\
	do_abort_log_replay((wc), __func__, __LINE__, (error), fmt,	\
			    ##__VA_ARGS__)

/* Forward declarations for static functions defined elsewhere in this file. */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
static noinline int replay_dir_deletes(struct walk_control *wc,
				       u64 dirid, bool del_all);
/* Used by start_log_trans() and join_running_log_trans() on zoned filesystems. */
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * Get the inode with number @objectid from the subvolume @root, for use while
 * logging or replaying a log tree.
 *
 * This is btrfs_iget() run under NOFS allocation semantics. The return value
 * is whatever btrfs_iget() returns (callers check it with IS_ERR()).
 */
static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
	struct btrfs_inode *result;
	unsigned int saved_flags;

	/* Log roots are not supported here, only subvolume roots. */
	ASSERT(btrfs_is_fstree(btrfs_root_id(root)));

	/*
	 * Both when logging and when replaying a log tree we hold a
	 * transaction handle. btrfs_alloc_inode() may get triggered and it
	 * allocates the inode with GFP_KERNEL, which can recurse back into the
	 * filesystem and attempt a transaction commit, resulting in a
	 * deadlock. Force NOFS semantics around the lookup to prevent that.
	 */
	saved_flags = memalloc_nofs_save();
	result = btrfs_iget(objectid, root);
	memalloc_nofs_restore(saved_flags);

	return result;
}

/*
 * Start a log sub transaction for @root, setting up the log root tree and the
 * log tree of the root if they do not exist yet.
 *
 * On success the log writer count of the root is incremented, which makes
 * tasks syncing the log tree wait for us to finish and, unless @ctx is for
 * logging a new name, @ctx is added to the list of contexts of the current
 * log transaction.
 *
 * Returns 0 on success, BTRFS_LOG_FORCE_COMMIT if the caller has to fall back
 * to a full transaction commit, or the error returned by
 * btrfs_init_log_root_tree() or btrfs_add_log_tree().
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	bool created = false;
	int ret = 0;

	/*
	 * Create the log root tree if it does not exist yet. This is done
	 * before taking the log_mutex of the root just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (ret == 0) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret != 0)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (!root->log_root) {
		/*
		 * If we did not create fs_info->log_root_tree ourselves, it
		 * was already created for some other FS trees. On zoned
		 * filesystems do the full commit, so that we do not mix nodes
		 * from multiple log transactions and keep writes sequential.
		 */
		if (zoned && !created) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		/* Brand new log tree, we are its first and only logging task. */
		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	} else {
		/* Slot of the log transaction before the current one. */
		const int prev_index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		/*
		 * On zoned filesystems wait for a commit of the previous log
		 * transaction that is still in progress and then re-evaluate
		 * the state of the root from scratch.
		 */
		if (zoned && atomic_read(&root->log_commit[prev_index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		/* Track whether more than one task is logging into this tree. */
		if (root->log_start_pid == 0) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		const int cur_index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[cur_index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * Join the running log transaction of @root, if there is one.
 *
 * Returns 0 if there was a log transaction running and we were able to join
 * it, in which case the log writer count of the root was incremented (it is
 * decremented by btrfs_end_log_trans()). Returns -ENOENT if there were no log
 * transactions in progress, in which case the writer count is left untouched.
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		/*
		 * On zoned filesystems wait for a commit of the previous log
		 * transaction that is still in progress and then check the
		 * log root again.
		 */
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
		/*
		 * Report success only here, where the writer reference is
		 * actually taken. Setting it before the wait above would leave
		 * a stale 0 behind if the log root is found to be gone when it
		 * is checked again, making the caller believe it joined a log
		 * transaction without holding a writer reference.
		 */
		ret = 0;
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * Pin the log of @root by incrementing its log writer count.
 *
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans().
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * Indicate that we are done making changes to the log tree of @root by
 * dropping our log writer reference. The task dropping the last reference
 * wakes up anyone waiting on the log writers to do a sync.
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	/* Other writers are still active, nobody to wake up yet. */
	if (!atomic_dec_and_test(&root->log_writers))
		return;

	/* atomic_dec_and_test implies a barrier */
	cond_wake_up_nomb(&root->log_writer_wait);
}

/*
 * process_func used to pin down the extents of a log tree.
 *
 * On filesystems with mixed block groups the block is read first (verified
 * against @level and @gen). Then, if wc->pin is set, the extent buffer is
 * pinned for log replay and, for up to date leaves, the extents logged in it
 * are excluded too.
 *
 * Returns 0 on success or a negative errno. On failure the transaction is
 * aborted, or the error is reported through btrfs_handle_fs_error() if there
 * is no transaction handle.
 */
static int process_one_buffer(struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_fs_info *fs_info = wc->log->fs_info;
	int ret;

	/*
	 * With mixed block groups we need to be able to process the leaves to
	 * pin down any logged extents, so the block has to be read.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		struct btrfs_tree_parent_check check = {
			.level = level,
			.transid = gen
		};

		ret = btrfs_read_extent_buffer(eb, &check);
		if (unlikely(ret)) {
			if (!trans)
				btrfs_handle_fs_error(fs_info, ret, NULL);
			else
				btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	/* Nothing else to do unless we were asked to pin extents. */
	if (!wc->pin)
		return 0;

	ASSERT(trans != NULL);
	ret = btrfs_pin_extent_for_log_replay(trans, eb);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	/* Only leaves that are up to date can have their extents excluded. */
	if (!btrfs_buffer_uptodate(eb, gen, false) || level != 0)
		return 0;

	ret = btrfs_exclude_logged_extents(eb);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

/*
 * Item overwrite used by log replay. The log tree leaf, slot and key from the
 * walk_control structure (wc->log_leaf, wc->log_slot and wc->log_key) all
 * refer to the source data we are copying out.
 *
 * The destination is the tree at wc->root, and wc->subvol_path is used as a
 * scratch path (it should be released on entry and it is released on every
 * successful return).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 *
 * Returns 0 on success or a negative errno, in which case log replay was
 * already aborted through btrfs_abort_log_replay().
 */
static int overwrite_item(struct walk_control *wc)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	struct extent_buffer *dst_eb;
	int dst_slot;
	const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);

	/*
	 * This is only used during log replay, so the root is always from a
	 * fs/subvolume tree. In case we ever need to support a log root, then
	 * we'll have to clone the leaf in the path, release the path and use
	 * the leaf before writing into the log tree. See the comments at
	 * copy_items() for more details.
	 */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);

	item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
	src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);

	/* Look for the key in the destination tree (read only search). */
	ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
		"failed to search subvolume tree for key (%llu %u %llu) root %llu",
				       wc->log_key.objectid, wc->log_key.type,
				       wc->log_key.offset, btrfs_root_id(root));
		return ret;
	}

	dst_eb = wc->subvol_path->nodes[0];
	dst_slot = wc->subvol_path->slots[0];

	if (ret == 0) {
		char *src_copy;
		const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);

		/*
		 * Sizes differ, so the contents can't match. The insert path
		 * below takes care of resizing the existing item.
		 */
		if (dst_size != item_size)
			goto insert;

		/* Both items are empty, there is nothing to compare or copy. */
		if (item_size == 0) {
			btrfs_release_path(wc->subvol_path);
			return 0;
		}
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!src_copy) {
			btrfs_abort_log_replay(wc, -ENOMEM,
			       "failed to allocate memory for log leaf item");
			return -ENOMEM;
		}

		/* Compare the logged item against the one in the subvolume. */
		read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
		dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
		ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);

		kfree(src_copy);
		/*
		 * They have the same contents, just return. This saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync.
		 */
		if (ret == 0) {
			btrfs_release_path(wc->subvol_path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 * Note that this modifies the inode item in the log leaf,
		 * which is what gets copied into the subvolume below.
		 */
		if (is_inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(dst_eb, dst_slot,
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(dst_eb, item);
			item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(wc->log_leaf, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(wc->log_leaf, item, 0);
		}
	} else if (is_inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(wc->log_leaf, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(wc->log_leaf, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(wc->log_leaf, item, 0);
	}
insert:
	btrfs_release_path(wc->subvol_path);
	/*
	 * Try to insert the key into the destination tree. The path's
	 * skip_release_on_error flag is set around the call; presumably this
	 * keeps the path pointing at the existing item when the insertion
	 * fails, as the path is used below for the -EEXIST and -EOVERFLOW
	 * cases - confirm against btrfs_insert_empty_item().
	 */
	wc->subvol_path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
	wc->subvol_path->skip_release_on_error = 0;

	dst_eb = wc->subvol_path->nodes[0];
	dst_slot = wc->subvol_path->slots[0];

	/* Make sure any existing item is the correct size. */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		const u32 found_size = btrfs_item_size(dst_eb, dst_slot);

		if (found_size > item_size)
			btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
	} else if (ret) {
		btrfs_abort_log_replay(wc, ret,
				       "failed to insert item for key (%llu %u %llu)",
				       wc->log_key.objectid, wc->log_key.type,
				       wc->log_key.offset);
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);

	/*
	 * Don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * Log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes.
	 */
	if (is_inode_item && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		/*
		 * These are offsets into the extent buffers cast to item
		 * pointers, only meant to be passed to the item accessors.
		 */
		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
			const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		/* Remember the directory's i_size so it survives the copy below. */
		if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(dst_eb, dst_item);
		}
	}

	/* Copy the whole item from the log leaf into the subvolume leaf. */
	copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
	}

	/* Make sure the generation is filled in. */
	if (is_inode_item) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(dst_eb, dst_item) == 0)
			btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
	}
no_copy:
	btrfs_release_path(wc->subvol_path);
	return 0;
}

/*
 * Copy @len bytes stored at offset @start of the extent buffer @eb into a
 * newly allocated buffer and store the buffer and its length in @name.
 *
 * Returns 0 on success or -ENOMEM if the buffer could not be allocated, in
 * which case @name is left untouched. Presumably the caller is responsible
 * for freeing name->name with kfree() - confirm against the callers.
 */
static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
			       struct fscrypt_str *name)
{
	char *copy = kmalloc(len, GFP_NOFS);

	if (!copy)
		return -ENOMEM;

	/* @start is an offset into the extent buffer, not a memory address. */
	read_extent_buffer(eb, copy, (unsigned long)start, len);
	name->len = len;
	name->name = copy;

	return 0;
}

/*
 * Replays the single file extent item at slot wc->log_slot of the log leaf
 * wc->log_leaf, with key wc->log_key, into the subvolume wc->root.
 * wc->subvol_path is used as a scratch path, it is released on entry and
 * should be released on exit.
 *
 * Extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 *
 * Returns 0 on success or a negative errno, in which case log replay was
 * already aborted through btrfs_abort_log_replay().
 */
static noinline int replay_one_extent(struct walk_control *wc)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	const u64 start = wc->log_key.offset;
	u64 nbytes = 0;
	u64 csum_start;
	u64 csum_end;
	LIST_HEAD(ordered_sums);
	u64 offset;
	unsigned long dest_offset;
	struct btrfs_key ins;
	struct btrfs_file_extent_item *item;
	struct btrfs_inode *inode = NULL;
	int ret = 0;

	item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(wc->log_leaf, item);

	/*
	 * Compute the end offset of the file range covered by the extent and
	 * the number of bytes it adds to the inode's byte count.
	 */
	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
		/* Holes don't take up space. */
		if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
			nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
		extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
	} else {
		btrfs_abort_log_replay(wc, -EUCLEAN,
		       "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
				       found_type, btrfs_root_id(root),
				       wc->log_key.objectid, wc->log_key.offset);
		return -EUCLEAN;
	}

	inode = btrfs_iget_logging(wc->log_key.objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		btrfs_abort_log_replay(wc, ret,
				       "failed to get inode %llu for root %llu",
				       wc->log_key.objectid, btrfs_root_id(root));
		return ret;
	}

	/*
	 * First check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 *
	 * NOTE(review): a negative return value from the lookup is not
	 * checked here, ret gets overwritten by btrfs_drop_extents() below.
	 * Confirm that this is intended.
	 */
	ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
				       btrfs_ino(inode), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct extent_buffer *leaf = wc->subvol_path->nodes[0];
		struct btrfs_file_extent_item existing;
		unsigned long ptr;

		ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
		read_extent_buffer(leaf, &existing, ptr, sizeof(existing));

		/*
		 * We already have a pointer to this exact extent,
		 * we don't have to do anything.
		 */
		if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
					 sizeof(existing)) == 0) {
			btrfs_release_path(wc->subvol_path);
			goto out;
		}
	}
	btrfs_release_path(wc->subvol_path);

	/* Drop any overlapping extents. */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	drop_args.path = wc->subvol_path;
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
	       "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
				       wc->log_key.objectid, start, extent_end,
				       btrfs_root_id(root));
		goto out;
	}

	if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* Inline extents are easy, we just overwrite them. */
		ret = overwrite_item(wc);
		if (ret)
			goto out;
		goto update_inode;
	}

	/*
	 * If not an inline extent, it can only be a regular or prealloc one.
	 * We have checked that above and returned -EUCLEAN if not.
	 */

	/* A hole and NO_HOLES feature enabled, nothing else to do. */
	if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
	    btrfs_fs_incompat(fs_info, NO_HOLES))
		goto update_inode;

	/* Insert the file extent item and copy it verbatim from the log leaf. */
	ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
				      &wc->log_key, sizeof(*item));
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
		       "failed to insert item with key (%llu %u %llu) root %llu",
				       wc->log_key.objectid, wc->log_key.type,
				       wc->log_key.offset, btrfs_root_id(root));
		goto out;
	}
	dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
					    wc->subvol_path->slots[0]);
	copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
			   (unsigned long)item, sizeof(*item));

	/*
	 * We have an explicit hole and NO_HOLES is not enabled. We have added
	 * the hole file extent item to the subvolume tree, so we don't have
	 * anything else to do other than update the file extent item range and
	 * update the inode item.
	 */
	if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
		btrfs_release_path(wc->subvol_path);
		goto update_inode;
	}

	/*
	 * Key of the data extent in the extent tree, and the file offset
	 * used for the data backref (the file offset of the item minus the
	 * offset into the extent).
	 */
	ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
	offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);

	/*
	 * Manually record dirty extent, as here we did a shallow file extent
	 * item copy and skip normal backref update, but modifying extent tree
	 * all by ourselves. So need to manually record dirty extent for qgroup,
	 * as the owner of the file extent changed from log tree (doesn't affect
	 * qgroup) to fs/file tree (affects qgroup).
	 */
	ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
				       ins.objectid, ins.offset,
				       wc->log_key.objectid, btrfs_root_id(root));
		goto out;
	}

	/*
	 * Is this extent already allocated in the extent tree?
	 * If so, just add a reference.
	 */
	ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
				       ins.objectid, ins.offset,
				       wc->log_key.objectid, btrfs_root_id(root));
		goto out;
	} else if (ret == 0) {
		struct btrfs_ref ref = {
			.action = BTRFS_ADD_DELAYED_REF,
			.bytenr = ins.objectid,
			.num_bytes = ins.offset,
			.owning_root = btrfs_root_id(root),
			.ref_root = btrfs_root_id(root),
		};

		btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_log_replay(wc, ret,
"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
					       ins.objectid, ins.offset,
					       wc->log_key.objectid,
					       btrfs_root_id(root));
			goto out;
		}
	} else {
		/* Insert the extent pointer in the extent tree. */
		ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
						     wc->log_key.objectid, offset, &ins);
		if (ret) {
			btrfs_abort_log_replay(wc, ret,
"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
					       ins.objectid, ins.offset, offset,
					       wc->log_key.objectid, btrfs_root_id(root));
			goto out;
		}
	}

	btrfs_release_path(wc->subvol_path);

	/*
	 * Range of checksums to look up: the whole extent on disk for
	 * compressed extents, otherwise only the part of the extent that is
	 * referenced by this file extent item.
	 */
	if (btrfs_file_extent_compression(wc->log_leaf, item)) {
		csum_start = ins.objectid;
		csum_end = csum_start + ins.offset;
	} else {
		csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
		csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
	}

	ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
				      &ordered_sums, false);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
	       "failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
				       csum_start, csum_end, wc->log_key.objectid,
				       btrfs_root_id(root));
		goto out;
	}
	/* A non-negative return value is not an error, start the loop clean. */
	ret = 0;
	/*
	 * Now delete all existing csums in the csum root that cover our range.
	 * We do this because we can have an extent that is completely
	 * referenced by one file extent item and partially referenced by
	 * another file extent item (like after using the clone or extent_same
	 * ioctls). In this case if we end up doing the replay of the one that
	 * partially references the extent first, and we do not do the csum
	 * deletion below, we can get 2 csum items in the csum tree that overlap
	 * each other. For example, imagine our log has the two following file
	 * extent items:
	 *
	 * key (257 EXTENT_DATA 409600)
	 *     extent data disk byte 12845056 nr 102400
	 *     extent data offset 20480 nr 20480 ram 102400
	 *
	 * key (257 EXTENT_DATA 819200)
	 *     extent data disk byte 12845056 nr 102400
	 *     extent data offset 0 nr 102400 ram 102400
	 *
	 * Where the second one fully references the 100K extent that starts at
	 * disk byte 12845056, and the log tree has a single csum item that
	 * covers the entire range of the extent:
	 *
	 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
	 *
	 * After the first file extent item is replayed, the csum tree gets the
	 * following csum item:
	 *
	 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
	 *
	 * Which covers the 20K sub-range starting at offset 20K of our extent.
	 * Now when we replay the second file extent item, if we do not delete
	 * existing csum items that cover any of its blocks, we end up getting
	 * two csum items in our csum tree that overlap each other:
	 *
	 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
	 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
	 *
	 * Which is a problem, because after this anyone trying to lookup for
	 * the checksum of any block of our extent starting at an offset of 40K
	 * or higher, will end up looking at the second csum item only, which
	 * does not contain the checksum for any block starting at offset 40K or
	 * higher of our extent.
	 *
	 * Once an error happens we stop modifying the csum tree but we keep
	 * iterating in order to free the remaining entries of the list.
	 */
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums;
		struct btrfs_root *csum_root;

		sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
		csum_root = btrfs_csum_root(fs_info, sums->logical);
		if (!ret) {
			ret = btrfs_del_csums(trans, csum_root, sums->logical,
					      sums->len);
			if (ret)
				btrfs_abort_log_replay(wc, ret,
	       "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
						       sums->logical,
						       sums->logical + sums->len,
						       wc->log_key.objectid,
						       btrfs_root_id(root));
		}
		if (!ret) {
			ret = btrfs_csum_file_blocks(trans, csum_root, sums);
			if (ret)
				btrfs_abort_log_replay(wc, ret,
	       "failed to add csums for range [%llu, %llu) inode %llu root %llu",
						       sums->logical,
						       sums->logical + sums->len,
						       wc->log_key.objectid,
						       btrfs_root_id(root));
		}
		list_del(&sums->list);
		kfree(sums);
	}
	if (ret)
		goto out;

update_inode:
	/*
	 * Record the file range of the replayed extent in the inode, account
	 * the bytes added by it and the bytes of the extents that were
	 * dropped, and then update the inode item.
	 */
	ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
	       "failed to set file extent range [%llu, %llu) inode %llu root %llu",
				       start, extent_end, wc->log_key.objectid,
				       btrfs_root_id(root));
		goto out;
	}

	btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, inode);
	if (ret)
		btrfs_abort_log_replay(wc, ret,
				       "failed to update inode %llu root %llu",
				       wc->log_key.objectid, btrfs_root_id(root));
out:
	/* Every jump to this label happens after the inode was obtained. */
	iput(&inode->vfs_inode);
	return ret;
}

/*
 * Remove the link @name that points from directory @dir to @inode while
 * replaying a log tree, then flush delayed items.
 *
 * Returns 0 on success. On failure the error of the failing step is reported
 * through btrfs_abort_log_replay() and returned.
 */
static int unlink_inode_for_log_replay(struct walk_control *wc,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{
	struct btrfs_trans_handle *handle = wc->trans;
	int err;

	err = btrfs_unlink_inode(handle, dir, inode, name);
	if (err) {
		btrfs_abort_log_replay(wc, err,
	       "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
				       btrfs_ino(inode), btrfs_ino(dir), name->len,
				       name->name, btrfs_root_id(inode->root));
		return err;
	}

	/*
	 * Name existence checks done during log replay look at the
	 * fs/subvolume tree, so the delayed items must be run right after the
	 * unlink. Otherwise later checks would still see the name we have just
	 * removed.
	 */
	err = btrfs_run_delayed_items(handle);
	if (!err)
		return 0;

	btrfs_abort_log_replay(wc, err,
"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
			       btrfs_ino(inode), btrfs_ino(dir), name->len,
			       name->name, btrfs_root_id(inode->root));
	return err;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct walk_control *wc,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_inode *inode;
	struct fscrypt_str name;
	struct extent_buffer *leaf = wc->subvol_path->nodes[0];
	struct btrfs_key location;
	int ret;

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
				       "failed to allocate name for dir %llu root %llu",
				       btrfs_ino(dir), btrfs_root_id(root));
		return ret;
	}

	btrfs_release_path(wc->subvol_path);

	inode = btrfs_iget_logging(location.objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		btrfs_abort_log_replay(wc, ret,
		       "failed to open inode %llu parent dir %llu name %.*s root %llu",
				       location.objectid, btrfs_ino(dir),
				       name.len, name.name, btrfs_root_id(root));
		inode = NULL;
		goto out;
	}

	ret = link_to_fixup_dir(wc, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
	kfree(name.name);
	if (inode)
		iput(&inode->vfs_inode);
	return ret;
}

/*
 * Check whether a name and sequence number taken from an inode back reference
 * are already present in directory @dirid and point to inode @objectid.
 *
 * Both the dir index item (keyed by @index) and the dir item (keyed by the
 * name) must exist and reference @objectid for the answer to be positive.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists. @path is always released on return.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_key found;
	struct btrfs_dir_item *item;
	int ret = 0;

	/* First the index based entry. */
	item = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					   index, name, 0);
	if (IS_ERR(item)) {
		ret = PTR_ERR(item);
		goto out;
	}
	if (!item)
		goto out;

	btrfs_dir_item_key_to_cpu(path->nodes[0], item, &found);
	if (found.objectid != objectid)
		goto out;

	/* Then the name based entry. */
	btrfs_release_path(path);
	item = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(item)) {
		ret = PTR_ERR(item);
		goto out;
	}
	if (item) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], item, &found);
		if (found.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * Look in the log tree @log for a named back reference of an inode. This is
 * used to decide whether a back reference found in the subvolume conflicts
 * with what is in the log.
 *
 * An inode back reference item may hold several refs. Replay handles one
 * reference at a time and must not delete a valid link from the subvolume if
 * that same link is also present in the log.
 *
 * Returns 1 if @name is found in the item with key @key, 0 if it is not (or
 * the item does not exist in the log) and < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	BTRFS_PATH_AUTO_FREE(path);
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0)
		return ret;
	/* No item with this key in the log, so the name can't be there. */
	if (ret == 1)
		return 0;

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		return btrfs_find_name_in_ext_backref(path->nodes[0],
						      path->slots[0],
						      ref_objectid, name) != NULL;

	return btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
					  name) != NULL;
}

/*
 * Walk the names of the inode ref item that wc->subvol_path points to and
 * unlink the first one that is not present in the log tree.
 *
 * Returns 0 if every name of the item is also in the log, -EAGAIN after one
 * name was unlinked (wc->subvol_path has been released, the caller must search
 * again) and any other negative errno on failure.
 */
static int unlink_refs_not_in_log(struct walk_control *wc,
				  struct btrfs_key *search_key,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode)
{
	struct extent_buffer *leaf = wc->subvol_path->nodes[0];
	unsigned long cur;
	unsigned long end;

	cur = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
	end = cur + btrfs_item_size(leaf, wc->subvol_path->slots[0]);

	/*
	 * Names that are also in the log are allowed to stay, any other name
	 * is a conflict and has to be unlinked.
	 */
	while (cur < end) {
		struct btrfs_inode_ref *ref = (struct btrfs_inode_ref *)cur;
		struct fscrypt_str victim_name;
		int ret;

		ret = read_alloc_one_name(leaf, (ref + 1),
					  btrfs_inode_ref_name_len(leaf, ref),
					  &victim_name);
		if (ret) {
			btrfs_abort_log_replay(wc, ret,
	       "failed to allocate name for inode %llu parent dir %llu root %llu",
					       btrfs_ino(inode), btrfs_ino(dir),
					       btrfs_root_id(inode->root));
			return ret;
		}

		ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
		if (ret < 0) {
			btrfs_abort_log_replay(wc, ret,
"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
					       btrfs_ino(inode), btrfs_ino(dir),
					       victim_name.len, victim_name.name,
					       btrfs_root_id(inode->root));
			kfree(victim_name.name);
			return ret;
		}
		if (ret > 0) {
			/* The name is in the log too, move on to the next one. */
			kfree(victim_name.name);
			cur = (unsigned long)(ref + 1) + victim_name.len;
			continue;
		}

		/* Not in the log: unlink it and make the caller start over. */
		inc_nlink(&inode->vfs_inode);
		btrfs_release_path(wc->subvol_path);

		ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
		kfree(victim_name.name);
		return ret ? ret : -EAGAIN;
	}

	return 0;
}

/*
 * Extended ref counterpart of unlink_refs_not_in_log(): walk the extrefs of
 * the item that wc->subvol_path points to and unlink the first name whose
 * parent is @dir and which is not present in the log tree.
 *
 * @search_key is overwritten with the extref key of each name that is checked.
 *
 * Returns 0 if nothing had to be unlinked, -EAGAIN after one name was unlinked
 * (wc->subvol_path has been released, the caller must search again) and any
 * other negative errno on failure.
 */
static int unlink_extrefs_not_in_log(struct walk_control *wc,
				     struct btrfs_key *search_key,
				     struct btrfs_inode *dir,
				     struct btrfs_inode *inode)
{
	struct extent_buffer *leaf = wc->subvol_path->nodes[0];
	const unsigned long item_start = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
	const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
	u32 pos = 0;

	while (pos < item_size) {
		struct btrfs_inode_extref *extref;
		struct fscrypt_str victim_name;
		int ret;

		extref = (struct btrfs_inode_extref *)(item_start + pos);
		victim_name.len = btrfs_inode_extref_name_len(leaf, extref);

		/* Extrefs for other parent directories are not our business. */
		if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir)) {
			pos += victim_name.len + sizeof(*extref);
			continue;
		}

		ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
					  &victim_name);
		if (ret) {
			btrfs_abort_log_replay(wc, ret,
	       "failed to allocate name for inode %llu parent dir %llu root %llu",
					       btrfs_ino(inode), btrfs_ino(dir),
					       btrfs_root_id(inode->root));
			return ret;
		}

		search_key->objectid = btrfs_ino(inode);
		search_key->type = BTRFS_INODE_EXTREF_KEY;
		search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
						       victim_name.name,
						       victim_name.len);
		ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
		if (ret < 0) {
			btrfs_abort_log_replay(wc, ret,
"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
					       btrfs_ino(inode), btrfs_ino(dir),
					       victim_name.len, victim_name.name,
					       btrfs_root_id(inode->root));
			kfree(victim_name.name);
			return ret;
		}
		if (ret > 0) {
			/* The name is in the log too, keep it and go on. */
			kfree(victim_name.name);
			pos += victim_name.len + sizeof(*extref);
			continue;
		}

		/* Not in the log: unlink it and make the caller start over. */
		inc_nlink(&inode->vfs_inode);
		btrfs_release_path(wc->subvol_path);

		ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
		kfree(victim_name.name);
		return ret ? ret : -EAGAIN;
	}

	return 0;
}

/*
 * Remove everything in the subvolume tree that conflicts with the link
 * (@dir, @name, @ref_index) -> @inode that is about to be replayed.
 *
 * In order, this:
 *  - unlinks names of the existing inode ref item for (@inode, @dir) that are
 *    not in the log;
 *  - does the same for extended refs;
 *  - drops a dir index item of @dir that uses @ref_index;
 *  - drops a dir item of @dir that uses @name.
 *
 * Every failure is reported through btrfs_abort_log_replay(), either here or
 * by the helper that failed.
 *
 * Returns 0 when the caller can go ahead and add the link, 1 when an inode ref
 * item exists and the inode is its own parent (root directory back reference,
 * nothing to do) and < 0 on error.
 */
static inline int __add_inode_ref(struct walk_control *wc,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 ref_index, struct fscrypt_str *name)
{
	int ret;
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = btrfs_ino(inode);
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = btrfs_ino(dir);
	ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
	       "failed to search subvolume tree for key (%llu %u %llu) root %llu",
				       search_key.objectid, search_key.type,
				       search_key.offset, btrfs_root_id(root));
		return ret;
	} else if (ret == 0) {
		/*
		 * Are we trying to overwrite a back ref for the root directory?
		 * If so, we're done.
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/*
		 * -EAGAIN means a name was unlinked and the path released, so
		 * the whole lookup has to be redone from the start.
		 */
		ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
		if (ret == -EAGAIN)
			goto again;
		else if (ret)
			return ret;
	}
	btrfs_release_path(wc->subvol_path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
					   btrfs_ino(inode), btrfs_ino(dir));
	if (IS_ERR(extref)) {
		ret = PTR_ERR(extref);
		/*
		 * Report the failure here, like every other error path of this
		 * function does, the caller does not do it for us.
		 */
		btrfs_abort_log_replay(wc, ret,
"failed to lookup extref for inode %llu parent dir %llu name %.*s root %llu",
				       btrfs_ino(inode), btrfs_ino(dir), name->len,
				       name->name, btrfs_root_id(root));
		return ret;
	} else if (extref) {
		ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
		if (ret == -EAGAIN)
			goto again;
		else if (ret)
			return ret;
	}
	btrfs_release_path(wc->subvol_path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
					 ref_index, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		btrfs_abort_log_replay(wc, ret,
"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
				       btrfs_ino(dir), ref_index, name->len,
				       name->name, btrfs_root_id(root));
		return ret;
	} else if (di) {
		ret = drop_one_dir_item(wc, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(wc->subvol_path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		btrfs_abort_log_replay(wc, ret,
	"failed to lookup dir item for dir %llu name %.*s root %llu",
				       btrfs_ino(dir), name->len, name->name,
				       btrfs_root_id(root));
		return ret;
	} else if (di) {
		ret = drop_one_dir_item(wc, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(wc->subvol_path);

	return 0;
}

/*
 * Decode the extended inode ref located at @ref_ptr in @eb.
 *
 * A copy of the name is allocated and stored in @name, the caller has to
 * kfree() name->name. @index and @parent_objectid are optional outputs and may
 * be NULL.
 *
 * Returns 0 on success or the error returned by read_alloc_one_name().
 */
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     struct fscrypt_str *name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *item = (struct btrfs_inode_extref *)ref_ptr;
	int err;

	err = read_alloc_one_name(eb, &item->name,
				  btrfs_inode_extref_name_len(eb, item), name);
	if (err)
		return err;

	if (index)
		*index = btrfs_inode_extref_index(eb, item);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, item);

	return 0;
}

/*
 * Decode the inode ref located at @ref_ptr in @eb.
 *
 * A copy of the name (stored right after the ref structure) is allocated and
 * stored in @name, the caller has to kfree() name->name. @index is an optional
 * output and may be NULL.
 *
 * Returns 0 on success or the error returned by read_alloc_one_name().
 */
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  struct fscrypt_str *name, u64 *index)
{
	struct btrfs_inode_ref *item = (struct btrfs_inode_ref *)ref_ptr;
	int err;

	err = read_alloc_one_name(eb, item + 1, btrfs_inode_ref_name_len(eb, item),
				  name);
	if (err)
		return err;

	if (index)
		*index = btrfs_inode_ref_index(eb, item);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 *
 * The log item is the one at wc->log_leaf / wc->log_slot and its key is
 * wc->log_key. Both regular (INODE_REF) and extended (INODE_EXTREF) items are
 * handled, depending on wc->log_key.type.
 *
 * Returns 0 on success (including when the subvolume tree has no item with
 * that key) and < 0 on error. wc->subvol_path is released on return.
 */
static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
	struct btrfs_root *root = wc->root;
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	/*
	 * We get back here after every unlink: the path was released and the
	 * item in the subvolume tree was modified, so it has to be looked up
	 * and walked from its start again.
	 */
	btrfs_release_path(wc->subvol_path);
	ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
	if (ret > 0) {
		/* No item with this key in the subvolume tree, nothing to do. */
		ret = 0;
		goto out;
	}
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
	       "failed to search subvolume tree for key (%llu %u %llu) root %llu",
				       wc->log_key.objectid, wc->log_key.type,
				       wc->log_key.offset, btrfs_root_id(root));
		goto out;
	}

	eb = wc->subvol_path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
	while (ref_ptr < ref_end) {
		struct fscrypt_str name;
		u64 parent_id;

		/* Get a copy of the current name and the id of its parent dir. */
		if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						NULL, &parent_id);
			if (ret) {
				btrfs_abort_log_replay(wc, ret,
			       "failed to get extref details for inode %llu root %llu",
						       btrfs_ino(inode),
						       btrfs_root_id(root));
				goto out;
			}
		} else {
			/* For regular refs the parent dir is the key's offset. */
			parent_id = wc->log_key.offset;
			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
			if (ret) {
				btrfs_abort_log_replay(wc, ret,
	       "failed to get ref details for inode %llu parent_id %llu root %llu",
						       btrfs_ino(inode), parent_id,
						       btrfs_root_id(root));
				goto out;
			}
		}

		/* Is this subvolume tree name also in the item from the log? */
		if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
							       parent_id, &name);
		else
			ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
							   &name);

		if (!ret) {
			struct btrfs_inode *dir;

			/*
			 * The name is not in the log, unlink it from its parent
			 * directory and then restart from scratch.
			 */
			btrfs_release_path(wc->subvol_path);
			dir = btrfs_iget_logging(parent_id, root);
			if (IS_ERR(dir)) {
				ret = PTR_ERR(dir);
				kfree(name.name);
				btrfs_abort_log_replay(wc, ret,
				       "failed to lookup dir inode %llu root %llu",
						       parent_id, btrfs_root_id(root));
				goto out;
			}
			ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
			kfree(name.name);
			iput(&dir->vfs_inode);
			if (ret)
				goto out;
			goto again;
		}

		/* The name is in the log too, advance to the next ref entry. */
		kfree(name.name);
		ref_ptr += name.len;
		if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
 out:
	btrfs_release_path(wc->subvol_path);
	return ret;
}

/*
 * Replay one inode back reference item found in the log tree.
 *
 * The item is the one at wc->log_leaf / wc->log_slot with key wc->log_key and
 * it can be either a regular (INODE_REF) or an extended (INODE_EXTREF) item.
 * For each name in the item that is not yet correctly linked in the subvolume
 * tree, conflicting entries are removed and the link is added. After that,
 * names in the subvolume tree item that are not in the log item are unlinked
 * and the log item is copied over with overwrite_item().
 *
 * wc->subvol_path is for temporary use by this function and is released on
 * return.
 *
 * Returns 0 on success (including when the first parent directory does not
 * exist in the subvolume tree) and < 0 on error.
 */
static noinline int add_inode_ref(struct walk_control *wc)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	struct btrfs_inode *dir = NULL;
	struct btrfs_inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct fscrypt_str name = { 0 };
	int ret;
	const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
	ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);

	/*
	 * For extrefs the parent dir is stored in each entry, so start with the
	 * parent of the first one. For regular refs it is the key's offset and
	 * it is the same for all the names in the item.
	 */
	if (is_extref_item) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = wc->log_key.offset;
	}
	inode_objectid = wc->log_key.objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = btrfs_iget_logging(parent_objectid, root);
	if (IS_ERR(dir)) {
		ret = PTR_ERR(dir);
		if (ret == -ENOENT)
			ret = 0;
		else
			btrfs_abort_log_replay(wc, ret,
			       "failed to lookup dir inode %llu root %llu",
					       parent_objectid, btrfs_root_id(root));
		/* Make sure the cleanup at 'out' does not iput an error pointer. */
		dir = NULL;
		goto out;
	}

	inode = btrfs_iget_logging(inode_objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		btrfs_abort_log_replay(wc, ret,
				       "failed to lookup inode %llu root %llu",
				       inode_objectid, btrfs_root_id(root));
		inode = NULL;
		goto out;
	}

	/* Process every name stored in the log item, one at a time. */
	while (ref_ptr < ref_end) {
		if (is_extref_item) {
			ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
						&ref_index, &parent_objectid);
			if (ret) {
				btrfs_abort_log_replay(wc, ret,
			       "failed to get extref details for inode %llu root %llu",
						       btrfs_ino(inode),
						       btrfs_root_id(root));
				goto out;
			}
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			/*
			 * dir is only non-NULL here for the first entry, it is
			 * dropped at the end of every iteration for extrefs.
			 */
			if (!dir) {
				dir = btrfs_iget_logging(parent_objectid, root);
				if (IS_ERR(dir)) {
					ret = PTR_ERR(dir);
					dir = NULL;
					/*
					 * A new parent dir may have not been
					 * logged and not exist in the subvolume
					 * tree, see the comment above before
					 * the loop when getting the first
					 * parent dir.
					 */
					if (ret == -ENOENT) {
						/*
						 * The next extref may refer to
						 * another parent dir that
						 * exists, so continue.
						 */
						ret = 0;
						goto next;
					} else {
						btrfs_abort_log_replay(wc, ret,
				       "failed to lookup dir inode %llu root %llu",
								       parent_objectid,
								       btrfs_root_id(root));
					}
					goto out;
				}
			}
		} else {
			ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
			if (ret) {
				btrfs_abort_log_replay(wc, ret,
	"failed to get ref details for inode %llu parent_objectid %llu root %llu",
						       btrfs_ino(inode),
						       parent_objectid,
						       btrfs_root_id(root));
				goto out;
			}
		}

		/* Is the name already linked to this inode with this index? */
		ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
				   btrfs_ino(inode), ref_index, &name);
		if (ret < 0) {
			btrfs_abort_log_replay(wc, ret,
"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
					       btrfs_ino(inode), btrfs_ino(dir),
					       ref_index, name.len, name.name,
					       btrfs_root_id(root));
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */
			ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
			if (ret) {
				/*
				 * A return value of 1 is not an error, but it
				 * still ends the replay of this item, without
				 * reaching the overwrite_item() call below.
				 */
				if (ret == 1)
					ret = 0;
				goto out;
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
			if (ret) {
				btrfs_abort_log_replay(wc, ret,
"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
						       btrfs_ino(inode),
						       btrfs_ino(dir), ref_index,
						       name.len, name.name,
						       btrfs_root_id(root));
				goto out;
			}

			ret = btrfs_update_inode(trans, inode);
			if (ret) {
				btrfs_abort_log_replay(wc, ret,
				       "failed to update inode %llu root %llu",
						       btrfs_ino(inode),
						       btrfs_root_id(root));
				goto out;
			}
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

next:
		/*
		 * Advance past the current entry and free its name. The name
		 * pointer is reset so that the kfree() at 'out' never frees it
		 * a second time.
		 */
		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
		kfree(name.name);
		name.name = NULL;
		/* The next extref may have another parent, drop this one. */
		if (is_extref_item && dir) {
			iput(&dir->vfs_inode);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(wc, inode);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(wc);
out:
	btrfs_release_path(wc->subvol_path);
	kfree(name.name);
	if (dir)
		iput(&dir->vfs_inode);
	if (inode)
		iput(&inode->vfs_inode);
	return ret;
}

/*
 * Count the names stored in all the extended inode ref items of @inode.
 *
 * Returns the number of names found, or a negative errno if looking up an
 * extref item failed with an error other than -ENOENT. @path is released on
 * return.
 */
static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
	const u64 ino = btrfs_ino(inode);
	unsigned int count = 0;
	u64 search_offset = 0;
	int ret;

	for (;;) {
		struct btrfs_inode_extref *extref;
		struct extent_buffer *leaf;
		unsigned long item_start;
		u32 item_size;
		u32 pos = 0;

		ret = btrfs_find_one_extref(inode->root, ino, search_offset,
					    path, &extref, &search_offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);

		/* Every entry packed in the item is one link to the inode. */
		while (pos < item_size) {
			int name_len;

			extref = (struct btrfs_inode_extref *)(item_start + pos);
			name_len = btrfs_inode_extref_name_len(leaf, extref);
			pos += name_len + sizeof(*extref);
			count++;
		}

		/* Continue the search right after the item just processed. */
		search_offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return count;
}

/*
 * Count the names stored in all the (non extended) inode ref items of @inode.
 *
 * The items are walked from the highest possible key offset down to the
 * lowest one.
 *
 * Returns the number of names found, or a negative errno if searching the tree
 * failed. @path is released on return.
 */
static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			/* No exact match, step back to the previous item. */
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						   path->slots[0]);
		/* Every name packed in the item is one link to the inode. */
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		/* Keep walking backwards in the same leaf while we can. */
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	/*
	 * ret is only ever set by btrfs_search_slot(). If the search failed we
	 * must not return a partial count, otherwise the caller would set a
	 * wrong link count on the inode instead of failing.
	 */
	if (ret < 0)
		return ret;

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct walk_control *wc,
					   struct btrfs_inode *inode)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = inode->root;
	int ret;
	u64 nlink = 0;
	const u64 ino = btrfs_ino(inode);

	ret = count_inode_refs(inode, wc->subvol_path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(inode, wc->subvol_path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->vfs_inode.i_nlink) {
		set_nlink(&inode->vfs_inode, nlink);
		ret = btrfs_update_inode(trans, inode);
		if (ret)
			goto out;
	}
	if (S_ISDIR(inode->vfs_inode.i_mode))
		inode->index_cnt = (u64)-1;

	if (inode->vfs_inode.i_nlink == 0) {
		if (S_ISDIR(inode->vfs_inode.i_mode)) {
			ret = replay_dir_deletes(wc, ino, true);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_release_path(wc->subvol_path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
	int ret;
	struct btrfs_key key;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		struct btrfs_trans_handle *trans = wc->trans;
		struct btrfs_root *root = wc->root;
		struct btrfs_inode *inode;

		ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (wc->subvol_path->slots[0] == 0)
				break;
			wc->subvol_path->slots[0]--;
		}

		btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, wc->subvol_path);
		if (ret)
			break;

		btrfs_release_path(wc->subvol_path);
		inode = btrfs_iget_logging(key.offset, root);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			break;
		}

		ret = fixup_inode_link_count(wc, inode);
		iput(&inode->vfs_inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(wc->subvol_path);
	return ret;
}


/*
 * Record inode @objectid in the fixup dir, so that its link count gets checked
 * once replay is done. The link count is bumped here (or set to 1 if it was 0)
 * so that the inode does not go away before we check it.
 *
 * If the inode is already recorded nothing is changed.
 *
 * Returns 0 on success and < 0 on error. wc->subvol_path is released on
 * return.
 */
static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	struct btrfs_inode *inode;
	struct inode *vi;
	struct btrfs_key key;
	int ret = 0;

	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		btrfs_abort_log_replay(wc, ret,
				       "failed to lookup inode %llu root %llu",
				       objectid, btrfs_root_id(root));
		return ret;
	}
	vi = &inode->vfs_inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
	btrfs_release_path(wc->subvol_path);

	if (ret == -EEXIST) {
		/* Already recorded, its link count was bumped back then. */
		ret = 0;
		goto out;
	}
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
		       "failed to insert fixup item for inode %llu root %llu",
				       objectid, btrfs_root_id(root));
		goto out;
	}

	if (vi->i_nlink)
		inc_nlink(vi);
	else
		set_nlink(vi, 1);

	ret = btrfs_update_inode(trans, inode);
	if (ret)
		btrfs_abort_log_replay(wc, ret,
			       "failed to update inode %llu root %llu",
				       objectid, btrfs_root_id(root));
out:
	iput(vi);

	return ret;
}

/*
 * When the log of a directory is replayed, names are only inserted for inodes
 * that actually exist. This means that an fsync on a directory does not
 * implicitly fsync all the new files in it.
 *
 * Adds the link @name, with index @index, in directory @dirid for the inode
 * given by @location. Fails if either the inode or the directory can't be
 * opened.
 *
 * Returns 0 on success and < 0 on error.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct btrfs_inode *target;
	struct btrfs_inode *parent;
	int ret;

	target = btrfs_iget_logging(location->objectid, root);
	if (IS_ERR(target))
		return PTR_ERR(target);

	parent = btrfs_iget_logging(dirid, root);
	if (IS_ERR(parent)) {
		ret = PTR_ERR(parent);
		parent = NULL;
	} else {
		ret = btrfs_add_link(trans, parent, target, name, 1, index);
	}

	/* FIXME, put inode into FIXUP list */

	iput(&target->vfs_inode);
	if (parent)
		iput(&parent->vfs_inode);
	return ret;
}

/*
 * Decide what to do with the directory entry @dst_di, found in the subvolume
 * tree leaf held by wc->subvol_path, which has the same name or index as an
 * entry from the log.
 *
 * Returns 1 if the existing entry is identical to the one from the log (same
 * location key and flags), in which case it is kept. Returns 0 if the entry is
 * different but the inode of the log entry does not exist (@exists is false),
 * in which case it is kept as well. Otherwise the entry is dropped and the
 * result of drop_one_dir_item() is returned (0 on success, < 0 on error).
 */
static int delete_conflicting_dir_entry(struct walk_control *wc,
					struct btrfs_inode *dir,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_flags,
					bool exists)
{
	struct extent_buffer *leaf = wc->subvol_path->nodes[0];
	struct btrfs_key dst_key;
	bool same_entry;

	btrfs_dir_item_key_to_cpu(leaf, dst_di, &dst_key);

	same_entry = dst_key.objectid == log_key->objectid &&
		     dst_key.type == log_key->type &&
		     dst_key.offset == log_key->offset &&
		     btrfs_dir_flags(leaf, dst_di) == log_flags;
	/* The existing dentry points to the same inode, don't delete it. */
	if (same_entry)
		return 1;

	/*
	 * Only drop the conflicting directory entry if the inode for the new
	 * entry exists.
	 */
	return exists ? drop_one_dir_item(wc, dir, dst_di) : 0;
}

/*
 * Take a single entry in a log directory item and replay it into the
 * subvolume.
 *
 * If a conflicting item exists in the subvolume directory already, the inode
 * it points to is unlinked and put into the link count fixup tree.
 *
 * If a name from the log points to a file or directory that does not exist in
 * the FS, it is skipped.  fsyncs on directories do not force down inodes
 * inside that directory, just changes to the names or unlinks in a directory.
 *
 * The directory being replayed into is wc->log_key.objectid and the index of
 * the entry is wc->log_key.offset; @di is the dir item in wc->log_leaf.
 *
 * No explicit i_size update of the directory is done here: every successful
 * path either leaves the directory untouched or changes it only through
 * drop_one_dir_item() and insert_one_name().
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode, already exists, or will be added later when its inode
 * ref is replayed) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	struct fscrypt_str name = { 0 };
	struct btrfs_dir_item *dir_dst_di;
	struct btrfs_dir_item *index_dst_di;
	bool dir_dst_matches = false;
	bool index_dst_matches = false;
	struct btrfs_key log_key;
	struct btrfs_key search_key;
	struct btrfs_inode *dir;
	u8 log_flags;
	bool exists;
	int ret;
	bool name_added = false;

	dir = btrfs_iget_logging(wc->log_key.objectid, root);
	if (IS_ERR(dir)) {
		ret = PTR_ERR(dir);
		btrfs_abort_log_replay(wc, ret,
				       "failed to lookup dir inode %llu root %llu",
				       wc->log_key.objectid, btrfs_root_id(root));
		return ret;
	}

	/* The name is stored right after the dir item in the log leaf. */
	ret = read_alloc_one_name(wc->log_leaf, di + 1,
				  btrfs_dir_name_len(wc->log_leaf, di), &name);
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
			       "failed to allocate name for dir %llu root %llu",
				       btrfs_ino(dir), btrfs_root_id(root));
		goto out;
	}

	/* Check if the inode the log dentry points to exists in the subvolume. */
	log_flags = btrfs_dir_flags(wc->log_leaf, di);
	btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
	ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
	btrfs_release_path(wc->subvol_path);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
				       "failed to lookup inode %llu root %llu",
				       log_key.objectid, btrfs_root_id(root));
		goto out;
	}
	exists = (ret == 0);
	ret = 0;

	/* Look for a subvolume dentry with the same name (dir item key). */
	dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
					   wc->log_key.objectid, &name, 1);
	if (IS_ERR(dir_dst_di)) {
		ret = PTR_ERR(dir_dst_di);
		btrfs_abort_log_replay(wc, ret,
		       "failed to lookup dir item for dir %llu name %.*s root %llu",
				       wc->log_key.objectid, name.len, name.name,
				       btrfs_root_id(root));
		goto out;
	} else if (dir_dst_di) {
		ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
						   &log_key, log_flags, exists);
		if (ret < 0) {
			btrfs_abort_log_replay(wc, ret,
	       "failed to delete conflicting entry for dir %llu name %.*s root %llu",
					       btrfs_ino(dir), name.len, name.name,
					       btrfs_root_id(root));
			goto out;
		}
		dir_dst_matches = (ret == 1);
	}

	btrfs_release_path(wc->subvol_path);

	/* Look for a subvolume dentry with the same index (dir index key). */
	index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
						   wc->log_key.objectid,
						   wc->log_key.offset, &name, 1);
	if (IS_ERR(index_dst_di)) {
		ret = PTR_ERR(index_dst_di);
		btrfs_abort_log_replay(wc, ret,
	       "failed to lookup dir index item for dir %llu name %.*s root %llu",
				       wc->log_key.objectid, name.len, name.name,
				       btrfs_root_id(root));
		goto out;
	} else if (index_dst_di) {
		ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
						   &log_key, log_flags, exists);
		if (ret < 0) {
			btrfs_abort_log_replay(wc, ret,
	       "failed to delete conflicting entry for dir %llu name %.*s root %llu",
					       btrfs_ino(dir), name.len, name.name,
					       btrfs_root_id(root));
			goto out;
		}
		index_dst_matches = (ret == 1);
	}

	btrfs_release_path(wc->subvol_path);

	/* Both keys already hold an identical dentry, nothing to replay. */
	if (dir_dst_matches && index_dst_matches) {
		ret = 0;
		goto out;
	}

	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = wc->log_key.objectid;
	ret = backref_in_log(root->log_root, &search_key, 0, &name);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
				       search_key.objectid, btrfs_ino(dir),
				       name.len, name.name, btrfs_root_id(root));
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		goto out;
	}

	/* Same check, but for an extended inode reference. */
	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
	ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
				       search_key.objectid, btrfs_ino(dir),
				       name.len, name.name, btrfs_root_id(root));
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		goto out;
	}

	/*
	 * No back reference for this name in the log, so add the name now.
	 * -ENOENT and -EEXIST are tolerated and simply mean the name was not
	 * added by us.
	 */
	ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
			      &name, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST) {
		btrfs_abort_log_replay(wc, ret,
		       "failed to insert name %.*s for inode %llu dir %llu root %llu",
				       name.len, name.name, log_key.objectid,
				       btrfs_ino(dir), btrfs_root_id(root));
		goto out;
	}
	if (!ret)
		name_added = true;
	ret = 0;

out:
	kfree(name.name);
	iput(&dir->vfs_inode);
	if (!ret && name_added)
		ret = 1;
	return ret;
}

/*
 * Replay one dir item from a BTRFS_DIR_INDEX_KEY key.
 *
 * Returns 0 on success and the error of the failing step otherwise. The
 * positive "name was replayed" result of replay_one_name() is consumed here
 * and never propagated: replay_one_buffer() stops processing the log leaf on
 * any non-zero return value and the log tree walk treats a positive value as
 * "done", so leaking it would silently end the replay early.
 */
static noinline int replay_one_dir_item(struct walk_control *wc)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
	ret = replay_one_name(wc, di);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in the transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to. Otherwise something like the following would result in a
	 * directory pointing to an inode with a wrong link that does not account
	 * for this dir entry:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * touch testdir/bar
	 * sync
	 *
	 * ln testdir/bar testdir/bar_link
	 * ln testdir/foo testdir/foo_link
	 * xfs_io -c "fsync" testdir/bar
	 *
	 * <power failure>
	 *
	 * mount fs, log replay happens
	 *
	 * File foo would remain with a link count of 1 when it has two entries
	 * pointing to it in the directory testdir. This would make it impossible
	 * to ever delete the parent directory as it would result in stale
	 * dentries that can never be deleted.
	 */
	if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
		struct btrfs_key di_key;

		btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
		return link_to_fixup_dir(wc, di_key.objectid);
	}

	/*
	 * Either the name was not replayed or it refers to a directory, which
	 * needs no link count fixup. Report plain success in both cases.
	 */
	return 0;
}

/*
 * Directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 *
 * This helper searches @root for the BTRFS_DIR_LOG_INDEX_KEY range item of
 * directory @dirid that contains the offset *@start_ret or, failing that, for
 * the range item that follows it.
 *
 * Returns 0 with *@start_ret and *@end_ret set to the key offset and the
 * logged end of the range found, a value > 0 if there is no such range item
 * and < 0 on error. @path is released on every return path that used it.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	/* The previous range ended at the maximum offset, nothing is left. */
	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = BTRFS_DIR_LOG_INDEX_KEY;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/*
		 * No exact match. Step back to the previous item, which may be
		 * a range that starts before *start_ret and covers it. If there
		 * is no previous item in this leaf, give up with ret > 0.
		 */
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	/* On an exact match (ret == 0) 'key' already describes the item. */
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	/* Does this range item cover the requested start offset? */
	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		/* Any non-zero result (error or no more leaves) is returned as is. */
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	/* The next item is a range of this directory, report it. */
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * This looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked.
 *
 * The subvolume dentry to check is the item @wc->subvol_path currently points
 * to and @dir_key is its key, which must be a BTRFS_DIR_INDEX_KEY. When
 * @force_remove is true the lookup in the log is skipped and the dentry is
 * removed unconditionally.
 *
 * Both @wc->subvol_path and @log_path are released before returning.
 *
 * Returns 0 if the dentry was found in the log or was successfully removed,
 * otherwise the error of the failing step.
 */
static noinline int check_item_in_log(struct walk_control *wc,
				      struct btrfs_path *log_path,
				      struct btrfs_inode *dir,
				      struct btrfs_key *dir_key,
				      bool force_remove)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = dir->root;
	int ret;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_dir_item *di;
	struct fscrypt_str name = { 0 };
	struct btrfs_inode *inode = NULL;
	struct btrfs_key location;

	/*
	 * Currently we only log dir index keys. Even if we replay a log created
	 * by an older kernel that logged both dir index and dir item keys, all
	 * we need to do is process the dir index keys, we (and our caller) can
	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
	 */
	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);

	/* Copy the dentry's name out of the subvolume leaf. */
	eb = wc->subvol_path->nodes[0];
	slot = wc->subvol_path->slots[0];
	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
		       "failed to allocate name for dir %llu index %llu root %llu",
				       btrfs_ino(dir), dir_key->offset,
				       btrfs_root_id(root));
		goto out;
	}

	if (!force_remove) {
		struct btrfs_dir_item *log_di;

		log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
						     dir_key->objectid,
						     dir_key->offset, &name, 0);
		if (IS_ERR(log_di)) {
			ret = PTR_ERR(log_di);
			btrfs_abort_log_replay(wc, ret,
	"failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
					       btrfs_ino(dir), dir_key->offset,
					       name.len, name.name,
					       btrfs_root_id(root));
			goto out;
		} else if (log_di) {
			/* The dentry exists in the log, we have nothing to do. */
			ret = 0;
			goto out;
		}
	}

	/*
	 * The dentry must go away. Read the key of the inode it points to
	 * while the subvolume leaf is still referenced by the path, then
	 * release both paths before looking up the inode.
	 */
	btrfs_dir_item_key_to_cpu(eb, di, &location);
	btrfs_release_path(wc->subvol_path);
	btrfs_release_path(log_path);
	inode = btrfs_iget_logging(location.objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		/* Make sure the iput() at 'out' is skipped. */
		inode = NULL;
		btrfs_abort_log_replay(wc, ret,
				       "failed to lookup inode %llu root %llu",
				       location.objectid, btrfs_root_id(root));
		goto out;
	}

	/* Record the inode for a link count fixup. */
	ret = link_to_fixup_dir(wc, location.objectid);
	if (ret)
		goto out;

	/*
	 * NOTE(review): the link count is bumped right before the unlink,
	 * presumably so that the unlink's decrement can't drop it too far and
	 * the recorded fixup settles the final value - confirm.
	 */
	inc_nlink(&inode->vfs_inode);
	ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
	/*
	 * Unlike dir item keys, dir index keys can only have one name (entry) in
	 * them, as there are no key collisions since each key has a unique offset
	 * (an index number), so we're done.
	 */
out:
	btrfs_release_path(wc->subvol_path);
	btrfs_release_path(log_path);
	kfree(name.name);
	if (inode)
		iput(&inode->vfs_inode);
	return ret;
}

/*
 * Delete from the subvolume every xattr of inode wc->log_key.objectid that is
 * not present in the log tree.
 *
 * All BTRFS_XATTR_ITEM_KEY items of the inode in the subvolume are walked and
 * every name found in them is looked up in the log tree. Names missing from
 * the log are deleted from the subvolume.
 *
 * wc->subvol_path is used for the subvolume searches and is released before
 * returning.
 *
 * Returns 0 on success and < 0 on error.
 */
static int replay_xattr_deletes(struct walk_control *wc)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_root *root = wc->root;
	struct btrfs_root *log = wc->log;
	struct btrfs_key search_key;
	BTRFS_PATH_AUTO_FREE(log_path);
	const u64 ino = wc->log_key.objectid;
	int nritems;
	int ret;

	log_path = btrfs_alloc_path();
	if (!log_path) {
		btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
		return -ENOMEM;
	}

	search_key.objectid = ino;
	search_key.type = BTRFS_XATTR_ITEM_KEY;
	search_key.offset = 0;
again:
	/* (Re)start the walk at the first xattr item at or after search_key. */
	ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
	if (ret < 0) {
		btrfs_abort_log_replay(wc, ret,
			       "failed to search xattrs for inode %llu root %llu",
				       ino, btrfs_root_id(root));
		goto out;
	}
process_leaf:
	nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
	for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
		struct btrfs_key key;
		struct btrfs_dir_item *di;
		struct btrfs_dir_item *log_di;
		u32 total_size;
		u32 cur;

		btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
		/* Walked past the last xattr item of this inode, we're done. */
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
			ret = 0;
			goto out;
		}

		/* One item may hold several entries packed back to back. */
		di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
		total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
		cur = 0;
		while (cur < total_size) {
			u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
			u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
			u32 this_len = sizeof(*di) + name_len + data_len;
			char *name;

			name = kmalloc(name_len, GFP_NOFS);
			if (!name) {
				ret = -ENOMEM;
				btrfs_abort_log_replay(wc, ret,
				       "failed to allocate memory for name of length %u",
						       name_len);
				goto out;
			}
			/* The name follows the dir item header in the leaf. */
			read_extent_buffer(wc->subvol_path->nodes[0], name,
					   (unsigned long)(di + 1), name_len);

			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
						    name, name_len, 0);
			/* Only the NULL/error state of log_di is used below. */
			btrfs_release_path(log_path);
			if (!log_di) {
				/* Doesn't exist in log tree, so delete it. */
				btrfs_release_path(wc->subvol_path);
				di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
							name, name_len, -1);
				if (IS_ERR(di)) {
					ret = PTR_ERR(di);
					btrfs_abort_log_replay(wc, ret,
		       "failed to lookup xattr with name %.*s for inode %llu root %llu",
							       name_len, name, ino,
							       btrfs_root_id(root));
					kfree(name);
					goto out;
				}
				/* We just read this name from the subvolume. */
				ASSERT(di);
				ret = btrfs_delete_one_dir_name(trans, root,
								wc->subvol_path, di);
				if (ret) {
					btrfs_abort_log_replay(wc, ret,
		       "failed to delete xattr with name %.*s for inode %llu root %llu",
							       name_len, name, ino,
							       btrfs_root_id(root));
					kfree(name);
					goto out;
				}
				btrfs_release_path(wc->subvol_path);
				kfree(name);
				/*
				 * The path was released and the item modified,
				 * so search again starting at the current key.
				 */
				search_key = key;
				goto again;
			}
			if (IS_ERR(log_di)) {
				ret = PTR_ERR(log_di);
				btrfs_abort_log_replay(wc, ret,
	"failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
						       name_len, name, ino,
						       btrfs_root_id(root));
				kfree(name);
				goto out;
			}
			/* The xattr is in the log, keep it and go to the next entry. */
			kfree(name);
			cur += this_len;
			di = (struct btrfs_dir_item *)((char *)di + this_len);
		}
	}
	/* Done with this leaf, continue with the next one if there is any. */
	ret = btrfs_next_leaf(root, wc->subvol_path);
	if (ret > 0)
		ret = 0;
	else if (ret == 0)
		goto process_leaf;
	else
		btrfs_abort_log_replay(wc, ret,
			       "failed to get next leaf in subvolume root %llu",
				       btrfs_root_id(root));
out:
	btrfs_release_path(wc->subvol_path);
	return ret;
}


/*
 * Deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 *
 * @dirid is the directory to process. When @del_all is true the log is not
 * consulted at all: a single range covering every index is used and each
 * dentry found is removed unconditionally.
 *
 * Returns 0 on success (including when the directory inode does not exist yet)
 * and the error of the failing step otherwise.
 */
static noinline int replay_dir_deletes(struct walk_control *wc,
				       u64 dirid, bool del_all)
{
	struct btrfs_root *root = wc->root;
	struct btrfs_root *log = (del_all ? NULL : wc->log);
	u64 range_start;
	u64 range_end;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct btrfs_inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_INDEX_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path) {
		btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
		return -ENOMEM;
	}

	dir = btrfs_iget_logging(dirid, root);
	/*
	 * It isn't an error if the inode isn't there, that can happen because
	 * we replay the deletes before we copy in the inode item from the log.
	 */
	if (IS_ERR(dir)) {
		btrfs_free_path(log_path);
		ret = PTR_ERR(dir);
		if (ret == -ENOENT)
			ret = 0;
		else
			btrfs_abort_log_replay(wc, ret,
			       "failed to lookup dir inode %llu root %llu",
					       dirid, btrfs_root_id(root));
		return ret;
	}

	range_start = 0;
	range_end = 0;
	/* Outer loop: one iteration per range the log is authoritative for. */
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, wc->subvol_path, dirid,
					     &range_start, &range_end);
			if (ret < 0) {
				btrfs_abort_log_replay(wc, ret,
			       "failed to find range for dir %llu in log tree root %llu",
						       dirid, btrfs_root_id(root));
				goto out;
			} else if (ret > 0) {
				/* No more ranges in the log. */
				break;
			}
		}

		/*
		 * Inner loop: visit every dir index item of the subvolume
		 * directory inside [range_start, range_end]. The search is
		 * redone for every item because check_item_in_log() releases
		 * the subvolume path.
		 */
		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key,
						wc->subvol_path, 0, 0);
			if (ret < 0) {
				btrfs_abort_log_replay(wc, ret,
			       "failed to search root %llu for key (%llu %u %llu)",
						       btrfs_root_id(root),
						       dir_key.objectid, dir_key.type,
						       dir_key.offset);
				goto out;
			}

			nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
			if (wc->subvol_path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, wc->subvol_path);
				if (ret == 1) {
					break;
				} else if (ret < 0) {
					btrfs_abort_log_replay(wc, ret,
				       "failed to get next leaf in subvolume root %llu",
							       btrfs_root_id(root));
					goto out;
				}
			}
			btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
					      wc->subvol_path->slots[0]);
			/* No more dir index items for this directory, all done. */
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type) {
				ret = 0;
				goto out;
			}

			/* Past the end of the current range, go find the next one. */
			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
			if (ret)
				goto out;
			/* Avoid wrapping around to offset 0 below. */
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(wc->subvol_path);
		/* The range reached the maximum offset, nothing can follow it. */
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}
	ret = 0;
out:
	btrfs_release_path(wc->subvol_path);
	btrfs_free_path(log_path);
	iput(&dir->vfs_inode);
	return ret;
}

/*
 * The process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 *
 * Only leaves (@level == 0) are processed, for any other level 0 is returned
 * right away. @gen and @level are used to verify the extent buffer when it is
 * read. wc->subvol_path is allocated for the duration of the call and
 * wc->log_leaf, wc->log_slot and wc->log_key are set up for the helpers
 * called from here.
 *
 * Returns 0 on success, otherwise the non-zero value of the step that made
 * the processing of the leaf stop.
 */
static int replay_one_buffer(struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen, int level)
{
	int nritems;
	struct btrfs_tree_parent_check check = {
		.transid = gen,
		.level = level
	};
	struct btrfs_root *root = wc->root;
	struct btrfs_trans_handle *trans = wc->trans;
	int ret;

	if (level != 0)
		return 0;

	/*
	 * Set to NULL since it was not yet read and in case we abort log replay
	 * on error, we have no valid log tree leaf to dump.
	 */
	wc->log_leaf = NULL;
	ret = btrfs_read_extent_buffer(eb, &check);
	if (ret) {
		btrfs_abort_log_replay(wc, ret,
		       "failed to read log tree leaf %llu for root %llu",
				       eb->start, btrfs_root_id(root));
		return ret;
	}

	ASSERT(wc->subvol_path == NULL);
	wc->subvol_path = btrfs_alloc_path();
	if (!wc->subvol_path) {
		btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
		return -ENOMEM;
	}

	wc->log_leaf = eb;

	nritems = btrfs_header_nritems(eb);
	for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
		struct btrfs_inode_item *inode_item = NULL;

		btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);

		if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
			inode_item = btrfs_item_ptr(eb, wc->log_slot,
						    struct btrfs_inode_item);
			/*
			 * An inode with no links is either:
			 *
			 * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
			 *    got linked before the fsync, skip it, as replaying
			 *    it is pointless since it would be deleted later.
			 *    We skip logging tmpfiles, but it's always possible
			 *    we are replaying a log created with a kernel that
			 *    used to log tmpfiles;
			 *
			 * 2) A non-tmpfile which got its last link deleted
			 *    while holding an open fd on it and later got
			 *    fsynced through that fd. We always log the
			 *    parent inodes when inode->last_unlink_trans is
			 *    set to the current transaction, so ignore all the
			 *    inode items for this inode. We will delete the
			 *    inode when processing the parent directory with
			 *    replay_dir_deletes().
			 */
			if (btrfs_inode_nlink(eb, inode_item) == 0) {
				wc->ignore_cur_inode = true;
				continue;
			} else {
				wc->ignore_cur_inode = false;
			}
		}

		/* Inode keys are done during the first stage. */
		if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			u32 mode;

			/* Deletions are replayed before the inode item is copied. */
			ret = replay_xattr_deletes(wc);
			if (ret)
				break;
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
				if (ret)
					break;
			}
			ret = overwrite_item(wc);
			if (ret)
				break;

			/*
			 * Before replaying extents, truncate the inode to its
			 * size. We need to do it now and not after log replay
			 * because before an fsync we can have prealloc extents
			 * added beyond the inode's i_size. If we did it after,
			 * through orphan cleanup for example, we would drop
			 * those prealloc extents just after replaying them.
			 */
			if (S_ISREG(mode)) {
				struct btrfs_drop_extents_args drop_args = { 0 };
				struct btrfs_inode *inode;
				u64 from;

				inode = btrfs_iget_logging(wc->log_key.objectid, root);
				if (IS_ERR(inode)) {
					ret = PTR_ERR(inode);
					btrfs_abort_log_replay(wc, ret,
					       "failed to lookup inode %llu root %llu",
							       wc->log_key.objectid,
							       btrfs_root_id(root));
					break;
				}
				/* Drop everything from i_size, rounded up to a sector, onwards. */
				from = ALIGN(i_size_read(&inode->vfs_inode),
					     root->fs_info->sectorsize);
				drop_args.start = from;
				drop_args.end = (u64)-1;
				drop_args.drop_cache = true;
				drop_args.path = wc->subvol_path;
				ret = btrfs_drop_extents(trans, root, inode,  &drop_args);
				if (ret) {
					btrfs_abort_log_replay(wc, ret,
		       "failed to drop extents for inode %llu root %llu offset %llu",
							       btrfs_ino(inode),
							       btrfs_root_id(root),
							       from);
				} else {
					inode_sub_bytes(&inode->vfs_inode,
							drop_args.bytes_found);
					/* Update the inode's nbytes. */
					ret = btrfs_update_inode(trans, inode);
					if (ret)
						btrfs_abort_log_replay(wc, ret,
					       "failed to update inode %llu root %llu",
								       btrfs_ino(inode),
								       btrfs_root_id(root));
				}
				/* Drop our inode reference on success and failure alike. */
				iput(&inode->vfs_inode);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc, wc->log_key.objectid);
			if (ret)
				break;
		}

		/* Skip all other items of an inode that has no links. */
		if (wc->ignore_cur_inode)
			continue;

		/* Any non-zero return value stops the processing of this leaf. */
		if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc);
			if (ret)
				break;
		}

		/* Everything below is only done in the last stage. */
		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc);
			if (ret)
				break;
		} else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
			   wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc);
			if (ret)
				break;
		} else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc);
			if (ret)
				break;
		}
		/*
		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
		 * older kernel with such keys, ignore them.
		 */
	}
	btrfs_free_path(wc->subvol_path);
	wc->subvol_path = NULL;
	return ret;
}

static int clean_log_buffer(struct btrfs_trans_handle *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_block_group *bg;

	btrfs_tree_lock(eb);
	btrfs_clear_buffer_dirty(trans, eb);
	wait_on_extent_buffer_writeback(eb);
	btrfs_tree_unlock(eb);

	if (trans) {
		int ret;

		ret = btrfs_pin_reserved_extent(trans, eb);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		return ret;
	}

	bg = btrfs_lookup_block_group(fs_info, eb->start);
	if (!bg) {
		btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
		btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
		return -ENOENT;
	}

	spin_lock(&bg->space_info->lock);
	spin_lock(&bg->lock);
	bg->reserved -= fs_info->nodesize;
	bg->space_info->bytes_reserved -= fs_info->nodesize;
	spin_unlock(&bg->lock);
	spin_unlock(&bg->space_info->lock);

	btrfs_put_block_group(bg);

	return 0;
}

/*
 * Walk down the log tree starting at path->nodes[*level].
 *
 * While above level 1, the child node pointed to by the current slot is read,
 * stored in @path and descended into. At level 1 the children are leaves: each
 * one is handed to wc->process_func and, if wc->free is set, read and cleaned
 * with clean_log_buffer(), without ever being stored in @path.
 *
 * When the loop ends, the slot of the node at *level is set to its number of
 * items.
 *
 * Returns 0 on success, otherwise the non-zero value returned by the failing
 * step (including whatever wc->process_func returned).
 */
static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
				       struct walk_control *wc)
{
	struct btrfs_trans_handle *trans = wc->trans;
	struct btrfs_fs_info *fs_info = wc->log->fs_info;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	int ret = 0;

	while (*level > 0) {
		struct btrfs_tree_parent_check check = { 0 };

		cur = path->nodes[*level];

		WARN_ON(btrfs_header_level(cur) != *level);

		/* All the children of this node were visited already. */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		/* Collect what is needed to verify the child when it is read. */
		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		check.transid = ptr_gen;
		check.level = *level - 1;
		check.has_first_key = true;
		btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);

		next = btrfs_find_create_tree_block(fs_info, bytenr,
						    btrfs_header_owner(cur),
						    *level - 1);
		if (IS_ERR(next)) {
			ret = PTR_ERR(next);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}

		if (*level == 1) {
			/* The child is a leaf, process it without descending. */
			ret = wc->process_func(next, wc, ptr_gen, *level - 1);
			if (ret) {
				free_extent_buffer(next);
				return ret;
			}

			path->slots[*level]++;
			if (wc->free) {
				/* Read the leaf before cleaning it. */
				ret = btrfs_read_extent_buffer(next, &check);
				if (ret) {
					free_extent_buffer(next);
					if (trans)
						btrfs_abort_transaction(trans, ret);
					else
						btrfs_handle_fs_error(fs_info, ret, NULL);
					return ret;
				}

				ret = clean_log_buffer(trans, next);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_extent_buffer(next, &check);
		if (ret) {
			free_extent_buffer(next);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}

		/* Descend: the child replaces whatever the path had one level down. */
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	/* Mark the node at the current level as having no slots left to visit. */
	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}

/*
 * Walk up the log tree from *level, looking for a node that still has slots
 * left to visit.
 *
 * Nodes without any slots left are handed to wc->process_func, cleaned with
 * clean_log_buffer() if wc->free is set, and then released and removed from
 * @path.
 *
 * Returns 0 if a node with a slot left to visit was found, in which case its
 * slot was advanced and *level set to its level, 1 if the loop ran out of
 * levels to visit, otherwise the non-zero value returned by the failing step.
 */
static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
				     struct walk_control *wc)
{
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			/* More children to visit here, resume the walk down. */
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			/*
			 * This node is done. Note that *level equals i here:
			 * it starts out equal and is set to i + 1 below, right
			 * before the loop increments i.
			 */
			ret = wc->process_func(path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]),
				 *level);
			if (ret)
				return ret;

			if (wc->free) {
				ret = clean_log_buffer(wc->trans, path->nodes[*level]);
				if (ret)
					return ret;
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * Walk the log tree wc->log, starting at its root node, by alternating
 * walk_down_log_tree() and walk_up_log_tree() until one of them reports a
 * positive value or fails.
 *
 * If the root node is still in the path once the walk is over, it is handed to
 * wc->process_func here and, when wc->free is set, cleaned with
 * clean_log_buffer().
 *
 * Returns 0 on success, -ENOMEM if the path can't be allocated, otherwise the
 * value of the step that failed.
 */
static int walk_log_tree(struct walk_control *wc)
{
	struct btrfs_root *log = wc->log;
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *root_eb;
	int orig_level;
	int level;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	orig_level = btrfs_header_level(log->node);
	level = orig_level;
	/* The path takes its own reference on the root node. */
	path->nodes[level] = log->node;
	refcount_inc(&log->node->refs);
	path->slots[level] = 0;

	for (;;) {
		ret = walk_down_log_tree(path, &level, wc);
		if (ret > 0)
			break;
		if (ret < 0)
			return ret;

		ret = walk_up_log_tree(path, &level, wc);
		if (ret > 0)
			break;
		if (ret < 0)
			return ret;
	}

	/* Was the root node processed? If not, catch it here. */
	root_eb = path->nodes[orig_level];
	if (!root_eb)
		return 0;

	ret = wc->process_func(root_eb, wc, btrfs_header_generation(root_eb),
			       orig_level);
	if (ret)
		return ret;

	if (wc->free)
		ret = clean_log_buffer(wc->trans, root_eb);

	return ret;
}

/*
 * Helper function to write the root item of a given subvolume's log root into
 * the tree of log roots. The item is inserted when log->log_transid is 1 and
 * updated in place otherwise.
 *
 * Returns the result of btrfs_insert_root() or btrfs_update_root().
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log,
			   struct btrfs_root_item *root_item)
{
	struct btrfs_root *log_root_tree = log->fs_info->log_root_tree;

	/* Insert the root item on the first sync. */
	if (log->log_transid == 1)
		return btrfs_insert_root(trans, log_root_tree, &log->root_key,
					 root_item);

	return btrfs_update_root(trans, log_root_tree, &log->root_key,
				 root_item);
}

/*
 * Wait for the commit of log transaction @transid to be over, that is, until
 * root->log_transid_committed has reached @transid or no commit is flagged as
 * in progress in the slot used by @transid.
 *
 * Expects root->log_mutex to be held: it is dropped while sleeping and taken
 * again before the condition is rechecked.
 */
static void wait_log_commit(struct btrfs_root *root, int transid)
{
	const int slot = transid % 2;
	DEFINE_WAIT(wait);

	/*
	 * We only allow two pending log transactions at a time, so we know
	 * that if ours is more than 2 older than the current transaction,
	 * we're done.
	 */
	while (1) {
		prepare_to_wait(&root->log_commit_wait[slot], &wait,
				TASK_UNINTERRUPTIBLE);

		if (root->log_transid_committed >= transid ||
		    !atomic_read(&root->log_commit[slot]))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_commit_wait[slot], &wait);
}

/*
 * Wait until root->log_writers drops to zero.
 *
 * Expects root->log_mutex to be held: it is dropped while sleeping and taken
 * again before the counter is rechecked.
 */
static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (atomic_read(&root->log_writers) == 0)
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}

/*
 * Initialize a log context for logging @inode: results and counters are
 * zeroed, all flags are cleared, the lists are emptied and no scratch extent
 * buffer is attached.
 */
void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
{
	ctx->inode = inode;

	/* Results and counters. */
	ctx->log_ret = 0;
	ctx->log_transid = 0;
	ctx->num_conflict_inodes = 0;

	/* State flags. */
	ctx->log_new_dentries = false;
	ctx->logging_new_name = false;
	ctx->logging_new_delayed_dentries = false;
	ctx->logged_before = false;
	ctx->logging_conflict_inodes = false;

	/* Lists. */
	INIT_LIST_HEAD(&ctx->list);
	INIT_LIST_HEAD(&ctx->ordered_extents);
	INIT_LIST_HEAD(&ctx->conflict_inodes);

	ctx->scratch_eb = NULL;
}

/*
 * Attach a scratch (dummy) extent buffer to @ctx, but only if the inode being
 * logged has BTRFS_INODE_NEEDS_FULL_SYNC or BTRFS_INODE_COPY_EVERYTHING set in
 * its runtime flags. ctx->scratch_eb is left untouched otherwise.
 */
void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
{
	struct btrfs_inode *inode = ctx->inode;

	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
	    test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) {
		/*
		 * Don't care about allocation failure. This is just for
		 * optimization, if we fail to allocate here, we will try again
		 * later if needed.
		 */
		ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
	}
}

void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_extent *tmp;

	btrfs_assert_inode_locked(ctx->inode);

	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
		list_del_init(&ordered->log_list);
		btrfs_put_ordered_extent(ordered);
	}
}


/*
 * Unlink @ctx from the list it is queued on, holding root->log_mutex for the
 * duration of the list manipulation.
 */
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	struct mutex *lock = &root->log_mutex;

	mutex_lock(lock);
	list_del_init(&ctx->list);
	mutex_unlock(lock);
}

/* 
 * Invoked in log mutex context, or be sure there is no other task which
 * can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;
	struct btrfs_log_ctx *safe;

	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}
}

/*
 * Sends a given tree log down to the disk and updates the super blocks to
 * record it.  When this call is done, you know that any inodes previously
 * logged are safely on disk only if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
	struct btrfs_root_item new_root_item;
	int log_transid = 0;
	struct btrfs_log_ctx root_log_ctx;
	struct blk_plug plug;
	u64 log_root_start;
	u64 log_root_level;

	mutex_lock(&root->log_mutex);
	log_transid = ctx->log_transid;
	if (root->log_transid_committed >= log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}

	index1 = log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(root, log_transid);
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}
	ASSERT(log_transid == root->log_transid);
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(root, log_transid - 1);

	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(fs_info, SSD) &&
		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (btrfs_need_log_full_commit(trans)) {
		ret = BTRFS_LOG_FORCE_COMMIT;
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY_LOG1;
	else
		mark = EXTENT_DIRTY_LOG2;

	/* we start IO on  all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
	/*
	 * -EAGAIN happens when someone, e.g., a concurrent transaction
	 * commit, writes a dirty extent in this tree-log commit. This
	 * concurrent write creates a hole in the extents being written
	 * out, which is a problem on a zoned filesystem because it
	 * requires sequential writing. We could bail out to a full commit
	 * here, but instead we continue, hoping the concurrent write fills
	 * the hole.
	 */
	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
		ret = 0;
	if (ret) {
		blk_finish_plug(&plug);
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	/*
	 * We _must_ update under the root->log_mutex in order to make sure we
	 * have a consistent view of the log root we are trying to commit at
	 * this moment.
	 *
	 * We _must_ copy this into a local copy, because we are not holding the
	 * log_root_tree->log_mutex yet.  This is important because when we
	 * commit the log_root_tree we must have a consistent view of the
	 * log_root_tree when we update the super block to point at the
	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
	 * with the commit and possibly point at the new block which we may not
	 * have written out.
	 */
	btrfs_set_root_node(&log->root_item, log->node);
	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));

	btrfs_set_root_log_transid(root, root->log_transid + 1);
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. new modifications of the log will be written to
	 * new positions. so it's safe to allow log writers to go in.
	 */
	mutex_unlock(&root->log_mutex);

	if (btrfs_is_zoned(fs_info)) {
		mutex_lock(&fs_info->tree_root->log_mutex);
		if (!log_root_tree->node) {
			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
			if (ret) {
				mutex_unlock(&fs_info->tree_root->log_mutex);
				blk_finish_plug(&plug);
				goto out;
			}
		}
		mutex_unlock(&fs_info->tree_root->log_mutex);
	}

	btrfs_init_log_ctx(&root_log_ctx, NULL);

	mutex_lock(&log_root_tree->log_mutex);

	index2 = log_root_tree->log_transid % 2;
	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
	root_log_ctx.log_transid = log_root_tree->log_transid;

	/*
	 * Now we are safe to update the log_root_tree because we're under the
	 * log_mutex, and we're a current writer so we're holding the commit
	 * open until we drop the log_mutex.
	 */
	ret = update_log_root(trans, log, &new_root_item);
	if (ret) {
		list_del_init(&root_log_ctx.list);
		blk_finish_plug(&plug);
		btrfs_set_log_full_commit(trans);
		if (ret != -ENOSPC)
			btrfs_err(fs_info,
				  "failed to update log for root %llu ret %d",
				  btrfs_root_id(root), ret);
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out;
	}

	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
		blk_finish_plug(&plug);
		list_del_init(&root_log_ctx.list);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = root_log_ctx.log_ret;
		goto out;
	}

	if (atomic_read(&log_root_tree->log_commit[index2])) {
		blk_finish_plug(&plug);
		ret = btrfs_wait_tree_log_extents(log, mark);
		wait_log_commit(log_root_tree,
				root_log_ctx.log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		if (!ret)
			ret = root_log_ctx.log_ret;
		goto out;
	}
	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(log_root_tree,
				root_log_ctx.log_transid - 1);
	}

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (btrfs_need_log_full_commit(trans)) {
		blk_finish_plug(&plug);
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = BTRFS_LOG_FORCE_COMMIT;
		goto out_wake_log_root;
	}

	ret = btrfs_write_marked_extents(fs_info,
					 &log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
	blk_finish_plug(&plug);
	/*
	 * As described above, -EAGAIN indicates a hole in the extents. We
	 * cannot wait for these writeouts since waiting would cause a
	 * deadlock. Bail out to a full commit instead.
	 */
	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
		btrfs_set_log_full_commit(trans);
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	} else if (ret) {
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	ret = btrfs_wait_tree_log_extents(log, mark);
	if (!ret)
		ret = btrfs_wait_tree_log_extents(log_root_tree,
						  EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
	if (ret) {
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}

	log_root_start = log_root_tree->node->start;
	log_root_level = btrfs_header_level(log_root_tree->node);
	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * Here we are guaranteed that nobody is going to write the superblock
	 * for the current transaction before us, and that we do not write
	 * our superblock before the previous transaction finishes its commit
	 * and writes its superblock, because:
	 *
	 * 1) We are holding a handle on the current transaction, so nobody
	 *    can commit it until we release the handle;
	 *
	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
	 *    if the previous transaction is still committing, and hasn't yet
	 *    written its superblock, we wait for it to do it, because a
	 *    transaction commit acquires the tree_log_mutex when the commit
	 *    begins and releases it only after writing its superblock.
	 */
	mutex_lock(&fs_info->tree_log_mutex);

	/*
	 * The previous transaction writeout phase could have failed, and thus
	 * marked the fs in an error state.  We must not commit here, as we
	 * could have updated our generation in the super_for_commit and
	 * writing the super here would result in transid mismatches.  If there
	 * is an error here just bail.
	 */
	if (BTRFS_FS_ERROR(fs_info)) {
		ret = -EIO;
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		mutex_unlock(&fs_info->tree_log_mutex);
		goto out_wake_log_root;
	}

	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
	ret = write_all_supers(fs_info, 1);
	mutex_unlock(&fs_info->tree_log_mutex);
	if (unlikely(ret)) {
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		goto out_wake_log_root;
	}

	/*
	 * We know there can only be one task here, since we have not yet set
	 * root->log_commit[index1] to 0 and any task attempting to sync the
	 * log must wait for the previous log transaction to commit if it's
	 * still in progress or wait for the current log transaction commit if
	 * someone else already started it. We use <= and not < because the
	 * first log transaction has an ID of 0.
	 */
	ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
	btrfs_set_root_last_log_commit(root, log_transid);

out_wake_log_root:
	mutex_lock(&log_root_tree->log_mutex);
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	mutex_lock(&root->log_mutex);
	btrfs_remove_all_log_ctxs(root, index1, ret);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is need