@@ -1049,6 +1049,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int depth;
int count = 0;
ext4_fsblk_t first_block = 0;
+ struct buffer_head *sbh = NULL;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
@@ -1155,6 +1156,25 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto cleanup;
+ if (SNAPMAP_ISCOW(flags)) {
+ /*
+ * COWing block or creating COW bitmap.
+ * We now have exclusive access to the COW destination block
+ * and we are about to create the snapshot block mapping
+ * and make it public.
+ * Grab the buffer cache entry and mark it new
+ * to indicate a pending COW operation.
+ * The refcount for the buffer cache entry will be released
+ * when the COW operation is either completed or canceled.
+ */
+ sbh = sb_getblk(inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ if (!sbh) {
+ err = -EIO;
+ goto cleanup;
+ }
+ ext4_snapshot_start_pending_cow(sbh);
+ }
+
if (map->m_flags & EXT4_MAP_REMAP) {
map->m_len = count;
/* move old block to snapshot */
@@ -1198,6 +1218,12 @@ got_it:
/* Clean up and exit */
partial = chain + depth - 1; /* the whole chain */
cleanup:
+ /* cancel pending COW operation on failure to alloc snapshot block */
+ if (SNAPMAP_ISCOW(flags)) {
+ if (err < 0 && sbh)
+ ext4_snapshot_end_pending_cow(sbh);
+ brelse(sbh);
+ }
while (partial > chain) {
BUFFER_TRACE(partial->bh, "call brelse");
brelse(partial->bh);
@@ -115,6 +115,8 @@ ext4_snapshot_complete_cow(handle_t *handle, struct inode *snapshot,
if (sync)
sync_dirty_buffer(sbh);
out:
+ /* COW operation is complete */
+ ext4_snapshot_end_pending_cow(sbh);
return err;
}
@@ -688,6 +690,12 @@ int ext4_snapshot_test_and_cow(const char *where, handle_t *handle,
* we allocated this block -
* copy block data to snapshot and complete COW operation
*/
+ snapshot_debug(3, "COWing block [%llu/%llu] of snapshot "
+ "(%u)...\n",
+ SNAPSHOT_BLOCK_TUPLE(block),
+ active_snapshot->i_generation);
+ /* sleep 1 tunable delay unit */
+ snapshot_test_delay(SNAPTEST_COW);
err = ext4_snapshot_copy_buffer_cow(handle, active_snapshot,
sbh, bh);
if (err)
@@ -700,6 +708,9 @@ int ext4_snapshot_test_and_cow(const char *where, handle_t *handle,
trace_cow_inc(handle, copied);
test_pending_cow:
+ if (sbh)
+ /* wait for pending COW to complete */
+ ext4_snapshot_test_pending_cow(sbh, block);
cowed:
/* mark the buffer COWed in the current transaction */
@@ -474,6 +474,70 @@ static inline int ext4_snapshot_mow_in_tid(struct inode *inode)
ext4_snapshot_get_tid(inode->i_sb));
}
+/*
+ * Pending COW functions
+ */
+
+/*
+ * Mark a COW operation as pending on snapshot block buffer @sbh.
+ * Called from ext4_ind_map_blocks() once the snapshot block is allocated
+ * and before its mapping is made public; undone by end_pending_cow().
+ */
+static inline void ext4_snapshot_start_pending_cow(struct buffer_head *sbh)
+{
+ /*
+ * The buffer's 'new' flag doubles as the "COW pending" marker that
+ * ext4_snapshot_test_pending_cow() polls with buffer_new().
+ */
+ set_buffer_new(sbh);
+ /* hold the buffer while 'new' is tested; end_pending_cow() drops it */
+ get_bh(sbh);
+}
+
+/*
+ * End the pending COW operation on @sbh begun by start_pending_cow().
+ * Callers here: the cleanup path of ext4_ind_map_blocks() on error, and
+ * the out: path of ext4_snapshot_complete_cow().
+ */
+static inline void ext4_snapshot_end_pending_cow(struct buffer_head *sbh)
+{
+ /*
+ * Clearing 'new' lets the buffer_new() wait loop in
+ * ext4_snapshot_test_pending_cow() terminate.
+ */
+ clear_buffer_new(sbh);
+ /* NOTE(review): unconditional put; assumes a matching get_bh() - confirm */
+ put_bh(sbh);
+}
+
+/*
+ * Block until @sbh no longer has 'new' set; @blocknr is only logged.
+ */
+static inline void ext4_snapshot_test_pending_cow(struct buffer_head *sbh,
+ sector_t blocknr)
+{
+ while (buffer_new(sbh)) {
+ /* 'new' is still set: the COW on this block has not ended yet */
+ snapshot_debug_once(2, "waiting for pending cow: "
+ "block = [%llu/%llu]...\n",
+ SNAPSHOT_BLOCK_TUPLE(blocknr));
+ /*
+ * An unusually long pending COW operation can be caused by
+ * the debugging function snapshot_test_delay(SNAPTEST_COW)
+ * and by waiting for tracked reads to complete.
+ * The new COW buffer is locked during those events, so wait
+ * on the buffer first and only then fall back to msleep.
+ */
+ wait_on_buffer(sbh);
+ /*
+ * This is an unlikely event that can happen only once per
+ * block/snapshot, so polling with msleep(1) is sufficient
+ * and there is no need for a wait queue.
+ */
+ msleep(1);
+ /* XXX: unbounded loop - should we fail after N retries? */
+ }
+}
#else /* CONFIG_EXT4_FS_SNAPSHOT */
@@ -183,6 +183,7 @@ static int ext4_snapshot_read_through(struct inode *inode, sector_t iblock,
int err;
struct ext4_map_blocks map;
struct inode *prev_snapshot;
+ struct buffer_head *sbh = NULL;
map.m_lblk = iblock;
map.m_pblk = 0;
@@ -214,6 +215,45 @@ get_block:
bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
map.m_flags;
+ /*
+ * On read of active snapshot, a mapped block may belong to a COW
+ * operation that has not completed yet. Use the buffer cache to test
+ * this condition. If bh_result->b_blocknr == SNAPSHOT_BLOCK(iblock),
+ * then this is either read through to block device or moved block.
+ * Either way, it is not a COWed block, so it cannot be pending COW.
+ */
+ if (ext4_snapshot_is_active(inode) &&
+ bh_result->b_blocknr != SNAPSHOT_BLOCK(iblock))
+ sbh = sb_find_get_block(inode->i_sb, bh_result->b_blocknr);
+ if (!sbh)
+ return 0;
+ /* wait for pending COW to complete */
+ ext4_snapshot_test_pending_cow(sbh, SNAPSHOT_BLOCK(iblock));
+ lock_buffer(sbh);
+ if (buffer_uptodate(sbh)) {
+ /*
+ * Avoid disk I/O and copy out snapshot page directly
+ * from block device page when possible.
+ */
+ BUG_ON(!sbh->b_page);
+ BUG_ON(!bh_result->b_page);
+ lock_buffer(bh_result);
+ copy_highpage(bh_result->b_page, sbh->b_page);
+ set_buffer_uptodate(bh_result);
+ unlock_buffer(bh_result);
+ } else if (buffer_dirty(sbh)) {
+ /*
+ * If snapshot data buffer is dirty (just been COWed),
+ * then it is not safe to read it from disk yet.
+ * We shouldn't get here because snapshot data buffer
+ * only becomes dirty during COW and because we waited
+ * for pending COW to complete, which means that a
+ * dirty snapshot data buffer should be uptodate.
+ */
+ WARN_ON(1);
+ }
+ unlock_buffer(sbh);
+ brelse(sbh);
return 0;
}