===================================================================
@@ -1493,7 +1493,47 @@ static int do_journal_get_write_access(h
}
static int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+ int started = 0;
+
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /*
+ * ext4_get_block in prepare for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after IO complete.
+ */
+ create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+ if (!handle) {
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ started = 1;
+ }
+
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ if (started)
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -2607,746 +2647,497 @@ out:
return ret;
}
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ iput(io->inode);
+ kfree(io);
+}
+
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+ ext4_debug("inode %lu completed_io list is empty\n",
inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
- * - journal_submit_inode_data_buffers (no journal handle)
- * - shrink_page_list via pdflush (no journal handle)
- * - grab_page_cache when doing write_begin (have journal handle)
- *
- * We don't do any block allocation in this function. If we have page with
- * multiple blocks we need to write those buffer_heads that are mapped. This
- * is important for mmaped based write. So if we do with blocksize 1K
- * truncate(f, 1024);
- * a = mmap(f, 0, 4096);
- * a[0] = 'a';
- * truncate(f, 4096);
- * we have in the page first buffer_head mapped via page_mkwrite call back
- * but other bufer_heads would be unmapped but dirty(dirty done via the
- * do_wp_page). So writepage should write the first block. If we modify
- * the mmap area beyond 1024 we will again get a page_fault and the
- * page_mkwrite callback will do the block allocation and mark the
- * buffer_heads mapped.
- *
- * We redirty the page if we have any buffer_heads that is either delay or
- * unwritten in the page.
- *
- * We can get recursively called as show below.
- *
- * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext4_writepage()
- *
- * But since we don't do any block allocation we should not deadlock.
- * Page also have the dirty flag cleared so we don't get recurive page_lock.
+ * check a range of space and convert unwritten extents to written.
*/
-static int ext4_writepage(struct page *page,
- struct writeback_control *wbc)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
int ret = 0;
- loff_t size;
- unsigned int len;
- struct buffer_head *page_bufs = NULL;
- struct inode *inode = page->mapping->host;
- trace_ext4_writepage(inode, page);
- size = i_size_read(inode);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
+ ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
- if (page_has_buffers(page)) {
- page_bufs = page_buffers(page);
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- /*
- * We don't want to do block allocation
- * So redirty the page and return
- * We may reach here when we do a journal commit
- * via journal_submit_inode_data_buffers.
- * If we don't have mapping block we just ignore
- * them. We can also reach here via shrink_page_list
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
- } else {
- /*
- * The test for page_has_buffers() is subtle:
- * We know the page is dirty but it lost buffers. That means
- * that at some moment in time after write_begin()/write_end()
- * has been called all buffers have been clean and thus they
- * must have been written at least once. So they are all
- * mapped and we can happily proceed with mapping them
- * and writing the page.
- *
- * Try to initialize the buffer_heads and check whether
- * all are mapped and non delay. We don't want to
- * do block allocation here.
- */
- ret = block_prepare_write(page, 0, len,
- noalloc_get_block_write);
- if (!ret) {
- page_bufs = page_buffers(page);
- /* check whether all are mapped and non delay */
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
- } else {
- /*
- * We can't do block allocation here
- * so just redity the page and unlock
- * and return
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
- /* now mark the buffer_heads as dirty and uptodate */
- block_commit_write(page, 0, len);
- }
+ if (list_empty(&io->list))
+ return ret;
- if (PageChecked(page) && ext4_should_journal_data(inode)) {
- /*
- * It's mmapped pagecache. Add buffers and journal it. There
- * doesn't seem much point in redirtying the page here.
- */
- ClearPageChecked(page);
- return __ext4_journalled_writepage(page, len);
- }
+ if (io->flag != EXT4_IO_WRITTEN)
+ return ret;
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, noalloc_get_block_write, wbc);
- else if (page_bufs && buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- ret = block_write_full_page_endio(page, noalloc_get_block_write,
- wbc, ext4_end_io_buffer_write);
- } else
- ret = block_write_full_page(page, noalloc_get_block_write,
- wbc);
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
return ret;
}
/*
- * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * work on completed aio dio IO, to convert unwritten extents to extents
*/
-
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+static void ext4_end_io_work(struct work_struct *work)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-
- /*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
- */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ int ret = 0;
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_io_nolock(io);
+ if (ret >= 0) {
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ mutex_unlock(&inode->i_mutex);
}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- pgoff_t index;
- int range_whole = 0;
- handle_t *handle = NULL;
- struct mpage_da_data mpd;
- struct inode *inode = mapping->host;
- int no_nrwrite_index_update;
- int pages_written = 0;
- long pages_skipped;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
- struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-
- trace_ext4_da_writepages(inode, wbc);
-
- /*
- * No pages to write? This is mainly a kludge to avoid starting
- * a transaction for special inodes like journal inode on last iput()
- * because that could violate lock ordering on umount
- */
- if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- /*
- * If the filesystem has aborted, it is read-only, so return
- * right away instead of dumping stack traces later on that
- * will obscure the real source of the problem. We test
- * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
- * the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
- * *never* be called, so if that ever happens, we would want
- * the stack trace.
- */
- if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
- return -EROFS;
-
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
-
- range_cyclic = wbc->range_cyclic;
- if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
- cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- } else
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole)
- desired_nr_to_write = wbc->nr_to_write * 8;
- else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
- }
-
- mpd.wbc = wbc;
- mpd.inode = mapping->host;
-
- /*
- * we don't want write_cache_pages to update
- * nr_to_write and writeback_index
- */
- no_nrwrite_index_update = wbc->no_nrwrite_index_update;
- wbc->no_nrwrite_index_update = 1;
- pages_skipped = wbc->pages_skipped;
-
-retry:
- while (!ret && wbc->nr_to_write > 0) {
-
- /*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
- */
- BUG_ON(ext4_should_journal_data(inode));
- needed_blocks = ext4_da_writepages_trans_blocks(inode);
-
- /* start a new transaction*/
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
- "%ld pages, ino %lu; err %d\n", __func__,
- wbc->nr_to_write, inode->i_ino, ret);
- goto out_writepages;
- }
-
- /*
- * Now call __mpage_da_writepage to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4. We don't actually
- * submit the blocks for I/O here, even though
- * write_cache_pages thinks it will, and will set the
- * pages as clean for write before calling
- * __mpage_da_writepage().
- */
- mpd.b_size = 0;
- mpd.b_state = 0;
- mpd.b_blocknr = 0;
- mpd.first_page = 0;
- mpd.next_page = 0;
- mpd.io_done = 0;
- mpd.pages_written = 0;
- mpd.retval = 0;
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
- &mpd);
- /*
- * If we have a contigous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- if (mpage_da_map_blocks(&mpd) == 0)
- mpage_da_submit_io(&mpd);
- mpd.io_done = 1;
- ret = MPAGE_DA_EXTENT_TAIL;
- }
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
- ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
- * free blocks released in the transaction
- * and try again
- */
- jbd2_journal_force_commit_nested(sbi->s_journal);
- wbc->pages_skipped = pages_skipped;
- ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * got one extent now try with
- * rest of the pages
- */
- pages_written += mpd.pages_written;
- wbc->pages_skipped = pages_skipped;
- ret = 0;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
- break;
- }
- if (!io_done && !cycled) {
- cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
- goto retry;
- }
- if (pages_skipped != wbc->pages_skipped)
- ext4_msg(inode->i_sb, KERN_CRIT,
- "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
- __func__, wbc->nr_to_write, ret);
-
- /* Update index */
- index += pages_written;
- wbc->range_cyclic = range_cyclic;
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- /*
- * set the writeback_index so that range_cyclic
- * mode will write it back later
- */
- mapping->writeback_index = index;
-
-out_writepages:
- if (!no_nrwrite_index_update)
- wbc->no_nrwrite_index_update = 0;
- if (wbc->nr_to_write > nr_to_writebump)
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
- return ret;
-}
-
-#define FALL_BACK_TO_NONDELALLOC 1
-static int ext4_nonda_switch(struct super_block *sb)
-{
- s64 free_blocks, dirty_blocks;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- /*
- * switch to non delalloc mode if we are running low
- * on free block. The free block accounting via percpu
- * counters can get slightly wrong with percpu_counter_batch getting
- * accumulated on each CPU without updating global counters
- * Delalloc need an accurate free block accounting. So switch
- * to non delalloc when we are near to error range.
- */
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- dirty_blocks =
percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
- if (2 * free_blocks < 3 * dirty_blocks ||
- free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
- /*
- * free block count is less that 150% of dirty blocks
- * or free blocks is less that watermark
- */
- return 1;
- }
- return 0;
-}
-
-static int ext4_da_write_begin(struct file *file, struct
address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
+ */
+int flush_completed_IO(struct inode *inode)
{
- int ret, retries = 0;
- struct page *page;
- pgoff_t index;
- unsigned from, to;
- struct inode *inode = mapping->host;
- handle_t *handle;
-
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
-
- if (ext4_nonda_switch(inode->i_sb)) {
- *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
- return ext4_write_begin(file, mapping, pos,
- len, flags, pagep, fsdata);
- }
- *fsdata = (void *)0;
- trace_ext4_da_write_begin(inode, pos, len, flags);
-retry:
- /*
- * With delayed allocation, we don't log the i_disksize update
- * if there is delayed block allocation. But we still need
- * to journalling the i_disksize update if writes to the end
- * of file which has an already mapped buffer.
- */
- handle = ext4_journal_start(inode, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
+ ext4_io_end_t *io, *tmp;
+ int ret = 0;
+ int ret2 = 0;
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
- ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
- }
- *pagep = page;
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list))
+ return ret;
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_da_get_block_prep);
- if (ret < 0) {
- unlock_page(page);
- ext4_journal_stop(handle);
- page_cache_release(page);
+ dump_completed_IO(inode);
+ list_for_each_entry_safe(io, tmp,
+ &EXT4_I(inode)->i_completed_io_list, list) {
+ if (io->flag == EXT4_IO_UNWRITTEN)
+ continue;
/*
- * block_write_begin may have instantiated a few blocks
- * outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * Calling ext4_end_io_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
*/
- if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ret = ext4_end_io_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
}
-
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
- return ret;
+ return (ret2 < 0) ? ret2 : 0;
}
-/*
- * Check if we should update i_disksize
- * when write to the end of file but not require block allocation
- */
-static int ext4_da_should_update_i_disksize(struct page *page,
- unsigned long offset)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
{
- struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
- unsigned int idx;
- int i;
-
- bh = page_buffers(page);
- idx = offset >> inode->i_blkbits;
-
- for (i = 0; i < idx; i++)
- bh = bh->b_this_page;
-
- if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
- return 0;
- return 1;
-}
+ ext4_io_end_t *io = NULL;
-static int ext4_da_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
- handle_t *handle = ext4_journal_current_handle();
- loff_t new_i_size;
- unsigned long start, end;
- int write_mode = (int)(unsigned long)fsdata;
+ io = kmalloc(sizeof(*io), GFP_NOFS);
- if (write_mode == FALL_BACK_TO_NONDELALLOC) {
- if (ext4_should_order_data(inode)) {
- return ext4_ordered_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- } else if (ext4_should_writeback_data(inode)) {
- return ext4_writeback_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- } else {
- BUG();
- }
+ if (io) {
+ igrab(inode);
+ io->inode = inode;
+ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_io_work);
+ INIT_LIST_HEAD(&io->list);
}
- trace_ext4_da_write_end(inode, pos, len, copied);
- start = pos & (PAGE_CACHE_SIZE - 1);
- end = start + copied - 1;
+ return io;
+}
- /*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_da_should_update_i_disksize(page, end)) {
- down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- /*
- * Updating i_disksize when extending file
- * without needing block allocation
- */
- if (ext4_should_order_data(inode))
- ret = ext4_jbd2_file_inode(handle,
- inode);
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
- EXT4_I(inode)->i_disksize = new_i_size;
- }
- up_write(&EXT4_I(inode)->i_data_sem);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != EXT4_IO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
+ return;
}
- ret2 = generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
- return ret ? ret : copied;
+ io_end->offset = offset;
+ io_end->size = size;
+ io_end->flag = EXT4_IO_WRITTEN;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ /* Add the io_end to per-inode completed aio dio list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_completed_io_list);
+ iocb->private = NULL;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
- /*
- * Drop reserved blocks
- */
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
+ ext4_io_end_t *io_end = bh->b_private;
+ struct workqueue_struct *wq;
+
+ if (!io_end)
goto out;
+ io_end->flag = EXT4_IO_WRITTEN;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+out:
+ bh->b_private = NULL;
+ bh->b_end_io = NULL;
+ clear_buffer_uninit(bh);
+ end_buffer_async_write(bh, uptodate);
+}
- ext4_da_page_release_reservation(page, offset);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+ ext4_io_end_t *io_end;
+ struct page *page = bh->b_page;
+ loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+ size_t size = bh->b_size;
-out:
- ext4_invalidatepage(page, offset);
+ io_end = ext4_init_io_end(inode);
+ if (!io_end)
+ return -ENOMEM;
+ io_end->offset = offset;
+ io_end->size = size;
+ io_end->flag = EXT4_IO_UNWRITTEN;
+ /* Add the io_end to per-inode completed io list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_completed_io_list);
- return;
+ bh->b_private = io_end;
+ bh->b_end_io = ext4_end_io_buffer_write;
+ return 0;
}
/*
- * Force all delayed allocation blocks to be allocated for a given inode.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), noone guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
+ * This function can get called via...
+ * - ext4_da_writepages after taking page lock (have journal handle)
+ * - journal_submit_inode_data_buffers (no journal handle)
+ * - shrink_page_list via pdflush (no journal handle)
+ * - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other bufer_heads would be unmapped but dirty(dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as show below.
+ *
+ * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ * ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * Page also have the dirty flag cleared so we don't get recurive page_lock.
*/
-#if 1
-int ext4_alloc_da_blocks(struct inode *inode)
+static int ext4_writepage(struct page *page,
+ struct writeback_control *wbc)
{
- trace_ext4_alloc_da_blocks(inode);
-
- if (!EXT4_I(inode)->i_reserved_data_blocks &&
- !EXT4_I(inode)->i_reserved_meta_blocks)
- return 0;
+ int ret = 0;
+ loff_t size;
+ unsigned int len;
+ struct buffer_head *page_bufs = NULL;
+ struct inode *inode = page->mapping->host;
- /*
- * We do something simple for now. The filemap_flush() will
- * also start triggering a write of the data blocks, which is
- * not strictly speaking necessary (and for users of
- * laptop_mode, not even desirable). However, to do otherwise
- * would require replicating code paths in:
- *
- * ext4_da_writepages() ->
- * write_cache_pages() ---> (via passed in callback function)
- * __mpage_da_writepage() -->
- * mpage_add_bh_to_extent()
- * mpage_da_map_blocks()
- *
- * The problem is that write_cache_pages(), located in
- * mm/page-writeback.c, marks pages clean in preparation for
- * doing I/O, which is not desirable if we're not planning on
- * doing I/O at all.
- *
- * We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
- * would be ugly in the extreme. So instead we would need to
- * replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
- * write out the pages, but rather only collect contiguous
- * logical block extents, call the multi-block allocator, and
- * then update the buffer heads with the block allocations.
- *
- * For now, though, we'll cheat by calling filemap_flush(),
- * which will map the blocks, and start the I/O, but not
- * actually wait for the I/O to complete.
- */
- return filemap_flush(inode->i_mapping);
-}
-#else
-static int flush_alloc_da_page(struct page *page, struct mpage_da_data *mpd)
-{
- struct inode *inode = mpd->inode;
- struct buffer_head *bh, *head;
- sector_t logical;
+ trace_ext4_writepage(inode, page);
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
- /*
- * Can we merge this page to current extent?
- */
- if (mpd->next_page != page->index) {
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ /*
+ * We don't want to do block allocation
+ * So redirty the page and return
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them. We can also reach here via shrink_page_list
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
/*
- * Nope, we can't. So, we map non-allocated blocks
- * and start IO on them using writepage()
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
*/
- if (mpd->next_page != mpd->first_page) {
- printk(KERN_INFO
- "flush_alloc_da_page map_blocks: "
- "ino %lu blk %llu, size %u\n",
- mpd->inode->i_ino, mpd->b_blocknr,
- mpd->b_size >> mpd->inode->i_blkbits);
- mpage_da_map_blocks(mpd);
+ ret = block_prepare_write(page, 0, len,
+ noalloc_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
/*
- * skip rest of the page in the page_vec
+ * We can't do block allocation here
+ * so just redity the page and unlock
+ * and return
*/
+ redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
+ return 0;
}
+ /* now mark the buffer_heads as dirty and uptodate */
+ block_commit_write(page, 0, len);
+ }
+ if (PageChecked(page) && ext4_should_journal_data(inode)) {
/*
- * Start next extent of pages ...
- */
- mpd->first_page = page->index;
-
- /*
- * ... and blocks
+ * It's mmapped pagecache. Add buffers and journal it. There
+ * doesn't seem much point in redirtying the page here.
*/
- mpd->b_size = 0;
- mpd->b_state = 0;
- mpd->b_blocknr = 0;
+ ClearPageChecked(page);
+ return __ext4_journalled_writepage(page, len);
}
- mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, noalloc_get_block_write, wbc);
+ else if (page_bufs && buffer_uninit(page_bufs)) {
+ ext4_set_bh_endio(page_bufs, inode);
+ ret = block_write_full_page_endio(page, noalloc_get_block_write,
+ wbc, ext4_end_io_buffer_write);
+ } else
+ ret = block_write_full_page(page, noalloc_get_block_write,
+ wbc);
- if (!page_has_buffers(page)) {
- mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
- } else {
- /*
- * Page with regular buffer heads, just add all dirty ones
- */
- head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_size,
- bh->b_state);
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need to update
- * the b_state because we look at
- * b_state in mpage_da_map_blocks. We don't
- * update b_size because if we find an
- * unmapped buffer_head later we need to
- * use the b_state flag of that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state = bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
- }
- return 0;
+ return ret;
}
-int ext4_alloc_da_blocks(struct inode *inode)
+/*
+ * This is called via ext4_da_writepages() to
+ * calulate the total number of credits to reserve to fit
+ * a single extent allocation into a single transaction,
+ * ext4_da_writpeages() will loop calling this before
+ * the block allocation.
+ */
+
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
- struct pagevec pvec;
- pgoff_t index = 0;
+ int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+
+ /*
+ * With non-extent format the journal credit needed to
+ * insert nrblocks contiguous block is dependent on
+ * number of contiguous block. So we will limit
+ * number of contiguous block to a sane value
+ */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ (max_blocks > EXT4_MAX_TRANS_DATA))
+ max_blocks = EXT4_MAX_TRANS_DATA;
+
+ return ext4_chunk_trans_blocks(inode, max_blocks);
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
struct mpage_da_data mpd;
- int i;
- int nr_pages;
+ struct inode *inode = mapping->host;
+ int no_nrwrite_index_update;
+ int pages_written = 0;
+ long pages_skipped;
+ unsigned int max_pages;
+ int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- if (ext4_should_journal_data(inode))
- return 0;
+ trace_ext4_da_writepages(inode, wbc);
/*
- * If no pages to write, return right away.
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
/*
- * If the filesystem has aborted, return immediately with an
- * EROFS error.
+ * If the filesystem has aborted, it is read-only, so return
+ * right away instead of dumping stack traces later on that
+ * will obscure the real source of the problem. We test
+ * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+ * the latter could be true if the filesystem is mounted
+ * read-only, and in that case, ext4_da_writepages should
+ * *never* be called, so if that ever happens, we would want
+ * the stack trace.
*/
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
- printk(KERN_INFO "ext4_alloc_da_pages(%lu)\n", inode->i_ino);
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
+ range_cyclic = wbc->range_cyclic;
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index;
+ if (index)
+ cycled = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = LLONG_MAX;
+ wbc->range_cyclic = 0;
+ } else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectues have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole)
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
+ mpd.wbc = wbc;
mpd.inode = mapping->host;
- while (1) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;
+
+retry:
+ while (!ret && wbc->nr_to_write > 0) {
+
/*
- * we insert one extent at a time. So we need
+ * we insert one extent at a time. So we need
ext4: ext4_get_block_write and io_end code cleanup Move ext4_get_block_write and io_end related code forward to get rid of function declearation. Signed-off-by: Jiaying Zhang <jiayingz@google.com> --- fs/ext4/inode.c | 2179 +++++++++++++++++++++++++++----------------------------- 1 file changed, 1087 insertions(+), 1092 deletions(-) * credit needed for single extent allocation. * journalled mode is currently not supported * by delalloc @@ -3354,67 +3145,48 @@ int ext4_alloc_da_blocks(struct inode *i BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - if (nr_pages == 0) - break; - /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) - break; - - mpd.b_size = 0; - mpd.b_state = 0; - mpd.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.io_done = 0; - mpd.pages_written = 0; - mpd.retval = 0; - - do { - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - lock_page(page); - if (unlikely(page->mapping != mapping) || - !PageDirty(page) || - PageWriteback(page)) { - unlock_page(page); - continue; - } - - ret = flush_alloc_da_page(page, &mpd); - if (ret) { - pagevec_release(&pvec); - goto map_extent; - } - } - pagevec_release(&pvec); - cond_resched(); - - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - } while (nr_pages); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + goto out_writepages; + } /* + * Now call __mpage_da_writepage to find the next + * contiguous region of logical blocks that need + * blocks to be allocated by ext4. We don't actually + * submit the blocks for I/O here, even though + * write_cache_pages thinks it will, and will set the + * pages as clean for write before calling + * __mpage_da_writepage(). + */ + mpd.b_size = 0; + mpd.b_state = 0; + mpd.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.io_done = 0; + mpd.pages_written = 0; + mpd.retval = 0; + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, + &mpd); + /* * If we have a contigous extent of pages and we * haven't done the I/O yet, map the blocks and submit * them for I/O. */ - map_extent: if (!mpd.io_done && mpd.next_page != mpd.first_page) { - printk(KERN_INFO - "ext4_alloc_da_blocks map_blocks: " - "ino %lu blk %llu, size %u\n", - mpd.inode->i_ino, mpd.b_blocknr, - mpd.b_size >> mpd.inode->i_blkbits); - mpage_da_map_blocks(&mpd); + if (mpage_da_map_blocks(&mpd) == 0) + mpage_da_submit_io(&mpd); + mpd.io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; } + trace_ext4_da_write_pages(inode, &mpd); + wbc->nr_to_write -= mpd.pages_written; ext4_journal_stop(handle); @@ -3424,484 +3196,707 @@ int ext4_alloc_da_blocks(struct inode *i * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); - } + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; + ret = 0; + io_done = 1; + } else if (wbc->nr_to_write) + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + break; } - printk(KERN_INFO "ext4_alloc_da_pages(%lu) exit\n", inode->i_ino); + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } + if (pages_skipped != wbc->pages_skipped) + ext4_msg(inode->i_sb, KERN_CRIT, + "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + wbc->range_cyclic = range_cyclic; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; + +out_writepages: + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + if (wbc->nr_to_write > nr_to_writebump) + wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } -#endif -/* - * bmap() is special. It gets used by applications such as lilo and by - * the swapper to find the on-disk block of a specific piece of data. - * - * Naturally, this is dangerous if the block concerned is still in the - * journal. If somebody makes a swapfile on an ext4 data-journaling - * filesystem and enables swap, then they may get a nasty shock when the - * data getting swapped to that swapfile suddenly gets overwritten by - * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. - * - * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. - */ -static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) { - struct inode *inode = mapping->host; - journal_t *journal; - int err; + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* - * With delalloc we want to sync the file - * so that we can make sure we allocate - * blocks for file + * free block count is less that 150% of dirty blocks + * or free blocks is less that watermark */ - filemap_write_and_wait(mapping); + return 1; } + return 0; +} - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; + trace_ext4_da_write_begin(inode, pos, len, flags); +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. */ + if (pos + len > inode->i_size) + ext4_truncate(inode); + } - EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); - jbd2_journal_unlock_updates(journal); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} - if (err) - return 0; +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + if (ext4_should_order_data(inode)) { + return ext4_ordered_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else if (ext4_should_writeback_data(inode)) { + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else { + BUG(); + } } - return generic_block_bmap(mapping, block, ext4_get_block); -} + trace_ext4_da_write_end(inode, pos, len, copied); + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; -static int ext4_readpage(struct file *file, struct page *page) -{ - return mpage_readpage(page, ext4_get_block); -} + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT4_JOURNAL(page->mapping->host); - /* - * If it's a full truncate we just forget about the pending dirtying + * Drop reserved blocks */ - if (offset == 0) - ClearPageChecked(page); + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; - if (journal) - jbd2_journal_invalidatepage(journal, page, offset); - else - block_invalidatepage(page, offset); -} + ext4_da_page_release_reservation(page, offset); -static int ext4_releasepage(struct page *page, gfp_t wait) -{ - journal_t *journal = EXT4_JOURNAL(page->mapping->host); +out: + ext4_invalidatepage(page, offset); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, wait); - else - return try_to_free_buffers(page); + return; } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. + * Force all delayed allocation blocks to be allocated for a given inode. */ -static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) +#if 1 +int ext4_alloc_da_blocks(struct inode *inode) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; + trace_ext4_alloc_da_blocks(inode); - if (rw == WRITE) { - loff_t final_size = offset + count; + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). However, to do otherwise + * would require replicating code paths in: + * + * ext4_da_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writeback() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them becuase we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} +#else +static int flush_alloc_da_page(struct page *page, struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head; + sector_t logical; + + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. So, we map non-allocated blocks + * and start IO on them using writepage() + */ + if (mpd->next_page != mpd->first_page) { + printk(KERN_INFO + "flush_alloc_da_page map_blocks: " + "ino %lu blk %llu, size %u\n", + mpd->inode->i_ino, mpd->b_blocknr, + mpd->b_size >> mpd->inode->i_blkbits); + mpage_da_map_blocks(mpd); + /* + * skip rest of the page in the page_vec + */ + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; } - } -retry: - if (rw == READ && test_opt(inode->i_sb, DIOREAD_NOLOCK) - && (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL); - else - ret = blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + /* + * Start next extent of pages ... + */ + mpd->first_page = page->index; - if (orphan) { - int err; + /* + * ... and blocks + */ + mpd->b_size = 0; + mpd->b_state = 0; + mpd->b_blocknr = 0; + } - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } else { + /* + * Page with regular buffer heads, just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_writepage + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. + * mapped dirty buffer. We need to update + * the b_state because we look at + * b_state in mpage_da_map_blocks. We don't + * update b_size because if we find an + * unmapped buffer_head later we need to + * use the b_state flag of that buffer_head. */ - ext4_mark_inode_dirty(handle, inode); + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; + logical++; + } while ((bh = bh->b_this_page) != head); } -out: - return ret; + return 0; } -static int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +int ext4_alloc_da_blocks(struct inode *inode) { - handle_t *handle = ext4_journal_current_handle(); - int ret = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int dio_credits; - int started = 0; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0; + handle_t *handle = NULL; + struct mpage_da_data mpd; + int i; + int nr_pages; + int needed_blocks, ret = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + if (ext4_should_journal_data(inode)) + return 0; - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", - inode->i_ino, create); /* - * ext4_get_block in prepare for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after IO complete. + * If no pages to write, return right away. */ - create = EXT4_GET_BLOCKS_IO_CREATE_EXT; - - if (!handle) { - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - started = 1; - } - - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - if (started) - ext4_journal_stop(handle); -out: - return ret; -} + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - iput(io->inode); - kfree(io); -} + /* + * If the filesystem has aborted, return immediately with an + * EROFS error. + */ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; + printk(KERN_INFO "ext4_alloc_da_pages(%lu)\n", inode->i_ino); + mpd.inode = mapping->host; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } + while (1) { + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); + pagevec_init(&pvec, 0); + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } -#endif -} + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) + break; -/* - * check a range of space and convert unwritten extents to written. - */ -static int ext4_end_io_nolock(ext4_io_end_t *io) -{ - struct inode *inode = io->inode; - loff_t offset = io->offset; - size_t size = io->size; - int ret = 0; + mpd.b_size = 0; + mpd.b_state = 0; + mpd.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.io_done = 0; + mpd.pages_written = 0; + mpd.retval = 0; - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," - "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); + do { + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; - if (list_empty(&io->list)) - return ret; + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page)) { + unlock_page(page); + continue; + } - if (io->flag != EXT4_IO_WRITTEN) - return ret; + ret = flush_alloc_da_page(page, &mpd); + if (ret) { + pagevec_release(&pvec); + goto map_extent; + } + } + pagevec_release(&pvec); + cond_resched(); - ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten" - "extents to written extents, error is %d" - " io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; - } + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + } while (nr_pages); - /* clear the DIO AIO unwritten flag */ - io->flag = 0; - return ret; -} + /* + * If we have a contigous extent of pages and we + * haven't done the I/O yet, map the blocks and submit + * them for I/O. + */ + map_extent: + if (!mpd.io_done && mpd.next_page != mpd.first_page) { + printk(KERN_INFO + "ext4_alloc_da_blocks map_blocks: " + "ino %lu blk %llu, size %u\n", + mpd.inode->i_ino, mpd.b_blocknr, + mpd.b_size >> mpd.inode->i_blkbits); + mpage_da_map_blocks(&mpd); + } -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) -{ - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - int ret = 0; + ext4_journal_stop(handle); - mutex_lock(&inode->i_mutex); - ret = ext4_end_io_nolock(io); - if (ret >= 0) { - if (!list_empty(&io->list)) - list_del_init(&io->list); - ext4_free_io_end(io); + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + } } - mutex_unlock(&inode->i_mutex); + printk(KERN_INFO "ext4_alloc_da_pages(%lu) exit\n", inode->i_ino); + return ret; } +#endif /* - * This function is called from ext4_sync_file(). + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. */ -int flush_completed_IO(struct inode *inode) +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { - ext4_io_end_t *io, *tmp; - int ret = 0; - int ret2 = 0; + struct inode *inode = mapping->host; + journal_t *journal; + int err; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)) - return ret; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } - dump_completed_IO(inode); - list_for_each_entry_safe(io, tmp, - &EXT4_I(inode)->i_completed_io_list, list) { - if (io->flag == EXT4_IO_UNWRITTEN) - continue; + if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) * - * Thus we need to keep the io structure still valid here after - * convertion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. */ - ret = ext4_end_io_nolock(io); - if (ret < 0) - ret2 = ret; - else - list_del_init(&io->list); - } - return (ret2 < 0) ? ret2 : 0; -} - -static ext4_io_end_t *ext4_init_io_end (struct inode *inode) -{ - ext4_io_end_t *io = NULL; - io = kmalloc(sizeof(*io), GFP_NOFS); + EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); - if (io) { - igrab(inode); - io->inode = inode; - io->flag = 0; - io->offset = 0; - io->size = 0; - io->error = 0; - INIT_WORK(&io->work, ext4_end_io_work); - INIT_LIST_HEAD(&io->list); + if (err) + return 0; } - return io; + return generic_block_bmap(mapping, block, ext4_get_block); } -static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) +static int ext4_readpage(struct file *file, struct page *page) { - ext4_io_end_t *io_end = iocb->private; - struct workqueue_struct *wq; - - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - return; - - ext_debug("ext4_end_io_dio(): io_end 0x%p" - "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", - iocb->private, io_end->inode->i_ino, iocb, offset, - size); + return mpage_readpage(page, ext4_get_block); +} - /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != EXT4_IO_UNWRITTEN){ - ext4_free_io_end(io_end); - iocb->private = NULL; - return; - } +static int +ext4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); +} - io_end->offset = offset; - io_end->size = size; - io_end->flag = EXT4_IO_WRITTEN; - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; +static void ext4_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); - /* Add the io_end to per-inode completed aio dio list*/ - list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_completed_io_list); - iocb->private = NULL; + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); } -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +static int ext4_releasepage(struct page *page, gfp_t wait) { - ext4_io_end_t *io_end = bh->b_private; - struct workqueue_struct *wq; + journal_t *journal = EXT4_JOURNAL(page->mapping->host); - if (!io_end) - goto out; - io_end->flag = EXT4_IO_WRITTEN; - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); -out: - bh->b_private = NULL; - bh->b_end_io = NULL; - clear_buffer_uninit(bh); - end_buffer_async_write(bh, uptodate); + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); } -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; - size_t size = bh->b_size; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; - io_end = ext4_init_io_end(inode); - if (!io_end) - return -ENOMEM; - io_end->offset = offset; - io_end->size = size; - io_end->flag = EXT4_IO_UNWRITTEN; - /* Add the io_end to per-inode completed io list*/ - list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_completed_io_list); + if (rw == WRITE) { + loff_t final_size = offset + count; - bh->b_private = io_end; - bh->b_end_io = ext4_end_io_buffer_write; - return 0; + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && test_opt(inode->i_sb, DIOREAD_NOLOCK) + && (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + else + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; } /* -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html