diff mbox

ext4: add fsync batch tuning knobs

Message ID 20081125102232.GA9882@mit.edu
State Accepted, archived
Headers show

Commit Message

Theodore Ts'o Nov. 25, 2008, 10:22 a.m. UTC
On Thu, Nov 06, 2008 at 07:45:38AM -0500, Ric Wheeler wrote:
> I think that it could be useful to have a tunable knob for the max sleep  
> time. It would certainly be capped at a much lower level for a RAM disk  
> than it would be for a very large RAID volume...

Here is a patch which implements tuning knobs for Josef's fsync
batching patch, at least for ext4.  I've implemented both a minimum
and maximum batch time.  I've also changed the default max batch time
to be 15ms, independent of HZ setting.

It'd be great if someone could run some benchmarks to see what would
be ideal ways to twiddle these knobs for different types of storage
devices (and workloads, presumably).

							- Ted

-------------
ext4: add fsync batch tuning knobs

From: "Theodore Ts'o" <tytso@mit.edu>

Add new mount options, min_batch_time and max_batch_time, which
controls how long the jbd2 layer should wait for additional filesystem
operations to get batched into a synchronous handle.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Aneesh Kumar K.V Dec. 2, 2008, 2:45 p.m. UTC | #1
On Tue, Nov 25, 2008 at 05:22:32AM -0500, Theodore Tso wrote:
....
...

>  	{Opt_bh, "bh"},
>  	{Opt_commit, "commit=%u"},
> +	{Opt_min_batch_time, "min_batch_time=%u"},
> +	{Opt_max_batch_time, "max_batch_time=%u"},
>  	{Opt_journal_update, "journal=update"},
>  	{Opt_journal_inum, "journal=%u"},
>  	{Opt_journal_dev, "journal_dev=%u"},
> @@ -1107,6 +1119,20 @@ static int parse_options(char *options, struct super_block *sb,
>  				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
>  			sbi->s_commit_interval = HZ * option;
>  			break;
> +		case Opt_max_batch_time:
> +			if (match_int(&args[0], &option))
> +				return 0;
> +			if (option < 0)
> +				return 0;
> +			if (option == 0)
> +				option = EXT4_DEF_MAX_BATCH_TIME;
> +			sbi->s_max_batch_time = option;


We need a break here .


> +		case Opt_min_batch_time:
> +			if (match_int(&args[0], &option))
> +				return 0;
> +			if (option < 0)
> +				return 0;
> +			sbi->s_min_batch_time = option;

same here

>  		case Opt_data_journal:
>  			data_opt = EXT4_MOUNT_JOURNAL_DATA;
>  			goto datacheck;
> @@ -1955,6 +1981,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> 
>  	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
>  	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
> +	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> +	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
> +	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
> 
>  	set_opt(sbi->s_mount_opt, RESERVATION);
>  	set_opt(sbi->s_mount_opt, BARRIER);
....
..
The patch also needs documentation of new mount options

-aneesh
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8370ffd..6244c5d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -328,6 +328,7 @@  struct ext4_mount_options {
 	uid_t s_resuid;
 	gid_t s_resgid;
 	unsigned long s_commit_interval;
+	u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	int s_jquota_fmt;
 	char *s_qf_names[MAXQUOTAS];
@@ -806,6 +807,12 @@  static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_WBACK	0x0060
 
 /*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME	0
+#define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 2f3b8b1..f5b5d56 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -74,6 +74,8 @@  struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4aa2040..35500ed 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -681,10 +681,19 @@  static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
-	if (sbi->s_commit_interval) {
+	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
+	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+		seq_printf(seq, ",min_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+		seq_printf(seq, ",max_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+
 	/*
 	 * We're changing the default of barrier mount option, so
 	 * let's always display its mount state so it's clear what its
@@ -850,7 +859,8 @@  enum {
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+	Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
@@ -889,6 +899,8 @@  static const match_table_t tokens = {
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
+	{Opt_min_batch_time, "min_batch_time=%u"},
+	{Opt_max_batch_time, "max_batch_time=%u"},
 	{Opt_journal_update, "journal=update"},
 	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
@@ -1107,6 +1119,20 @@  static int parse_options(char *options, struct super_block *sb,
 				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			sbi->s_commit_interval = HZ * option;
 			break;
+		case Opt_max_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = EXT4_DEF_MAX_BATCH_TIME;
+			sbi->s_max_batch_time = option;
+		case Opt_min_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_min_batch_time = option;
 		case Opt_data_journal:
 			data_opt = EXT4_MOUNT_JOURNAL_DATA;
 			goto datacheck;
@@ -1955,6 +1981,9 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
 	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
@@ -2484,11 +2513,9 @@  static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (sbi->s_commit_interval)
-		journal->j_commit_interval = sbi->s_commit_interval;
-	/* We could also set up an ext4-specific default for the commit
-	 * interval here, but for now we'll just fall back to the jbd
-	 * default. */
+	journal->j_commit_interval = sbi->s_commit_interval;
+	journal->j_min_batch_time = sbi->s_min_batch_time;
+	journal->j_max_batch_time = sbi->s_max_batch_time;
 
 	spin_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
@@ -2977,6 +3004,8 @@  static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	old_opts.s_resuid = sbi->s_resuid;
 	old_opts.s_resgid = sbi->s_resgid;
 	old_opts.s_commit_interval = sbi->s_commit_interval;
+	old_opts.s_min_batch_time = sbi->s_min_batch_time;
+	old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -3106,6 +3135,8 @@  restore_opts:
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sbi->s_commit_interval = old_opts.s_commit_interval;
+	sbi->s_min_batch_time = old_opts.s_min_batch_time;
+	sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657..dfacd9e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -961,6 +961,8 @@  static journal_t * journal_init_common (void)
 	spin_lock_init(&journal->j_state_lock);
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+	journal->j_min_batch_time = 0;
+	journal->j_max_batch_time = 15000; /* 15ms */
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 0e1fc34..e7de7ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1255,9 +1255,11 @@  int jbd2_journal_stop(handle_t *handle)
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 						   transaction->t_start_time));
 
+		commit_time = max_t(u64, commit_time,
+				    1000*journal->j_min_batch_time);
 		commit_time = min_t(u64, commit_time,
-				    1000*jiffies_to_usecs(1));
-
+				    1000*journal->j_max_batch_time);
+		
 		if (trans_time < commit_time) {
 			ktime_t expires = ktime_add_ns(ktime_get(),
 						       commit_time);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b8b8744..55946c3 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -955,6 +955,14 @@  struct journal_s
 	 */
 	u64			j_average_commit_time;
 
+	/*
+	 * minimum and maximum times that we should wait for
+	 * additional filesystem operations to get batched into a
+	 * synchronous handle in microseconds
+	 */
+	u32			j_min_batch_time;
+	u32			j_max_batch_time;
+
 	/* This function is called when a transaction is closed */
 	void			(*j_commit_callback)(journal_t *,
 						     transaction_t *);