Patchwork powerpc/spufs: Limit size of gangs to avoid starvation due to reserved spus

login
register
mail settings
Submitter Andre Detsch
Date Sept. 11, 2008, 11:38 p.m.
Message ID <200809112038.22459.adetsch@br.ibm.com>
Download mbox | patch
Permalink /patch/257/
State Superseded
Delegated to: Jeremy Kerr
Headers show

Comments

Andre Detsch - Sept. 11, 2008, 11:38 p.m.
At context creation, determine the number of available spus for general
scheduling to avoid creating a gang that can't be placed by the scheduler.
Gangs require concurrent scheduling, so multiple spus have to be allocated
at the same time.  Similarly, prevent the reservation of an spu, if it
would result in the inability to schedule an existing job.  A new data
structure is introduced to keep track of active gangs.  It is designed to
show the size of the largest active gang and is coded to handle the
dynamic addition and deletion of contexts within gangs.  An array is
allocated that has an element for each spu in the system.  As contexts
are added and removed, elements are incremented and decremented to show
the number of gangs of at least that size.  For example, element 0
represents the number of active gangs with at least 1 context, element 1
is the number of gangs with at least 2 contexts, and so on.  A high water
mark is kept to keep track of the largest gang.

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
Signed-off-by: Andre Detsch <adetsch@br.ibm.com>

Patch

diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index c472519..7ca787e 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -32,6 +32,22 @@ 
 
 atomic_t nr_spu_contexts = ATOMIC_INIT(0);
 
+static void inc_active_gangs(struct spu_gang *gang)	/* add one to the counter indexed by gang->contexts */
+{
+	if (atomic_inc_return(&spu_active_gangs[gang->contexts]) == 1) {	/* new count is 1: no other gang was counted at this index */
+		atomic_set(&largest_active_gang, gang->contexts);	/* record this index as the largest active gang size */
+		mb();  /* XXX atomic_set doesn't have a sync */
+	}
+}	/* NOTE(review): spu_active_gangs has one element per spu; confirm gang->contexts always stays below that */
+
+static void dec_active_gangs(struct spu_gang *gang)	/* subtract one from the counter indexed by gang->contexts */
+{
+	if (!atomic_dec_return(&spu_active_gangs[gang->contexts])) {	/* new count is 0: no gang is counted at this index any more */
+		atomic_set(&largest_active_gang, gang->contexts);	/* NOTE(review): stores the index that just emptied; presumably the largest is now one smaller unless gang->contexts was already decremented - confirm */
+		mb();  /* XXX atomic_set doesn't have a sync */
+	}
+}	/* NOTE(review): must use the same index that inc_active_gangs() used for this context - confirm ordering against gang->contexts updates */
+
 struct spu_context *alloc_spu_context(struct spu_gang *gang)
 {
 	struct spu_context *ctx;
@@ -57,6 +73,8 @@  struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	if (spu_init_csa(&ctx->csa))
 		goto out_free_gang;
 
+	inc_active_gangs(gang);
+
 	/* If the gang is running, it needs to be stopped, since we have a
 	 * new context that needs to be gang scheduled.  Gangs are allowed
 	 * to grow and shrink over time, but they are unscheduled when it
@@ -89,6 +107,7 @@  struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	ctx->stats.util_state = SPU_UTIL_IDLE_LOADED;
 
 	atomic_inc(&nr_spu_contexts);
+
 	goto out;
 
 out_free_gang:
@@ -123,6 +142,7 @@  void destroy_spu_context(struct kref *kref)
 	if (ctx->prof_priv_kref)
 		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	atomic_dec(&nr_spu_contexts);
+	dec_active_gangs(gang);
 	kfree(ctx->switch_log);
 	kfree(ctx);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index cf97761..c455a44 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -263,6 +263,7 @@  spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
 	struct inode *inode;
 	struct spu_context *ctx, *gang_ctx;
 	struct spu_gang *gang;
+	int node, avail_spus;
 
 	ret = -ENOSPC;
 	inode = spufs_new_inode(dir->i_sb, mode | S_IFDIR);
@@ -280,6 +281,27 @@  spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
 		}
 	}
 
+	for (node = 0, avail_spus = 0; node < MAX_NUMNODES; node++) {
+		avail_spus += cbe_spu_info[node].n_spus - atomic_read(
+			&cbe_spu_info[node].reserved_spus);
+	}
+
+	/* Ensure there are enough available spus for scheduling. */
+	if (flags & SPU_CREATE_NOSCHED) {
+		/* Can't reserve an spu if it would starve an active gang */
+		if (avail_spus <= atomic_read(&largest_active_gang) + 1) {
+			ret = -EPERM;
+			goto out_iput;
+		}
+	}
+	else {
+		/* Can't create a gang too big either. */
+		if (!avail_spus || (gang && gang->contexts + 1 > avail_spus)) {
+			ret = -EPERM;
+			goto out_iput;
+		}
+	}
+
 	if (dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
 		inode->i_mode &= S_ISGID;
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index f3dee8d..8326034 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -90,6 +90,9 @@  static struct timer_list spusched_timer;
 static struct timer_list spuloadavg_timer;
 static void spu_unschedule(struct spu_gang *gang);
 
+atomic_t *spu_active_gangs;
+atomic_t largest_active_gang;
+
 /*
  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
  */
@@ -1492,12 +1495,27 @@  static const struct file_operations spu_loadavg_fops = {
 int __init spu_sched_init(void)
 {
 	struct proc_dir_entry *entry;
-	int err = -ENOMEM, i;
+	int err = -ENOMEM, node, nspus, i;
 
 	spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
 	if (!spu_prio)
 		goto out;
 
+	/*
+	 * A gang cannot be larger than the number of spus in the system
+	 * since they have to be scheduled at the same time.  Allocate an
+	 * array of that length to keep track of the size of active gangs.
+	 * We need to limit the number of spus that can be reserved to
+	 * avoid the starvation of gangs.  A reserved spu cannot be used
+	 * by the scheduler.
+	 */
+	for (node = 0, nspus = 0; node < MAX_NUMNODES; node++)
+		nspus += cbe_spu_info[node].n_spus;
+	spu_active_gangs = kzalloc(sizeof(atomic_t) * nspus, GFP_KERNEL);
+	if (!spu_active_gangs)
+		goto out_free_spu_prio;
+	atomic_set(&largest_active_gang, 0);
+
 	for (i = 0; i < MAX_PRIO; i++) {
 		INIT_LIST_HEAD(&spu_prio->runq[i]);
 		__clear_bit(i, spu_prio->bitmap);
@@ -1551,5 +1569,6 @@  void spu_sched_exit(void)
 			}
 		spin_unlock(&cbe_spu_info[node].list_lock);
 	}
+	kfree(spu_active_gangs);
 	kfree(spu_prio);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index de436f2..6afc514 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -297,6 +297,17 @@  int put_spu_gang(struct spu_gang *gang);
 void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx);
 void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
 
+/*
+ * Each element of spu_active_gangs[] identifies the number of active
+ * gangs of at least that size.  largest_active_gang identifies the size of
+ * the largest active gang in the system.  Array elements are incremented
+ * as contexts are created and they are decremented as contexts are destroyed.
+ * The first context in a gang increments element[1], the second element[2],
+ * and so on.  largest_active_gang is set to the highest non-zero array element.
+ */
+extern atomic_t largest_active_gang;
+extern atomic_t *spu_active_gangs;
+
 /* fault handling */