diff mbox

[09/18] powerpc/spufs: Limit size of gangs to avoid starvation due to reserved spus

Message ID 200812101740.51239.adetsch@br.ibm.com
State RFC
Headers show

Commit Message

Andre Detsch Dec. 10, 2008, 7:40 p.m. UTC
At context creation, determine the number of available spus for general
scheduling to avoid creating a gang that can't be placed by the scheduler.
Gangs require concurrent scheduling, so multiple spus have to be allocated
at the same time.  Similarly, prevent the reservation of an spu, if it
would result in the inability to schedule an existing job.  A new data
structure is introduced to keep track of active gangs.  It is designed to
show the size of the largest active gang and is coded to handle the
dynamic addition and deletion of contexts within gangs.  An array is
allocated that has an element for each spu in the system.  As contexts
are added and removed, elements are incremented and decremented to show
the number of gangs of at least that size.  For example, element 0
represents the number of active gangs with at least 1 context, element 1
is the number of gangs with at least 2 contexts, and so on.  A high water
mark is kept to keep track of the largest gang.

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/context.c |   20 ++++++++++++++++++++
 arch/powerpc/platforms/cell/spufs/inode.c   |   22 ++++++++++++++++++++++
 arch/powerpc/platforms/cell/spufs/sched.c   |   21 ++++++++++++++++++++-
 arch/powerpc/platforms/cell/spufs/spufs.h   |   11 +++++++++++
 4 files changed, 73 insertions(+), 1 deletions(-)
diff mbox

Patch

diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index d3887fe..f48bcdd 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -32,6 +32,22 @@ 
 
 atomic_t nr_spu_contexts = ATOMIC_INIT(0);
 
+static void inc_active_gangs(struct spu_gang *gang)
+{
+	if (atomic_inc_return(&spu_active_gangs[gang->contexts]) == 1) {
+		atomic_set(&largest_active_gang, gang->contexts);
+		mb();  /* XXX atomic_set doesn't have a sync */
+	}
+}
+
+static void dec_active_gangs(struct spu_gang *gang)
+{
+	if (!atomic_dec_return(&spu_active_gangs[gang->contexts])) {
+		atomic_set(&largest_active_gang, gang->contexts);
+		mb();  /* XXX atomic_set doesn't have a sync */
+	}
+}
+
 struct spu_context *alloc_spu_context(struct spu_gang *gang)
 {
 	struct spu_context *ctx;
@@ -57,6 +73,8 @@  struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	if (spu_init_csa(&ctx->csa))
 		goto out_free_gang;
 
+	inc_active_gangs(gang);
+
 	/* If the gang is running, it needs to be stopped, since we have a
 	 * new context that needs to be gang scheduled.  Gangs are allowed
 	 * to grow and shrink over time, but they are unscheduled when it
@@ -89,6 +107,7 @@  struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	ctx->stats.util_state = SPU_UTIL_IDLE_LOADED;
 
 	atomic_inc(&nr_spu_contexts);
+
 	goto out;
 
 out_free_gang:
@@ -123,6 +142,7 @@  void destroy_spu_context(struct kref *kref)
 	if (ctx->prof_priv_kref)
 		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	atomic_dec(&nr_spu_contexts);
+	dec_active_gangs(gang);
 	kfree(ctx->switch_log);
 	kfree(ctx);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index ad00772..922faad 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -263,6 +263,7 @@  spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
 	struct inode *inode;
 	struct spu_context *ctx, *gang_ctx;
 	struct spu_gang *gang;
+	int node, avail_spus;
 
 	ret = -ENOSPC;
 	inode = spufs_new_inode(dir->i_sb, mode | S_IFDIR);
@@ -280,6 +281,27 @@  spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
 		}
 	}
 
+	for (node = 0, avail_spus = 0; node < MAX_NUMNODES; node++) {
+		avail_spus += cbe_spu_info[node].n_spus - atomic_read(
+			&cbe_spu_info[node].reserved_spus);
+	}
+
+	/* Ensure there are enough available spus for scheduling. */
+	if (flags & SPU_CREATE_NOSCHED) {
+		/* Can't reserve an spu if it would starve an active gang */
+		if (avail_spus <= atomic_read(&largest_active_gang) + 1) {
+			ret = -EPERM;
+			goto out_iput;
+		}
+	}
+	else {
+		/* Can't create a gang too big either. */
+		if (!avail_spus || (gang && gang->contexts + 1 > avail_spus)) {
+			ret = -EPERM;
+			goto out_iput;
+		}
+	}
+
 	if (dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
 		inode->i_mode &= S_ISGID;
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index acd584e..3bc0308 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -90,6 +90,9 @@  static struct timer_list spusched_timer;
 static struct timer_list spuloadavg_timer;
 static void spu_unschedule(struct spu_gang *gang);
 
+atomic_t *spu_active_gangs;
+atomic_t largest_active_gang;
+
 /*
  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
  */
@@ -1506,12 +1509,27 @@  static const struct file_operations spu_loadavg_fops = {
 int __init spu_sched_init(void)
 {
 	struct proc_dir_entry *entry;
-	int err = -ENOMEM, i;
+	int err = -ENOMEM, node, nspus, i;
 
 	spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
 	if (!spu_prio)
 		goto out;
 
+	/*
+	 * A gang cannot be larger than the number of spus in the system
+	 * since they have to be scheduled at the same time.  Allocate an
+	 * array of that length to keep track of the size of active gangs.
+	 * We need to limit the number of spus that can be reserved to
+	 * prevent the starvation of gangs.  A reserved spu cannot be used
+	 * by the scheduler.
+	 */
+	for (node = 0, nspus = 0; node < MAX_NUMNODES; node++)
+		nspus += cbe_spu_info[node].n_spus;
+	spu_active_gangs = kzalloc(sizeof(atomic_t) * nspus, GFP_KERNEL);
+	if (!spu_active_gangs)
+		goto out_free_spu_prio;
+	atomic_set(&largest_active_gang, 0);
+
 	for (i = 0; i < MAX_PRIO; i++) {
 		INIT_LIST_HEAD(&spu_prio->runq[i]);
 		__clear_bit(i, spu_prio->bitmap);
@@ -1565,5 +1583,6 @@  void spu_sched_exit(void)
 			}
 		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
+	kfree(spu_active_gangs);
 	kfree(spu_prio);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 952272f..bbbfc6a 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -296,6 +296,17 @@  int put_spu_gang(struct spu_gang *gang);
 void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx);
 void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
 
+/*
+ * Each element of spu_active_gangs[] identifies the number of active
+ * gangs of at least that size.  largest_active_gang identifies the size of
+ * the largest active gang in the system.  Array elements are incremented
+ * as contexts are created and they are decremented as contexts are destroyed.
+ * The first context in a gang increments element[1], the second element[2],
+ * and so on.  largest_active_gang is set to the highest non-zero array element.
+ */
+extern atomic_t largest_active_gang;
+extern atomic_t *spu_active_gangs;
+
 /* fault handling */
 int spufs_handle_class1(struct spu_context *ctx);
 int spufs_handle_class0(struct spu_context *ctx);