get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.
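
A minimal sketch of driving these endpoints with the Python requests library is shown below, using the server URL and patch ID from the example that follows. The API token value and the choice of writable field ("state") are assumptions for illustration only; write access normally requires authenticating as a maintainer of the project, and the exact set of writable fields depends on the Patchwork version.

    import requests

    BASE = "http://patchwork.ozlabs.org/api"
    TOKEN = "0123456789abcdef"  # hypothetical token; real tokens are issued per user

    # get: show a patch (read access needs no authentication)
    resp = requests.get(f"{BASE}/patches/253/")
    resp.raise_for_status()
    patch = resp.json()
    print(patch["name"], patch["state"])

    # patch: update a patch (assumes the token's user may edit patches in this project)
    resp = requests.patch(
        f"{BASE}/patches/253/",
        headers={"Authorization": f"Token {TOKEN}"},
        json={"state": "accepted"},
    )
    resp.raise_for_status()
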

GET /api/patches/253/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 253,
    "url": "http://patchwork.ozlabs.org/api/patches/253/?format=api",
    "web_url": "http://patchwork.ozlabs.org/project/cbe-oss-dev/patch/200809112038.08899.adetsch@br.ibm.com/",
    "project": {
        "id": 1,
        "url": "http://patchwork.ozlabs.org/api/projects/1/?format=api",
        "name": "Cell Broadband Engine development",
        "link_name": "cbe-oss-dev",
        "list_id": "cbe-oss-dev.ozlabs.org",
        "list_email": "cbe-oss-dev@ozlabs.org",
        "web_url": null,
        "scm_url": null,
        "webscm_url": null,
        "list_archive_url": "",
        "list_archive_url_format": "",
        "commit_url_format": ""
    },
    "msgid": "<200809112038.08899.adetsch@br.ibm.com>",
    "list_archive_url": null,
    "date": "2008-09-11T23:38:08",
    "name": "powerpc/spufs: Implement spu gang scheduling.",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": false,
    "hash": "075c0262993c57272bac3a97e39c3adb73ef7a24",
    "submitter": {
        "id": 93,
        "url": "http://patchwork.ozlabs.org/api/people/93/?format=api",
        "name": "Andre Detsch",
        "email": "adetsch@br.ibm.com"
    },
    "delegate": {
        "id": 1,
        "url": "http://patchwork.ozlabs.org/api/users/1/?format=api",
        "username": "jk",
        "first_name": "Jeremy",
        "last_name": "Kerr",
        "email": "jk@ozlabs.org"
    },
    "mbox": "http://patchwork.ozlabs.org/project/cbe-oss-dev/patch/200809112038.08899.adetsch@br.ibm.com/mbox/",
    "series": [],
    "comments": "http://patchwork.ozlabs.org/api/patches/253/comments/",
    "check": "pending",
    "checks": "http://patchwork.ozlabs.org/api/patches/253/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<cbe-oss-dev-bounces+patchwork=ozlabs.org@ozlabs.org>",
        "X-Original-To": [
            "patchwork@ozlabs.org",
            "cbe-oss-dev@ozlabs.org"
        ],
        "Delivered-To": [
            "patchwork@ozlabs.org",
            "cbe-oss-dev@ozlabs.org"
        ],
        "Received": [
            "from ozlabs.org (localhost [127.0.0.1])\n\tby ozlabs.org (Postfix) with ESMTP id 4A4FFDE5FC\n\tfor <patchwork@ozlabs.org>; Fri, 12 Sep 2008 09:40:03 +1000 (EST)",
            "from igw1.br.ibm.com (igw1.br.ibm.com [32.104.18.24])\n\t(using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits))\n\t(Client did not present a certificate)\n\tby ozlabs.org (Postfix) with ESMTPS id 1B070DE026;\n\tFri, 12 Sep 2008 09:39:00 +1000 (EST)",
            "from mailhub1.br.ibm.com (mailhub1 [9.18.232.109])\n\tby igw1.br.ibm.com (Postfix) with ESMTP id 7E12F32C063;\n\tThu, 11 Sep 2008 20:08:21 -0300 (BRT)",
            "from d24av01.br.ibm.com (d24av01.br.ibm.com [9.18.232.46])\n\tby mailhub1.br.ibm.com (8.13.8/8.13.8/NCO v9.1) with ESMTP id\n\tm8BNcwAn2470122; Thu, 11 Sep 2008 20:38:58 -0300",
            "from d24av01.br.ibm.com (loopback [127.0.0.1])\n\tby d24av01.br.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id\n\tm8BNcpnA024251; Thu, 11 Sep 2008 20:38:51 -0300",
            "from [9.8.10.86] ([9.8.10.86])\n\tby d24av01.br.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id\n\tm8BNcnqI024222; Thu, 11 Sep 2008 20:38:50 -0300"
        ],
        "From": "Andre Detsch <adetsch@br.ibm.com>",
        "To": "cbe-oss-dev@ozlabs.org",
        "Date": "Thu, 11 Sep 2008 20:38:08 -0300",
        "User-Agent": "KMail/1.9.6",
        "References": "<200809111955.28780.adetsch@br.ibm.com>",
        "In-Reply-To": "<200809111955.28780.adetsch@br.ibm.com>",
        "MIME-Version": "1.0",
        "Content-Disposition": "inline",
        "Message-Id": "<200809112038.08899.adetsch@br.ibm.com>",
        "Cc": "LukeBrowning@us.ibm.com, Jeremy Kerr <jk@ozlabs.org>",
        "Subject": "[Cbe-oss-dev] [PATCH 04/11] powerpc/spufs: Implement spu gang\n\tscheduling.",
        "X-BeenThere": "cbe-oss-dev@ozlabs.org",
        "X-Mailman-Version": "2.1.11",
        "Precedence": "list",
        "List-Id": "Discussion about Open Source Software for the Cell Broadband Engine\n\t<cbe-oss-dev.ozlabs.org>",
        "List-Unsubscribe": "<https://ozlabs.org/mailman/options/cbe-oss-dev>,\n\t<mailto:cbe-oss-dev-request@ozlabs.org?subject=unsubscribe>",
        "List-Archive": "<http://ozlabs.org/pipermail/cbe-oss-dev>",
        "List-Post": "<mailto:cbe-oss-dev@ozlabs.org>",
        "List-Help": "<mailto:cbe-oss-dev-request@ozlabs.org?subject=help>",
        "List-Subscribe": "<https://ozlabs.org/mailman/listinfo/cbe-oss-dev>,\n\t<mailto:cbe-oss-dev-request@ozlabs.org?subject=subscribe>",
        "Content-Type": "text/plain; charset=\"us-ascii\"",
        "Content-Transfer-Encoding": "7bit",
        "Sender": "cbe-oss-dev-bounces+patchwork=ozlabs.org@ozlabs.org",
        "Errors-To": "cbe-oss-dev-bounces+patchwork=ozlabs.org@ozlabs.org"
    },
    "content": "This patch provides the base support for gang scheudling, including spu\nmgmt, runqueue management, placement, activation, deactivation, time slicing,\nyield, and preemption.  Basically, all of the core scheduling capabilities.\n\nAll spu contexts belong to a gang.  For standalone spu contexts, an internal\ngang structure is allocated to present a uniform data abstraction, so that\nthe gang can be queued on the runqueue.  The priority of the gang dictates\nits position on the runqueue.  All gang's have a single priority, policy,\nand NUMA attachment which is inherited from creator of the spu context. These\nvalues do not currently change, although there is nothing to prohibit such\nsupport to be added in the future.\n\nAll contexts within a gang are scheduled and unscheduled at the same time.\nspu_schedule and spu_unschedule have been changed to invoke spu_bind_context\nand spu_unbind_context in a loop.  The former is more complicated in that it\nmust allocate enough spus in a secure manner so that it can get successfully\nthrough its critical section without running out of spus.  For this reason,\nSPUs are preallocated.  A reserved spu has the following state.\n\n(spu->alloc_state == SPU_FREE and spu->gang != <gang>)\n\nTimeslicing follows a two step algorithm. 1) all running contexts are\ntimesliced. The tick counter is implemented at the context level to\nsimplify the logic as they are all set and decremented at the same time.\nWhen a count goes to zero, the gang is unscheduled.  This frees up space\nas much space as posible before the scheduler tries to place a job that\nis queued on the runqueue. This is a critical as the size of the job\nwaiting to run is not known apriori. 2) Sequentially place as many gangs\nas possible. Skip over gangs as necessary across all run levels. This is\nconsistent with spu_yield which unloads the spu across user mode calls.\n\nA simple hueristic has been implemented to prevent too many contexts switches\nin step 1.  A limit is based on the number of runnable contexts that are\navailable on the runqueue.  If the count is less than the number of physical\nspus, some spus may not be time sliced.  This is not guaranteed as they\nmay be part of a gang that is time sliced.  A simple one pass scan is used.\n\nA new gang nstarted counter has been added to the gang structure to create\na synchronization point for gang start.  The counter is incremented, when\na context calls spu_run().  When all of the contexts have been started,\nthe gang is considered runnable.\n\nThe start synchronization point is implemented by passing the first\nN-1 contexts directly through spu_run() to the spufs_wait() as before\nwhere they wait on a private spe event word.  As before, they update\ntheir csa area instead of hardware registers. The Nth thread through\nspu_run() either places the gang or puts it on the runqueue.\n\nNearly all of the spu_run() critical section is the same.  It is context\nbased and runs almost entirely under the context lock.  The gang lock\nis only taken when the context is in the SPU_SCHED_STATE, signifying that\nthe context needs to be activated.  This is an important optimization\nthat avoids lock contention in the controlling thread.\n\nA gang nrunnable count has been implemented that is incremented and\ndecremented on entry and exit of spu_run respectively. This count is\nintended to provide a measure of whether all of the contexts in the\ngang are executing user mode code.  
In this case, all of the spus in\nthe gang are stopped and this is a good point to preempt the gang.  This\nis implemented by spu_yield() which triggers a call to spu_deactivate\nwhich unloads the gang.  Importantly, in this case, the gang is not added\nto the runqueue as the contexts are stopped.  This is designed to prevent\nthe pollution of the runqueue with stopped jobs that could only be\nlazily loaded.  In this case, it is safe to not queue it as the\napplication is expected to re-drive the context via spu_run.\n\nFinally, this means that a gang is eligible to be run as long as\none context in the gang is runnable.  Major page faulting is the other\nevent that may cause a gang to be preempted.  It is implemented via a\nnfaulting count and a call to yield.  In this case, it is put on the\nrunqueue as the context is in kernel mode.  It is sort of a step down\nscheduling technique to give something else a chance to run.\n\nSigned-off-by: Luke Browning <lukebrowning@us.ibm.com>\nSigned-off-by: Andre Detsch <adetsch@br.ibm.com>",
    "diff": "diff --git a/arch/powerpc/include/asm/spu.h b/arch/powerpc/include/asm/spu.h\nindex 9d799b6..b758947 100644\n--- a/arch/powerpc/include/asm/spu.h\n+++ b/arch/powerpc/include/asm/spu.h\n@@ -137,6 +137,7 @@ struct spu {\n \tunsigned int slb_replace;\n \tstruct mm_struct *mm;\n \tstruct spu_context *ctx;\n+\tstruct spu_gang *gang;\n \tstruct spu_runqueue *rq;\n \tunsigned long long timestamp;\n \tpid_t pid;\ndiff --git a/arch/powerpc/platforms/cell/spufs/context.c \nb/arch/powerpc/platforms/cell/spufs/context.c\nindex ace2273..21ba409 100644\n--- a/arch/powerpc/platforms/cell/spufs/context.c\n+++ b/arch/powerpc/platforms/cell/spufs/context.c\n@@ -68,9 +68,9 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)\n \tinit_waitqueue_head(&ctx->mfc_wq);\n \tinit_waitqueue_head(&ctx->run_wq);\n \tctx->state = SPU_STATE_SAVED;\n+\tset_bit(SPU_SCHED_JUST_CREATED, &ctx->sched_flags);\n \tctx->ops = &spu_backing_ops;\n \tctx->owner = get_task_mm(current);\n-\tINIT_LIST_HEAD(&ctx->rq);\n \tINIT_LIST_HEAD(&ctx->aff_list);\n \tspu_gang_add_ctx(gang, ctx);\n \t__spu_update_sched_info(ctx);\n@@ -98,14 +98,19 @@ void destroy_spu_context(struct kref *kref)\n \tgang = ctx->gang;\n \n \tspu_context_nospu_trace(destroy_spu_context__enter, ctx);\n-\tmutex_lock(&ctx->state_mutex);\n-\tspu_deactivate(ctx);\n-\tmutex_unlock(&ctx->state_mutex);\n+\n+\t/*\n+\t * Deactivate and make it non-runnable while we work on it.\n+\t */\n+\tmutex_lock(&gang->mutex);\n+\tWARN_ON(ctx->gang != gang);\n+\tspu_deactivate(gang);\n+\tmutex_unlock(&gang->mutex);\n+\n \tspu_fini_csa(&ctx->csa);\n \tspu_gang_remove_ctx(ctx->gang, ctx);\n \tif (ctx->prof_priv_kref)\n \t\tkref_put(ctx->prof_priv_kref, ctx->prof_priv_release);\n-\tBUG_ON(!list_empty(&ctx->rq));\n \tatomic_dec(&nr_spu_contexts);\n \tkfree(ctx->switch_log);\n \tkfree(ctx);\n@@ -126,20 +131,19 @@ int put_spu_context(struct spu_context *ctx)\n void spu_forget(struct spu_context *ctx)\n {\n \tstruct mm_struct *mm;\n+\tstruct spu_gang *gang = ctx->gang;\n \n \t/*\n-\t * This is basically an open-coded spu_acquire_saved, except that\n-\t * we don't acquire the state mutex interruptible, and we don't\n-\t * want this context to be rescheduled on release.\n+\t * The context is being destroyed but the gang may live on as there\n+\t * is no requirement that all contexts within the gang have to die\n+\t * at the same time.\n \t */\n-\tmutex_lock(&ctx->state_mutex);\n-\tif (ctx->state != SPU_STATE_SAVED)\n-\t\tspu_deactivate(ctx);\n-\n+\tmutex_lock(&gang->mutex);\n+\tspu_deactivate(gang);\n \tmm = ctx->owner;\n \tctx->owner = NULL;\n \tmmput(mm);\n-\tspu_release(ctx);\n+\tmutex_unlock(&gang->mutex);\n }\n \n void spu_unmap_mappings(struct spu_context *ctx)\n@@ -168,18 +172,21 @@ void spu_unmap_mappings(struct spu_context *ctx)\n  */\n int spu_acquire_saved(struct spu_context *ctx)\n {\n+\tstruct spu_gang *gang = ctx->gang;\n \tint ret;\n \n \tspu_context_nospu_trace(spu_acquire_saved__enter, ctx);\n \n-\tret = spu_acquire(ctx);\n+\tret = mutex_lock_interruptible(&gang->mutex);\n \tif (ret)\n \t\treturn ret;\n \n-\tif (ctx->state != SPU_STATE_SAVED) {\n-\t\tset_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);\n-\t\tspu_deactivate(ctx);\n-\t}\n+\t/*\n+\t * Deactivate unconditionally as we leave holding the gang lock.\n+\t * If the gang is on the runqueue, the scheduler would block on it,\n+\t * if it tried to dispatch it.\n+\t */\n+\tspu_deactivate(gang);\n \n \treturn 0;\n }\n@@ -190,10 +197,12 @@ int spu_acquire_saved(struct spu_context *ctx)\n  */\n void 
spu_release_saved(struct spu_context *ctx)\n {\n+\tmutex_lock(&ctx->state_mutex);\n+\tmutex_unlock(&ctx->gang->mutex);\n+\n \tBUG_ON(ctx->state != SPU_STATE_SAVED);\n \n-\tif (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags) &&\n-\t\t\ttest_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))\n+\tif (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))\n \t\tspu_activate(ctx, 0);\n \n \tspu_release(ctx);\ndiff --git a/arch/powerpc/platforms/cell/spufs/file.c \nb/arch/powerpc/platforms/cell/spufs/file.c\nindex 010a51f..1e2111c 100644\n--- a/arch/powerpc/platforms/cell/spufs/file.c\n+++ b/arch/powerpc/platforms/cell/spufs/file.c\n@@ -2605,7 +2605,7 @@ static int spufs_show_ctx(struct seq_file *s, void \n*private)\n \t\tctx->prio,\n \t\tctx->time_slice,\n \t\tctx->spu ? ctx->spu->number : -1,\n-\t\t!list_empty(&ctx->rq) ? 'q' : ' ',\n+\t\t!list_empty(&ctx->gang->rq) ? 'q' : ' ',\n \t\tctx->csa.class_0_pending,\n \t\tctx->csa.class_0_dar,\n \t\tctx->csa.class_1_dsisr,\ndiff --git a/arch/powerpc/platforms/cell/spufs/gang.c \nb/arch/powerpc/platforms/cell/spufs/gang.c\nindex 2a01271..3fcbdc7 100644\n--- a/arch/powerpc/platforms/cell/spufs/gang.c\n+++ b/arch/powerpc/platforms/cell/spufs/gang.c\n@@ -38,6 +38,7 @@ struct spu_gang *alloc_spu_gang(void)\n \tmutex_init(&gang->aff_mutex);\n \tINIT_LIST_HEAD(&gang->list);\n \tINIT_LIST_HEAD(&gang->aff_list_head);\n+\tINIT_LIST_HEAD(&gang->rq);\n \n \t/*\n \t * Inherit scheduling parameters from the creator of the gang.\n@@ -91,7 +92,16 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct \nspu_context *ctx)\n \t}\n \tlist_del_init(&ctx->gang_list);\n \tgang->contexts--;\n-\tmutex_unlock(&gang->mutex);\n+\tatomic_dec(&gang->nstarted);\n+\tif (spu_gang_runnable(gang)) {\n+\t\tctx = list_first_entry(&gang->list,\n+\t\t\t\tstruct spu_context, gang_list);\n+\t\tmutex_lock(&ctx->state_mutex);\n+\t\tmutex_unlock(&gang->mutex);\n+\t\tspu_activate(ctx, 0);\n+\t\tmutex_unlock(&ctx->state_mutex);\n+\t} else\n+\t\tmutex_unlock(&gang->mutex);\n \n \tput_spu_gang(gang);\n }\ndiff --git a/arch/powerpc/platforms/cell/spufs/run.c \nb/arch/powerpc/platforms/cell/spufs/run.c\nindex c9bb7cf..b7f5339 100644\n--- a/arch/powerpc/platforms/cell/spufs/run.c\n+++ b/arch/powerpc/platforms/cell/spufs/run.c\n@@ -172,11 +172,45 @@ out:\n static int spu_run_init(struct spu_context *ctx, u32 *npc)\n {\n \tunsigned long runcntl = SPU_RUNCNTL_RUNNABLE;\n+\tstruct spu_gang *gang = ctx->gang;\n \tint ret;\n \n \tspuctx_switch_state(ctx, SPU_UTIL_SYSTEM);\n \n \t/*\n+\t * Gang start.  The nstarted variable is incremented the first\n+\t * time that a context is started.  When all ctxts in a gang have\n+\t * been started, the gang is considered runnable.\n+\t *\n+\t * Gang runnable.  The nrunnable variable is the number of\n+\t * contexts that are currently being processed by spufs_run_spu.\n+\t * The count is incremented on entry and decremented on return\n+\t * to user mode.  When a context is in user mode, the spu is\n+\t * stopped.  When the count goes to zero, the gang is unloaded\n+\t * if there is another gang waiting to run.  In this case, it\n+\t * is unloaded, but it is not added to the runqueue as there\n+\t * are no runnable contexts in the gang.  There has to be at\n+\t * least one runnable context to be added to the runq.  In this\n+\t * way, we prevent the runqueue from being over run with non-\n+\t * runnable contexts.  The gang is guaranteed to be put back on\n+\t * the runqueue, when a context is started again.  
This is\n+\t * driven explcitly by the program.\n+\t *\n+\t * Gang preemption. In addition to the case mentioned above\n+\t * when all of the contexts are invoking user mode code, the\n+\t * gang is also preempted when all of the contexts are either\n+\t * in user mode or processing page faults.  The nfaulting variable\n+\t * is the number of contexts in a gang that are currently\n+\t * processing page faults.  When it equals nrunnable, the gang\n+\t * is yielded and another gang runs if there is one on the\n+\t * runqueue.  In this case, the gang is added to the runqueue\n+\t * as the nrunnable field is still positive.\n+\t */\n+\tif (test_and_clear_bit(SPU_SCHED_JUST_CREATED, &ctx->sched_flags))\n+\t\tatomic_inc(&gang->nstarted);\n+\tatomic_inc(&gang->nrunnable);\n+\n+\t/*\n \t * NOSCHED is synchronous scheduling with respect to the caller.\n \t * The caller waits for the context to be loaded.\n \t */\n@@ -242,8 +276,6 @@ static int spu_run_fini(struct spu_context *ctx, u32 *npc,\n {\n \tint ret = 0;\n \n-\tspu_del_from_rq(ctx);\n-\n \t*status = ctx->ops->status_read(ctx);\n \t*npc = ctx->ops->npc_read(ctx);\n \n@@ -251,6 +283,8 @@ static int spu_run_fini(struct spu_context *ctx, u32 *npc,\n \tclear_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);\n \tspu_release(ctx);\n \n+\tatomic_dec(&ctx->gang->nrunnable);\n+\n \tif (signal_pending(current))\n \t\tret = -ERESTARTSYS;\n \n@@ -356,6 +390,18 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 \n*event)\n \n \tctx->event_return = 0;\n \n+\t/*\n+\t * This routine is almost entirely context based and runs mostly\n+\t * under the context lock.  The locking strategy is driven by the\n+\t * state of the context.  On input, the gang and context locks are\n+\t * taken if the context is in the non-runnable state.  Both locks\n+\t * are required to activate the context as it is a gang level\n+\t * operation.  This occurs before the loop in spu_run_init. The\n+\t * loop itself is processed just with the context lock and if the\n+\t * system call returns and is re-entered as long as the context\n+\t * remains in the runnable state, the gang lock is not required\n+\t * as the context does not need to be activated.\n+\t */\n \tret = spu_acquire(ctx);\n \tif (ret)\n \t\tgoto out_unlock;\ndiff --git a/arch/powerpc/platforms/cell/spufs/sched.c \nb/arch/powerpc/platforms/cell/spufs/sched.c\nindex 0e29f12..c0cc876 100644\n--- a/arch/powerpc/platforms/cell/spufs/sched.c\n+++ b/arch/powerpc/platforms/cell/spufs/sched.c\n@@ -48,11 +48,39 @@\n #include <asm/spu_priv1.h>\n #include \"spufs.h\"\n \n+/*\n+ * Gang scheduling locking strategy.\n+ *\n+ * All contexts in a gang are scheduled / unscheduled at the same time.\n+ * Several locks have to be taken at the same time to make this happen\n+ * atomically.  The hierarchy is gang, ctxt, runq_lock, and\n+ * cbe_spu_info[].list_lock.  The gang lock is taken principally to ensure\n+ * that additional contexts cannot be added or removed while the scheduler\n+ * is operating on the gang.  The ctxt lock must be taken to serialize\n+ * access to physical spu resources including registers, state save,\n+ * and restore.  The runqueue lock to synchronize access to the runqueue\n+ * and to ensure that the view of available resources and active gangs\n+ * is perfomed atomically across the system.  
The list_lock is needed\n+ * to allocate and free individual spu resources.\n+ */\n+\n+struct spu_sched_stats {\n+\tint inode;\t/* node with the most idle spus (idle node) */\n+\tint inode_icnt;\t/* count of idle spus on idle node */\n+\tint inode_pcnt;\t/* count of preemptible spus on idle node */\n+\tint pnode;\t/* node with the most preemptible spus (preempt node) */\n+\tint pnode_icnt;\t/* count of idle spus on preempt node */\n+\tint pnode_pcnt; /* count of preemptible spus on preempt node */\n+\tint total_icnt; /* total idle spus across system */\n+\tint total_pcnt; /* total preemptible spus across system */\n+};\n+\n struct spu_prio_array {\n \tDECLARE_BITMAP(bitmap, MAX_PRIO);\n \tstruct list_head runq[MAX_PRIO];\n \tspinlock_t runq_lock;\n \tint nr_waiting;\n+\tint nr_contexts;\n };\n \n static unsigned long spu_avenrun[3];\n@@ -60,6 +88,7 @@ static struct spu_prio_array *spu_prio;\n static struct task_struct *spusched_task;\n static struct timer_list spusched_timer;\n static struct timer_list spuloadavg_timer;\n+static void spu_unschedule(struct spu_gang *gang);\n \n /*\n  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).\n@@ -85,6 +114,14 @@ static struct timer_list spuloadavg_timer;\n #define SCALE_PRIO(x, prio) \\\n \tmax(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)\n \n+/* These constants are used when reserving spus for gang scheduling. */\n+#define SPU_RESERVE\t\t0\n+#define SPU_RESERVE_UNDO\t1\n+\n+/* Constants used by the scheduler's placement algorithm. */\n+#define SPU_PLACE_NONE\t\t-2\n+#define SPU_PLACE_ALL\t\t-1\n+\n /*\n  * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:\n  * [800ms ... 100ms ... 5ms]\n@@ -109,12 +146,6 @@ void spu_set_timeslice(struct spu_context *ctx)\n void __spu_update_sched_info(struct spu_context *ctx)\n {\n \t/*\n-\t * assert that the context is not on the runqueue, so it is safe\n-\t * to change its scheduling parameters.\n-\t */\n-\tBUG_ON(!list_empty(&ctx->rq));\n-\n-\t/*\n \t * 32-Bit assignments are atomic on powerpc, and we don't care about\n \t * memory ordering here because retrieving the controlling thread is\n \t * per definition racy.\n@@ -178,24 +209,24 @@ void spu_update_sched_info(struct spu_context *ctx)\n \t}\n }\n \n-static int __node_allowed(struct spu_context *ctx, int node)\n+static int __node_allowed(struct spu_gang *gang, int node)\n {\n \tif (nr_cpus_node(node)) {\n \t\tcpumask_t mask = node_to_cpumask(node);\n \n-\t\tif (cpus_intersects(mask, ctx->cpus_allowed))\n+\t\tif (cpus_intersects(mask, gang->cpus_allowed))\n \t\t\treturn 1;\n \t}\n \n \treturn 0;\n }\n \n-static int node_allowed(struct spu_context *ctx, int node)\n+static int node_allowed(struct spu_gang *gang, int node)\n {\n \tint rval;\n \n \tspin_lock(&cbe_spu_info[node].list_lock);\n-\trval = __node_allowed(ctx, node);\n+\trval = __node_allowed(gang, node);\n \tspin_unlock(&cbe_spu_info[node].list_lock);\n \n \treturn rval;\n@@ -329,7 +360,7 @@ static struct spu *aff_ref_location(struct spu_context \n*ctx, int mem_aff,\n \t\tint available_spus;\n \n \t\tnode = (node < MAX_NUMNODES) ? 
node : 0;\n-\t\tif (!node_allowed(ctx, node))\n+\t\tif (!node_allowed(ctx->gang, node))\n \t\t\tcontinue;\n \n \t\tavailable_spus = 0;\n@@ -413,15 +444,17 @@ static struct spu *ctx_location(struct spu *ref, int \noffset, int node)\n  * affinity_check is called each time a context is going to be scheduled.\n  * It returns the spu ptr on which the context must run.\n  */\n-static int has_affinity(struct spu_context *ctx)\n+static int has_affinity(struct spu_gang *gang)\n {\n-\tstruct spu_gang *gang = ctx->gang;\n-\n-\tif (list_empty(&ctx->aff_list))\n+\tif (list_empty(&gang->aff_list_head))\n \t\treturn 0;\n \n-\tif (atomic_read(&ctx->gang->aff_sched_count) == 0)\n-\t\tctx->gang->aff_ref_spu = NULL;\n+\t/*\n+\t * TODO: fix SPU Affinity to work with gang scheduling.\n+\t */\n+\n+\tif (atomic_read(&gang->aff_sched_count) == 0)\n+\t\tgang->aff_ref_spu = NULL;\n \n \tif (!gang->aff_ref_spu) {\n \t\tif (!(gang->aff_flags & AFF_MERGED))\n@@ -487,302 +520,463 @@ static void spu_unbind_context(struct spu *spu, struct \nspu_context *ctx)\n }\n \n /**\n- * spu_add_to_rq - add a context to the runqueue\n- * @ctx:       context to add\n+ * spu_add_to_rq - add a gang to the runqueue\n+ * @gang:       gang to add\n  */\n-static void __spu_add_to_rq(struct spu_context *ctx)\n+static void __spu_add_to_rq(struct spu_gang *gang)\n {\n-\t/*\n-\t * Unfortunately this code path can be called from multiple threads\n-\t * on behalf of a single context due to the way the problem state\n-\t * mmap support works.\n-\t *\n-\t * Fortunately we need to wake up all these threads at the same time\n-\t * and can simply skip the runqueue addition for every but the first\n-\t * thread getting into this codepath.\n-\t *\n-\t * It's still quite hacky, and long-term we should proxy all other\n-\t * threads through the owner thread so that spu_run is in control\n-\t * of all the scheduling activity for a given context.\n-\t */\n-\tif (list_empty(&ctx->rq)) {\n-\t\tlist_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);\n-\t\tset_bit(ctx->prio, spu_prio->bitmap);\n+\tif (atomic_read(&gang->nrunnable) && list_empty(&gang->rq)) {\n+\t\tint prio = gang->prio;\n+\n+\t\tlist_add_tail(&gang->rq, &spu_prio->runq[prio]);\n+\t\tset_bit(prio, spu_prio->bitmap);\n+\t\tspu_prio->nr_contexts += gang->contexts;\n \t\tif (!spu_prio->nr_waiting++)\n \t\t\t__mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);\n \t}\n }\n \n-static void spu_add_to_rq(struct spu_context *ctx)\n+static void spu_add_to_rq(struct spu_gang *gang)\n {\n \tspin_lock(&spu_prio->runq_lock);\n-\t__spu_add_to_rq(ctx);\n+\t__spu_add_to_rq(gang);\n \tspin_unlock(&spu_prio->runq_lock);\n }\n \n-static void __spu_del_from_rq(struct spu_context *ctx)\n+static void __spu_del_from_rq(struct spu_gang *gang)\n {\n-\tint prio = ctx->prio;\n+\tif (!list_empty(&gang->rq)) {\n+\t\tint prio = gang->prio;\n \n-\tif (!list_empty(&ctx->rq)) {\n+\t\tspu_prio->nr_contexts -= gang->contexts;\n \t\tif (!--spu_prio->nr_waiting)\n \t\t\tdel_timer(&spusched_timer);\n-\t\tlist_del_init(&ctx->rq);\n-\n+\t\tlist_del_init(&gang->rq);\n \t\tif (list_empty(&spu_prio->runq[prio]))\n \t\t\tclear_bit(prio, spu_prio->bitmap);\n \t}\n }\n \n-void spu_del_from_rq(struct spu_context *ctx)\n+static void spu_del_from_rq(struct spu_gang *gang)\n {\n \tspin_lock(&spu_prio->runq_lock);\n-\t__spu_del_from_rq(ctx);\n+\t__spu_del_from_rq(gang);\n \tspin_unlock(&spu_prio->runq_lock);\n }\n \n-static void spu_prio_wait(struct spu_context *ctx)\n-{\n-\tDEFINE_WAIT(wait);\n \n-\t/*\n-\t * The caller must explicitly wait for 
a context to be loaded\n-\t * if the nosched flag is set.  If NOSCHED is not set, the caller\n-\t * queues the context and waits for an spu event or error.\n-\t */\n-\tBUG_ON(!(ctx->flags & SPU_CREATE_NOSCHED));\n-\n-\tspin_lock(&spu_prio->runq_lock);\n-\tprepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);\n-\tif (!signal_pending(current)) {\n-\t\t__spu_add_to_rq(ctx);\n-\t\tspin_unlock(&spu_prio->runq_lock);\n-\t\tmutex_unlock(&ctx->state_mutex);\n-\t\tschedule();\n-\t\tmutex_lock(&ctx->state_mutex);\n-\t\tspin_lock(&spu_prio->runq_lock);\n-\t\t__spu_del_from_rq(ctx);\n-\t}\n-\tspin_unlock(&spu_prio->runq_lock);\n-\t__set_current_state(TASK_RUNNING);\n-\tremove_wait_queue(&ctx->stop_wq, &wait);\n-}\n-\n-static struct spu *spu_get_idle(struct spu_context *ctx)\n+static struct spu *spu_bind(struct spu_gang *gang,\n+\t\t\t\tstruct spu_context *ctx, int node)\n {\n-\tstruct spu *spu, *aff_ref_spu;\n-\tint node, n;\n-\n-\tspu_context_nospu_trace(spu_get_idle__enter, ctx);\n-\n-\tif (ctx->gang) {\n-\t\tmutex_lock(&ctx->gang->aff_mutex);\n-\t\tif (has_affinity(ctx)) {\n-\t\t\taff_ref_spu = ctx->gang->aff_ref_spu;\n-\t\t\tatomic_inc(&ctx->gang->aff_sched_count);\n-\t\t\tmutex_unlock(&ctx->gang->aff_mutex);\n-\t\t\tnode = aff_ref_spu->node;\n+\tstruct spu *spu;\n+\tint n;\n \n-\t\t\tspin_lock(&cbe_spu_info[node].list_lock);\n-\t\t\tspu = ctx_location(aff_ref_spu, ctx->aff_offset, node);\n-\t\t\tif (spu && spu->alloc_state == SPU_FREE)\n-\t\t\t\tgoto found;\n-\t\t\tspin_unlock(&cbe_spu_info[node].list_lock);\n+\tif (node == SPU_PLACE_ALL)\n+\t\tnode = cpu_to_node(raw_smp_processor_id());\n \n-\t\t\tatomic_dec(&ctx->gang->aff_sched_count);\n-\t\t\tgoto not_found;\n-\t\t}\n-\t\tmutex_unlock(&ctx->gang->aff_mutex);\n-\t}\n-\tnode = cpu_to_node(raw_smp_processor_id());\n \tfor (n = 0; n < MAX_NUMNODES; n++, node++) {\n \t\tnode = (node < MAX_NUMNODES) ? 
node : 0;\n-\t\tif (!node_allowed(ctx, node))\n+\t\tif (!node_allowed(gang, node))\n \t\t\tcontinue;\n \n \t\tspin_lock(&cbe_spu_info[node].list_lock);\n \t\tlist_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {\n-\t\t\tif (spu->alloc_state == SPU_FREE)\n+\t\t\tif ((spu->alloc_state == SPU_FREE) &&\n+\t\t\t    (spu->gang == gang))\n \t\t\t\tgoto found;\n \t\t}\n \t\tspin_unlock(&cbe_spu_info[node].list_lock);\n \t}\n-\n- not_found:\n-\tspu_context_nospu_trace(spu_get_idle__not_found, ctx);\n \treturn NULL;\n \n- found:\n+found:\n+\tcbe_spu_info[node].nr_active++;\n \tspu->alloc_state = SPU_USED;\n+\tspu->ctx = ctx;\n \tspin_unlock(&cbe_spu_info[node].list_lock);\n-\tspu_context_trace(spu_get_idle__found, ctx, spu);\n-\tspu_init_channels(spu);\n+\n \treturn spu;\n }\n \n+static void __spu_schedule(struct spu_gang *gang, int node_chosen)\n+{\n+\tstruct spu_context *ctx;\n+\tstruct spu *spu;\n+\n+\tspu_del_from_rq(gang);\n+\n+\tlist_for_each_entry(ctx, &gang->list, gang_list) {\n+\t\tmutex_lock(&ctx->state_mutex);\n+\t\tBUG_ON(ctx->spu);\n+\t\tspu = spu_bind(gang, ctx, node_chosen);\n+\t\tBUG_ON(!spu);\n+\t\tspu_bind_context(spu, ctx);\n+\t\tspu_set_timeslice(ctx);\n+\t\twake_up_all(&ctx->run_wq);\n+\t\tmutex_unlock(&ctx->state_mutex);\n+\t}\n+}\n+\n+static void spu_unschedule(struct spu_gang *gang)\n+{\n+\tstruct spu_context *ctx;\n+\tstruct spu *spu;\n+\tint node;\n+\n+\tlist_for_each_entry(ctx, &gang->list, gang_list) {\n+\n+\t\tmutex_lock(&ctx->state_mutex);\n+\t\tspu = ctx->spu;\n+\t\tBUG_ON(!spu);\n+\t\tBUG_ON(spu->ctx != ctx);\n+\t\tBUG_ON(spu->gang != gang);\n+\t\tBUG_ON(spu->alloc_state != SPU_USED);\n+\t\tnode = spu->node;\n+\t\tspu_unbind_context(spu, ctx);\n+\t\tspin_lock(&cbe_spu_info[node].list_lock);\n+\t\tcbe_spu_info[node].nr_active--;\n+\t\tspu->alloc_state = SPU_FREE;\n+\t\tspu->ctx = NULL;\n+\t\tspu->gang = NULL;\n+\t\tctx->stats.invol_ctx_switch++;\n+\t\tspu->stats.invol_ctx_switch++;\n+\t\tspin_unlock(&cbe_spu_info[node].list_lock);\n+\t\tmutex_unlock(&ctx->state_mutex);\n+\t}\n+}\n+\n+static int spu_get_idle(struct spu_gang *gang, int node)\n+{\n+\tstruct spu *spu;\n+\tint n, lnode, count, mode;\n+\tint ret = 0, found = 0;\n+\n+\tspu_context_nospu_trace(spu_get_idle__enter, gang);\n+\n+\t/* TO DO: SPU affinity scheduling. */\n+\n+\tmode = SPU_RESERVE;\n+\n+spu_get_idle_top:\n+\n+\tif (node == SPU_PLACE_ALL)\n+\t\tlnode = cpu_to_node(raw_smp_processor_id());\n+\telse\n+\t\tlnode = node;\n+\n+\tfor (n = 0, count = 0;\n+\t     n < MAX_NUMNODES && count < gang->contexts;\n+\t     n++, lnode++) {\n+\n+\t\tlnode = (lnode < MAX_NUMNODES) ? 
lnode : 0;\n+\t\tif (!node_allowed(gang, lnode))\n+\t\t\tcontinue;\n+\n+\t\tspin_lock(&cbe_spu_info[lnode].list_lock);\n+\t\tlist_for_each_entry(spu, &cbe_spu_info[lnode].spus, cbe_list) {\n+\t\t\tswitch (mode) {\n+\t\t\tcase SPU_RESERVE :\n+\t\t\t\tif ((spu->alloc_state == SPU_FREE) &&\n+\t\t\t\t    (spu->gang == NULL)) {\n+\t\t\t\t\tspu_init_channels(spu);\n+\t\t\t\t\tspu->gang = gang;\n+\t\t\t\t\tif (++count == gang->contexts)\n+\t\t\t\t\t\tgoto spu_get_idle_found;\n+\t\t\t\t}\n+\t\t\t\tbreak;\n+\t\t\tcase SPU_RESERVE_UNDO :\n+\t\t\t\tif ((spu->alloc_state == SPU_FREE) &&\n+\t\t\t\t    (spu->gang == gang)) {\n+\t\t\t\t\tspu->gang = NULL;\n+\t\t\t\t\tif (++count == found)\n+\t\t\t\t\t\tgoto spu_get_idle_found;\n+\t\t\t\t}\n+\t\t\t\tbreak;\n+\t\t\t}\n+\t\t}\n+\t\tspin_unlock(&cbe_spu_info[lnode].list_lock);\n+\t}\n+\tBUG_ON(mode != SPU_RESERVE);\n+\n+\tfound = count;\n+\tret = -1;\n+\tmode = SPU_RESERVE_UNDO;\n+\n+\tif (found)\n+\t\tgoto spu_get_idle_top;\n+\telse\n+\t\tgoto spu_get_idle_out;\n+\n+spu_get_idle_found:\n+\n+\tfound = count;\n+\n+\tspin_unlock(&cbe_spu_info[lnode].list_lock);\n+\n+spu_get_idle_out:\n+\n+\tif (ret)\n+\t\tspu_gang_nospu_trace(spu_get_idle__not_found,\n+\t\t\t\t\tgang, gang->contexts, found)\n+\telse\n+\t\tspu_gang_trace(spu_get_idle__found, gang, gang->contexts)\n+\n+\treturn ret;\n+}\n+\n /**\n- * find_victim - find a lower priority context to preempt\n- * @ctx:\tcanidate context for running\n+ * find_victim - find a lower priority gang to preempt\n+ * @gang:\tcanidate gang for running\n  *\n- * Returns the freed physical spu to run the new context on.\n+ * Returns 0 for success (preemption occurred), -1 for failure\n  */\n-static struct spu *find_victim(struct spu_context *ctx)\n+static int find_victim(struct spu_gang *gang, int node)\n {\n-\tstruct spu_context *victim = NULL;\n+\tstruct spu_gang *victim;\n+\tstruct spu_context *ctx;\n \tstruct spu *spu;\n-\tint node, n;\n+\tint n, retry = 3;\n \n-\tspu_context_nospu_trace(spu_find_victim__enter, ctx);\n+\tspu_context_nospu_trace(spu_find_victim__enter, gang);\n+\n+\tif (node == SPU_PLACE_ALL)\n+\t\tnode = cpu_to_node(raw_smp_processor_id());\n \n-\t/*\n-\t * Look for a possible preemption candidate on the local node first.\n-\t * If there is no candidate look at the other nodes.  This isn't\n-\t * exactly fair, but so far the whole spu scheduler tries to keep\n-\t * a strong node affinity.  We might want to fine-tune this in\n-\t * the future.\n-\t */\n- restart:\n-\tnode = cpu_to_node(raw_smp_processor_id());\n \tfor (n = 0; n < MAX_NUMNODES; n++, node++) {\n+\n \t\tnode = (node < MAX_NUMNODES) ? node : 0;\n-\t\tif (!node_allowed(ctx, node))\n+\t\tif (!node_allowed(gang, node))\n \t\t\tcontinue;\n+restart_node:\n+\t\t/*\n+\t\t * Retry to avoid algorithm deadlock.  The act of\n+\t\t * unscheduling takes place before scheduling, so we can't\n+\t\t * spin indefinitely in the unschedule (ie. 
find_victim)\n+\t\t * waiting for a priority change to occur.\n+\t\t */\n+\t\tif (!--retry)\n+\t\t\tcontinue;\n+\n+\t\tvictim = NULL;\n \n \t\tspin_lock(&cbe_spu_info[node].list_lock);\n \t\tlist_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {\n-\t\t\tstruct spu_context *tmp = spu->ctx;\n+\t\t\tstruct spu_gang *tmp = spu->gang;\n+\t\t\tstruct spu_context *tmpc = spu->ctx;\n \n-\t\t\tif (tmp && tmp->prio > ctx->prio &&\n-\t\t\t    !(tmp->flags & SPU_CREATE_NOSCHED) &&\n+\t\t\tif ((spu->alloc_state == SPU_USED) &&\n+\t\t\t    (tmp != NULL) &&\n+\t\t\t    (tmp->prio > gang->prio) &&\n+\t\t\t   !(tmpc->flags & SPU_CREATE_NOSCHED) &&\n \t\t\t    (!victim || tmp->prio > victim->prio)) {\n-\t\t\t\tvictim = spu->ctx;\n+\t\t\t\tvictim = tmp;\n \t\t\t}\n \t\t}\n-\t\tif (victim)\n-\t\t\tget_spu_context(victim);\n \t\tspin_unlock(&cbe_spu_info[node].list_lock);\n \n \t\tif (victim) {\n \t\t\t/*\n-\t\t\t * This nests ctx->state_mutex, but we always lock\n+\t\t\t * This nests gang->mutex, but we always lock\n \t\t\t * higher priority contexts before lower priority\n \t\t\t * ones, so this is safe until we introduce\n \t\t\t * priority inheritance schemes.\n \t\t\t *\n-\t\t\t * XXX if the highest priority context is locked,\n-\t\t\t * this can loop a long time.  Might be better to\n-\t\t\t * look at another context or give up after X retries.\n+\t\t\t * XXX If victim no longer exists, we are referencing\n+\t\t\t * stray kernel memory and changing it with the mutex\n+\t\t\t * lock and subsequent actions.\n \t\t\t */\n-\t\t\tif (!mutex_trylock(&victim->state_mutex)) {\n-\t\t\t\tput_spu_context(victim);\n-\t\t\t\tvictim = NULL;\n-\t\t\t\tgoto restart;\n+\t\t\tif (test_bit(SPU_SCHED_DEACTIVATED, &gang->sched_flags))\n+\t\t\t\tgoto restart_node;\n+\n+\t\t\tif (!mutex_trylock(&victim->mutex)) {\n+\t\t\t\tgoto restart_node;\n \t\t\t}\n \n-\t\t\tspu = victim->spu;\n-\t\t\tif (!spu || victim->prio <= ctx->prio) {\n+\t\t\tif (!spu_gang_runnable(victim)) {\n \t\t\t\t/*\n \t\t\t\t * This race can happen because we've dropped\n \t\t\t\t * the active list mutex.  Not a problem, just\n \t\t\t\t * restart the search.\n \t\t\t\t */\n-\t\t\t\tmutex_unlock(&victim->state_mutex);\n-\t\t\t\tput_spu_context(victim);\n-\t\t\t\tvictim = NULL;\n-\t\t\t\tgoto restart;\n+\t\t\t\tmutex_unlock(&victim->mutex);\n+\t\t\t\tgoto restart_node;\n \t\t\t}\n \n-\t\t\tspu_context_trace(__spu_deactivate__unload, ctx, spu);\n+\t\t\tctx = list_first_entry(&victim->list,\n+\t\t\t\t\tstruct spu_context, gang_list);\n \n-\t\t\tspu_unbind_context(spu, victim);\n+\t\t\tif (!ctx->spu || victim->prio <= gang->prio) {\n+\t\t\t\t/*\n+\t\t\t\t * This race can happen because we've dropped\n+\t\t\t\t * the active list mutex.  
Not a problem, just\n+\t\t\t\t * restart the search.\n+\t\t\t\t */\n+\t\t\t\tmutex_unlock(&victim->mutex);\n+\t\t\t\tgoto restart_node;\n+\t\t\t}\n \n-\t\t\tspin_lock(&cbe_spu_info[node].list_lock);\n-\t\t\tspu->ctx = NULL;\n-\t\t\tcbe_spu_info[node].nr_active--;\n-\t\t\tspin_unlock(&cbe_spu_info[node].list_lock);\n+\t\t\tspu_gang_trace(__spu_deactivate__unload,\n+\t\t\t\t\t\tvictim, victim->contexts);\n \n-\t\t\tvictim->stats.invol_ctx_switch++;\n-\t\t\tspu->stats.invol_ctx_switch++;\n-\t\t\tif (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))\n-\t\t\t\tspu_add_to_rq(victim);\n+\t\t\tspu_unschedule(victim);\n+\t\t\tspu_add_to_rq(victim);\n \n-\t\t\tmutex_unlock(&victim->state_mutex);\n-\t\t\tput_spu_context(victim);\n+\t\t\tmutex_unlock(&victim->mutex);\n \n-\t\t\treturn spu;\n+\t\t\treturn 0;\n \t\t}\n \t}\n+\treturn -1;\n+}\n \n-\treturn NULL;\n+static int spu_gang_placeable(struct spu_gang *gang,\n+\tstruct spu_sched_stats *stats, int *node_chosen, int *node_preempt)\n+{\n+\t/*\n+\t * Strategy is to minimize preemption and to place contexts on\n+\t * a single node, if possible.  Affinity gangs must be scheduled\n+\t * on one node.  Identify nodes to preempt if necessary.\n+\t */\n+\tif (has_affinity(gang)) {\n+\t\tif (stats->inode_icnt + stats->inode_pcnt >= gang->contexts) {\n+\t\t\t*node_chosen = stats->inode;\n+\t\t\tif (stats->inode_icnt < gang->contexts)\n+\t\t\t\t*node_preempt = stats->inode_icnt;\n+\t\t\telse\n+\t\t\t\t*node_preempt = SPU_PLACE_NONE;\n+\t\t\treturn 1;\n+\t\t}\n+\t\tif (stats->pnode_icnt + stats->pnode_pcnt >= gang->contexts) {\n+\t\t\t*node_chosen = stats->pnode;\n+\t\t\tif (stats->pnode_icnt < gang->contexts)\n+\t\t\t\t*node_preempt = stats->pnode;\n+\t\t\telse\n+\t\t\t\t*node_preempt = SPU_PLACE_NONE;\n+\t\t\treturn 1;\n+\t\t}\n+\t\treturn 0;\n+\t}\n+\tif (stats->inode_icnt >= gang->contexts) {\n+\t\t*node_chosen = stats->inode;\t/* allocate idle node */\n+\t\t*node_preempt = SPU_PLACE_NONE;\n+\t\treturn 1;\n+\t}\n+\tif (stats->total_icnt >= gang->contexts) {\n+\t\t*node_chosen = SPU_PLACE_ALL;\t/* allocate all nodes */\n+\t\t*node_preempt = SPU_PLACE_NONE;\n+\t\treturn 1;\n+\t}\n+\tif (stats->inode_icnt + stats->inode_pcnt >= gang->contexts) {\n+\t\t*node_chosen = stats->inode;\t/* allocate idle node */\n+\t\t*node_preempt = stats->inode;\t/* preempt on idle node */\n+\t\treturn 1;\n+\t}\n+\tif (stats->pnode_icnt + stats->pnode_pcnt >= gang->contexts) {\n+\t\t*node_chosen = stats->pnode;\t/* allocate other node */\n+\t\t*node_preempt = stats->pnode;\t/* preempt on other node */\n+\t\treturn 1;\n+\t}\n+\tif (stats->total_icnt + stats->inode_pcnt >= gang->contexts) {\n+\t\t*node_chosen = SPU_PLACE_ALL;\t/* allocate all nodes */\n+\t\t*node_preempt = stats->inode;\t/* preempt on idle node */\n+\t\treturn 1;\n+\t}\n+\tif (stats->total_icnt + stats->pnode_pcnt >= gang->contexts) {\n+\t\t*node_chosen = SPU_PLACE_ALL;\t/* allocate all nodes */\n+\t\t*node_preempt = stats->pnode;\t/* preempt other node */\n+\t\treturn 1;\n+\t}\n+\tif (stats->total_icnt + stats->total_pcnt >= gang->contexts) {\n+\t\t*node_chosen = SPU_PLACE_ALL;\t/* allocate nodes */\n+\t\t*node_preempt = SPU_PLACE_ALL;\t/* preempt all nodes */\n+\t\treturn 1;\n+\t}\n+\treturn 0;\n }\n \n-static void __spu_schedule(struct spu *spu, struct spu_context *ctx)\n+static void spu_get_run_stats(struct spu_gang *gang,\n+\t\t\t\t\tstruct spu_sched_stats *stats)\n {\n-\tint node = spu->node;\n-\tint success = 0;\n+\tstruct spu *spu;\n+\tint n, node, count_idle, count_preempt;\n 
\n-\tspu_set_timeslice(ctx);\n+\tstats->inode = -1;\t\t/* node with most idle spus */\n+\tstats->inode_icnt = 0; \t\t/* n idle on idle node */\n+\tstats->inode_pcnt = 0;\t\t/* n preemptable on idle node */\n+\tstats->pnode = -1;\t\t/* node with most preemptable spus */\n+\tstats->pnode_icnt = 0; \t\t/* n idle on preempt node */\n+\tstats->pnode_pcnt = 0; \t\t/* n preemptable on preempt node */\n+\tstats->total_icnt = 0; \t\t/* total idle across all nodes */\n+\tstats->total_pcnt = 0;\t\t/* total preemptable across all nodes */\n \n-\tspin_lock(&cbe_spu_info[node].list_lock);\n-\tif (spu->ctx == NULL) {\n-\t\tcbe_spu_info[node].nr_active++;\n-\t\tspu->alloc_state = SPU_USED;\n-\t\tspu->ctx = ctx;\n-\t\tsuccess = 1;\n-\t}\n-\tspin_unlock(&cbe_spu_info[node].list_lock);\n+\tnode = cpu_to_node(raw_smp_processor_id());\n+\tfor (n = 0; n < MAX_NUMNODES; n++, node++) {\n \n-\tspu_bind_context(spu, ctx);\n+\t\tnode = (node < MAX_NUMNODES) ? node : 0;\n+\t\tif (!node_allowed(gang, node))\n+\t\t\tcontinue;\n \n-\tif (success)\n-\t\twake_up_all(&ctx->run_wq);\n-\telse\n-\t\tspu_add_to_rq(ctx);\n-}\n+\t\tcount_idle = 0;\n+\t\tcount_preempt = 0;\n \n-static void spu_schedule(struct spu *spu, struct spu_context *ctx)\n-{\n-\t/* not a candidate for interruptible because it's called either\n-\t   from the scheduler thread or from spu_deactivate */\n-\tmutex_lock(&ctx->state_mutex);\n-\t__spu_schedule(spu, ctx);\n-\tspu_release(ctx);\n+\t\tspin_lock(&cbe_spu_info[node].list_lock);\n+\t\tlist_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {\n+\t\t\tstruct spu_gang *tmp = spu->gang;\n+\t\t\tstruct spu_context *tmpc = spu->ctx;\n+\n+\t\t\tswitch (spu->alloc_state) {\n+\t\t\tcase SPU_FREE :\n+\t\t\t\tif (!tmp)\n+\t\t\t\t\tcount_idle++;\n+\t\t\t\tbreak;\n+\t\t\tcase SPU_USED :\n+\t\t\t\tif (tmpc &&\n+\t\t\t\t   !(tmpc->flags & SPU_CREATE_NOSCHED)) {\n+\t\t\t\t\tif (tmp == gang) /* yield/deactivate */\n+\t\t\t\t\t\tcount_idle++;\n+\t\t\t\t\telse if (tmp->prio > gang->prio)\n+\t\t\t\t\t\tcount_preempt++;\n+\t\t\t\t}\n+\t\t\t\tbreak;\n+\t\t\t}\n+\t\t}\n+\t\tspin_unlock(&cbe_spu_info[node].list_lock);\n+\n+\t\tif (stats->inode == -1 || stats->inode_icnt < count_idle) {\n+\t\t\tstats->inode_icnt = count_idle;\n+\t\t\tstats->inode_pcnt = count_preempt;\n+\t\t\tstats->inode = node;\n+\t\t}\n+\n+\t\tif (stats->pnode == -1 || stats->pnode_pcnt < count_preempt) {\n+\t\t\tstats->pnode_icnt = count_idle;\n+\t\t\tstats->pnode_pcnt = count_preempt;\n+\t\t\tstats->pnode = node;\n+\t\t}\n+\n+\t\tstats->total_icnt += count_idle;\n+\t\tstats->total_pcnt += count_preempt;\n+\t}\n }\n \n-/**\n- * spu_unschedule - remove a context from a spu, and possibly release it.\n- * @spu:\tThe SPU to unschedule from\n- * @ctx:\tThe context currently scheduled on the SPU\n- * @free_spu\tWhether to free the SPU for other contexts\n- *\n- * Unbinds the context @ctx from the SPU @spu. If @free_spu is non-zero, the\n- * SPU is made available for other contexts (ie, may be returned by\n- * spu_get_idle). 
If this is zero, the caller is expected to schedule another\n- * context to this spu.\n- *\n- * Should be called with ctx->state_mutex held.\n- */\n-static void spu_unschedule(struct spu *spu, struct spu_context *ctx,\n-\t\tint free_spu)\n+static int spu_gang_schedulable(struct spu_gang *gang,\n+\t\tint *node_chosen, int *node_preempt)\n {\n-\tint node = spu->node;\n+\tstruct spu_sched_stats stats;\n+\tint ret;\n \n-\tif (free_spu)\n-\t\tspu->alloc_state = SPU_FREE;\n-\tspu_unbind_context(spu, ctx);\n-\n-\tspin_lock(&cbe_spu_info[node].list_lock);\n-\tcbe_spu_info[node].nr_active--;\n-\tspu->ctx = NULL;\n-\tctx->stats.invol_ctx_switch++;\n-\tspu->stats.invol_ctx_switch++;\n-\tspin_unlock(&cbe_spu_info[node].list_lock);\n+\tspin_lock(&spu_prio->runq_lock);\n+\tspu_get_run_stats(gang, &stats);\n+\tspin_unlock(&spu_prio->runq_lock);\n+\tret = spu_gang_placeable(gang, &stats, node_chosen, node_preempt);\n+\treturn ret;\n }\n \n+\n /**\n  * spu_activate - find a free spu for a context and execute it\n- * @ctx:\tspu context to schedule\n- * @flags:\tflags (currently ignored)\n+ * @ctx:\t\tspu context to schedule\n+ * @flags:\t\tflags (currently ignored)\n  *\n  * Tries to find a free spu to run @ctx.  If no free spu is available\n  * add the context to the runqueue so it gets woken up once an spu\n@@ -790,100 +984,184 @@ static void spu_unschedule(struct spu *spu, struct \nspu_context *ctx,\n  */\n int spu_activate(struct spu_context *ctx, unsigned long flags)\n {\n-\tstruct spu *spu;\n+\tstruct spu_gang *gang = ctx->gang;\n+\tint ret, node_chosen, node_preempt;\n+\n+\tif (signal_pending(current))\n+\t\treturn -ERESTARTSYS;\n \n-\t/*\n-\t * If there are multiple threads waiting for a single context\n-\t * only one actually binds the context while the others will\n-\t * only be able to acquire the state_mutex once the context\n-\t * already is in runnable state.\n-\t */\n \tif (ctx->spu)\n \t\treturn 0;\n \n-spu_activate_top:\n-\tif (signal_pending(current))\n-\t\treturn -ERESTARTSYS;\n+\tif (!mutex_trylock(&gang->mutex)) {\n+\t\tspu_release(ctx);\n+\t\tmutex_lock(&gang->mutex);\n+\t\tmutex_lock(&ctx->state_mutex);\n+\t}\n \n-\tspu = spu_get_idle(ctx);\n \t/*\n-\t * If this is a realtime thread we try to get it running by\n-\t * preempting a lower priority thread.\n+\t * Recheck as we released lock. Context could have been activated\n+\t * if it was previously started and sliced.\n \t */\n-\tif (!spu && rt_prio(ctx->prio))\n-\t\tspu = find_victim(ctx);\n-\tif (spu) {\n-\t\tunsigned long runcntl;\n-\n-\t\truncntl = ctx->ops->runcntl_read(ctx);\n-\t\t__spu_schedule(spu, ctx);\n-\t\tif (runcntl & SPU_RUNCNTL_RUNNABLE)\n-\t\t\tspuctx_switch_state(ctx, SPU_UTIL_USER);\n-\n+\tif (ctx->spu) {\n+\t\tmutex_unlock(&gang->mutex);\n \t\treturn 0;\n \t}\n \n-\tif (ctx->flags & SPU_CREATE_NOSCHED) {\n-\t\tspu_prio_wait(ctx);\n-\t\tgoto spu_activate_top;\n+\tif (!spu_gang_runnable(gang)) {\n+\t\tmutex_unlock(&gang->mutex);\n+\t\tif (ctx->flags & SPU_CREATE_NOSCHED) {\n+\t\t\tret = spufs_wait(ctx->run_wq,\n+\t\t\t\tctx->state == SPU_STATE_RUNNABLE);\n+\t\t\tif (unlikely(ret)) {\n+\t\t\t\tspu_del_from_rq(gang);\n+\t\t\t\tmutex_lock(&ctx->state_mutex);\n+\t\t\t}\n+\t\t\treturn ret;\n+\t\t}\n+\t\treturn 0;\n \t}\n+\tspu_release(ctx);\n+\n+\t/*\n+\t * Activation assumes gang is not on the runqueue as it is\n+\t * about to be activated. 
It could be on the runqueue, if it\n+\t * were time sliced while executing user mode code.\n+\t */\n+\tspu_del_from_rq(gang);\n+\n+\tclear_bit(SPU_SCHED_DEACTIVATED, &gang->sched_flags);\n+\n+spu_activate_top:\n \n-\tspu_add_to_rq(ctx);\n+\tif (spu_gang_schedulable(gang, &node_chosen, &node_preempt)) {\n \n+\t\tif (node_preempt != SPU_PLACE_NONE) {\n+\t\t\tret = find_victim(gang, node_preempt);\n+\t\t\tif (unlikely(ret))\n+\t\t\t\tgoto spu_activate_rq;\n+\t\t}\n+\n+\t\tret = spu_get_idle(gang, node_chosen);\n+\t\tif (ret)\n+\t\t\tgoto spu_activate_top;\n+\t\t__spu_schedule(gang, node_chosen);\n+\t} else {\n+\n+spu_activate_rq:\n+\t\tspu_add_to_rq(gang);\n+\t\tif (ctx->flags & SPU_CREATE_NOSCHED) {\n+\t\t\tspu_acquire(ctx);\n+\t\t\tmutex_unlock(&gang->mutex);\n+\t\t\tret = spufs_wait(ctx->run_wq,\n+\t\t\t\tctx->state == SPU_STATE_RUNNABLE);\n+\t\t\tif (unlikely(ret)) {\n+\t\t\t\tspu_del_from_rq(gang);\n+\t\t\t\tmutex_lock(&ctx->state_mutex);\n+\t\t\t}\n+\t\t\treturn ret;\n+\t\t}\n+\t}\n+\tmutex_lock(&ctx->state_mutex);\n+\tmutex_unlock(&gang->mutex);\n \treturn 0;\n }\n \n /**\n- * grab_runnable_context - try to find a runnable context\n+ * grab_runnable_gang - try to find a runnable gang\n  *\n- * Remove the highest priority context on the runqueue and return it\n- * to the caller.  Returns %NULL if no runnable context was found.\n+ * Remove the highest priority gang on the runqueue and return it\n+ * to the caller.  Returns %NULL if no runnable gang was found.\n  */\n-static struct spu_context *grab_runnable_context(int prio, int node)\n+static struct spu_gang *grab_runnable_gang(struct spu_gang *old,\n+\t\tint prio, int *node_chosen, int *node_preempt)\n {\n+\tstruct spu_sched_stats stats, lstats;\n \tstruct spu_context *ctx;\n-\tint best;\n+\tstruct spu_gang *gang;\n+\tint ret, best;\n \n+\t/*\n+\t * If old != NULL, then caller is spu_yield or spu_deactivate.  We\n+\t * can use idle spus and spus assigned to old.  Else the caller is the\n+\t * scheduler thread (time slicer) and we may only use idle spus.  In\n+\t * neither case are we allowed to preempt.  Preemption is only done by\n+\t * spu_activation which assumes that the gang is not on the runqueue.\n+\t * The fact that we are getting the gang from the runqueue implies\n+\t * that it has a less favored priority than anything currently\n+\t * running. 
spu_activate does not call this routine.\n+\t */\n \tspin_lock(&spu_prio->runq_lock);\n+\tif (old) {\n+\t\tctx = list_first_entry(&old->list,\n+\t\t\t\tstruct spu_context, gang_list);\n+\t\tBUG_ON(!ctx->spu);\n+\t\tspu_get_run_stats(old, &stats);\n+\t}\n \tbest = find_first_bit(spu_prio->bitmap, prio);\n \twhile (best < prio) {\n \t\tstruct list_head *rq = &spu_prio->runq[best];\n \n-\t\tlist_for_each_entry(ctx, rq, rq) {\n-\t\t\t/* XXX(hch): check for affinity here aswell */\n-\t\t\tif (__node_allowed(ctx, node)) {\n-\t\t\t\t__spu_del_from_rq(ctx);\n+\t\tlist_for_each_entry(gang, rq, rq) {\n+\t\t\tif (old) {\n+\t\t\t\tlstats = stats;\n+\t\t\t\tif (!node_allowed(gang, lstats.inode))\n+\t\t\t\t\tlstats.inode_icnt = 0;\n+\t\t\t} else {\n+\t\t\t\tspu_get_run_stats(gang, &lstats);\n+\t\t\t}\n+\t\t\tret = spu_gang_placeable(gang, &lstats,\n+\t\t\t\t\t\tnode_chosen, node_preempt);\n+\t\t\tif (ret && (*node_preempt == SPU_PLACE_NONE)) {\n+\t\t\t\t__spu_del_from_rq(gang);\n \t\t\t\tgoto found;\n \t\t\t}\n \t\t}\n \t\tbest++;\n \t}\n-\tctx = NULL;\n- found:\n+\tgang = NULL;\n+found:\n \tspin_unlock(&spu_prio->runq_lock);\n-\treturn ctx;\n+\treturn gang;\n }\n \n-static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)\n+static int __spu_deactivate(struct spu_gang *gang, int force, int max_prio)\n {\n-\tstruct spu *spu = ctx->spu;\n-\tstruct spu_context *new = NULL;\n+\tstruct spu_gang *new;\n+\tstruct spu_context *ctx;\n+\tint ret, node_chosen, node_preempt;\n \n-\tif (spu) {\n-\t\tnew = grab_runnable_context(max_prio, spu->node);\n-\t\tif (new || force) {\n-\t\t\tspu_unschedule(spu, ctx, new == NULL);\n-\t\t\tif (new) {\n-\t\t\t\tif (new->flags & SPU_CREATE_NOSCHED)\n-\t\t\t\t\twake_up(&new->stop_wq);\n-\t\t\t\telse {\n-\t\t\t\t\tspu_release(ctx);\n-\t\t\t\t\tspu_schedule(spu, new);\n-\t\t\t\t\t/* this one can't easily be made\n-\t\t\t\t\t   interruptible */\n-\t\t\t\t\tmutex_lock(&ctx->state_mutex);\n-\t\t\t\t}\n+\tctx = list_first_entry(&gang->list, struct spu_context, gang_list);\n+\tif (!ctx->spu)\n+\t\treturn 0;\n+\n+\tnew = grab_runnable_gang(gang, max_prio, &node_chosen, &node_preempt);\n+\tif (new || force) {\n+\t\tspu_unschedule(gang);\n+\t\tif (new) {\n+\t\t\t/*\n+\t\t\t * None of these locks can be easily made interruptible\n+\t\t\t */\n+\t\t\tmutex_unlock(&gang->mutex);\n+\n+\t\t\t/*\n+\t\t\t * Schedule gang taken from runqueue.  Should fit, but\n+\t\t\t * might not as we dropped the lock above.  
Also, it\n+\t\t\t * could have been activated by spu_run.\n+\t\t\t */\n+\t\t\tmutex_lock(&new->mutex);\n+\t\t\tctx = list_first_entry(&new->list,\n+\t\t\t\tstruct spu_context, gang_list);\n+\t\t\tif (!ctx->spu) {\n+\t\t\t\tret = spu_get_idle(new, node_chosen);\n+\t\t\t\tif (ret)\n+\t\t\t\t\tspu_add_to_rq(new);\n+\t\t\t\telse\n+\t\t\t\t\t__spu_schedule(new, node_chosen);\n \t\t\t}\n+\t\t\tmutex_unlock(&new->mutex);\n+\n+\t\t\tmutex_lock(&gang->mutex);\n \t\t}\n \t}\n \n@@ -897,67 +1175,85 @@ static int __spu_deactivate(struct spu_context *ctx, \nint force, int max_prio)\n  * Unbind @ctx from the physical spu it is running on and schedule\n  * the highest priority context to run on the freed physical spu.\n  */\n-void spu_deactivate(struct spu_context *ctx)\n+void spu_deactivate(struct spu_gang *gang)\n {\n-\tspu_context_nospu_trace(spu_deactivate__enter, ctx);\n-\t__spu_deactivate(ctx, 1, MAX_PRIO);\n+\tspu_context_nospu_trace(spu_deactivate__enter, gang);\n+\tset_bit(SPU_SCHED_DEACTIVATED, &gang->sched_flags);\n+\t__spu_deactivate(gang, 1, MAX_PRIO);\n+\tspu_del_from_rq(gang);\n }\n \n /**\n  * spu_yield -\tyield a physical spu if others are waiting\n  * @ctx:\tspu context to yield\n  *\n- * Check if there is a higher priority context waiting and if yes\n- * unbind @ctx from the physical spu and schedule the highest\n- * priority context to run on the freed physical spu instead.\n+ * Check if there is a higher priority gang waiting and if yes\n+ * unbind @ctx and any other within the same gang freeing one or\n+ * more physical spus and schedule the highest priority gang\n+ * to run on the freed physical spu(s) instead.  The gang is added\n+ * back to the runqueue, so that it may be resumed by the scheduler\n+ * as soon as there are idle spus.\n  */\n void spu_yield(struct spu_context *ctx)\n {\n-\tspu_context_nospu_trace(spu_yield__enter, ctx);\n+\tstruct spu_gang *gang = ctx->gang;\n+\tint ret;\n+\n+\tspu_context_nospu_trace(spu_yield__enter, gang);\n \tif (!(ctx->flags & SPU_CREATE_NOSCHED)) {\n-\t\tmutex_lock(&ctx->state_mutex);\n-\t\t__spu_deactivate(ctx, 0, MAX_PRIO);\n-\t\tmutex_unlock(&ctx->state_mutex);\n+\t\tmutex_lock(&gang->mutex);\n+\t\tret = __spu_deactivate(gang, 0, MAX_PRIO);\n+\t\tif (ret)\n+\t\t\tspu_add_to_rq(gang);\n+\t\tmutex_unlock(&gang->mutex);\n \t}\n }\n \n-static noinline void spusched_tick(struct spu_context *ctx)\n+static noinline int spusched_tick(struct spu_gang *gang,\n+\t\t\t\t\tstruct spu_context *ctx,\n+\t\t\t\t\tint preempt)\n {\n-\tstruct spu_context *new = NULL;\n-\tstruct spu *spu = NULL;\n+\tint best, yield;\n+\tint npreempted = 0;\n \n-\tif (spu_acquire(ctx))\n-\t\tBUG();\t/* a kernel thread never has signals pending */\n+\tif (test_bit(SPU_SCHED_DEACTIVATED, &gang->sched_flags))\n+\t\treturn 0;\n \n-\tif (ctx->state != SPU_STATE_RUNNABLE)\n-\t\tgoto out;\n-\tif (ctx->flags & SPU_CREATE_NOSCHED)\n-\t\tgoto out;\n-\tif (ctx->policy == SCHED_FIFO)\n-\t\tgoto out;\n+\tmutex_lock(&gang->mutex);\n \n-\tif (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))\n-\t\tgoto out;\n+\tBUG_ON(!spu_gang_runnable(gang));\n \n-\tspu = ctx->spu;\n+\tif (!ctx->spu)\n+\t\tgoto out;\n \n-\tspu_context_trace(spusched_tick__preempt, ctx, spu);\n+\tif (ctx->flags & SPU_CREATE_NOSCHED)\n+\t\tgoto out;\n \n-\tnew = grab_runnable_context(ctx->prio + 1, spu->node);\n-\tif (new) {\n-\t\tspu_unschedule(spu, ctx, 0);\n-\t\tif (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))\n-\t\t\tspu_add_to_rq(ctx);\n-\t} else {\n+\t/*\n+\t * If nrunnable is zero, then all of 
the contexts are in user mode.\n+\t */\n+\tyield = !atomic_read(&gang->nrunnable);\n+\n+\tif (yield || ((ctx->policy != SCHED_FIFO) && (!--ctx->time_slice))) {\n+\t\tif (spu_prio->nr_waiting) {\n+\t\t\tbest = find_first_bit(spu_prio->bitmap, gang->prio);\n+\t\t\tif (yield || (preempt && best <= gang->prio)) {\n+\t\t\t\tspu_context_trace(spusched_tick__preempt,\n+\t\t\t\t\t\t\tctx, ctx->spu);\n+\t\t\t\tnpreempted = gang->contexts;\n+\t\t\t\tspu_unschedule(gang);\n+\t\t\t\tspu_add_to_rq(gang);\n+\t\t\t\tgoto out;\n+\t\t\t}\n+\t\t}\n \t\tspu_context_nospu_trace(spusched_tick__newslice, ctx);\n-\t\tif (!ctx->time_slice)\n+\t\tif (ctx->policy != SCHED_FIFO)\n \t\t\tctx->time_slice++;\n \t}\n out:\n-\tspu_release(ctx);\n+\tmutex_unlock(&gang->mutex);\n \n-\tif (new)\n-\t\tspu_schedule(spu, new);\n+\treturn npreempted;\n }\n \n /**\n@@ -1010,30 +1306,104 @@ static void spuloadavg_wake(unsigned long data)\n \n static int spusched_thread(void *unused)\n {\n+\tstruct spu_gang *gang;\n+\tstruct spu_context *ctx;\n \tstruct spu *spu;\n-\tint node;\n+\tint node, active, pnode, preempted, maxcontexts, retry, deactivated;\n \n \twhile (!kthread_should_stop()) {\n \t\tset_current_state(TASK_INTERRUPTIBLE);\n \t\tschedule();\n-\t\tfor (node = 0; node < MAX_NUMNODES; node++) {\n+\n+\t\t/*\n+\t\t * Try to limit the number of context switches because\n+\t\t * we preempt all of the contexts first and then place\n+\t\t * them.  We preempt first as the next gang to run may\n+\t\t * not fit.  Also, it helps us place NUMA and SPU affinity\n+\t\t * jobs as they have special requirements.  Not that we\n+\t\t * are doing that now, but we could. The downside is\n+\t\t * that we will have a little more idle time.  What we\n+\t\t * want to avoid is preempting and then rescheduling\n+\t\t * the same job as it creates hicups in the execution.\n+\t\t */\n+\t\tmaxcontexts = spu_prio->nr_contexts;\n+\t\tpreempted = 0;\n+\n+\t\t/*\n+\t\t * Time slice contexts first assuming that there are\n+\t\t * spe jobs waiting to run.  The next gang to be scheduled\n+\t\t * is of indeterminate size, so it is not sufficient to\n+\t\t * follow the traditional mode which assumes a 1:1\n+\t\t * replacement (1 spu : 1 spe context).\n+\t\t */\n+\t\tfor (node = 0, active = 0; node < MAX_NUMNODES; node++) {\n \t\t\tspinlock_t *l = &cbe_spu_info[node].list_lock;\n \n \t\t\tspin_lock(l);\n \t\t\tlist_for_each_entry(spu, &cbe_spu_info[node].spus,\n \t\t\t\t\tcbe_list) {\n-\t\t\t\tstruct spu_context *ctx = spu->ctx;\n-\n-\t\t\t\tif (ctx) {\n+\t\t\t\tif (spu->alloc_state == SPU_USED) {\n+\t\t\t\t\tgang = spu->gang;\n+\t\t\t\t\tBUG_ON(!gang);\n+\t\t\t\t\tctx = spu->ctx;\n \t\t\t\t\tget_spu_context(ctx);\n \t\t\t\t\tspin_unlock(l);\n-\t\t\t\t\tspusched_tick(ctx);\n+\t\t\t\t\tpreempted += spusched_tick(gang, ctx,\n+\t\t\t\t\t\tpreempted < maxcontexts);\n \t\t\t\t\tspin_lock(l);\n \t\t\t\t\tput_spu_context(ctx);\n+\t\t\t\t\tactive++;\n \t\t\t\t}\n \t\t\t}\n \t\t\tspin_unlock(l);\n \t\t}\n+\n+\t\tspu_sched_stats(spusched_thread__timeslice,\n+\t\t\t\t\tmaxcontexts, active, preempted);\n+\n+\t\t/*\n+\t\t * Place as many gangs as possible.  A gang might fail to\n+\t\t * be placed if it has NUMA bindings, SPU Affinity, or is\n+\t\t * to big to fit.  
On failure, the gang is added to the\n+\t\t * back of the runqueue, which effectively allows us to\n+\t\t * skip over a job, assuming it is not the only one at a\n+\t\t * given priority level.\n+\t\t */\n+\t\tretry = 0;\n+\t\tactive = 0;\n+\t\twhile (retry < maxcontexts) {\n+\n+\t\t\tgang = grab_runnable_gang(NULL, MAX_PRIO,\n+\t\t\t\t\t&node, &pnode);\n+\t\t\tif (!gang)\n+\t\t\t\tbreak;\n+\n+\t\t\t/*\n+\t\t\t * Must recheck state as we have dropped all locks.\n+\t\t\t * It could be running, deactivated, or destroyed. The\n+\t\t\t * latter is still a problem. See find_victim (XXX).\n+\t\t\t */\n+\t\t\tdeactivated = test_bit(SPU_SCHED_DEACTIVATED,\n+\t\t\t\t\t\t&gang->sched_flags);\n+\t\t\tif (!deactivated) {\n+\t\t\t\tmutex_lock(&gang->mutex);\n+\t\t\t\tctx = list_first_entry(&gang->list,\n+\t\t\t\t\tstruct spu_context, gang_list);\n+\t\t\t\tif (!ctx->spu) {\n+\t\t\t\t\tif (spu_get_idle(gang, node)) {\n+\t\t\t\t\t\tspu_add_to_rq(gang);\n+\t\t\t\t\t\tretry++;\n+\t\t\t\t\t} else {\n+\t\t\t\t\t\t__spu_schedule(gang, node);\n+\t\t\t\t\t\tactive++;\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t\tmutex_unlock(&gang->mutex);\n+\t\t\t}\n+\t\t}\n+\n+\t\tspu_sched_stats(spusched_thread__scheduled,\n+\t\t\t\tspu_prio->nr_contexts, active, retry);\n \t}\n \n \treturn 0;\n@@ -1171,8 +1541,10 @@ void spu_sched_exit(void)\n \tfor (node = 0; node < MAX_NUMNODES; node++) {\n \t\tspin_lock(&cbe_spu_info[node].list_lock);\n \t\tlist_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)\n-\t\t\tif (spu->alloc_state != SPU_FREE)\n+\t\t\tif (spu->alloc_state != SPU_FREE) {\n \t\t\t\tspu->alloc_state = SPU_FREE;\n+\t\t\t\tspu->gang = NULL;\n+\t\t\t}\n \t\tspin_unlock(&cbe_spu_info[node].list_lock);\n \t}\n \tkfree(spu_prio);\ndiff --git a/arch/powerpc/platforms/cell/spufs/spufs.h \nb/arch/powerpc/platforms/cell/spufs/spufs.h\nindex 03f0a3c..f7d3e96 100644\n--- a/arch/powerpc/platforms/cell/spufs/spufs.h\n+++ b/arch/powerpc/platforms/cell/spufs/spufs.h\n@@ -47,11 +47,12 @@ enum {\n struct spu_context_ops;\n struct spu_gang;\n \n-/* ctx->sched_flags */\n+/* ctx->sched_flags and gang->sched_flags */\n enum {\n \tSPU_SCHED_NOTIFY_ACTIVE,\n-\tSPU_SCHED_WAS_ACTIVE,\t/* was active upon spu_acquire_saved()  */\n \tSPU_SCHED_SPU_RUN,\t/* context is within spu_run */\n+\tSPU_SCHED_JUST_CREATED,\t/* context created but not started */\n+\tSPU_SCHED_DEACTIVATED,\t/* gang has been deactivated */\n };\n \n enum {\n@@ -122,7 +123,6 @@ struct spu_context {\n \tpid_t tid;\n \n \t/* scheduler fields */\n-\tstruct list_head rq;\n \tunsigned int time_slice;\n \tunsigned long sched_flags;\n \tcpumask_t cpus_allowed;\n@@ -166,7 +166,12 @@ struct spu_gang {\n \tcpumask_t cpus_allowed;\n \tint policy;\n \tint prio;\n+\tatomic_t nstarted;\n+\tatomic_t nrunnable;\n+\tunsigned long sched_flags;\n+\tstruct list_head rq;\n \n+\t/* spu scheduler affinity fields */\n \tstruct spu_context *aff_ref_ctx;\n \tstruct list_head aff_list_head;\n \tstruct mutex aff_mutex;\n@@ -175,6 +180,11 @@ struct spu_gang {\n \tatomic_t aff_sched_count;\n };\n \n+static inline int spu_gang_runnable(struct spu_gang *g)\n+{\n+\treturn (g->contexts && (atomic_read(&g->nstarted) == g->contexts));\n+}\n+\n /* Flag bits for spu_gang aff_flags */\n #define AFF_OFFSETS_SET\t\t1\n #define AFF_MERGED\t\t2\n@@ -299,9 +309,8 @@ int __must_check spu_acquire_saved(struct spu_context \n*ctx);\n void spu_release_saved(struct spu_context *ctx);\n \n int spu_stopped(struct spu_context *ctx, u32 * stat);\n-void spu_del_from_rq(struct spu_context *ctx);\n int spu_activate(struct spu_context 
*ctx, unsigned long flags);\n-void spu_deactivate(struct spu_context *ctx);\n+void spu_deactivate(struct spu_gang *gang);\n void spu_yield(struct spu_context *ctx);\n void spu_switch_notify(struct spu *spu, struct spu_context *ctx);\n",
    "prefixes": []
}
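
The response embeds links to related resources (mbox, comments, checks) that a client can follow directly rather than constructing URLs by hand. A short sketch, again assuming the server and patch ID shown above:

    import requests

    patch = requests.get("http://patchwork.ozlabs.org/api/patches/253/").json()

    # Raw patch in mbox form, via the embedded "mbox" link.
    mbox = requests.get(patch["mbox"]).text

    # Comment and check collections referenced by the resource.
    comments = requests.get(patch["comments"]).json()
    checks = requests.get(patch["checks"]).json()

    print(patch["state"], patch["check"], len(comments), len(checks))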