
[RFC,v2,10/17] WIP: gpu: host1x: Add no-recovery mode

Message ID: 20200905103420.3021852-11-mperttunen@nvidia.com
State: Changes Requested
Series: Host1x/TegraDRM UAPI

Commit Message

Mikko Perttunen Sept. 5, 2020, 10:34 a.m. UTC
Add a new property for jobs to enable or disable recovery, i.e.
CPU increments of syncpoints to the maximum value on job timeout. This
allows for a more solid model for hung jobs, where userspace
doesn't need to guess whether a syncpoint increment happened because
the job completed or because the job timeout was triggered.
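
For reference, the recovery path being made optional here amounts to
roughly the following. This is a simplified, illustrative sketch only;
the real logic lives in host1x_cdma_update_sync_queue(), and the
wrap-safe comparison below is an assumption of this sketch:

/*
 * Simplified sketch of syncpoint recovery: fast-forward a hung job's
 * syncpoint by doing its remaining increments from the CPU so that
 * external waiters still complete.
 */
static void fast_forward_job_syncpt(struct host1x_job *job)
{
	u32 val = host1x_syncpt_read_min(job->syncpt);

	while ((s32)(job->syncpt_end - val) > 0) {
		host1x_syncpt_incr(job->syncpt);
		val++;
	}
}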

On job timeout, we stop the channel, NOP all future jobs on the
channel using the same syncpoint, mark the syncpoint as locked
and resume the channel from the next job, if any.

The future jobs are NOPed because, since we don't do the CPU
increments, the value of the syncpoint is no longer synchronized,
and any waiters would become confused if a future job incremented
the syncpoint. The syncpoint is marked locked to ensure that future
jobs cannot increment the syncpoint either, until the application
has recognized the situation and reallocated the syncpoint.

WIP: There is a race condition between the locking and submission:
* A submission passes the locking check
* A concurrent, already-submitted job times out, locking the syncpoint
* The submission still goes ahead
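
A pseudo-code sketch of that interleaving (illustration only, not part
of the patch):

/*
 *   submission path (channel_submit)    timeout path (CDMA timeout handler)
 *
 *   if (job->syncpt->locked)            ...
 *           return -EPERM;   <-- check passes; syncpoint not locked yet
 *                                       host1x_syncpt_set_locked(job->syncpt);
 *   ...
 *   host1x_cdma_push(...);   <-- the job is still pushed, on a syncpoint
 *                                that has meanwhile been locked
 */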

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c        |  1 +
 drivers/gpu/host1x/cdma.c          | 42 +++++++++++++++++++++++++-----
 drivers/gpu/host1x/hw/channel_hw.c |  6 ++++-
 drivers/gpu/host1x/job.c           |  4 +++
 drivers/gpu/host1x/syncpt.c        |  2 ++
 drivers/gpu/host1x/syncpt.h        | 12 +++++++++
 include/linux/host1x.h             |  9 +++++++
 7 files changed, 69 insertions(+), 7 deletions(-)

Comments

Dmitry Osipenko Sept. 11, 2020, 4:40 p.m. UTC | #1
05.09.2020 13:34, Mikko Perttunen wrote:
> +	} else {
> +		struct host1x_job *failed_job = job;
> +
> +		host1x_job_dump(dev, job);
> +
> +		host1x_syncpt_set_locked(job->syncpt);
> +		failed_job->cancelled = true;
> +
> +		list_for_each_entry_continue(job, &cdma->sync_queue, list) {
> +			unsigned int i;
> +
> +			if (job->syncpt != failed_job->syncpt)
> +				continue;
> +
> +			for (i = 0; i < job->num_slots; i++) {
> +				unsigned int slot = (job->first_get/8 + i) %
> +						    HOST1X_PUSHBUFFER_SLOTS;
> +				u32 *mapped = cdma->push_buffer.mapped;
> +
> +				mapped[2*slot+0] = 0x1bad0000;
> +				mapped[2*slot+1] = 0x1bad0000;

The 0x1bad0000 is a valid memory address on Tegra20.

The 0x60000000 is an invalid phys address for all hardware generations.
It's used by grate-kernel [1] and the VDE driver [2]. Note that 0x6 <<
28 is also an invalid Host1x opcode, while 0x1 should break the CDMA
parser during PB debug-dumping.

[1]
https://github.com/grate-driver/linux/blob/master/drivers/gpu/drm/tegra/gem.h#L16

[2]
https://elixir.bootlin.com/linux/v5.9-rc4/source/drivers/staging/media/tegra-vde/iommu.c#L99

The VDE driver reserves the trapping IOVA addresses, I assume the Host1x
driver should do the same.
Mikko Perttunen Sept. 11, 2020, 10:11 p.m. UTC | #2
On 9/11/20 7:40 PM, Dmitry Osipenko wrote:
> 05.09.2020 13:34, Mikko Perttunen wrote:
>> +	} else {
>> +		struct host1x_job *failed_job = job;
>> +
>> +		host1x_job_dump(dev, job);
>> +
>> +		host1x_syncpt_set_locked(job->syncpt);
>> +		failed_job->cancelled = true;
>> +
>> +		list_for_each_entry_continue(job, &cdma->sync_queue, list) {
>> +			unsigned int i;
>> +
>> +			if (job->syncpt != failed_job->syncpt)
>> +				continue;
>> +
>> +			for (i = 0; i < job->num_slots; i++) {
>> +				unsigned int slot = (job->first_get/8 + i) %
>> +						    HOST1X_PUSHBUFFER_SLOTS;
>> +				u32 *mapped = cdma->push_buffer.mapped;
>> +
>> +				mapped[2*slot+0] = 0x1bad0000;
>> +				mapped[2*slot+1] = 0x1bad0000;
> 
> The 0x1bad0000 is a valid memory address on Tegra20.
> 
> The 0x60000000 is invalid phys address for all hardware generations.
> It's used by grate-kernel [1] and VDE driver [2]. Note that the 0x6 <<
> 28 is also invalid Host1x opcode, while 0x1 should break CDMA parser
> during of PB debug-dumping.
> 
> [1]
> https://github.com/grate-driver/linux/blob/master/drivers/gpu/drm/tegra/gem.h#L16
> 
> [2]
> https://elixir.bootlin.com/linux/v5.9-rc4/source/drivers/staging/media/tegra-vde/iommu.c#L99
> 
> The VDE driver reserves the trapping IOVA addresses, I assume the Host1x
> driver should do the same.
> 

The 0x1bad0000's are not intended to be memory addresses; they are NOOP 
opcodes (an INCR of 0 words to offset 0xbad). I'll fix this to use proper 
functions to construct the opcodes and add some comments. These need to 
be NOOP opcodes so that the command parser skips over these "cancelled" 
jobs when the channel is resumed.
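
For illustration, the opcode encoding in question looks roughly like
this (the helper mirrors the ones under drivers/gpu/host1x/hw/; exact
names may differ):

/* Illustrative only; mirrors the opcode helpers under drivers/gpu/host1x/hw/. */
static inline u32 host1x_opcode_incr(unsigned int offset, unsigned int count)
{
	/* opcode 1 = INCR: write 'count' words starting at register 'offset' */
	return (1 << 28) | (offset << 16) | count;
}

/* An INCR of 0 words to offset 0xbad is effectively a NOP for the parser: */
u32 nop = host1x_opcode_incr(0xbad, 0);	/* == 0x1bad0000 */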

BTW, 0x60000000 is valid on Tegra194 and later.

Mikko
Dmitry Osipenko Sept. 12, 2020, 12:53 p.m. UTC | #3
12.09.2020 01:11, Mikko Perttunen wrote:
> On 9/11/20 7:40 PM, Dmitry Osipenko wrote:
>> 05.09.2020 13:34, Mikko Perttunen wrote:
>>> +    } else {
>>> +        struct host1x_job *failed_job = job;
>>> +
>>> +        host1x_job_dump(dev, job);
>>> +
>>> +        host1x_syncpt_set_locked(job->syncpt);
>>> +        failed_job->cancelled = true;
>>> +
>>> +        list_for_each_entry_continue(job, &cdma->sync_queue, list) {
>>> +            unsigned int i;
>>> +
>>> +            if (job->syncpt != failed_job->syncpt)
>>> +                continue;
>>> +
>>> +            for (i = 0; i < job->num_slots; i++) {
>>> +                unsigned int slot = (job->first_get/8 + i) %
>>> +                            HOST1X_PUSHBUFFER_SLOTS;
>>> +                u32 *mapped = cdma->push_buffer.mapped;
>>> +
>>> +                mapped[2*slot+0] = 0x1bad0000;
>>> +                mapped[2*slot+1] = 0x1bad0000;
>>
>> The 0x1bad0000 is a valid memory address on Tegra20.
>>
>> The 0x60000000 is invalid phys address for all hardware generations.
>> It's used by grate-kernel [1] and VDE driver [2]. Note that the 0x6 <<
>> 28 is also invalid Host1x opcode, while 0x1 should break CDMA parser
>> during of PB debug-dumping.
>>
>> [1]
>> https://github.com/grate-driver/linux/blob/master/drivers/gpu/drm/tegra/gem.h#L16
>>
>>
>> [2]
>> https://elixir.bootlin.com/linux/v5.9-rc4/source/drivers/staging/media/tegra-vde/iommu.c#L99
>>
>>
>> The VDE driver reserves the trapping IOVA addresses, I assume the Host1x
>> driver should do the same.
>>
> 
> The 0x1bad0000's are not intended to be memory addresses, they are NOOP
> opcodes (INCR of 0 words to offset 0xbad). I'll fix this to use proper
> functions to construct the opcodes and add some comments. These need to
> be NOOP opcodes so the command parser skips over these "cancelled" jobs
> when the channel is resumed.
> 
> BTW, 0x60000000 is valid on Tegra194 and later.

At a quick glance it looked like a memory address :)

I'm now taking a closer look at this patch and it raises some more
questions. For example, looking at the "On job timeout, we stop
the channel, NOP all future jobs on the channel using the same syncpoint
..." part through the prism of grate-kernel experience, I'm not sure how
it could co-exist with the drm-scheduler, or why it's needed at all. But
I think we need a feature-complete version (at least a rough version),
so that we can start the testing, and then it should be easier to review
and discuss such things.
Mikko Perttunen Sept. 12, 2020, 1:31 p.m. UTC | #4
On 9/12/20 3:53 PM, Dmitry Osipenko wrote:
> 12.09.2020 01:11, Mikko Perttunen wrote:
>> On 9/11/20 7:40 PM, Dmitry Osipenko wrote:
>>> 05.09.2020 13:34, Mikko Perttunen wrote:
>>>> +    } else {
>>>> +        struct host1x_job *failed_job = job;
>>>> +
>>>> +        host1x_job_dump(dev, job);
>>>> +
>>>> +        host1x_syncpt_set_locked(job->syncpt);
>>>> +        failed_job->cancelled = true;
>>>> +
>>>> +        list_for_each_entry_continue(job, &cdma->sync_queue, list) {
>>>> +            unsigned int i;
>>>> +
>>>> +            if (job->syncpt != failed_job->syncpt)
>>>> +                continue;
>>>> +
>>>> +            for (i = 0; i < job->num_slots; i++) {
>>>> +                unsigned int slot = (job->first_get/8 + i) %
>>>> +                            HOST1X_PUSHBUFFER_SLOTS;
>>>> +                u32 *mapped = cdma->push_buffer.mapped;
>>>> +
>>>> +                mapped[2*slot+0] = 0x1bad0000;
>>>> +                mapped[2*slot+1] = 0x1bad0000;
>>>
>>> The 0x1bad0000 is a valid memory address on Tegra20.
>>>
>>> The 0x60000000 is invalid phys address for all hardware generations.
>>> It's used by grate-kernel [1] and VDE driver [2]. Note that the 0x6 <<
>>> 28 is also invalid Host1x opcode, while 0x1 should break CDMA parser
>>> during of PB debug-dumping.
>>>
>>> [1]
>>> https://github.com/grate-driver/linux/blob/master/drivers/gpu/drm/tegra/gem.h#L16
>>>
>>>
>>> [2]
>>> https://elixir.bootlin.com/linux/v5.9-rc4/source/drivers/staging/media/tegra-vde/iommu.c#L99
>>>
>>>
>>> The VDE driver reserves the trapping IOVA addresses, I assume the Host1x
>>> driver should do the same.
>>>
>>
>> The 0x1bad0000's are not intended to be memory addresses, they are NOOP
>> opcodes (INCR of 0 words to offset 0xbad). I'll fix this to use proper
>> functions to construct the opcodes and add some comments. These need to
>> be NOOP opcodes so the command parser skips over these "cancelled" jobs
>> when the channel is resumed.
>>
>> BTW, 0x60000000 is valid on Tegra194 and later.
> 
> At a quick glance it looked like a memory address :)

It does look a bit like one :) I'll add a comment to make it clear.

> 
> I'm now taking a closer look at this patch and it raises some more
> questions, like for example by looking at the "On job timeout, we stop
> the channel, NOP all future jobs on the channel using the same syncpoint
> ..." through the prism of grate-kernel experience, I'm not sure how it
> could co-exist with the drm-scheduler and why it's needed at all. But I
> think we need a feature-complete version (at least a rough version), so
> that we could start the testing, and then it should be easier to review
> and discuss such things.
> 

The reason this is needed is that if a job times out and we don't do its 
syncpoint increments on the CPU, then a successive job incrementing that 
same syncpoint would cause fences to signal incorrectly. The job that 
was supposed to signal those fences didn't actually run; and any data 
those fences were protecting would still be garbage.
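
A made-up numeric example of that failure mode:

/* Made-up numbers, purely for illustration. */
u32 base = 100;			/* syncpoint value before submission */
u32 fence_a = base + 2;		/* job A does 2 increments -> threshold 102 */
u32 fence_b = fence_a + 2;	/* job B does 2 increments -> threshold 104 */

/*
 * If job A hangs and is skipped without doing its increments on the CPU,
 * job B's two increments move the syncpoint from 100 to 102: fence_a
 * signals even though job A never ran and its output is still garbage.
 */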
Dmitry Osipenko Sept. 12, 2020, 9:51 p.m. UTC | #5
12.09.2020 16:31, Mikko Perttunen wrote:
...
>> I'm now taking a closer look at this patch and it raises some more
>> questions, like for example by looking at the "On job timeout, we stop
>> the channel, NOP all future jobs on the channel using the same syncpoint
>> ..." through the prism of grate-kernel experience, I'm not sure how it
>> could co-exist with the drm-scheduler and why it's needed at all. But I
>> think we need a feature-complete version (at least a rough version), so
>> that we could start the testing, and then it should be easier to review
>> and discuss such things.
>>
> 
> The reason this is needed is that if a job times out and we don't do its
> syncpoint increments on the CPU, then a successive job incrementing that
> same syncpoint would cause fences to signal incorrectly. The job that
> was supposed to signal those fences didn't actually run; and any data
> those fences were protecting would still be garbage.

I'll need to re-read the previous discussion because IIRC, I was
suggesting that once a job is hung, all jobs should be removed from
the queue/PB and re-submitted; the re-submitted jobs will then use the
new/updated sync point base.

And we probably need another drm_tegra_submit_cmd type that waits
for a relative sync point increment. The
drm_tegra_submit_cmd_wait_syncpt uses an absolute sync point value and
it shouldn't be used for sync point increments that are internal to a
job because it complicates the recovery.

All waits that are internal to a job should only wait for relative sync
point increments.

In the grate-kernel every job uses a unique-and-clean sync point (which
is also internal to the kernel driver) and a relative wait [1] is used
for the job's internal sync point increments [2][3][4]; thus, the kernel
driver simply jumps over a hung job by updating DMAGET to point at the
start of the next job.

[1]
https://github.com/grate-driver/linux/blob/master/drivers/gpu/drm/tegra/uapi/patching.c#L367

[2]
https://github.com/grate-driver/xf86-video-opentegra/blob/master/src/gpu/gr3d.c#L486
[3]
https://github.com/grate-driver/xf86-video-opentegra/blob/master/src/exa/copy_2d.c#L389
[4]
https://github.com/grate-driver/xf86-video-opentegra/blob/master/src/gpu/tegra_stream_v2.c#L536
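
To illustrate the relative-wait idea, a minimal sketch follows; the
struct and helper below are made up for this example and are neither the
grate-kernel UAPI nor the RFC UAPI:

/* Illustrative only; not the grate-kernel UAPI and not the RFC UAPI. */
struct wait_syncpt_relative {
	u32 syncpt_id;
	u32 delta;		/* increments since the start of this job */
};

/*
 * At (re-)submission time the kernel knows the job's base syncpoint
 * value, so a relative wait can be patched into an absolute threshold.
 * If a hung job is skipped and the remaining jobs are re-submitted,
 * their waits are re-resolved against the new base and remain correct.
 */
static u32 resolve_wait(u32 job_base_value,
			const struct wait_syncpt_relative *wait)
{
	return job_base_value + wait->delta;	/* wraps naturally in u32 */
}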
Mikko Perttunen Sept. 13, 2020, 9:51 a.m. UTC | #6
On 9/13/20 12:51 AM, Dmitry Osipenko wrote:
> 12.09.2020 16:31, Mikko Perttunen wrote:
> ...
>>> I'm now taking a closer look at this patch and it raises some more
>>> questions, like for example by looking at the "On job timeout, we stop
>>> the channel, NOP all future jobs on the channel using the same syncpoint
>>> ..." through the prism of grate-kernel experience, I'm not sure how it
>>> could co-exist with the drm-scheduler and why it's needed at all. But I
>>> think we need a feature-complete version (at least a rough version), so
>>> that we could start the testing, and then it should be easier to review
>>> and discuss such things.
>>>
>>
>> The reason this is needed is that if a job times out and we don't do its
>> syncpoint increments on the CPU, then a successive job incrementing that
>> same syncpoint would cause fences to signal incorrectly. The job that
>> was supposed to signal those fences didn't actually run; and any data
>> those fences were protecting would still be garbage.
> 
> I'll need to re-read the previous discussion because IIRC, I was
> suggesting that once job is hung, all jobs should be removed from
> queue/PB and re-submitted, then the re-submitted jobs will use the
> new/updated sync point base.
> And we probably should need another drm_tegra_submit_cmd type that waits
> for a relative sync point increment. The
> drm_tegra_submit_cmd_wait_syncpt uses absolute sync point value and it
> shouldn't be used for sync point increments that are internal to a job
> because it complicates the recovery.
> 
> All waits that are internal to a job should only wait for relative sync
> point increments.
> In the grate-kernel every job uses unique-and-clean sync point (which is
> also internal to the kernel driver) and a relative wait [1] is used for
> the job's internal sync point increments [2][3][4], and thus, kernel
> driver simply jumps over a hung job by updating DMAGET to point at the
> start of a next job.

Issues I have with this approach:

* Both this and my approach place a requirement on userspace: if a job
hangs, userspace must ensure all external waiters have timed out or been
stopped before the syncpoint can be freed, because if the syncpoint gets
reused before then, false waiter completions can happen.

So freeing the syncpoint must be exposed to userspace. The kernel cannot 
do this since there may be waiters that the kernel is not aware of. My 
proposal only has one syncpoint, which I feel makes this part simpler, too.

* I believe this proposal requires allocating a syncpoint for each 
externally visible syncpoint increment that the job does. This can use 
up quite a few syncpoints, and it makes syncpoints a dynamically 
allocated resource with unbounded allocation latency. This is a problem 
for safety-related systems.

* If a job fails on a "virtual channel" (userctx), I think it's a 
reasonable expectation that further jobs on that "virtual channel" will 
not execute, and I think implementing that model is simpler than doing 
recovery.

Mikko

> 
> [1]
> https://github.com/grate-driver/linux/blob/master/drivers/gpu/drm/tegra/uapi/patching.c#L367
> 
> [2]
> https://github.com/grate-driver/xf86-video-opentegra/blob/master/src/gpu/gr3d.c#L486
> [3]
> https://github.com/grate-driver/xf86-video-opentegra/blob/master/src/exa/copy_2d.c#L389
> [4]
> https://github.com/grate-driver/xf86-video-opentegra/blob/master/src/gpu/tegra_stream_v2.c#L536
>
Dmitry Osipenko Sept. 13, 2020, 6:37 p.m. UTC | #7
13.09.2020 12:51, Mikko Perttunen wrote:
...
>> All waits that are internal to a job should only wait for relative sync
>> point increments.
>> In the grate-kernel every job uses unique-and-clean sync point (which is
>> also internal to the kernel driver) and a relative wait [1] is used for
>> the job's internal sync point increments [2][3][4], and thus, kernel
>> driver simply jumps over a hung job by updating DMAGET to point at the
>> start of a next job.
> 
> Issues I have with this approach:
> 
> * Both this and my approach have the requirement for userspace, that if
> a job hangs, the userspace must ensure all external waiters have timed
> out / been stopped before the syncpoint can be freed, as if the
> syncpoint gets reused before then, false waiter completions can happen.
> 
> So freeing the syncpoint must be exposed to userspace. The kernel cannot
> do this since there may be waiters that the kernel is not aware of. My
> proposal only has one syncpoint, which I feel makes this part simpler, too.
> 
> * I believe this proposal requires allocating a syncpoint for each
> externally visible syncpoint increment that the job does. This can use
> up quite a few syncpoints, and it makes syncpoints a dynamically
> allocated resource with unbounded allocation latency. This is a problem
> for safety-related systems.

Maybe we could have a special type of "shared" sync point that is
allocated per hardware engine? Then the shared SP won't be a scarce
resource and a job won't depend on it. The kernel or userspace driver
may take care of recovering the counter value of a shared SP when a job
hangs, or do whatever else is needed, without affecting the job's sync
point.

Primarily I'm not feeling very happy about retaining the job's sync
point recovery code because it was broken the last time I touched it and
grate-kernel works fine without it.

> * If a job fails on a "virtual channel" (userctx), I think it's a
> reasonable expectation that further jobs on that "virtual channel" will
> not execute, and I think implementing that model is simpler than doing
> recovery.

Couldn't jobs just use explicit fencing? Then a second job won't be
executed if the first job hangs and an explicit dependency is expressed.
I'm not sure the concept of a "virtual channel" is applicable to the
drm-scheduler.

I'll need to see a full-featured driver implementation and the test
cases that cover all the problems that you're worried about, because I'm
not aware of all the T124+ needs and seeing the code should help. Maybe
in the end your approach will be the best, but for now it's not clear :)
Mikko Perttunen Sept. 15, 2020, 10:57 a.m. UTC | #8
On 9/13/20 9:37 PM, Dmitry Osipenko wrote:
> 13.09.2020 12:51, Mikko Perttunen wrote:
> ...
>>> All waits that are internal to a job should only wait for relative sync
>>> point increments.
>>> In the grate-kernel every job uses unique-and-clean sync point (which is
>>> also internal to the kernel driver) and a relative wait [1] is used for
>>> the job's internal sync point increments [2][3][4], and thus, kernel
>>> driver simply jumps over a hung job by updating DMAGET to point at the
>>> start of a next job.
>>
>> Issues I have with this approach:
>>
>> * Both this and my approach have the requirement for userspace, that if
>> a job hangs, the userspace must ensure all external waiters have timed
>> out / been stopped before the syncpoint can be freed, as if the
>> syncpoint gets reused before then, false waiter completions can happen.
>>
>> So freeing the syncpoint must be exposed to userspace. The kernel cannot
>> do this since there may be waiters that the kernel is not aware of. My
>> proposal only has one syncpoint, which I feel makes this part simpler, too.
>>
>> * I believe this proposal requires allocating a syncpoint for each
>> externally visible syncpoint increment that the job does. This can use
>> up quite a few syncpoints, and it makes syncpoints a dynamically
>> allocated resource with unbounded allocation latency. This is a problem
>> for safety-related systems.
> 
> Maybe we could have a special type of a "shared" sync point that is
> allocated per-hardware engine? Then shared SP won't be a scarce resource
> and job won't depend on it. The kernel or userspace driver may take care
> of recovering the counter value of a shared SP when job hangs or do
> whatever else is needed without affecting the job's sync point.

Having a shared syncpoint opens up possibilities for interference 
between jobs (if we're not using the firewall, the HW cannot distinguish 
between jobs on the same channel), and doesn't work if there are 
multiple channels using the same engine, which we want to do for newer 
chips (for performance and virtualization reasons).

Even then, even if we need to allocate only one syncpoint per job, the 
issue still seems to be there.

> 
> Primarily I'm not feeling very happy about retaining the job's sync
> point recovery code because it was broken the last time I touched it and
> grate-kernel works fine without it.

I'm not planning to retain it any longer than necessary, which is until 
the staging interface is removed. Technically I can already remove it 
now -- that would cause any users of the staging interface to 
potentially behave weirdly if a job times out, but maybe we don't care 
about that all that much?

> 
>> * If a job fails on a "virtual channel" (userctx), I think it's a
>> reasonable expectation that further jobs on that "virtual channel" will
>> not execute, and I think implementing that model is simpler than doing
>> recovery.
> 
> Couldn't jobs just use explicit fencing? Then a second job won't be
> executed if first job hangs and explicit dependency is expressed. I'm
> not sure that concept of a "virtual channel" is applicable to drm-scheduler.

I assume what you mean is that each job incrementing a syncpoint would 
first wait for the preceding job incrementing that syncpoint to 
complete, by waiting for the preceding job's fence value.

I would consider what I do in this patch to be an optimization of that. 
Let's say we detect a timed-out job and just skip that job in the CDMA 
pushbuffer (but do not CPU-increment syncpoints); then every subsequent 
job using that syncpoint will eventually be detected as timed out and 
skipped as well. With the "NOPping" in this patch we just pre-emptively 
cancel those jobs so that we don't have to spend time waiting for those 
timeouts in the future. Functionally these should be the same, though.

The wait-for-preceding-job-to-complete thing should already be there in 
the form of the "serialize" operation if the jobs use the same syncpoint.
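
For reference, a sketch of that "serialize" wait, reconstructed from
memory of channel_submit(); helper names may not match exactly:

/* Sketch only; not copied verbatim from channel_submit(). */
static void push_serialize_wait(struct host1x_channel *ch,
				struct host1x_job *job)
{
	struct host1x_syncpt *sp = job->syncpt;

	/*
	 * Push a host-class WAIT_SYNCPT on the job's syncpoint at its current
	 * max before the job's gathers, so the engine stalls until the
	 * previous job using that syncpoint has completed (or has been
	 * fast-forwarded / cancelled).
	 */
	host1x_cdma_push(&ch->cdma,
			 host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
						host1x_uclass_wait_syncpt_r(), 1),
			 host1x_class_host_wait_syncpt(sp->id,
						       host1x_syncpt_read_max(sp)));
}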

So, if the DRM scheduler's current behavior is just skipping the 
timing-out job and continuing from the next job, that's functionally 
fine. But we could improve the DRM scheduler to also allow cancelling 
future jobs that we know will time out. That would, in essence, be 
"virtual channel" support.

Userspace still has options -- if it puts in other prefences, timeouts 
will happen as usual. If it wants to have multiple "threads" of 
execution where a timeout on one doesn't affect the others, it can use 
different syncpoints for them.

> 
> I'll need to see a full-featured driver implementation and the test
> cases that cover all the problems that you're worried about because I'm
> not aware about all the T124+ needs and seeing code should help. Maybe
> in the end yours approach will be the best, but for now it's not clear :)
> 

My primary goal is simplicity of the programming model and implementation. 
Regarding the resource management concerns, I can of course create a 
test case that allocates a lot of resources, but what I'm afraid of is 
that once we put this into a big system, with several VMs with their 
own resource reservations (including syncpoints), and the GPU and camera 
subsystems using hundreds of syncpoints, dynamic usage of those 
resources will create uncertainty in the system, and bug reports.

And of course, if we want to make a safety-related system, we also need 
to document beforehand how we ensure that e.g. job submission 
(including syncpoint allocation, if that is dynamic) happens in under x 
microseconds.

I don't think the model used in the grate host1x driver is bad, and I 
think the code there and its integration with the existing kernel 
frameworks are beautiful, and that is definitely a goal for the mainline 
driver as well. But I think we can make things even simpler overall and 
more reliable.

Mikko

Patch

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index ceea9db341f0..7437c67924aa 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -197,6 +197,7 @@  int tegra_drm_submit(struct tegra_drm_context *context,
 	job->client = client;
 	job->class = client->class;
 	job->serialize = true;
+	job->syncpt_recovery = true;
 
 	/*
 	 * Track referenced BOs so that they can be unreferenced after the
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 6e6ca774f68d..59ad4ca38292 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -312,10 +312,6 @@  static void update_cdma_locked(struct host1x_cdma *cdma)
 	bool signal = false;
 	struct host1x_job *job, *n;
 
-	/* If CDMA is stopped, queue is cleared and we can return */
-	if (!cdma->running)
-		return;
-
 	/*
 	 * Walk the sync queue, reading the sync point registers as necessary,
 	 * to consume as many sync queue entries as possible without blocking
@@ -324,7 +320,8 @@  static void update_cdma_locked(struct host1x_cdma *cdma)
 		struct host1x_syncpt *sp = job->syncpt;
 
 		/* Check whether this syncpt has completed, and bail if not */
-		if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) {
+		if (!host1x_syncpt_is_expired(sp, job->syncpt_end) &&
+		    !job->cancelled) {
 			/* Start timer on next pending syncpt */
 			if (job->timeout)
 				cdma_start_timer_locked(cdma, job);
@@ -413,8 +410,11 @@  void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 	else
 		restart_addr = cdma->last_pos;
 
+	if (!job)
+		goto resume;
+
 	/* do CPU increments for the remaining syncpts */
-	if (job) {
+	if (job->syncpt_recovery) {
 		dev_dbg(dev, "%s: perform CPU incr on pending buffers\n",
 			__func__);
 
@@ -433,8 +433,38 @@  void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 
 		dev_dbg(dev, "%s: finished sync_queue modification\n",
 			__func__);
+	} else {
+		struct host1x_job *failed_job = job;
+
+		host1x_job_dump(dev, job);
+
+		host1x_syncpt_set_locked(job->syncpt);
+		failed_job->cancelled = true;
+
+		list_for_each_entry_continue(job, &cdma->sync_queue, list) {
+			unsigned int i;
+
+			if (job->syncpt != failed_job->syncpt)
+				continue;
+
+			for (i = 0; i < job->num_slots; i++) {
+				unsigned int slot = (job->first_get/8 + i) %
+						    HOST1X_PUSHBUFFER_SLOTS;
+				u32 *mapped = cdma->push_buffer.mapped;
+
+				mapped[2*slot+0] = 0x1bad0000;
+				mapped[2*slot+1] = 0x1bad0000;
+			}
+
+			job->cancelled = true;
+		}
+
+		wmb();
+
+		update_cdma_locked(cdma);
 	}
 
+resume:
 	/* roll back DMAGET and start up channel again */
 	host1x_hw_cdma_resume(host1x, cdma, restart_addr);
 }
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index d4c28faf27d1..145746c6f6fb 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -129,6 +129,10 @@  static int channel_submit(struct host1x_job *job)
 				    job->num_gathers, job->num_relocs,
 				    job->syncpt->id, job->syncpt_incrs);
 
+	/* TODO this is racy */
+	if (job->syncpt->locked)
+		return -EPERM;
+
 	/* before error checks, return current max */
 	prev_max = job->syncpt_end = host1x_syncpt_read_max(sp);
 
@@ -191,7 +195,7 @@  static int channel_submit(struct host1x_job *job)
 	/* schedule a submit complete interrupt */
 	err = host1x_intr_add_action(host, sp, syncval,
 				     HOST1X_INTR_ACTION_SUBMIT_COMPLETE, ch,
-				     completed_waiter, NULL);
+				     completed_waiter, &job->waiter);
 	completed_waiter = NULL;
 	WARN(err, "Failed to set submit complete interrupt");
 
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index d8345d3bf0b3..e4f16fc899b0 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -79,6 +79,10 @@  static void job_free(struct kref *ref)
 {
 	struct host1x_job *job = container_of(ref, struct host1x_job, ref);
 
+	if (job->waiter)
+		host1x_intr_put_ref(job->syncpt->host, job->syncpt->id,
+				    job->waiter);
+
 	if (job->syncpt)
 		host1x_syncpt_put(job->syncpt);
 
diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index b31b994624fa..2fad8b2a55cc 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -385,6 +385,8 @@  static void syncpt_release(struct kref *ref)
 {
 	struct host1x_syncpt *sp = container_of(ref, struct host1x_syncpt, ref);
 
+	sp->locked = false;
+
 	mutex_lock(&sp->host->syncpt_mutex);
 
 	host1x_syncpt_base_free(sp->base);
diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
index eb49d7003743..d19461704ea2 100644
--- a/drivers/gpu/host1x/syncpt.h
+++ b/drivers/gpu/host1x/syncpt.h
@@ -40,6 +40,13 @@  struct host1x_syncpt {
 
 	/* interrupt data */
 	struct host1x_syncpt_intr intr;
+
+	/* 
+	 * If a submission incrementing this syncpoint fails, lock it so that
+	 * further submission cannot be made until application has handled the
+	 * failure.
+	 */
+	bool locked;
 };
 
 /* Initialize sync point array  */
@@ -120,4 +127,9 @@  struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
 					  struct host1x_client *client,
 					  unsigned long flags);
 
+static inline void host1x_syncpt_set_locked(struct host1x_syncpt *sp)
+{
+	sp->locked = true;
+}
+
 #endif
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 73a247e180a9..3ffe16152ebc 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -229,9 +229,15 @@  struct host1x_job {
 	u32 syncpt_incrs;
 	u32 syncpt_end;
 
+	/* Completion waiter ref */
+	void *waiter;
+
 	/* Maximum time to wait for this job */
 	unsigned int timeout;
 
+	/* Job has timed out and should be released */
+	bool cancelled;
+
 	/* Index and number of slots used in the push buffer */
 	unsigned int first_get;
 	unsigned int num_slots;
@@ -252,6 +258,9 @@  struct host1x_job {
 
 	/* Add a channel wait for previous ops to complete */
 	bool serialize;
+
+	/* Fast-forward syncpoint increments on job timeout */
+	bool syncpt_recovery;
 };
 
 struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,