diff mbox series

[04/10] gpu: host1x: Lock classes during job submission

Message ID 20171105110118.15142-5-mperttunen@nvidia.com
State Deferred
Headers show
Series Dynamic Host1x channel allocation | expand

Commit Message

Mikko Perttunen Nov. 5, 2017, 11:01 a.m. UTC
Host1x has a feature called MLOCKs which allow a certain class
(~HW unit) to be locked (in the mutex sense) and unlocked during
command execution, preventing other channels from accessing the
class while it is locked. This is necessary to prevent concurrent
jobs from messing up class state.

This has not been necessary so far since due to our channel allocation
model, there has only been a single hardware channel submitting
commands to each class. Future patches, however, change the channel
allocation model to allow hardware-scheduled concurrency, and as such
we need to start locking.

This patch implements locking on all platforms from Tegra20 to
Tegra186.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
---
 drivers/gpu/host1x/cdma.c                      |   1 +
 drivers/gpu/host1x/cdma.h                      |   1 +
 drivers/gpu/host1x/hw/cdma_hw.c                | 122 +++++++++++++++++++++++++
 drivers/gpu/host1x/hw/channel_hw.c             |  71 ++++++++++----
 drivers/gpu/host1x/hw/host1x01_hardware.h      |  10 ++
 drivers/gpu/host1x/hw/host1x02_hardware.h      |  10 ++
 drivers/gpu/host1x/hw/host1x04_hardware.h      |  10 ++
 drivers/gpu/host1x/hw/host1x05_hardware.h      |  10 ++
 drivers/gpu/host1x/hw/host1x06_hardware.h      |  10 ++
 drivers/gpu/host1x/hw/hw_host1x01_sync.h       |   6 ++
 drivers/gpu/host1x/hw/hw_host1x02_sync.h       |   6 ++
 drivers/gpu/host1x/hw/hw_host1x04_sync.h       |   6 ++
 drivers/gpu/host1x/hw/hw_host1x05_sync.h       |   6 ++
 drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h |   5 +
 14 files changed, 257 insertions(+), 17 deletions(-)

Comments

Dmitry Osipenko Nov. 5, 2017, 4:46 p.m. UTC | #1
On 05.11.2017 14:01, Mikko Perttunen wrote:
> Host1x has a feature called MLOCKs which allow a certain class
> (~HW unit) to be locked (in the mutex sense) and unlocked during
> command execution, preventing other channels from accessing the
> class while it is locked. This is necessary to prevent concurrent
> jobs from messing up class state.
> 
> This has not been necessary so far since due to our channel allocation
> model, there has only been a single hardware channel submitting
> commands to each class. Future patches, however, change the channel
> allocation model to allow hardware-scheduled concurrency, and as such
> we need to start locking.
> 
> This patch implements locking on all platforms from Tegra20 to
> Tegra186.
> 
> Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
> ---
>  drivers/gpu/host1x/cdma.c                      |   1 +
>  drivers/gpu/host1x/cdma.h                      |   1 +
>  drivers/gpu/host1x/hw/cdma_hw.c                | 122 +++++++++++++++++++++++++
>  drivers/gpu/host1x/hw/channel_hw.c             |  71 ++++++++++----
>  drivers/gpu/host1x/hw/host1x01_hardware.h      |  10 ++
>  drivers/gpu/host1x/hw/host1x02_hardware.h      |  10 ++
>  drivers/gpu/host1x/hw/host1x04_hardware.h      |  10 ++
>  drivers/gpu/host1x/hw/host1x05_hardware.h      |  10 ++
>  drivers/gpu/host1x/hw/host1x06_hardware.h      |  10 ++
>  drivers/gpu/host1x/hw/hw_host1x01_sync.h       |   6 ++
>  drivers/gpu/host1x/hw/hw_host1x02_sync.h       |   6 ++
>  drivers/gpu/host1x/hw/hw_host1x04_sync.h       |   6 ++
>  drivers/gpu/host1x/hw/hw_host1x05_sync.h       |   6 ++
>  drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h |   5 +
>  14 files changed, 257 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
> index 28541b280739..f787cfe69c11 100644
> --- a/drivers/gpu/host1x/cdma.c
> +++ b/drivers/gpu/host1x/cdma.c
> @@ -232,6 +232,7 @@ static void cdma_start_timer_locked(struct host1x_cdma *cdma,
>  	}
>  
>  	cdma->timeout.client = job->client;
> +	cdma->timeout.class = job->class;
>  	cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
>  	cdma->timeout.syncpt_val = job->syncpt_end;
>  	cdma->timeout.start_ktime = ktime_get();
> diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
> index 286d49386be9..e72660fc83c9 100644
> --- a/drivers/gpu/host1x/cdma.h
> +++ b/drivers/gpu/host1x/cdma.h
> @@ -59,6 +59,7 @@ struct buffer_timeout {
>  	ktime_t start_ktime;		/* starting time */
>  	/* context timeout information */
>  	int client;
> +	u32 class;
>  };
>  
>  enum cdma_event {
> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
> index ce320534cbed..4d5970d863d5 100644
> --- a/drivers/gpu/host1x/hw/cdma_hw.c
> +++ b/drivers/gpu/host1x/hw/cdma_hw.c
> @@ -16,6 +16,7 @@
>   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <linux/iopoll.h>
>  #include <linux/slab.h>
>  #include <linux/scatterlist.h>
>  #include <linux/dma-mapping.h>
> @@ -243,6 +244,125 @@ static void cdma_resume(struct host1x_cdma *cdma, u32 getptr)
>  	cdma_timeout_restart(cdma, getptr);
>  }
>  
> +static int mlock_id_for_class(unsigned int class)
> +{
> +#if HOST1X_HW >= 6
> +	switch (class)
> +	{
> +	case HOST1X_CLASS_HOST1X:
> +		return 0;
> +	case HOST1X_CLASS_VIC:
> +		return 17;

What is the meaning of returned ID values that you have defined here? Why VIC
should have different ID on T186?

> +	default:
> +		return -EINVAL;
> +	}
> +#else
> +	switch (class)
> +	{
> +	case HOST1X_CLASS_HOST1X:
> +		return 0;
> +	case HOST1X_CLASS_GR2D:
> +		return 1;
> +	case HOST1X_CLASS_GR2D_SB:
> +		return 2;

Note that we are allowing to switch 2d classes in the same jobs context and
currently jobs class is somewhat hardcoded to GR2D.

Even though that GR2D and GR2D_SB use different register banks, is it okay to
trigger execution of different classes simultaneously? Would syncpoint
differentiate classes on OP_DONE event?

I suppose that MLOCK (the module lock) implies the whole module locking,
wouldn't it make sense to just use the module ID's defined in the TRM?

> +	case HOST1X_CLASS_VIC:
> +		return 3;
> +	case HOST1X_CLASS_GR3D:
> +		return 4;
> +	default:
> +		return -EINVAL;
> +	}
> +#endif
> +}
> +
> +static void timeout_release_mlock(struct host1x_cdma *cdma)
> +{
> +#if HOST1X_HW >= 6
> +	struct host1x_channel *ch = cdma_to_channel(cdma);
> +	struct host1x *host = cdma_to_host1x(cdma);
> +	u32 pb_pos, pb_temp[3], val;
> +	int err, mlock_id;
> +
> +	if (!host->hv_regs)
> +		return;
> +
> +	mlock_id = mlock_id_for_class(cdma->timeout.class);
> +	if (WARN(mlock_id < 0, "Invalid class ID"))
> +		return;
> +
> +	val = host1x_hypervisor_readl(host, HOST1X_HV_MLOCK(mlock_id));
> +	if (!HOST1X_HV_MLOCK_LOCKED_V(val) ||
> +	    HOST1X_HV_MLOCK_CH_V(val) != ch->id)
> +	{
> +		/* Channel is not holding the MLOCK, nothing to release. */
> +		return;
> +	}
> +
> +	/*
> +	 * On Tegra186, there is no register to unlock an MLOCK (don't ask me
> +	 * why). As such, we have to execute a release_mlock instruction to
> +	 * do it. We do this by backing up the first three opcodes of the
> +	 * pushbuffer and replacing them with our own short sequence to do
> +	 * the unlocking. We set the .pos field to 12, which causes DMAEND
> +	 * to be set accordingly such that only the three opcodes we set
> +	 * here are executed before CDMA stops. Finally we restore the value
> +	 * of pos and pushbuffer contents.
> +	 */
> +
> +	pb_pos = cdma->push_buffer.pos;
> +	memcpy(pb_temp, cdma->push_buffer.mapped, ARRAY_SIZE(pb_temp) * 4);
> +
> +	{
> +		u32 *pb = cdma->push_buffer.mapped;
> +		pb[0] = host1x_opcode_acquire_mlock(cdma->timeout.class);
> +		pb[1] = host1x_opcode_setclass(cdma->timeout.class, 0, 0);
> +		pb[2] = host1x_opcode_release_mlock(cdma->timeout.class);
> +	}
> +
> +	/* Flush writecombine buffer */
> +	wmb();
> +
> +	cdma->push_buffer.pos = ARRAY_SIZE(pb_temp) * 4;
> +
> +	cdma_resume(cdma, 0);
> +
> +	/* Wait until the release_mlock opcode has been executed */
> +	err = readl_relaxed_poll_timeout(
> +		host->hv_regs + HOST1X_HV_MLOCK(mlock_id), val,
> +		!HOST1X_HV_MLOCK_LOCKED_V(val) ||
> +			HOST1X_HV_MLOCK_CH_V(val) != ch->id,
> +		10, 10000);
> +	WARN(err, "Failed to unlock mlock %u\n", mlock_id);
> +
> +	cdma_freeze(cdma);
> +
> +	/* Restore original pushbuffer state */
> +	cdma->push_buffer.pos = pb_pos;
> +	memcpy(cdma->push_buffer.mapped, pb_temp, ARRAY_SIZE(pb_temp) * 4);
> +	wmb();
> +#else
> +	struct host1x_channel *ch = cdma_to_channel(cdma);
> +	struct host1x *host = cdma_to_host1x(cdma);
> +	int mlock_id;
> +	u32 val;
> +
> +	mlock_id = mlock_id_for_class(cdma->timeout.class);
> +	if (WARN(mlock_id < 0, "Invalid class ID"))
> +		return;
> +
> +	val = host1x_sync_readl(host, HOST1X_SYNC_MLOCK_OWNER(mlock_id));
> +	if (!HOST1X_SYNC_MLOCK_OWNER_CH_OWNS_V(val) ||
> +	    HOST1X_SYNC_MLOCK_OWNER_CHID_V(val) != ch->id)
> +	{
> +		/* Channel is not holding the MLOCK, nothing to release. */
> +		return;
> +	}
> +
> +	/* Unlock MLOCK */
> +	host1x_sync_writel(host, 0x0, HOST1X_SYNC_MLOCK(mlock_id));
> +#endif
> +}
> +
>  /*
>   * If this timeout fires, it indicates the current sync_queue entry has
>   * exceeded its TTL and the userctx should be timed out and remaining
> @@ -293,6 +413,8 @@ static void cdma_timeout_handler(struct work_struct *work)
>  	/* stop HW, resetting channel/module */
>  	host1x_hw_cdma_freeze(host1x, cdma);
>  
> +	timeout_release_mlock(cdma);
> +
>  	host1x_cdma_update_sync_queue(cdma, ch->dev);
>  	mutex_unlock(&cdma->lock);
>  }
> diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
> index 246b78c41281..f80fb8be38e6 100644
> --- a/drivers/gpu/host1x/hw/channel_hw.c
> +++ b/drivers/gpu/host1x/hw/channel_hw.c
> @@ -89,6 +89,34 @@ static inline void synchronize_syncpt_base(struct host1x_job *job)
>  			 HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(value));
>  }
>  
> +static void channel_submit_serialize(struct host1x_channel *ch,
> +				     struct host1x_syncpt *sp)
> +{
> +#if HOST1X_HW >= 6
> +	/*
> +	 * On T186, due to a hw issue, we need to issue an mlock acquire
> +	 * for HOST1X class. It is still interpreted as a no-op in
> +	 * hardware.
> +	 */
> +	host1x_cdma_push(&ch->cdma,
> +		host1x_opcode_acquire_mlock(HOST1X_CLASS_HOST1X),
> +		host1x_opcode_setclass(HOST1X_CLASS_HOST1X, 0, 0));
> +	host1x_cdma_push(&ch->cdma,
> +		host1x_opcode_nonincr(host1x_uclass_wait_syncpt_r(), 1),
> +		host1x_class_host_wait_syncpt(host1x_syncpt_id(sp),
> +					      host1x_syncpt_read_max(sp)));
> +	host1x_cdma_push(&ch->cdma,
> +		host1x_opcode_release_mlock(HOST1X_CLASS_HOST1X),
> +		HOST1X_OPCODE_NOP);
> +#else
> +	host1x_cdma_push(&ch->cdma,
> +		host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
> +				       host1x_uclass_wait_syncpt_r(), 1),
> +		host1x_class_host_wait_syncpt(host1x_syncpt_id(sp),
> +					      host1x_syncpt_read_max(sp)));
> +#endif
> +}
> +
>  static int channel_submit(struct host1x_job *job)
>  {
>  	struct host1x_channel *ch = job->channel;
> @@ -96,6 +124,7 @@ static int channel_submit(struct host1x_job *job)
>  	u32 user_syncpt_incrs = job->syncpt_incrs;
>  	u32 prev_max = 0;
>  	u32 syncval;
> +	u32 mlock;
>  	int err;
>  	struct host1x_waitlist *completed_waiter = NULL;
>  	struct host1x *host = dev_get_drvdata(ch->dev->parent);
> @@ -128,17 +157,12 @@ static int channel_submit(struct host1x_job *job)
>  		goto error;
>  	}
>  
> -	if (job->serialize) {
> -		/*
> -		 * Force serialization by inserting a host wait for the
> -		 * previous job to finish before this one can commence.
> -		 */
> -		host1x_cdma_push(&ch->cdma,
> -				 host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
> -					host1x_uclass_wait_syncpt_r(), 1),
> -				 host1x_class_host_wait_syncpt(job->syncpt_id,
> -					host1x_syncpt_read_max(sp)));
> -	}
> +	/*
> +	 * Force serialization by inserting a host wait for the
> +	 * previous job to finish before this one can commence.
> +	 */
> +	if (job->serialize)
> +		channel_submit_serialize(ch, sp);
>  
>  	/* Synchronize base register to allow using it for relative waiting */
>  	if (sp->base)
> @@ -149,16 +173,29 @@ static int channel_submit(struct host1x_job *job)
>  	/* assign syncpoint to channel */
>  	host1x_hw_syncpt_assign_channel(host, sp, ch);
>  
> -	job->syncpt_end = syncval;
> -
> -	/* add a setclass for modules that require it */
> -	if (job->class)
> +	/* acquire MLOCK and set channel class to specified class */
> +	mlock = HOST1X_HW >= 6 ? job->class : mlock_id_for_class(job->class);
> +	if (job->class) {
>  		host1x_cdma_push(&ch->cdma,
> -				 host1x_opcode_setclass(job->class, 0, 0),
> -				 HOST1X_OPCODE_NOP);
> +				 host1x_opcode_acquire_mlock(mlock),
> +				 host1x_opcode_setclass(job->class, 0, 0));
> +	}
>  
>  	submit_gathers(job);
>  
> +	if (job->class) {
> +		/*
> +		 * Push additional increment to catch jobs that crash before
> +		 * finishing their gather, not reaching the unlock opcode.
> +		 */
> +		syncval = host1x_syncpt_incr_max(sp, 1);
> +		host1x_cdma_push(&ch->cdma,
> +			host1x_opcode_imm_incr_syncpt(0, job->syncpt_id),
> +			host1x_opcode_release_mlock(mlock));
> +	}
> +
> +	job->syncpt_end = syncval;
> +
>  	/* end CDMA submit & stash pinned hMems into sync queue */
>  	host1x_cdma_end(&ch->cdma, job);
>  
> diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
> index 5f0fb866efa8..6a2c9f905acc 100644
> --- a/drivers/gpu/host1x/hw/host1x01_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
> @@ -138,6 +138,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>  	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>  }
>  
> +static inline u32 host1x_opcode_acquire_mlock(unsigned id)
> +{
> +	return (14 << 28) | id;
> +}
> +
> +static inline u32 host1x_opcode_release_mlock(unsigned id)
> +{
> +	return (14 << 28) | (1 << 24) | id;
> +}
> +
>  #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>  
>  #endif
> diff --git a/drivers/gpu/host1x/hw/host1x02_hardware.h b/drivers/gpu/host1x/hw/host1x02_hardware.h
> index 154901860bc6..c524c6c8d82f 100644
> --- a/drivers/gpu/host1x/hw/host1x02_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x02_hardware.h
> @@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>  	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>  }
>  
> +static inline u32 host1x_opcode_acquire_mlock(unsigned id)
> +{
> +	return (14 << 28) | id;
> +}
> +
> +static inline u32 host1x_opcode_release_mlock(unsigned id)
> +{
> +	return (14 << 28) | (1 << 24) | id;
> +}
> +
>  #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>  
>  #endif
> diff --git a/drivers/gpu/host1x/hw/host1x04_hardware.h b/drivers/gpu/host1x/hw/host1x04_hardware.h
> index de1a38175328..8dedd77b7a1b 100644
> --- a/drivers/gpu/host1x/hw/host1x04_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x04_hardware.h
> @@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>  	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>  }
>  
> +static inline u32 host1x_opcode_acquire_mlock(unsigned id)
> +{
> +	return (14 << 28) | id;
> +}
> +
> +static inline u32 host1x_opcode_release_mlock(unsigned id)
> +{
> +	return (14 << 28) | (1 << 24) | id;
> +}
> +
>  #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>  
>  #endif
> diff --git a/drivers/gpu/host1x/hw/host1x05_hardware.h b/drivers/gpu/host1x/hw/host1x05_hardware.h
> index 2937ebb6be11..6aafcb5e97e6 100644
> --- a/drivers/gpu/host1x/hw/host1x05_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x05_hardware.h
> @@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>  	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>  }
>  
> +static inline u32 host1x_opcode_acquire_mlock(unsigned id)
> +{
> +	return (14 << 28) | id;
> +}
> +
> +static inline u32 host1x_opcode_release_mlock(unsigned id)
> +{
> +	return (14 << 28) | (1 << 24) | id;
> +}
> +
>  #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>  
>  #endif
> diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
> index 3039c92ea605..60147d26ad9b 100644
> --- a/drivers/gpu/host1x/hw/host1x06_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
> @@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
>  	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
>  }
>  
> +static inline u32 host1x_opcode_acquire_mlock(unsigned id)
> +{
> +	return (14 << 28) | id;
> +}
> +
> +static inline u32 host1x_opcode_release_mlock(unsigned id)
> +{
> +	return (14 << 28) | (1 << 24) | id;
> +}
> +
>  #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
>  
>  #endif
> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_sync.h b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
> index 31238c285d46..4630fec2237a 100644
> --- a/drivers/gpu/host1x/hw/hw_host1x01_sync.h
> +++ b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
> @@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
>  }
>  #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
>  	host1x_sync_ip_busy_timeout_r()
> +static inline u32 host1x_sync_mlock_r(unsigned int id)
> +{
> +	return 0x2c0 + id * REGISTER_STRIDE;
> +}
> +#define HOST1X_SYNC_MLOCK(id) \
> +	host1x_sync_mlock_r(id)
>  static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
>  {
>  	return 0x340 + id * REGISTER_STRIDE;
> diff --git a/drivers/gpu/host1x/hw/hw_host1x02_sync.h b/drivers/gpu/host1x/hw/hw_host1x02_sync.h
> index 540c7b65995f..dec8e812a114 100644
> --- a/drivers/gpu/host1x/hw/hw_host1x02_sync.h
> +++ b/drivers/gpu/host1x/hw/hw_host1x02_sync.h
> @@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
>  }
>  #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
>  	host1x_sync_ip_busy_timeout_r()
> +static inline u32 host1x_sync_mlock_r(unsigned int id)
> +{
> +	return 0x2c0 + id * REGISTER_STRIDE;
> +}
> +#define HOST1X_SYNC_MLOCK(id) \
> +	host1x_sync_mlock_r(id)
>  static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
>  {
>  	return 0x340 + id * REGISTER_STRIDE;
> diff --git a/drivers/gpu/host1x/hw/hw_host1x04_sync.h b/drivers/gpu/host1x/hw/hw_host1x04_sync.h
> index 3d6c8ec65934..9a951a4db07f 100644
> --- a/drivers/gpu/host1x/hw/hw_host1x04_sync.h
> +++ b/drivers/gpu/host1x/hw/hw_host1x04_sync.h
> @@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
>  }
>  #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
>  	host1x_sync_ip_busy_timeout_r()
> +static inline u32 host1x_sync_mlock_r(unsigned int id)
> +{
> +	return 0x2c0 + id * REGISTER_STRIDE;
> +}
> +#define HOST1X_SYNC_MLOCK(id) \
> +	host1x_sync_mlock_r(id)
>  static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
>  {
>  	return 0x340 + id * REGISTER_STRIDE;
> diff --git a/drivers/gpu/host1x/hw/hw_host1x05_sync.h b/drivers/gpu/host1x/hw/hw_host1x05_sync.h
> index ca10eee5045c..e29e2e19a60b 100644
> --- a/drivers/gpu/host1x/hw/hw_host1x05_sync.h
> +++ b/drivers/gpu/host1x/hw/hw_host1x05_sync.h
> @@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
>  }
>  #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
>  	host1x_sync_ip_busy_timeout_r()
> +static inline u32 host1x_sync_mlock_r(unsigned int id)
> +{
> +	return 0x2c0 + id * REGISTER_STRIDE;
> +}
> +#define HOST1X_SYNC_MLOCK(id) \
> +	host1x_sync_mlock_r(id)
>  static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
>  {
>  	return 0x340 + id * REGISTER_STRIDE;
> diff --git a/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h b/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
> index c05dab8a178b..be73530c3585 100644
> --- a/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
> +++ b/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
> @@ -18,6 +18,11 @@
>  #define HOST1X_HV_SYNCPT_PROT_EN			0x1ac4
>  #define HOST1X_HV_SYNCPT_PROT_EN_CH_EN			BIT(1)
>  #define HOST1X_HV_CH_KERNEL_FILTER_GBUFFER(x)		(0x2020 + (x * 4))
> +#define HOST1X_HV_MLOCK(x)				(0x2030 + (x * 4))
> +#define HOST1X_HV_MLOCK_CH(x)				(((x) & 0x3f) << 2)
> +#define HOST1X_HV_MLOCK_CH_V(x)				(((x) >> 2) & 0x3f)
> +#define HOST1X_HV_MLOCK_LOCKED				BIT(0)
> +#define HOST1X_HV_MLOCK_LOCKED_V(x)			((x) & 0x1)
>  #define HOST1X_HV_CMDFIFO_PEEK_CTRL			0x233c
>  #define HOST1X_HV_CMDFIFO_PEEK_CTRL_ADDR(x)		(x)
>  #define HOST1X_HV_CMDFIFO_PEEK_CTRL_CHANNEL(x)		((x) << 16)
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mikko Perttunen Nov. 7, 2017, 12:28 p.m. UTC | #2
On 05.11.2017 18:46, Dmitry Osipenko wrote:
> On 05.11.2017 14:01, Mikko Perttunen wrote:
 >> ...
>>
>> +static int mlock_id_for_class(unsigned int class)
>> +{
>> +#if HOST1X_HW >= 6
>> +	switch (class)
>> +	{
>> +	case HOST1X_CLASS_HOST1X:
>> +		return 0;
>> +	case HOST1X_CLASS_VIC:
>> +		return 17;
>
> What is the meaning of returned ID values that you have defined here? Why VIC
> should have different ID on T186?

On T186, MLOCKs are not "generic" - the HW knows that each MLOCK 
corresponds to a specific class. Therefore we must map that correctly.

>
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +#else
>> +	switch (class)
>> +	{
>> +	case HOST1X_CLASS_HOST1X:
>> +		return 0;
>> +	case HOST1X_CLASS_GR2D:
>> +		return 1;
>> +	case HOST1X_CLASS_GR2D_SB:
>> +		return 2;
>
> Note that we are allowing to switch 2d classes in the same jobs context and
> currently jobs class is somewhat hardcoded to GR2D.
>
> Even though that GR2D and GR2D_SB use different register banks, is it okay to
> trigger execution of different classes simultaneously? Would syncpoint
> differentiate classes on OP_DONE event?

Good point, we might need to use the same lock for these two.

>
> I suppose that MLOCK (the module lock) implies the whole module locking,
> wouldn't it make sense to just use the module ID's defined in the TRM?

Can you point out where these are defined?

Mikko

--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko Nov. 7, 2017, 9:23 p.m. UTC | #3
On 07.11.2017 15:28, Mikko Perttunen wrote:
> On 05.11.2017 18:46, Dmitry Osipenko wrote:
>> On 05.11.2017 14:01, Mikko Perttunen wrote:
>>> ...
>>>
>>> +static int mlock_id_for_class(unsigned int class)
>>> +{
>>> +#if HOST1X_HW >= 6
>>> +    switch (class)
>>> +    {
>>> +    case HOST1X_CLASS_HOST1X:
>>> +        return 0;
>>> +    case HOST1X_CLASS_VIC:
>>> +        return 17;
>>
>> What is the meaning of returned ID values that you have defined here? Why VIC
>> should have different ID on T186?
> 
> On T186, MLOCKs are not "generic" - the HW knows that each MLOCK corresponds to
> a specific class. Therefore we must map that correctly.
> 

Okay.

>>
>>> +    default:
>>> +        return -EINVAL;
>>> +    }
>>> +#else
>>> +    switch (class)
>>> +    {
>>> +    case HOST1X_CLASS_HOST1X:
>>> +        return 0;
>>> +    case HOST1X_CLASS_GR2D:
>>> +        return 1;
>>> +    case HOST1X_CLASS_GR2D_SB:
>>> +        return 2;
>>
>> Note that we are allowing to switch 2d classes in the same jobs context and
>> currently jobs class is somewhat hardcoded to GR2D.
>>
>> Even though that GR2D and GR2D_SB use different register banks, is it okay to
>> trigger execution of different classes simultaneously? Would syncpoint
>> differentiate classes on OP_DONE event?
> 
> Good point, we might need to use the same lock for these two.
> 
>>
>> I suppose that MLOCK (the module lock) implies the whole module locking,
>> wouldn't it make sense to just use the module ID's defined in the TRM?
> 
> Can you point out where these are defined?

See INDMODID / REGF_MODULEID fields of HOST1X_CHANNEL_INDOFF2_0 /
HOST1X_SYNC_REGF_ADDR_0 registers, bit numbers of HOST1X_SYNC_INTSTATUS_0 /
HOST1X_SYNC_INTC0MASK_0 / HOST1X_SYNC_MOD_TEARDOWN_0.
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mikko Perttunen Dec. 5, 2017, 1:21 p.m. UTC | #4
On 07.11.2017 23:23, Dmitry Osipenko wrote:
> On 07.11.2017 15:28, Mikko Perttunen wrote:
>> On 05.11.2017 18:46, Dmitry Osipenko wrote:
>>> On 05.11.2017 14:01, Mikko Perttunen wrote:
>>>> ...
>>>>
>>>> +static int mlock_id_for_class(unsigned int class)
>>>> +{
>>>> +#if HOST1X_HW >= 6
>>>> +    switch (class)
>>>> +    {
>>>> +    case HOST1X_CLASS_HOST1X:
>>>> +        return 0;
>>>> +    case HOST1X_CLASS_VIC:
>>>> +        return 17;
>>>
>>> What is the meaning of returned ID values that you have defined here? Why VIC
>>> should have different ID on T186?
>>
>> On T186, MLOCKs are not "generic" - the HW knows that each MLOCK corresponds to
>> a specific class. Therefore we must map that correctly.
>>
>
> Okay.
>
>>>
>>>> +    default:
>>>> +        return -EINVAL;
>>>> +    }
>>>> +#else
>>>> +    switch (class)
>>>> +    {
>>>> +    case HOST1X_CLASS_HOST1X:
>>>> +        return 0;
>>>> +    case HOST1X_CLASS_GR2D:
>>>> +        return 1;
>>>> +    case HOST1X_CLASS_GR2D_SB:
>>>> +        return 2;
>>>
>>> Note that we are allowing to switch 2d classes in the same jobs context and
>>> currently jobs class is somewhat hardcoded to GR2D.
>>>
>>> Even though that GR2D and GR2D_SB use different register banks, is it okay to
>>> trigger execution of different classes simultaneously? Would syncpoint
>>> differentiate classes on OP_DONE event?
>>
>> Good point, we might need to use the same lock for these two.
>>
>>>
>>> I suppose that MLOCK (the module lock) implies the whole module locking,
>>> wouldn't it make sense to just use the module ID's defined in the TRM?
>>
>> Can you point out where these are defined?
>
> See INDMODID / REGF_MODULEID fields of HOST1X_CHANNEL_INDOFF2_0 /
> HOST1X_SYNC_REGF_ADDR_0 registers, bit numbers of HOST1X_SYNC_INTSTATUS_0 /
> HOST1X_SYNC_INTC0MASK_0 / HOST1X_SYNC_MOD_TEARDOWN_0.

These values look like they would work on T20, but at least on T124 the 
module numbering for modules we want to lock goes above the number of 
MLOCKs so the indexing scheme would not work there..

Mikko

> --
> To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko Dec. 5, 2017, 1:43 p.m. UTC | #5
On 05.12.2017 16:21, Mikko Perttunen wrote:
> On 07.11.2017 23:23, Dmitry Osipenko wrote:
>> On 07.11.2017 15:28, Mikko Perttunen wrote:
>>> On 05.11.2017 18:46, Dmitry Osipenko wrote:
>>>> On 05.11.2017 14:01, Mikko Perttunen wrote:
>>>>> ...
>>>>>
>>>>> +static int mlock_id_for_class(unsigned int class)
>>>>> +{
>>>>> +#if HOST1X_HW >= 6
>>>>> +    switch (class)
>>>>> +    {
>>>>> +    case HOST1X_CLASS_HOST1X:
>>>>> +        return 0;
>>>>> +    case HOST1X_CLASS_VIC:
>>>>> +        return 17;
>>>>
>>>> What is the meaning of returned ID values that you have defined here? Why VIC
>>>> should have different ID on T186?
>>>
>>> On T186, MLOCKs are not "generic" - the HW knows that each MLOCK corresponds to
>>> a specific class. Therefore we must map that correctly.
>>>
>>
>> Okay.
>>
>>>>
>>>>> +    default:
>>>>> +        return -EINVAL;
>>>>> +    }
>>>>> +#else
>>>>> +    switch (class)
>>>>> +    {
>>>>> +    case HOST1X_CLASS_HOST1X:
>>>>> +        return 0;
>>>>> +    case HOST1X_CLASS_GR2D:
>>>>> +        return 1;
>>>>> +    case HOST1X_CLASS_GR2D_SB:
>>>>> +        return 2;
>>>>
>>>> Note that we are allowing to switch 2d classes in the same jobs context and
>>>> currently jobs class is somewhat hardcoded to GR2D.
>>>>
>>>> Even though that GR2D and GR2D_SB use different register banks, is it okay to
>>>> trigger execution of different classes simultaneously? Would syncpoint
>>>> differentiate classes on OP_DONE event?
>>>
>>> Good point, we might need to use the same lock for these two.
>>>
>>>>
>>>> I suppose that MLOCK (the module lock) implies the whole module locking,
>>>> wouldn't it make sense to just use the module ID's defined in the TRM?
>>>
>>> Can you point out where these are defined?
>>
>> See INDMODID / REGF_MODULEID fields of HOST1X_CHANNEL_INDOFF2_0 /
>> HOST1X_SYNC_REGF_ADDR_0 registers, bit numbers of HOST1X_SYNC_INTSTATUS_0 /
>> HOST1X_SYNC_INTC0MASK_0 / HOST1X_SYNC_MOD_TEARDOWN_0.
> 
> These values look like they would work on T20, but at least on T124 the module
> numbering for modules we want to lock goes above the number of MLOCKs so the
> indexing scheme would not work there..
> 

Indeed, for some reason I was thinking that there are 32 MLOCK's instead of 16.
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 28541b280739..f787cfe69c11 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -232,6 +232,7 @@  static void cdma_start_timer_locked(struct host1x_cdma *cdma,
 	}
 
 	cdma->timeout.client = job->client;
+	cdma->timeout.class = job->class;
 	cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
 	cdma->timeout.syncpt_val = job->syncpt_end;
 	cdma->timeout.start_ktime = ktime_get();
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
index 286d49386be9..e72660fc83c9 100644
--- a/drivers/gpu/host1x/cdma.h
+++ b/drivers/gpu/host1x/cdma.h
@@ -59,6 +59,7 @@  struct buffer_timeout {
 	ktime_t start_ktime;		/* starting time */
 	/* context timeout information */
 	int client;
+	u32 class;
 };
 
 enum cdma_event {
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index ce320534cbed..4d5970d863d5 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -16,6 +16,7 @@ 
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/iopoll.h>
 #include <linux/slab.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
@@ -243,6 +244,125 @@  static void cdma_resume(struct host1x_cdma *cdma, u32 getptr)
 	cdma_timeout_restart(cdma, getptr);
 }
 
+static int mlock_id_for_class(unsigned int class)
+{
+#if HOST1X_HW >= 6
+	switch (class)
+	{
+	case HOST1X_CLASS_HOST1X:
+		return 0;
+	case HOST1X_CLASS_VIC:
+		return 17;
+	default:
+		return -EINVAL;
+	}
+#else
+	switch (class)
+	{
+	case HOST1X_CLASS_HOST1X:
+		return 0;
+	case HOST1X_CLASS_GR2D:
+		return 1;
+	case HOST1X_CLASS_GR2D_SB:
+		return 2;
+	case HOST1X_CLASS_VIC:
+		return 3;
+	case HOST1X_CLASS_GR3D:
+		return 4;
+	default:
+		return -EINVAL;
+	}
+#endif
+}
+
+static void timeout_release_mlock(struct host1x_cdma *cdma)
+{
+#if HOST1X_HW >= 6
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	struct host1x *host = cdma_to_host1x(cdma);
+	u32 pb_pos, pb_temp[3], val;
+	int err, mlock_id;
+
+	if (!host->hv_regs)
+		return;
+
+	mlock_id = mlock_id_for_class(cdma->timeout.class);
+	if (WARN(mlock_id < 0, "Invalid class ID"))
+		return;
+
+	val = host1x_hypervisor_readl(host, HOST1X_HV_MLOCK(mlock_id));
+	if (!HOST1X_HV_MLOCK_LOCKED_V(val) ||
+	    HOST1X_HV_MLOCK_CH_V(val) != ch->id)
+	{
+		/* Channel is not holding the MLOCK, nothing to release. */
+		return;
+	}
+
+	/*
+	 * On Tegra186, there is no register to unlock an MLOCK (don't ask me
+	 * why). As such, we have to execute a release_mlock instruction to
+	 * do it. We do this by backing up the first three opcodes of the
+	 * pushbuffer and replacing them with our own short sequence to do
+	 * the unlocking. We set the .pos field to 12, which causes DMAEND
+	 * to be set accordingly such that only the three opcodes we set
+	 * here are executed before CDMA stops. Finally we restore the value
+	 * of pos and pushbuffer contents.
+	 */
+
+	pb_pos = cdma->push_buffer.pos;
+	memcpy(pb_temp, cdma->push_buffer.mapped, ARRAY_SIZE(pb_temp) * 4);
+
+	{
+		u32 *pb = cdma->push_buffer.mapped;
+		pb[0] = host1x_opcode_acquire_mlock(cdma->timeout.class);
+		pb[1] = host1x_opcode_setclass(cdma->timeout.class, 0, 0);
+		pb[2] = host1x_opcode_release_mlock(cdma->timeout.class);
+	}
+
+	/* Flush writecombine buffer */
+	wmb();
+
+	cdma->push_buffer.pos = ARRAY_SIZE(pb_temp) * 4;
+
+	cdma_resume(cdma, 0);
+
+	/* Wait until the release_mlock opcode has been executed */
+	err = readl_relaxed_poll_timeout(
+		host->hv_regs + HOST1X_HV_MLOCK(mlock_id), val,
+		!HOST1X_HV_MLOCK_LOCKED_V(val) ||
+			HOST1X_HV_MLOCK_CH_V(val) != ch->id,
+		10, 10000);
+	WARN(err, "Failed to unlock mlock %u\n", mlock_id);
+
+	cdma_freeze(cdma);
+
+	/* Restore original pushbuffer state */
+	cdma->push_buffer.pos = pb_pos;
+	memcpy(cdma->push_buffer.mapped, pb_temp, ARRAY_SIZE(pb_temp) * 4);
+	wmb();
+#else
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	struct host1x *host = cdma_to_host1x(cdma);
+	int mlock_id;
+	u32 val;
+
+	mlock_id = mlock_id_for_class(cdma->timeout.class);
+	if (WARN(mlock_id < 0, "Invalid class ID"))
+		return;
+
+	val = host1x_sync_readl(host, HOST1X_SYNC_MLOCK_OWNER(mlock_id));
+	if (!HOST1X_SYNC_MLOCK_OWNER_CH_OWNS_V(val) ||
+	    HOST1X_SYNC_MLOCK_OWNER_CHID_V(val) != ch->id)
+	{
+		/* Channel is not holding the MLOCK, nothing to release. */
+		return;
+	}
+
+	/* Unlock MLOCK */
+	host1x_sync_writel(host, 0x0, HOST1X_SYNC_MLOCK(mlock_id));
+#endif
+}
+
 /*
  * If this timeout fires, it indicates the current sync_queue entry has
  * exceeded its TTL and the userctx should be timed out and remaining
@@ -293,6 +413,8 @@  static void cdma_timeout_handler(struct work_struct *work)
 	/* stop HW, resetting channel/module */
 	host1x_hw_cdma_freeze(host1x, cdma);
 
+	timeout_release_mlock(cdma);
+
 	host1x_cdma_update_sync_queue(cdma, ch->dev);
 	mutex_unlock(&cdma->lock);
 }
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index 246b78c41281..f80fb8be38e6 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -89,6 +89,34 @@  static inline void synchronize_syncpt_base(struct host1x_job *job)
 			 HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(value));
 }
 
+static void channel_submit_serialize(struct host1x_channel *ch,
+				     struct host1x_syncpt *sp)
+{
+#if HOST1X_HW >= 6
+	/*
+	 * On T186, due to a hw issue, we need to issue an mlock acquire
+	 * for HOST1X class. It is still interpreted as a no-op in
+	 * hardware.
+	 */
+	host1x_cdma_push(&ch->cdma,
+		host1x_opcode_acquire_mlock(HOST1X_CLASS_HOST1X),
+		host1x_opcode_setclass(HOST1X_CLASS_HOST1X, 0, 0));
+	host1x_cdma_push(&ch->cdma,
+		host1x_opcode_nonincr(host1x_uclass_wait_syncpt_r(), 1),
+		host1x_class_host_wait_syncpt(host1x_syncpt_id(sp),
+					      host1x_syncpt_read_max(sp)));
+	host1x_cdma_push(&ch->cdma,
+		host1x_opcode_release_mlock(HOST1X_CLASS_HOST1X),
+		HOST1X_OPCODE_NOP);
+#else
+	host1x_cdma_push(&ch->cdma,
+		host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
+				       host1x_uclass_wait_syncpt_r(), 1),
+		host1x_class_host_wait_syncpt(host1x_syncpt_id(sp),
+					      host1x_syncpt_read_max(sp)));
+#endif
+}
+
 static int channel_submit(struct host1x_job *job)
 {
 	struct host1x_channel *ch = job->channel;
@@ -96,6 +124,7 @@  static int channel_submit(struct host1x_job *job)
 	u32 user_syncpt_incrs = job->syncpt_incrs;
 	u32 prev_max = 0;
 	u32 syncval;
+	u32 mlock;
 	int err;
 	struct host1x_waitlist *completed_waiter = NULL;
 	struct host1x *host = dev_get_drvdata(ch->dev->parent);
@@ -128,17 +157,12 @@  static int channel_submit(struct host1x_job *job)
 		goto error;
 	}
 
-	if (job->serialize) {
-		/*
-		 * Force serialization by inserting a host wait for the
-		 * previous job to finish before this one can commence.
-		 */
-		host1x_cdma_push(&ch->cdma,
-				 host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
-					host1x_uclass_wait_syncpt_r(), 1),
-				 host1x_class_host_wait_syncpt(job->syncpt_id,
-					host1x_syncpt_read_max(sp)));
-	}
+	/*
+	 * Force serialization by inserting a host wait for the
+	 * previous job to finish before this one can commence.
+	 */
+	if (job->serialize)
+		channel_submit_serialize(ch, sp);
 
 	/* Synchronize base register to allow using it for relative waiting */
 	if (sp->base)
@@ -149,16 +173,29 @@  static int channel_submit(struct host1x_job *job)
 	/* assign syncpoint to channel */
 	host1x_hw_syncpt_assign_channel(host, sp, ch);
 
-	job->syncpt_end = syncval;
-
-	/* add a setclass for modules that require it */
-	if (job->class)
+	/* acquire MLOCK and set channel class to specified class */
+	mlock = HOST1X_HW >= 6 ? job->class : mlock_id_for_class(job->class);
+	if (job->class) {
 		host1x_cdma_push(&ch->cdma,
-				 host1x_opcode_setclass(job->class, 0, 0),
-				 HOST1X_OPCODE_NOP);
+				 host1x_opcode_acquire_mlock(mlock),
+				 host1x_opcode_setclass(job->class, 0, 0));
+	}
 
 	submit_gathers(job);
 
+	if (job->class) {
+		/*
+		 * Push additional increment to catch jobs that crash before
+		 * finishing their gather, not reaching the unlock opcode.
+		 */
+		syncval = host1x_syncpt_incr_max(sp, 1);
+		host1x_cdma_push(&ch->cdma,
+			host1x_opcode_imm_incr_syncpt(0, job->syncpt_id),
+			host1x_opcode_release_mlock(mlock));
+	}
+
+	job->syncpt_end = syncval;
+
 	/* end CDMA submit & stash pinned hMems into sync queue */
 	host1x_cdma_end(&ch->cdma, job);
 
diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
index 5f0fb866efa8..6a2c9f905acc 100644
--- a/drivers/gpu/host1x/hw/host1x01_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
@@ -138,6 +138,16 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+	return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+	return (14 << 28) | (1 << 24) | id;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/host1x02_hardware.h b/drivers/gpu/host1x/hw/host1x02_hardware.h
index 154901860bc6..c524c6c8d82f 100644
--- a/drivers/gpu/host1x/hw/host1x02_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x02_hardware.h
@@ -137,6 +137,16 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+	return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+	return (14 << 28) | (1 << 24) | id;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/host1x04_hardware.h b/drivers/gpu/host1x/hw/host1x04_hardware.h
index de1a38175328..8dedd77b7a1b 100644
--- a/drivers/gpu/host1x/hw/host1x04_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x04_hardware.h
@@ -137,6 +137,16 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+	return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+	return (14 << 28) | (1 << 24) | id;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/host1x05_hardware.h b/drivers/gpu/host1x/hw/host1x05_hardware.h
index 2937ebb6be11..6aafcb5e97e6 100644
--- a/drivers/gpu/host1x/hw/host1x05_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x05_hardware.h
@@ -137,6 +137,16 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+	return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+	return (14 << 28) | (1 << 24) | id;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
index 3039c92ea605..60147d26ad9b 100644
--- a/drivers/gpu/host1x/hw/host1x06_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
@@ -137,6 +137,16 @@  static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+	return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+	return (14 << 28) | (1 << 24) | id;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_sync.h b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
index 31238c285d46..4630fec2237a 100644
--- a/drivers/gpu/host1x/hw/hw_host1x01_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
@@ -125,6 +125,12 @@  static inline u32 host1x_sync_ip_busy_timeout_r(void)
 }
 #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
 	host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+	return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+	host1x_sync_mlock_r(id)
 static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
 {
 	return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x02_sync.h b/drivers/gpu/host1x/hw/hw_host1x02_sync.h
index 540c7b65995f..dec8e812a114 100644
--- a/drivers/gpu/host1x/hw/hw_host1x02_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x02_sync.h
@@ -125,6 +125,12 @@  static inline u32 host1x_sync_ip_busy_timeout_r(void)
 }
 #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
 	host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+	return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+	host1x_sync_mlock_r(id)
 static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
 {
 	return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x04_sync.h b/drivers/gpu/host1x/hw/hw_host1x04_sync.h
index 3d6c8ec65934..9a951a4db07f 100644
--- a/drivers/gpu/host1x/hw/hw_host1x04_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x04_sync.h
@@ -125,6 +125,12 @@  static inline u32 host1x_sync_ip_busy_timeout_r(void)
 }
 #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
 	host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+	return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+	host1x_sync_mlock_r(id)
 static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
 {
 	return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x05_sync.h b/drivers/gpu/host1x/hw/hw_host1x05_sync.h
index ca10eee5045c..e29e2e19a60b 100644
--- a/drivers/gpu/host1x/hw/hw_host1x05_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x05_sync.h
@@ -125,6 +125,12 @@  static inline u32 host1x_sync_ip_busy_timeout_r(void)
 }
 #define HOST1X_SYNC_IP_BUSY_TIMEOUT \
 	host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+	return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+	host1x_sync_mlock_r(id)
 static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
 {
 	return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h b/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
index c05dab8a178b..be73530c3585 100644
--- a/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
+++ b/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
@@ -18,6 +18,11 @@ 
 #define HOST1X_HV_SYNCPT_PROT_EN			0x1ac4
 #define HOST1X_HV_SYNCPT_PROT_EN_CH_EN			BIT(1)
 #define HOST1X_HV_CH_KERNEL_FILTER_GBUFFER(x)		(0x2020 + (x * 4))
+#define HOST1X_HV_MLOCK(x)				(0x2030 + (x * 4))
+#define HOST1X_HV_MLOCK_CH(x)				(((x) & 0x3f) << 2)
+#define HOST1X_HV_MLOCK_CH_V(x)				(((x) >> 2) & 0x3f)
+#define HOST1X_HV_MLOCK_LOCKED				BIT(0)
+#define HOST1X_HV_MLOCK_LOCKED_V(x)			((x) & 0x1)
 #define HOST1X_HV_CMDFIFO_PEEK_CTRL			0x233c
 #define HOST1X_HV_CMDFIFO_PEEK_CTRL_ADDR(x)		(x)
 #define HOST1X_HV_CMDFIFO_PEEK_CTRL_CHANNEL(x)		((x) << 16)