Patchwork [v7,1/8] Talitos: Support for async_tx XOR offload

login
register
mail settings
Submitter Qiang Liu
Date Aug. 9, 2012, 8:20 a.m.
Message ID <1344500448-10927-1-git-send-email-qiang.liu@freescale.com>
Download mbox | patch
Permalink /patch/176022/
State Not Applicable
Delegated to: Kumar Gala
Headers show

Comments

Qiang Liu - Aug. 9, 2012, 8:20 a.m.
From: Qiang Liu <qiang.liu@freescale.com>

Expose Talitos's XOR functionality to be used for RAID parity
calculation via the Async_tx layer.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/crypto/Kconfig   |    9 +
 drivers/crypto/talitos.c |  413 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/crypto/talitos.h |   53 ++++++
 3 files changed, 475 insertions(+), 0 deletions(-)

--
1.7.5.1
Geanta Neag Horia Ioan-B05471 - Aug. 30, 2012, 2:23 p.m.
On Thu, 9 Aug 2012 11:20:48 +0300, qiang.liu@freescale.com wrote:
> From: Qiang Liu <qiang.liu@freescale.com>
> 
> Expose Talitos's XOR functionality to be used for RAID parity
> calculation via the Async_tx layer.
> 
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> ---
>  drivers/crypto/Kconfig   |    9 +
>  drivers/crypto/talitos.c |  413 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/crypto/talitos.h |   53 ++++++
>  3 files changed, 475 insertions(+), 0 deletions(-)


> +static void talitos_xor_run_tx_complete_actions(struct talitos_xor_desc *desc,
> +		struct talitos_xor_chan *xor_chan)
> +{
> +	struct device *dev = xor_chan->dev;
> +	dma_addr_t dest, addr;
> +	unsigned int src_cnt = desc->unmap_src_cnt;
> +	unsigned int len = desc->unmap_len;
> +	enum dma_ctrl_flags flags = desc->async_tx.flags;
> +	struct dma_async_tx_descriptor *tx = &desc->async_tx;
> +
> +	/* unmap dma addresses */
> +	dest = desc->hwdesc.ptr[6].ptr;
> +	if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
> +		dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
> +
> +	desc->idx = 6 - src_cnt;
> +	if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
> +		while(desc->idx < 6) {
> +			addr = desc->hwdesc.ptr[desc->idx++].ptr;
> +			if (addr == dest)
> +				continue;
> +			dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
> +		}
> +	}

No need for braces around the while block.

> +	/* run dependent operations */
> +	dma_run_dependencies(tx);
> +}


> +static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
> +				void *context, int error)
> +{
> +	struct talitos_xor_desc *desc = context;
> +	struct talitos_xor_chan *xor_chan;
> +	dma_async_tx_callback callback;
> +	void *callback_param;
> +
> +	if (unlikely(error))
> +		dev_err(dev, "xor operation: talitos error %d\n", error);
> +
> +	xor_chan = container_of(desc->async_tx.chan, struct talitos_xor_chan,
> +				common);
> +	spin_lock_bh(&xor_chan->desc_lock);
> +	if (xor_chan->completed_cookie < desc->async_tx.cookie)
> +		xor_chan->completed_cookie = desc->async_tx.cookie;
> +
> +	callback = desc->async_tx.callback;
> +	callback_param = desc->async_tx.callback_param;
> +
> +	if (callback) {
> +		spin_unlock_bh(&xor_chan->desc_lock);
> +		callback(callback_param);
> +		spin_lock_bh(&xor_chan->desc_lock);
> +	}

Since callback_param is used only here, maybe:

if (callback) {
	void *callback_param = desc->async_tx.callback_param;

	spin_unlock_bh(&xor_chan->desc_lock);
	callback(callback_param);
	spin_lock_bh(&xor_chan->desc_lock);
}

> +
> +	talitos_xor_run_tx_complete_actions(desc, xor_chan);
> +
> +	list_del(&desc->node);
> +	list_add_tail(&desc->node, &xor_chan->free_desc);
> +	spin_unlock_bh(&xor_chan->desc_lock);
> +	if (!list_empty(&xor_chan->pending_q))
> +		talitos_process_pending(xor_chan);
> +}


> +static int talitos_alloc_chan_resources(struct dma_chan *chan)
> +{
> +	struct talitos_xor_chan *xor_chan;
> +	struct talitos_xor_desc *desc;
> +	LIST_HEAD(tmp_list);
> +	int i;
> +
> +	xor_chan = container_of(chan, struct talitos_xor_chan, common);
> +
> +	if (!list_empty(&xor_chan->free_desc))
> +		return xor_chan->total_desc;
> +
> +	for (i = 0; i < TALITOS_MAX_DESCRIPTOR_NR; i++) {
> +		desc = talitos_xor_alloc_descriptor(xor_chan,
> +				GFP_KERNEL | GFP_DMA);

talitos_xor_alloc_descriptor() is called here without holding
the xor_chan->desc_lock and it increments xor_chan->total_desc.
Isn't this an issue ?

> +		if (!desc) {
> +			dev_err(xor_chan->common.device->dev,
> +				"Only %d initial descriptors\n", i);
> +			break;
> +		}
> +		list_add_tail(&desc->node, &tmp_list);
> +	}
> +
> +	if (!i)
> +		return -ENOMEM;
> +
> +	/* At least one desc is allocated */
> +	spin_lock_bh(&xor_chan->desc_lock);
> +	list_splice_init(&tmp_list, &xor_chan->free_desc);
> +	spin_unlock_bh(&xor_chan->desc_lock);
> +
> +	return xor_chan->total_desc;
> +}


> +/**
> + * talitos_register_dma_async - Initialize the Freescale XOR ADMA device
> + * It is registered as a DMA device with the capability to perform
> + * XOR operation with the Async_tx layer.
> + * The various queues and channel resources are also allocated.
> + */
> +static int talitos_register_async_tx(struct device *dev, int max_xor_srcs)
> +{
> +	struct talitos_private *priv = dev_get_drvdata(dev);
> +	struct dma_device *dma_dev = &priv->dma_dev_common;
> +	struct talitos_xor_chan *xor_chan;
> +	int err;
> +
> +	xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
> +	if (!xor_chan) {
> +		dev_err(dev, "unable to allocate xor channel\n");
> +		return -ENOMEM;
> +	}
> +
> +	dma_dev->dev = dev;
> +	dma_dev->device_alloc_chan_resources = talitos_alloc_chan_resources;
> +	dma_dev->device_free_chan_resources = talitos_free_chan_resources;
> +	dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
> +	dma_dev->max_xor = max_xor_srcs;
> +	dma_dev->device_tx_status = talitos_is_tx_complete;
> +	dma_dev->device_issue_pending = talitos_issue_pending;
> +	INIT_LIST_HEAD(&dma_dev->channels);
> +	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
> +
> +	xor_chan->dev = dev;
> +	xor_chan->common.device = dma_dev;
> +	xor_chan->total_desc = 0;
> +	INIT_LIST_HEAD(&xor_chan->submit_q);
> +	INIT_LIST_HEAD(&xor_chan->pending_q);
> +	INIT_LIST_HEAD(&xor_chan->in_progress_q);
> +	INIT_LIST_HEAD(&xor_chan->free_desc);
> +	spin_lock_init(&xor_chan->desc_lock);
> +
> +	list_add_tail(&xor_chan->common.device_node, &dma_dev->channels);
> +	dma_dev->chancnt++;
> +
> +	err = dma_async_device_register(dma_dev);
> +	if (err) {
> +		dev_err(dev, "Unable to register XOR with Async_tx\n");
> +		goto err_out;

Replace the jump with talitos_unregister_async_xor(dev) and
remove code under err_out label.

> +	}
> +
> +	return err;
> +
> +err_out:
> +	talitos_unregister_async_xor(dev);
> +	return err;
> +}
> +#endif


> diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
> index 61a1405..fc9d125 100644
> --- a/drivers/crypto/talitos.h
> +++ b/drivers/crypto/talitos.h
> @@ -30,6 +30,7 @@
> 
>  #define TALITOS_TIMEOUT 100000
>  #define TALITOS_MAX_DATA_LEN 65535
> +#define TALITOS_MAX_DESCRIPTOR_NR 256

This refers only to xor descriptors, so renaming it similar to
TALITOS_MAX_XOR_DESCRIPTOR_NR would make sense.

Besides these:
1. As Dan Williams mentioned, you should explain why you are using
both spin_lock_bh and spin_lock_irqsave on the same lock.
2. I don't see anything added to talitos_remove(). Shouldn't
talitos_unregister_async_xor() be called? Anything else?
Have you tested with talitos built as a module?

Horia
Liu Qiang-B32616 - Aug. 31, 2012, 3:08 a.m.
> -----Original Message-----
> From: Geanta Neag Horia Ioan-B05471
> Sent: Thursday, August 30, 2012 10:23 PM
> To: Liu Qiang-B32616; linux-crypto@vger.kernel.org;
> dan.j.williams@gmail.com; herbert@gondor.hengli.com.au;
> davem@davemloft.net; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org
> Cc: Li Yang-R58472; Phillips Kim-R1AAHA; vinod.koul@intel.com;
> dan.j.williams@intel.com; arnd@arndb.de; gregkh@linuxfoundation.org; Liu
> Qiang-B32616
> Subject: RE: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
> 
> On Thu, 9 Aug 2012 11:20:48 +0300, qiang.liu@freescale.com wrote:
> > From: Qiang Liu <qiang.liu@freescale.com>
> >
> > Expose Talitos's XOR functionality to be used for RAID parity
> > calculation via the Async_tx layer.
> >
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: David S. Miller <davem@davemloft.net>
> > Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> > Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> > Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> > Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> > Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> > ---
> >  drivers/crypto/Kconfig   |    9 +
> >  drivers/crypto/talitos.c |  413
> ++++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/crypto/talitos.h |   53 ++++++
> >  3 files changed, 475 insertions(+), 0 deletions(-)
> 
> 
> > +static void talitos_xor_run_tx_complete_actions(struct
> talitos_xor_desc *desc,
> > +		struct talitos_xor_chan *xor_chan)
> > +{
> > +	struct device *dev = xor_chan->dev;
> > +	dma_addr_t dest, addr;
> > +	unsigned int src_cnt = desc->unmap_src_cnt;
> > +	unsigned int len = desc->unmap_len;
> > +	enum dma_ctrl_flags flags = desc->async_tx.flags;
> > +	struct dma_async_tx_descriptor *tx = &desc->async_tx;
> > +
> > +	/* unmap dma addresses */
> > +	dest = desc->hwdesc.ptr[6].ptr;
> > +	if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
> > +		dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
> > +
> > +	desc->idx = 6 - src_cnt;
> > +	if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
> > +		while(desc->idx < 6) {
> > +			addr = desc->hwdesc.ptr[desc->idx++].ptr;
> > +			if (addr == dest)
> > +				continue;
> > +			dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
> > +		}
> > +	}
> 
> No need for braces around the while block.
I will remove it.

> 
> > +	/* run dependent operations */
> > +	dma_run_dependencies(tx);
> > +}
> 
> 
> > +static void talitos_release_xor(struct device *dev, struct
> talitos_desc *hwdesc,
> > +				void *context, int error)
> > +{
> > +	struct talitos_xor_desc *desc = context;
> > +	struct talitos_xor_chan *xor_chan;
> > +	dma_async_tx_callback callback;
> > +	void *callback_param;
> > +
> > +	if (unlikely(error))
> > +		dev_err(dev, "xor operation: talitos error %d\n", error);
> > +
> > +	xor_chan = container_of(desc->async_tx.chan, struct
> talitos_xor_chan,
> > +				common);
> > +	spin_lock_bh(&xor_chan->desc_lock);
> > +	if (xor_chan->completed_cookie < desc->async_tx.cookie)
> > +		xor_chan->completed_cookie = desc->async_tx.cookie;
> > +
> > +	callback = desc->async_tx.callback;
> > +	callback_param = desc->async_tx.callback_param;
> > +
> > +	if (callback) {
> > +		spin_unlock_bh(&xor_chan->desc_lock);
> > +		callback(callback_param);
> > +		spin_lock_bh(&xor_chan->desc_lock);
> > +	}
> 
> Since callback_param is used only here, maybe:
> 
> if (callback) {
> 	void *callback_param = desc->async_tx.callback_param;
> 
> 	spin_unlock_bh(&xor_chan->desc_lock);
> 	callback(callback_param);
> 	spin_lock_bh(&xor_chan->desc_lock);
> }
Fine. I will modify it in next.

> 
> > +
> > +	talitos_xor_run_tx_complete_actions(desc, xor_chan);
> > +
> > +	list_del(&desc->node);
> > +	list_add_tail(&desc->node, &xor_chan->free_desc);
> > +	spin_unlock_bh(&xor_chan->desc_lock);
> > +	if (!list_empty(&xor_chan->pending_q))
> > +		talitos_process_pending(xor_chan);
> > +}
> 
> 
> > +static int talitos_alloc_chan_resources(struct dma_chan *chan)
> > +{
> > +	struct talitos_xor_chan *xor_chan;
> > +	struct talitos_xor_desc *desc;
> > +	LIST_HEAD(tmp_list);
> > +	int i;
> > +
> > +	xor_chan = container_of(chan, struct talitos_xor_chan, common);
> > +
> > +	if (!list_empty(&xor_chan->free_desc))
> > +		return xor_chan->total_desc;
> > +
> > +	for (i = 0; i < TALITOS_MAX_DESCRIPTOR_NR; i++) {
> > +		desc = talitos_xor_alloc_descriptor(xor_chan,
> > +				GFP_KERNEL | GFP_DMA);
> 
> talitos_xor_alloc_descriptor() is called here without holding
> the xor_chan->desc_lock and it increments xor_chan->total_desc.
> Isn't this an issue ?

No, please refer to the code as below, 
+	list_add_tail(&desc->node, &tmp_list);

The list is temporary list, it will be merged to xor_chan->free_desc in next step, here is protected by lock,
+	spin_lock_bh(&xor_chan->desc_lock);
+	list_splice_init(&tmp_list, &xor_chan->free_desc);
+	spin_unlock_bh(&xor_chan->desc_lock);

> 
> > +		if (!desc) {
> > +			dev_err(xor_chan->common.device->dev,
> > +				"Only %d initial descriptors\n", i);
> > +			break;
> > +		}
> > +		list_add_tail(&desc->node, &tmp_list);
> > +	}
> > +
> > +	if (!i)
> > +		return -ENOMEM;
> > +
> > +	/* At least one desc is allocated */
> > +	spin_lock_bh(&xor_chan->desc_lock);
> > +	list_splice_init(&tmp_list, &xor_chan->free_desc);
> > +	spin_unlock_bh(&xor_chan->desc_lock);
> > +
> > +	return xor_chan->total_desc;
> > +}
> 
> 
> > +/**
> > + * talitos_register_dma_async - Initialize the Freescale XOR ADMA
> device
> > + * It is registered as a DMA device with the capability to perform
> > + * XOR operation with the Async_tx layer.
> > + * The various queues and channel resources are also allocated.
> > + */
> > +static int talitos_register_async_tx(struct device *dev, int
> max_xor_srcs)
> > +{
> > +	struct talitos_private *priv = dev_get_drvdata(dev);
> > +	struct dma_device *dma_dev = &priv->dma_dev_common;
> > +	struct talitos_xor_chan *xor_chan;
> > +	int err;
> > +
> > +	xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
> > +	if (!xor_chan) {
> > +		dev_err(dev, "unable to allocate xor channel\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	dma_dev->dev = dev;
> > +	dma_dev->device_alloc_chan_resources = talitos_alloc_chan_resources;
> > +	dma_dev->device_free_chan_resources = talitos_free_chan_resources;
> > +	dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
> > +	dma_dev->max_xor = max_xor_srcs;
> > +	dma_dev->device_tx_status = talitos_is_tx_complete;
> > +	dma_dev->device_issue_pending = talitos_issue_pending;
> > +	INIT_LIST_HEAD(&dma_dev->channels);
> > +	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
> > +
> > +	xor_chan->dev = dev;
> > +	xor_chan->common.device = dma_dev;
> > +	xor_chan->total_desc = 0;
> > +	INIT_LIST_HEAD(&xor_chan->submit_q);
> > +	INIT_LIST_HEAD(&xor_chan->pending_q);
> > +	INIT_LIST_HEAD(&xor_chan->in_progress_q);
> > +	INIT_LIST_HEAD(&xor_chan->free_desc);
> > +	spin_lock_init(&xor_chan->desc_lock);
> > +
> > +	list_add_tail(&xor_chan->common.device_node, &dma_dev->channels);
> > +	dma_dev->chancnt++;
> > +
> > +	err = dma_async_device_register(dma_dev);
> > +	if (err) {
> > +		dev_err(dev, "Unable to register XOR with Async_tx\n");
> > +		goto err_out;
> 
> Replace the jump with talitos_unregister_async_xor(dev) and
> remove code under err_out label.
No, here should be reserved, it should free xor_chan and remove "dma_dev->chancnt++;".
Actually, I find most of code doesn't care this return value.
I will correct it in next.

> 
> > +	}
> > +
> > +	return err;
> > +
> > +err_out:
> > +	talitos_unregister_async_xor(dev);
> > +	return err;
> > +}
> > +#endif
> 
> 
> > diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
> > index 61a1405..fc9d125 100644
> > --- a/drivers/crypto/talitos.h
> > +++ b/drivers/crypto/talitos.h
> > @@ -30,6 +30,7 @@
> >
> >  #define TALITOS_TIMEOUT 100000
> >  #define TALITOS_MAX_DATA_LEN 65535
> > +#define TALITOS_MAX_DESCRIPTOR_NR 256
> 
> This refers only to xor descriptors, so renaming it similar to
> TALITOS_MAX_XOR_DESCRIPTOR_NR would make sense.
I remember it is applied to other descriptors, I will check it again, if not, I will rename it as you suggested.

> 
> Besides these:
> 1. As Dan Williams mentioned, you should explain why you are using
> both spin_lock_bh and spin_lock_irqsave on the same lock.
I'm waiting for Dan's feedback about this patch, I will add description or correct it with other comments together.

> 2. I don't see anything added to talitos_remove(). Shouldn't
> talitos_unregister_async_xor() be called? Anything else?
> Have you tested with talitos built as a module?
My fault, it should be added in talitos_remove(); I will correct it in next.
Thanks for your review.

> 
> Horia
Geanta Neag Horia Ioan-B05471 - Aug. 31, 2012, 10:38 a.m.
On Fri, 31 Aug 2012 06:08:05 +0300, Liu Qiang-B32616 <B32616@freescale.com>
wrote:
> > -----Original Message-----
> > From: Geanta Neag Horia Ioan-B05471
> > Sent: Thursday, August 30, 2012 10:23 PM
> > To: Liu Qiang-B32616; linux-crypto@vger.kernel.org;
> > dan.j.williams@gmail.com; herbert@gondor.hengli.com.au;
> > davem@davemloft.net; linux-kernel@vger.kernel.org; linuxppc-
> > dev@lists.ozlabs.org
> > Cc: Li Yang-R58472; Phillips Kim-R1AAHA; vinod.koul@intel.com;
> > dan.j.williams@intel.com; arnd@arndb.de; gregkh@linuxfoundation.org; Liu
> > Qiang-B32616
> > Subject: RE: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
> >
> > On Thu, 9 Aug 2012 11:20:48 +0300, qiang.liu@freescale.com wrote:
> > > From: Qiang Liu <qiang.liu@freescale.com>
> > >
> > > Expose Talitos's XOR functionality to be used for RAID parity
> > > calculation via the Async_tx layer.
> > >
> > > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > > Cc: David S. Miller <davem@davemloft.net>
> > > Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> > > Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> > > Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> > > Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> > > Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> > > ---
> > >  drivers/crypto/Kconfig   |    9 +
> > >  drivers/crypto/talitos.c |  413
> > ++++++++++++++++++++++++++++++++++++++++++++++
> > >  drivers/crypto/talitos.h |   53 ++++++
> > >  3 files changed, 475 insertions(+), 0 deletions(-)
> >

> > > +static int talitos_alloc_chan_resources(struct dma_chan *chan)
> > > +{
> > > +	struct talitos_xor_chan *xor_chan;
> > > +	struct talitos_xor_desc *desc;
> > > +	LIST_HEAD(tmp_list);
> > > +	int i;
> > > +
> > > +	xor_chan = container_of(chan, struct talitos_xor_chan, common);
> > > +
> > > +	if (!list_empty(&xor_chan->free_desc))
> > > +		return xor_chan->total_desc;
> > > +
> > > +	for (i = 0; i < TALITOS_MAX_DESCRIPTOR_NR; i++) {
> > > +		desc = talitos_xor_alloc_descriptor(xor_chan,
> > > +				GFP_KERNEL | GFP_DMA);
> >
> > talitos_xor_alloc_descriptor() is called here without holding
> > the xor_chan->desc_lock and it increments xor_chan->total_desc.
> > Isn't this an issue ?
> 
> No, please refer to the code as below,
> +	list_add_tail(&desc->node, &tmp_list);
> 
> The list is temporary list, it will be merged to xor_chan->free_desc in next step, here is protected
> by lock,
> +	spin_lock_bh(&xor_chan->desc_lock);
> +	list_splice_init(&tmp_list, &xor_chan->free_desc);
> +	spin_unlock_bh(&xor_chan->desc_lock);

I was not referring to the list, but to xor_chan->total_desc variable.
The following access:
talitos_alloc_chan_resources()->talitos_xor_alloc_descriptor()->total_desc++
is not protected by the xor_chan->desc_lock.
Liu Qiang-B32616 - Aug. 31, 2012, 10:41 a.m.
> -----Original Message-----
> From: Geanta Neag Horia Ioan-B05471
> Sent: Friday, August 31, 2012 6:39 PM
> To: Liu Qiang-B32616; linux-crypto@vger.kernel.org;
> dan.j.williams@gmail.com; herbert@gondor.hengli.com.au;
> davem@davemloft.net; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org
> Cc: Li Yang-R58472; Phillips Kim-R1AAHA; vinod.koul@intel.com; Dan
> Williams; arnd@arndb.de; gregkh@linuxfoundation.org
> Subject: RE: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
> 
> On Fri, 31 Aug 2012 06:08:05 +0300, Liu Qiang-B32616
> <B32616@freescale.com>
> wrote:
> > > -----Original Message-----
> > > From: Geanta Neag Horia Ioan-B05471
> > > Sent: Thursday, August 30, 2012 10:23 PM
> > > To: Liu Qiang-B32616; linux-crypto@vger.kernel.org;
> > > dan.j.williams@gmail.com; herbert@gondor.hengli.com.au;
> > > davem@davemloft.net; linux-kernel@vger.kernel.org; linuxppc-
> > > dev@lists.ozlabs.org
> > > Cc: Li Yang-R58472; Phillips Kim-R1AAHA; vinod.koul@intel.com;
> > > dan.j.williams@intel.com; arnd@arndb.de; gregkh@linuxfoundation.org;
> Liu
> > > Qiang-B32616
> > > Subject: RE: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
> > >
> > > On Thu, 9 Aug 2012 11:20:48 +0300, qiang.liu@freescale.com wrote:
> > > > From: Qiang Liu <qiang.liu@freescale.com>
> > > >
> > > > Expose Talitos's XOR functionality to be used for RAID parity
> > > > calculation via the Async_tx layer.
> > > >
> > > > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > > > Cc: David S. Miller <davem@davemloft.net>
> > > > Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> > > > Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> > > > Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> > > > Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> > > > Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> > > > ---
> > > >  drivers/crypto/Kconfig   |    9 +
> > > >  drivers/crypto/talitos.c |  413
> > > ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  drivers/crypto/talitos.h |   53 ++++++
> > > >  3 files changed, 475 insertions(+), 0 deletions(-)
> > >
> 
> > > > +static int talitos_alloc_chan_resources(struct dma_chan *chan)
> > > > +{
> > > > +	struct talitos_xor_chan *xor_chan;
> > > > +	struct talitos_xor_desc *desc;
> > > > +	LIST_HEAD(tmp_list);
> > > > +	int i;
> > > > +
> > > > +	xor_chan = container_of(chan, struct talitos_xor_chan,
> common);
> > > > +
> > > > +	if (!list_empty(&xor_chan->free_desc))
> > > > +		return xor_chan->total_desc;
> > > > +
> > > > +	for (i = 0; i < TALITOS_MAX_DESCRIPTOR_NR; i++) {
> > > > +		desc = talitos_xor_alloc_descriptor(xor_chan,
> > > > +				GFP_KERNEL | GFP_DMA);
> > >
> > > talitos_xor_alloc_descriptor() is called here without holding
> > > the xor_chan->desc_lock and it increments xor_chan->total_desc.
> > > Isn't this an issue ?
> >
> > No, please refer to the code as below,
> > +	list_add_tail(&desc->node, &tmp_list);
> >
> > The list is temporary list, it will be merged to xor_chan->free_desc in
> next step, here is protected
> > by lock,
> > +	spin_lock_bh(&xor_chan->desc_lock);
> > +	list_splice_init(&tmp_list, &xor_chan->free_desc);
> > +	spin_unlock_bh(&xor_chan->desc_lock);
> 
> I was not referring to the list, but to xor_chan->total_desc variable.
> The following access:
> talitos_alloc_chan_resources()->talitos_xor_alloc_descriptor()-
> >total_desc++
> is not protected by the xor_chan->desc_lock.
Ok, I will correct it in next series. Thanks.
Dan Williams - Sept. 2, 2012, 6:47 a.m.
On Thu, Aug 30, 2012 at 7:23 AM, Geanta Neag Horia Ioan-B05471
<B05471@freescale.com> wrote:
>
> Besides these:
> 1. As Dan Williams mentioned, you should explain why you are using
> both spin_lock_bh and spin_lock_irqsave on the same lock.

It looks like talitos_process_pending() can be called from hardirq
context via talitos_error().  So spin_lock_bh is not sufficient.
Dan Williams - Sept. 2, 2012, 8:12 a.m.
On Thu, Aug 9, 2012 at 1:20 AM,  <qiang.liu@freescale.com> wrote:
> From: Qiang Liu <qiang.liu@freescale.com>
>
> Expose Talitos's XOR functionality to be used for RAID parity
> calculation via the Async_tx layer.
>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> ---
>  drivers/crypto/Kconfig   |    9 +
>  drivers/crypto/talitos.c |  413 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/crypto/talitos.h |   53 ++++++
>  3 files changed, 475 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index be6b2ba..f0a7c29 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -222,6 +222,15 @@ config CRYPTO_DEV_TALITOS
>           To compile this driver as a module, choose M here: the module
>           will be called talitos.
>
> +config CRYPTO_DEV_TALITOS_RAIDXOR
> +       bool "Talitos RAID5 XOR Calculation Offload"
> +       default y

No, default y.  The user should explicitly enable this.

> +       select DMA_ENGINE
> +       depends on CRYPTO_DEV_TALITOS
> +       help
> +         Say 'Y' here to use the Freescale Security Engine (SEC) to
> +         offload RAID XOR parity Calculation
> +

Is it faster than cpu xor?

Will this engine be coordinating with another to handle memory copies?
 The dma mapping code for async_tx/raid is broken when dma mapping
requests overlap or cross dma device boundaries [1].

[1]: http://marc.info/?l=linux-arm-kernel&m=129407269402930&w=2

> +static void talitos_process_pending(struct talitos_xor_chan *xor_chan)
> +{
> +       struct talitos_xor_desc *desc, *_desc;
> +       unsigned long flags;
> +       int status;
> +       struct talitos_private *priv;
> +       int ch;
> +
> +       priv = dev_get_drvdata(xor_chan->dev);
> +       ch = atomic_inc_return(&priv->last_chan) &
> +                                 (priv->num_channels - 1);

Maybe a comment about why this this is duplicated from
talitos_cra_init()?  It sticks out here and I had to go looking to
find out why this channel number increments on submit.


> +       spin_lock_irqsave(&xor_chan->desc_lock, flags);
> +
> +       list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q, node) {
> +               status = talitos_submit(xor_chan->dev, ch, &desc->hwdesc,
> +                                       talitos_release_xor, desc);
> +               if (status != -EINPROGRESS)
> +                       break;
> +
> +               list_del(&desc->node);
> +               list_add_tail(&desc->node, &xor_chan->in_progress_q);
> +       }
> +
> +       spin_unlock_irqrestore(&xor_chan->desc_lock, flags);
> +}
> +
> +static void talitos_xor_run_tx_complete_actions(struct talitos_xor_desc *desc,
> +               struct talitos_xor_chan *xor_chan)
> +{
> +       struct device *dev = xor_chan->dev;
> +       dma_addr_t dest, addr;
> +       unsigned int src_cnt = desc->unmap_src_cnt;
> +       unsigned int len = desc->unmap_len;
> +       enum dma_ctrl_flags flags = desc->async_tx.flags;
> +       struct dma_async_tx_descriptor *tx = &desc->async_tx;
> +
> +       /* unmap dma addresses */
> +       dest = desc->hwdesc.ptr[6].ptr;
> +       if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
> +               dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
> +
> +       desc->idx = 6 - src_cnt;
> +       if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
> +               while(desc->idx < 6) {

Checkpatch says:
ERROR: space required before the open parenthesis '('


> +                       addr = desc->hwdesc.ptr[desc->idx++].ptr;
> +                       if (addr == dest)
> +                               continue;
> +                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
> +               }
> +       }
> +
> +       /* run dependent operations */
> +       dma_run_dependencies(tx);

Here is where we run into problems if another engine accesses these
same buffers, especially on ARM v6+.

> +}
> +
> +static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
> +                               void *context, int error)
> +{
> +       struct talitos_xor_desc *desc = context;
> +       struct talitos_xor_chan *xor_chan;
> +       dma_async_tx_callback callback;
> +       void *callback_param;
> +
> +       if (unlikely(error))
> +               dev_err(dev, "xor operation: talitos error %d\n", error);
> +
> +       xor_chan = container_of(desc->async_tx.chan, struct talitos_xor_chan,
> +                               common);
> +       spin_lock_bh(&xor_chan->desc_lock);
> +       if (xor_chan->completed_cookie < desc->async_tx.cookie)
> +               xor_chan->completed_cookie = desc->async_tx.cookie;
> +

Use dma_cookie_complete().

> +       callback = desc->async_tx.callback;
> +       callback_param = desc->async_tx.callback_param;
> +
> +       if (callback) {
> +               spin_unlock_bh(&xor_chan->desc_lock);
> +               callback(callback_param);
> +               spin_lock_bh(&xor_chan->desc_lock);

As mentioned you'll either need to ensure that
talitos_process_pending() is only called from the tasklet, or upgrade
these locks to hardirq safe.

> +       }
> +
> +       talitos_xor_run_tx_complete_actions(desc, xor_chan);
> +
> +       list_del(&desc->node);
> +       list_add_tail(&desc->node, &xor_chan->free_desc);
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +       if (!list_empty(&xor_chan->pending_q))
> +               talitos_process_pending(xor_chan);
> +}
> +
> +/**
> + * talitos_issue_pending - move the descriptors in submit
> + * queue to pending queue and submit them for processing
> + * @chan: DMA channel
> + */
> +static void talitos_issue_pending(struct dma_chan *chan)
> +{
> +       struct talitos_xor_chan *xor_chan;
> +
> +       xor_chan = container_of(chan, struct talitos_xor_chan, common);
> +       spin_lock_bh(&xor_chan->desc_lock);
> +       list_splice_tail_init(&xor_chan->submit_q,
> +                                &xor_chan->pending_q);
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +       talitos_process_pending(xor_chan);
> +}
> +
> +static dma_cookie_t talitos_async_tx_submit(struct dma_async_tx_descriptor *tx)
> +{
> +       struct talitos_xor_desc *desc;
> +       struct talitos_xor_chan *xor_chan;
> +       dma_cookie_t cookie;
> +
> +       desc = container_of(tx, struct talitos_xor_desc, async_tx);
> +       xor_chan = container_of(tx->chan, struct talitos_xor_chan, common);
> +
> +       spin_lock_bh(&xor_chan->desc_lock);
> +
> +       cookie = xor_chan->common.cookie + 1;
> +       if (cookie < 0)
> +               cookie = 1;

Should use the new dma_cookie_assign() helper.

> +
> +       desc->async_tx.cookie = cookie;
> +       xor_chan->common.cookie = desc->async_tx.cookie;
> +
> +       list_splice_tail_init(&desc->tx_list,
> +                                &xor_chan->submit_q);
> +
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +
> +       return cookie;
> +}
> +
> +static struct talitos_xor_desc *talitos_xor_alloc_descriptor(
> +                               struct talitos_xor_chan *xor_chan, gfp_t flags)
> +{
> +       struct talitos_xor_desc *desc;
> +
> +       desc = kmalloc(sizeof(*desc), flags);
> +       if (desc) {
> +               xor_chan->total_desc++;
> +               desc->async_tx.tx_submit = talitos_async_tx_submit;
> +       }
> +
> +       return desc;
> +}
> +
[..]
> +
> +static struct dma_async_tx_descriptor *talitos_prep_dma_xor(
> +                       struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
> +                       unsigned int src_cnt, size_t len, unsigned long flags)
> +{
> +       struct talitos_xor_chan *xor_chan;
> +       struct talitos_xor_desc *new;
> +       struct talitos_desc *desc;
> +       int i, j;
> +
> +       BUG_ON(len > TALITOS_MAX_DATA_LEN);
> +
> +       xor_chan = container_of(chan, struct talitos_xor_chan, common);
> +
> +       spin_lock_bh(&xor_chan->desc_lock);
> +       if (!list_empty(&xor_chan->free_desc)) {
> +               new = container_of(xor_chan->free_desc.next,
> +                                  struct talitos_xor_desc, node);
> +               list_del(&new->node);
> +       } else {
> +                new = talitos_xor_alloc_descriptor(xor_chan, GFP_KERNEL | GFP_DMA);

You can't hold a spin_lock over a GFP_KERNEL allocation.

> +       }
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +
> +       if (!new) {
> +               dev_err(xor_chan->common.device->dev,
> +                       "No free memory for XOR DMA descriptor\n");
> +               return NULL;
> +       }
> +       dma_async_tx_descriptor_init(&new->async_tx, &xor_chan->common);
> +
> +       INIT_LIST_HEAD(&new->node);
> +       INIT_LIST_HEAD(&new->tx_list);
> +
> +       desc = &new->hwdesc;
> +       /* Set destination: Last pointer pair */
> +       to_talitos_ptr(&desc->ptr[6], dest);
> +       desc->ptr[6].len = cpu_to_be16(len);
> +       desc->ptr[6].j_extent = 0;
> +       new->unmap_src_cnt = src_cnt;
> +       new->unmap_len = len;
> +
> +       /* Set Sources: End loading from second-last pointer pair */
> +       for (i = 5, j = 0; j < src_cnt && i >= 0; i--, j++) {
> +               to_talitos_ptr(&desc->ptr[i], src[j]);
> +               desc->ptr[i].len = cpu_to_be16(len);
> +               desc->ptr[i].j_extent = 0;
> +       }
> +
> +       /*
> +        * documentation states first 0 ptr/len combo marks end of sources
> +        * yet device produces scatter boundary error unless all subsequent
> +        * sources are zeroed out
> +        */
> +       for (; i >= 0; i--) {
> +               to_talitos_ptr(&desc->ptr[i], 0);
> +               desc->ptr[i].len = 0;
> +               desc->ptr[i].j_extent = 0;
> +       }
> +
> +       desc->hdr = DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_XOR |
> +               DESC_HDR_TYPE_RAID_XOR;
> +
> +       new->async_tx.parent = NULL;
> +       new->async_tx.next = NULL;
> +       new->async_tx.cookie = 0;
> +       async_tx_ack(&new->async_tx);
> +
> +       list_add_tail(&new->node, &new->tx_list);
> +
> +       new->async_tx.flags = flags;
> +       new->async_tx.cookie = -EBUSY;
> +
> +       return &new->async_tx;
> +}
> +
> +static void talitos_unregister_async_xor(struct device *dev)
> +{
> +       struct talitos_private *priv = dev_get_drvdata(dev);
> +       struct talitos_xor_chan *xor_chan;
> +       struct dma_chan *chan, *_chan;
> +
> +       if (priv->dma_dev_common.chancnt)
> +               dma_async_device_unregister(&priv->dma_dev_common);
> +
> +       list_for_each_entry_safe(chan, _chan, &priv->dma_dev_common.channels,
> +                               device_node) {
> +               xor_chan = container_of(chan, struct talitos_xor_chan,
> +                                       common);
> +               list_del(&chan->device_node);
> +               priv->dma_dev_common.chancnt--;
> +               kfree(xor_chan);
> +       }
> +}
> +
> +/**
> + * talitos_register_dma_async - Initialize the Freescale XOR ADMA device
> + * It is registered as a DMA device with the capability to perform
> + * XOR operation with the Async_tx layer.
> + * The various queues and channel resources are also allocated.
> + */
> +static int talitos_register_async_tx(struct device *dev, int max_xor_srcs)
> +{
> +       struct talitos_private *priv = dev_get_drvdata(dev);
> +       struct dma_device *dma_dev = &priv->dma_dev_common;
> +       struct talitos_xor_chan *xor_chan;
> +       int err;
> +
> +       xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
> +       if (!xor_chan) {
> +               dev_err(dev, "unable to allocate xor channel\n");
> +               return -ENOMEM;
> +       }
> +
> +       dma_dev->dev = dev;
> +       dma_dev->device_alloc_chan_resources = talitos_alloc_chan_resources;
> +       dma_dev->device_free_chan_resources = talitos_free_chan_resources;
> +       dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
> +       dma_dev->max_xor = max_xor_srcs;
> +       dma_dev->device_tx_status = talitos_is_tx_complete;
> +       dma_dev->device_issue_pending = talitos_issue_pending;
> +       INIT_LIST_HEAD(&dma_dev->channels);
> +       dma_cap_set(DMA_XOR, dma_dev->cap_mask);
> +
> +       xor_chan->dev = dev;
> +       xor_chan->common.device = dma_dev;
> +       xor_chan->total_desc = 0;
> +       INIT_LIST_HEAD(&xor_chan->submit_q);
> +       INIT_LIST_HEAD(&xor_chan->pending_q);
> +       INIT_LIST_HEAD(&xor_chan->in_progress_q);
> +       INIT_LIST_HEAD(&xor_chan->free_desc);
> +       spin_lock_init(&xor_chan->desc_lock);
> +
> +       list_add_tail(&xor_chan->common.device_node, &dma_dev->channels);
> +       dma_dev->chancnt++;
> +
> +       err = dma_async_device_register(dma_dev);
> +       if (err) {
> +               dev_err(dev, "Unable to register XOR with Async_tx\n");
> +               goto err_out;
> +       }
> +
> +       return err;
> +
> +err_out:
> +       talitos_unregister_async_xor(dev);
> +       return err;
> +}
> +#endif
> +
>  /*
>   * crypto alg
>   */
> @@ -2891,6 +3284,26 @@ static int talitos_probe(struct platform_device *ofdev)
>                         dev_info(dev, "hwrng\n");
>         }
>
> +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR

Kill these ifdefs in the C file.  Do something like: if
(xor_enabled()) { } around the code sections that are optional so you
always get compile coverage.  Where xor_enabled() is a shorter form of
IS_ENABLED(CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR).

> +       /*
> +        * register with async_tx xor, if capable
> +        * SEC 2.x support up to 3 RAID sources,
> +        * SEC 3.x support up to 6
> +        */
> +       if (hw_supports(dev, DESC_HDR_SEL0_AESU | DESC_HDR_TYPE_RAID_XOR)) {
> +               int max_xor_srcs = 3;
> +               if (of_device_is_compatible(np, "fsl,sec3.0"))
> +                       max_xor_srcs = 6;
> +               err = talitos_register_async_tx(dev, max_xor_srcs);
> +               if (err) {
> +                       dev_err(dev, "failed to register async_tx xor: %d\n",
> +                                       err);
> +                       goto err_out;
> +               }
> +               dev_info(dev, "max_xor_srcs %d\n", max_xor_srcs);
> +       }
> +#endif
> +
>         /* register crypto algorithms the device supports */
>         for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
>                 if (hw_supports(dev, driver_algs[i].desc_hdr_template)) {
> diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
> index 61a1405..fc9d125 100644
> --- a/drivers/crypto/talitos.h
> +++ b/drivers/crypto/talitos.h
> @@ -30,6 +30,7 @@
>
>  #define TALITOS_TIMEOUT 100000
>  #define TALITOS_MAX_DATA_LEN 65535
> +#define TALITOS_MAX_DESCRIPTOR_NR 256
>
>  #define DESC_TYPE(desc_hdr) ((be32_to_cpu(desc_hdr) >> 3) & 0x1f)
>  #define PRIMARY_EU(desc_hdr) ((be32_to_cpu(desc_hdr) >> 28) & 0xf)
> @@ -131,7 +132,57 @@ struct talitos_private {
>
>         /* hwrng device */
>         struct hwrng rng;
> +
> +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
> +       /* XOR Device */
> +       struct dma_device dma_dev_common;
> +#endif
> +};
> +
> +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
> +/**
> + * talitos_xor_chan - context management for the async_tx channel
> + * @completed_cookie: the last completed cookie
> + * @desc_lock: lock for tx queue
> + * @total_desc: number of descriptors allocated
> + * @submit_q: queue of submitted descriptors
> + * @pending_q: queue of pending descriptors
> + * @in_progress_q: queue of descriptors in progress
> + * @free_desc: queue of unused descriptors
> + * @dev: talitos device implementing this channel
> + * @common: the corresponding xor channel in async_tx
> + */
> +struct talitos_xor_chan {
> +       dma_cookie_t completed_cookie;
> +       spinlock_t desc_lock;
> +       unsigned int total_desc;
> +       struct list_head submit_q;
> +       struct list_head pending_q;
> +       struct list_head in_progress_q;
> +       struct list_head free_desc;
> +       struct device *dev;
> +       struct dma_chan common;
> +};
> +
> +/**
> + * talitos_xor_desc - software xor descriptor
> + * @async_tx: the referring async_tx descriptor
> + * @node:
> + * @hwdesc: h/w descriptor
> + * @unmap_src_cnt: number of xor sources
> + * @unmap_len: transaction byte count
> + * @idx: index of xor sources
> + */
> +struct talitos_xor_desc {
> +       struct dma_async_tx_descriptor async_tx;
> +       struct list_head tx_list;
> +       struct list_head node;
> +       struct talitos_desc hwdesc;
> +       unsigned int unmap_src_cnt;
> +       unsigned int unmap_len;
> +       unsigned int idx;
>  };
> +#endif
>
>  extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
>                           void (*callback)(struct device *dev,
> @@ -284,6 +335,7 @@ extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
>  /* primary execution unit mode (MODE0) and derivatives */
>  #define        DESC_HDR_MODE0_ENCRYPT          cpu_to_be32(0x00100000)
>  #define        DESC_HDR_MODE0_AESU_CBC         cpu_to_be32(0x00200000)
> +#define        DESC_HDR_MODE0_AESU_XOR         cpu_to_be32(0x0c600000)
>  #define        DESC_HDR_MODE0_DEU_CBC          cpu_to_be32(0x00400000)
>  #define        DESC_HDR_MODE0_DEU_3DES         cpu_to_be32(0x00200000)
>  #define        DESC_HDR_MODE0_MDEU_CONT        cpu_to_be32(0x08000000)
> @@ -344,6 +396,7 @@ extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
>  #define DESC_HDR_TYPE_IPSEC_ESP                        cpu_to_be32(1 << 3)
>  #define DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU  cpu_to_be32(2 << 3)
>  #define DESC_HDR_TYPE_HMAC_SNOOP_NO_AFEU       cpu_to_be32(4 << 3)
> +#define DESC_HDR_TYPE_RAID_XOR                  cpu_to_be32(21 << 3)
>
>  /* link table extent field bits */
>  #define DESC_PTR_LNKTBL_JUMP                   0x80
Liu Qiang-B32616 - Sept. 4, 2012, 12:28 p.m.
> -----Original Message-----
> From: dan.j.williams@gmail.com [mailto:dan.j.williams@gmail.com] On
> Behalf Of Dan Williams
> Sent: Sunday, September 02, 2012 4:12 PM
> To: Liu Qiang-B32616
> Cc: linux-crypto@vger.kernel.org; herbert@gondor.hengli.com.au;
> davem@davemloft.net; linux-kernel@vger.kernel.org; linuxppc-
> dev@lists.ozlabs.org; Li Yang-R58472; Phillips Kim-R1AAHA;
> vinod.koul@intel.com; dan.j.williams@intel.com; arnd@arndb.de;
> gregkh@linuxfoundation.org
> Subject: Re: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
> 
> On Thu, Aug 9, 2012 at 1:20 AM,  <qiang.liu@freescale.com> wrote:
> > From: Qiang Liu <qiang.liu@freescale.com>
> >
> > Expose Talitos's XOR functionality to be used for RAID parity
> > calculation via the Async_tx layer.
> >
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: David S. Miller <davem@davemloft.net>
> > Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> > Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> > Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> > Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> > Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> > ---
> >  drivers/crypto/Kconfig   |    9 +
> >  drivers/crypto/talitos.c |  413
> ++++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/crypto/talitos.h |   53 ++++++
> >  3 files changed, 475 insertions(+), 0 deletions(-)
> >
> > diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> > index be6b2ba..f0a7c29 100644
> > --- a/drivers/crypto/Kconfig
> > +++ b/drivers/crypto/Kconfig
> > @@ -222,6 +222,15 @@ config CRYPTO_DEV_TALITOS
> >           To compile this driver as a module, choose M here: the module
> >           will be called talitos.
> >
> > +config CRYPTO_DEV_TALITOS_RAIDXOR
> > +       bool "Talitos RAID5 XOR Calculation Offload"
> > +       default y
> 
> No, default y.  The user should explicitly enable this.
Ok, I will change it to N.

> 
> > +       select DMA_ENGINE
> > +       depends on CRYPTO_DEV_TALITOS
> > +       help
> > +         Say 'Y' here to use the Freescale Security Engine (SEC) to
> > +         offload RAID XOR parity Calculation
> > +
> 
> Is it faster than cpu xor?
Yes, I tested with IOZone, cpu load is also reduced.
I think one possible reason is the processor of p1022ds is not strong as others, so the performance is improved obviously.

> 
> Will this engine be coordinating with another to handle memory copies?
>  The dma mapping code for async_tx/raid is broken when dma mapping
> requests overlap or cross dma device boundaries [1].
> 
> [1]: http://marc.info/?l=linux-arm-kernel&m=129407269402930&w=2
Yes, it needs fsl-dma to handle memcpy copies.
I read your link, the unmap address is stored in talitos hwdesc, the address will be unmapped when async_tx ack this descriptor, I know fsl-dma won't wait this ack flag in current kernel, so I fix it in fsl-dma patch 5/8. Do you mean that?

> 
> > +static void talitos_process_pending(struct talitos_xor_chan *xor_chan)
> > +{
> > +       struct talitos_xor_desc *desc, *_desc;
> > +       unsigned long flags;
> > +       int status;
> > +       struct talitos_private *priv;
> > +       int ch;
> > +
> > +       priv = dev_get_drvdata(xor_chan->dev);
> > +       ch = atomic_inc_return(&priv->last_chan) &
> > +                                 (priv->num_channels - 1);
> 
> Maybe a comment about why this this is duplicated from
> talitos_cra_init()?  It sticks out here and I had to go looking to
> find out why this channel number increments on submit.
My fault, I will correct it as below,

atomic_read((&priv->last_chan) & (priv->num_channels - 1);

> 
> 
> > +       spin_lock_irqsave(&xor_chan->desc_lock, flags);
> > +
> > +       list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q,
> node) {
> > +               status = talitos_submit(xor_chan->dev, ch, &desc-
> >hwdesc,
> > +                                       talitos_release_xor, desc);
> > +               if (status != -EINPROGRESS)
> > +                       break;
> > +
> > +               list_del(&desc->node);
> > +               list_add_tail(&desc->node, &xor_chan->in_progress_q);
> > +       }
> > +
> > +       spin_unlock_irqrestore(&xor_chan->desc_lock, flags);
> > +}
> > +
> > +static void talitos_xor_run_tx_complete_actions(struct
> talitos_xor_desc *desc,
> > +               struct talitos_xor_chan *xor_chan)
> > +{
> > +       struct device *dev = xor_chan->dev;
> > +       dma_addr_t dest, addr;
> > +       unsigned int src_cnt = desc->unmap_src_cnt;
> > +       unsigned int len = desc->unmap_len;
> > +       enum dma_ctrl_flags flags = desc->async_tx.flags;
> > +       struct dma_async_tx_descriptor *tx = &desc->async_tx;
> > +
> > +       /* unmap dma addresses */
> > +       dest = desc->hwdesc.ptr[6].ptr;
> > +       if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
> > +               dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
> > +
> > +       desc->idx = 6 - src_cnt;
> > +       if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
> > +               while(desc->idx < 6) {
> 
> Checkpatch says:
> ERROR: space required before the open parenthesis '('
I will correct it next.

> 
> 
> > +                       addr = desc->hwdesc.ptr[desc->idx++].ptr;
> > +                       if (addr == dest)
> > +                               continue;
> > +                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
> > +               }
> > +       }
> > +
> > +       /* run dependent operations */
> > +       dma_run_dependencies(tx);
> 
> Here is where we run into problems if another engine accesses these
> same buffers, especially on ARM v6+.
Do you mean in dma_run_dependencies() will occur to the async_tx descriptor of fsl-dma will be processed? I'm not clearly.


> 
> > +}
> > +
> > +static void talitos_release_xor(struct device *dev, struct
> talitos_desc *hwdesc,
> > +                               void *context, int error)
> > +{
> > +       struct talitos_xor_desc *desc = context;
> > +       struct talitos_xor_chan *xor_chan;
> > +       dma_async_tx_callback callback;
> > +       void *callback_param;
> > +
> > +       if (unlikely(error))
> > +               dev_err(dev, "xor operation: talitos error %d\n",
> error);
> > +
> > +       xor_chan = container_of(desc->async_tx.chan, struct
> talitos_xor_chan,
> > +                               common);
> > +       spin_lock_bh(&xor_chan->desc_lock);
> > +       if (xor_chan->completed_cookie < desc->async_tx.cookie)
> > +               xor_chan->completed_cookie = desc->async_tx.cookie;
> > +
> 
> Use dma_cookie_complete().
Ok, I will correct it.

> 
> > +       callback = desc->async_tx.callback;
> > +       callback_param = desc->async_tx.callback_param;
> > +
> > +       if (callback) {
> > +               spin_unlock_bh(&xor_chan->desc_lock);
> > +               callback(callback_param);
> > +               spin_lock_bh(&xor_chan->desc_lock);
> 
> As mentioned you'll either need to ensure that
> talitos_process_pending() is only called from the tasklet, or upgrade
> these locks to hardirq safe.
The point is flush_channel can be called both of interrupt and tasklet.
So the process should be redesigned that make sure talitos_process_pending() is only called from tasklet. I will fix it in next series.
Thanks.

> 
> > +       }
> > +
> > +       talitos_xor_run_tx_complete_actions(desc, xor_chan);
> > +
> > +       list_del(&desc->node);
> > +       list_add_tail(&desc->node, &xor_chan->free_desc);
> > +       spin_unlock_bh(&xor_chan->desc_lock);
> > +       if (!list_empty(&xor_chan->pending_q))
> > +               talitos_process_pending(xor_chan);
> > +}
> > +
> > +/**
> > + * talitos_issue_pending - move the descriptors in submit
> > + * queue to pending queue and submit them for processing
> > + * @chan: DMA channel
> > + */
> > +static void talitos_issue_pending(struct dma_chan *chan)
> > +{
> > +       struct talitos_xor_chan *xor_chan;
> > +
> > +       xor_chan = container_of(chan, struct talitos_xor_chan, common);
> > +       spin_lock_bh(&xor_chan->desc_lock);
> > +       list_splice_tail_init(&xor_chan->submit_q,
> > +                                &xor_chan->pending_q);
> > +       spin_unlock_bh(&xor_chan->desc_lock);
> > +       talitos_process_pending(xor_chan);
> > +}
> > +
> > +static dma_cookie_t talitos_async_tx_submit(struct
> dma_async_tx_descriptor *tx)
> > +{
> > +       struct talitos_xor_desc *desc;
> > +       struct talitos_xor_chan *xor_chan;
> > +       dma_cookie_t cookie;
> > +
> > +       desc = container_of(tx, struct talitos_xor_desc, async_tx);
> > +       xor_chan = container_of(tx->chan, struct talitos_xor_chan,
> common);
> > +
> > +       spin_lock_bh(&xor_chan->desc_lock);
> > +
> > +       cookie = xor_chan->common.cookie + 1;
> > +       if (cookie < 0)
> > +               cookie = 1;
> 
> Should use the new dma_cookie_assign() helper.
Ok.

> 
> > +
> > +       desc->async_tx.cookie = cookie;
> > +       xor_chan->common.cookie = desc->async_tx.cookie;
> > +
> > +       list_splice_tail_init(&desc->tx_list,
> > +                                &xor_chan->submit_q);
> > +
> > +       spin_unlock_bh(&xor_chan->desc_lock);
> > +
> > +       return cookie;
> > +}
> > +
> > +static struct talitos_xor_desc *talitos_xor_alloc_descriptor(
> > +                               struct talitos_xor_chan *xor_chan,
> gfp_t flags)
> > +{
> > +       struct talitos_xor_desc *desc;
> > +
> > +       desc = kmalloc(sizeof(*desc), flags);
> > +       if (desc) {
> > +               xor_chan->total_desc++;
> > +               desc->async_tx.tx_submit = talitos_async_tx_submit;
> > +       }
> > +
> > +       return desc;
> > +}
> > +
> [..]
I will remove this line.

> > +
> > +static struct dma_async_tx_descriptor *talitos_prep_dma_xor(
> > +                       struct dma_chan *chan, dma_addr_t dest,
> dma_addr_t *src,
> > +                       unsigned int src_cnt, size_t len, unsigned long
> flags)
> > +{
> > +       struct talitos_xor_chan *xor_chan;
> > +       struct talitos_xor_desc *new;
> > +       struct talitos_desc *desc;
> > +       int i, j;
> > +
> > +       BUG_ON(len > TALITOS_MAX_DATA_LEN);
> > +
> > +       xor_chan = container_of(chan, struct talitos_xor_chan, common);
> > +
> > +       spin_lock_bh(&xor_chan->desc_lock);
> > +       if (!list_empty(&xor_chan->free_desc)) {
> > +               new = container_of(xor_chan->free_desc.next,
> > +                                  struct talitos_xor_desc, node);
> > +               list_del(&new->node);
> > +       } else {
> > +                new = talitos_xor_alloc_descriptor(xor_chan,
> GFP_KERNEL | GFP_DMA);
> 
> You can't hold a spin_lock over a GFP_KERNEL allocation.
Ok, I will fix it in next.

> 
> > +       }
> > +       spin_unlock_bh(&xor_chan->desc_lock);
> > +
> > +       if (!new) {
> > +               dev_err(xor_chan->common.device->dev,
> > +                       "No free memory for XOR DMA descriptor\n");
> > +               return NULL;
> > +       }
> > +       dma_async_tx_descriptor_init(&new->async_tx, &xor_chan->common);
> > +
> > +       INIT_LIST_HEAD(&new->node);
> > +       INIT_LIST_HEAD(&new->tx_list);
> > +
> > +       desc = &new->hwdesc;
> > +       /* Set destination: Last pointer pair */
> > +       to_talitos_ptr(&desc->ptr[6], dest);
> > +       desc->ptr[6].len = cpu_to_be16(len);
> > +       desc->ptr[6].j_extent = 0;
> > +       new->unmap_src_cnt = src_cnt;
> > +       new->unmap_len = len;
> > +
> > +       /* Set Sources: End loading from second-last pointer pair */
> > +       for (i = 5, j = 0; j < src_cnt && i >= 0; i--, j++) {
> > +               to_talitos_ptr(&desc->ptr[i], src[j]);
> > +               desc->ptr[i].len = cpu_to_be16(len);
> > +               desc->ptr[i].j_extent = 0;
> > +       }
> > +
> > +       /*
> > +        * documentation states first 0 ptr/len combo marks end of
> sources
> > +        * yet device produces scatter boundary error unless all
> subsequent
> > +        * sources are zeroed out
> > +        */
> > +       for (; i >= 0; i--) {
> > +               to_talitos_ptr(&desc->ptr[i], 0);
> > +               desc->ptr[i].len = 0;
> > +               desc->ptr[i].j_extent = 0;
> > +       }
> > +
> > +       desc->hdr = DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_XOR |
> > +               DESC_HDR_TYPE_RAID_XOR;
> > +
> > +       new->async_tx.parent = NULL;
> > +       new->async_tx.next = NULL;
> > +       new->async_tx.cookie = 0;
> > +       async_tx_ack(&new->async_tx);
> > +
> > +       list_add_tail(&new->node, &new->tx_list);
> > +
> > +       new->async_tx.flags = flags;
> > +       new->async_tx.cookie = -EBUSY;
> > +
> > +       return &new->async_tx;
> > +}
> > +
> > +static void talitos_unregister_async_xor(struct device *dev)
> > +{
> > +       struct talitos_private *priv = dev_get_drvdata(dev);
> > +       struct talitos_xor_chan *xor_chan;
> > +       struct dma_chan *chan, *_chan;
> > +
> > +       if (priv->dma_dev_common.chancnt)
> > +               dma_async_device_unregister(&priv->dma_dev_common);
> > +
> > +       list_for_each_entry_safe(chan, _chan, &priv-
> >dma_dev_common.channels,
> > +                               device_node) {
> > +               xor_chan = container_of(chan, struct talitos_xor_chan,
> > +                                       common);
> > +               list_del(&chan->device_node);
> > +               priv->dma_dev_common.chancnt--;
> > +               kfree(xor_chan);
> > +       }
> > +}
> > +
> > +/**
> > + * talitos_register_dma_async - Initialize the Freescale XOR ADMA
> device
> > + * It is registered as a DMA device with the capability to perform
> > + * XOR operation with the Async_tx layer.
> > + * The various queues and channel resources are also allocated.
> > + */
> > +static int talitos_register_async_tx(struct device *dev, int
> max_xor_srcs)
> > +{
> > +       struct talitos_private *priv = dev_get_drvdata(dev);
> > +       struct dma_device *dma_dev = &priv->dma_dev_common;
> > +       struct talitos_xor_chan *xor_chan;
> > +       int err;
> > +
> > +       xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
> > +       if (!xor_chan) {
> > +               dev_err(dev, "unable to allocate xor channel\n");
> > +               return -ENOMEM;
> > +       }
> > +
> > +       dma_dev->dev = dev;
> > +       dma_dev->device_alloc_chan_resources =
> talitos_alloc_chan_resources;
> > +       dma_dev->device_free_chan_resources =
> talitos_free_chan_resources;
> > +       dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
> > +       dma_dev->max_xor = max_xor_srcs;
> > +       dma_dev->device_tx_status = talitos_is_tx_complete;
> > +       dma_dev->device_issue_pending = talitos_issue_pending;
> > +       INIT_LIST_HEAD(&dma_dev->channels);
> > +       dma_cap_set(DMA_XOR, dma_dev->cap_mask);
> > +
> > +       xor_chan->dev = dev;
> > +       xor_chan->common.device = dma_dev;
> > +       xor_chan->total_desc = 0;
> > +       INIT_LIST_HEAD(&xor_chan->submit_q);
> > +       INIT_LIST_HEAD(&xor_chan->pending_q);
> > +       INIT_LIST_HEAD(&xor_chan->in_progress_q);
> > +       INIT_LIST_HEAD(&xor_chan->free_desc);
> > +       spin_lock_init(&xor_chan->desc_lock);
> > +
> > +       list_add_tail(&xor_chan->common.device_node, &dma_dev-
> >channels);
> > +       dma_dev->chancnt++;
> > +
> > +       err = dma_async_device_register(dma_dev);
> > +       if (err) {
> > +               dev_err(dev, "Unable to register XOR with Async_tx\n");
> > +               goto err_out;
> > +       }
> > +
> > +       return err;
> > +
> > +err_out:
> > +       talitos_unregister_async_xor(dev);
> > +       return err;
> > +}
> > +#endif
> > +
> >  /*
> >   * crypto alg
> >   */
> > @@ -2891,6 +3284,26 @@ static int talitos_probe(struct platform_device
> *ofdev)
> >                         dev_info(dev, "hwrng\n");
> >         }
> >
> > +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
> 
> Kill these ifdefs in the C file.  Do something like: if
> (xor_enabled()) { } around the code sections that are optional so you
> always get compile coverage.  Where xor_enabled() is a shorter form of
> IS_ENABLED(CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR).
I'm confused how to implement this interface,
First, this feature should be added in Kconfig (default N)
Second, the only judge condition is SEC version, but some time we don't want involve function XOR even though the SEC (version is not higher than 3.0, SEC 4.0 is CAAM module) support this feature.
Last, there is not any hardware support.
Can you give me an example? Thanks.
Dan Williams - Sept. 5, 2012, 1:19 a.m.
On Tue, Sep 4, 2012 at 5:28 AM, Liu Qiang-B32616 <B32616@freescale.com> wrote:
>> Will this engine be coordinating with another to handle memory copies?
>>  The dma mapping code for async_tx/raid is broken when dma mapping
>> requests overlap or cross dma device boundaries [1].
>>
>> [1]: http://marc.info/?l=linux-arm-kernel&m=129407269402930&w=2
> Yes, it needs fsl-dma to handle memcpy copies.
> I read your link, the unmap address is stored in talitos hwdesc, the address will be unmapped when async_tx ack this descriptor, I know fsl-dma won't wait this ack flag in current kernel, so I fix it in fsl-dma patch 5/8. Do you mean that?

Unfortunately no.  I'm open to other suggestions. but as far as I can
see it requires deeper changes to rip out the dma mapping that happens
in async_tx and the automatic unmapping done by drivers.  It should
all be pushed to the client (md).

Currently async_tx hides hardware details from md such that it doesn't
even care if the operation is offloaded to hardware at all, but that
takes things too far.  In the worst case an copy->xor chain handled by
multiple channels results in :

1/ dma_map(copy_chan...)
2/ dma_map(xor_chan...)
3/ <exec copy>
4/ dma_unmap(copy_chan...)
5/ <exec xor> <---initiated by the copy_chan
6/ dma_unmap(xor_chan...)

Step 2 violates the dma api since the buffers belong to the xor_chan
until unmap.  Step 5 also causes the random completion context of the
copy channel to bleed into submission context of the xor channel which
is problematic.  So the order needs to be:

1/ dma_map(copy_chan...)
2/ <exec copy>
3/ dma_unmap(copy_chan...)
4/ dma_map(xor_chan...)
5/ <exec xor> <--initiated by md in a static context
6/ dma_unmap(xor_chan...)

Also, if xor_chan and copy_chan lie with the same dma mapping domain
(iommu or parent device) then we can map the stripe once and skip the
extra maintenance for the duration of the chain of operations.  This
dumps a lot of hardware details on md, but I think it is the only way
to get consistent semantics when arbitrary offload devices are
involved.

--
Dan
Liu Qiang-B32616 - Sept. 12, 2012, 9:45 a.m.
> >> Will this engine be coordinating with another to handle memory copies?
> >>  The dma mapping code for async_tx/raid is broken when dma mapping
> >> requests overlap or cross dma device boundaries [1].
> >>
> >> [1]: http://marc.info/?l=linux-arm-kernel&m=129407269402930&w=2
> > Yes, it needs fsl-dma to handle memcpy copies.
> > I read your link, the unmap address is stored in talitos hwdesc, the
> address will be unmapped when async_tx ack this descriptor, I know fsl-
> dma won't wait this ack flag in current kernel, so I fix it in fsl-dma
> patch 5/8. Do you mean that?
> 
> Unfortunately no.  I'm open to other suggestions. but as far as I can
> see it requires deeper changes to rip out the dma mapping that happens
> in async_tx and the automatic unmapping done by drivers.  It should
> all be pushed to the client (md).
> 
> Currently async_tx hides hardware details from md such that it doesn't
> even care if the operation is offloaded to hardware at all, but that
> takes things too far.  In the worst case an copy->xor chain handled by
> multiple channels results in :
> 
> 1/ dma_map(copy_chan...)
> 2/ dma_map(xor_chan...)
> 3/ <exec copy>
> 4/ dma_unmap(copy_chan...)
> 5/ <exec xor> <---initiated by the copy_chan
> 6/ dma_unmap(xor_chan...)
> 
> Step 2 violates the dma api since the buffers belong to the xor_chan
> until unmap.  Step 5 also causes the random completion context of the
> copy channel to bleed into submission context of the xor channel which
> is problematic.  So the order needs to be:
> 
> 1/ dma_map(copy_chan...)
> 2/ <exec copy>
> 3/ dma_unmap(copy_chan...)
> 4/ dma_map(xor_chan...)
> 5/ <exec xor> <--initiated by md in a static context
> 6/ dma_unmap(xor_chan...)
> 
> Also, if xor_chan and copy_chan lie with the same dma mapping domain
> (iommu or parent device) then we can map the stripe once and skip the
> extra maintenance for the duration of the chain of operations.  This
> dumps a lot of hardware details on md, but I think it is the only way
> to get consistent semantics when arbitrary offload devices are
> involved.
Thanks for your answer and links, I did some investigate these days,

first, powerpc processor should be hardware assured cache coherency, it should
be ok for hardware when in step 5 (but I will avoid map same address on different
device).

second, I have a workaround to make dma_map/unmap by order when using 2 different
device to offload, I will submit next descriptor until current descriptor complete,
        if (submit->flags & ASYNC_TX_ACK)
                async_tx_ack(tx);
        if (depend_tx)
                async_tx_ack(depend_tx);

+       /* do more check to support 2 devices offload? */
+       if (dma_wait_for_async_tx(tx) == DMA_ERROR)
+               panic("%s: DMA_ERROR waiting for tx\n", __func__);
}
EXPORT_SYMBOL_GPL(async_tx_submit);

Also use your example, 
1/ dma_map(copy_chan...)
2/ tx->submit(tx); async_tx_ack(tx);
3/ dma_unmap(copy_chan...)
4/ dma_map(xor_chan...)
5/ <exec xor> <-- initialized by tx->submit(tx);
6/ dma_unmap(xor_chan...)

Under this way, actually dma_run_dependency() is useless, so this can make sure copy
and xor with same page processed by order, and only one descriptor per channel is
served. dma_unmap in driver is controlled by client (tx->flags)

How's you thinking or any suggestions? I test it on our powerpc, I don't know whether
it does work on other architecture.

Thanks.

> 
> --
> Dan

Patch

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index be6b2ba..f0a7c29 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -222,6 +222,15 @@  config CRYPTO_DEV_TALITOS
 	  To compile this driver as a module, choose M here: the module
 	  will be called talitos.

+config CRYPTO_DEV_TALITOS_RAIDXOR
+	bool "Talitos RAID5 XOR Calculation Offload"
+	default y
+	select DMA_ENGINE
+	depends on CRYPTO_DEV_TALITOS
+	help
+	  Say 'Y' here to use the Freescale Security Engine (SEC) to
+	  offload RAID XOR parity Calculation
+
 config CRYPTO_DEV_IXP4XX
 	tristate "Driver for IXP4xx crypto hardware acceleration"
 	depends on ARCH_IXP4XX
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index efff788..b34264e 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -619,6 +619,399 @@  static void talitos_unregister_rng(struct device *dev)
 	hwrng_unregister(&priv->rng);
 }

+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
+				void *context, int error);
+
+static enum dma_status talitos_is_tx_complete(struct dma_chan *chan,
+					      dma_cookie_t cookie,
+					      struct dma_tx_state *state)
+{
+	struct talitos_xor_chan *xor_chan;
+	dma_cookie_t last_used;
+	dma_cookie_t last_complete;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	last_used = chan->cookie;
+	last_complete = xor_chan->completed_cookie;
+
+	if (state->last)
+		state->last = last_complete;
+
+	if (state->used)
+		state->used = last_used;
+
+	return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+static void talitos_process_pending(struct talitos_xor_chan *xor_chan)
+{
+	struct talitos_xor_desc *desc, *_desc;
+	unsigned long flags;
+	int status;
+	struct talitos_private *priv;
+	int ch;
+
+	priv = dev_get_drvdata(xor_chan->dev);
+	ch = atomic_inc_return(&priv->last_chan) &
+				  (priv->num_channels - 1);
+	spin_lock_irqsave(&xor_chan->desc_lock, flags);
+
+	list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q, node) {
+		status = talitos_submit(xor_chan->dev, ch, &desc->hwdesc,
+					talitos_release_xor, desc);
+		if (status != -EINPROGRESS)
+			break;
+
+		list_del(&desc->node);
+		list_add_tail(&desc->node, &xor_chan->in_progress_q);
+	}
+
+	spin_unlock_irqrestore(&xor_chan->desc_lock, flags);
+}
+
+static void talitos_xor_run_tx_complete_actions(struct talitos_xor_desc *desc,
+		struct talitos_xor_chan *xor_chan)
+{
+	struct device *dev = xor_chan->dev;
+	dma_addr_t dest, addr;
+	unsigned int src_cnt = desc->unmap_src_cnt;
+	unsigned int len = desc->unmap_len;
+	enum dma_ctrl_flags flags = desc->async_tx.flags;
+	struct dma_async_tx_descriptor *tx = &desc->async_tx;
+
+	/* unmap dma addresses */
+	dest = desc->hwdesc.ptr[6].ptr;
+	if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
+		dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
+
+	desc->idx = 6 - src_cnt;
+	if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
+		while(desc->idx < 6) {
+			addr = desc->hwdesc.ptr[desc->idx++].ptr;
+			if (addr == dest)
+				continue;
+			dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
+		}
+	}
+
+	/* run dependent operations */
+	dma_run_dependencies(tx);
+}
+
+static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
+				void *context, int error)
+{
+	struct talitos_xor_desc *desc = context;
+	struct talitos_xor_chan *xor_chan;
+	dma_async_tx_callback callback;
+	void *callback_param;
+
+	if (unlikely(error))
+		dev_err(dev, "xor operation: talitos error %d\n", error);
+
+	xor_chan = container_of(desc->async_tx.chan, struct talitos_xor_chan,
+				common);
+	spin_lock_bh(&xor_chan->desc_lock);
+	if (xor_chan->completed_cookie < desc->async_tx.cookie)
+		xor_chan->completed_cookie = desc->async_tx.cookie;
+
+	callback = desc->async_tx.callback;
+	callback_param = desc->async_tx.callback_param;
+
+	if (callback) {
+		spin_unlock_bh(&xor_chan->desc_lock);
+		callback(callback_param);
+		spin_lock_bh(&xor_chan->desc_lock);
+	}
+
+	talitos_xor_run_tx_complete_actions(desc, xor_chan);
+
+	list_del(&desc->node);
+	list_add_tail(&desc->node, &xor_chan->free_desc);
+	spin_unlock_bh(&xor_chan->desc_lock);
+	if (!list_empty(&xor_chan->pending_q))
+		talitos_process_pending(xor_chan);
+}
+
+/**
+ * talitos_issue_pending - move the descriptors in submit
+ * queue to pending queue and submit them for processing
+ * @chan: DMA channel
+ */
+static void talitos_issue_pending(struct dma_chan *chan)
+{
+	struct talitos_xor_chan *xor_chan;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+	spin_lock_bh(&xor_chan->desc_lock);
+	list_splice_tail_init(&xor_chan->submit_q,
+				 &xor_chan->pending_q);
+	spin_unlock_bh(&xor_chan->desc_lock);
+	talitos_process_pending(xor_chan);
+}
+
+static dma_cookie_t talitos_async_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct talitos_xor_desc *desc;
+	struct talitos_xor_chan *xor_chan;
+	dma_cookie_t cookie;
+
+	desc = container_of(tx, struct talitos_xor_desc, async_tx);
+	xor_chan = container_of(tx->chan, struct talitos_xor_chan, common);
+
+	spin_lock_bh(&xor_chan->desc_lock);
+
+	cookie = xor_chan->common.cookie + 1;
+	if (cookie < 0)
+		cookie = 1;
+
+	desc->async_tx.cookie = cookie;
+	xor_chan->common.cookie = desc->async_tx.cookie;
+
+	list_splice_tail_init(&desc->tx_list,
+				 &xor_chan->submit_q);
+
+	spin_unlock_bh(&xor_chan->desc_lock);
+
+	return cookie;
+}
+
+static struct talitos_xor_desc *talitos_xor_alloc_descriptor(
+				struct talitos_xor_chan *xor_chan, gfp_t flags)
+{
+	struct talitos_xor_desc *desc;
+
+	desc = kmalloc(sizeof(*desc), flags);
+	if (desc) {
+		xor_chan->total_desc++;
+		desc->async_tx.tx_submit = talitos_async_tx_submit;
+	}
+
+	return desc;
+}
+
+static void talitos_free_chan_resources(struct dma_chan *chan)
+{
+	struct talitos_xor_chan *xor_chan;
+	struct talitos_xor_desc *desc, *_desc;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	spin_lock_bh(&xor_chan->desc_lock);
+
+	list_for_each_entry_safe(desc, _desc, &xor_chan->submit_q, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc, &xor_chan->in_progress_q, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc, &xor_chan->free_desc, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+
+	/* Some descriptor not freed? */
+	if (unlikely(xor_chan->total_desc))
+		dev_warn(chan->device->dev, "Failed to free xor channel resource\n");
+
+	spin_unlock_bh(&xor_chan->desc_lock);
+}
+
+static int talitos_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct talitos_xor_chan *xor_chan;
+	struct talitos_xor_desc *desc;
+	LIST_HEAD(tmp_list);
+	int i;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	if (!list_empty(&xor_chan->free_desc))
+		return xor_chan->total_desc;
+
+	for (i = 0; i < TALITOS_MAX_DESCRIPTOR_NR; i++) {
+		desc = talitos_xor_alloc_descriptor(xor_chan,
+				GFP_KERNEL | GFP_DMA);
+		if (!desc) {
+			dev_err(xor_chan->common.device->dev,
+				"Only %d initial descriptors\n", i);
+			break;
+		}
+		list_add_tail(&desc->node, &tmp_list);
+	}
+
+	if (!i)
+		return -ENOMEM;
+
+	/* At least one desc is allocated */
+	spin_lock_bh(&xor_chan->desc_lock);
+	list_splice_init(&tmp_list, &xor_chan->free_desc);
+	spin_unlock_bh(&xor_chan->desc_lock);
+
+	return xor_chan->total_desc;
+}
+
+static struct dma_async_tx_descriptor *talitos_prep_dma_xor(
+			struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+			unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	struct talitos_xor_chan *xor_chan;
+	struct talitos_xor_desc *new;
+	struct talitos_desc *desc;
+	int i, j;
+
+	BUG_ON(len > TALITOS_MAX_DATA_LEN);
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	spin_lock_bh(&xor_chan->desc_lock);
+	if (!list_empty(&xor_chan->free_desc)) {
+		new = container_of(xor_chan->free_desc.next,
+				   struct talitos_xor_desc, node);
+		list_del(&new->node);
+	} else {
+		 new = talitos_xor_alloc_descriptor(xor_chan, GFP_KERNEL | GFP_DMA);
+	}
+	spin_unlock_bh(&xor_chan->desc_lock);
+
+	if (!new) {
+		dev_err(xor_chan->common.device->dev,
+			"No free memory for XOR DMA descriptor\n");
+		return NULL;
+	}
+	dma_async_tx_descriptor_init(&new->async_tx, &xor_chan->common);
+
+	INIT_LIST_HEAD(&new->node);
+	INIT_LIST_HEAD(&new->tx_list);
+
+	desc = &new->hwdesc;
+	/* Set destination: Last pointer pair */
+	to_talitos_ptr(&desc->ptr[6], dest);
+	desc->ptr[6].len = cpu_to_be16(len);
+	desc->ptr[6].j_extent = 0;
+	new->unmap_src_cnt = src_cnt;
+	new->unmap_len = len;
+
+	/* Set Sources: End loading from second-last pointer pair */
+	for (i = 5, j = 0; j < src_cnt && i >= 0; i--, j++) {
+		to_talitos_ptr(&desc->ptr[i], src[j]);
+		desc->ptr[i].len = cpu_to_be16(len);
+		desc->ptr[i].j_extent = 0;
+	}
+
+	/*
+	 * documentation states first 0 ptr/len combo marks end of sources
+	 * yet device produces scatter boundary error unless all subsequent
+	 * sources are zeroed out
+	 */
+	for (; i >= 0; i--) {
+		to_talitos_ptr(&desc->ptr[i], 0);
+		desc->ptr[i].len = 0;
+		desc->ptr[i].j_extent = 0;
+	}
+
+	desc->hdr = DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_XOR |
+		DESC_HDR_TYPE_RAID_XOR;
+
+	new->async_tx.parent = NULL;
+	new->async_tx.next = NULL;
+	new->async_tx.cookie = 0;
+	async_tx_ack(&new->async_tx);
+
+	list_add_tail(&new->node, &new->tx_list);
+
+	new->async_tx.flags = flags;
+	new->async_tx.cookie = -EBUSY;
+
+	return &new->async_tx;
+}
+
+static void talitos_unregister_async_xor(struct device *dev)
+{
+	struct talitos_private *priv = dev_get_drvdata(dev);
+	struct talitos_xor_chan *xor_chan;
+	struct dma_chan *chan, *_chan;
+
+	if (priv->dma_dev_common.chancnt)
+		dma_async_device_unregister(&priv->dma_dev_common);
+
+	list_for_each_entry_safe(chan, _chan, &priv->dma_dev_common.channels,
+				device_node) {
+		xor_chan = container_of(chan, struct talitos_xor_chan,
+					common);
+		list_del(&chan->device_node);
+		priv->dma_dev_common.chancnt--;
+		kfree(xor_chan);
+	}
+}
+
+/**
+ * talitos_register_dma_async - Initialize the Freescale XOR ADMA device
+ * It is registered as a DMA device with the capability to perform
+ * XOR operation with the Async_tx layer.
+ * The various queues and channel resources are also allocated.
+ */
+static int talitos_register_async_tx(struct device *dev, int max_xor_srcs)
+{
+	struct talitos_private *priv = dev_get_drvdata(dev);
+	struct dma_device *dma_dev = &priv->dma_dev_common;
+	struct talitos_xor_chan *xor_chan;
+	int err;
+
+	xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
+	if (!xor_chan) {
+		dev_err(dev, "unable to allocate xor channel\n");
+		return -ENOMEM;
+	}
+
+	dma_dev->dev = dev;
+	dma_dev->device_alloc_chan_resources = talitos_alloc_chan_resources;
+	dma_dev->device_free_chan_resources = talitos_free_chan_resources;
+	dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
+	dma_dev->max_xor = max_xor_srcs;
+	dma_dev->device_tx_status = talitos_is_tx_complete;
+	dma_dev->device_issue_pending = talitos_issue_pending;
+	INIT_LIST_HEAD(&dma_dev->channels);
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+
+	xor_chan->dev = dev;
+	xor_chan->common.device = dma_dev;
+	xor_chan->total_desc = 0;
+	INIT_LIST_HEAD(&xor_chan->submit_q);
+	INIT_LIST_HEAD(&xor_chan->pending_q);
+	INIT_LIST_HEAD(&xor_chan->in_progress_q);
+	INIT_LIST_HEAD(&xor_chan->free_desc);
+	spin_lock_init(&xor_chan->desc_lock);
+
+	list_add_tail(&xor_chan->common.device_node, &dma_dev->channels);
+	dma_dev->chancnt++;
+
+	err = dma_async_device_register(dma_dev);
+	if (err) {
+		dev_err(dev, "Unable to register XOR with Async_tx\n");
+		goto err_out;
+	}
+
+	return err;
+
+err_out:
+	talitos_unregister_async_xor(dev);
+	return err;
+}
+#endif
+
 /*
  * crypto alg
  */
@@ -2891,6 +3284,26 @@  static int talitos_probe(struct platform_device *ofdev)
 			dev_info(dev, "hwrng\n");
 	}

+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+	/*
+	 * register with async_tx xor, if capable
+	 * SEC 2.x support up to 3 RAID sources,
+	 * SEC 3.x support up to 6
+	 */
+	if (hw_supports(dev, DESC_HDR_SEL0_AESU | DESC_HDR_TYPE_RAID_XOR)) {
+		int max_xor_srcs = 3;
+		if (of_device_is_compatible(np, "fsl,sec3.0"))
+			max_xor_srcs = 6;
+		err = talitos_register_async_tx(dev, max_xor_srcs);
+		if (err) {
+			dev_err(dev, "failed to register async_tx xor: %d\n",
+					err);
+			goto err_out;
+		}
+		dev_info(dev, "max_xor_srcs %d\n", max_xor_srcs);
+	}
+#endif
+
 	/* register crypto algorithms the device supports */
 	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
 		if (hw_supports(dev, driver_algs[i].desc_hdr_template)) {
diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
index 61a1405..fc9d125 100644
--- a/drivers/crypto/talitos.h
+++ b/drivers/crypto/talitos.h
@@ -30,6 +30,7 @@ 

 #define TALITOS_TIMEOUT 100000
 #define TALITOS_MAX_DATA_LEN 65535
+#define TALITOS_MAX_DESCRIPTOR_NR 256

 #define DESC_TYPE(desc_hdr) ((be32_to_cpu(desc_hdr) >> 3) & 0x1f)
 #define PRIMARY_EU(desc_hdr) ((be32_to_cpu(desc_hdr) >> 28) & 0xf)
@@ -131,7 +132,57 @@  struct talitos_private {

 	/* hwrng device */
 	struct hwrng rng;
+
+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+	/* XOR Device */
+	struct dma_device dma_dev_common;
+#endif
+};
+
+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+/**
+ * talitos_xor_chan - context management for the async_tx channel
+ * @completed_cookie: the last completed cookie
+ * @desc_lock: lock for tx queue
+ * @total_desc: number of descriptors allocated
+ * @submit_q: queue of submitted descriptors
+ * @pending_q: queue of pending descriptors
+ * @in_progress_q: queue of descriptors in progress
+ * @free_desc: queue of unused descriptors
+ * @dev: talitos device implementing this channel
+ * @common: the corresponding xor channel in async_tx
+ */
+struct talitos_xor_chan {
+	dma_cookie_t completed_cookie;
+	spinlock_t desc_lock;
+	unsigned int total_desc;
+	struct list_head submit_q;
+	struct list_head pending_q;
+	struct list_head in_progress_q;
+	struct list_head free_desc;
+	struct device *dev;
+	struct dma_chan common;
+};
+
+/**
+ * talitos_xor_desc - software xor descriptor
+ * @async_tx: the referring async_tx descriptor
+ * @node:
+ * @hwdesc: h/w descriptor
+ * @unmap_src_cnt: number of xor sources
+ * @unmap_len: transaction byte count
+ * @idx: index of xor sources
+ */
+struct talitos_xor_desc {
+	struct dma_async_tx_descriptor async_tx;
+	struct list_head tx_list;
+	struct list_head node;
+	struct talitos_desc hwdesc;
+	unsigned int unmap_src_cnt;
+	unsigned int unmap_len;
+	unsigned int idx;
 };
+#endif

 extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
 			  void (*callback)(struct device *dev,
@@ -284,6 +335,7 @@  extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
 /* primary execution unit mode (MODE0) and derivatives */
 #define	DESC_HDR_MODE0_ENCRYPT		cpu_to_be32(0x00100000)
 #define	DESC_HDR_MODE0_AESU_CBC		cpu_to_be32(0x00200000)
+#define	DESC_HDR_MODE0_AESU_XOR         cpu_to_be32(0x0c600000)
 #define	DESC_HDR_MODE0_DEU_CBC		cpu_to_be32(0x00400000)
 #define	DESC_HDR_MODE0_DEU_3DES		cpu_to_be32(0x00200000)
 #define	DESC_HDR_MODE0_MDEU_CONT	cpu_to_be32(0x08000000)
@@ -344,6 +396,7 @@  extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
 #define DESC_HDR_TYPE_IPSEC_ESP			cpu_to_be32(1 << 3)
 #define DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU	cpu_to_be32(2 << 3)
 #define DESC_HDR_TYPE_HMAC_SNOOP_NO_AFEU	cpu_to_be32(4 << 3)
+#define DESC_HDR_TYPE_RAID_XOR                  cpu_to_be32(21 << 3)

 /* link table extent field bits */
 #define DESC_PTR_LNKTBL_JUMP			0x80