Message ID | 200901130343.06895.yur@emcraft.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Headers | show |
On Mon, Jan 12, 2009 at 5:43 PM, Yuri Tikhonov <yur@emcraft.com> wrote: > This adds support for doing asynchronous GF multiplication by adding > four additional functions to async_tx API: > > async_pq() does simultaneous XOR of sources and XOR of sources > GF-multiplied by given coefficients. > > async_pq_zero_sum() checks if results of calculations match given > ones. > > async_gen_syndrome() does sumultaneous XOR and R/S syndrome of sources. > > async_syndrome_zerosum() checks if results of XOR/syndrome calculation > matches given ones. > > Latter two functions just use async_pq() with the approprite coefficients > in asynchronous case but have significant optimizations if synchronous > case. > > To support this API dmaengine driver should set DMA_PQ and > DMA_PQ_ZERO_SUM capabilities and provide device_prep_dma_pq and > device_prep_dma_pqzero_sum methods in dma_device structure. > > Signed-off-by: Yuri Tikhonov <yur@emcraft.com> > Signed-off-by: Ilya Yanok <yanok@emcraft.com> > --- > crypto/async_tx/Kconfig | 4 + > crypto/async_tx/Makefile | 1 + > crypto/async_tx/async_pq.c | 615 +++++++++++++++++++++++++++++++++++++++++++ > crypto/async_tx/async_xor.c | 2 +- > include/linux/async_tx.h | 46 +++- > include/linux/dmaengine.h | 30 ++- > 6 files changed, 693 insertions(+), 5 deletions(-) > create mode 100644 crypto/async_tx/async_pq.c > > diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig > index d8fb391..cb6d731 100644 > --- a/crypto/async_tx/Kconfig > +++ b/crypto/async_tx/Kconfig > @@ -14,3 +14,7 @@ config ASYNC_MEMSET > tristate > select ASYNC_CORE > > +config ASYNC_PQ > + tristate > + select ASYNC_CORE > + > diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile > index 27baa7d..1b99265 100644 > --- a/crypto/async_tx/Makefile > +++ b/crypto/async_tx/Makefile > @@ -2,3 +2,4 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o > obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o > obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o > obj-$(CONFIG_ASYNC_XOR) += 
async_xor.o > +obj-$(CONFIG_ASYNC_PQ) += async_pq.o > diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c > new file mode 100644 > index 0000000..5871651 > --- /dev/null > +++ b/crypto/async_tx/async_pq.c > @@ -0,0 +1,615 @@ > +/* > + * Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com> > + * > + * Developed for DENX Software Engineering GmbH > + * > + * Asynchronous GF-XOR calculations ASYNC_TX API. > + * > + * based on async_xor.c code written by: > + * Dan Williams <dan.j.williams@intel.com> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the Free > + * Software Foundation; either version 2 of the License, or (at your option) > + * any later version. > + * > + * This program is distributed in the hope that it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along with > + * this program; if not, write to the Free Software Foundation, Inc., 59 > + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. > + * > + * The full GNU General Public License is included in this distribution in the > + * file called COPYING. > + */ > +#include <linux/kernel.h> > +#include <linux/interrupt.h> > +#include <linux/dma-mapping.h> > +#include <linux/raid/xor.h> > +#include <linux/async_tx.h> > + > +#include "../drivers/md/raid6.h" > + > +/** > + * The following static variables are used in cases of synchronous > + * zero sum to save the values to check. 
Two pages used for zero sum and > + * the third one is for dumb P destination when calling gen_syndrome() > + */ > +static spinlock_t spare_lock; > +static struct page *spare_pages[3]; > + > +/** > + * do_async_pq - asynchronously calculate P and/or Q > + */ > +static struct dma_async_tx_descriptor * > +do_async_pq(struct dma_chan *chan, struct page **blocks, unsigned char *scfs, > + unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags, > + struct dma_async_tx_descriptor *depend_tx, > + dma_async_tx_callback cb_fn, void *cb_param) > +{ > + struct dma_device *dma = chan->device; > + dma_addr_t dma_dest[2], dma_src[src_cnt]; > + struct dma_async_tx_descriptor *tx = NULL; > + dma_async_tx_callback _cb_fn; > + void *_cb_param; > + unsigned char *scf = NULL; > + int i, src_off = 0; > + unsigned short pq_src_cnt; > + enum async_tx_flags async_flags; > + enum dma_ctrl_flags dma_flags = 0; > + > + /* If we won't handle src_cnt in one shot, then the following > + * flag(s) will be set only on the first pass of prep_dma > + */ > + if (flags & ASYNC_TX_PQ_ZERO_P) > + dma_flags |= DMA_PREP_ZERO_P; > + if (flags & ASYNC_TX_PQ_ZERO_Q) > + dma_flags |= DMA_PREP_ZERO_Q; > + > + /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */ > + if (blocks[src_cnt]) { > + dma_dest[0] = dma_map_page(dma->dev, blocks[src_cnt], > + offset, len, DMA_BIDIRECTIONAL); > + dma_flags |= DMA_PREP_HAVE_P; > + } > + if (blocks[src_cnt+1]) { > + dma_dest[1] = dma_map_page(dma->dev, blocks[src_cnt+1], > + offset, len, DMA_BIDIRECTIONAL); > + dma_flags |= DMA_PREP_HAVE_Q; > + } > + > + for (i = 0; i < src_cnt; i++) > + dma_src[i] = dma_map_page(dma->dev, blocks[i], > + offset, len, DMA_TO_DEVICE); > + > + while (src_cnt) { > + async_flags = flags; > + pq_src_cnt = min(src_cnt, (int)dma->max_pq); > + /* if we are submitting additional pqs, leave the chain open, > + * clear the callback parameters, and leave the destination > + * buffers mapped > + */ > + if (src_cnt > 
pq_src_cnt) { > + async_flags &= ~ASYNC_TX_ACK; > + dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP; > + _cb_fn = NULL; > + _cb_param = NULL; > + } else { > + _cb_fn = cb_fn; > + _cb_param = cb_param; > + } > + if (_cb_fn) > + dma_flags |= DMA_PREP_INTERRUPT; > + if (scfs) > + scf = &scfs[src_off]; > + > + /* Since we have clobbered the src_list we are committed > + * to doing this asynchronously. Drivers force forward > + * progress in case they can not provide a descriptor > + */ > + tx = dma->device_prep_dma_pq(chan, dma_dest, > + &dma_src[src_off], pq_src_cnt, > + scf, len, dma_flags); > + if (unlikely(!tx)) > + async_tx_quiesce(&depend_tx); > + > + /* spin wait for the preceeding transactions to complete */ > + while (unlikely(!tx)) { > + dma_async_issue_pending(chan); > + tx = dma->device_prep_dma_pq(chan, dma_dest, > + &dma_src[src_off], pq_src_cnt, > + scf, len, dma_flags); > + } > + > + async_tx_submit(chan, tx, async_flags, depend_tx, > + _cb_fn, _cb_param); > + > + depend_tx = tx; > + flags |= ASYNC_TX_DEP_ACK; > + > + if (src_cnt > pq_src_cnt) { > + /* drop completed sources */ > + src_cnt -= pq_src_cnt; > + src_off += pq_src_cnt; > + > + /* use the intermediate result as a source; we > + * clear DMA_PREP_ZERO, so prep_dma_pq will > + * include destination(s) into calculations. Thus > + * keep DMA_PREP_HAVE_x in dma_flags only > + */ > + dma_flags &= (DMA_PREP_HAVE_P | DMA_PREP_HAVE_Q); I don't think this will work as we will be mixing Q into the new P and P into the new Q. In order to support (src_cnt > device->max_pq) we need to explicitly tell the driver that the operation is being continued (DMA_PREP_CONTINUE) and to apply different coeffeicients to P and Q to cancel the effect of including them as sources. Here is an example of supporting a 5 source pq operation where max_pq == 4 (the minimum). 
p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08})) p', q' = PQ(p, q, q, src4, COEF({00}, {01}, {00}, {10})) p' = p + q + q + src4 = p + src4 = P q' = {00}*p + {01}*q + {00}*q + {10}*src4 = q + {10}*src4 = Q ...at no point do we need to zero P or Q. Yes, this requires a lot of extra work for incremental sources, but at this point I do not see a cleaner alternative for engines like iop13xx. > + } else > + break; > + } > + > + return tx; > +} > + > +/** > + * do_sync_pq - synchronously calculate P and Q > + */ > +static void > +do_sync_pq(struct page **blocks, unsigned char *scfs, unsigned int offset, > + int src_cnt, size_t len, enum async_tx_flags flags, > + struct dma_async_tx_descriptor *depend_tx, > + dma_async_tx_callback cb_fn, void *cb_param) > +{ > + int i, pos; > + uint8_t *p = NULL, *q = NULL, *src; > + > + /* set destination addresses */ > + if (blocks[src_cnt]) > + p = (uint8_t *)(page_address(blocks[src_cnt]) + offset); > + if (blocks[src_cnt+1]) > + q = (uint8_t *)(page_address(blocks[src_cnt+1]) + offset); > + > + if (flags & ASYNC_TX_PQ_ZERO_P) { > + BUG_ON(!p); > + memset(p, 0, len); > + } > + > + if (flags & ASYNC_TX_PQ_ZERO_Q) { > + BUG_ON(!q); > + memset(q, 0, len); > + } > + > + for (i = 0; i < src_cnt; i++) { > + src = (uint8_t *)(page_address(blocks[i]) + offset); > + for (pos = 0; pos < len; pos++) { > + if (p) > + p[pos] ^= src[pos]; > + if (q) > + q[pos] ^= raid6_gfmul[scfs[i]][src[pos]]; > + } > + } > + async_tx_sync_epilog(cb_fn, cb_param); > +} sync_pq like sync_gensyndrome should not care about the current contents of p and q, just regenerate from the current sources. This kills another site where ASYNC_TX_PQ_ZERO_{P,Q} is used. > + > +/** > + * async_pq - attempt to do XOR and Galois calculations in parallel using > + * a dma engine. > + * @blocks: source block array from 0 to (src_cnt-1) with the p destination > + * at blocks[src_cnt] and q at blocks[src_cnt + 1]. 
Only one of two > + * destinations may be present (another then has to be set to NULL). > + * By default, the result of calculations is XOR-ed with the initial > + * content of the destinationa buffers. Use ASYNC_TX_PQ_ZERO_x flags > + * to avoid this. > + * NOTE: client code must assume the contents of this array are destroyed > + * @scfs: array of source coefficients used in GF-multiplication > + * @offset: offset in pages to start transaction > + * @src_cnt: number of source pages > + * @len: length in bytes > + * @flags: ASYNC_TX_PQ_ZERO_P, ASYNC_TX_PQ_ZERO_Q, ASYNC_TX_ASSUME_COHERENT, > + * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY > + * @depend_tx: depends on the result of this transaction. > + * @cb_fn: function to call when the operation completes > + * @cb_param: parameter to pass to the callback routine > + */ > +struct dma_async_tx_descriptor * > +async_pq(struct page **blocks, unsigned char *scfs, unsigned int offset, > + int src_cnt, size_t len, enum async_tx_flags flags, > + struct dma_async_tx_descriptor *depend_tx, > + dma_async_tx_callback cb_fn, void *cb_param) > +{ > + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ, > + &blocks[src_cnt], 2, > + blocks, src_cnt, len); > + struct dma_device *device = chan ? chan->device : NULL; > + struct dma_async_tx_descriptor *tx = NULL; > + > + if (!device && (flags & ASYNC_TX_ASYNC_ONLY)) > + return NULL; > + > + if (device) { > + /* run pq asynchronously */ > + tx = do_async_pq(chan, blocks, scfs, offset, src_cnt, > + len, flags, depend_tx, cb_fn,cb_param); > + } else { > + /* run pq synchronously */ > + if (!blocks[src_cnt+1]) { > + struct page *pdst = blocks[src_cnt]; > + int i; > + > + /* Calculate P-parity only. > + * As opposite to async_xor(), async_pq() assumes > + * that destinations are included into calculations, > + * so we should re-arrange the xor src list to > + * achieve the similar behavior. 
> + */ > + if (!(flags & ASYNC_TX_PQ_ZERO_P)) { > + /* If async_pq() user doesn't set ZERO flag, > + * it's assumed that destination has some > + * reasonable data to include in calculations. > + * The destination must be at position 0, so > + * shift the sources and put pdst at the > + * beginning of the list. > + */ > + for (i = src_cnt - 1; i >= 0; i--) > + blocks[i+1] = blocks[i]; > + blocks[0] = pdst; > + src_cnt++; > + flags |= ASYNC_TX_XOR_DROP_DST; > + } else { > + /* If async_pq() user want to clear P, then > + * this will be done automatically in async > + * case, and with the help of ZERO_DST in > + * the sync one. > + */ > + flags &= ~ASYNC_TX_PQ_ZERO_P; > + flags |= ASYNC_TX_XOR_ZERO_DST; > + } > + > + return async_xor(pdst, blocks, offset, > + src_cnt, len, flags, depend_tx, > + cb_fn, cb_param); If we assume that async_pq always regenerates parity and never reuses the old value then we can get gid of the !(flags & ASYNC_TX_PQ_ZERO_P) path. In the case where code does need to reuse the old P, async_r6recov.c, it should call async_xor directly since that routine provides this semantic. > + } > + > + /* wait for any prerequisite operations */ > + async_tx_quiesce(&depend_tx); > + > + do_sync_pq(blocks, scfs, offset, src_cnt, len, flags, > + depend_tx, cb_fn, cb_param); > + } > + > + return tx; > +} > +EXPORT_SYMBOL_GPL(async_pq); > + > +/** > + * do_sync_gen_syndrome - synchronously calculate P (xor) and Q (Reed-Solomon > + * code) > + */ > +static void > +do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt, > + size_t len, enum async_tx_flags flags, > + struct dma_async_tx_descriptor *depend_tx, > + dma_async_tx_callback cb_fn, void *cb_param) > +{ > + int i; > + void *tsrc[src_cnt+2]; > + > + for (i = 0; i < src_cnt + 2; i++) > + tsrc[i] = page_address(blocks[i]) + offset; > + > + raid6_call.gen_syndrome(i, len, tsrc); > + > + async_tx_sync_epilog(cb_fn, cb_param); > +} > + [..] 
> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h > index 64dea2a..4a72082 100644 > --- a/include/linux/dmaengine.h > +++ b/include/linux/dmaengine.h > @@ -55,7 +55,7 @@ enum dma_status { > enum dma_transaction_type { > DMA_MEMCPY, > DMA_XOR, > - DMA_PQ_XOR, > + DMA_PQ, > DMA_DUAL_XOR, > DMA_PQ_UPDATE, > DMA_ZERO_SUM, > @@ -81,14 +81,28 @@ enum dma_transaction_type { > * dependency chains > * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) > * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) > + * @DMA_PREP_HAVE_P - set if the destination list includes the correct > + * address of P (P-parity should be handled) > + * @DMA_PREP_HAVE_Q - set if the destination list includes the correct > + * address of Q (Q-parity should be handled) > + * @DMA_PREP_ZERO_P - set if P has to be zeroed before proceeding > + * @DMA_PREP_ZERO_Q - set if Q has to be zeroed before proceeding > */ > enum dma_ctrl_flags { > DMA_PREP_INTERRUPT = (1 << 0), > DMA_CTRL_ACK = (1 << 1), > DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2), > DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), > + > + DMA_PREP_HAVE_P = (1 << 4), > + DMA_PREP_HAVE_Q = (1 << 5), > + DMA_PREP_ZERO_P = (1 << 6), > + DMA_PREP_ZERO_Q = (1 << 7), > }; > > +#define DMA_PCHECK_FAILED (1 << 0) > +#define DMA_QCHECK_FAILED (1 << 1) Perhaps turn these into an enum such that we can pass around a enum pq_check_flags pointer rather than a non-descript u32 *. > + > /** > * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t. 
> * See linux/cpumask.h > @@ -211,6 +225,7 @@ struct dma_async_tx_descriptor { > * @global_node: list_head for global dma_device_list > * @cap_mask: one or more dma_capability flags > * @max_xor: maximum number of xor sources, 0 if no capability > + * @max_pq: maximum number of PQ sources, 0 if no capability > * @refcount: reference count > * @done: IO completion struct > * @dev_id: unique device ID > @@ -220,7 +235,9 @@ struct dma_async_tx_descriptor { > * @device_free_chan_resources: release DMA channel's resources > * @device_prep_dma_memcpy: prepares a memcpy operation > * @device_prep_dma_xor: prepares a xor operation > + * @device_prep_dma_pq: prepares a pq operation > * @device_prep_dma_zero_sum: prepares a zero_sum operation > + * @device_prep_dma_pqzero_sum: prepares a pqzero_sum operation > * @device_prep_dma_memset: prepares a memset operation > * @device_prep_dma_interrupt: prepares an end of chain interrupt operation > * @device_prep_slave_sg: prepares a slave dma operation > @@ -233,7 +250,8 @@ struct dma_device { > struct list_head channels; > struct list_head global_node; > dma_cap_mask_t cap_mask; > - int max_xor; > + unsigned short max_xor; > + unsigned short max_pq; > > int dev_id; > struct device *dev; > @@ -247,9 +265,17 @@ struct dma_device { > struct dma_async_tx_descriptor *(*device_prep_dma_xor)( > struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, > unsigned int src_cnt, size_t len, unsigned long flags); > + struct dma_async_tx_descriptor *(*device_prep_dma_pq)( > + struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, > + unsigned int src_cnt, unsigned char *scf, > + size_t len, unsigned long flags); > struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)( > struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, > size_t len, u32 *result, unsigned long flags); > + struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)( > + struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, > + unsigned char *scf, 
size_t len, u32 *pqres, > + unsigned long flags); > struct dma_async_tx_descriptor *(*device_prep_dma_memset)( > struct dma_chan *chan, dma_addr_t dest, int value, size_t len, > unsigned long flags); > -- > 1.6.0.6 > Regards, Dan
Hello Dan, Thanks for review. Some comments below. On Thursday, January 15, 2009 you wrote: [..] >> +/** >> + * do_async_pq - asynchronously calculate P and/or Q >> + */ >> +static struct dma_async_tx_descriptor * >> +do_async_pq(struct dma_chan *chan, struct page **blocks, unsigned char *scfs, >> + unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags, >> + struct dma_async_tx_descriptor *depend_tx, >> + dma_async_tx_callback cb_fn, void *cb_param) >> +{ >> + struct dma_device *dma = chan->device; >> + dma_addr_t dma_dest[2], dma_src[src_cnt]; >> + struct dma_async_tx_descriptor *tx = NULL; >> + dma_async_tx_callback _cb_fn; >> + void *_cb_param; >> + unsigned char *scf = NULL; >> + int i, src_off = 0; >> + unsigned short pq_src_cnt; >> + enum async_tx_flags async_flags; >> + enum dma_ctrl_flags dma_flags = 0; >> + >> + /* If we won't handle src_cnt in one shot, then the following >> + * flag(s) will be set only on the first pass of prep_dma >> + */ >> + if (flags & ASYNC_TX_PQ_ZERO_P) >> + dma_flags |= DMA_PREP_ZERO_P; >> + if (flags & ASYNC_TX_PQ_ZERO_Q) >> + dma_flags |= DMA_PREP_ZERO_Q; >> + >> + /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */ >> + if (blocks[src_cnt]) { >> + dma_dest[0] = dma_map_page(dma->dev, blocks[src_cnt], >> + offset, len, DMA_BIDIRECTIONAL); >> + dma_flags |= DMA_PREP_HAVE_P; >> + } >> + if (blocks[src_cnt+1]) { >> + dma_dest[1] = dma_map_page(dma->dev, blocks[src_cnt+1], >> + offset, len, DMA_BIDIRECTIONAL); >> + dma_flags |= DMA_PREP_HAVE_Q; >> + } >> + >> + for (i = 0; i < src_cnt; i++) >> + dma_src[i] = dma_map_page(dma->dev, blocks[i], >> + offset, len, DMA_TO_DEVICE); >> + >> + while (src_cnt) { >> + async_flags = flags; >> + pq_src_cnt = min(src_cnt, (int)dma->max_pq); >> + /* if we are submitting additional pqs, leave the chain open, >> + * clear the callback parameters, and leave the destination >> + * buffers mapped >> + */ >> + if (src_cnt > pq_src_cnt) { >> + async_flags &= 
~ASYNC_TX_ACK; >> + dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP; >> + _cb_fn = NULL; >> + _cb_param = NULL; >> + } else { >> + _cb_fn = cb_fn; >> + _cb_param = cb_param; >> + } >> + if (_cb_fn) >> + dma_flags |= DMA_PREP_INTERRUPT; >> + if (scfs) >> + scf = &scfs[src_off]; >> + >> + /* Since we have clobbered the src_list we are committed >> + * to doing this asynchronously. Drivers force forward >> + * progress in case they can not provide a descriptor >> + */ >> + tx = dma->device_prep_dma_pq(chan, dma_dest, >> + &dma_src[src_off], pq_src_cnt, >> + scf, len, dma_flags); >> + if (unlikely(!tx)) >> + async_tx_quiesce(&depend_tx); >> + >> + /* spin wait for the preceeding transactions to complete */ >> + while (unlikely(!tx)) { >> + dma_async_issue_pending(chan); >> + tx = dma->device_prep_dma_pq(chan, dma_dest, >> + &dma_src[src_off], pq_src_cnt, >> + scf, len, dma_flags); >> + } >> + >> + async_tx_submit(chan, tx, async_flags, depend_tx, >> + _cb_fn, _cb_param); >> + >> + depend_tx = tx; >> + flags |= ASYNC_TX_DEP_ACK; >> + >> + if (src_cnt > pq_src_cnt) { >> + /* drop completed sources */ >> + src_cnt -= pq_src_cnt; >> + src_off += pq_src_cnt; >> + >> + /* use the intermediate result as a source; we >> + * clear DMA_PREP_ZERO, so prep_dma_pq will >> + * include destination(s) into calculations. Thus >> + * keep DMA_PREP_HAVE_x in dma_flags only >> + */ >> + dma_flags &= (DMA_PREP_HAVE_P | DMA_PREP_HAVE_Q); > I don't think this will work as we will be mixing Q into the new P and > P into the new Q. In order to support (src_cnt > device->max_pq) we > need to explicitly tell the driver that the operation is being > continued (DMA_PREP_CONTINUE) and to apply different coeffeicients to > P and Q to cancel the effect of including them as sources. With DMA_PREP_ZERO_P/Q approach, the Q isn't mixed into new P, and P isn't mixed into new Q. 
For your example of max_pq=4: p, q = PQ(src0, src1, src2, src3, src4, COEF({01}, {02}, {04}, {08}, {10})) with the current implementation will be split into: p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08}) p`,q` = PQ(src4, COEF({10})) which will result to the following: p = ((dma_flags & DMA_PREP_ZERO_P) ? 0 : old_p) + src0 + src1 + src2 + src3 q = ((dma_flags & DMA_PREP_ZERO_Q) ? 0 : old_q) + {01}*src0 + {02}*src1 + {04}*src2 + {08}*src3 p` = p + src4 q` = q + {10}*src4 But, if we get rid of DMA_PREP_ZERO_P/Q, then the mess with P/Q will have a place indeed. > Here is an > example of supporting a 5 source pq operation where max_pq == 4 (the > minimum). > p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08})) > p', q' = PQ(p, q, q, src4, COEF({00}, {01}, {00}, {10})) > p' = p + q + q + src4 = p + src4 = P > q' = {00}*p + {01}*q + {00}*q + {10}*src4 = q + {10)*src4 = Q > ...at no point do we need to zero P or Q. Yes, this requires a lot of > extra work for incremental sources, I would say, that 'very very lot'. In general this means that for the cases of N sources > max_pq we'll have to do: C = 1 + ceil((N-max_pq)/(max_pq - 3)) number of calls to ADMA. E.g., for max_pq = 4: N = 5 => C = 2, N = 6 => C = 3, .. N = 15 => C = 12, N = 16 => C = 13, .. N = 128 => C = 125. If we stay with the current approach of using DMA_PREP_ZERO_P/Q, then C = 1 + ceil((N-max_pq)/max_pq)) number of calls to ADMA. And the same series will result to: N = 5 => C = 2, N = 6 => C = 2, .. N = 15 => C = 4, N = 16 => C = 4, .. N = 128 => C = 32. I'm afraid that the difference (13/4, 125/32) is very significant, so getting rid of DMA_PREP_ZERO_P/Q will eat most of the improvement which could be achieved with the current approach. > but at this point I do not see a cleaner alternatve for engines like iop13xx. 
I can't find any description of iop13xx processors at Intel's web-site, only 3xx: http://www.intel.com/design/iio/index.htm?iid=ipp_embed+embed_io So, it's hard for me to do any suggestions. I just wonder - doesn't iop13xx allow users to program destination addresses into the sources fields of descriptors? >> + } else >> + break; >> + } >> + >> + return tx; >> +} >> + >> +/** >> + * do_sync_pq - synchronously calculate P and Q >> + */ >> +static void >> +do_sync_pq(struct page **blocks, unsigned char *scfs, unsigned int offset, >> + int src_cnt, size_t len, enum async_tx_flags flags, >> + struct dma_async_tx_descriptor *depend_tx, >> + dma_async_tx_callback cb_fn, void *cb_param) >> +{ >> + int i, pos; >> + uint8_t *p = NULL, *q = NULL, *src; >> + >> + /* set destination addresses */ >> + if (blocks[src_cnt]) >> + p = (uint8_t *)(page_address(blocks[src_cnt]) + offset); >> + if (blocks[src_cnt+1]) >> + q = (uint8_t *)(page_address(blocks[src_cnt+1]) + offset); >> + >> + if (flags & ASYNC_TX_PQ_ZERO_P) { >> + BUG_ON(!p); >> + memset(p, 0, len); >> + } >> + >> + if (flags & ASYNC_TX_PQ_ZERO_Q) { >> + BUG_ON(!q); >> + memset(q, 0, len); >> + } >> + >> + for (i = 0; i < src_cnt; i++) { >> + src = (uint8_t *)(page_address(blocks[i]) + offset); >> + for (pos = 0; pos < len; pos++) { >> + if (p) >> + p[pos] ^= src[pos]; >> + if (q) >> + q[pos] ^= raid6_gfmul[scfs[i]][src[pos]]; >> + } >> + } >> + async_tx_sync_epilog(cb_fn, cb_param); >> +} > sync_pq like sync_gensyndrome should not care about the current > contents of p and q, just regenerate from the current sources. This > kills another site where ASYNC_TX_PQ_ZERO_{P,Q} is used. Well, perhaps you are right. The ASYNC_TX_PQ_ZERO_{P,Q} is set for the most common cases of using async_pq, i.e. the parity generating. The wrap-around async_gen_syndrome() function always set these flags before calling async_pq(). 
The cases where ASYNC_TX_PQ_ZERO_{P,Q} isn't set are: (a) async_pq can't process the sources in one shot because of src_cnt > max_pq, so it should re-use the intermediate results (destination) as the sources; (b) async_r6_dd_recov() does XOR with async_pq() assuming re-using the destination as the source. So, I would say that ASYNC_TX_PQ_ZERO_{P,Q} should definitely go away, if there were no significant overheads in (a) implemented without these flags (see above). >> + >> +/** >> + * async_pq - attempt to do XOR and Galois calculations in parallel using >> + * a dma engine. >> + * @blocks: source block array from 0 to (src_cnt-1) with the p destination >> + * at blocks[src_cnt] and q at blocks[src_cnt + 1]. Only one of two >> + * destinations may be present (another then has to be set to NULL). >> + * By default, the result of calculations is XOR-ed with the initial >> + * content of the destinationa buffers. Use ASYNC_TX_PQ_ZERO_x flags >> + * to avoid this. >> + * NOTE: client code must assume the contents of this array are destroyed >> + * @scfs: array of source coefficients used in GF-multiplication >> + * @offset: offset in pages to start transaction >> + * @src_cnt: number of source pages >> + * @len: length in bytes >> + * @flags: ASYNC_TX_PQ_ZERO_P, ASYNC_TX_PQ_ZERO_Q, ASYNC_TX_ASSUME_COHERENT, >> + * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY >> + * @depend_tx: depends on the result of this transaction. 
>> + * @cb_fn: function to call when the operation completes >> + * @cb_param: parameter to pass to the callback routine >> + */ >> +struct dma_async_tx_descriptor * >> +async_pq(struct page **blocks, unsigned char *scfs, unsigned int offset, >> + int src_cnt, size_t len, enum async_tx_flags flags, >> + struct dma_async_tx_descriptor *depend_tx, >> + dma_async_tx_callback cb_fn, void *cb_param) >> +{ >> + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ, >> + &blocks[src_cnt], 2, >> + blocks, src_cnt, len); >> + struct dma_device *device = chan ? chan->device : NULL; >> + struct dma_async_tx_descriptor *tx = NULL; >> + >> + if (!device && (flags & ASYNC_TX_ASYNC_ONLY)) >> + return NULL; >> + >> + if (device) { >> + /* run pq asynchronously */ >> + tx = do_async_pq(chan, blocks, scfs, offset, src_cnt, >> + len, flags, depend_tx, cb_fn,cb_param); >> + } else { >> + /* run pq synchronously */ >> + if (!blocks[src_cnt+1]) { >> + struct page *pdst = blocks[src_cnt]; >> + int i; >> + >> + /* Calculate P-parity only. >> + * As opposite to async_xor(), async_pq() assumes >> + * that destinations are included into calculations, >> + * so we should re-arrange the xor src list to >> + * achieve the similar behavior. >> + */ >> + if (!(flags & ASYNC_TX_PQ_ZERO_P)) { >> + /* If async_pq() user doesn't set ZERO flag, >> + * it's assumed that destination has some >> + * reasonable data to include in calculations. >> + * The destination must be at position 0, so >> + * shift the sources and put pdst at the >> + * beginning of the list. >> + */ >> + for (i = src_cnt - 1; i >= 0; i--) >> + blocks[i+1] = blocks[i]; >> + blocks[0] = pdst; >> + src_cnt++; >> + flags |= ASYNC_TX_XOR_DROP_DST; >> + } else { >> + /* If async_pq() user want to clear P, then >> + * this will be done automatically in async >> + * case, and with the help of ZERO_DST in >> + * the sync one. 
>> + */ >> + flags &= ~ASYNC_TX_PQ_ZERO_P; >> + flags |= ASYNC_TX_XOR_ZERO_DST; >> + } >> + >> + return async_xor(pdst, blocks, offset, >> + src_cnt, len, flags, depend_tx, >> + cb_fn, cb_param); > If we assume that async_pq always regenerates parity and never reuses > the old value then we can get gid of the !(flags & ASYNC_TX_PQ_ZERO_P) > path. In the case where code does need to reuse the old P, > async_r6recov.c, it should call async_xor directly since that routine > provides this semantic. Right. The question is - will we get rid of ZERO_P/Q or not. [..] >> @@ -81,14 +81,28 @@ enum dma_transaction_type { >> * dependency chains >> * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) >> * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) >> + * @DMA_PREP_HAVE_P - set if the destination list includes the correct >> + * address of P (P-parity should be handled) >> + * @DMA_PREP_HAVE_Q - set if the destination list includes the correct >> + * address of Q (Q-parity should be handled) >> + * @DMA_PREP_ZERO_P - set if P has to be zeroed before proceeding >> + * @DMA_PREP_ZERO_Q - set if Q has to be zeroed before proceeding >> */ >> enum dma_ctrl_flags { >> DMA_PREP_INTERRUPT = (1 << 0), >> DMA_CTRL_ACK = (1 << 1), >> DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2), >> DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), >> + >> + DMA_PREP_HAVE_P = (1 << 4), >> + DMA_PREP_HAVE_Q = (1 << 5), >> + DMA_PREP_ZERO_P = (1 << 6), >> + DMA_PREP_ZERO_Q = (1 << 7), >> }; >> >> +#define DMA_PCHECK_FAILED (1 << 0) >> +#define DMA_QCHECK_FAILED (1 << 1) > Perhaps turn these into an enum such that we can pass around a enum > pq_check_flags pointer rather than a non-descript u32 *. Agree. Regards, Yuri -- Yuri Tikhonov, Senior Software Engineer Emcraft Systems, www.emcraft.com
On Fri, Jan 16, 2009 at 4:41 AM, Yuri Tikhonov <yur@emcraft.com> wrote: >> I don't think this will work as we will be mixing Q into the new P and >> P into the new Q. In order to support (src_cnt > device->max_pq) we >> need to explicitly tell the driver that the operation is being >> continued (DMA_PREP_CONTINUE) and to apply different coeffeicients to >> P and Q to cancel the effect of including them as sources. > > With DMA_PREP_ZERO_P/Q approach, the Q isn't mixed into new P, and P > isn't mixed into new Q. For your example of max_pq=4: > > p, q = PQ(src0, src1, src2, src3, src4, COEF({01}, {02}, {04}, {08}, {10})) > > with the current implementation will be split into: > > p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08}) > p`,q` = PQ(src4, COEF({10})) > > which will result to the following: > > p = ((dma_flags & DMA_PREP_ZERO_P) ? 0 : old_p) + src0 + src1 + src2 + src3 > q = ((dma_flags & DMA_PREP_ZERO_Q) ? 0 : old_q) + {01}*src0 + {02}*src1 + {04}*src2 + {08}*src3 > > p` = p + src4 > q` = q + {10}*src4 > Huh? Does the ppc440spe engine have some notion of flagging a source as old_p/old_q? Otherwise I do not see how the engine will not turn this into: p` = p + src4 + q q` = q + {10}*src4 + {x}*p I think you missed the fact that we have passed p and q back in as sources. Unless we have multiple p destinations and multiple q destinations, or hardware support for continuations I do not see how you can guarantee this split. > I'm afraid that the difference (13/4, 125/32) is very significant, so > getting rid of DMA_PREP_ZERO_P/Q will eat most of the improvement > which could be achieved with the current approach. Data corruption is a slightly higher cost :-). > >> but at this point I do not see a cleaner alternatve for engines like iop13xx. > > I can't find any description of iop13xx processors at Intel's > web-site, only 3xx: > > http://www.intel.com/design/iio/index.htm?iid=ipp_embed+embed_io > > So, it's hard for me to do any suggestions. 
I just wonder - doesn't > iop13xx allow users to program destination addresses into the sources > fields of descriptors? Yes it does, but the engine does not know it is a destination. Take a look at page 496 of the following and tell me if you come to a different conclusion. http://download.intel.com/design/iio/docs/31503602.pdf Thanks, Dan
Hello Dan, On Friday, January 16, 2009 you wrote: > On Fri, Jan 16, 2009 at 4:41 AM, Yuri Tikhonov <yur@emcraft.com> wrote: >>> I don't think this will work as we will be mixing Q into the new P and >>> P into the new Q. In order to support (src_cnt > device->max_pq) we >>> need to explicitly tell the driver that the operation is being >>> continued (DMA_PREP_CONTINUE) and to apply different coeffeicients to >>> P and Q to cancel the effect of including them as sources. >> >> With DMA_PREP_ZERO_P/Q approach, the Q isn't mixed into new P, and P >> isn't mixed into new Q. For your example of max_pq=4: >> >> p, q = PQ(src0, src1, src2, src3, src4, COEF({01}, {02}, {04}, {08}, {10})) >> >> with the current implementation will be split into: >> >> p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08}) >> p`,q` = PQ(src4, COEF({10})) >> >> which will result to the following: >> >> p = ((dma_flags & DMA_PREP_ZERO_P) ? 0 : old_p) + src0 + src1 + src2 + src3 >> q = ((dma_flags & DMA_PREP_ZERO_Q) ? 0 : old_q) + {01}*src0 + {02}*src1 + {04}*src2 + {08}*src3 >> >> p` = p + src4 >> q` = q + {10}*src4 >> > Huh? Does the ppc440spe engine have some notion of flagging a source > as old_p/old_q? Otherwise I do not see how the engine will not turn > this into: > p` = p + src4 + q > q` = q + {10}*src4 + {x}*p > I think you missed the fact that we have passed p and q back in as > sources. Unless we have multiple p destinations and multiple q > destinations, or hardware support for continuations I do not see how > you can guarantee this split. I guess, I've got your point. You are missing the fact that destinations for 'p' and 'q' are passed in device_prep_dma_pq() method separately from sources. Speaking your words: we do not have multiple destinations through the while() cycles, the destinations are the same in each pass. 
Please look at do_async_pq() implementation more carefully: 'blocks' is a pointer to 'src_cnt' sources _plus_ two destination pages (as it's stated in async_pq() description). Before coming into the while() cycle we save destinations in the dma_dest[] array, and then pass this to device_prep_dma_pq() in each (src_cnt/max_pq) cycle. That is, we do not pass destinations as the sources explicitly: we just clear DMA_PREP_ZERO_P/Q flags to notify the ADMA level that it has to XOR the current content of destination(s) with the result of the new operation. >> I'm afraid that the difference (13/4, 125/32) is very significant, so >> getting rid of DMA_PREP_ZERO_P/Q will eat most of the improvement >> which could be achieved with the current approach. > Data corruption is a slightly higher cost :-). >> >>> but at this point I do not see a cleaner alternatve for engines like iop13xx. >> >> I can't find any description of iop13xx processors at Intel's >> web-site, only 3xx: >> >> http://www.intel.com/design/iio/index.htm?iid=ipp_embed+embed_io >> >> So, it's hard for me to do any suggestions. I just wonder - doesn't >> iop13xx allow users to program destination addresses into the sources >> fields of descriptors? > Yes it does, but the engine does not know it is a destination. > Take a look at page 496 of the following and tell me if you come to a > different conclusion. > http://download.intel.com/design/iio/docs/31503602.pdf I see. The major difference in the implementation of support for P+Q in ppc440spe DMA engines is that ppc440spe allows to include (xor) the previous content of P_Result and/or Q_Result just by setting a corresponding indication in the destination (P_Result and/or Q_Result) address(es) The "5.7.5 P+Q Update Operation" case won't help here, since, if I understand it right, it doesn't allow to set up different multipliers for Old and New Data.
So, it looks like your approach: p', q' = PQ(p, q, q, src4, COEF({00}, {01}, {00}, {10})) is the only possible way of including the previous P/Q content into the calculation. But I still think, that this p'/q' hack should have a place on the ADMA level, not ASYNC_TX. It looks more generic if ASYNC_TX will assume that ADMA is capable of p'=p+src / q'=q+{}*src. Otherwise, we'll have an overhead for the DMAs which could work without this overhead. In your case, the IOP ADMA driver should handle the situation when it receives 4 sources to be P+Qed with the previous contents of destinations, for example, by generating the sequence of 4 descriptors to process such a request. Regards, Yuri -- Yuri Tikhonov, Senior Software Engineer Emcraft Systems, www.emcraft.com
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig index d8fb391..cb6d731 100644 --- a/crypto/async_tx/Kconfig +++ b/crypto/async_tx/Kconfig @@ -14,3 +14,7 @@ config ASYNC_MEMSET tristate select ASYNC_CORE +config ASYNC_PQ + tristate + select ASYNC_CORE + diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile index 27baa7d..1b99265 100644 --- a/crypto/async_tx/Makefile +++ b/crypto/async_tx/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o obj-$(CONFIG_ASYNC_XOR) += async_xor.o +obj-$(CONFIG_ASYNC_PQ) += async_pq.o diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c new file mode 100644 index 0000000..5871651 --- /dev/null +++ b/crypto/async_tx/async_pq.c @@ -0,0 +1,615 @@ +/* + * Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com> + * + * Developed for DENX Software Engineering GmbH + * + * Asynchronous GF-XOR calculations ASYNC_TX API. + * + * based on async_xor.c code written by: + * Dan Williams <dan.j.williams@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/dma-mapping.h> +#include <linux/raid/xor.h> +#include <linux/async_tx.h> + +#include "../drivers/md/raid6.h" + +/** + * The following static variables are used in cases of synchronous + * zero sum to save the values to check. Two pages used for zero sum and + * the third one is for dumb P destination when calling gen_syndrome() + */ +static spinlock_t spare_lock; +static struct page *spare_pages[3]; + +/** + * do_async_pq - asynchronously calculate P and/or Q + */ +static struct dma_async_tx_descriptor * +do_async_pq(struct dma_chan *chan, struct page **blocks, unsigned char *scfs, + unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_device *dma = chan->device; + dma_addr_t dma_dest[2], dma_src[src_cnt]; + struct dma_async_tx_descriptor *tx = NULL; + dma_async_tx_callback _cb_fn; + void *_cb_param; + unsigned char *scf = NULL; + int i, src_off = 0; + unsigned short pq_src_cnt; + enum async_tx_flags async_flags; + enum dma_ctrl_flags dma_flags = 0; + + /* If we won't handle src_cnt in one shot, then the following + * flag(s) will be set only on the first pass of prep_dma + */ + if (flags & ASYNC_TX_PQ_ZERO_P) + dma_flags |= DMA_PREP_ZERO_P; + if (flags & ASYNC_TX_PQ_ZERO_Q) + dma_flags |= DMA_PREP_ZERO_Q; + + /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */ + if (blocks[src_cnt]) { + dma_dest[0] = dma_map_page(dma->dev, blocks[src_cnt], + offset, len, DMA_BIDIRECTIONAL); + dma_flags |= DMA_PREP_HAVE_P; + } + if (blocks[src_cnt+1]) { + dma_dest[1] = dma_map_page(dma->dev, blocks[src_cnt+1], + offset, len, DMA_BIDIRECTIONAL); + dma_flags |= DMA_PREP_HAVE_Q; + } + + for (i = 0; i < src_cnt; i++) + dma_src[i] = dma_map_page(dma->dev, blocks[i], + offset, len, DMA_TO_DEVICE); + + while (src_cnt) { + async_flags = flags; + pq_src_cnt = 
min(src_cnt, (int)dma->max_pq); + /* if we are submitting additional pqs, leave the chain open, + * clear the callback parameters, and leave the destination + * buffers mapped + */ + if (src_cnt > pq_src_cnt) { + async_flags &= ~ASYNC_TX_ACK; + dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP; + _cb_fn = NULL; + _cb_param = NULL; + } else { + _cb_fn = cb_fn; + _cb_param = cb_param; + } + if (_cb_fn) + dma_flags |= DMA_PREP_INTERRUPT; + if (scfs) + scf = &scfs[src_off]; + + /* Since we have clobbered the src_list we are committed + * to doing this asynchronously. Drivers force forward + * progress in case they can not provide a descriptor + */ + tx = dma->device_prep_dma_pq(chan, dma_dest, + &dma_src[src_off], pq_src_cnt, + scf, len, dma_flags); + if (unlikely(!tx)) + async_tx_quiesce(&depend_tx); + + /* spin wait for the preceding transactions to complete */ + while (unlikely(!tx)) { + dma_async_issue_pending(chan); + tx = dma->device_prep_dma_pq(chan, dma_dest, + &dma_src[src_off], pq_src_cnt, + scf, len, dma_flags); + } + + async_tx_submit(chan, tx, async_flags, depend_tx, + _cb_fn, _cb_param); + + depend_tx = tx; + flags |= ASYNC_TX_DEP_ACK; + + if (src_cnt > pq_src_cnt) { + /* drop completed sources */ + src_cnt -= pq_src_cnt; + src_off += pq_src_cnt; + + /* use the intermediate result as a source; we + * clear DMA_PREP_ZERO, so prep_dma_pq will + * include destination(s) into calculations. 
Thus + * keep DMA_PREP_HAVE_x in dma_flags only + */ + dma_flags &= (DMA_PREP_HAVE_P | DMA_PREP_HAVE_Q); + } else + break; + } + + return tx; +} + +/** + * do_sync_pq - synchronously calculate P and Q + */ +static void +do_sync_pq(struct page **blocks, unsigned char *scfs, unsigned int offset, + int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + int i, pos; + uint8_t *p = NULL, *q = NULL, *src; + + /* set destination addresses */ + if (blocks[src_cnt]) + p = (uint8_t *)(page_address(blocks[src_cnt]) + offset); + if (blocks[src_cnt+1]) + q = (uint8_t *)(page_address(blocks[src_cnt+1]) + offset); + + if (flags & ASYNC_TX_PQ_ZERO_P) { + BUG_ON(!p); + memset(p, 0, len); + } + + if (flags & ASYNC_TX_PQ_ZERO_Q) { + BUG_ON(!q); + memset(q, 0, len); + } + + for (i = 0; i < src_cnt; i++) { + src = (uint8_t *)(page_address(blocks[i]) + offset); + for (pos = 0; pos < len; pos++) { + if (p) + p[pos] ^= src[pos]; + if (q) + q[pos] ^= raid6_gfmul[scfs[i]][src[pos]]; + } + } + async_tx_sync_epilog(cb_fn, cb_param); +} + +/** + * async_pq - attempt to do XOR and Galois calculations in parallel using + * a dma engine. + * @blocks: source block array from 0 to (src_cnt-1) with the p destination + * at blocks[src_cnt] and q at blocks[src_cnt + 1]. Only one of two + * destinations may be present (another then has to be set to NULL). + * By default, the result of calculations is XOR-ed with the initial + * content of the destination buffers. Use ASYNC_TX_PQ_ZERO_x flags + * to avoid this. 
+ * NOTE: client code must assume the contents of this array are destroyed + * @scfs: array of source coefficients used in GF-multiplication + * @offset: offset in pages to start transaction + * @src_cnt: number of source pages + * @len: length in bytes + * @flags: ASYNC_TX_PQ_ZERO_P, ASYNC_TX_PQ_ZERO_Q, ASYNC_TX_ASSUME_COHERENT, + * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY + * @depend_tx: depends on the result of this transaction. + * @cb_fn: function to call when the operation completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_pq(struct page **blocks, unsigned char *scfs, unsigned int offset, + int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ, + &blocks[src_cnt], 2, + blocks, src_cnt, len); + struct dma_device *device = chan ? chan->device : NULL; + struct dma_async_tx_descriptor *tx = NULL; + + if (!device && (flags & ASYNC_TX_ASYNC_ONLY)) + return NULL; + + if (device) { + /* run pq asynchronously */ + tx = do_async_pq(chan, blocks, scfs, offset, src_cnt, + len, flags, depend_tx, cb_fn,cb_param); + } else { + /* run pq synchronously */ + if (!blocks[src_cnt+1]) { + struct page *pdst = blocks[src_cnt]; + int i; + + /* Calculate P-parity only. + * As opposite to async_xor(), async_pq() assumes + * that destinations are included into calculations, + * so we should re-arrange the xor src list to + * achieve the similar behavior. + */ + if (!(flags & ASYNC_TX_PQ_ZERO_P)) { + /* If async_pq() user doesn't set ZERO flag, + * it's assumed that destination has some + * reasonable data to include in calculations. + * The destination must be at position 0, so + * shift the sources and put pdst at the + * beginning of the list. 
+ */ + for (i = src_cnt - 1; i >= 0; i--) + blocks[i+1] = blocks[i]; + blocks[0] = pdst; + src_cnt++; + flags |= ASYNC_TX_XOR_DROP_DST; + } else { + /* If async_pq() user want to clear P, then + * this will be done automatically in async + * case, and with the help of ZERO_DST in + * the sync one. + */ + flags &= ~ASYNC_TX_PQ_ZERO_P; + flags |= ASYNC_TX_XOR_ZERO_DST; + } + + return async_xor(pdst, blocks, offset, + src_cnt, len, flags, depend_tx, + cb_fn, cb_param); + } + + /* wait for any prerequisite operations */ + async_tx_quiesce(&depend_tx); + + do_sync_pq(blocks, scfs, offset, src_cnt, len, flags, + depend_tx, cb_fn, cb_param); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_pq); + +/** + * do_sync_gen_syndrome - synchronously calculate P (xor) and Q (Reed-Solomon + * code) + */ +static void +do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt, + size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + int i; + void *tsrc[src_cnt+2]; + + for (i = 0; i < src_cnt + 2; i++) + tsrc[i] = page_address(blocks[i]) + offset; + + raid6_call.gen_syndrome(i, len, tsrc); + + async_tx_sync_epilog(cb_fn, cb_param); +} + +/** + * async_gen_syndrome - attempt to generate P (xor) and Q (Reed-Solomon code) + * with a dma engine for a given set of blocks. This routine assumes a + * field of GF(2^8) with a primitive polynomial of 0x11d and a generator + * of {02}. + * @blocks: source block array ordered from 0..src_cnt-1 with the P destination + * at blocks[src_cnt] and Q at blocks[src_cnt + 1]. Only one of two + * destinations may be present (another then has to be set to NULL). 
+ * NOTE: client code must assume the contents of this array are destroyed + * @offset: offset in pages to start transaction + * @src_cnt: number of source pages: 2 < src_cnt <= 255 + * @len: length of blocks in bytes + * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY + * @depend_tx: P+Q operation depends on the result of this transaction. + * @cb_fn: function to call when P+Q generation completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt, + size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ, + &blocks[src_cnt], 2, + blocks, src_cnt, len); + struct dma_device *device = chan ? chan->device : NULL; + struct dma_async_tx_descriptor *tx = NULL; + + BUG_ON(src_cnt > 255 || (!blocks[src_cnt] && !blocks[src_cnt+1])); + + if (!device && (flags & ASYNC_TX_ASYNC_ONLY)) + return NULL; + + /* Synchronous gen_syndrome() doesn't take care of destinations, + * but asynchronous implies them as sources; so, when generating + * syndromes - command to clear destinations up explicitly + */ + if (blocks[src_cnt]) + flags |= ASYNC_TX_PQ_ZERO_P; + if (blocks[src_cnt+1]) + flags |= ASYNC_TX_PQ_ZERO_Q; + + if (device) { + /* run the xor asynchronously */ + tx = do_async_pq(chan, blocks, (uint8_t *)raid6_gfexp, + offset, src_cnt, len, flags, depend_tx, + cb_fn, cb_param); + } else { + /* run the pq synchronously */ + /* wait for any prerequisite operations */ + async_tx_quiesce(&depend_tx); + + if (!blocks[src_cnt]) + blocks[src_cnt] = spare_pages[2]; + if (!blocks[src_cnt+1]) + blocks[src_cnt+1] = spare_pages[2]; + do_sync_gen_syndrome(blocks, offset, src_cnt, len, flags, + depend_tx, cb_fn, cb_param); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_gen_syndrome); + +/** + * async_pq_zero_sum - attempt a 
PQ parities check with a dma engine. + * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the + * src_cnt and src_cnt+1 are the P and Q destinations to check, resp. + * Only one of two destinations may be present. + * NOTE: client code must assume the contents of this array are destroyed + * @scfs: coefficients to use in GF-multiplications + * @offset: offset in pages to start transaction + * @src_cnt: number of source pages + * @len: length in bytes + * @pqres: where to flag the result of the check: the result of P-check + * is stored at bit0, the result of Q-check is stored at bit1. If the + * bit is cleared, then the corresponding parity is OK, otherwise bad. + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: depends on the result of this transaction. + * @cb_fn: function to call when the xor completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_pq_zero_sum(struct page **blocks, unsigned char *scfs, + unsigned int offset, int src_cnt, size_t len, u32 *pqres, + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, + DMA_PQ_ZERO_SUM, + &blocks[src_cnt], 2, + blocks, src_cnt, len); + struct dma_device *device = chan ? chan->device : NULL; + struct dma_async_tx_descriptor *tx = NULL; + enum dma_ctrl_flags dma_flags = cb_fn ? 
DMA_PREP_INTERRUPT : 0; + + BUG_ON(src_cnt < 2); + + if (blocks[src_cnt]) + dma_flags |= DMA_PREP_HAVE_P; + if (blocks[src_cnt+1]) + dma_flags |= DMA_PREP_HAVE_Q; + + if (device && src_cnt <= (int)device->max_pq) { + dma_addr_t dma_src[src_cnt + 2]; + int i; + + for (i = 0; i < src_cnt + 2; i++) { + if (likely(blocks[i])) { + dma_src[i] = dma_map_page(device->dev, + blocks[i], offset, + len, DMA_TO_DEVICE); + } + } + + tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt, + scfs, len, pqres, + dma_flags); + + if (unlikely(!tx)) { + async_tx_quiesce(&depend_tx); + + while (unlikely(!tx)) { + dma_async_issue_pending(chan); + tx = device->device_prep_dma_pqzero_sum(chan, + dma_src, src_cnt, scfs, len, + pqres, dma_flags); + } + } + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + } else { + struct page *pdest = blocks[src_cnt]; + struct page *qdest = blocks[src_cnt + 1]; + enum async_tx_flags lflags = flags; + + lflags &= ~ASYNC_TX_ACK; + lflags |= ASYNC_TX_PQ_ZERO_P | ASYNC_TX_PQ_ZERO_Q; + + spin_lock(&spare_lock); + blocks[src_cnt] = spare_pages[0]; + blocks[src_cnt + 1] = spare_pages[1]; + tx = async_pq(blocks, scfs, offset, src_cnt, len, lflags, + depend_tx, NULL, NULL); + async_tx_quiesce(&tx); + + if (dma_flags & DMA_PREP_HAVE_P) { + if (memcmp(page_address(pdest) + offset, + page_address(spare_pages[0]) + offset, + len) == 0) + *pqres &= ~DMA_PCHECK_FAILED; + else + *pqres |= DMA_PCHECK_FAILED; + } + if (dma_flags & DMA_PREP_HAVE_Q) { + if (memcmp(page_address(qdest) + offset, + page_address(spare_pages[1]) + offset, + len) == 0) + *pqres &= ~DMA_QCHECK_FAILED; + else + *pqres |= DMA_QCHECK_FAILED; + } + spin_unlock(&spare_lock); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_pq_zero_sum); + +/** + * async_syndrome_zero_sum - attempt a P (xor) and Q (Reed-Solomon code) + * parities check with a dma engine. This routine assumes a field of + * GF(2^8) with a primitive polynomial of 0x11d and a generator of {02}. 
+ * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the + * src_cnt and src_cnt+1 are the P and Q destinations to check, resp. + * Only one of two destinations may be present. + * NOTE: client code must assume the contents of this array are destroyed + * @offset: offset in pages to start transaction + * @src_cnt: number of source pages + * @len: length in bytes + * @pqres: the pointer, where to flag about the result of the check: the + * result of P-check is stored at bit0, the result of Q-check is stored + * at bit1. If the bit is cleared, then the corresponding parity is OK. + * If the bit is set, then the corresponding parity is bad. + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: depends on the result of this transaction. + * @cb_fn: function to call when the xor completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_syndrome_zero_sum(struct page **blocks, unsigned int offset, + int src_cnt, size_t len, u32 *pqres, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, + DMA_PQ_ZERO_SUM, + &blocks[src_cnt], 2, + blocks, src_cnt, len); + struct dma_device *device = chan ? chan->device : NULL; + struct dma_async_tx_descriptor *tx = NULL; + enum dma_ctrl_flags dma_flags = cb_fn ? 
DMA_PREP_INTERRUPT : 0; + + BUG_ON(src_cnt < 2); + + if (blocks[src_cnt]) + dma_flags |= DMA_PREP_HAVE_P; + if (blocks[src_cnt+1]) + dma_flags |= DMA_PREP_HAVE_Q; + + if (device && src_cnt <= (int)device->max_pq) { + dma_addr_t dma_src[src_cnt + 2]; + int i; + + + for (i = 0; i < src_cnt + 2; i++) { + if (likely(blocks[i])) { + dma_src[i] = dma_map_page(device->dev, + blocks[i], offset, + len, DMA_TO_DEVICE); + } + } + + tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt, + (uint8_t *)raid6_gfexp, + len, pqres, dma_flags); + + if (unlikely(!tx)) { + async_tx_quiesce(&depend_tx); + while (unlikely(!tx)) { + dma_async_issue_pending(chan); + tx = device->device_prep_dma_pqzero_sum(chan, + dma_src, src_cnt, + (uint8_t *)raid6_gfexp, len, + pqres, dma_flags); + } + } + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + } else { + struct page *pdest = blocks[src_cnt]; + struct page *qdest = blocks[src_cnt + 1]; + enum async_tx_flags lflags = flags; + + lflags &= ~ASYNC_TX_ACK; + + spin_lock(&spare_lock); + blocks[src_cnt] = spare_pages[0]; + blocks[src_cnt + 1] = spare_pages[1]; + tx = async_gen_syndrome(blocks, offset, + src_cnt, len, lflags, + depend_tx, NULL, NULL); + async_tx_quiesce(&tx); + + if (dma_flags & DMA_PREP_HAVE_P) { + if (memcmp(page_address(pdest) + offset, + page_address(spare_pages[0]) + offset, + len) == 0) + *pqres &= ~DMA_PCHECK_FAILED; + else + *pqres |= DMA_PCHECK_FAILED; + } + if (dma_flags & DMA_PREP_HAVE_Q) { + if (memcmp(page_address(qdest) + offset, + page_address(spare_pages[1]) + offset, + len) == 0) + *pqres &= ~DMA_QCHECK_FAILED; + else + *pqres |= DMA_QCHECK_FAILED; + } + spin_unlock(&spare_lock); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_syndrome_zero_sum); + +static int __init async_pq_init(void) +{ + spin_lock_init(&spare_lock); + + spare_pages[0] = alloc_page(GFP_KERNEL); + if (!spare_pages[0]) + goto abort; + spare_pages[1] = alloc_page(GFP_KERNEL); + if (!spare_pages[1]) + goto abort; + 
spare_pages[2] = alloc_page(GFP_KERNEL); + if (!spare_pages[2]) + goto abort; + return 0; +abort: + safe_put_page(spare_pages[2]); + safe_put_page(spare_pages[1]); + safe_put_page(spare_pages[0]); + printk(KERN_ERR "%s: cannot allocate spare!\n", __func__); + return -ENOMEM; +} + +static void __exit async_pq_exit(void) +{ + safe_put_page(spare_pages[2]); + safe_put_page(spare_pages[1]); + safe_put_page(spare_pages[0]); +} + +module_init(async_pq_init); +module_exit(async_pq_exit); + +MODULE_AUTHOR("Yuri Tikhonov <yur@emcraft.com>"); +MODULE_DESCRIPTION("asynchronous pq/pq-zero-sum api"); +MODULE_LICENSE("GPL"); diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index e1f1f28..c2bc0ea 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -68,7 +68,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, while (src_cnt) { async_flags = flags; dma_flags = 0; - xor_src_cnt = min(src_cnt, dma->max_xor); + xor_src_cnt = min(src_cnt, (int)dma->max_xor); /* if we are submitting additional xors, leave the chain open, * clear the callback parameters, and leave the destination * buffer mapped diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 45f6297..2f92d87 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -47,15 +47,26 @@ struct dma_chan_ref { * address is an implied source, whereas the asynchronous case it must be listed * as a source. The destination address must be the first address in the source * array. + * @ASYNC_TX_PQ_ZERO_P: this flag must be used for async_pq operations since the + * destination there is always the source (the result of P after async_pq is + * xor-ed with the previous content of P block if this flag isn't set). 
+ * @ASYNC_TX_PQ_ZERO_Q: this flag must be used for async_pq operations since the + * destination there is always the source (the result of Q after async_pq is + * xor-ed with the previous content of Q block if this flag isn't set). * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a * dependency chain * @ASYNC_TX_DEP_ACK: ack the dependency descriptor. Useful for chaining. + * @ASYNC_TX_ASYNC_ONLY: if set then try to perform operation requested only in + * the asynchronous mode. Useful for R6 recovery. */ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), - ASYNC_TX_ACK = (1 << 3), - ASYNC_TX_DEP_ACK = (1 << 4), + ASYNC_TX_PQ_ZERO_P = (1 << 2), + ASYNC_TX_PQ_ZERO_Q = (1 << 3), + ASYNC_TX_ACK = (1 << 4), + ASYNC_TX_DEP_ACK = (1 << 5), + ASYNC_TX_ASYNC_ONLY = (1 << 6), }; #ifdef CONFIG_DMA_ENGINE @@ -131,5 +142,36 @@ async_trigger_callback(enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, dma_async_tx_callback cb_fn, void *cb_fn_param); +struct dma_async_tx_descriptor * +async_pq(struct page **blocks, unsigned char *scoef_list, + unsigned int offset, int src_cnt, size_t len, + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback callback, void *callback_param); + +struct dma_async_tx_descriptor * +async_gen_syndrome(struct page **blocks, unsigned int offset, + int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback callback, void *callback_param); + +struct dma_async_tx_descriptor * +async_pq_zero_sum(struct page **blocks, unsigned char *scoef_list, + unsigned int offset, int src_cnt, size_t len, + u32 *pqres, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback callback, void *callback_param); + +struct dma_async_tx_descriptor * +async_syndrome_zero_sum(struct page **blocks, unsigned int offset, + int src_cnt, size_t len, u32 *pqres, + enum 
async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback callback, void *callback_param); + void async_tx_quiesce(struct dma_async_tx_descriptor **tx); #endif /* _ASYNC_TX_H_ */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 64dea2a..4a72082 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -55,7 +55,7 @@ enum dma_status { enum dma_transaction_type { DMA_MEMCPY, DMA_XOR, - DMA_PQ_XOR, + DMA_PQ, DMA_DUAL_XOR, DMA_PQ_UPDATE, DMA_ZERO_SUM, @@ -81,14 +81,28 @@ enum dma_transaction_type { * dependency chains * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) + * @DMA_PREP_HAVE_P - set if the destination list includes the correct + * address of P (P-parity should be handled) + * @DMA_PREP_HAVE_Q - set if the destination list includes the correct + * address of Q (Q-parity should be handled) + * @DMA_PREP_ZERO_P - set if P has to be zeroed before proceeding + * @DMA_PREP_ZERO_Q - set if Q has to be zeroed before proceeding */ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), DMA_CTRL_ACK = (1 << 1), DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2), DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), + + DMA_PREP_HAVE_P = (1 << 4), + DMA_PREP_HAVE_Q = (1 << 5), + DMA_PREP_ZERO_P = (1 << 6), + DMA_PREP_ZERO_Q = (1 << 7), }; +#define DMA_PCHECK_FAILED (1 << 0) +#define DMA_QCHECK_FAILED (1 << 1) + /** * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t. 
* See linux/cpumask.h @@ -211,6 +225,7 @@ struct dma_async_tx_descriptor { * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags * @max_xor: maximum number of xor sources, 0 if no capability + * @max_pq: maximum number of PQ sources, 0 if no capability * @refcount: reference count * @done: IO completion struct * @dev_id: unique device ID @@ -220,7 +235,9 @@ struct dma_async_tx_descriptor { * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation + * @device_prep_dma_pq: prepares a pq operation * @device_prep_dma_zero_sum: prepares a zero_sum operation + * @device_prep_dma_pqzero_sum: prepares a pqzero_sum operation * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation @@ -233,7 +250,8 @@ struct dma_device { struct list_head channels; struct list_head global_node; dma_cap_mask_t cap_mask; - int max_xor; + unsigned short max_xor; + unsigned short max_pq; int dev_id; struct device *dev; @@ -247,9 +265,17 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_pq)( + struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, + unsigned int src_cnt, unsigned char *scf, + size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, size_t len, u32 *result, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)( + struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, + unsigned char *scf, size_t len, u32 *pqres, + unsigned long flags); struct dma_async_tx_descriptor 
*(*device_prep_dma_memset)( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags);