Patchwork RAID/DMA/caamxor: support XOR offload by CAAM

login
register
mail settings
Submitter b29237@freescale.com
Date Aug. 22, 2012, 7:05 a.m.
Message ID <1345619150-21268-1-git-send-email-b29237@freescale.com>
Download mbox | patch
Permalink /patch/179253/
State Not Applicable
Delegated to: Kumar Gala
Headers show

Comments

b29237@freescale.com - Aug. 22, 2012, 7:05 a.m.
From: Xuelin Shi <b29237@freescale.com>

Add XOR offloading functionality by CAAM and interface with async_tx layer
so that it can be used for RAID parity computation.

Signed-off-by: Naveen Burmi <naveenburmi@freescale.com>
Signed-off-by: Yuan Kang <Yuan.Kang@freescale.com>
Signed-off-by: Xuelin Shi <b29237@freescale.com>
---
 drivers/crypto/caam/Kconfig       |   15 +
 drivers/crypto/caam/Makefile      |    1 +
 drivers/crypto/caam/caamxor.c     |  880 +++++++++++++++++++++++++++++++++++++
 drivers/crypto/caam/desc_constr.h |   53 +++-
 drivers/crypto/caam/intern.h      |    7 +
 drivers/crypto/caam/jr.c          |    8 +-
 6 files changed, 959 insertions(+), 5 deletions(-)
 create mode 100644 drivers/crypto/caam/caamxor.c

Patch

diff --git a/drivers/crypto/caam/Kconfig b/drivers/crypto/caam/Kconfig
index 65c7668..643ca0a 100644
--- a/drivers/crypto/caam/Kconfig
+++ b/drivers/crypto/caam/Kconfig
@@ -98,3 +98,18 @@  config CRYPTO_DEV_FSL_CAAM_RNG_API
 
 	  To compile this as a module, choose M here: the module
 	  will be called caamrng.
+
+config CRYPTO_DEV_FSL_CAAM_DMAXOR_API
+	tristate "Freescale CAAM XOR support"
+	depends on CRYPTO_DEV_FSL_CAAM && EXPERIMENTAL
+	default n
+	select DMA_ENGINE
+	select ASYNC_XOR
+	help
+	  Selecting this will offload the xor-parity-calculation for
+	  users of the Asynchronous Transfers/Transforms API (such as
+	  md-raid5 driver) to the SEC4.
+
+
+	  To compile this as a module, choose M here: the module
+	  will be called caamxor.
diff --git a/drivers/crypto/caam/Makefile b/drivers/crypto/caam/Makefile
index b1eb448..457192c 100644
--- a/drivers/crypto/caam/Makefile
+++ b/drivers/crypto/caam/Makefile
@@ -6,5 +6,6 @@  obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM) += caam.o
 obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_CRYPTO_API) += caamalg.o
 obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_AHASH_API) += caamhash.o
 obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_RNG_API) += caamrng.o
+obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_DMAXOR_API) += caamxor.o
 
 caam-objs := ctrl.o jr.o error.o key_gen.o
diff --git a/drivers/crypto/caam/caamxor.c b/drivers/crypto/caam/caamxor.c
new file mode 100644
index 0000000..f060cff
--- /dev/null
+++ b/drivers/crypto/caam/caamxor.c
@@ -0,0 +1,880 @@ 
+/*
+ * caam - Freescale Integrated Security Engine (SEC) device driver
+ * Support for off-loading XOR Parity Calculations to CAAM.
+ *
+ * Copyright 2011 Freescale Semiconductor, Inc
+ *
+ * relationship between job descriptors, shared descriptors and sources:
+ * ------------------------------           -------------------
+ * | ShareDesc                  |<------\   | JobDesc         |
+ * |   Load src pointers to ctx |        \--| ShareDesc ptr   |
+ * | new src jump dst:          |<-----\    | SEQ_OUT_PTR     |
+ * |   Load ith src             |      |    | (output buffer) |
+ * | new src mv dst:            |      |    | (output length) |
+ * |   (ith src commands)       |      |    | SEQ_IN_PTR      |
+ * | load:                      |<---\ |    | (src commands)  |----\
+ * |   Seq load chunk           |    | |    -------------------    |
+ * | return:                    |<---|-|-\                         |
+ * |   XOR quarter chunk        |    | | |                         |
+ * |   Pass complete?           |----^-^---\                       |
+ * |   Half chunk left?         |----^-+ | |                       |
+ * |   Default                  |----^-^-+ |                       |
+ * | store:                     |<---|-|-|-/                       |
+ * |   Seq store chunk          |    | | |    -------------------  |
+ * |   No data left to write?   |X   | | |    | first src ptr   |<-/
+ * |   Put src1 chunk in result |    | | |    | first src len   |
+ * |   Default                  |----^-+ |  /-| shared hdr jump |
+ * | first:                     |<---|-|-|-/  | nop (if needed) |
+ * |   No data left to read?    |----^-^-+    -------------------
+ * |   Seq load chunk           |    | | |    | ith src ptr     |
+ * |   Load src2                |    | | |    | ith src len     |
+ * |   Not first pass?          |----^-^-/    | load src i + 1  |
+ * | first pass:                |    | |      | nop (if needed) |
+ * |   Put src1 chunk in result |    | |      -------------------
+ * |   set output size          |    | |      | last src ptr    |
+ * |   Default                  |----^-/      | last src len    |
+ * | last:                      |<---|--------| shared hdr jump |
+ * |   Update index             |    |        | nop (if needed) |
+ * |   Load src1                |    |        -------------------
+ * |   Default                  |----/
+ * ------------------------------
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dmaengine.h>
+
+#include "compat.h"
+#include "regs.h"
+#include "jr.h"
+#include "error.h"
+#include "intern.h"
+#include "desc.h"
+#include "desc_constr.h"
+
+#define MAX_INITIAL_DESCS	64
+#define MAX_XOR_SRCS		8
+
+#define JOB_DESC_BYTES		(4 * CAAM_CMD_SZ + 3 * CAAM_PTR_SZ)
+#define JOB_DESC_LEN		(JOB_DESC_BYTES / CAAM_CMD_SZ)
+#define CMD_DESC_LEN		32
+
+#define LONG_PTR		(CAAM_PTR_SZ > CAAM_CMD_SZ)
+
+#define CTX1_SLOTS		4
+#define SRC_CMD_BYTES		(4 * CAAM_CMD_SZ)
+#define SRC_CMD_LEN		(SRC_CMD_BYTES / CAAM_CMD_SZ)
+#define CHUNK_SIZE		128
+#define CHUNK_SIZE_H		64
+#define CHUNK_SIZE_Q		32
+#define REG_SIZE		8
+
+#define CMD_MOVE_OVERFLOW_LEN	1
+
+#define LABEL_SRC_JMP_BYTES	(5 * CAAM_CMD_SZ)
+#define LABEL_SRC_JMP		(LABEL_SRC_JMP_BYTES / CAAM_CMD_SZ)
+#define LABEL_SRC_MV_BYTES	(CAAM_CMD_SZ + LABEL_SRC_JMP_BYTES)
+#define LABEL_SRC_MV		(LABEL_SRC_MV_BYTES / CAAM_CMD_SZ)
+#define LABEL_FIRST_BYTES	(28 * CAAM_CMD_SZ + LABEL_SRC_MV_BYTES)
+#define LABEL_FIRST		(LABEL_FIRST_BYTES / CAAM_CMD_SZ)
+#define LABEL_LAST_BYTES	(13 * CAAM_CMD_SZ + LABEL_FIRST_BYTES)
+#define LABEL_LAST		(LABEL_LAST_BYTES / CAAM_CMD_SZ)
+#define SH_DESC_BYTES		(5 * CAAM_CMD_SZ + LABEL_LAST_BYTES)
+#define SH_DESC_LEN		(SH_DESC_BYTES / CAAM_CMD_SZ)
+
+#ifdef DEBUG
+/* for print_hex_dumps with line references */
+#define xstr(s) str(s)
+#define str(s) (#s)
+#define debug(format, arg...) printk(format, arg)
+#else
+#define debug(format, arg...)
+#endif
+
+struct caam_xor_sh_desc {
+	u32 desc[SH_DESC_LEN + CMD_MOVE_OVERFLOW_LEN];
+	dma_addr_t sh_desc_phys;
+};
+
+struct caam_dma_async_tx_desc {
+	struct dma_async_tx_descriptor async_tx;
+	struct list_head node;
+	struct caam_dma_jr *dma_jr;
+	u32 job_desc[JOB_DESC_LEN];
+	u32 cmd_desc[CMD_DESC_LEN];
+	dma_addr_t cmd_desc_phys;
+	dma_addr_t dest;
+	dma_addr_t src[MAX_XOR_SRCS];
+	u32 src_cnt;
+	u32 dma_len;
+};
+
+struct caam_dma_desc_pool {
+	int desc_cnt;
+	struct list_head head;
+};
+
+/*
+ * caam_dma_jr - job ring/channel data
+ * @completed_cookie: cookie of latest latest, completed job
+ * @chan: dma channel used by async_tx API
+ * @desc_lock: lock on job descriptor
+ * @submit_q: queue of pending (submitted, but not enqueued) jobs
+ * @done_lock: lock on done_not_acked
+ * @done_not_acked: jobs that have been completed by jr, but maybe not acked
+ * @handle_done: tasklet for cleaning done_not_acked
+ * @caam_hw_jr: jr device data
+ * @pool_lock: lock on soft_desc
+ * @soft_desc: pool of pre-allocated caam_dma_async_tx_desc structures
+ */
+struct caam_dma_jr {
+	dma_cookie_t completed_cookie;
+	struct dma_chan chan;
+	struct device *dev;
+	spinlock_t desc_lock;
+	struct list_head submit_q;
+	spinlock_t done_lock;
+	struct list_head done_not_acked;
+	struct tasklet_struct handle_done;
+	struct caam_drv_private_jr *caam_hw_jr;
+	spinlock_t pool_lock;
+	struct caam_dma_desc_pool *soft_desc;
+};
+
+static inline u32 load_source(u32 ctx, u32 offset, u32 target)
+{
+	return ctx | MOVE_DEST_DESCBUF | SRC_CMD_BYTES |
+	       (target << (2 + MOVE_OFFSET_SHIFT)) |
+	       (offset << MOVE_AUX_SHIFT);
+}
+
+static inline u32 *write_load_source(u32 *desc, u32 ctx, u32 offset, u32 target)
+{
+	return write_move(desc, load_source(ctx, offset, target));
+}
+
+/* generate source commands and job descriptor for each request */
+static void prepare_caam_xor_desc(struct device *dev,
+				  struct caam_dma_async_tx_desc *desc,
+				  dma_addr_t sh_desc_phys,
+				  dma_addr_t dest, dma_addr_t *src,
+				  u32 src_cnt, size_t len)
+{
+	u32 label_src_mv = LABEL_SRC_MV + CMD_MOVE_OVERFLOW_LEN;
+	u32 label_first = LABEL_FIRST + CMD_MOVE_OVERFLOW_LEN;
+	u32 label_last = LABEL_LAST + CMD_MOVE_OVERFLOW_LEN;
+	u32 sh_desc_len = SH_DESC_LEN + CMD_MOVE_OVERFLOW_LEN;
+	int i;
+	u32 *job_descptr = desc->job_desc;
+	u32 *cmd_desc = desc->cmd_desc;
+
+	desc->dest = dest;
+	memcpy(desc->src, src, src_cnt*sizeof(dma_addr_t));
+	desc->src_cnt = src_cnt;
+	desc->dma_len = len;
+
+	/* first source: jump to special commands */
+	cmd_desc = write_ptr(cmd_desc, src[0]);
+	cmd_desc = write_cmd(cmd_desc, len);
+	init_sh_desc(cmd_desc, (label_first & HDR_START_IDX_MASK) <<
+		      HDR_START_IDX_SHIFT);
+	cmd_desc++;
+	if (!LONG_PTR)
+		cmd_desc = write_nop(cmd_desc, 1);
+
+	i = 1;
+	/* sources that load next source from first context */
+	while (i < src_cnt - 1 && i < CTX1_SLOTS - 1) {
+		cmd_desc = write_ptr(cmd_desc, src[i]);
+		cmd_desc = write_cmd(cmd_desc, len);
+		cmd_desc = write_load_source(cmd_desc, MOVE_SRC_CLASS1CTX, i +
+					     1, label_src_mv);
+		if (!LONG_PTR)
+			cmd_desc = write_nop(cmd_desc, 1);
+		i++;
+	}
+	/* sources that load next source from second context */
+	while (i < src_cnt - 1) {
+		cmd_desc = write_ptr(cmd_desc, src[i]);
+		cmd_desc = write_cmd(cmd_desc, len);
+		cmd_desc = write_load_source(cmd_desc, MOVE_SRC_CLASS2CTX, i +
+					     1, label_src_mv);
+		if (!LONG_PTR)
+			cmd_desc = write_nop(cmd_desc, 1);
+		i++;
+	}
+
+	/* last source: jump to special commands */
+	cmd_desc = write_ptr(cmd_desc, src[i]);
+	cmd_desc = write_cmd(cmd_desc, len);
+	init_sh_desc(cmd_desc, (label_last & HDR_START_IDX_MASK) <<
+		      HDR_START_IDX_SHIFT);
+	cmd_desc++;
+	if (!LONG_PTR)
+		cmd_desc = write_nop(cmd_desc, 1);
+
+	desc->cmd_desc_phys = dma_map_single(dev, desc->cmd_desc,
+					     CMD_DESC_LEN * sizeof(u32),
+					     DMA_TO_DEVICE);
+	init_job_desc_shared(job_descptr, sh_desc_phys, sh_desc_len,
+			     HDR_SHARE_WAIT | HDR_REVERSE);
+
+	append_seq_out_ptr(job_descptr, dest, len, 0);
+	append_seq_in_ptr_intlen(job_descptr, desc->cmd_desc_phys,
+					MAX_XOR_SRCS * SRC_CMD_BYTES, 0);
+
+#ifdef DEBUG
+	print_hex_dump(KERN_ERR, "job desc @"xstr(__LINE__)": ",
+		       DUMP_PREFIX_ADDRESS, 16, 4, job_descptr, CAAM_CMD_SZ *
+		       desc_len(job_descptr), 1);
+	print_hex_dump(KERN_ERR, "srcs @"xstr(__LINE__)": ",
+		       DUMP_PREFIX_ADDRESS, 16, 4, src, src_cnt * CAAM_PTR_SZ,
+		       1);
+	print_hex_dump(KERN_ERR, "src cmd@"xstr(__LINE__)": ",
+		       DUMP_PREFIX_ADDRESS, 16, 4, desc->cmd_desc,
+		       SRC_CMD_BYTES * src_cnt, 1);
+#endif
+}
+
+/* generate shared descriptor for each device */
+static void prepare_caam_xor_sh_desc(u32 *descptr, u32 src_cnt)
+{
+	bool overflow;
+	u32 label_src_jmp, label_src_mv;
+	u32 *store_jump_cmd;
+	u32 label_load, label_return, label_store;
+
+	overflow = src_cnt > CTX1_SLOTS;
+	label_src_jmp = LABEL_SRC_JMP + CMD_MOVE_OVERFLOW_LEN;
+	label_src_mv = label_src_jmp + 1;
+	init_sh_desc(descptr, HDR_SHARE_SERIAL);
+	/* Store up to 4 sources in ctx1 */
+	append_cmd(descptr, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+		   LDST_CLASS_1_CCB | (overflow ?
+		   (CTX1_SLOTS * SRC_CMD_BYTES) : (src_cnt * SRC_CMD_BYTES)));
+
+	/* Store any overflow in ctx2 */
+	if (overflow)
+		append_cmd(descptr, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+			   LDST_CLASS_2_CCB | (src_cnt - 4) * 16);
+	else
+		append_cmd(descptr, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+			   LDST_CLASS_2_CCB | 4 * 16);
+
+	append_cmd(descptr, CMD_LOAD | DISABLE_AUTO_INFO_FIFO);
+
+	/* Load first source */
+	append_move(descptr, load_source(MOVE_SRC_CLASS1CTX, 0, label_src_mv) |
+		    MOVE_WAITCOMP);
+
+	/* Refresh shared descriptor */
+	append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+		   ((label_src_jmp & HDR_START_IDX_MASK) <<
+		    HDR_START_IDX_SHIFT));
+
+	/* Load source and run loaded commands */
+	append_cmd(descptr, CMD_SEQ_IN_PTR | SQIN_EXT);
+	append_len(descptr, SRC_CMD_LEN);
+
+	/* Skip read data */
+	append_seq_fifo_load(descptr, 0, KEY_VLF | FIFOLD_CLASS_SKIP);
+
+	/* Load chunk to ififo */
+	label_load = desc_len(descptr);
+	append_seq_fifo_load(descptr, CHUNK_SIZE, FIFOLD_TYPE_PK |
+			     LDST_CLASS_1_CCB);
+
+	/* Update added number of bytes in ififo */
+	append_math_add_imm_u32(descptr, VARSEQOUTLEN, VARSEQOUTLEN, IMM,
+				CHUNK_SIZE);
+
+	/* Load chunk from ififo to math registers via DECO alignment block*/
+	append_load_imm_u32(descptr, NFIFOENTRY_LC1 | NFIFOENTRY_DTYPE_MSG |
+			    CHUNK_SIZE, LDST_SRCDST_WORD_INFO_FIFO);
+	label_return = desc_len(descptr);
+	append_move(descptr, MOVE_WAITCOMP | MOVE_SRC_INFIFO |
+		    MOVE_DEST_MATH0 | CHUNK_SIZE_Q);
+
+	/* XOR math registers with ofifo */
+	append_math_xor(descptr, REG0, REG0, OUTFIFO, REG_SIZE);
+	append_math_xor(descptr, REG1, REG1, OUTFIFO, REG_SIZE);
+	append_math_xor(descptr, REG2, REG2, OUTFIFO, REG_SIZE);
+	append_math_xor(descptr, REG3, REG3, OUTFIFO, REG_SIZE);
+
+	/* Move result to ofifo */
+	append_move(descptr, MOVE_SRC_MATH0 | MOVE_WAITCOMP |
+		    MOVE_DEST_OUTFIFO | CHUNK_SIZE_Q);
+
+	/* Update reduced number of bytes in ififo */
+	append_math_sub_imm_u32(descptr, VARSEQOUTLEN, VARSEQOUTLEN, IMM,
+				CHUNK_SIZE_Q);
+
+	/* If ififo has no more data, store chunk */
+	store_jump_cmd = append_jump(descptr, JUMP_TEST_ALL |
+				     JUMP_COND_MATH_Z);
+
+	/* If half of chunk left, use next source */
+	append_math_sub_imm_u32(descptr, NONE, VARSEQOUTLEN, IMM,
+				CHUNK_SIZE_H);
+	append_jump_to(descptr, JUMP_TEST_ALL | JUMP_COND_MATH_Z,
+		       label_src_jmp);
+
+	/* Else, keep XORing */
+	append_jump_to(descptr, 0, label_return);
+
+	/* Store */
+	label_store = desc_len(descptr);
+	set_jump_tgt_here(descptr, store_jump_cmd);
+
+	/* Store chunk to seqout */
+	append_seq_fifo_store(descptr, CHUNK_SIZE, FIFOST_TYPE_MESSAGE_DATA);
+
+	/* Halt if no more data */
+	append_math_sub(descptr, NONE, SEQOUTLEN, ONE, CAAM_CMD_SZ);
+	append_jump(descptr, JUMP_TYPE_HALT_USER | JUMP_TEST_ALL |
+			JUMP_COND_MATH_N);
+
+	/* Load first source's next chunk to ofifo */
+	append_move(descptr, MOVE_SRC_INFIFO | MOVE_DEST_OUTFIFO |
+			MOVE_WAITCOMP | CHUNK_SIZE);
+
+	/* Goto source */
+	append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+		   ((label_src_jmp & HDR_START_IDX_MASK) <<
+		     HDR_START_IDX_SHIFT));
+
+	/* First source, skip read data */
+	append_seq_fifo_load(descptr, 0, KEY_VLF | FIFOLD_CLASS_SKIP);
+
+	/* If no more data to read, go XOR read data */
+	append_math_sub(descptr, NONE, SEQINLEN, ONE, CAAM_CMD_SZ);
+	append_jump_to(descptr, JUMP_TEST_ALL | JUMP_COND_MATH_N,
+		       label_return);
+
+	/* Otherwise, load chunk from first source to DECO alignment block */
+	append_seq_fifo_load(descptr, CHUNK_SIZE, FIFOLD_TYPE_PK |
+			     LDST_CLASS_1_CCB);
+	append_load_imm_u32(descptr, NFIFOENTRY_LC1 | NFIFOENTRY_DTYPE_MSG |
+			    CHUNK_SIZE, LDST_SRCDST_WORD_INFO_FIFO);
+
+	/* Load second source */
+	append_move(descptr, load_source(MOVE_SRC_CLASS1CTX, 1, label_src_mv));
+
+	/* XOR previous pass if this is not first pass */
+	append_math_sub(descptr, NONE, VARSEQINLEN, ONE, CAAM_CMD_SZ);
+	append_jump_to(descptr, JUMP_TEST_INVALL | JUMP_COND_MATH_N,
+		       label_return);
+
+	/* Else, move chunk for DECO alignment block to ofifo */
+	append_move(descptr, MOVE_SRC_INFIFO | MOVE_DEST_OUTFIFO |
+				MOVE_WAITCOMP | CHUNK_SIZE);
+
+	/* and track number of bytes to write*/
+	append_math_add_imm_u32(descptr, SEQOUTLEN, SEQINLEN, IMM, CHUNK_SIZE);
+
+	/* Goto source */
+	append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+		   ((label_src_jmp & HDR_START_IDX_MASK) <<
+		     HDR_START_IDX_SHIFT));
+
+	/* Last source, skip read data */
+	append_seq_fifo_load(descptr, 0, KEY_VLF | FIFOLD_CLASS_SKIP);
+
+	/* Update number of bytes to skip */
+	append_math_add_imm_u32(descptr, VARSEQINLEN, VARSEQINLEN, IMM,
+				CHUNK_SIZE);
+
+	/* Load first source */
+	append_move(descptr, load_source(MOVE_SRC_CLASS1CTX, 0, label_src_mv));
+
+	/* Goto data loading */
+	append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+		   ((label_load & HDR_START_IDX_MASK) << HDR_START_IDX_SHIFT));
+
+#ifdef DEBUG
+	print_hex_dump(KERN_ERR, "shdesc @"xstr(__LINE__)": ",
+		       DUMP_PREFIX_ADDRESS, 16, 4, descptr, CAAM_CMD_SZ *
+		       desc_len(descptr), 1);
+#endif
+}
+
+static enum dma_status caam_jr_tx_status(struct dma_chan *chan,
+					 dma_cookie_t cookie,
+					 struct dma_tx_state *txstate)
+{
+	struct caam_dma_jr *jr = NULL;
+	dma_cookie_t last_used;
+	dma_cookie_t last_complete;
+
+	jr = container_of(chan, struct caam_dma_jr, chan);
+
+	last_used = chan->cookie;
+	last_complete = jr->completed_cookie;
+
+	dma_set_tx_state(txstate, last_complete, last_used, 0);
+
+	return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+static inline void try_clear_desc(struct caam_dma_jr *dma_jr)
+{
+	spin_lock_bh(&dma_jr->done_lock);
+	if (!list_empty(&dma_jr->done_not_acked)) {
+		spin_unlock_bh(&dma_jr->done_lock);
+		tasklet_schedule(&dma_jr->handle_done);
+	} else {
+		spin_unlock_bh(&dma_jr->done_lock);
+	}
+}
+
+/*
+ * tasklet function for checking requests that are completed,
+ * but may not have been acked --delete only if acked
+ */
+static void check_done(unsigned long data)
+{
+	struct caam_dma_jr *dma_jr = (struct caam_dma_jr *) data;
+	struct caam_dma_async_tx_desc *desc, *_desc;
+
+	spin_lock_bh(&dma_jr->done_lock);
+	list_for_each_entry_safe(desc, _desc, &dma_jr->done_not_acked, node) {
+		spin_unlock_bh(&dma_jr->done_lock);
+		if (async_tx_test_ack(&desc->async_tx)) {
+			spin_lock_bh(&dma_jr->done_lock);
+			list_del(&desc->node);
+			spin_unlock_bh(&dma_jr->done_lock);
+			spin_lock_bh(&dma_jr->pool_lock);
+			if (dma_jr->soft_desc->desc_cnt < MAX_INITIAL_DESCS) {
+				INIT_LIST_HEAD(&desc->node);
+				list_add(&desc->node, &dma_jr->soft_desc->head);
+				dma_jr->soft_desc->desc_cnt++;
+				spin_unlock_bh(&dma_jr->pool_lock);
+			} else {
+				spin_unlock_bh(&dma_jr->pool_lock);
+				kfree(desc);
+			}
+		}
+		spin_lock_bh(&dma_jr->done_lock);
+	}
+	spin_unlock_bh(&dma_jr->done_lock);
+}
+
+static void caam_dma_xor_done(struct device *dev, u32 *hwdesc, u32 status,
+			      void *auxarg)
+{
+	struct caam_dma_async_tx_desc *desc;
+	struct caam_dma_jr *dma_jr;
+	dma_async_tx_callback callback;
+	void *callback_param;
+	struct device *jrdev;
+	enum dma_ctrl_flags flags;
+
+	desc = (struct caam_dma_async_tx_desc *)auxarg;
+	dma_jr = desc->dma_jr;
+	jrdev = dma_jr->caam_hw_jr->parentdev;
+	flags = desc->async_tx.flags;
+
+	if (status) {
+		char tmp[256];
+		dev_err(dev, "%s\n", caam_jr_strstatus(tmp, status));
+	}
+
+	dma_run_dependencies(&desc->async_tx);
+
+	spin_lock_bh(&dma_jr->desc_lock);
+	if (dma_jr->completed_cookie < desc->async_tx.cookie) {
+		dma_jr->completed_cookie = desc->async_tx.cookie;
+		if (dma_jr->completed_cookie == DMA_MAX_COOKIE)
+			dma_jr->completed_cookie = DMA_MIN_COOKIE;
+	}
+	spin_unlock_bh(&dma_jr->desc_lock);
+
+	callback = desc->async_tx.callback;
+	callback_param = desc->async_tx.callback_param;
+
+	dma_unmap_single(jrdev, desc->cmd_desc_phys,
+			CMD_DESC_LEN * sizeof(u32), DMA_TO_DEVICE);
+
+	if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
+		dma_unmap_page(jrdev, desc->dest, desc->dma_len,
+						DMA_BIDIRECTIONAL);
+
+	if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
+		u32 i;
+		for (i = 0; i < desc->src_cnt; i++) {
+			if (desc->src[i] == desc->dest)
+				continue;
+			dma_unmap_page(jrdev, desc->src[i],
+					desc->dma_len, DMA_TO_DEVICE);
+		}
+	}
+
+	if (async_tx_test_ack(&desc->async_tx)) {
+		spin_lock_bh(&dma_jr->pool_lock);
+		if (dma_jr->soft_desc->desc_cnt < MAX_INITIAL_DESCS) {
+			list_add(&desc->node, &dma_jr->soft_desc->head);
+			dma_jr->soft_desc->desc_cnt++;
+			spin_unlock_bh(&dma_jr->pool_lock);
+		} else {
+			spin_unlock_bh(&dma_jr->pool_lock);
+			kfree(desc);
+		}
+	} else {
+		spin_lock_bh(&dma_jr->done_lock);
+		INIT_LIST_HEAD(&desc->node);
+		list_add_tail(&desc->node, &dma_jr->done_not_acked);
+		spin_unlock_bh(&dma_jr->done_lock);
+	}
+	try_clear_desc(dma_jr);
+
+	if (callback)
+		callback(callback_param);
+}
+
+static void caam_jr_issue_pending(struct dma_chan *chan)
+{
+	struct caam_dma_jr *dma_jr = NULL;
+	struct caam_dma_async_tx_desc *desc, *_desc;
+	struct device *dev;
+
+	dma_jr = container_of(chan, struct caam_dma_jr, chan);
+	dev = dma_jr->dev;
+
+	spin_lock_bh(&dma_jr->desc_lock);
+	list_for_each_entry_safe(desc, _desc, &dma_jr->submit_q, node) {
+		desc->dma_jr = dma_jr;
+		if (caam_jr_enqueue(dev, desc->job_desc,
+				    caam_dma_xor_done, desc) < 0) {
+			spin_unlock_bh(&dma_jr->desc_lock);
+			return;
+		}
+
+		list_del(&desc->node);
+	}
+
+	spin_unlock_bh(&dma_jr->desc_lock);
+}
+
+static dma_cookie_t caam_jr_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct caam_dma_async_tx_desc *desc = NULL;
+	struct caam_dma_jr *jr = NULL;
+	dma_cookie_t cookie;
+
+	desc = container_of(tx, struct caam_dma_async_tx_desc, async_tx);
+	jr = container_of(tx->chan, struct caam_dma_jr, chan);
+
+	spin_lock_bh(&jr->desc_lock);
+
+	cookie = jr->chan.cookie + 1;
+	if (cookie < DMA_MIN_COOKIE)
+		cookie = DMA_MIN_COOKIE;
+
+	desc->async_tx.cookie = cookie;
+	jr->chan.cookie = desc->async_tx.cookie;
+	list_add_tail(&desc->node, &jr->submit_q);
+
+	spin_unlock_bh(&jr->desc_lock);
+
+	return cookie;
+}
+
+static struct dma_async_tx_descriptor *
+caam_jr_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+		     unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	struct caam_dma_jr *jr = NULL;
+	struct caam_dma_async_tx_desc *desc = NULL;
+	struct caam_drv_private *priv;
+
+	jr = container_of(chan, struct caam_dma_jr, chan);
+
+	if (src_cnt > MAX_XOR_SRCS) {
+		dev_err(jr->dev, "%d srcs exceed max supported %d srcs\n",
+							src_cnt, MAX_XOR_SRCS);
+		return NULL;
+	}
+
+	spin_lock_bh(&jr->pool_lock);
+	if (jr->soft_desc->desc_cnt) {
+		desc = container_of(jr->soft_desc->head.next,
+				    struct caam_dma_async_tx_desc, node);
+		jr->soft_desc->desc_cnt--;
+		list_del(&desc->node);
+	}
+	spin_unlock_bh(&jr->pool_lock);
+
+	if (!desc) {
+		desc = kzalloc(sizeof(struct caam_dma_async_tx_desc),
+			       GFP_KERNEL);
+		if (!desc) {
+			dev_err(jr->dev, "Out of memory for XOR async tx\n");
+			try_clear_desc(jr);
+
+			return ERR_PTR(-ENOMEM);
+		}
+
+		desc->async_tx.tx_submit = caam_jr_tx_submit;
+	}
+
+	dma_async_tx_descriptor_init(&desc->async_tx, &jr->chan);
+
+	priv = dev_get_drvdata(jr->caam_hw_jr->parentdev);
+
+	prepare_caam_xor_desc(jr->caam_hw_jr->parentdev, desc,
+			      priv->xor_sh_desc[0].sh_desc_phys, dest,
+			      src, src_cnt, len);
+
+	desc->async_tx.flags = flags;
+	desc->async_tx.cookie = -EBUSY;
+	return &desc->async_tx;
+}
+
+static void caam_jr_free_chan_resources(struct dma_chan *chan)
+{
+	struct caam_dma_jr *jr = container_of(chan, struct caam_dma_jr, chan);
+	struct caam_dma_async_tx_desc *desc;
+	struct list_head *current_node;
+
+	current_node = jr->soft_desc->head.next;
+	while (jr->soft_desc->desc_cnt > 0) {
+		desc = container_of(current_node, struct caam_dma_async_tx_desc,
+				    node);
+		current_node = current_node->next;
+		list_del(&desc->node);
+		kfree(desc);
+		jr->soft_desc->desc_cnt--;
+	}
+
+	kfree(jr->soft_desc);
+
+	return;
+}
+
+static int caam_jr_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct caam_dma_jr *jr = container_of(chan, struct caam_dma_jr, chan);
+	struct caam_dma_async_tx_desc *desc;
+	unsigned int i;
+
+	jr->soft_desc = kzalloc(sizeof(struct caam_dma_desc_pool), GFP_KERNEL);
+	if (!jr->soft_desc) {
+		pr_err("%s: Failed to allocate resources for DMA channel\n",
+		       __func__);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&jr->soft_desc->head);
+	for (i = 0; i < MAX_INITIAL_DESCS; i++) {
+		desc = kzalloc(sizeof(struct caam_dma_async_tx_desc),
+			       GFP_KERNEL);
+		if (!desc)
+			return -ENOMEM;
+
+		desc->async_tx.tx_submit = caam_jr_tx_submit;
+		jr->soft_desc->desc_cnt++;
+		list_add_tail(&desc->node, &jr->soft_desc->head);
+	}
+
+	return 0;
+}
+
+static int caam_jr_chan_bind(struct device *ctrldev, struct device *dev)
+{
+	struct caam_drv_private *priv = dev_get_drvdata(ctrldev);
+	struct caam_drv_private_jr *jrpriv = dev_get_drvdata(dev);
+	struct dma_device *dma_dev = &priv->dma_dev;
+	struct caam_dma_jr *dma_jr;
+
+	dma_jr = kzalloc(sizeof(struct caam_dma_jr), GFP_KERNEL);
+	if (!dma_jr) {
+		dev_err(dev, "Failed to allocate memory for caam job queue\n");
+		return -ENOMEM;
+	}
+
+	dma_jr->chan.device = dma_dev;
+	dma_jr->chan.private = dma_jr;
+
+	INIT_LIST_HEAD(&dma_jr->submit_q);
+	spin_lock_init(&dma_jr->desc_lock);
+	spin_lock_init(&dma_jr->pool_lock);
+	list_add_tail(&dma_jr->chan.device_node, &dma_dev->channels);
+	dma_dev->chancnt++;
+
+	dma_jr->caam_hw_jr = jrpriv;
+	dma_jr->dev = dev;
+	jrpriv->jrdev = dev;
+
+	INIT_LIST_HEAD(&dma_jr->done_not_acked);
+	spin_lock_init(&dma_jr->done_lock);
+	tasklet_init(&dma_jr->handle_done, check_done, (unsigned long) dma_jr);
+
+	return 0;
+}
+
+static inline void caam_jr_chan_unbind(struct device *ctrldev,
+				       struct dma_chan *chan)
+{
+	struct caam_drv_private *priv = dev_get_drvdata(ctrldev);
+	struct dma_device *dma_dev = &priv->dma_dev;
+
+	list_del(&chan->device_node);
+	dma_dev->chancnt--;
+}
+
+static inline void caam_jr_free(struct dma_chan *chan)
+{
+	struct caam_dma_jr *dma_jr = container_of(chan, struct caam_dma_jr,
+						  chan);
+
+	list_del(&chan->device_node);
+	kfree(dma_jr);
+}
+
+static int caam_jr_dma_init(struct device *ctrldev)
+{
+	struct caam_drv_private *priv = dev_get_drvdata(ctrldev);
+	struct dma_device *dma_dev = NULL;
+	struct caam_xor_sh_desc *sh_desc;
+	int i;
+
+	priv->xor_sh_desc =
+	    kzalloc(sizeof(struct caam_xor_sh_desc), GFP_KERNEL);
+	if (!priv->xor_sh_desc) {
+		dev_err(ctrldev,
+			"Failed to allocate memory for XOR Shared"
+			"descriptor\n");
+		return -ENOMEM;
+	}
+
+	sh_desc = priv->xor_sh_desc;
+	prepare_caam_xor_sh_desc(sh_desc->desc, MAX_XOR_SRCS);
+	sh_desc->sh_desc_phys = dma_map_single(ctrldev, &sh_desc->desc,
+						SH_DESC_LEN * sizeof(u32),
+						DMA_TO_DEVICE);
+
+	dma_dev = &priv->dma_dev;
+	dma_dev->dev = ctrldev;
+	INIT_LIST_HEAD(&dma_dev->channels);
+
+	dma_dev->max_xor = MAX_XOR_SRCS;
+
+	/*
+	 * xor transaction must be 128 bytes aligned. For unaligned
+	 * transaction, xor-parity calculations will not be off-loaded
+	 * to caam
+	 */
+	dma_dev->xor_align = 8;
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+
+	dma_dev->device_alloc_chan_resources = caam_jr_alloc_chan_resources;
+	dma_dev->device_tx_status = caam_jr_tx_status;
+	dma_dev->device_issue_pending = caam_jr_issue_pending;
+	dma_dev->device_prep_dma_xor = caam_jr_prep_dma_xor;
+	dma_dev->device_free_chan_resources = caam_jr_free_chan_resources;
+
+	for (i = 0; i < priv->total_jobrs; i++)
+		caam_jr_chan_bind(ctrldev, priv->jrdev[i]);
+
+	dma_async_device_register(dma_dev);
+	dev_info(ctrldev, "caam xor support with %d job rings\n",
+		 priv->total_jobrs);
+
+	return 0;
+}
+
+static void caam_jr_dma_exit(struct device *ctrldev)
+{
+	struct caam_drv_private *priv = dev_get_drvdata(ctrldev);
+	struct dma_device *dma_dev = &priv->dma_dev;
+	struct dma_chan *chan, *_chan;
+	struct list_head to_free;
+	int i;
+
+	i = 0;
+	INIT_LIST_HEAD(&to_free);
+	/* before unregistering device, remove channels... */
+	list_for_each_entry_safe(chan, _chan, &dma_dev->channels, device_node) {
+		caam_jr_chan_unbind(ctrldev, chan);
+		list_add_tail(&chan->device_node, &to_free);
+		i++;
+	}
+
+	dma_async_device_unregister(dma_dev);
+
+	/*
+	 * ...but don't delete them until device has been unregistered, so
+	 * that deleted channels will not be used
+	 */
+	list_for_each_entry_safe(chan, _chan, &to_free, device_node) {
+		caam_jr_free(chan);
+	}
+
+	for (i = 0; i < (MAX_XOR_SRCS - 2); i++) {
+		dma_unmap_single(ctrldev, priv->xor_sh_desc[i].sh_desc_phys,
+				 SH_DESC_LEN * sizeof(u32), DMA_TO_DEVICE);
+	}
+
+	kfree(priv->xor_sh_desc);
+	dev_info(ctrldev, "caam xor support disabled\n");
+}
+
+static int __init caam_xor_init(void)
+{
+	struct device_node *dev_node;
+	struct platform_device *pdev;
+	struct device *ctrldev;
+	struct caam_drv_private *priv;
+	int err = 0;
+
+	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
+	if (!dev_node)
+		return -ENODEV;
+
+	pdev = of_find_device_by_node(dev_node);
+	if (!pdev)
+		return -ENODEV;
+
+	ctrldev = &pdev->dev;
+	priv = dev_get_drvdata(ctrldev);
+	of_node_put(dev_node);
+
+	atomic_set(&priv->tfm_count, -1);
+
+	/* register caam device */
+	err = caam_jr_dma_init(ctrldev);
+	if (err)
+		dev_err(ctrldev, "error in xor initialization: %d\n", err);
+
+	return err;
+}
+
+static void __exit caam_xor_exit(void)
+{
+	struct device_node *dev_node;
+	struct platform_device *pdev;
+	struct device *ctrldev;
+	struct caam_drv_private *priv;
+
+	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
+	if (!dev_node)
+		return;
+
+	pdev = of_find_device_by_node(dev_node);
+	if (!pdev)
+		return;
+
+	ctrldev = &pdev->dev;
+	of_node_put(dev_node);
+	priv = dev_get_drvdata(ctrldev);
+
+	caam_jr_dma_exit(ctrldev);
+}
+
+module_init(caam_xor_init);
+module_exit(caam_xor_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FSL CAAM support for crypto API");
+MODULE_AUTHOR("Freescale Semiconductor - NMG/STC");
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index c85c1f0..d06bf68 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -9,11 +9,13 @@ 
 #define IMMEDIATE (1 << 23)
 #define CAAM_CMD_SZ sizeof(u32)
 #define CAAM_PTR_SZ sizeof(dma_addr_t)
+#define CAAM_PTR_LEN (CAAM_PTR_SZ / CAAM_CMD_SZ)
 #define CAAM_DESC_BYTES_MAX (CAAM_CMD_SZ * MAX_CAAM_DESCSIZE)
 
 #ifdef DEBUG
-#define PRINT_POS do { printk(KERN_DEBUG "%02d: %s\n", desc_len(desc),\
-			      &__func__[sizeof("append")]); } while (0)
+#define PRINT_POS do { pr_debug("%02d: %s\n", desc_len(desc),\
+				&__func__[sizeof("append")]);\
+		} while (0)
 #else
 #define PRINT_POS
 #endif
@@ -82,6 +84,20 @@  static inline void append_ptr(u32 *desc, dma_addr_t ptr)
 	(*desc) += CAAM_PTR_SZ / CAAM_CMD_SZ;
 }
 
+/* Write command without affecting header, and return pointer to next word */
+static inline u32 *write_ptr(u32 *desc, dma_addr_t ptr)
+{
+	memcpy(desc, &ptr, CAAM_PTR_SZ);
+
+	return desc + CAAM_PTR_LEN;
+}
+
+/* Increase descriptor length */
+static inline void append_len(u32 *desc, unsigned int len)
+{
+	(*desc) += len;
+}
+
 static inline void init_job_desc_shared(u32 *desc, dma_addr_t ptr, int len,
 					u32 options)
 {
@@ -110,6 +126,14 @@  static inline void append_cmd(u32 *desc, u32 command)
 	(*desc)++;
 }
 
+/* Write command without affecting header, and return pointer to next word */
+static inline u32 *write_cmd(u32 *desc, u32 command)
+{
+	*desc = command;
+
+	return desc + 1;
+}
+
 static inline void append_cmd_ptr(u32 *desc, dma_addr_t ptr, int len,
 				  u32 command)
 {
@@ -143,11 +167,28 @@  static inline u32 *append_jump(u32 *desc, u32 options)
 	return cmd;
 }
 
+/* Given destination, as offset from header, append jump */
+static inline void append_jump_to(u32 *desc, u32 options, u32 target)
+{
+	PRINT_POS;
+
+	append_jump(desc, options | ((target - desc_len(desc)) &
+		    JUMP_OFFSET_MASK));
+}
+
 static inline void set_jump_tgt_here(u32 *desc, u32 *jump_cmd)
 {
 	*jump_cmd = *jump_cmd | (desc_len(desc) - (jump_cmd - desc));
 }
 
+/* len words have no commands */
+static inline u32 *write_nop(u32 *desc, int len)
+{
+	*desc = CMD_JUMP | len;
+
+	return desc + len;
+}
+
 #define APPEND_CMD(cmd, op) \
 static inline void append_##cmd(u32 *desc, u32 options) \
 { \
@@ -157,6 +198,14 @@  static inline void append_##cmd(u32 *desc, u32 options) \
 APPEND_CMD(operation, OPERATION)
 APPEND_CMD(move, MOVE)
 
+#define WRITE_CMD(cmd, op) \
+static inline u32 *write_##cmd(u32 *desc, u32 options) \
+{ \
+	PRINT_POS; \
+	return write_cmd(desc, CMD_##op | options); \
+}
+WRITE_CMD(move, MOVE)
+
 #define APPEND_CMD_LEN(cmd, op) \
 static inline void append_##cmd(u32 *desc, unsigned int len, u32 options) \
 { \
diff --git a/drivers/crypto/caam/intern.h b/drivers/crypto/caam/intern.h
index 5cd4c1b..2b41e31 100644
--- a/drivers/crypto/caam/intern.h
+++ b/drivers/crypto/caam/intern.h
@@ -26,6 +26,8 @@ 
 #define JOBR_INTC_COUNT_THLD 0
 #endif
 
+#define CAAM_NAPI_WEIGHT	63
+
 /*
  * Storage for tracking each in-process entry moving across a ring
  * Each entry on an output ring needs one of these
@@ -58,6 +60,7 @@  struct caam_drv_private_jr {
 	int out_ring_read_index;	/* Output index "tail" */
 	int tail;			/* entinfo (s/w ring) tail index */
 	struct jr_outentry *outring;	/* Base of output ring, DMA-safe */
+	struct device *jrdev;
 };
 
 /*
@@ -91,6 +94,10 @@  struct caam_drv_private {
 	/* list of registered hash algorithms (mk generic context handle?) */
 	struct list_head hash_list;
 
+	/* For DMA-XOR support */
+	struct dma_device dma_dev;
+	struct caam_xor_sh_desc *xor_sh_desc;
+
 	/*
 	 * debugfs entries for developer view into driver/device
 	 * variables at runtime.
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index 53c8c51..8dc81cf 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -80,8 +80,8 @@  static void caam_jr_dequeue(unsigned long devarg)
 		/* we should never fail to find a matching descriptor */
 		BUG_ON(CIRC_CNT(head, tail + i, JOBR_DEPTH) <= 0);
 
-		/* Unmap just-run descriptor so we can post-process */
-		dma_unmap_single(dev, jrp->outring[hw_idx].desc,
+		/* Unmap just-run job descriptor so we can post-process */
+		dma_unmap_single(jrp->jrdev, jrp->outring[hw_idx].desc,
 				 jrp->entinfo[sw_idx].desc_size,
 				 DMA_TO_DEVICE);
 
@@ -230,7 +230,7 @@  int caam_jr_enqueue(struct device *dev, u32 *desc,
 	dma_addr_t desc_dma;
 
 	desc_size = (*desc & HDR_JD_LENGTH_MASK) * sizeof(u32);
-	desc_dma = dma_map_single(dev, desc, desc_size, DMA_TO_DEVICE);
+	desc_dma = dma_map_single(jrp->jrdev, desc, desc_size, DMA_TO_DEVICE);
 	if (dma_mapping_error(dev, desc_dma)) {
 		dev_err(dev, "caam_jr_enqueue(): can't map jobdesc\n");
 		return -EIO;
@@ -466,6 +466,8 @@  int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
 	else
 		dma_set_mask(jrdev, DMA_BIT_MASK(32));
 
+	jrpriv->jrdev = jrdev;
+
 	/* Identify the interrupt */
 	jrpriv->irq = of_irq_to_resource(np, 0, NULL);