From d793327fac3a9292edfdcee0ab6c29e4fdd08d65 Mon Sep 17 00:00:00 2001 From: Hanna Hawa Date: Fri, 5 May 2017 11:57:51 +0200 Subject: dmaengine: mv_xor_v2: implement proper interrupt coalescing Until now, the driver was not using interrupt coalescing: one interrupt was generated for each descriptor processed by the XOR engine. This commit changes that by using the interrupt coalescing features of the hardware, by setting both a number of descriptors processed before an interrupt is generated and a timeout before an interrupt is generated. Signed-off-by: Hanna Hawa Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul --- drivers/dma/mv_xor_v2.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index f3e211f8f6c5..5292ff2d5380 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -42,6 +42,7 @@ #define MV_XOR_V2_DMA_IMSG_THRD_OFF 0x018 #define MV_XOR_V2_DMA_IMSG_THRD_MASK 0x7FFF #define MV_XOR_V2_DMA_IMSG_THRD_SHIFT 0x0 +#define MV_XOR_V2_DMA_IMSG_TIMER_EN BIT(18) #define MV_XOR_V2_DMA_DESQ_AWATTR_OFF 0x01C /* Same flags as MV_XOR_V2_DMA_DESQ_ARATTR_OFF */ #define MV_XOR_V2_DMA_DESQ_ALLOC_OFF 0x04C @@ -55,6 +56,9 @@ #define MV_XOR_V2_DMA_DESQ_STOP_OFF 0x800 #define MV_XOR_V2_DMA_DESQ_DEALLOC_OFF 0x804 #define MV_XOR_V2_DMA_DESQ_ADD_OFF 0x808 +#define MV_XOR_V2_DMA_IMSG_TMOT 0x810 +#define MV_XOR_V2_DMA_IMSG_TIMER_THRD_MASK 0x1FFF +#define MV_XOR_V2_DMA_IMSG_TIMER_THRD_SHIFT 0 /* XOR Global registers */ #define MV_XOR_V2_GLOB_BW_CTRL 0x4 @@ -90,6 +94,13 @@ */ #define MV_XOR_V2_DESC_NUM 1024 +/* + * Threshold values for descriptors and timeout, determined by + * experimentation as giving a good level of performance. + */ +#define MV_XOR_V2_DONE_IMSG_THRD 0x14 +#define MV_XOR_V2_TIMER_THRD 0xB0 + /** * struct mv_xor_v2_descriptor - DMA HW descriptor * @desc_id: used by S/W and is not affected by H/W. 
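The two thresholds above work together: an interrupt is raised either once MV_XOR_V2_DONE_IMSG_THRD (0x14, i.e. 20) descriptors have completed, or when the coalescing timer programmed with MV_XOR_V2_TIMER_THRD (0xB0) expires. The sketch below shows how the same register layout could be driven with caller-chosen values instead of the hard-coded constants; the helper name and its parameters are hypothetical and only reuse the definitions added by this patch.

	/*
	 * Hypothetical tuning helper (not part of the patch): program
	 * arbitrary coalescing thresholds using the registers defined
	 * above. The timer threshold is in hardware-specific ticks, so
	 * no unit is implied here.
	 */
	static void mv_xor_v2_set_imsg_coalescing(struct mv_xor_v2_device *xor_dev,
						  u32 desc_thrd, u32 timer_thrd)
	{
		u32 reg;

		/* Descriptor-count threshold, plus the coalescing timer enable */
		reg = readl(xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_THRD_OFF);
		reg &= ~(MV_XOR_V2_DMA_IMSG_THRD_MASK << MV_XOR_V2_DMA_IMSG_THRD_SHIFT);
		reg |= (desc_thrd & MV_XOR_V2_DMA_IMSG_THRD_MASK) <<
			MV_XOR_V2_DMA_IMSG_THRD_SHIFT;
		reg |= MV_XOR_V2_DMA_IMSG_TIMER_EN;
		writel(reg, xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_THRD_OFF);

		/* Timeout threshold */
		reg = readl(xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_TMOT);
		reg &= ~(MV_XOR_V2_DMA_IMSG_TIMER_THRD_MASK <<
			 MV_XOR_V2_DMA_IMSG_TIMER_THRD_SHIFT);
		reg |= (timer_thrd & MV_XOR_V2_DMA_IMSG_TIMER_THRD_MASK) <<
			MV_XOR_V2_DMA_IMSG_TIMER_THRD_SHIFT;
		writel(reg, xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_TMOT);
	}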
@@ -246,6 +257,29 @@ static int mv_xor_v2_set_desc_size(struct mv_xor_v2_device *xor_dev)
 	return MV_XOR_V2_EXT_DESC_SIZE;
 }
 
+/*
+ * Set the IMSG threshold
+ */
+static inline
+void mv_xor_v2_enable_imsg_thrd(struct mv_xor_v2_device *xor_dev)
+{
+	u32 reg;
+
+	/* Configure threshold of number of descriptors, and enable timer */
+	reg = readl(xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_THRD_OFF);
+	reg &= (~MV_XOR_V2_DMA_IMSG_THRD_MASK << MV_XOR_V2_DMA_IMSG_THRD_SHIFT);
+	reg |= (MV_XOR_V2_DONE_IMSG_THRD << MV_XOR_V2_DMA_IMSG_THRD_SHIFT);
+	reg |= MV_XOR_V2_DMA_IMSG_TIMER_EN;
+	writel(reg, xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_THRD_OFF);
+
+	/* Configure Timer Threshold */
+	reg = readl(xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_TMOT);
+	reg &= (~MV_XOR_V2_DMA_IMSG_TIMER_THRD_MASK <<
+		MV_XOR_V2_DMA_IMSG_TIMER_THRD_SHIFT);
+	reg |= (MV_XOR_V2_TIMER_THRD << MV_XOR_V2_DMA_IMSG_TIMER_THRD_SHIFT);
+	writel(reg, xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_TMOT);
+}
+
 static irqreturn_t mv_xor_v2_interrupt_handler(int irq, void *data)
 {
 	struct mv_xor_v2_device *xor_dev = data;
@@ -795,6 +829,8 @@ static int mv_xor_v2_probe(struct platform_device *pdev)
 
 	list_add_tail(&xor_dev->dmachan.device_node, &dma_dev->channels);
 
+	mv_xor_v2_enable_imsg_thrd(xor_dev);
+
 	mv_xor_v2_descq_init(xor_dev);
 
 	ret = dma_async_device_register(dma_dev);
--
cgit v1.2.3


From 35e34480c5177f96d822b8ea8e9e42b5fcca6ff6 Mon Sep 17 00:00:00 2001
From: Hanna Hawa
Date: Fri, 5 May 2017 11:57:52 +0200
Subject: dmaengine: mv_xor_v2: remove unnecessary write to DESQ_STOP register

Remove the unnecessary write to the DESQ_STOP register: this register is
used to enable or disable the XOR engine, not to issue all pending
descriptors in the queue. The mv_xor_v2 driver already writes to this
register and enables the XOR engine in the mv_xor_v2_descq_init()
function, called during initialization.

Signed-off-by: Hanna Hawa
Signed-off-by: Thomas Petazzoni
Signed-off-by: Vinod Koul
---
 drivers/dma/mv_xor_v2.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c
index 5292ff2d5380..f386b88bb50c 100644
--- a/drivers/dma/mv_xor_v2.c
+++ b/drivers/dma/mv_xor_v2.c
@@ -535,9 +535,6 @@ static void mv_xor_v2_issue_pending(struct dma_chan *chan)
 	mv_xor_v2_add_desc_to_desq(xor_dev, xor_dev->npendings);
 	xor_dev->npendings = 0;
 
-	/* Activate the channel */
-	writel(0, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_STOP_OFF);
-
 	spin_unlock_bh(&xor_dev->lock);
 }
 
--
cgit v1.2.3


From ecfa77145b13138677c25b4c117d5043bb98cbc8 Mon Sep 17 00:00:00 2001
From: Hanna Hawa
Date: Fri, 5 May 2017 11:57:53 +0200
Subject: dmaengine: mv_xor_v2: add support for suspend/resume

This commit adds support for suspend/resume in the mv_xor_v2 driver.
The suspend function disables the XOR engine after the DMA stack has
handled all pending descriptors in the queue. The resume function
re-configures the XOR engine and re-enables it.
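To make the role of the DESQ_STOP register concrete, its two uses can be pictured as a hypothetical pair of helpers; the driver itself simply writes the register directly. Writing 0 lets the engine run, as mv_xor_v2_descq_init() already does, and writing 1 stops it, as the new suspend callback in the diff below does.

	/* Illustrative only; these helpers do not exist in the driver. */
	static void mv_xor_v2_engine_run(struct mv_xor_v2_device *xor_dev)
	{
		/* 0 = engine enabled, done once from mv_xor_v2_descq_init() */
		writel(0, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_STOP_OFF);
	}

	static void mv_xor_v2_engine_stop(struct mv_xor_v2_device *xor_dev)
	{
		/* 1 = engine disabled, used by the suspend path */
		writel(1, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_STOP_OFF);
	}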
Signed-off-by: Hanna Hawa
Signed-off-by: Thomas Petazzoni
Signed-off-by: Vinod Koul
---
 drivers/dma/mv_xor_v2.c | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c
index f386b88bb50c..f652a0e0f5a2 100644
--- a/drivers/dma/mv_xor_v2.c
+++ b/drivers/dma/mv_xor_v2.c
@@ -696,6 +696,27 @@ static int mv_xor_v2_descq_init(struct mv_xor_v2_device *xor_dev)
 	return 0;
 }
 
+static int mv_xor_v2_suspend(struct platform_device *dev, pm_message_t state)
+{
+	struct mv_xor_v2_device *xor_dev = platform_get_drvdata(dev);
+
+	/* Set the DESQ stop bit to disable the XOR unit */
+	writel(0x1, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_STOP_OFF);
+
+	return 0;
+}
+
+static int mv_xor_v2_resume(struct platform_device *dev)
+{
+	struct mv_xor_v2_device *xor_dev = platform_get_drvdata(dev);
+
+	mv_xor_v2_set_desc_size(xor_dev);
+	mv_xor_v2_enable_imsg_thrd(xor_dev);
+	mv_xor_v2_descq_init(xor_dev);
+
+	return 0;
+}
+
 static int mv_xor_v2_probe(struct platform_device *pdev)
 {
 	struct mv_xor_v2_device *xor_dev;
@@ -877,6 +898,8 @@ MODULE_DEVICE_TABLE(of, mv_xor_v2_dt_ids);
 
 static struct platform_driver mv_xor_v2_driver = {
 	.probe = mv_xor_v2_probe,
+	.suspend = mv_xor_v2_suspend,
+	.resume = mv_xor_v2_resume,
 	.remove = mv_xor_v2_remove,
 	.driver = {
 		.name = "mv_xor_v2",
--
cgit v1.2.3


From b5dceda1f7ef66cba6f8d766502f242a27f96e6d Mon Sep 17 00:00:00 2001
From: Anup Patel
Date: Mon, 15 May 2017 10:34:52 +0530
Subject: lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position

The raid6_gfexp table represents {2}^n values for 0 <= n < 256. The
Linux async_tx framework passes values from raid6_gfexp as coefficients
for each source to the prep_dma_pq() callback of a DMA channel with PQ
capability. This creates a problem for RAID6 offload engines (such as
Broadcom SBA) which take a disk position (i.e. the log of {2}) instead
of a multiplicative coefficient from the raid6_gfexp table.

This patch adds a raid6_gflog table holding the log-of-2 value for any
given x such that 0 <= x < 256. For any given disk coefficient x, the
corresponding disk position is given by raid6_gflog[x]. A RAID6 offload
engine driver can use this newly added raid6_gflog table to get the
disk position from a multiplicative coefficient.
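As a usage sketch, a RAID6 offload driver's prep_dma_pq() callback can turn each coefficient handed over by async_tx into a disk position with a single lookup; this is exactly how the Broadcom SBA driver later in this series uses raid6_gflog[scf[i]]. The helper name below is hypothetical and not part of this patch.

	#include <linux/errno.h>
	#include <linux/raid/pq.h>

	/*
	 * Hypothetical helper: translate async_tx multiplicative coefficients
	 * into disk positions for hardware that wants log-of-2 values.
	 * raid6_gflog[0] is 255, since a zero coefficient has no logarithm.
	 */
	static int scf_to_disk_positions(const u8 *scf, u8 *pos, u32 src_cnt)
	{
		u32 i;

		for (i = 0; i < src_cnt; i++) {
			if (raid6_gflog[scf[i]] == 255)
				return -EINVAL;
			pos[i] = raid6_gflog[scf[i]];
		}

		return 0;
	}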
Signed-off-by: Anup Patel
Reviewed-by: Scott Branden
Reviewed-by: Ray Jui
Acked-by: Shaohua Li
Signed-off-by: Vinod Koul
---
 include/linux/raid/pq.h | 1 +
 lib/raid6/mktables.c | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 4d57bbaaa1bf..30f945329818 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -142,6 +142,7 @@ int raid6_select_algo(void);
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
 extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
+extern const u8 raid6_gflog[256] __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 39787db588b0..e824d088f72c 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -125,6 +125,26 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
 	printf("#endif\n");
 
+	/* Compute log-of-2 table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+		"raid6_gflog[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			v = 255;
+			for (k = 0; k < 256; k++)
+				if (exptbl[k] == (i + j)) {
+					v = k;
+					break;
+				}
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gflog);\n");
+	printf("#endif\n");
+
 	/* Compute inverse table x^-1 == x^254 */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
 		"raid6_gfinv[256] =\n" "{\n");
--
cgit v1.2.3


From baae03a0e2497f49704628fd0aaf993cf98e1b99 Mon Sep 17 00:00:00 2001
From: Anup Patel
Date: Mon, 15 May 2017 10:34:53 +0530
Subject: async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()

DMA_PREP_FENCE is to be used when preparing a Tx descriptor whose output
is to be used by the next/dependent Tx descriptor.

DMA_PREP_FENCE will not be set correctly in do_async_gen_syndrome() when
calling dma->device_prep_dma_pq() under the following conditions:
1. ASYNC_TX_FENCE not set in submit->flags
2. DMA_PREP_FENCE not set in dma_flags
3. src_cnt (= (disks - 2)) is greater than dma_maxpq(dma, dma_flags)

This patch fixes the DMA_PREP_FENCE usage in do_async_gen_syndrome(),
taking inspiration from the do_async_xor() implementation.
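The crux of the fix, visible in the diff below, is where the ASYNC_TX_FENCE check happens. When src_cnt exceeds dma_maxpq() the operation is split across several device_prep_dma_pq() calls, and every call except the last produces partial P/Q results that the next call consumes, so those descriptors must be fenced even if the caller never requested fencing. Checking submit->flags once before the loop missed that case; checking it inside the loop, after the per-segment flags have been adjusted, covers it. The outline below is a simplified sketch of the upstream loop, with callback and unmap bookkeeping omitted, not a verbatim copy.

	/* Simplified sketch of the corrected loop in do_async_gen_syndrome() */
	while (src_cnt > 0) {
		submit->flags = flags_orig;
		pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
		if (src_cnt > pq_src_cnt) {
			/* more segments follow: this one must be fenced */
			submit->flags |= ASYNC_TX_FENCE;
			dma_flags |= DMA_PREP_CONTINUE;
		}
		/* evaluated per segment now, not once before the loop */
		if (submit->flags & ASYNC_TX_FENCE)
			dma_flags |= DMA_PREP_FENCE;

		tx = dma->device_prep_dma_pq(chan, dma_dest, &dma_src[src_off],
					     pq_src_cnt, &scfs[src_off], len,
					     dma_flags);

		src_cnt -= pq_src_cnt;
		src_off += pq_src_cnt;
	}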
Signed-off-by: Anup Patel Reviewed-by: Ray Jui Reviewed-by: Scott Branden Acked-by: Dan Williams Signed-off-by: Vinod Koul --- crypto/async_tx/async_pq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index f83de99d7d71..56bd612927ab 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -62,9 +62,6 @@ do_async_gen_syndrome(struct dma_chan *chan, dma_addr_t dma_dest[2]; int src_off = 0; - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; - while (src_cnt > 0) { submit->flags = flags_orig; pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags)); @@ -83,6 +80,8 @@ do_async_gen_syndrome(struct dma_chan *chan, if (cb_fn_orig) dma_flags |= DMA_PREP_INTERRUPT; } + if (submit->flags & ASYNC_TX_FENCE) + dma_flags |= DMA_PREP_FENCE; /* Drivers force forward progress in case they can not provide * a descriptor -- cgit v1.2.3 From 743e1c8ffe4ee5dd7596556dcc3f022ccde13d7b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 15 May 2017 10:34:54 +0530 Subject: dmaengine: Add Broadcom SBA RAID driver The Broadcom stream buffer accelerator (SBA) provides offloading capabilities for RAID operations. This SBA offload engine is accessible via Broadcom SoC specific ring manager. This patch adds Broadcom SBA RAID driver which provides one DMA device with RAID capabilities using one or more Broadcom SoC specific ring manager channels. The SBA RAID driver in its current shape implements memcpy, xor, and pq operations. Signed-off-by: Anup Patel Reviewed-by: Ray Jui Acked-by: Dan Williams Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 14 + drivers/dma/Makefile | 1 + drivers/dma/bcm-sba-raid.c | 1785 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1800 insertions(+) create mode 100644 drivers/dma/bcm-sba-raid.c diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 24e8597b2c3e..b7e0ab9585bc 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -99,6 +99,20 @@ config AXI_DMAC controller is often used in Analog Device's reference designs for FPGA platforms. +config BCM_SBA_RAID + tristate "Broadcom SBA RAID engine support" + depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST + select DMA_ENGINE + select DMA_ENGINE_RAID + select ASYNC_TX_DISABLE_XOR_VAL_DMA + select ASYNC_TX_DISABLE_PQ_VAL_DMA + default ARCH_BCM_IPROC + help + Enable support for Broadcom SBA RAID Engine. The SBA RAID + engine is available on most of the Broadcom iProc SoCs. It + has the capability to offload memcpy, xor and pq computation + for raid5/6. 
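Before the driver source itself, here is the fan-out described in the patch description above in miniature: the single DMA channel spreads requests over however many mailbox channels the device tree provides, picking the next one in round-robin order for each message. The snippet is only an illustration and mirrors what sba_send_mbox_request() does further down in the new file.

	/* Select a mailbox channel in round-robin fashion */
	mchans_idx = atomic_inc_return(&sba->mchans_current);
	mchans_idx = mchans_idx % sba->mchans_count;

	ret = mbox_send_message(sba->mchans[mchans_idx], &req->msg);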
+ config COH901318 bool "ST-Ericsson COH901318 DMA support" select DMA_ENGINE diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 0b723e94d9e6..d12ab2985ed1 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/ obj-$(CONFIG_AT_HDMAC) += at_hdmac.o obj-$(CONFIG_AT_XDMAC) += at_xdmac.o obj-$(CONFIG_AXI_DMAC) += dma-axi-dmac.o +obj-$(CONFIG_BCM_SBA_RAID) += bcm-sba-raid.o obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c new file mode 100644 index 000000000000..d6b927b960ab --- /dev/null +++ b/drivers/dma/bcm-sba-raid.c @@ -0,0 +1,1785 @@ +/* + * Copyright (C) 2017 Broadcom + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Broadcom SBA RAID Driver + * + * The Broadcom stream buffer accelerator (SBA) provides offloading + * capabilities for RAID operations. The SBA offload engine is accessible + * via Broadcom SoC specific ring manager. Two or more offload engines + * can share same Broadcom SoC specific ring manager due to this Broadcom + * SoC specific ring manager driver is implemented as a mailbox controller + * driver and offload engine drivers are implemented as mallbox clients. + * + * Typically, Broadcom SoC specific ring manager will implement larger + * number of hardware rings over one or more SBA hardware devices. By + * design, the internal buffer size of SBA hardware device is limited + * but all offload operations supported by SBA can be broken down into + * multiple small size requests and executed parallely on multiple SBA + * hardware devices for achieving high through-put. + * + * The Broadcom SBA RAID driver does not require any register programming + * except submitting request to SBA hardware device via mailbox channels. + * This driver implements a DMA device with one DMA channel using a set + * of mailbox channels provided by Broadcom SoC specific ring manager + * driver. To exploit parallelism (as described above), all DMA request + * coming to SBA RAID DMA channel are broken down to smaller requests + * and submitted to multiple mailbox channels in round-robin fashion. + * For having more SBA DMA channels, we can create more SBA device nodes + * in Broadcom SoC specific DTS based on number of hardware rings supported + * by Broadcom SoC ring manager. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dmaengine.h" + +/* SBA command related defines */ +#define SBA_TYPE_SHIFT 48 +#define SBA_TYPE_MASK GENMASK(1, 0) +#define SBA_TYPE_A 0x0 +#define SBA_TYPE_B 0x2 +#define SBA_TYPE_C 0x3 +#define SBA_USER_DEF_SHIFT 32 +#define SBA_USER_DEF_MASK GENMASK(15, 0) +#define SBA_R_MDATA_SHIFT 24 +#define SBA_R_MDATA_MASK GENMASK(7, 0) +#define SBA_C_MDATA_MS_SHIFT 18 +#define SBA_C_MDATA_MS_MASK GENMASK(1, 0) +#define SBA_INT_SHIFT 17 +#define SBA_INT_MASK BIT(0) +#define SBA_RESP_SHIFT 16 +#define SBA_RESP_MASK BIT(0) +#define SBA_C_MDATA_SHIFT 8 +#define SBA_C_MDATA_MASK GENMASK(7, 0) +#define SBA_C_MDATA_BNUMx_SHIFT(__bnum) (2 * (__bnum)) +#define SBA_C_MDATA_BNUMx_MASK GENMASK(1, 0) +#define SBA_C_MDATA_DNUM_SHIFT 5 +#define SBA_C_MDATA_DNUM_MASK GENMASK(4, 0) +#define SBA_C_MDATA_LS(__v) ((__v) & 0xff) +#define SBA_C_MDATA_MS(__v) (((__v) >> 8) & 0x3) +#define SBA_CMD_SHIFT 0 +#define SBA_CMD_MASK GENMASK(3, 0) +#define SBA_CMD_ZERO_BUFFER 0x4 +#define SBA_CMD_ZERO_ALL_BUFFERS 0x8 +#define SBA_CMD_LOAD_BUFFER 0x9 +#define SBA_CMD_XOR 0xa +#define SBA_CMD_GALOIS_XOR 0xb +#define SBA_CMD_WRITE_BUFFER 0xc +#define SBA_CMD_GALOIS 0xe + +/* Driver helper macros */ +#define to_sba_request(tx) \ + container_of(tx, struct sba_request, tx) +#define to_sba_device(dchan) \ + container_of(dchan, struct sba_device, dma_chan) + +enum sba_request_state { + SBA_REQUEST_STATE_FREE = 1, + SBA_REQUEST_STATE_ALLOCED = 2, + SBA_REQUEST_STATE_PENDING = 3, + SBA_REQUEST_STATE_ACTIVE = 4, + SBA_REQUEST_STATE_RECEIVED = 5, + SBA_REQUEST_STATE_COMPLETED = 6, + SBA_REQUEST_STATE_ABORTED = 7, +}; + +struct sba_request { + /* Global state */ + struct list_head node; + struct sba_device *sba; + enum sba_request_state state; + bool fence; + /* Chained requests management */ + struct sba_request *first; + struct list_head next; + unsigned int next_count; + atomic_t next_pending_count; + /* BRCM message data */ + void *resp; + dma_addr_t resp_dma; + struct brcm_sba_command *cmds; + struct brcm_message msg; + struct dma_async_tx_descriptor tx; +}; + +enum sba_version { + SBA_VER_1 = 0, + SBA_VER_2 +}; + +struct sba_device { + /* Underlying device */ + struct device *dev; + /* DT configuration parameters */ + enum sba_version ver; + /* Derived configuration parameters */ + u32 max_req; + u32 hw_buf_size; + u32 hw_resp_size; + u32 max_pq_coefs; + u32 max_pq_srcs; + u32 max_cmd_per_req; + u32 max_xor_srcs; + u32 max_resp_pool_size; + u32 max_cmds_pool_size; + /* Maibox client and Mailbox channels */ + struct mbox_client client; + int mchans_count; + atomic_t mchans_current; + struct mbox_chan **mchans; + struct device *mbox_dev; + /* DMA device and DMA channel */ + struct dma_device dma_dev; + struct dma_chan dma_chan; + /* DMA channel resources */ + void *resp_base; + dma_addr_t resp_dma_base; + void *cmds_base; + dma_addr_t cmds_dma_base; + spinlock_t reqs_lock; + struct sba_request *reqs; + bool reqs_fence; + struct list_head reqs_alloc_list; + struct list_head reqs_pending_list; + struct list_head reqs_active_list; + struct list_head reqs_received_list; + struct list_head reqs_completed_list; + struct list_head reqs_aborted_list; + struct list_head reqs_free_list; + int reqs_free_count; +}; + +/* ====== SBA command helper routines ===== */ + +static inline u64 __pure sba_cmd_enc(u64 cmd, u32 val, u32 shift, u32 mask) +{ + cmd &= ~((u64)mask << shift); + cmd |= ((u64)(val & mask) << shift); + return cmd; +} 
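As a usage illustration (not part of the driver text itself), the field-packing helper above is combined like this in the message-construction routines further down in the file; each call masks one field and ORs it into the 64-bit SBA command word. The fragment is lifted from the memcpy message setup, where msg_len is the transfer length of the current segment.

	u64 cmd;
	u32 c_mdata;

	/* Type-B command: load the source data into buffer 0 */
	cmd = sba_cmd_enc(0x0, SBA_TYPE_B, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
	cmd = sba_cmd_enc(cmd, msg_len, SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
	c_mdata = sba_cmd_load_c_mdata(0);	/* buffer number 0 */
	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
	cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, SBA_CMD_SHIFT, SBA_CMD_MASK);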
+ +static inline u32 __pure sba_cmd_load_c_mdata(u32 b0) +{ + return b0 & SBA_C_MDATA_BNUMx_MASK; +} + +static inline u32 __pure sba_cmd_write_c_mdata(u32 b0) +{ + return b0 & SBA_C_MDATA_BNUMx_MASK; +} + +static inline u32 __pure sba_cmd_xor_c_mdata(u32 b1, u32 b0) +{ + return (b0 & SBA_C_MDATA_BNUMx_MASK) | + ((b1 & SBA_C_MDATA_BNUMx_MASK) << SBA_C_MDATA_BNUMx_SHIFT(1)); +} + +static inline u32 __pure sba_cmd_pq_c_mdata(u32 d, u32 b1, u32 b0) +{ + return (b0 & SBA_C_MDATA_BNUMx_MASK) | + ((b1 & SBA_C_MDATA_BNUMx_MASK) << SBA_C_MDATA_BNUMx_SHIFT(1)) | + ((d & SBA_C_MDATA_DNUM_MASK) << SBA_C_MDATA_DNUM_SHIFT); +} + +/* ====== Channel resource management routines ===== */ + +static struct sba_request *sba_alloc_request(struct sba_device *sba) +{ + unsigned long flags; + struct sba_request *req = NULL; + + spin_lock_irqsave(&sba->reqs_lock, flags); + + req = list_first_entry_or_null(&sba->reqs_free_list, + struct sba_request, node); + if (req) { + list_move_tail(&req->node, &sba->reqs_alloc_list); + req->state = SBA_REQUEST_STATE_ALLOCED; + req->fence = false; + req->first = req; + INIT_LIST_HEAD(&req->next); + req->next_count = 1; + atomic_set(&req->next_pending_count, 1); + + sba->reqs_free_count--; + + dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan); + } + + spin_unlock_irqrestore(&sba->reqs_lock, flags); + + return req; +} + +/* Note: Must be called with sba->reqs_lock held */ +static void _sba_pending_request(struct sba_device *sba, + struct sba_request *req) +{ + lockdep_assert_held(&sba->reqs_lock); + req->state = SBA_REQUEST_STATE_PENDING; + list_move_tail(&req->node, &sba->reqs_pending_list); + if (list_empty(&sba->reqs_active_list)) + sba->reqs_fence = false; +} + +/* Note: Must be called with sba->reqs_lock held */ +static bool _sba_active_request(struct sba_device *sba, + struct sba_request *req) +{ + lockdep_assert_held(&sba->reqs_lock); + if (list_empty(&sba->reqs_active_list)) + sba->reqs_fence = false; + if (sba->reqs_fence) + return false; + req->state = SBA_REQUEST_STATE_ACTIVE; + list_move_tail(&req->node, &sba->reqs_active_list); + if (req->fence) + sba->reqs_fence = true; + return true; +} + +/* Note: Must be called with sba->reqs_lock held */ +static void _sba_abort_request(struct sba_device *sba, + struct sba_request *req) +{ + lockdep_assert_held(&sba->reqs_lock); + req->state = SBA_REQUEST_STATE_ABORTED; + list_move_tail(&req->node, &sba->reqs_aborted_list); + if (list_empty(&sba->reqs_active_list)) + sba->reqs_fence = false; +} + +/* Note: Must be called with sba->reqs_lock held */ +static void _sba_free_request(struct sba_device *sba, + struct sba_request *req) +{ + lockdep_assert_held(&sba->reqs_lock); + req->state = SBA_REQUEST_STATE_FREE; + list_move_tail(&req->node, &sba->reqs_free_list); + if (list_empty(&sba->reqs_active_list)) + sba->reqs_fence = false; + sba->reqs_free_count++; +} + +static void sba_received_request(struct sba_request *req) +{ + unsigned long flags; + struct sba_device *sba = req->sba; + + spin_lock_irqsave(&sba->reqs_lock, flags); + req->state = SBA_REQUEST_STATE_RECEIVED; + list_move_tail(&req->node, &sba->reqs_received_list); + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +static void sba_complete_chained_requests(struct sba_request *req) +{ + unsigned long flags; + struct sba_request *nreq; + struct sba_device *sba = req->sba; + + spin_lock_irqsave(&sba->reqs_lock, flags); + + req->state = SBA_REQUEST_STATE_COMPLETED; + list_move_tail(&req->node, &sba->reqs_completed_list); + list_for_each_entry(nreq, &req->next, next) { 
+ nreq->state = SBA_REQUEST_STATE_COMPLETED; + list_move_tail(&nreq->node, &sba->reqs_completed_list); + } + if (list_empty(&sba->reqs_active_list)) + sba->reqs_fence = false; + + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +static void sba_free_chained_requests(struct sba_request *req) +{ + unsigned long flags; + struct sba_request *nreq; + struct sba_device *sba = req->sba; + + spin_lock_irqsave(&sba->reqs_lock, flags); + + _sba_free_request(sba, req); + list_for_each_entry(nreq, &req->next, next) + _sba_free_request(sba, nreq); + + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +static void sba_chain_request(struct sba_request *first, + struct sba_request *req) +{ + unsigned long flags; + struct sba_device *sba = req->sba; + + spin_lock_irqsave(&sba->reqs_lock, flags); + + list_add_tail(&req->next, &first->next); + req->first = first; + first->next_count++; + atomic_set(&first->next_pending_count, first->next_count); + + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +static void sba_cleanup_nonpending_requests(struct sba_device *sba) +{ + unsigned long flags; + struct sba_request *req, *req1; + + spin_lock_irqsave(&sba->reqs_lock, flags); + + /* Freeup all alloced request */ + list_for_each_entry_safe(req, req1, &sba->reqs_alloc_list, node) + _sba_free_request(sba, req); + + /* Freeup all received request */ + list_for_each_entry_safe(req, req1, &sba->reqs_received_list, node) + _sba_free_request(sba, req); + + /* Freeup all completed request */ + list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node) + _sba_free_request(sba, req); + + /* Set all active requests as aborted */ + list_for_each_entry_safe(req, req1, &sba->reqs_active_list, node) + _sba_abort_request(sba, req); + + /* + * Note: We expect that aborted request will be eventually + * freed by sba_receive_message() + */ + + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +static void sba_cleanup_pending_requests(struct sba_device *sba) +{ + unsigned long flags; + struct sba_request *req, *req1; + + spin_lock_irqsave(&sba->reqs_lock, flags); + + /* Freeup all pending request */ + list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node) + _sba_free_request(sba, req); + + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +/* ====== DMAENGINE callbacks ===== */ + +static void sba_free_chan_resources(struct dma_chan *dchan) +{ + /* + * Channel resources are pre-alloced so we just free-up + * whatever we can so that we can re-use pre-alloced + * channel resources next time. 
+ */ + sba_cleanup_nonpending_requests(to_sba_device(dchan)); +} + +static int sba_device_terminate_all(struct dma_chan *dchan) +{ + /* Cleanup all pending requests */ + sba_cleanup_pending_requests(to_sba_device(dchan)); + + return 0; +} + +static int sba_send_mbox_request(struct sba_device *sba, + struct sba_request *req) +{ + int mchans_idx, ret = 0; + + /* Select mailbox channel in round-robin fashion */ + mchans_idx = atomic_inc_return(&sba->mchans_current); + mchans_idx = mchans_idx % sba->mchans_count; + + /* Send message for the request */ + req->msg.error = 0; + ret = mbox_send_message(sba->mchans[mchans_idx], &req->msg); + if (ret < 0) { + dev_err(sba->dev, "send message failed with error %d", ret); + return ret; + } + ret = req->msg.error; + if (ret < 0) { + dev_err(sba->dev, "message error %d", ret); + return ret; + } + + return 0; +} + +static void sba_issue_pending(struct dma_chan *dchan) +{ + int ret; + unsigned long flags; + struct sba_request *req, *req1; + struct sba_device *sba = to_sba_device(dchan); + + spin_lock_irqsave(&sba->reqs_lock, flags); + + /* Process all pending request */ + list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node) { + /* Try to make request active */ + if (!_sba_active_request(sba, req)) + break; + + /* Send request to mailbox channel */ + spin_unlock_irqrestore(&sba->reqs_lock, flags); + ret = sba_send_mbox_request(sba, req); + spin_lock_irqsave(&sba->reqs_lock, flags); + + /* If something went wrong then keep request pending */ + if (ret < 0) { + _sba_pending_request(sba, req); + break; + } + } + + spin_unlock_irqrestore(&sba->reqs_lock, flags); +} + +static dma_cookie_t sba_tx_submit(struct dma_async_tx_descriptor *tx) +{ + unsigned long flags; + dma_cookie_t cookie; + struct sba_device *sba; + struct sba_request *req, *nreq; + + if (unlikely(!tx)) + return -EINVAL; + + sba = to_sba_device(tx->chan); + req = to_sba_request(tx); + + /* Assign cookie and mark all chained requests pending */ + spin_lock_irqsave(&sba->reqs_lock, flags); + cookie = dma_cookie_assign(tx); + _sba_pending_request(sba, req); + list_for_each_entry(nreq, &req->next, next) + _sba_pending_request(sba, nreq); + spin_unlock_irqrestore(&sba->reqs_lock, flags); + + return cookie; +} + +static enum dma_status sba_tx_status(struct dma_chan *dchan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + int mchan_idx; + enum dma_status ret; + struct sba_device *sba = to_sba_device(dchan); + + for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++) + mbox_client_peek_data(sba->mchans[mchan_idx]); + + ret = dma_cookie_status(dchan, cookie, txstate); + if (ret == DMA_COMPLETE) + return ret; + + return dma_cookie_status(dchan, cookie, txstate); +} + +static void sba_fillup_interrupt_msg(struct sba_request *req, + struct brcm_sba_command *cmds, + struct brcm_message *msg) +{ + u64 cmd; + u32 c_mdata; + struct brcm_sba_command *cmdsp = cmds; + + /* Type-B command to load dummy data into buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, req->sba->hw_resp_size, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = req->resp_dma; + cmdsp->data_len = req->sba->hw_resp_size; + cmdsp++; + + /* Type-A command to 
write buf0 to dummy location */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, req->sba->hw_resp_size, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = req->resp_dma; + cmdsp->data_len = req->sba->hw_resp_size; + cmdsp++; + + /* Fillup brcm_message */ + msg->type = BRCM_MESSAGE_SBA; + msg->sba.cmds = cmds; + msg->sba.cmds_count = cmdsp - cmds; + msg->ctx = req; + msg->error = 0; +} + +static struct dma_async_tx_descriptor * +sba_prep_dma_interrupt(struct dma_chan *dchan, unsigned long flags) +{ + struct sba_request *req = NULL; + struct sba_device *sba = to_sba_device(dchan); + + /* Alloc new request */ + req = sba_alloc_request(sba); + if (!req) + return NULL; + + /* + * Force fence so that no requests are submitted + * until DMA callback for this request is invoked. + */ + req->fence = true; + + /* Fillup request message */ + sba_fillup_interrupt_msg(req, req->cmds, &req->msg); + + /* Init async_tx descriptor */ + req->tx.flags = flags; + req->tx.cookie = -EBUSY; + + return (req) ? &req->tx : NULL; +} + +static void sba_fillup_memcpy_msg(struct sba_request *req, + struct brcm_sba_command *cmds, + struct brcm_message *msg, + dma_addr_t msg_offset, size_t msg_len, + dma_addr_t dst, dma_addr_t src) +{ + u64 cmd; + u32 c_mdata; + struct brcm_sba_command *cmdsp = cmds; + + /* Type-B command to load data into buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + + /* Type-A command to write buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = dst + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + + /* Fillup brcm_message */ + msg->type = BRCM_MESSAGE_SBA; + msg->sba.cmds = cmds; + msg->sba.cmds_count = cmdsp - cmds; + msg->ctx = req; + msg->error = 0; +} + +static struct sba_request * +sba_prep_dma_memcpy_req(struct sba_device *sba, + dma_addr_t 
off, dma_addr_t dst, dma_addr_t src, + size_t len, unsigned long flags) +{ + struct sba_request *req = NULL; + + /* Alloc new request */ + req = sba_alloc_request(sba); + if (!req) + return NULL; + req->fence = (flags & DMA_PREP_FENCE) ? true : false; + + /* Fillup request message */ + sba_fillup_memcpy_msg(req, req->cmds, &req->msg, + off, len, dst, src); + + /* Init async_tx descriptor */ + req->tx.flags = flags; + req->tx.cookie = -EBUSY; + + return req; +} + +static struct dma_async_tx_descriptor * +sba_prep_dma_memcpy(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src, + size_t len, unsigned long flags) +{ + size_t req_len; + dma_addr_t off = 0; + struct sba_device *sba = to_sba_device(dchan); + struct sba_request *first = NULL, *req; + + /* Create chained requests where each request is upto hw_buf_size */ + while (len) { + req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size; + + req = sba_prep_dma_memcpy_req(sba, off, dst, src, + req_len, flags); + if (!req) { + if (first) + sba_free_chained_requests(first); + return NULL; + } + + if (first) + sba_chain_request(first, req); + else + first = req; + + off += req_len; + len -= req_len; + } + + return (first) ? &first->tx : NULL; +} + +static void sba_fillup_xor_msg(struct sba_request *req, + struct brcm_sba_command *cmds, + struct brcm_message *msg, + dma_addr_t msg_offset, size_t msg_len, + dma_addr_t dst, dma_addr_t *src, u32 src_cnt) +{ + u64 cmd; + u32 c_mdata; + unsigned int i; + struct brcm_sba_command *cmdsp = cmds; + + /* Type-B command to load data into buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src[0] + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + + /* Type-B commands to xor data with buf0 and put it back in buf0 */ + for (i = 1; i < src_cnt; i++) { + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_xor_c_mdata(0, 0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_XOR, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src[i] + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Type-A command to write buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = dst + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + + /* Fillup 
brcm_message */ + msg->type = BRCM_MESSAGE_SBA; + msg->sba.cmds = cmds; + msg->sba.cmds_count = cmdsp - cmds; + msg->ctx = req; + msg->error = 0; +} + +struct sba_request * +sba_prep_dma_xor_req(struct sba_device *sba, + dma_addr_t off, dma_addr_t dst, dma_addr_t *src, + u32 src_cnt, size_t len, unsigned long flags) +{ + struct sba_request *req = NULL; + + /* Alloc new request */ + req = sba_alloc_request(sba); + if (!req) + return NULL; + req->fence = (flags & DMA_PREP_FENCE) ? true : false; + + /* Fillup request message */ + sba_fillup_xor_msg(req, req->cmds, &req->msg, + off, len, dst, src, src_cnt); + + /* Init async_tx descriptor */ + req->tx.flags = flags; + req->tx.cookie = -EBUSY; + + return req; +} + +static struct dma_async_tx_descriptor * +sba_prep_dma_xor(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t *src, + u32 src_cnt, size_t len, unsigned long flags) +{ + size_t req_len; + dma_addr_t off = 0; + struct sba_device *sba = to_sba_device(dchan); + struct sba_request *first = NULL, *req; + + /* Sanity checks */ + if (unlikely(src_cnt > sba->max_xor_srcs)) + return NULL; + + /* Create chained requests where each request is upto hw_buf_size */ + while (len) { + req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size; + + req = sba_prep_dma_xor_req(sba, off, dst, src, src_cnt, + req_len, flags); + if (!req) { + if (first) + sba_free_chained_requests(first); + return NULL; + } + + if (first) + sba_chain_request(first, req); + else + first = req; + + off += req_len; + len -= req_len; + } + + return (first) ? &first->tx : NULL; +} + +static void sba_fillup_pq_msg(struct sba_request *req, + bool pq_continue, + struct brcm_sba_command *cmds, + struct brcm_message *msg, + dma_addr_t msg_offset, size_t msg_len, + dma_addr_t *dst_p, dma_addr_t *dst_q, + const u8 *scf, dma_addr_t *src, u32 src_cnt) +{ + u64 cmd; + u32 c_mdata; + unsigned int i; + struct brcm_sba_command *cmdsp = cmds; + + if (pq_continue) { + /* Type-B command to load old P into buf0 */ + if (dst_p) { + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = *dst_p + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Type-B command to load old Q into buf1 */ + if (dst_q) { + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(1); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = *dst_q + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + } else { + /* Type-A command to zero all buffers */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_ZERO_ALL_BUFFERS, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + cmdsp++; + } + + /* Type-B commands for 
generate P onto buf0 and Q onto buf1 */ + for (i = 0; i < src_cnt; i++) { + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_pq_c_mdata(raid6_gflog[scf[i]], 1, 0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata), + SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS_XOR, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src[i] + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Type-A command to write buf0 */ + if (dst_p) { + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = *dst_p + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Type-A command to write buf1 */ + if (dst_q) { + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(1); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = *dst_q + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Fillup brcm_message */ + msg->type = BRCM_MESSAGE_SBA; + msg->sba.cmds = cmds; + msg->sba.cmds_count = cmdsp - cmds; + msg->ctx = req; + msg->error = 0; +} + +struct sba_request * +sba_prep_dma_pq_req(struct sba_device *sba, dma_addr_t off, + dma_addr_t *dst_p, dma_addr_t *dst_q, dma_addr_t *src, + u32 src_cnt, const u8 *scf, size_t len, unsigned long flags) +{ + struct sba_request *req = NULL; + + /* Alloc new request */ + req = sba_alloc_request(sba); + if (!req) + return NULL; + req->fence = (flags & DMA_PREP_FENCE) ? 
true : false; + + /* Fillup request messages */ + sba_fillup_pq_msg(req, dmaf_continue(flags), + req->cmds, &req->msg, + off, len, dst_p, dst_q, scf, src, src_cnt); + + /* Init async_tx descriptor */ + req->tx.flags = flags; + req->tx.cookie = -EBUSY; + + return req; +} + +static void sba_fillup_pq_single_msg(struct sba_request *req, + bool pq_continue, + struct brcm_sba_command *cmds, + struct brcm_message *msg, + dma_addr_t msg_offset, size_t msg_len, + dma_addr_t *dst_p, dma_addr_t *dst_q, + dma_addr_t src, u8 scf) +{ + u64 cmd; + u32 c_mdata; + u8 pos, dpos = raid6_gflog[scf]; + struct brcm_sba_command *cmdsp = cmds; + + if (!dst_p) + goto skip_p; + + if (pq_continue) { + /* Type-B command to load old P into buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = *dst_p + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + + /* + * Type-B commands to xor data with buf0 and put it + * back in buf0 + */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_xor_c_mdata(0, 0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_XOR, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } else { + /* Type-B command to load old P into buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_load_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Type-A command to write buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = *dst_p + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + +skip_p: + if (!dst_q) + goto skip_q; + + /* Type-A command to zero all buffers */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_ZERO_ALL_BUFFERS, + 
SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + cmdsp++; + + if (dpos == 255) + goto skip_q_computation; + pos = (dpos < req->sba->max_pq_coefs) ? + dpos : (req->sba->max_pq_coefs - 1); + + /* + * Type-B command to generate initial Q from data + * and store output into buf0 + */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_pq_c_mdata(pos, 0, 0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata), + SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = src + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + + dpos -= pos; + + /* Multiple Type-A command to generate final Q */ + while (dpos) { + pos = (dpos < req->sba->max_pq_coefs) ? + dpos : (req->sba->max_pq_coefs - 1); + + /* + * Type-A command to generate Q with buf0 and + * buf1 store result in buf0 + */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_pq_c_mdata(pos, 0, 1); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata), + SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + cmdsp++; + + dpos -= pos; + } + +skip_q_computation: + if (pq_continue) { + /* + * Type-B command to XOR previous output with + * buf0 and write it into buf0 + */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_B, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + c_mdata = sba_cmd_xor_c_mdata(0, 0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_XOR, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_B; + cmdsp->data = *dst_q + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + } + + /* Type-A command to write buf0 */ + cmd = sba_cmd_enc(0x0, SBA_TYPE_A, + SBA_TYPE_SHIFT, SBA_TYPE_MASK); + cmd = sba_cmd_enc(cmd, msg_len, + SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK); + cmd = sba_cmd_enc(cmd, 0x1, + SBA_RESP_SHIFT, SBA_RESP_MASK); + c_mdata = sba_cmd_write_c_mdata(0); + cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata), + SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK); + cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER, + SBA_CMD_SHIFT, SBA_CMD_MASK); + cmdsp->cmd = cmd; + *cmdsp->cmd_dma = cpu_to_le64(cmd); + cmdsp->flags = BRCM_SBA_CMD_TYPE_A; + if (req->sba->hw_resp_size) { + cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP; + cmdsp->resp = req->resp_dma; + cmdsp->resp_len = req->sba->hw_resp_size; + } + cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT; + cmdsp->data = *dst_q + msg_offset; + cmdsp->data_len = msg_len; + cmdsp++; + +skip_q: + /* Fillup brcm_message */ + msg->type = BRCM_MESSAGE_SBA; + msg->sba.cmds = cmds; + msg->sba.cmds_count = cmdsp - cmds; + msg->ctx = req; + msg->error = 0; +} + +struct sba_request * +sba_prep_dma_pq_single_req(struct sba_device *sba, dma_addr_t 
off, + dma_addr_t *dst_p, dma_addr_t *dst_q, + dma_addr_t src, u8 scf, size_t len, + unsigned long flags) +{ + struct sba_request *req = NULL; + + /* Alloc new request */ + req = sba_alloc_request(sba); + if (!req) + return NULL; + req->fence = (flags & DMA_PREP_FENCE) ? true : false; + + /* Fillup request messages */ + sba_fillup_pq_single_msg(req, dmaf_continue(flags), + req->cmds, &req->msg, off, len, + dst_p, dst_q, src, scf); + + /* Init async_tx descriptor */ + req->tx.flags = flags; + req->tx.cookie = -EBUSY; + + return req; +} + +static struct dma_async_tx_descriptor * +sba_prep_dma_pq(struct dma_chan *dchan, dma_addr_t *dst, dma_addr_t *src, + u32 src_cnt, const u8 *scf, size_t len, unsigned long flags) +{ + u32 i, dst_q_index; + size_t req_len; + bool slow = false; + dma_addr_t off = 0; + dma_addr_t *dst_p = NULL, *dst_q = NULL; + struct sba_device *sba = to_sba_device(dchan); + struct sba_request *first = NULL, *req; + + /* Sanity checks */ + if (unlikely(src_cnt > sba->max_pq_srcs)) + return NULL; + for (i = 0; i < src_cnt; i++) + if (sba->max_pq_coefs <= raid6_gflog[scf[i]]) + slow = true; + + /* Figure-out P and Q destination addresses */ + if (!(flags & DMA_PREP_PQ_DISABLE_P)) + dst_p = &dst[0]; + if (!(flags & DMA_PREP_PQ_DISABLE_Q)) + dst_q = &dst[1]; + + /* Create chained requests where each request is upto hw_buf_size */ + while (len) { + req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size; + + if (slow) { + dst_q_index = src_cnt; + + if (dst_q) { + for (i = 0; i < src_cnt; i++) { + if (*dst_q == src[i]) { + dst_q_index = i; + break; + } + } + } + + if (dst_q_index < src_cnt) { + i = dst_q_index; + req = sba_prep_dma_pq_single_req(sba, + off, dst_p, dst_q, src[i], scf[i], + req_len, flags | DMA_PREP_FENCE); + if (!req) + goto fail; + + if (first) + sba_chain_request(first, req); + else + first = req; + + flags |= DMA_PREP_CONTINUE; + } + + for (i = 0; i < src_cnt; i++) { + if (dst_q_index == i) + continue; + + req = sba_prep_dma_pq_single_req(sba, + off, dst_p, dst_q, src[i], scf[i], + req_len, flags | DMA_PREP_FENCE); + if (!req) + goto fail; + + if (first) + sba_chain_request(first, req); + else + first = req; + + flags |= DMA_PREP_CONTINUE; + } + } else { + req = sba_prep_dma_pq_req(sba, off, + dst_p, dst_q, src, src_cnt, + scf, req_len, flags); + if (!req) + goto fail; + + if (first) + sba_chain_request(first, req); + else + first = req; + } + + off += req_len; + len -= req_len; + } + + return (first) ? 
&first->tx : NULL; + +fail: + if (first) + sba_free_chained_requests(first); + return NULL; +} + +/* ====== Mailbox callbacks ===== */ + +static void sba_dma_tx_actions(struct sba_request *req) +{ + struct dma_async_tx_descriptor *tx = &req->tx; + + WARN_ON(tx->cookie < 0); + + if (tx->cookie > 0) { + dma_cookie_complete(tx); + + /* + * Call the callback (must not sleep or submit new + * operations to this channel) + */ + if (tx->callback) + tx->callback(tx->callback_param); + + dma_descriptor_unmap(tx); + } + + /* Run dependent operations */ + dma_run_dependencies(tx); + + /* If waiting for 'ack' then move to completed list */ + if (!async_tx_test_ack(&req->tx)) + sba_complete_chained_requests(req); + else + sba_free_chained_requests(req); +} + +static void sba_receive_message(struct mbox_client *cl, void *msg) +{ + unsigned long flags; + struct brcm_message *m = msg; + struct sba_request *req = m->ctx, *req1; + struct sba_device *sba = req->sba; + + /* Error count if message has error */ + if (m->error < 0) + dev_err(sba->dev, "%s got message with error %d", + dma_chan_name(&sba->dma_chan), m->error); + + /* Mark request as received */ + sba_received_request(req); + + /* Wait for all chained requests to be completed */ + if (atomic_dec_return(&req->first->next_pending_count)) + goto done; + + /* Point to first request */ + req = req->first; + + /* Update request */ + if (req->state == SBA_REQUEST_STATE_RECEIVED) + sba_dma_tx_actions(req); + else + sba_free_chained_requests(req); + + spin_lock_irqsave(&sba->reqs_lock, flags); + + /* Re-check all completed request waiting for 'ack' */ + list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node) { + spin_unlock_irqrestore(&sba->reqs_lock, flags); + sba_dma_tx_actions(req); + spin_lock_irqsave(&sba->reqs_lock, flags); + } + + spin_unlock_irqrestore(&sba->reqs_lock, flags); + +done: + /* Try to submit pending request */ + sba_issue_pending(&sba->dma_chan); +} + +/* ====== Platform driver routines ===== */ + +static int sba_prealloc_channel_resources(struct sba_device *sba) +{ + int i, j, p, ret = 0; + struct sba_request *req = NULL; + + sba->resp_base = dma_alloc_coherent(sba->dma_dev.dev, + sba->max_resp_pool_size, + &sba->resp_dma_base, GFP_KERNEL); + if (!sba->resp_base) + return -ENOMEM; + + sba->cmds_base = dma_alloc_coherent(sba->dma_dev.dev, + sba->max_cmds_pool_size, + &sba->cmds_dma_base, GFP_KERNEL); + if (!sba->cmds_base) { + ret = -ENOMEM; + goto fail_free_resp_pool; + } + + spin_lock_init(&sba->reqs_lock); + sba->reqs_fence = false; + INIT_LIST_HEAD(&sba->reqs_alloc_list); + INIT_LIST_HEAD(&sba->reqs_pending_list); + INIT_LIST_HEAD(&sba->reqs_active_list); + INIT_LIST_HEAD(&sba->reqs_received_list); + INIT_LIST_HEAD(&sba->reqs_completed_list); + INIT_LIST_HEAD(&sba->reqs_aborted_list); + INIT_LIST_HEAD(&sba->reqs_free_list); + + sba->reqs = devm_kcalloc(sba->dev, sba->max_req, + sizeof(*req), GFP_KERNEL); + if (!sba->reqs) { + ret = -ENOMEM; + goto fail_free_cmds_pool; + } + + for (i = 0, p = 0; i < sba->max_req; i++) { + req = &sba->reqs[i]; + INIT_LIST_HEAD(&req->node); + req->sba = sba; + req->state = SBA_REQUEST_STATE_FREE; + INIT_LIST_HEAD(&req->next); + req->next_count = 1; + atomic_set(&req->next_pending_count, 0); + req->fence = false; + req->resp = sba->resp_base + p; + req->resp_dma = sba->resp_dma_base + p; + p += sba->hw_resp_size; + req->cmds = devm_kcalloc(sba->dev, sba->max_cmd_per_req, + sizeof(*req->cmds), GFP_KERNEL); + if (!req->cmds) { + ret = -ENOMEM; + goto fail_free_cmds_pool; + } + for (j = 0; j 
< sba->max_cmd_per_req; j++) { + req->cmds[j].cmd = 0; + req->cmds[j].cmd_dma = sba->cmds_base + + (i * sba->max_cmd_per_req + j) * sizeof(u64); + req->cmds[j].cmd_dma_addr = sba->cmds_dma_base + + (i * sba->max_cmd_per_req + j) * sizeof(u64); + req->cmds[j].flags = 0; + } + memset(&req->msg, 0, sizeof(req->msg)); + dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan); + req->tx.tx_submit = sba_tx_submit; + req->tx.phys = req->resp_dma; + list_add_tail(&req->node, &sba->reqs_free_list); + } + + sba->reqs_free_count = sba->max_req; + + return 0; + +fail_free_cmds_pool: + dma_free_coherent(sba->dma_dev.dev, + sba->max_cmds_pool_size, + sba->cmds_base, sba->cmds_dma_base); +fail_free_resp_pool: + dma_free_coherent(sba->dma_dev.dev, + sba->max_resp_pool_size, + sba->resp_base, sba->resp_dma_base); + return ret; +} + +static void sba_freeup_channel_resources(struct sba_device *sba) +{ + dmaengine_terminate_all(&sba->dma_chan); + dma_free_coherent(sba->dma_dev.dev, sba->max_cmds_pool_size, + sba->cmds_base, sba->cmds_dma_base); + dma_free_coherent(sba->dma_dev.dev, sba->max_resp_pool_size, + sba->resp_base, sba->resp_dma_base); + sba->resp_base = NULL; + sba->resp_dma_base = 0; +} + +static int sba_async_register(struct sba_device *sba) +{ + int ret; + struct dma_device *dma_dev = &sba->dma_dev; + + /* Initialize DMA channel cookie */ + sba->dma_chan.device = dma_dev; + dma_cookie_init(&sba->dma_chan); + + /* Initialize DMA device capability mask */ + dma_cap_zero(dma_dev->cap_mask); + dma_cap_set(DMA_INTERRUPT, dma_dev->cap_mask); + dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask); + dma_cap_set(DMA_XOR, dma_dev->cap_mask); + dma_cap_set(DMA_PQ, dma_dev->cap_mask); + + /* + * Set mailbox channel device as the base device of + * our dma_device because the actual memory accesses + * will be done by mailbox controller + */ + dma_dev->dev = sba->mbox_dev; + + /* Set base prep routines */ + dma_dev->device_free_chan_resources = sba_free_chan_resources; + dma_dev->device_terminate_all = sba_device_terminate_all; + dma_dev->device_issue_pending = sba_issue_pending; + dma_dev->device_tx_status = sba_tx_status; + + /* Set interrupt routine */ + if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) + dma_dev->device_prep_dma_interrupt = sba_prep_dma_interrupt; + + /* Set memcpy routine */ + if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) + dma_dev->device_prep_dma_memcpy = sba_prep_dma_memcpy; + + /* Set xor routine and capability */ + if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { + dma_dev->device_prep_dma_xor = sba_prep_dma_xor; + dma_dev->max_xor = sba->max_xor_srcs; + } + + /* Set pq routine and capability */ + if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { + dma_dev->device_prep_dma_pq = sba_prep_dma_pq; + dma_set_maxpq(dma_dev, sba->max_pq_srcs, 0); + } + + /* Initialize DMA device channel list */ + INIT_LIST_HEAD(&dma_dev->channels); + list_add_tail(&sba->dma_chan.device_node, &dma_dev->channels); + + /* Register with Linux async DMA framework*/ + ret = dma_async_device_register(dma_dev); + if (ret) { + dev_err(sba->dev, "async device register error %d", ret); + return ret; + } + + dev_info(sba->dev, "%s capabilities: %s%s%s%s\n", + dma_chan_name(&sba->dma_chan), + dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "interrupt " : "", + dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "memcpy " : "", + dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", + dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? 
"pq " : ""); + + return 0; +} + +static int sba_probe(struct platform_device *pdev) +{ + int i, ret = 0, mchans_count; + struct sba_device *sba; + struct platform_device *mbox_pdev; + struct of_phandle_args args; + + /* Allocate main SBA struct */ + sba = devm_kzalloc(&pdev->dev, sizeof(*sba), GFP_KERNEL); + if (!sba) + return -ENOMEM; + + sba->dev = &pdev->dev; + platform_set_drvdata(pdev, sba); + + /* Determine SBA version from DT compatible string */ + if (of_device_is_compatible(sba->dev->of_node, "brcm,iproc-sba")) + sba->ver = SBA_VER_1; + else if (of_device_is_compatible(sba->dev->of_node, + "brcm,iproc-sba-v2")) + sba->ver = SBA_VER_2; + else + return -ENODEV; + + /* Derived Configuration parameters */ + switch (sba->ver) { + case SBA_VER_1: + sba->max_req = 1024; + sba->hw_buf_size = 4096; + sba->hw_resp_size = 8; + sba->max_pq_coefs = 6; + sba->max_pq_srcs = 6; + break; + case SBA_VER_2: + sba->max_req = 1024; + sba->hw_buf_size = 4096; + sba->hw_resp_size = 8; + sba->max_pq_coefs = 30; + /* + * We can support max_pq_srcs == max_pq_coefs because + * we are limited by number of SBA commands that we can + * fit in one message for underlying ring manager HW. + */ + sba->max_pq_srcs = 12; + break; + default: + return -EINVAL; + } + sba->max_cmd_per_req = sba->max_pq_srcs + 3; + sba->max_xor_srcs = sba->max_cmd_per_req - 1; + sba->max_resp_pool_size = sba->max_req * sba->hw_resp_size; + sba->max_cmds_pool_size = sba->max_req * + sba->max_cmd_per_req * sizeof(u64); + + /* Setup mailbox client */ + sba->client.dev = &pdev->dev; + sba->client.rx_callback = sba_receive_message; + sba->client.tx_block = false; + sba->client.knows_txdone = false; + sba->client.tx_tout = 0; + + /* Number of channels equals number of mailbox channels */ + ret = of_count_phandle_with_args(pdev->dev.of_node, + "mboxes", "#mbox-cells"); + if (ret <= 0) + return -ENODEV; + mchans_count = ret; + sba->mchans_count = 0; + atomic_set(&sba->mchans_current, 0); + + /* Allocate mailbox channel array */ + sba->mchans = devm_kcalloc(&pdev->dev, sba->mchans_count, + sizeof(*sba->mchans), GFP_KERNEL); + if (!sba->mchans) + return -ENOMEM; + + /* Request mailbox channels */ + for (i = 0; i < mchans_count; i++) { + sba->mchans[i] = mbox_request_channel(&sba->client, i); + if (IS_ERR(sba->mchans[i])) { + ret = PTR_ERR(sba->mchans[i]); + goto fail_free_mchans; + } + sba->mchans_count++; + } + + /* Find-out underlying mailbox device */ + ret = of_parse_phandle_with_args(pdev->dev.of_node, + "mboxes", "#mbox-cells", 0, &args); + if (ret) + goto fail_free_mchans; + mbox_pdev = of_find_device_by_node(args.np); + of_node_put(args.np); + if (!mbox_pdev) { + ret = -ENODEV; + goto fail_free_mchans; + } + sba->mbox_dev = &mbox_pdev->dev; + + /* All mailbox channels should be of same ring manager device */ + for (i = 1; i < mchans_count; i++) { + ret = of_parse_phandle_with_args(pdev->dev.of_node, + "mboxes", "#mbox-cells", i, &args); + if (ret) + goto fail_free_mchans; + mbox_pdev = of_find_device_by_node(args.np); + of_node_put(args.np); + if (sba->mbox_dev != &mbox_pdev->dev) { + ret = -EINVAL; + goto fail_free_mchans; + } + } + + /* Register DMA device with linux async framework */ + ret = sba_async_register(sba); + if (ret) + goto fail_free_mchans; + + /* Prealloc channel resource */ + ret = sba_prealloc_channel_resources(sba); + if (ret) + goto fail_async_dev_unreg; + + /* Print device info */ + dev_info(sba->dev, "%s using SBAv%d and %d mailbox channels", + dma_chan_name(&sba->dma_chan), sba->ver+1, + sba->mchans_count); + + 
return 0; + +fail_async_dev_unreg: + dma_async_device_unregister(&sba->dma_dev); +fail_free_mchans: + for (i = 0; i < sba->mchans_count; i++) + mbox_free_channel(sba->mchans[i]); + return ret; +} + +static int sba_remove(struct platform_device *pdev) +{ + int i; + struct sba_device *sba = platform_get_drvdata(pdev); + + sba_freeup_channel_resources(sba); + + dma_async_device_unregister(&sba->dma_dev); + + for (i = 0; i < sba->mchans_count; i++) + mbox_free_channel(sba->mchans[i]); + + return 0; +} + +static const struct of_device_id sba_of_match[] = { + { .compatible = "brcm,iproc-sba", }, + { .compatible = "brcm,iproc-sba-v2", }, + {}, +}; +MODULE_DEVICE_TABLE(of, sba_of_match); + +static struct platform_driver sba_driver = { + .probe = sba_probe, + .remove = sba_remove, + .driver = { + .name = "bcm-sba-raid", + .of_match_table = sba_of_match, + }, +}; +module_platform_driver(sba_driver); + +MODULE_DESCRIPTION("Broadcom SBA RAID driver"); +MODULE_AUTHOR("Anup Patel "); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 94c24b26609448d3626e0f5e3c0da95385c0ef1f Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 15 May 2017 10:34:55 +0530 Subject: dt-bindings: Add DT bindings document for Broadcom SBA RAID driver This patch adds the DT bindings document for newly added Broadcom SBA RAID driver. Acked-by: Rob Herring Reviewed-by: Ray Jui Reviewed-by: Scott Branden Signed-off-by: Anup Patel Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/brcm,iproc-sba.txt | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt diff --git a/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt new file mode 100644 index 000000000000..092913a28457 --- /dev/null +++ b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt @@ -0,0 +1,29 @@ +* Broadcom SBA RAID engine + +Required properties: +- compatible: Should be one of the following + "brcm,iproc-sba" + "brcm,iproc-sba-v2" + The "brcm,iproc-sba" has support for only 6 PQ coefficients + The "brcm,iproc-sba-v2" has support for only 30 PQ coefficients +- mboxes: List of phandle and mailbox channel specifiers + +Example: + +raid_mbox: mbox@67400000 { + ... + #mbox-cells = <3>; + ... 
+}; + +raid0 { + compatible = "brcm,iproc-sba-v2"; + mboxes = <&raid_mbox 0 0x1 0xffff>, + <&raid_mbox 1 0x1 0xffff>, + <&raid_mbox 2 0x1 0xffff>, + <&raid_mbox 3 0x1 0xffff>, + <&raid_mbox 4 0x1 0xffff>, + <&raid_mbox 5 0x1 0xffff>, + <&raid_mbox 6 0x1 0xffff>, + <&raid_mbox 7 0x1 0xffff>; +}; -- cgit v1.2.3 From 58d96125103c77686271f515d2ffad0d6d0dedc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 May 2017 15:25:25 +0200 Subject: dmaengine: bcm-sba-raid: fix Kconfig dependencies The new driver requires both mailbox and raid support for compile testing: drivers/dma/built-in.o: In function `sba_remove': edma.c:(.text+0x4414): undefined reference to `mbox_free_channel' drivers/dma/built-in.o: In function `sba_issue_pending': edma.c:(.text+0x46cc): undefined reference to `mbox_send_message' drivers/dma/built-in.o: In function `sba_probe': edma.c:(.text+0x4e60): undefined reference to `mbox_request_channel' edma.c:(.text+0x5038): undefined reference to `mbox_free_channel' drivers/dma/built-in.o: In function `sba_tx_status': edma.c:(.text+0x5210): undefined reference to `mbox_client_peek_data' drivers/dma/built-in.o: In function `sba_prep_dma_pq_req': edma.c:(.text+0x5784): undefined reference to `raid6_gflog' edma.c:(.text+0x5798): undefined reference to `raid6_gflog' This rearranges the Kconfig dependencies accordingly. Fixes: 743e1c8ffe4e ("dmaengine: Add Broadcom SBA RAID driver") Signed-off-by: Arnd Bergmann Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index b7e0ab9585bc..686ae67d7e2a 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -101,7 +101,8 @@ config AXI_DMAC config BCM_SBA_RAID tristate "Broadcom SBA RAID engine support" - depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST + depends on ARM64 || COMPILE_TEST + depends on MAILBOX && RAID6_PQ select DMA_ENGINE select DMA_ENGINE_RAID select ASYNC_TX_DISABLE_XOR_VAL_DMA -- cgit v1.2.3 From 1fc63cb4f1de790019d57ccaf3e9cd2e6abbc648 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 May 2017 22:58:50 +0100 Subject: dmaengine: bcm-scm-raid: remove redundant null check on req Req is never null on at the point of the null check, so remove this redundant check and just return &req->tx. Detected by CoverityScan, CID#1436147 ("Logically dead code") Signed-off-by: Colin Ian King Signed-off-by: Vinod Koul --- drivers/dma/bcm-sba-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c index d6b927b960ab..e41bbc7cb094 100644 --- a/drivers/dma/bcm-sba-raid.c +++ b/drivers/dma/bcm-sba-raid.c @@ -582,7 +582,7 @@ sba_prep_dma_interrupt(struct dma_chan *dchan, unsigned long flags) req->tx.flags = flags; req->tx.cookie = -EBUSY; - return (req) ? &req->tx : NULL; + return &req->tx; } static void sba_fillup_memcpy_msg(struct sba_request *req, -- cgit v1.2.3 From 427d5ecd270bad5b9ba15a42f0414accdfa4e2f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Tue, 16 May 2017 01:09:15 +0200 Subject: dmaengine: rcar-dmac: store channel IRQ in struct rcar_dmac_chan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IRQ number is needed after probe to be able to add synchronisation points in other places in the driver when freeing resources and to implement a device_synchronize() callback. Store the IRQ number in the struct rcar_dmac_chan so that it can be used later. 
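As a minimal sketch of the idea (not part of the patch below; the driver's real synchronization point is added in the following commit, and the helper name here is hypothetical), keeping the IRQ number in the per-channel state lets any later teardown path wait for a handler that is still running:

static void rcar_dmac_chan_quiesce(struct rcar_dmac_chan *rchan)
{
	/* Returns only after any in-flight channel ISR has finished */
	synchronize_irq(rchan->irq);
}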
Signed-off-by: Niklas Söderlund Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index db41795fe42a..c68c3336bdad 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -144,6 +144,7 @@ struct rcar_dmac_chan_map { * @chan: base DMA channel object * @iomem: channel I/O memory base * @index: index of this channel in the controller + * @irq: channel IRQ * @src: slave memory address and size on the source side * @dst: slave memory address and size on the destination side * @mid_rid: hardware MID/RID for the DMA client using this channel @@ -161,6 +162,7 @@ struct rcar_dmac_chan { struct dma_chan chan; void __iomem *iomem; unsigned int index; + int irq; struct rcar_dmac_chan_slave src; struct rcar_dmac_chan_slave dst; @@ -1647,7 +1649,6 @@ static int rcar_dmac_chan_probe(struct rcar_dmac *dmac, struct dma_chan *chan = &rchan->chan; char pdev_irqname[5]; char *irqname; - int irq; int ret; rchan->index = index; @@ -1664,8 +1665,8 @@ static int rcar_dmac_chan_probe(struct rcar_dmac *dmac, /* Request the channel interrupt. */ sprintf(pdev_irqname, "ch%u", index); - irq = platform_get_irq_byname(pdev, pdev_irqname); - if (irq < 0) { + rchan->irq = platform_get_irq_byname(pdev, pdev_irqname); + if (rchan->irq < 0) { dev_err(dmac->dev, "no IRQ specified for channel %u\n", index); return -ENODEV; } @@ -1675,11 +1676,13 @@ static int rcar_dmac_chan_probe(struct rcar_dmac *dmac, if (!irqname) return -ENOMEM; - ret = devm_request_threaded_irq(dmac->dev, irq, rcar_dmac_isr_channel, + ret = devm_request_threaded_irq(dmac->dev, rchan->irq, + rcar_dmac_isr_channel, rcar_dmac_isr_channel_thread, 0, irqname, rchan); if (ret) { - dev_err(dmac->dev, "failed to request IRQ %u (%d)\n", irq, ret); + dev_err(dmac->dev, "failed to request IRQ %u (%d)\n", + rchan->irq, ret); return ret; } -- cgit v1.2.3 From 30c45005a46bf55ba52086da5e090ec5a2f1c949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Tue, 16 May 2017 01:09:16 +0200 Subject: dmaengine: rcar-dmac: implement device_synchronize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the device_synchronize() callback which wait until a dma channel is stopped to provide a synchronization point. This protects the driver from multiple race conditions when terminating and freeing resources. E.g. the completion callback still running after device_terminate_all() has completed. 
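For context, this is the consumer-side pattern the new callback makes safe; a sketch only, with the buffer name illustrative rather than taken from the patch:

	/* Stop the channel; this may return before the last callback has run */
	dmaengine_terminate_async(chan);

	/* Wait until no completion callback can still be executing */
	dmaengine_synchronize(chan);

	/* Only now is it safe to free memory the callback may reference */
	kfree(buf);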
Signed-off-by: Niklas Söderlund Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index c68c3336bdad..fb07cd5fe77b 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1365,6 +1365,13 @@ done: spin_unlock_irqrestore(&rchan->lock, flags); } +static void rcar_dmac_device_synchronize(struct dma_chan *chan) +{ + struct rcar_dmac_chan *rchan = to_rcar_dmac_chan(chan); + + synchronize_irq(rchan->irq); +} + /* ----------------------------------------------------------------------------- * IRQ handling */ @@ -1846,6 +1853,7 @@ static int rcar_dmac_probe(struct platform_device *pdev) engine->device_terminate_all = rcar_dmac_chan_terminate_all; engine->device_tx_status = rcar_dmac_tx_status; engine->device_issue_pending = rcar_dmac_issue_pending; + engine->device_synchronize = rcar_dmac_device_synchronize; ret = dma_async_device_register(engine); if (ret < 0) -- cgit v1.2.3 From a1ed64efc5f68fc830fea08e96363d4459bb07a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Tue, 16 May 2017 01:09:17 +0200 Subject: dmaengine: rcar-dmac: wait for ISR to finish before freeing resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a race condition where the channel resources could be freed before the ISR had finished running resulting in a NULL pointer reference from the ISR. [ 167.148934] Unable to handle kernel NULL pointer dereference at virtual address 00000000 [ 167.157051] pgd = ffff80003c641000 [ 167.160449] [00000000] *pgd=000000007c507003, *pud=000000007c4ff003, *pmd=0000000000000000 [ 167.168719] Internal error: Oops: 96000046 [#1] PREEMPT SMP [ 167.174289] Modules linked in: [ 167.177348] CPU: 3 PID: 10547 Comm: dma_ioctl Not tainted 4.11.0-rc1-00001-g8d92afddc2f6633a #73 [ 167.186131] Hardware name: Renesas Salvator-X board based on r8a7795 (DT) [ 167.192917] task: ffff80003a411a00 task.stack: ffff80003bcd4000 [ 167.198850] PC is at rcar_dmac_chan_prep_sg+0xe0/0x400 [ 167.203985] LR is at rcar_dmac_chan_prep_sg+0x48/0x400 Based of previous work by: Hiroyuki Yokoyama . Signed-off-by: Niklas Söderlund Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index fb07cd5fe77b..d2cb4a0916e6 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1010,7 +1010,11 @@ static void rcar_dmac_free_chan_resources(struct dma_chan *chan) rcar_dmac_chan_halt(rchan); spin_unlock_irq(&rchan->lock); - /* Now no new interrupts will occur */ + /* + * Now no new interrupts will occur, but one might already be + * running. Wait for it to finish before freeing resources. + */ + synchronize_irq(rchan->irq); if (rchan->mid_rid >= 0) { /* The caller is holding dma_list_mutex */ -- cgit v1.2.3 From 5466c34f8425ccd24124ebdae6e3b6552195956d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 19 May 2017 09:27:47 +0200 Subject: dt-bindings: rcar-dmac: Document missing error interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation for the "interrupt-names" property only mentions the per-channel interrupts, while the error interrupt has always been mandatory, too. 
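On the driver side this interrupt is already looked up by name; roughly, simplified from the rcar-dmac probe path:

	int irq = platform_get_irq_byname(pdev, "error");

	if (irq < 0) {
		dev_err(&pdev->dev, "no error IRQ specified\n");
		return -ENODEV;
	}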
Fixes: 10f5c8438475909a ("dmaengine: rcar-dmac: Add device tree bindings documentation") Signed-off-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt index 3316a9c2e638..79a204d50234 100644 --- a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt +++ b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt @@ -30,8 +30,9 @@ Required Properties: - interrupts: interrupt specifiers for the DMAC, one for each entry in interrupt-names. -- interrupt-names: one entry per channel, named "ch%u", where %u is the - channel number ranging from zero to the number of channels minus one. +- interrupt-names: one entry for the error interrupt, named "error", plus one + entry per channel, named "ch%u", where %u is the channel number ranging from + zero to the number of channels minus one. - clock-names: "fck" for the functional clock - clocks: a list of phandle + clock-specifier pairs, one for each entry -- cgit v1.2.3 From 4166a56aa8d5babe979d8e0834a741c9f015ad14 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 20 May 2017 23:42:50 +0200 Subject: ARM/dmaengine: pl08x: pass reasonable memcpy settings We cannot use bits from configuration registers as API between platforms and driver like this, abstract it out to two enums and mimic the stuff passed as device tree data. This is done to make it possible for the driver to generate the ccfg word on-the-fly so we can support more PL08x derivatives. Acked-by: Olof Johansson Acked-by: Arnd Bergmann Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- arch/arm/mach-lpc32xx/phy3250.c | 3 + arch/arm/mach-s3c64xx/pl080.c | 28 +++------ arch/arm/mach-spear/spear3xx.c | 14 ++--- arch/arm/mach-spear/spear6xx.c | 14 ++--- drivers/dma/amba-pl08x.c | 131 ++++++++++++++++++++++++++++------------ include/linux/amba/pl08x.h | 30 +++++++-- 6 files changed, 138 insertions(+), 82 deletions(-) diff --git a/arch/arm/mach-lpc32xx/phy3250.c b/arch/arm/mach-lpc32xx/phy3250.c index 6c52bd32610e..e48cc06c2aec 100644 --- a/arch/arm/mach-lpc32xx/phy3250.c +++ b/arch/arm/mach-lpc32xx/phy3250.c @@ -137,6 +137,9 @@ static void pl08x_put_signal(const struct pl08x_channel_data *cd, int ch) } static struct pl08x_platform_data pl08x_pd = { + /* Some reasonable memcpy defaults */ + .memcpy_burst_size = PL08X_BURST_SZ_256, + .memcpy_bus_width = PL08X_BUS_WIDTH_32_BITS, .slave_channels = &pl08x_slave_channels[0], .num_slave_channels = ARRAY_SIZE(pl08x_slave_channels), .get_xfer_signal = pl08x_get_signal, diff --git a/arch/arm/mach-s3c64xx/pl080.c b/arch/arm/mach-s3c64xx/pl080.c index 261820a855ec..66fc774b70ec 100644 --- a/arch/arm/mach-s3c64xx/pl080.c +++ b/arch/arm/mach-s3c64xx/pl080.c @@ -137,16 +137,10 @@ static const struct dma_slave_map s3c64xx_dma0_slave_map[] = { }; struct pl08x_platform_data s3c64xx_dma0_plat_data = { - .memcpy_channel = { - .bus_id = "memcpy", - .cctl_memcpy = - (PL080_BSIZE_4 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_4 << PL080_CONTROL_DB_SIZE_SHIFT | - PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT | - PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT | - PL080_CONTROL_PROT_BUFF | PL080_CONTROL_PROT_CACHE | - PL080_CONTROL_PROT_SYS), - }, + .memcpy_burst_size = PL08X_BURST_SZ_4, + .memcpy_bus_width = PL08X_BUS_WIDTH_32_BITS, + .memcpy_prot_buff = true, + 
.memcpy_prot_cache = true, .lli_buses = PL08X_AHB1, .mem_buses = PL08X_AHB1, .get_xfer_signal = pl08x_get_xfer_signal, @@ -238,16 +232,10 @@ static const struct dma_slave_map s3c64xx_dma1_slave_map[] = { }; struct pl08x_platform_data s3c64xx_dma1_plat_data = { - .memcpy_channel = { - .bus_id = "memcpy", - .cctl_memcpy = - (PL080_BSIZE_4 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_4 << PL080_CONTROL_DB_SIZE_SHIFT | - PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT | - PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT | - PL080_CONTROL_PROT_BUFF | PL080_CONTROL_PROT_CACHE | - PL080_CONTROL_PROT_SYS), - }, + .memcpy_burst_size = PL08X_BURST_SZ_4, + .memcpy_bus_width = PL08X_BUS_WIDTH_32_BITS, + .memcpy_prot_buff = true, + .memcpy_prot_cache = true, .lli_buses = PL08X_AHB1, .mem_buses = PL08X_AHB1, .get_xfer_signal = pl08x_get_xfer_signal, diff --git a/arch/arm/mach-spear/spear3xx.c b/arch/arm/mach-spear/spear3xx.c index 23394ac76cf2..8537fcffe5a8 100644 --- a/arch/arm/mach-spear/spear3xx.c +++ b/arch/arm/mach-spear/spear3xx.c @@ -44,16 +44,10 @@ struct pl022_ssp_controller pl022_plat_data = { /* dmac device registration */ struct pl08x_platform_data pl080_plat_data = { - .memcpy_channel = { - .bus_id = "memcpy", - .cctl_memcpy = - (PL080_BSIZE_16 << PL080_CONTROL_SB_SIZE_SHIFT | \ - PL080_BSIZE_16 << PL080_CONTROL_DB_SIZE_SHIFT | \ - PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT | \ - PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT | \ - PL080_CONTROL_PROT_BUFF | PL080_CONTROL_PROT_CACHE | \ - PL080_CONTROL_PROT_SYS), - }, + .memcpy_burst_size = PL08X_BURST_SZ_16, + .memcpy_bus_width = PL08X_BUS_WIDTH_32_BITS, + .memcpy_prot_buff = true, + .memcpy_prot_cache = true, .lli_buses = PL08X_AHB1, .mem_buses = PL08X_AHB1, .get_xfer_signal = pl080_get_signal, diff --git a/arch/arm/mach-spear/spear6xx.c b/arch/arm/mach-spear/spear6xx.c index ccf3573b831c..c5fc110134ba 100644 --- a/arch/arm/mach-spear/spear6xx.c +++ b/arch/arm/mach-spear/spear6xx.c @@ -322,16 +322,10 @@ static struct pl08x_channel_data spear600_dma_info[] = { }; static struct pl08x_platform_data spear6xx_pl080_plat_data = { - .memcpy_channel = { - .bus_id = "memcpy", - .cctl_memcpy = - (PL080_BSIZE_16 << PL080_CONTROL_SB_SIZE_SHIFT | \ - PL080_BSIZE_16 << PL080_CONTROL_DB_SIZE_SHIFT | \ - PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT | \ - PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT | \ - PL080_CONTROL_PROT_BUFF | PL080_CONTROL_PROT_CACHE | \ - PL080_CONTROL_PROT_SYS), - }, + .memcpy_burst_size = PL08X_BURST_SZ_16, + .memcpy_bus_width = PL08X_BUS_WIDTH_32_BITS, + .memcpy_prot_buff = true, + .memcpy_prot_cache = true, .lli_buses = PL08X_AHB1, .mem_buses = PL08X_AHB1, .get_xfer_signal = pl080_get_signal, diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 6bb8813ca275..d3d660a36ad7 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -1433,6 +1433,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( struct pl08x_driver_data *pl08x = plchan->host; struct pl08x_txd *txd; struct pl08x_sg *dsg; + u32 cctl = 0; int ret; txd = pl08x_get_txd(plchan); @@ -1455,15 +1456,83 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( /* Set platform data for m2m */ txd->ccfg |= PL080_FLOW_MEM2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; - txd->cctl = pl08x->pd->memcpy_channel.cctl_memcpy & - ~(PL080_CONTROL_DST_AHB2 | PL080_CONTROL_SRC_AHB2); + + /* Conjure cctl */ + switch (pl08x->pd->memcpy_burst_size) { + default: + dev_err(&pl08x->adev->dev, + "illegal burst size for memcpy, set to 
1\n"); + /* Fall through */ + case PL08X_BURST_SZ_1: + cctl |= PL080_BSIZE_1 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_1 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_4: + cctl |= PL080_BSIZE_4 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_4 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_8: + cctl |= PL080_BSIZE_8 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_8 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_16: + cctl |= PL080_BSIZE_16 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_16 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_32: + cctl |= PL080_BSIZE_32 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_32 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_64: + cctl |= PL080_BSIZE_64 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_64 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_128: + cctl |= PL080_BSIZE_128 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_128 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_256: + cctl |= PL080_BSIZE_256 << PL080_CONTROL_SB_SIZE_SHIFT | + PL080_BSIZE_256 << PL080_CONTROL_DB_SIZE_SHIFT; + break; + } + + switch (pl08x->pd->memcpy_bus_width) { + default: + dev_err(&pl08x->adev->dev, + "illegal bus width for memcpy, set to 8 bits\n"); + /* Fall through */ + case PL08X_BUS_WIDTH_8_BITS: + cctl |= PL080_WIDTH_8BIT << PL080_CONTROL_SWIDTH_SHIFT | + PL080_WIDTH_8BIT << PL080_CONTROL_DWIDTH_SHIFT; + break; + case PL08X_BUS_WIDTH_16_BITS: + cctl |= PL080_WIDTH_16BIT << PL080_CONTROL_SWIDTH_SHIFT | + PL080_WIDTH_16BIT << PL080_CONTROL_DWIDTH_SHIFT; + break; + case PL08X_BUS_WIDTH_32_BITS: + cctl |= PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT | + PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT; + break; + } + + /* Protection flags */ + if (pl08x->pd->memcpy_prot_buff) + cctl |= PL080_CONTROL_PROT_BUFF; + if (pl08x->pd->memcpy_prot_cache) + cctl |= PL080_CONTROL_PROT_CACHE; + + /* We are the kernel, so we are in privileged mode */ + cctl |= PL080_CONTROL_PROT_SYS; /* Both to be incremented or the code will break */ - txd->cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; + cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; if (pl08x->vd->dualmaster) - txd->cctl |= pl08x_select_bus(pl08x->mem_buses, - pl08x->mem_buses); + cctl |= pl08x_select_bus(pl08x->mem_buses, + pl08x->mem_buses); + + txd->cctl = cctl; ret = pl08x_fill_llis_for_desc(plchan->host, txd); if (!ret) { @@ -1925,9 +1994,16 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x, chan->signal = i; pl08x_dma_slave_init(chan); } else { - chan->cd = &pl08x->pd->memcpy_channel; + chan->cd = kzalloc(sizeof(*chan->cd), GFP_KERNEL); + if (!chan->cd) { + kfree(chan); + return -ENOMEM; + } + chan->cd->bus_id = "memcpy"; + chan->cd->periph_buses = pl08x->pd->mem_buses; chan->name = kasprintf(GFP_KERNEL, "memcpy%d", i); if (!chan->name) { + kfree(chan->cd); kfree(chan); return -ENOMEM; } @@ -2099,7 +2175,6 @@ static int pl08x_of_probe(struct amba_device *adev, { struct pl08x_platform_data *pd; struct pl08x_channel_data *chanp = NULL; - u32 cctl_memcpy = 0; u32 val; int ret; int i; @@ -2139,36 +2214,28 @@ static int pl08x_of_probe(struct amba_device *adev, dev_err(&adev->dev, "illegal burst size for memcpy, set to 1\n"); /* Fall through */ case 1: - cctl_memcpy |= PL080_BSIZE_1 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_1 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_1; break; case 4: - cctl_memcpy |= PL080_BSIZE_4 << 
PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_4 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_4; break; case 8: - cctl_memcpy |= PL080_BSIZE_8 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_8 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_8; break; case 16: - cctl_memcpy |= PL080_BSIZE_16 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_16 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_16; break; case 32: - cctl_memcpy |= PL080_BSIZE_32 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_32 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_32; break; case 64: - cctl_memcpy |= PL080_BSIZE_64 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_64 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_64; break; case 128: - cctl_memcpy |= PL080_BSIZE_128 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_128 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_128; break; case 256: - cctl_memcpy |= PL080_BSIZE_256 << PL080_CONTROL_SB_SIZE_SHIFT | - PL080_BSIZE_256 << PL080_CONTROL_DB_SIZE_SHIFT; + pd->memcpy_burst_size = PL08X_BURST_SZ_256; break; } @@ -2182,28 +2249,16 @@ static int pl08x_of_probe(struct amba_device *adev, dev_err(&adev->dev, "illegal bus width for memcpy, set to 8 bits\n"); /* Fall through */ case 8: - cctl_memcpy |= PL080_WIDTH_8BIT << PL080_CONTROL_SWIDTH_SHIFT | - PL080_WIDTH_8BIT << PL080_CONTROL_DWIDTH_SHIFT; + pd->memcpy_bus_width = PL08X_BUS_WIDTH_8_BITS; break; case 16: - cctl_memcpy |= PL080_WIDTH_16BIT << PL080_CONTROL_SWIDTH_SHIFT | - PL080_WIDTH_16BIT << PL080_CONTROL_DWIDTH_SHIFT; + pd->memcpy_bus_width = PL08X_BUS_WIDTH_16_BITS; break; case 32: - cctl_memcpy |= PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT | - PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT; + pd->memcpy_bus_width = PL08X_BUS_WIDTH_32_BITS; break; } - /* This is currently the only thing making sense */ - cctl_memcpy |= PL080_CONTROL_PROT_SYS; - - /* Set up memcpy channel */ - pd->memcpy_channel.bus_id = "memcpy"; - pd->memcpy_channel.cctl_memcpy = cctl_memcpy; - /* Use the buses that can access memory, obviously */ - pd->memcpy_channel.periph_buses = pd->mem_buses; - /* * Allocate channel data for all possible slave channels (one * for each possible signal), channels will then be allocated diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 5308eae9ce35..79d1bcee738d 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -47,8 +47,6 @@ enum { * devices with static assignments * @muxval: a number usually used to poke into some mux regiser to * mux in the signal to this channel - * @cctl_memcpy: options for the channel control register for memcpy - * *** not used for slave channels *** * @addr: source/target address in physical memory for this DMA channel, * can be the address of a FIFO register for burst requests for example. 
* This can be left undefined if the PrimeCell API is used for configuring @@ -63,12 +61,28 @@ struct pl08x_channel_data { int min_signal; int max_signal; u32 muxval; - u32 cctl_memcpy; dma_addr_t addr; bool single; u8 periph_buses; }; +enum pl08x_burst_size { + PL08X_BURST_SZ_1, + PL08X_BURST_SZ_4, + PL08X_BURST_SZ_8, + PL08X_BURST_SZ_16, + PL08X_BURST_SZ_32, + PL08X_BURST_SZ_64, + PL08X_BURST_SZ_128, + PL08X_BURST_SZ_256, +}; + +enum pl08x_bus_width { + PL08X_BUS_WIDTH_8_BITS, + PL08X_BUS_WIDTH_16_BITS, + PL08X_BUS_WIDTH_32_BITS, +}; + /** * struct pl08x_platform_data - the platform configuration for the PL08x * PrimeCells. @@ -76,6 +90,11 @@ struct pl08x_channel_data { * platform, all inclusive, including multiplexed channels. The available * physical channels will be multiplexed around these signals as they are * requested, just enumerate all possible channels. + * @num_slave_channels: number of elements in the slave channel array + * @memcpy_burst_size: the appropriate burst size for memcpy operations + * @memcpy_bus_width: memory bus width + * @memcpy_prot_buff: whether memcpy DMA is bufferable + * @memcpy_prot_cache: whether memcpy DMA is cacheable * @get_xfer_signal: request a physical signal to be used for a DMA transfer * immediately: if there is some multiplexing or similar blocking the use * of the channel the transfer can be denied by returning less than zero, @@ -90,7 +109,10 @@ struct pl08x_channel_data { struct pl08x_platform_data { struct pl08x_channel_data *slave_channels; unsigned int num_slave_channels; - struct pl08x_channel_data memcpy_channel; + enum pl08x_burst_size memcpy_burst_size; + enum pl08x_bus_width memcpy_bus_width; + bool memcpy_prot_buff; + bool memcpy_prot_cache; int (*get_xfer_signal)(const struct pl08x_channel_data *); void (*put_xfer_signal)(const struct pl08x_channel_data *, int); u8 lli_buses; -- cgit v1.2.3 From 9e972571d4040213775eb40f17ac80c109e09f8a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 20 May 2017 23:42:51 +0200 Subject: dmaengine: pl08x: Add Faraday FTDMAC020 to compatible list This augments the PL08x bindings to include the Faraday Technology FTDMAC020 DMA engine, as it is clearly a derivative of the PL08x PrimeCell. Also specify that it needs the special peripheral ID specified to work properly. Cc: devicetree@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/arm-pl08x.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/arm-pl08x.txt b/Documentation/devicetree/bindings/dma/arm-pl08x.txt index 8a0097a029d3..0ba81f79266f 100644 --- a/Documentation/devicetree/bindings/dma/arm-pl08x.txt +++ b/Documentation/devicetree/bindings/dma/arm-pl08x.txt @@ -3,6 +3,11 @@ Required properties: - compatible: "arm,pl080", "arm,primecell"; "arm,pl081", "arm,primecell"; + "faraday,ftdmac020", "arm,primecell" +- arm,primecell-periphid: on the FTDMAC020 the primecell ID is not hard-coded + in the hardware and must be specified here as <0x0003b080>. This number + follows the PrimeCell standard numbering using the JEP106 vendor code 0x38 + for Faraday Technology. 
- reg: Address range of the PL08x registers - interrupt: The PL08x interrupt number - clocks: The clock running the IP core clock @@ -20,8 +25,8 @@ Optional properties: - dma-requests: contains the total number of DMA requests supported by the DMAC - memcpy-burst-size: the size of the bursts for memcpy: 1, 4, 8, 16, 32 64, 128 or 256 bytes are legal values -- memcpy-bus-width: the bus width used for memcpy: 8, 16 or 32 are legal - values +- memcpy-bus-width: the bus width used for memcpy in bits: 8, 16 or 32 are legal + values, the Faraday FTDMAC020 can also accept 64 bits Clients Required properties: -- cgit v1.2.3 From ebe9b3005d90d4500f4feb605a4ab6191e2dd34f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 20 May 2017 23:42:52 +0200 Subject: dmaengine: pl08x: Make slave engine optional If the vendor data does not specify any signals, we do not have to support slave DMA. Make the registration of the slave DMA engine optional, so we can use this for the FTDMAC020 in the Gemini that only has memcpy support. Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- drivers/dma/amba-pl08x.c | 128 +++++++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 50 deletions(-) diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index d3d660a36ad7..6ddb029dac50 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -253,8 +253,9 @@ struct pl08x_dma_chan { /** * struct pl08x_driver_data - the local state holder for the PL08x - * @slave: slave engine for this instance + * @slave: optional slave engine for this instance * @memcpy: memcpy engine for this instance + * @has_slave: the PL08x has a slave engine (routed signals) * @base: virtual memory base (remapped) for the PL08x * @adev: the corresponding AMBA (PrimeCell) bus entry * @vd: vendor data for this PL08x variant @@ -269,6 +270,7 @@ struct pl08x_dma_chan { struct pl08x_driver_data { struct dma_device slave; struct dma_device memcpy; + bool has_slave; void __iomem *base; struct amba_device *adev; const struct vendor_data *vd; @@ -705,7 +707,7 @@ static void pl08x_phy_free(struct pl08x_dma_chan *plchan) break; } - if (!next) { + if (!next && pl08x->has_slave) { list_for_each_entry(p, &pl08x->slave.channels, vc.chan.device_node) if (p->state == PL08X_CHAN_WAITING) { next = p; @@ -2085,12 +2087,15 @@ static int pl08x_debugfs_show(struct seq_file *s, void *data) pl08x_state_str(chan->state)); } - seq_printf(s, "\nPL08x virtual slave channels:\n"); - seq_printf(s, "CHANNEL:\tSTATE:\n"); - seq_printf(s, "--------\t------\n"); - list_for_each_entry(chan, &pl08x->slave.channels, vc.chan.device_node) { - seq_printf(s, "%s\t\t%s\n", chan->name, - pl08x_state_str(chan->state)); + if (pl08x->has_slave) { + seq_printf(s, "\nPL08x virtual slave channels:\n"); + seq_printf(s, "CHANNEL:\tSTATE:\n"); + seq_printf(s, "--------\t------\n"); + list_for_each_entry(chan, &pl08x->slave.channels, + vc.chan.device_node) { + seq_printf(s, "%s\t\t%s\n", chan->name, + pl08x_state_str(chan->state)); + } } return 0; @@ -2128,6 +2133,10 @@ static struct dma_chan *pl08x_find_chan_id(struct pl08x_driver_data *pl08x, { struct pl08x_dma_chan *chan; + /* Trying to get a slave channel from something with no slave support */ + if (!pl08x->has_slave) + return NULL; + list_for_each_entry(chan, &pl08x->slave.channels, vc.chan.device_node) { if (chan->signal == id) return &chan->vc.chan; @@ -2265,20 +2274,24 @@ static int pl08x_of_probe(struct amba_device *adev, * for a device and have it's AHB interfaces set up at * 
translation time. */ - chanp = devm_kcalloc(&adev->dev, - pl08x->vd->signals, - sizeof(struct pl08x_channel_data), - GFP_KERNEL); - if (!chanp) - return -ENOMEM; + if (pl08x->vd->signals) { + chanp = devm_kcalloc(&adev->dev, + pl08x->vd->signals, + sizeof(struct pl08x_channel_data), + GFP_KERNEL); + if (!chanp) + return -ENOMEM; - pd->slave_channels = chanp; - for (i = 0; i < pl08x->vd->signals; i++) { - /* chanp->periph_buses will be assigned at translation */ - chanp->bus_id = kasprintf(GFP_KERNEL, "slave%d", i); - chanp++; + pd->slave_channels = chanp; + for (i = 0; i < pl08x->vd->signals; i++) { + /* + * chanp->periph_buses will be assigned at translation + */ + chanp->bus_id = kasprintf(GFP_KERNEL, "slave%d", i); + chanp++; + } + pd->num_slave_channels = pl08x->vd->signals; } - pd->num_slave_channels = pl08x->vd->signals; pl08x->pd = pd; @@ -2340,24 +2353,34 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) pl08x->memcpy.directions = BIT(DMA_MEM_TO_MEM); pl08x->memcpy.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT; - /* Initialize slave engine */ - dma_cap_set(DMA_SLAVE, pl08x->slave.cap_mask); - dma_cap_set(DMA_CYCLIC, pl08x->slave.cap_mask); - pl08x->slave.dev = &adev->dev; - pl08x->slave.device_free_chan_resources = pl08x_free_chan_resources; - pl08x->slave.device_prep_dma_interrupt = pl08x_prep_dma_interrupt; - pl08x->slave.device_tx_status = pl08x_dma_tx_status; - pl08x->slave.device_issue_pending = pl08x_issue_pending; - pl08x->slave.device_prep_slave_sg = pl08x_prep_slave_sg; - pl08x->slave.device_prep_dma_cyclic = pl08x_prep_dma_cyclic; - pl08x->slave.device_config = pl08x_config; - pl08x->slave.device_pause = pl08x_pause; - pl08x->slave.device_resume = pl08x_resume; - pl08x->slave.device_terminate_all = pl08x_terminate_all; - pl08x->slave.src_addr_widths = PL80X_DMA_BUSWIDTHS; - pl08x->slave.dst_addr_widths = PL80X_DMA_BUSWIDTHS; - pl08x->slave.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); - pl08x->slave.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT; + /* + * Initialize slave engine, if the block has no signals, that means + * we have no slave support. 
+ */ + if (vd->signals) { + pl08x->has_slave = true; + dma_cap_set(DMA_SLAVE, pl08x->slave.cap_mask); + dma_cap_set(DMA_CYCLIC, pl08x->slave.cap_mask); + pl08x->slave.dev = &adev->dev; + pl08x->slave.device_free_chan_resources = + pl08x_free_chan_resources; + pl08x->slave.device_prep_dma_interrupt = + pl08x_prep_dma_interrupt; + pl08x->slave.device_tx_status = pl08x_dma_tx_status; + pl08x->slave.device_issue_pending = pl08x_issue_pending; + pl08x->slave.device_prep_slave_sg = pl08x_prep_slave_sg; + pl08x->slave.device_prep_dma_cyclic = pl08x_prep_dma_cyclic; + pl08x->slave.device_config = pl08x_config; + pl08x->slave.device_pause = pl08x_pause; + pl08x->slave.device_resume = pl08x_resume; + pl08x->slave.device_terminate_all = pl08x_terminate_all; + pl08x->slave.src_addr_widths = PL80X_DMA_BUSWIDTHS; + pl08x->slave.dst_addr_widths = PL80X_DMA_BUSWIDTHS; + pl08x->slave.directions = + BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + pl08x->slave.residue_granularity = + DMA_RESIDUE_GRANULARITY_SEGMENT; + } /* Get the platform data */ pl08x->pd = dev_get_platdata(&adev->dev); @@ -2465,13 +2488,15 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) } /* Register slave channels */ - ret = pl08x_dma_init_virtual_channels(pl08x, &pl08x->slave, - pl08x->pd->num_slave_channels, true); - if (ret < 0) { - dev_warn(&pl08x->adev->dev, - "%s failed to enumerate slave channels - %d\n", - __func__, ret); - goto out_no_slave; + if (pl08x->has_slave) { + ret = pl08x_dma_init_virtual_channels(pl08x, &pl08x->slave, + pl08x->pd->num_slave_channels, true); + if (ret < 0) { + dev_warn(&pl08x->adev->dev, + "%s failed to enumerate slave channels - %d\n", + __func__, ret); + goto out_no_slave; + } } ret = dma_async_device_register(&pl08x->memcpy); @@ -2482,12 +2507,14 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) goto out_no_memcpy_reg; } - ret = dma_async_device_register(&pl08x->slave); - if (ret) { - dev_warn(&pl08x->adev->dev, + if (pl08x->has_slave) { + ret = dma_async_device_register(&pl08x->slave); + if (ret) { + dev_warn(&pl08x->adev->dev, "%s failed to register slave as an async device - %d\n", __func__, ret); - goto out_no_slave_reg; + goto out_no_slave_reg; + } } amba_set_drvdata(adev, pl08x); @@ -2501,7 +2528,8 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) out_no_slave_reg: dma_async_device_unregister(&pl08x->memcpy); out_no_memcpy_reg: - pl08x_free_virtual_channels(&pl08x->slave); + if (pl08x->has_slave) + pl08x_free_virtual_channels(&pl08x->slave); out_no_slave: pl08x_free_virtual_channels(&pl08x->memcpy); out_no_memcpy: -- cgit v1.2.3 From 1e1cfc7213a37131a53e7dfada75dce77b8e043d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 20 May 2017 23:42:53 +0200 Subject: dmaengine: pl08x: Add support for Faraday Technology FTDMAC020 After reading the specs for the Faraday Technology FTDMAC020 found in the Gemini platform, it becomes pretty evident that this is just another PL08x derivative, and should be handled like such by simply extending the existing PL08x driver to handle the quirks in this hardware. This patch makes memcpy work and has been tested on the Gemini and also regression-tested on the Nomadik NHK15 using dmatest with 10 threads per channel without a hinch for hours. 
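A sketch of how such a derivative is keyed in the driver: a new vendor_data flag selects the FTDMAC020 register layout at probe time. The field values below are illustrative assumptions, not the literal hunk:

static struct vendor_data vendor_ftdmac020 = {
	/* Assumed values for illustration only */
	.channels = 8,
	.dualmaster = false,
	.ftdmac020 = true,
	.max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK,
};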
I have not implemented slave DMA in those codepaths, because this device (Gemini) does not use slave DMA, and it seems like devices using FTDMAC020 for device DMA have a slightly different register layout so some real hardware is needed to proceed with this. I left some FIXME etc in the code for this. I had to do some refactorings of some helper functions, but I have not split those into separate patches because these refactorings do not make much sense without the increased complexity of handling the FTDMAC020. The DMA test would hang the platform on me on the Gemini after a few thousand iterations, however after turning of the caches the problem immediately disappeared and I could run the DMA engine with 10 threads pers physical channel for days in a row without a crash. I think there is no problem with the DMA driver: instead it is something fishy in the FA526 cache handling code that get pretty heavily exercised by the DMA engine and we need to go and fix that instead. Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 6 +- drivers/dma/amba-pl08x.c | 767 ++++++++++++++++++++++++++++++++++++--------- include/linux/amba/pl080.h | 83 ++++- 3 files changed, 698 insertions(+), 158 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 24e8597b2c3e..67ab15aa98c3 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -62,8 +62,10 @@ config AMBA_PL08X select DMA_ENGINE select DMA_VIRTUAL_CHANNELS help - Platform has a PL08x DMAC device - which can provide DMA engine support + Say yes if your platform has a PL08x DMAC device which can + provide DMA engine support. This includes the original ARM + PL080 and PL081, Samsungs PL080 derivative and Faraday + Technology's FTDMAC020 PL080 derivative. config AMCC_PPC440SPE_ADMA tristate "AMCC PPC440SPe ADMA support" diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 6ddb029dac50..13cc95c0474c 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -1,9 +1,10 @@ /* * Copyright (c) 2006 ARM Ltd. * Copyright (c) 2010 ST-Ericsson SA + * Copyirght (c) 2017 Linaro Ltd. * * Author: Peter Pearse - * Author: Linus Walleij + * Author: Linus Walleij * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -110,11 +111,12 @@ struct pl08x_driver_data; * @channels: the number of channels available in this variant * @signals: the number of request signals available from the hardware * @dualmaster: whether this version supports dual AHB masters or not. - * @nomadik: whether the channels have Nomadik security extension bits - * that need to be checked for permission before use and some registers are - * missing - * @pl080s: whether this version is a PL080S, which has separate register and - * LLI word for transfer size. + * @nomadik: whether this variant is a ST Microelectronics Nomadik, where the + * channels have Nomadik security extension bits that need to be checked + * for permission before use and some registers are missing + * @pl080s: whether this variant is a Samsung PL080S, which has separate + * register and LLI word for transfer size. + * @ftdmac020: whether this variant is a Faraday Technology FTDMAC020 * @max_transfer_size: the maximum single element transfer size for this * PL08x variant. 
*/ @@ -125,6 +127,7 @@ struct vendor_data { bool dualmaster; bool nomadik; bool pl080s; + bool ftdmac020; u32 max_transfer_size; }; @@ -148,19 +151,34 @@ struct pl08x_bus_data { * @id: physical index to this channel * @base: memory base address for this physical channel * @reg_config: configuration address for this physical channel + * @reg_control: control address for this physical channel + * @reg_src: transfer source address register + * @reg_dst: transfer destination address register + * @reg_lli: transfer LLI address register + * @reg_busy: if the variant has a special per-channel busy register, + * this contains a pointer to it * @lock: a lock to use when altering an instance of this struct * @serving: the virtual channel currently being served by this physical * channel * @locked: channel unavailable for the system, e.g. dedicated to secure * world + * @ftdmac020: channel is on a FTDMAC020 + * @pl080s: channel is on a PL08s */ struct pl08x_phy_chan { unsigned int id; void __iomem *base; void __iomem *reg_config; + void __iomem *reg_control; + void __iomem *reg_src; + void __iomem *reg_dst; + void __iomem *reg_lli; + void __iomem *reg_busy; spinlock_t lock; struct pl08x_dma_chan *serving; bool locked; + bool ftdmac020; + bool pl080s; }; /** @@ -362,10 +380,24 @@ static int pl08x_phy_channel_busy(struct pl08x_phy_chan *ch) { unsigned int val; + /* If we have a special busy register, take a shortcut */ + if (ch->reg_busy) { + val = readl(ch->reg_busy); + return !!(val & BIT(ch->id)); + } val = readl(ch->reg_config); return val & PL080_CONFIG_ACTIVE; } +/* + * pl08x_write_lli() - Write an LLI into the DMA controller. + * + * The PL08x derivatives support linked lists, but the first item of the + * list containing the source, destination, control word and next LLI is + * ignored. Instead the driver has to write those values directly into the + * SRC, DST, LLI and control registers. On FTDMAC020 also the SIZE + * register need to be set up for the first transfer. + */ static void pl08x_write_lli(struct pl08x_driver_data *pl08x, struct pl08x_phy_chan *phychan, const u32 *lli, u32 ccfg) { @@ -383,11 +415,112 @@ static void pl08x_write_lli(struct pl08x_driver_data *pl08x, phychan->id, lli[PL080_LLI_SRC], lli[PL080_LLI_DST], lli[PL080_LLI_LLI], lli[PL080_LLI_CCTL], ccfg); - writel_relaxed(lli[PL080_LLI_SRC], phychan->base + PL080_CH_SRC_ADDR); - writel_relaxed(lli[PL080_LLI_DST], phychan->base + PL080_CH_DST_ADDR); - writel_relaxed(lli[PL080_LLI_LLI], phychan->base + PL080_CH_LLI); - writel_relaxed(lli[PL080_LLI_CCTL], phychan->base + PL080_CH_CONTROL); + writel_relaxed(lli[PL080_LLI_SRC], phychan->reg_src); + writel_relaxed(lli[PL080_LLI_DST], phychan->reg_dst); + writel_relaxed(lli[PL080_LLI_LLI], phychan->reg_lli); + /* + * The FTMAC020 has a different layout in the CCTL word of the LLI + * and the CCTL register which is split in CSR and SIZE registers. + * Convert the LLI item CCTL into the proper values to write into + * the CSR and SIZE registers. + */ + if (phychan->ftdmac020) { + u32 llictl = lli[PL080_LLI_CCTL]; + u32 val = 0; + + /* Write the transfer size (12 bits) to the size register */ + writel_relaxed(llictl & FTDMAC020_LLI_TRANSFER_SIZE_MASK, + phychan->base + FTDMAC020_CH_SIZE); + /* + * Then write the control bits 28..16 to the control register + * by shuffleing the bits around to where they are in the + * main register. 
The mapping is as follows: + * Bit 28: TC_MSK - mask on all except last LLI + * Bit 27..25: SRC_WIDTH + * Bit 24..22: DST_WIDTH + * Bit 21..20: SRCAD_CTRL + * Bit 19..17: DSTAD_CTRL + * Bit 17: SRC_SEL + * Bit 16: DST_SEL + */ + if (llictl & FTDMAC020_LLI_TC_MSK) + val |= FTDMAC020_CH_CSR_TC_MSK; + val |= ((llictl & FTDMAC020_LLI_SRC_WIDTH_MSK) >> + (FTDMAC020_LLI_SRC_WIDTH_SHIFT - + FTDMAC020_CH_CSR_SRC_WIDTH_SHIFT)); + val |= ((llictl & FTDMAC020_LLI_DST_WIDTH_MSK) >> + (FTDMAC020_LLI_DST_WIDTH_SHIFT - + FTDMAC020_CH_CSR_DST_WIDTH_SHIFT)); + val |= ((llictl & FTDMAC020_LLI_SRCAD_CTL_MSK) >> + (FTDMAC020_LLI_SRCAD_CTL_SHIFT - + FTDMAC020_CH_CSR_SRCAD_CTL_SHIFT)); + val |= ((llictl & FTDMAC020_LLI_DSTAD_CTL_MSK) >> + (FTDMAC020_LLI_DSTAD_CTL_SHIFT - + FTDMAC020_CH_CSR_DSTAD_CTL_SHIFT)); + if (llictl & FTDMAC020_LLI_SRC_SEL) + val |= FTDMAC020_CH_CSR_SRC_SEL; + if (llictl & FTDMAC020_LLI_DST_SEL) + val |= FTDMAC020_CH_CSR_DST_SEL; + + /* + * Set up the bits that exist in the CSR but are not + * part the LLI, i.e. only gets written to the control + * register right here. + * + * FIXME: do not just handle memcpy, also handle slave DMA. + */ + switch (pl08x->pd->memcpy_burst_size) { + default: + case PL08X_BURST_SZ_1: + val |= PL080_BSIZE_1 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_4: + val |= PL080_BSIZE_4 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_8: + val |= PL080_BSIZE_8 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_16: + val |= PL080_BSIZE_16 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_32: + val |= PL080_BSIZE_32 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_64: + val |= PL080_BSIZE_64 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_128: + val |= PL080_BSIZE_128 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + case PL08X_BURST_SZ_256: + val |= PL080_BSIZE_256 << + FTDMAC020_CH_CSR_SRC_SIZE_SHIFT; + break; + } + + /* Protection flags */ + if (pl08x->pd->memcpy_prot_buff) + val |= FTDMAC020_CH_CSR_PROT2; + if (pl08x->pd->memcpy_prot_cache) + val |= FTDMAC020_CH_CSR_PROT3; + /* We are the kernel, so we are in privileged mode */ + val |= FTDMAC020_CH_CSR_PROT1; + + writel_relaxed(val, phychan->reg_control); + } else { + /* Bits are just identical */ + writel_relaxed(lli[PL080_LLI_CCTL], phychan->reg_control); + } + + /* Second control word on the PL080s */ if (pl08x->vd->pl080s) writel_relaxed(lli[PL080S_LLI_CCTL2], phychan->base + PL080S_CH_CONTROL2); @@ -425,11 +558,25 @@ static void pl08x_start_next_txd(struct pl08x_dma_chan *plchan) cpu_relax(); /* Do not access config register until channel shows as inactive */ - val = readl(phychan->reg_config); - while ((val & PL080_CONFIG_ACTIVE) || (val & PL080_CONFIG_ENABLE)) + if (phychan->ftdmac020) { + val = readl(phychan->reg_config); + while (val & FTDMAC020_CH_CFG_BUSY) + val = readl(phychan->reg_config); + + val = readl(phychan->reg_control); + while (val & FTDMAC020_CH_CSR_EN) + val = readl(phychan->reg_control); + + writel(val | FTDMAC020_CH_CSR_EN, + phychan->reg_control); + } else { val = readl(phychan->reg_config); + while ((val & PL080_CONFIG_ACTIVE) || + (val & PL080_CONFIG_ENABLE)) + val = readl(phychan->reg_config); - writel(val | PL080_CONFIG_ENABLE, phychan->reg_config); + writel(val | PL080_CONFIG_ENABLE, phychan->reg_config); + } } /* @@ -447,6 +594,14 @@ static void pl08x_pause_phy_chan(struct pl08x_phy_chan *ch) u32 val; int timeout; + if (ch->ftdmac020) { + /* Use the enable bit on 
the FTDMAC020 */ + val = readl(ch->reg_control); + val &= ~FTDMAC020_CH_CSR_EN; + writel(val, ch->reg_control); + return; + } + /* Set the HALT bit and wait for the FIFO to drain */ val = readl(ch->reg_config); val |= PL080_CONFIG_HALT; @@ -466,6 +621,14 @@ static void pl08x_resume_phy_chan(struct pl08x_phy_chan *ch) { u32 val; + /* Use the enable bit on the FTDMAC020 */ + if (ch->ftdmac020) { + val = readl(ch->reg_control); + val |= FTDMAC020_CH_CSR_EN; + writel(val, ch->reg_control); + return; + } + /* Clear the HALT bit */ val = readl(ch->reg_config); val &= ~PL080_CONFIG_HALT; @@ -481,25 +644,68 @@ static void pl08x_resume_phy_chan(struct pl08x_phy_chan *ch) static void pl08x_terminate_phy_chan(struct pl08x_driver_data *pl08x, struct pl08x_phy_chan *ch) { - u32 val = readl(ch->reg_config); + u32 val; + + /* The layout for the FTDMAC020 is different */ + if (ch->ftdmac020) { + /* Disable all interrupts */ + val = readl(ch->reg_config); + val |= (FTDMAC020_CH_CFG_INT_ABT_MASK | + FTDMAC020_CH_CFG_INT_ERR_MASK | + FTDMAC020_CH_CFG_INT_TC_MASK); + writel(val, ch->reg_config); + + /* Abort and disable channel */ + val = readl(ch->reg_control); + val &= ~FTDMAC020_CH_CSR_EN; + val |= FTDMAC020_CH_CSR_ABT; + writel(val, ch->reg_control); + + /* Clear ABT and ERR interrupt flags */ + writel(BIT(ch->id) | BIT(ch->id + 16), + pl08x->base + PL080_ERR_CLEAR); + writel(BIT(ch->id), pl08x->base + PL080_TC_CLEAR); + return; + } + + val = readl(ch->reg_config); val &= ~(PL080_CONFIG_ENABLE | PL080_CONFIG_ERR_IRQ_MASK | PL080_CONFIG_TC_IRQ_MASK); - writel(val, ch->reg_config); writel(BIT(ch->id), pl08x->base + PL080_ERR_CLEAR); writel(BIT(ch->id), pl08x->base + PL080_TC_CLEAR); } -static inline u32 get_bytes_in_cctl(u32 cctl) +static u32 get_bytes_in_phy_channel(struct pl08x_phy_chan *ch) { - /* The source width defines the number of bytes */ - u32 bytes = cctl & PL080_CONTROL_TRANSFER_SIZE_MASK; + u32 val; + u32 bytes; + + if (ch->ftdmac020) { + bytes = readl(ch->base + FTDMAC020_CH_SIZE); + + val = readl(ch->reg_control); + val &= FTDMAC020_CH_CSR_SRC_WIDTH_MSK; + val >>= FTDMAC020_CH_CSR_SRC_WIDTH_SHIFT; + } else if (ch->pl080s) { + val = readl(ch->base + PL080S_CH_CONTROL2); + bytes = val & PL080S_CONTROL_TRANSFER_SIZE_MASK; - cctl &= PL080_CONTROL_SWIDTH_MASK; + val = readl(ch->reg_control); + val &= PL080_CONTROL_SWIDTH_MASK; + val >>= PL080_CONTROL_SWIDTH_SHIFT; + } else { + /* Plain PL08x */ + val = readl(ch->reg_control); + bytes = val & PL080_CONTROL_TRANSFER_SIZE_MASK; + + val &= PL080_CONTROL_SWIDTH_MASK; + val >>= PL080_CONTROL_SWIDTH_SHIFT; + } - switch (cctl >> PL080_CONTROL_SWIDTH_SHIFT) { + switch (val) { case PL080_WIDTH_8BIT: break; case PL080_WIDTH_16BIT: @@ -512,14 +718,35 @@ static inline u32 get_bytes_in_cctl(u32 cctl) return bytes; } -static inline u32 get_bytes_in_cctl_pl080s(u32 cctl, u32 cctl1) +static u32 get_bytes_in_lli(struct pl08x_phy_chan *ch, const u32 *llis_va) { - /* The source width defines the number of bytes */ - u32 bytes = cctl1 & PL080S_CONTROL_TRANSFER_SIZE_MASK; + u32 val; + u32 bytes; + + if (ch->ftdmac020) { + val = llis_va[PL080_LLI_CCTL]; + bytes = val & FTDMAC020_LLI_TRANSFER_SIZE_MASK; + + val = llis_va[PL080_LLI_CCTL]; + val &= FTDMAC020_LLI_SRC_WIDTH_MSK; + val >>= FTDMAC020_LLI_SRC_WIDTH_SHIFT; + } else if (ch->pl080s) { + val = llis_va[PL080S_LLI_CCTL2]; + bytes = val & PL080S_CONTROL_TRANSFER_SIZE_MASK; + + val = llis_va[PL080_LLI_CCTL]; + val &= PL080_CONTROL_SWIDTH_MASK; + val >>= PL080_CONTROL_SWIDTH_SHIFT; + } else { + /* Plain PL08x */ + val = 
llis_va[PL080_LLI_CCTL]; + bytes = val & PL080_CONTROL_TRANSFER_SIZE_MASK; - cctl &= PL080_CONTROL_SWIDTH_MASK; + val &= PL080_CONTROL_SWIDTH_MASK; + val >>= PL080_CONTROL_SWIDTH_SHIFT; + } - switch (cctl >> PL080_CONTROL_SWIDTH_SHIFT) { + switch (val) { case PL080_WIDTH_8BIT: break; case PL080_WIDTH_16BIT: @@ -554,15 +781,10 @@ static u32 pl08x_getbytes_chan(struct pl08x_dma_chan *plchan) * Follow the LLIs to get the number of remaining * bytes in the currently active transaction. */ - clli = readl(ch->base + PL080_CH_LLI) & ~PL080_LLI_LM_AHB2; + clli = readl(ch->reg_lli) & ~PL080_LLI_LM_AHB2; /* First get the remaining bytes in the active transfer */ - if (pl08x->vd->pl080s) - bytes = get_bytes_in_cctl_pl080s( - readl(ch->base + PL080_CH_CONTROL), - readl(ch->base + PL080S_CH_CONTROL2)); - else - bytes = get_bytes_in_cctl(readl(ch->base + PL080_CH_CONTROL)); + bytes = get_bytes_in_phy_channel(ch); if (!clli) return bytes; @@ -583,12 +805,7 @@ static u32 pl08x_getbytes_chan(struct pl08x_dma_chan *plchan) llis_va_limit = llis_va + llis_max_words; for (; llis_va < llis_va_limit; llis_va += pl08x->lli_words) { - if (pl08x->vd->pl080s) - bytes += get_bytes_in_cctl_pl080s( - llis_va[PL080_LLI_CCTL], - llis_va[PL080S_LLI_CCTL2]); - else - bytes += get_bytes_in_cctl(llis_va[PL080_LLI_CCTL]); + bytes += get_bytes_in_lli(ch, llis_va); /* * A LLI pointer going backward terminates the LLI list @@ -748,9 +965,30 @@ static void pl08x_phy_free(struct pl08x_dma_chan *plchan) * LLI handling */ -static inline unsigned int pl08x_get_bytes_for_cctl(unsigned int coded) +static inline unsigned int +pl08x_get_bytes_for_lli(struct pl08x_driver_data *pl08x, + u32 cctl, + bool source) { - switch (coded) { + u32 val; + + if (pl08x->vd->ftdmac020) { + if (source) + val = (cctl & FTDMAC020_LLI_SRC_WIDTH_MSK) >> + FTDMAC020_LLI_SRC_WIDTH_SHIFT; + else + val = (cctl & FTDMAC020_LLI_DST_WIDTH_MSK) >> + FTDMAC020_LLI_DST_WIDTH_SHIFT; + } else { + if (source) + val = (cctl & PL080_CONTROL_SWIDTH_MASK) >> + PL080_CONTROL_SWIDTH_SHIFT; + else + val = (cctl & PL080_CONTROL_DWIDTH_MASK) >> + PL080_CONTROL_DWIDTH_SHIFT; + } + + switch (val) { case PL080_WIDTH_8BIT: return 1; case PL080_WIDTH_16BIT: @@ -764,49 +1002,106 @@ static inline unsigned int pl08x_get_bytes_for_cctl(unsigned int coded) return 0; } -static inline u32 pl08x_cctl_bits(u32 cctl, u8 srcwidth, u8 dstwidth, - size_t tsize) +static inline u32 pl08x_lli_control_bits(struct pl08x_driver_data *pl08x, + u32 cctl, + u8 srcwidth, u8 dstwidth, + size_t tsize) { u32 retbits = cctl; - /* Remove all src, dst and transfer size bits */ - retbits &= ~PL080_CONTROL_DWIDTH_MASK; - retbits &= ~PL080_CONTROL_SWIDTH_MASK; - retbits &= ~PL080_CONTROL_TRANSFER_SIZE_MASK; + /* + * Remove all src, dst and transfer size bits, then set the + * width and size according to the parameters. The bit offsets + * are different in the FTDMAC020 so we need to accound for this. 
+ */ + if (pl08x->vd->ftdmac020) { + retbits &= ~FTDMAC020_LLI_DST_WIDTH_MSK; + retbits &= ~FTDMAC020_LLI_SRC_WIDTH_MSK; + retbits &= ~FTDMAC020_LLI_TRANSFER_SIZE_MASK; + + switch (srcwidth) { + case 1: + retbits |= PL080_WIDTH_8BIT << + FTDMAC020_LLI_SRC_WIDTH_SHIFT; + break; + case 2: + retbits |= PL080_WIDTH_16BIT << + FTDMAC020_LLI_SRC_WIDTH_SHIFT; + break; + case 4: + retbits |= PL080_WIDTH_32BIT << + FTDMAC020_LLI_SRC_WIDTH_SHIFT; + break; + default: + BUG(); + break; + } - /* Then set the bits according to the parameters */ - switch (srcwidth) { - case 1: - retbits |= PL080_WIDTH_8BIT << PL080_CONTROL_SWIDTH_SHIFT; - break; - case 2: - retbits |= PL080_WIDTH_16BIT << PL080_CONTROL_SWIDTH_SHIFT; - break; - case 4: - retbits |= PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT; - break; - default: - BUG(); - break; - } + switch (dstwidth) { + case 1: + retbits |= PL080_WIDTH_8BIT << + FTDMAC020_LLI_DST_WIDTH_SHIFT; + break; + case 2: + retbits |= PL080_WIDTH_16BIT << + FTDMAC020_LLI_DST_WIDTH_SHIFT; + break; + case 4: + retbits |= PL080_WIDTH_32BIT << + FTDMAC020_LLI_DST_WIDTH_SHIFT; + break; + default: + BUG(); + break; + } - switch (dstwidth) { - case 1: - retbits |= PL080_WIDTH_8BIT << PL080_CONTROL_DWIDTH_SHIFT; - break; - case 2: - retbits |= PL080_WIDTH_16BIT << PL080_CONTROL_DWIDTH_SHIFT; - break; - case 4: - retbits |= PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT; - break; - default: - BUG(); - break; + tsize &= FTDMAC020_LLI_TRANSFER_SIZE_MASK; + retbits |= tsize << FTDMAC020_LLI_TRANSFER_SIZE_SHIFT; + } else { + retbits &= ~PL080_CONTROL_DWIDTH_MASK; + retbits &= ~PL080_CONTROL_SWIDTH_MASK; + retbits &= ~PL080_CONTROL_TRANSFER_SIZE_MASK; + + switch (srcwidth) { + case 1: + retbits |= PL080_WIDTH_8BIT << + PL080_CONTROL_SWIDTH_SHIFT; + break; + case 2: + retbits |= PL080_WIDTH_16BIT << + PL080_CONTROL_SWIDTH_SHIFT; + break; + case 4: + retbits |= PL080_WIDTH_32BIT << + PL080_CONTROL_SWIDTH_SHIFT; + break; + default: + BUG(); + break; + } + + switch (dstwidth) { + case 1: + retbits |= PL080_WIDTH_8BIT << + PL080_CONTROL_DWIDTH_SHIFT; + break; + case 2: + retbits |= PL080_WIDTH_16BIT << + PL080_CONTROL_DWIDTH_SHIFT; + break; + case 4: + retbits |= PL080_WIDTH_32BIT << + PL080_CONTROL_DWIDTH_SHIFT; + break; + default: + BUG(); + break; + } + + tsize &= PL080_CONTROL_TRANSFER_SIZE_MASK; + retbits |= tsize << PL080_CONTROL_TRANSFER_SIZE_SHIFT; } - tsize &= PL080_CONTROL_TRANSFER_SIZE_MASK; - retbits |= tsize << PL080_CONTROL_TRANSFER_SIZE_SHIFT; return retbits; } @@ -827,13 +1122,35 @@ struct pl08x_lli_build_data { * - prefers the destination bus if both available * - prefers bus with fixed address (i.e. peripheral) */ -static void pl08x_choose_master_bus(struct pl08x_lli_build_data *bd, - struct pl08x_bus_data **mbus, struct pl08x_bus_data **sbus, u32 cctl) +static void pl08x_choose_master_bus(struct pl08x_driver_data *pl08x, + struct pl08x_lli_build_data *bd, + struct pl08x_bus_data **mbus, + struct pl08x_bus_data **sbus, + u32 cctl) { - if (!(cctl & PL080_CONTROL_DST_INCR)) { + bool dst_incr; + bool src_incr; + + /* + * The FTDMAC020 only supports memory-to-memory transfer, so + * source and destination always increase. + */ + if (pl08x->vd->ftdmac020) { + dst_incr = true; + src_incr = true; + } else { + dst_incr = !!(cctl & PL080_CONTROL_DST_INCR); + src_incr = !!(cctl & PL080_CONTROL_SRC_INCR); + } + + /* + * If either bus is not advancing, i.e. 
it is a peripheral, that + * one becomes master + */ + if (!dst_incr) { *mbus = &bd->dstbus; *sbus = &bd->srcbus; - } else if (!(cctl & PL080_CONTROL_SRC_INCR)) { + } else if (!src_incr) { *mbus = &bd->srcbus; *sbus = &bd->dstbus; } else { @@ -871,10 +1188,16 @@ static void pl08x_fill_lli_for_desc(struct pl08x_driver_data *pl08x, if (pl08x->vd->pl080s) llis_va[PL080S_LLI_CCTL2] = cctl2; - if (cctl & PL080_CONTROL_SRC_INCR) + if (pl08x->vd->ftdmac020) { + /* FIXME: only memcpy so far so both increase */ bd->srcbus.addr += len; - if (cctl & PL080_CONTROL_DST_INCR) bd->dstbus.addr += len; + } else { + if (cctl & PL080_CONTROL_SRC_INCR) + bd->srcbus.addr += len; + if (cctl & PL080_CONTROL_DST_INCR) + bd->dstbus.addr += len; + } BUG_ON(bd->remainder < len); @@ -885,12 +1208,12 @@ static inline void prep_byte_width_lli(struct pl08x_driver_data *pl08x, struct pl08x_lli_build_data *bd, u32 *cctl, u32 len, int num_llis, size_t *total_bytes) { - *cctl = pl08x_cctl_bits(*cctl, 1, 1, len); + *cctl = pl08x_lli_control_bits(pl08x, *cctl, 1, 1, len); pl08x_fill_lli_for_desc(pl08x, bd, num_llis, len, *cctl, len); (*total_bytes) += len; } -#ifdef VERBOSE_DEBUG +#if 1 static void pl08x_dump_lli(struct pl08x_driver_data *pl08x, const u32 *llis_va, int num_llis) { @@ -955,14 +1278,10 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, cctl = txd->cctl; /* Find maximum width of the source bus */ - bd.srcbus.maxwidth = - pl08x_get_bytes_for_cctl((cctl & PL080_CONTROL_SWIDTH_MASK) >> - PL080_CONTROL_SWIDTH_SHIFT); + bd.srcbus.maxwidth = pl08x_get_bytes_for_lli(pl08x, cctl, true); /* Find maximum width of the destination bus */ - bd.dstbus.maxwidth = - pl08x_get_bytes_for_cctl((cctl & PL080_CONTROL_DWIDTH_MASK) >> - PL080_CONTROL_DWIDTH_SHIFT); + bd.dstbus.maxwidth = pl08x_get_bytes_for_lli(pl08x, cctl, false); list_for_each_entry(dsg, &txd->dsg_list, node) { total_bytes = 0; @@ -974,7 +1293,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, bd.srcbus.buswidth = bd.srcbus.maxwidth; bd.dstbus.buswidth = bd.dstbus.maxwidth; - pl08x_choose_master_bus(&bd, &mbus, &sbus, cctl); + pl08x_choose_master_bus(pl08x, &bd, &mbus, &sbus, cctl); dev_vdbg(&pl08x->adev->dev, "src=0x%08llx%s/%u dst=0x%08llx%s/%u len=%zu\n", @@ -1011,8 +1330,14 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, * supported. Thus, we can't have scattered addresses. 
*/ if (!bd.remainder) { - u32 fc = (txd->ccfg & PL080_CONFIG_FLOW_CONTROL_MASK) >> - PL080_CONFIG_FLOW_CONTROL_SHIFT; + u32 fc; + + /* FTDMAC020 only does memory-to-memory */ + if (pl08x->vd->ftdmac020) + fc = PL080_FLOW_MEM2MEM; + else + fc = (txd->ccfg & PL080_CONFIG_FLOW_CONTROL_MASK) >> + PL080_CONFIG_FLOW_CONTROL_SHIFT; if (!((fc >= PL080_FLOW_SRC2DST_DST) && (fc <= PL080_FLOW_SRC2DST_SRC))) { dev_err(&pl08x->adev->dev, "%s sg len can't be zero", @@ -1029,8 +1354,9 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, return 0; } - cctl = pl08x_cctl_bits(cctl, bd.srcbus.buswidth, - bd.dstbus.buswidth, 0); + cctl = pl08x_lli_control_bits(pl08x, cctl, + bd.srcbus.buswidth, bd.dstbus.buswidth, + 0); pl08x_fill_lli_for_desc(pl08x, &bd, num_llis++, 0, cctl, 0); break; @@ -1109,8 +1435,9 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, "size 0x%08zx (remainder 0x%08zx)\n", __func__, lli_len, bd.remainder); - cctl = pl08x_cctl_bits(cctl, bd.srcbus.buswidth, - bd.dstbus.buswidth, tsize); + cctl = pl08x_lli_control_bits(pl08x, cctl, + bd.srcbus.buswidth, bd.dstbus.buswidth, + tsize); pl08x_fill_lli_for_desc(pl08x, &bd, num_llis++, lli_len, cctl, tsize); total_bytes += lli_len; @@ -1153,7 +1480,10 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, /* The final LLI terminates the LLI. */ last_lli[PL080_LLI_LLI] = 0; /* The final LLI element shall also fire an interrupt. */ - last_lli[PL080_LLI_CCTL] |= PL080_CONTROL_TC_IRQ_EN; + if (pl08x->vd->ftdmac020) + last_lli[PL080_LLI_CCTL] &= ~FTDMAC020_LLI_TC_MSK; + else + last_lli[PL080_LLI_CCTL] |= PL080_CONTROL_TC_IRQ_EN; } pl08x_dump_lli(pl08x, llis_va, num_llis); @@ -1319,14 +1649,25 @@ static const struct burst_table burst_sizes[] = { * will be routed to each port. We try to have source and destination * on separate ports, but always respect the allowable settings. 
*/ -static u32 pl08x_select_bus(u8 src, u8 dst) +static u32 pl08x_select_bus(bool ftdmac020, u8 src, u8 dst) { u32 cctl = 0; + u32 dst_ahb2; + u32 src_ahb2; + + /* The FTDMAC020 use different bits to indicate src/dst bus */ + if (ftdmac020) { + dst_ahb2 = FTDMAC020_LLI_DST_SEL; + src_ahb2 = FTDMAC020_LLI_SRC_SEL; + } else { + dst_ahb2 = PL080_CONTROL_DST_AHB2; + src_ahb2 = PL080_CONTROL_SRC_AHB2; + } if (!(dst & PL08X_AHB1) || ((dst & PL08X_AHB2) && (src & PL08X_AHB1))) - cctl |= PL080_CONTROL_DST_AHB2; + cctl |= dst_ahb2; if (!(src & PL08X_AHB1) || ((src & PL08X_AHB2) && !(dst & PL08X_AHB2))) - cctl |= PL080_CONTROL_SRC_AHB2; + cctl |= src_ahb2; return cctl; } @@ -1414,50 +1755,14 @@ static struct pl08x_txd *pl08x_get_txd(struct pl08x_dma_chan *plchan) { struct pl08x_txd *txd = kzalloc(sizeof(*txd), GFP_NOWAIT); - if (txd) { + if (txd) INIT_LIST_HEAD(&txd->dsg_list); - - /* Always enable error and terminal interrupts */ - txd->ccfg = PL080_CONFIG_ERR_IRQ_MASK | - PL080_CONFIG_TC_IRQ_MASK; - } return txd; } -/* - * Initialize a descriptor to be used by memcpy submit - */ -static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( - struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, - size_t len, unsigned long flags) +static u32 pl08x_memcpy_cctl(struct pl08x_driver_data *pl08x) { - struct pl08x_dma_chan *plchan = to_pl08x_chan(chan); - struct pl08x_driver_data *pl08x = plchan->host; - struct pl08x_txd *txd; - struct pl08x_sg *dsg; u32 cctl = 0; - int ret; - - txd = pl08x_get_txd(plchan); - if (!txd) { - dev_err(&pl08x->adev->dev, - "%s no memory for descriptor\n", __func__); - return NULL; - } - - dsg = kzalloc(sizeof(struct pl08x_sg), GFP_NOWAIT); - if (!dsg) { - pl08x_free_txd(pl08x, txd); - return NULL; - } - list_add_tail(&dsg->node, &txd->dsg_list); - - dsg->src_addr = src; - dsg->dst_addr = dest; - dsg->len = len; - - /* Set platform data for m2m */ - txd->ccfg |= PL080_FLOW_MEM2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; /* Conjure cctl */ switch (pl08x->pd->memcpy_burst_size) { @@ -1531,10 +1836,95 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; if (pl08x->vd->dualmaster) - cctl |= pl08x_select_bus(pl08x->mem_buses, + cctl |= pl08x_select_bus(false, + pl08x->mem_buses, + pl08x->mem_buses); + + return cctl; +} + +static u32 pl08x_ftdmac020_memcpy_cctl(struct pl08x_driver_data *pl08x) +{ + u32 cctl = 0; + + /* Conjure cctl */ + switch (pl08x->pd->memcpy_bus_width) { + default: + dev_err(&pl08x->adev->dev, + "illegal bus width for memcpy, set to 8 bits\n"); + /* Fall through */ + case PL08X_BUS_WIDTH_8_BITS: + cctl |= PL080_WIDTH_8BIT << FTDMAC020_LLI_SRC_WIDTH_SHIFT | + PL080_WIDTH_8BIT << FTDMAC020_LLI_DST_WIDTH_SHIFT; + break; + case PL08X_BUS_WIDTH_16_BITS: + cctl |= PL080_WIDTH_16BIT << FTDMAC020_LLI_SRC_WIDTH_SHIFT | + PL080_WIDTH_16BIT << FTDMAC020_LLI_DST_WIDTH_SHIFT; + break; + case PL08X_BUS_WIDTH_32_BITS: + cctl |= PL080_WIDTH_32BIT << FTDMAC020_LLI_SRC_WIDTH_SHIFT | + PL080_WIDTH_32BIT << FTDMAC020_LLI_DST_WIDTH_SHIFT; + break; + } + + /* + * By default mask the TC IRQ on all LLIs, it will be unmasked on + * the last LLI item by other code. 
+ */ + cctl |= FTDMAC020_LLI_TC_MSK; + + /* + * Both to be incremented so leave bits FTDMAC020_LLI_SRCAD_CTL + * and FTDMAC020_LLI_DSTAD_CTL as zero + */ + if (pl08x->vd->dualmaster) + cctl |= pl08x_select_bus(true, + pl08x->mem_buses, pl08x->mem_buses); - txd->cctl = cctl; + return cctl; +} + +/* + * Initialize a descriptor to be used by memcpy submit + */ +static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( + struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, + size_t len, unsigned long flags) +{ + struct pl08x_dma_chan *plchan = to_pl08x_chan(chan); + struct pl08x_driver_data *pl08x = plchan->host; + struct pl08x_txd *txd; + struct pl08x_sg *dsg; + int ret; + + txd = pl08x_get_txd(plchan); + if (!txd) { + dev_err(&pl08x->adev->dev, + "%s no memory for descriptor\n", __func__); + return NULL; + } + + dsg = kzalloc(sizeof(struct pl08x_sg), GFP_NOWAIT); + if (!dsg) { + pl08x_free_txd(pl08x, txd); + return NULL; + } + list_add_tail(&dsg->node, &txd->dsg_list); + + dsg->src_addr = src; + dsg->dst_addr = dest; + dsg->len = len; + if (pl08x->vd->ftdmac020) { + /* Writing CCFG zero ENABLES all interrupts */ + txd->ccfg = 0; + txd->cctl = pl08x_ftdmac020_memcpy_cctl(pl08x); + } else { + txd->ccfg = PL080_CONFIG_ERR_IRQ_MASK | + PL080_CONFIG_TC_IRQ_MASK | + PL080_FLOW_MEM2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; + txd->cctl = pl08x_memcpy_cctl(pl08x); + } ret = pl08x_fill_llis_for_desc(plchan->host, txd); if (!ret) { @@ -1598,7 +1988,7 @@ static struct pl08x_txd *pl08x_init_txd( return NULL; } - txd->cctl = cctl | pl08x_select_bus(src_buses, dst_buses); + txd->cctl = cctl | pl08x_select_bus(false, src_buses, dst_buses); if (plchan->cfg.device_fc) tmp = (direction == DMA_MEM_TO_DEV) ? PL080_FLOW_MEM2PER_PER : @@ -1607,7 +1997,9 @@ static struct pl08x_txd *pl08x_init_txd( tmp = (direction == DMA_MEM_TO_DEV) ? PL080_FLOW_MEM2PER : PL080_FLOW_PER2MEM; - txd->ccfg |= tmp << PL080_CONFIG_FLOW_CONTROL_SHIFT; + txd->ccfg = PL080_CONFIG_ERR_IRQ_MASK | + PL080_CONFIG_TC_IRQ_MASK | + tmp << PL080_CONFIG_FLOW_CONTROL_SHIFT; ret = pl08x_request_mux(plchan); if (ret < 0) { @@ -1884,6 +2276,11 @@ static void pl08x_ensure_on(struct pl08x_driver_data *pl08x) /* The Nomadik variant does not have the config register */ if (pl08x->vd->nomadik) return; + /* The FTDMAC020 variant does this in another register */ + if (pl08x->vd->ftdmac020) { + writel(PL080_CONFIG_ENABLE, pl08x->base + FTDMAC020_CSR); + return; + } writel(PL080_CONFIG_ENABLE, pl08x->base + PL080_CONFIG); } @@ -2310,7 +2707,7 @@ static inline int pl08x_of_probe(struct amba_device *adev, static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) { struct pl08x_driver_data *pl08x; - const struct vendor_data *vd = id->data; + struct vendor_data *vd = id->data; struct device_node *np = adev->dev.of_node; u32 tsfr_size; int ret = 0; @@ -2336,6 +2733,34 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) pl08x->adev = adev; pl08x->vd = vd; + pl08x->base = ioremap(adev->res.start, resource_size(&adev->res)); + if (!pl08x->base) { + ret = -ENOMEM; + goto out_no_ioremap; + } + + if (vd->ftdmac020) { + u32 val; + + val = readl(pl08x->base + FTDMAC020_REVISION); + dev_info(&pl08x->adev->dev, "FTDMAC020 %d.%d rel %d\n", + (val >> 16) & 0xff, (val >> 8) & 0xff, val & 0xff); + val = readl(pl08x->base + FTDMAC020_FEATURE); + dev_info(&pl08x->adev->dev, "FTDMAC020 %d channels, " + "%s built-in bridge, %s, %s linked lists\n", + (val >> 12) & 0x0f, + (val & BIT(10)) ? "no" : "has", + (val & BIT(9)) ? 
"AHB0 and AHB1" : "AHB0", + (val & BIT(8)) ? "supports" : "does not support"); + + /* Vendor data from feature register */ + if (!(val & BIT(8))) + dev_warn(&pl08x->adev->dev, + "linked lists not supported, required\n"); + vd->channels = (val >> 12) & 0x0f; + vd->dualmaster = !!(val & BIT(9)); + } + /* Initialize memcpy engine */ dma_cap_set(DMA_MEMCPY, pl08x->memcpy.cap_mask); pl08x->memcpy.dev = &adev->dev; @@ -2352,6 +2777,9 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) pl08x->memcpy.dst_addr_widths = PL80X_DMA_BUSWIDTHS; pl08x->memcpy.directions = BIT(DMA_MEM_TO_MEM); pl08x->memcpy.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT; + if (vd->ftdmac020) + pl08x->memcpy.copy_align = DMAENGINE_ALIGN_4_BYTES; + /* * Initialize slave engine, if the block has no signals, that means @@ -2422,19 +2850,18 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) goto out_no_lli_pool; } - pl08x->base = ioremap(adev->res.start, resource_size(&adev->res)); - if (!pl08x->base) { - ret = -ENOMEM; - goto out_no_ioremap; - } - /* Turn on the PL08x */ pl08x_ensure_on(pl08x); - /* Attach the interrupt handler */ - writel(0x000000FF, pl08x->base + PL080_ERR_CLEAR); + /* Clear any pending interrupts */ + if (vd->ftdmac020) + /* This variant has error IRQs in bits 16-19 */ + writel(0x0000FFFF, pl08x->base + PL080_ERR_CLEAR); + else + writel(0x000000FF, pl08x->base + PL080_ERR_CLEAR); writel(0x000000FF, pl08x->base + PL080_TC_CLEAR); + /* Attach the interrupt handler */ ret = request_irq(adev->irq[0], pl08x_irq, 0, DRIVER_NAME, pl08x); if (ret) { dev_err(&adev->dev, "%s failed to request interrupt %d\n", @@ -2455,7 +2882,25 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) ch->id = i; ch->base = pl08x->base + PL080_Cx_BASE(i); - ch->reg_config = ch->base + vd->config_offset; + if (vd->ftdmac020) { + /* FTDMA020 has a special channel busy register */ + ch->reg_busy = ch->base + FTDMAC020_CH_BUSY; + ch->reg_config = ch->base + FTDMAC020_CH_CFG; + ch->reg_control = ch->base + FTDMAC020_CH_CSR; + ch->reg_src = ch->base + FTDMAC020_CH_SRC_ADDR; + ch->reg_dst = ch->base + FTDMAC020_CH_DST_ADDR; + ch->reg_lli = ch->base + FTDMAC020_CH_LLP; + ch->ftdmac020 = true; + } else { + ch->reg_config = ch->base + vd->config_offset; + ch->reg_control = ch->base + PL080_CH_CONTROL; + ch->reg_src = ch->base + PL080_CH_SRC_ADDR; + ch->reg_dst = ch->base + PL080_CH_DST_ADDR; + ch->reg_lli = ch->base + PL080_CH_LLI; + } + if (vd->pl080s) + ch->pl080s = true; + spin_lock_init(&ch->lock); /* @@ -2537,11 +2982,11 @@ out_no_memcpy: out_no_phychans: free_irq(adev->irq[0], pl08x); out_no_irq: - iounmap(pl08x->base); -out_no_ioremap: dma_pool_destroy(pl08x->pool); out_no_lli_pool: out_no_platdata: + iounmap(pl08x->base); +out_no_ioremap: kfree(pl08x); out_no_pl08x: amba_release_regions(adev); @@ -2582,6 +3027,12 @@ static struct vendor_data vendor_pl081 = { .max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK, }; +static struct vendor_data vendor_ftdmac020 = { + .config_offset = PL080_CH_CONFIG, + .ftdmac020 = true, + .max_transfer_size = PL080_CONTROL_TRANSFER_SIZE_MASK, +}; + static struct amba_id pl08x_ids[] = { /* Samsung PL080S variant */ { @@ -2607,6 +3058,12 @@ static struct amba_id pl08x_ids[] = { .mask = 0x00ffffff, .data = &vendor_nomadik, }, + /* Faraday Technology FTDMAC020 */ + { + .id = 0x0003b080, + .mask = 0x000fffff, + .data = &vendor_ftdmac020, + }, { 0, 0 }, }; diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 
580b5323a717..10124c9f9db5 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -44,7 +44,14 @@ #define PL080_SYNC (0x34) -/* Per channel configuration registers */ +/* The Faraday Technology FTDMAC020 variant registers */ +#define FTDMAC020_CH_BUSY (0x20) +/* Identical to PL080_CONFIG */ +#define FTDMAC020_CSR (0x24) +/* Identical to PL080_SYNC */ +#define FTDMAC020_SYNC (0x2C) +#define FTDMAC020_REVISION (0x30) +#define FTDMAC020_FEATURE (0x34) /* Per channel configuration registers */ #define PL080_Cx_BASE(x) ((0x100 + (x * 0x20))) @@ -55,6 +62,13 @@ #define PL080_CH_CONFIG (0x10) #define PL080S_CH_CONTROL2 (0x10) #define PL080S_CH_CONFIG (0x14) +/* The Faraday FTDMAC020 derivative shuffles the registers around */ +#define FTDMAC020_CH_CSR (0x00) +#define FTDMAC020_CH_CFG (0x04) +#define FTDMAC020_CH_SRC_ADDR (0x08) +#define FTDMAC020_CH_DST_ADDR (0x0C) +#define FTDMAC020_CH_LLP (0x10) +#define FTDMAC020_CH_SIZE (0x14) #define PL080_LLI_ADDR_MASK (0x3fffffff << 2) #define PL080_LLI_ADDR_SHIFT (2) @@ -119,6 +133,73 @@ #define PL080_FLOW_PER2MEM_PER (0x6) #define PL080_FLOW_SRC2DST_SRC (0x7) +#define FTDMAC020_CH_CSR_TC_MSK BIT(31) +/* Later versions have a threshold in bits 24..26, */ +#define FTDMAC020_CH_CSR_FIFOTH_MSK (0x7 << 24) +#define FTDMAC020_CH_CSR_FIFOTH_SHIFT (24) +#define FTDMAC020_CH_CSR_CHPR1_MSK (0x3 << 22) +#define FTDMAC020_CH_CSR_PROT3 BIT(21) +#define FTDMAC020_CH_CSR_PROT2 BIT(20) +#define FTDMAC020_CH_CSR_PROT1 BIT(19) +#define FTDMAC020_CH_CSR_SRC_SIZE_MSK (0x7 << 16) +#define FTDMAC020_CH_CSR_SRC_SIZE_SHIFT (16) +#define FTDMAC020_CH_CSR_ABT BIT(15) +#define FTDMAC020_CH_CSR_SRC_WIDTH_MSK (0x7 << 11) +#define FTDMAC020_CH_CSR_SRC_WIDTH_SHIFT (11) +#define FTDMAC020_CH_CSR_DST_WIDTH_MSK (0x7 << 8) +#define FTDMAC020_CH_CSR_DST_WIDTH_SHIFT (8) +#define FTDMAC020_CH_CSR_MODE BIT(7) +/* 00 = increase, 01 = decrease, 10 = fix */ +#define FTDMAC020_CH_CSR_SRCAD_CTL_MSK (0x3 << 5) +#define FTDMAC020_CH_CSR_SRCAD_CTL_SHIFT (5) +#define FTDMAC020_CH_CSR_DSTAD_CTL_MSK (0x3 << 3) +#define FTDMAC020_CH_CSR_DSTAD_CTL_SHIFT (3) +#define FTDMAC020_CH_CSR_SRC_SEL BIT(2) +#define FTDMAC020_CH_CSR_DST_SEL BIT(1) +#define FTDMAC020_CH_CSR_EN BIT(0) + +/* FIFO threshold setting */ +#define FTDMAC020_CH_CSR_FIFOTH_1 (0x0) +#define FTDMAC020_CH_CSR_FIFOTH_2 (0x1) +#define FTDMAC020_CH_CSR_FIFOTH_4 (0x2) +#define FTDMAC020_CH_CSR_FIFOTH_8 (0x3) +#define FTDMAC020_CH_CSR_FIFOTH_16 (0x4) +/* The FTDMAC020 supports 64bit wide transfers */ +#define FTDMAC020_WIDTH_64BIT (0x3) +/* Address can be increased, decreased or fixed */ +#define FTDMAC020_CH_CSR_SRCAD_CTL_INC (0x0) +#define FTDMAC020_CH_CSR_SRCAD_CTL_DEC (0x1) +#define FTDMAC020_CH_CSR_SRCAD_CTL_FIXED (0x2) + +#define FTDMAC020_CH_CFG_LLP_CNT_MASK (0xf << 16) +#define FTDMAC020_CH_CFG_LLP_CNT_SHIFT (16) +#define FTDMAC020_CH_CFG_BUSY BIT(8) +#define FTDMAC020_CH_CFG_INT_ABT_MASK BIT(2) +#define FTDMAC020_CH_CFG_INT_ERR_MASK BIT(1) +#define FTDMAC020_CH_CFG_INT_TC_MASK BIT(0) + +/* Inside the LLIs, the applicable CSR fields are mapped differently */ +#define FTDMAC020_LLI_TC_MSK BIT(28) +#define FTDMAC020_LLI_SRC_WIDTH_MSK (0x7 << 25) +#define FTDMAC020_LLI_SRC_WIDTH_SHIFT (25) +#define FTDMAC020_LLI_DST_WIDTH_MSK (0x7 << 22) +#define FTDMAC020_LLI_DST_WIDTH_SHIFT (22) +#define FTDMAC020_LLI_SRCAD_CTL_MSK (0x3 << 20) +#define FTDMAC020_LLI_SRCAD_CTL_SHIFT (20) +#define FTDMAC020_LLI_DSTAD_CTL_MSK (0x3 << 18) +#define FTDMAC020_LLI_DSTAD_CTL_SHIFT (18) +#define FTDMAC020_LLI_SRC_SEL BIT(17) +#define 
FTDMAC020_LLI_DST_SEL BIT(16) +#define FTDMAC020_LLI_TRANSFER_SIZE_MASK (0xfff << 0) +#define FTDMAC020_LLI_TRANSFER_SIZE_SHIFT (0) + +#define FTDMAC020_CFG_LLP_CNT_MASK (0x0f << 16) +#define FTDMAC020_CFG_LLP_CNT_SHIFT (16) +#define FTDMAC020_CFG_BUSY BIT(8) +#define FTDMAC020_CFG_INT_ABT_MSK BIT(2) +#define FTDMAC020_CFG_INT_ERR_MSK BIT(1) +#define FTDMAC020_CFG_INT_TC_MSK BIT(0) + /* DMA linked list chain structure */ struct pl080_lli { -- cgit v1.2.3 From fcc785417fba2dc81d2f6ba888caaff463f4f441 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 20 May 2017 23:42:54 +0200 Subject: dmaengine: pl08x: use GENMASK() to create bitmasks This switches the arbitrary shifting of hex constants in the pl080 header to use GENMASK(). Suggested-by: Vinod Koul Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- include/linux/amba/pl080.h | 50 +++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 10124c9f9db5..ab036b6b1804 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -70,12 +70,12 @@ #define FTDMAC020_CH_LLP (0x10) #define FTDMAC020_CH_SIZE (0x14) -#define PL080_LLI_ADDR_MASK (0x3fffffff << 2) +#define PL080_LLI_ADDR_MASK GENMASK(31, 2) #define PL080_LLI_ADDR_SHIFT (2) #define PL080_LLI_LM_AHB2 BIT(0) #define PL080_CONTROL_TC_IRQ_EN BIT(31) -#define PL080_CONTROL_PROT_MASK (0x7 << 28) +#define PL080_CONTROL_PROT_MASK GENMASK(30, 28) #define PL080_CONTROL_PROT_SHIFT (28) #define PL080_CONTROL_PROT_CACHE BIT(30) #define PL080_CONTROL_PROT_BUFF BIT(29) @@ -84,16 +84,16 @@ #define PL080_CONTROL_SRC_INCR BIT(26) #define PL080_CONTROL_DST_AHB2 BIT(25) #define PL080_CONTROL_SRC_AHB2 BIT(24) -#define PL080_CONTROL_DWIDTH_MASK (0x7 << 21) +#define PL080_CONTROL_DWIDTH_MASK GENMASK(23, 21) #define PL080_CONTROL_DWIDTH_SHIFT (21) -#define PL080_CONTROL_SWIDTH_MASK (0x7 << 18) +#define PL080_CONTROL_SWIDTH_MASK GENMASK(20, 18) #define PL080_CONTROL_SWIDTH_SHIFT (18) -#define PL080_CONTROL_DB_SIZE_MASK (0x7 << 15) +#define PL080_CONTROL_DB_SIZE_MASK GENMASK(17, 15) #define PL080_CONTROL_DB_SIZE_SHIFT (15) -#define PL080_CONTROL_SB_SIZE_MASK (0x7 << 12) +#define PL080_CONTROL_SB_SIZE_MASK GENMASK(14, 12) #define PL080_CONTROL_SB_SIZE_SHIFT (12) -#define PL080_CONTROL_TRANSFER_SIZE_MASK (0xfff << 0) -#define PL080S_CONTROL_TRANSFER_SIZE_MASK (0x1ffffff << 0) +#define PL080_CONTROL_TRANSFER_SIZE_MASK GENMASK(11, 0) +#define PL080S_CONTROL_TRANSFER_SIZE_MASK GENMASK(24, 0) #define PL080_CONTROL_TRANSFER_SIZE_SHIFT (0) #define PL080_BSIZE_1 (0x0) @@ -116,11 +116,11 @@ #define PL080_CONFIG_LOCK BIT(16) #define PL080_CONFIG_TC_IRQ_MASK BIT(15) #define PL080_CONFIG_ERR_IRQ_MASK BIT(14) -#define PL080_CONFIG_FLOW_CONTROL_MASK (0x7 << 11) +#define PL080_CONFIG_FLOW_CONTROL_MASK GENMASK(13, 11) #define PL080_CONFIG_FLOW_CONTROL_SHIFT (11) -#define PL080_CONFIG_DST_SEL_MASK (0xf << 6) +#define PL080_CONFIG_DST_SEL_MASK GENMASK(9, 6) #define PL080_CONFIG_DST_SEL_SHIFT (6) -#define PL080_CONFIG_SRC_SEL_MASK (0xf << 1) +#define PL080_CONFIG_SRC_SEL_MASK GENMASK(4, 1) #define PL080_CONFIG_SRC_SEL_SHIFT (1) #define PL080_CONFIG_ENABLE BIT(0) @@ -135,24 +135,24 @@ #define FTDMAC020_CH_CSR_TC_MSK BIT(31) /* Later versions have a threshold in bits 24..26, */ -#define FTDMAC020_CH_CSR_FIFOTH_MSK (0x7 << 24) +#define FTDMAC020_CH_CSR_FIFOTH_MSK GENMASK(26, 24) #define FTDMAC020_CH_CSR_FIFOTH_SHIFT (24) -#define FTDMAC020_CH_CSR_CHPR1_MSK (0x3 << 22) +#define 
FTDMAC020_CH_CSR_CHPR1_MSK GENMASK(23, 22) #define FTDMAC020_CH_CSR_PROT3 BIT(21) #define FTDMAC020_CH_CSR_PROT2 BIT(20) #define FTDMAC020_CH_CSR_PROT1 BIT(19) -#define FTDMAC020_CH_CSR_SRC_SIZE_MSK (0x7 << 16) +#define FTDMAC020_CH_CSR_SRC_SIZE_MSK GENMASK(18, 16) #define FTDMAC020_CH_CSR_SRC_SIZE_SHIFT (16) #define FTDMAC020_CH_CSR_ABT BIT(15) -#define FTDMAC020_CH_CSR_SRC_WIDTH_MSK (0x7 << 11) +#define FTDMAC020_CH_CSR_SRC_WIDTH_MSK GENMASK(13, 11) #define FTDMAC020_CH_CSR_SRC_WIDTH_SHIFT (11) -#define FTDMAC020_CH_CSR_DST_WIDTH_MSK (0x7 << 8) +#define FTDMAC020_CH_CSR_DST_WIDTH_MSK GENMASK(10, 8) #define FTDMAC020_CH_CSR_DST_WIDTH_SHIFT (8) #define FTDMAC020_CH_CSR_MODE BIT(7) /* 00 = increase, 01 = decrease, 10 = fix */ -#define FTDMAC020_CH_CSR_SRCAD_CTL_MSK (0x3 << 5) +#define FTDMAC020_CH_CSR_SRCAD_CTL_MSK GENMASK(6, 5) #define FTDMAC020_CH_CSR_SRCAD_CTL_SHIFT (5) -#define FTDMAC020_CH_CSR_DSTAD_CTL_MSK (0x3 << 3) +#define FTDMAC020_CH_CSR_DSTAD_CTL_MSK GENMASK(4, 3) #define FTDMAC020_CH_CSR_DSTAD_CTL_SHIFT (3) #define FTDMAC020_CH_CSR_SRC_SEL BIT(2) #define FTDMAC020_CH_CSR_DST_SEL BIT(1) @@ -171,7 +171,7 @@ #define FTDMAC020_CH_CSR_SRCAD_CTL_DEC (0x1) #define FTDMAC020_CH_CSR_SRCAD_CTL_FIXED (0x2) -#define FTDMAC020_CH_CFG_LLP_CNT_MASK (0xf << 16) +#define FTDMAC020_CH_CFG_LLP_CNT_MASK GENMASK(19, 16) #define FTDMAC020_CH_CFG_LLP_CNT_SHIFT (16) #define FTDMAC020_CH_CFG_BUSY BIT(8) #define FTDMAC020_CH_CFG_INT_ABT_MASK BIT(2) @@ -180,20 +180,20 @@ /* Inside the LLIs, the applicable CSR fields are mapped differently */ #define FTDMAC020_LLI_TC_MSK BIT(28) -#define FTDMAC020_LLI_SRC_WIDTH_MSK (0x7 << 25) +#define FTDMAC020_LLI_SRC_WIDTH_MSK GENMASK(27, 25) #define FTDMAC020_LLI_SRC_WIDTH_SHIFT (25) -#define FTDMAC020_LLI_DST_WIDTH_MSK (0x7 << 22) +#define FTDMAC020_LLI_DST_WIDTH_MSK GENMASK(24, 22) #define FTDMAC020_LLI_DST_WIDTH_SHIFT (22) -#define FTDMAC020_LLI_SRCAD_CTL_MSK (0x3 << 20) +#define FTDMAC020_LLI_SRCAD_CTL_MSK GENMASK(21, 20) #define FTDMAC020_LLI_SRCAD_CTL_SHIFT (20) -#define FTDMAC020_LLI_DSTAD_CTL_MSK (0x3 << 18) +#define FTDMAC020_LLI_DSTAD_CTL_MSK GENMASK(19, 18) #define FTDMAC020_LLI_DSTAD_CTL_SHIFT (18) #define FTDMAC020_LLI_SRC_SEL BIT(17) #define FTDMAC020_LLI_DST_SEL BIT(16) -#define FTDMAC020_LLI_TRANSFER_SIZE_MASK (0xfff << 0) +#define FTDMAC020_LLI_TRANSFER_SIZE_MASK GENMASK(11, 0) #define FTDMAC020_LLI_TRANSFER_SIZE_SHIFT (0) -#define FTDMAC020_CFG_LLP_CNT_MASK (0x0f << 16) +#define FTDMAC020_CFG_LLP_CNT_MASK GENMASK(19, 16) #define FTDMAC020_CFG_LLP_CNT_SHIFT (16) #define FTDMAC020_CFG_BUSY BIT(8) #define FTDMAC020_CFG_INT_ABT_MSK BIT(2) -- cgit v1.2.3 From 702fce05f5ad0e2af2c00d5ef41356ffdd4a3a56 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 22 May 2017 16:01:48 +0530 Subject: dmaengine: DW DMAC: Handle return value of clk_prepare_enable clk_prepare_enable() can fail here and we must check its return value. 
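As a minimal sketch of the pattern being applied (placeholder foo_ names, not the driver's own symbols), a resume-style callback should propagate a clock-enable failure instead of continuing, since the controller registers cannot be touched if the clock never came back up:

	static int foo_resume_early(struct device *dev)
	{
		struct foo_chip *chip = dev_get_drvdata(dev);
		int ret;

		/* Bail out early if the clock cannot be re-enabled */
		ret = clk_prepare_enable(chip->clk);
		if (ret)
			return ret;

		return foo_enable(chip);
	}

The diff below applies exactly this kind of check to the DW DMAC early-resume path.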
Signed-off-by: Arvind Yadav Acked-by: Viresh Kumar Acked-by: Andy Shevchenko Signed-off-by: Vinod Koul --- drivers/dma/dw/platform.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index c639c60b825a..bc31fe802061 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -306,8 +306,12 @@ static int dw_resume_early(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct dw_dma_chip *chip = platform_get_drvdata(pdev); + int ret; + + ret = clk_prepare_enable(chip->clk); + if (ret) + return ret; - clk_prepare_enable(chip->clk); return dw_dma_enable(chip); } -- cgit v1.2.3 From 4d6d74e22096543cb3b35e717cf1b9aea3655f37 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 19 May 2017 15:06:44 +0100 Subject: dmaengine: pl330: Add IOMMU support to slave tranfers Wire up dma_map_resource() for slave transfers, so that we can let the PL330 use IOMMU-backed DMA mapping ops on systems with an appropriate IOMMU and RAM above 4GB, to avoid CPU bounce buffering. Signed-off-by: Robin Murphy Signed-off-by: Vinod Koul --- drivers/dma/pl330.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 8b0da7fa520d..a9451a14ad07 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -443,7 +443,10 @@ struct dma_pl330_chan { /* For D-to-M and M-to-D channels */ int burst_sz; /* the peripheral fifo width */ int burst_len; /* the number of burst */ - dma_addr_t fifo_addr; + phys_addr_t fifo_addr; + /* DMA-mapped view of the FIFO; may differ if an IOMMU is present */ + dma_addr_t fifo_dma; + enum dma_data_direction dir; /* for cyclic capability */ bool cyclic; @@ -2120,11 +2123,60 @@ static int pl330_alloc_chan_resources(struct dma_chan *chan) return 1; } +/* + * We need the data direction between the DMAC (the dma-mapping "device") and + * the FIFO (the dmaengine "dev"), from the FIFO's point of view. Confusing! + */ +static enum dma_data_direction +pl330_dma_slave_map_dir(enum dma_transfer_direction dir) +{ + switch (dir) { + case DMA_MEM_TO_DEV: + return DMA_FROM_DEVICE; + case DMA_DEV_TO_MEM: + return DMA_TO_DEVICE; + case DMA_DEV_TO_DEV: + return DMA_BIDIRECTIONAL; + default: + return DMA_NONE; + } +} + +static void pl330_unprep_slave_fifo(struct dma_pl330_chan *pch) +{ + if (pch->dir != DMA_NONE) + dma_unmap_resource(pch->chan.device->dev, pch->fifo_dma, + 1 << pch->burst_sz, pch->dir, 0); + pch->dir = DMA_NONE; +} + + +static bool pl330_prep_slave_fifo(struct dma_pl330_chan *pch, + enum dma_transfer_direction dir) +{ + struct device *dev = pch->chan.device->dev; + enum dma_data_direction dma_dir = pl330_dma_slave_map_dir(dir); + + /* Already mapped for this config? 
*/ + if (pch->dir == dma_dir) + return true; + + pl330_unprep_slave_fifo(pch); + pch->fifo_dma = dma_map_resource(dev, pch->fifo_addr, + 1 << pch->burst_sz, dma_dir, 0); + if (dma_mapping_error(dev, pch->fifo_dma)) + return false; + + pch->dir = dma_dir; + return true; +} + static int pl330_config(struct dma_chan *chan, struct dma_slave_config *slave_config) { struct dma_pl330_chan *pch = to_pchan(chan); + pl330_unprep_slave_fifo(pch); if (slave_config->direction == DMA_MEM_TO_DEV) { if (slave_config->dst_addr) pch->fifo_addr = slave_config->dst_addr; @@ -2235,6 +2287,7 @@ static void pl330_free_chan_resources(struct dma_chan *chan) spin_unlock_irqrestore(&pl330->lock, flags); pm_runtime_mark_last_busy(pch->dmac->ddma.dev); pm_runtime_put_autosuspend(pch->dmac->ddma.dev); + pl330_unprep_slave_fifo(pch); } static int pl330_get_current_xferred_count(struct dma_pl330_chan *pch, @@ -2564,6 +2617,9 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic( return NULL; } + if (!pl330_prep_slave_fifo(pch, direction)) + return NULL; + for (i = 0; i < len / period_len; i++) { desc = pl330_get_desc(pch); if (!desc) { @@ -2593,12 +2649,12 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic( desc->rqcfg.src_inc = 1; desc->rqcfg.dst_inc = 0; src = dma_addr; - dst = pch->fifo_addr; + dst = pch->fifo_dma; break; case DMA_DEV_TO_MEM: desc->rqcfg.src_inc = 0; desc->rqcfg.dst_inc = 1; - src = pch->fifo_addr; + src = pch->fifo_dma; dst = dma_addr; break; default: @@ -2711,12 +2767,12 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct dma_pl330_chan *pch = to_pchan(chan); struct scatterlist *sg; int i; - dma_addr_t addr; if (unlikely(!pch || !sgl || !sg_len)) return NULL; - addr = pch->fifo_addr; + if (!pl330_prep_slave_fifo(pch, direction)) + return NULL; first = NULL; @@ -2742,13 +2798,13 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, if (direction == DMA_MEM_TO_DEV) { desc->rqcfg.src_inc = 1; desc->rqcfg.dst_inc = 0; - fill_px(&desc->px, - addr, sg_dma_address(sg), sg_dma_len(sg)); + fill_px(&desc->px, pch->fifo_dma, sg_dma_address(sg), + sg_dma_len(sg)); } else { desc->rqcfg.src_inc = 0; desc->rqcfg.dst_inc = 1; - fill_px(&desc->px, - sg_dma_address(sg), addr, sg_dma_len(sg)); + fill_px(&desc->px, sg_dma_address(sg), pch->fifo_dma, + sg_dma_len(sg)); } desc->rqcfg.brst_size = pch->burst_sz; @@ -2906,6 +2962,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) pch->thread = NULL; pch->chan.device = pd; pch->dmac = pl330; + pch->dir = DMA_NONE; /* Add the channel to the DMAC list */ list_add_tail(&pch->chan.device_node, &pd->channels); -- cgit v1.2.3 From fb9caf370f4d0457789d13a1a1b110a8db846e5e Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 24 May 2017 12:09:53 +0530 Subject: dmaengine: imx-sdma: Handle return value of clk_prepare_enable clk_prepare_enable() can fail here and we must check its return value. 
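Beyond checking the return values, the change also unwinds any clock that was already prepared on every later failure path in probe. A minimal sketch of that goto ordering, with foo_ placeholders rather than the driver's names:

	static int foo_probe_clocks(struct foo_engine *foo)
	{
		int ret;

		ret = clk_prepare(foo->clk_ipg);
		if (ret)
			return ret;

		ret = clk_prepare(foo->clk_ahb);
		if (ret)
			goto err_unprepare_ipg;

		return 0;

	err_unprepare_ipg:
		clk_unprepare(foo->clk_ipg);
		return ret;
	}

The remove path then mirrors this by unpreparing both clocks, as the diff below does.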
Signed-off-by: Arvind Yadav Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 085993cb2ccc..83d892c438cd 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1755,19 +1755,26 @@ static int sdma_probe(struct platform_device *pdev) if (IS_ERR(sdma->clk_ahb)) return PTR_ERR(sdma->clk_ahb); - clk_prepare(sdma->clk_ipg); - clk_prepare(sdma->clk_ahb); + ret = clk_prepare(sdma->clk_ipg); + if (ret) + return ret; + + ret = clk_prepare(sdma->clk_ahb); + if (ret) + goto err_clk; ret = devm_request_irq(&pdev->dev, irq, sdma_int_handler, 0, "sdma", sdma); if (ret) - return ret; + goto err_irq; sdma->irq = irq; sdma->script_addrs = kzalloc(sizeof(*sdma->script_addrs), GFP_KERNEL); - if (!sdma->script_addrs) - return -ENOMEM; + if (!sdma->script_addrs) { + ret = -ENOMEM; + goto err_irq; + } /* initially no scripts available */ saddr_arr = (s32 *)sdma->script_addrs; @@ -1882,6 +1889,10 @@ err_register: dma_async_device_unregister(&sdma->dma_device); err_init: kfree(sdma->script_addrs); +err_irq: + clk_unprepare(sdma->clk_ahb); +err_clk: + clk_unprepare(sdma->clk_ipg); return ret; } @@ -1893,6 +1904,8 @@ static int sdma_remove(struct platform_device *pdev) devm_free_irq(&pdev->dev, sdma->irq, sdma); dma_async_device_unregister(&sdma->dma_device); kfree(sdma->script_addrs); + clk_unprepare(sdma->clk_ahb); + clk_unprepare(sdma->clk_ipg); /* Kill the tasklet */ for (i = 0; i < MAX_DMA_CHANNELS; i++) { struct sdma_channel *sdmac = &sdma->channel[i]; -- cgit v1.2.3 From ba6ab3b359c86a762f621de8fc6ba5a229aef23f Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 24 May 2017 12:19:06 +0530 Subject: dmaengine: imx-sdma: Fix compilation warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace '%d' by '%zu' to fix the following compilation warning:- drivers/dma/imx-sdma.c: In function ‘sdma_prep_dma_cyclic’: drivers/dma/imx-sdma.c:1327:5: warning: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘size_t’ [-Wformat=] channel, period_len, 0xffff); ^ drivers/dma/imx-sdma.c:1350:3: warning: format ‘%d’ expects argument of type ‘int’, but argument 5 has type ‘size_t’ [-Wformat=] dev_dbg(sdma->dev, "entry %d: count: %d dma: %#llx %s%s\n", Signed-off-by: Arvind Yadav Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 83d892c438cd..a67ec1bdc4e0 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1323,7 +1323,7 @@ static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic( } if (period_len > 0xffff) { - dev_err(sdma->dev, "SDMA channel %d: maximum period size exceeded: %d > %d\n", + dev_err(sdma->dev, "SDMA channel %d: maximum period size exceeded: %zu > %d\n", channel, period_len, 0xffff); goto err_out; } @@ -1347,7 +1347,7 @@ static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic( if (i + 1 == num_periods) param |= BD_WRAP; - dev_dbg(sdma->dev, "entry %d: count: %d dma: %#llx %s%s\n", + dev_dbg(sdma->dev, "entry %d: count: %zu dma: %#llx %s%s\n", i, period_len, (u64)dma_addr, param & BD_WRAP ? "wrap" : "", param & BD_INTR ? 
" intr" : ""); -- cgit v1.2.3 From 33cb0880e0544c1553d37f4877fbd7a7a84f444f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 28 May 2017 11:30:44 +0200 Subject: dmaengine: use proper name for the R-Car SoC It is 'R-Car', not 'r-car'. No code or binding changes, only descriptive text. Signed-off-by: Wolfram Sang Acked-by: Geert Uytterhoeven Acked-by: Rob Herring Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/shdma.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/dma/shdma.txt b/Documentation/devicetree/bindings/dma/shdma.txt index 2a3f3b8946b9..a91920a49433 100644 --- a/Documentation/devicetree/bindings/dma/shdma.txt +++ b/Documentation/devicetree/bindings/dma/shdma.txt @@ -1,6 +1,6 @@ * SHDMA Device Tree bindings -Sh-/r-mobile and r-car systems often have multiple identical DMA controller +Sh-/r-mobile and R-Car systems often have multiple identical DMA controller instances, capable of serving any of a common set of DMA slave devices, using the same configuration. To describe this topology we require all compatible SHDMA DT nodes to be placed under a DMA multiplexer node. All such compatible -- cgit v1.2.3 From ce81801372313805aa649ea18d27b2fb5f6c16b5 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Tue, 30 May 2017 16:39:16 -0600 Subject: dmaengine: imx-dma: cleanup scatterlist layering violations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This dma engine driver directly accesses page_link assuming knowledge that should be contained only in scatterlist.h. We replace these with calls to sg_chain and sg_assign_page. Signed-off-by: Logan Gunthorpe Signed-off-by: Stephen Bates Acked-by: Linus Walleij Cc: Dan Williams Cc: Vinod Koul Cc: Per Förlin Signed-off-by: Vinod Koul --- drivers/dma/imx-dma.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index ab0fb804fb1e..f681df8f0ed3 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -888,7 +888,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic( sg_init_table(imxdmac->sg_list, periods); for (i = 0; i < periods; i++) { - imxdmac->sg_list[i].page_link = 0; + sg_assign_page(&imxdmac->sg_list[i], NULL); imxdmac->sg_list[i].offset = 0; imxdmac->sg_list[i].dma_address = dma_addr; sg_dma_len(&imxdmac->sg_list[i]) = period_len; @@ -896,10 +896,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic( } /* close the loop */ - imxdmac->sg_list[periods].offset = 0; - sg_dma_len(&imxdmac->sg_list[periods]) = 0; - imxdmac->sg_list[periods].page_link = - ((unsigned long)imxdmac->sg_list | 0x01) & ~0x02; + sg_chain(imxdmac->sg_list, periods + 1, imxdmac->sg_list); desc->type = IMXDMA_DESC_CYCLIC; desc->sg = imxdmac->sg_list; -- cgit v1.2.3 From 838b56adab18040657ec40322fd01d6ebf312ee7 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Tue, 30 May 2017 16:39:17 -0600 Subject: dmaengine: ste_dma40: Cleanup scatterlist layering violations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This dma engine driver directly accesses page_link assuming knowledge that should be contained only in scatterlist.h. We replace this access with a call to sg_chain which is equivalent. 
Signed-off-by: Logan Gunthorpe Signed-off-by: Stephen Bates Acked-by: Linus Walleij Cc: Dan Williams Cc: Vinod Koul Cc: Per Förlin Signed-off-by: Vinod Koul --- drivers/dma/ste_dma40.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index a6620b671d1d..c3052fbfd092 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -2528,10 +2528,7 @@ dma40_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr, dma_addr += period_len; } - sg[periods].offset = 0; - sg_dma_len(&sg[periods]) = 0; - sg[periods].page_link = - ((unsigned long)sg | 0x01) & ~0x02; + sg_chain(sg, periods + 1, sg); txd = d40_prep_sg(chan, sg, sg, periods, direction, DMA_PREP_INTERRUPT); -- cgit v1.2.3 From 4aff2f9355a0b4480e56b7e349fec69f17e45eb9 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 11 Jun 2017 12:03:11 -0300 Subject: dmaengine: mxs: Use %zu for printing a size_t variable Use %zu for printing a size_t variable in order to fix the following build warning: drivers/dma/mxs-dma.c: In function 'mxs_dma_prep_dma_cyclic': drivers/dma/mxs-dma.c:621:5: warning: format '%d' expects argument of type 'int', but argument 3 has type 'size_t' [-Wformat] Reported-by: kbuild test robot Signed-off-by: Fabio Estevam Signed-off-by: Vinod Koul --- drivers/dma/mxs-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index e217268c7098..41d167921fab 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -617,7 +617,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic( if (period_len > MAX_XFER_BYTES) { dev_err(mxs_dma->dma_device.dev, - "maximum period size exceeded: %d > %d\n", + "maximum period size exceeded: %zu > %d\n", period_len, MAX_XFER_BYTES); goto err_out; } -- cgit v1.2.3 From d762e4f35601239cbebfbfd43d99876d8f220927 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 8 Jun 2017 19:46:23 -0300 Subject: dmaengine: Kconfig: Extend the dependency for MXS_DMA Currently it is not possible to select the mxs dma driver when only mx6sx or mx7 are selected. Extend the dependency to allow the mxs dma driver to be built whenever ARCH_MXS or ARCH_MXC is selected. This has the benefit to avoid having to add new entries in the MXS_DMA Kconfig everytime a new i.MX SoC shows up and it also makes it consistent with the other i.MX DMA engines, such as IMX_DMA and IMX_SDMA. While at it, also pass COMPILE_TEST for increasing the build coverage. Acked-by: Stefan Agner Signed-off-by: Fabio Estevam Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 24e8597b2c3e..af08799a1f6d 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -354,7 +354,7 @@ config MV_XOR_V2 config MXS_DMA bool "MXS DMA support" - depends on SOC_IMX23 || SOC_IMX28 || SOC_IMX6Q || SOC_IMX6UL + depends on ARCH_MXS || ARCH_MXC || COMPILE_TEST select STMP_DEVICE select DMA_ENGINE help -- cgit v1.2.3 From 036e9ef8becde736e693be4f4bef56d5b56fc298 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 13 Jun 2017 12:56:34 -0400 Subject: dmaengine: Replace WARN_TAINT_ONCE() with pr_warn_once() The WARN_TAINT_ONCE() prints out a loud stack trace on broken BIOSes. The systems that have this problem are several years out of support and no longer have BIOS updates available. The stack trace isn't necessary and a pr_warn_once() will do. 
Change WARN_TAINT_ONCE() to pr_warn_once() and taint. Signed-off-by: Prarit Bhargava Cc: Dan Williams Cc: Vinod Koul Cc: Duyck, Alexander H Signed-off-by: Vinod Koul --- drivers/dma/ioat/dca.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/ioat/dca.c b/drivers/dma/ioat/dca.c index 0b9b6b07db9e..eab2fdda29ec 100644 --- a/drivers/dma/ioat/dca.c +++ b/drivers/dma/ioat/dca.c @@ -336,10 +336,10 @@ struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase) } if (dca3_tag_map_invalid(ioatdca->tag_map)) { - WARN_TAINT_ONCE(1, TAINT_FIRMWARE_WORKAROUND, - "%s %s: APICID_TAG_MAP set incorrectly by BIOS, disabling DCA\n", - dev_driver_string(&pdev->dev), - dev_name(&pdev->dev)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + pr_warn_once("%s %s: APICID_TAG_MAP set incorrectly by BIOS, disabling DCA\n", + dev_driver_string(&pdev->dev), + dev_name(&pdev->dev)); free_dca_provider(dca); return NULL; } -- cgit v1.2.3 From d43674ecc002b49926f216cb414cff2d230ca3fb Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 15 Jun 2017 16:55:57 -0700 Subject: dmaengine: pl330: Delete unused functions The functions _queue_empty(), _emit_ADDH(), _emit_NOP(), _emit_STZ() and _emit_WFE() are not used. Delete them. Signed-off-by: Matthias Kaehlcke Signed-off-by: Vinod Koul --- drivers/dma/pl330.c | 67 ----------------------------------------------------- 1 file changed, 67 deletions(-) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index a9451a14ad07..311ee107e5bf 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -541,11 +541,6 @@ struct _xfer_spec { struct dma_pl330_desc *desc; }; -static inline bool _queue_empty(struct pl330_thread *thrd) -{ - return thrd->req[0].desc == NULL && thrd->req[1].desc == NULL; -} - static inline bool _queue_full(struct pl330_thread *thrd) { return thrd->req[0].desc != NULL && thrd->req[1].desc != NULL; @@ -567,23 +562,6 @@ static inline u32 get_revision(u32 periph_id) return (periph_id >> PERIPH_REV_SHIFT) & PERIPH_REV_MASK; } -static inline u32 _emit_ADDH(unsigned dry_run, u8 buf[], - enum pl330_dst da, u16 val) -{ - if (dry_run) - return SZ_DMAADDH; - - buf[0] = CMD_DMAADDH; - buf[0] |= (da << 1); - buf[1] = val; - buf[2] = val >> 8; - - PL330_DBGCMD_DUMP(SZ_DMAADDH, "\tDMAADDH %s %u\n", - da == 1 ? "DA" : "SA", val); - - return SZ_DMAADDH; -} - static inline u32 _emit_END(unsigned dry_run, u8 buf[]) { if (dry_run) @@ -741,18 +719,6 @@ static inline u32 _emit_MOV(unsigned dry_run, u8 buf[], return SZ_DMAMOV; } -static inline u32 _emit_NOP(unsigned dry_run, u8 buf[]) -{ - if (dry_run) - return SZ_DMANOP; - - buf[0] = CMD_DMANOP; - - PL330_DBGCMD_DUMP(SZ_DMANOP, "\tDMANOP\n"); - - return SZ_DMANOP; -} - static inline u32 _emit_RMB(unsigned dry_run, u8 buf[]) { if (dry_run) @@ -820,39 +786,6 @@ static inline u32 _emit_STP(unsigned dry_run, u8 buf[], return SZ_DMASTP; } -static inline u32 _emit_STZ(unsigned dry_run, u8 buf[]) -{ - if (dry_run) - return SZ_DMASTZ; - - buf[0] = CMD_DMASTZ; - - PL330_DBGCMD_DUMP(SZ_DMASTZ, "\tDMASTZ\n"); - - return SZ_DMASTZ; -} - -static inline u32 _emit_WFE(unsigned dry_run, u8 buf[], u8 ev, - unsigned invalidate) -{ - if (dry_run) - return SZ_DMAWFE; - - buf[0] = CMD_DMAWFE; - - ev &= 0x1f; - ev <<= 3; - buf[1] = ev; - - if (invalidate) - buf[1] |= (1 << 1); - - PL330_DBGCMD_DUMP(SZ_DMAWFE, "\tDMAWFE %u%s\n", - ev >> 3, invalidate ? 
", I" : ""); - - return SZ_DMAWFE; -} - static inline u32 _emit_WFP(unsigned dry_run, u8 buf[], enum pl330_cond cond, u8 peri) { -- cgit v1.2.3 From 2446563c74f8c059fd4da4abeaa537f09033b5ec Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 16 Jun 2017 11:58:46 -0300 Subject: dmaengine: Kconfig: Simplify the help text for MXS_DMA Currently the help text for the MXS_DMA option is incomplete as it does not mention MX6SX, MX6ULL and MX7D, for example. Instead of extending this list everytime a new SoC comes out, let's keep the text more generic. Signed-off-by: Fabio Estevam Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index af08799a1f6d..098215806f14 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -359,8 +359,7 @@ config MXS_DMA select DMA_ENGINE help Support the MXS DMA engine. This engine including APBH-DMA - and APBX-DMA is integrated into Freescale - i.MX23/28/MX6Q/MX6DL/MX6UL chips. + and APBX-DMA is integrated into some Freescale chips. config MX3_IPU bool "MX3x Image Processing Unit support" -- cgit v1.2.3 From ccc077292733f3143b444255fa5ec49a8ff2763b Mon Sep 17 00:00:00 2001 From: Thomas Breitung Date: Mon, 19 Jun 2017 16:40:04 +0200 Subject: dmaengine: fsldma: set BWC, DAHTS and SAHTS values correctly The bits of BWC, DAHTS and SAHTS in the DMA mode register must be cleared before a new value can be or-ed in. Signed-off-by: Thomas Breitung Signed-off-by: Wolfgang Ocker Signed-off-by: Vinod Koul --- drivers/dma/fsldma.c | 5 ++++- drivers/dma/fsldma.h | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 51c75bf2b9b6..3b8b752ede2d 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -269,6 +269,7 @@ static void fsl_chan_set_src_loop_size(struct fsldma_chan *chan, int size) case 2: case 4: case 8: + mode &= ~FSL_DMA_MR_SAHTS_MASK; mode |= FSL_DMA_MR_SAHE | (__ilog2(size) << 14); break; } @@ -301,6 +302,7 @@ static void fsl_chan_set_dst_loop_size(struct fsldma_chan *chan, int size) case 2: case 4: case 8: + mode &= ~FSL_DMA_MR_DAHTS_MASK; mode |= FSL_DMA_MR_DAHE | (__ilog2(size) << 16); break; } @@ -327,7 +329,8 @@ static void fsl_chan_set_request_count(struct fsldma_chan *chan, int size) BUG_ON(size > 1024); mode = get_mr(chan); - mode |= (__ilog2(size) << 24) & 0x0f000000; + mode &= ~FSL_DMA_MR_BWC_MASK; + mode |= (__ilog2(size) << 24) & FSL_DMA_MR_BWC_MASK; set_mr(chan, mode); } diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h index 31bffccdcc75..4787d485dd76 100644 --- a/drivers/dma/fsldma.h +++ b/drivers/dma/fsldma.h @@ -36,6 +36,10 @@ #define FSL_DMA_MR_DAHE 0x00002000 #define FSL_DMA_MR_SAHE 0x00001000 +#define FSL_DMA_MR_SAHTS_MASK 0x0000C000 +#define FSL_DMA_MR_DAHTS_MASK 0x00030000 +#define FSL_DMA_MR_BWC_MASK 0x0f000000 + /* * Bandwidth/pause control determines how many bytes a given * channel is allowed to transfer before the DMA engine pauses -- cgit v1.2.3 From 13058e33040819c1e7da070769a3a0197dc9cdf1 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Thu, 22 Jun 2017 14:59:22 -0400 Subject: dmaengine: qcom_hidma: allow ACPI/DT parameters to be overridden Parameters like maximum read/write request size and the maximum number of active transactions are currently configured in DT/ACPI. This patch allows a user to override these to fine tune performance for their application. 
Signed-off-by: Sinan Kaya Signed-off-by: Vinod Koul --- drivers/dma/qcom/hidma.c | 7 +++++-- drivers/dma/qcom/hidma_mgmt.c | 47 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/dma/qcom/hidma.c b/drivers/dma/qcom/hidma.c index 5072a7d306d4..84e3699a19bd 100644 --- a/drivers/dma/qcom/hidma.c +++ b/drivers/dma/qcom/hidma.c @@ -1,7 +1,7 @@ /* * Qualcomm Technologies HIDMA DMA engine interface * - * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved. + * Copyright (c) 2015-2017, The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -795,8 +795,11 @@ static int hidma_probe(struct platform_device *pdev) device_property_read_u32(&pdev->dev, "desc-count", &dmadev->nr_descriptors); - if (!dmadev->nr_descriptors && nr_desc_prm) + if (nr_desc_prm) { + dev_info(&pdev->dev, "overriding number of descriptors as %d\n", + nr_desc_prm); dmadev->nr_descriptors = nr_desc_prm; + } if (!dmadev->nr_descriptors) dmadev->nr_descriptors = HIDMA_NR_DEFAULT_DESC; diff --git a/drivers/dma/qcom/hidma_mgmt.c b/drivers/dma/qcom/hidma_mgmt.c index f847d32cc4b5..5a0991bc4787 100644 --- a/drivers/dma/qcom/hidma_mgmt.c +++ b/drivers/dma/qcom/hidma_mgmt.c @@ -1,7 +1,7 @@ /* * Qualcomm Technologies HIDMA DMA engine Management interface * - * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved. + * Copyright (c) 2015-2017, The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -49,6 +49,26 @@ #define HIDMA_AUTOSUSPEND_TIMEOUT 2000 #define HIDMA_MAX_CHANNEL_WEIGHT 15 +static unsigned int max_write_request; +module_param(max_write_request, uint, 0644); +MODULE_PARM_DESC(max_write_request, + "maximum write burst (default: ACPI/DT value)"); + +static unsigned int max_read_request; +module_param(max_read_request, uint, 0644); +MODULE_PARM_DESC(max_read_request, + "maximum read burst (default: ACPI/DT value)"); + +static unsigned int max_wr_xactions; +module_param(max_wr_xactions, uint, 0644); +MODULE_PARM_DESC(max_wr_xactions, + "maximum number of write transactions (default: ACPI/DT value)"); + +static unsigned int max_rd_xactions; +module_param(max_rd_xactions, uint, 0644); +MODULE_PARM_DESC(max_rd_xactions, + "maximum number of read transactions (default: ACPI/DT value)"); + int hidma_mgmt_setup(struct hidma_mgmt_dev *mgmtdev) { unsigned int i; @@ -207,12 +227,25 @@ static int hidma_mgmt_probe(struct platform_device *pdev) goto out; } + if (max_write_request) { + dev_info(&pdev->dev, "overriding max-write-burst-bytes: %d\n", + max_write_request); + mgmtdev->max_write_request = max_write_request; + } else + max_write_request = mgmtdev->max_write_request; + rc = device_property_read_u32(&pdev->dev, "max-read-burst-bytes", &mgmtdev->max_read_request); if (rc) { dev_err(&pdev->dev, "max-read-burst-bytes missing\n"); goto out; } + if (max_read_request) { + dev_info(&pdev->dev, "overriding max-read-burst-bytes: %d\n", + max_read_request); + mgmtdev->max_read_request = max_read_request; + } else + max_read_request = mgmtdev->max_read_request; rc = device_property_read_u32(&pdev->dev, "max-write-transactions", &mgmtdev->max_wr_xactions); @@ -220,6 +253,12 @@ static int hidma_mgmt_probe(struct platform_device *pdev) dev_err(&pdev->dev, "max-write-transactions missing\n"); goto 
out; } + if (max_wr_xactions) { + dev_info(&pdev->dev, "overriding max-write-transactions: %d\n", + max_wr_xactions); + mgmtdev->max_wr_xactions = max_wr_xactions; + } else + max_wr_xactions = mgmtdev->max_wr_xactions; rc = device_property_read_u32(&pdev->dev, "max-read-transactions", &mgmtdev->max_rd_xactions); @@ -227,6 +266,12 @@ static int hidma_mgmt_probe(struct platform_device *pdev) dev_err(&pdev->dev, "max-read-transactions missing\n"); goto out; } + if (max_rd_xactions) { + dev_info(&pdev->dev, "overriding max-read-transactions: %d\n", + max_rd_xactions); + mgmtdev->max_rd_xactions = max_rd_xactions; + } else + max_rd_xactions = mgmtdev->max_rd_xactions; mgmtdev->priority = devm_kcalloc(&pdev->dev, mgmtdev->dma_channels, -- cgit v1.2.3 From 5f54d3e869812ef288c63dff67848a7520979953 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 27 Jun 2017 17:38:27 +0530 Subject: dmaengine: fsl_raid: make of_device_ids const. of_device_ids are not supposed to change at runtime. All functions working with of_device_ids provided by work with const of_device_ids. So mark the non-const structs as const. File size before: text data bss dec hex filename 3981 608 0 4589 11ed drivers/dma/fsl_raid.o File size after constify: text data bss dec hex filename 4381 192 0 4573 11dd drivers/dma/fsl_raid.o Signed-off-by: Arvind Yadav Signed-off-by: Vinod Koul --- drivers/dma/fsl_raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/fsl_raid.c b/drivers/dma/fsl_raid.c index 90d29f90acfb..493dc6c59d1d 100644 --- a/drivers/dma/fsl_raid.c +++ b/drivers/dma/fsl_raid.c @@ -877,7 +877,7 @@ static int fsl_re_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id fsl_re_ids[] = { +static const struct of_device_id fsl_re_ids[] = { { .compatible = "fsl,raideng-v1.0", }, {} }; -- cgit v1.2.3 From 65a5c3dd3e11225a0d0e0df97d23fa64ab2d4ed2 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 6 Jun 2017 13:49:29 +0100 Subject: dmaengine: tegra-apb: Really fix runtime-pm usage Commit edd3bdbe9db1 ("dmaengine: tegra-apb: Correct runtime-pm usage") added pm_runtime_get/put() calls to the tegra-apb DMA system suspend callbacks. Runtime PM is disabled during system suspend and so these APIs cannot be used. Fix the suspend handling for the tegra-apb DMA by moving the save and restore of the DMA register context into the runtime PM suspend and resume callbacks, and then use the pm_runtime_force_suspend/resume() APIs to invoke the runtime PM callbacks during system suspend. 
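A minimal sketch of the resulting wiring (driver-specific register save/restore omitted, foo_ names are placeholders): the context handling lives only in the runtime PM callbacks, and system sleep simply forces those callbacks:

	static int foo_runtime_suspend(struct device *dev)
	{
		struct foo_dma *fdma = dev_get_drvdata(dev);

		foo_save_channel_registers(fdma);	/* hypothetical helper */
		clk_disable_unprepare(fdma->clk);

		return 0;
	}

	static int foo_runtime_resume(struct device *dev)
	{
		struct foo_dma *fdma = dev_get_drvdata(dev);
		int ret;

		ret = clk_prepare_enable(fdma->clk);
		if (ret)
			return ret;

		foo_restore_channel_registers(fdma);	/* hypothetical helper */

		return 0;
	}

	static const struct dev_pm_ops foo_pm_ops = {
		SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
		SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
					pm_runtime_force_resume)
	};

Since runtime PM is disabled during system suspend, calling pm_runtime_get_sync() from the sleep callbacks (as the old code did) cannot work; pm_runtime_force_suspend() and pm_runtime_force_resume() avoid that problem entirely.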
Fixes: edd3bdbe9db1 ("dmaengine: tegra-apb: Correct runtime-pm usage") Signed-off-by: Jon Hunter Signed-off-by: Vinod Koul --- drivers/dma/tegra20-apb-dma.c | 50 +++++++++---------------------------------- 1 file changed, 10 insertions(+), 40 deletions(-) diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c index 3722b9d8d9fe..b9d75a54c896 100644 --- a/drivers/dma/tegra20-apb-dma.c +++ b/drivers/dma/tegra20-apb-dma.c @@ -1492,37 +1492,9 @@ static int tegra_dma_remove(struct platform_device *pdev) } static int tegra_dma_runtime_suspend(struct device *dev) -{ - struct tegra_dma *tdma = dev_get_drvdata(dev); - - clk_disable_unprepare(tdma->dma_clk); - return 0; -} - -static int tegra_dma_runtime_resume(struct device *dev) -{ - struct tegra_dma *tdma = dev_get_drvdata(dev); - int ret; - - ret = clk_prepare_enable(tdma->dma_clk); - if (ret < 0) { - dev_err(dev, "clk_enable failed: %d\n", ret); - return ret; - } - return 0; -} - -#ifdef CONFIG_PM_SLEEP -static int tegra_dma_pm_suspend(struct device *dev) { struct tegra_dma *tdma = dev_get_drvdata(dev); int i; - int ret; - - /* Enable clock before accessing register */ - ret = pm_runtime_get_sync(dev); - if (ret < 0) - return ret; tdma->reg_gen = tdma_read(tdma, TEGRA_APBDMA_GENERAL); for (i = 0; i < tdma->chip_data->nr_channels; i++) { @@ -1543,21 +1515,21 @@ static int tegra_dma_pm_suspend(struct device *dev) TEGRA_APBDMA_CHAN_WCOUNT); } - /* Disable clock */ - pm_runtime_put(dev); + clk_disable_unprepare(tdma->dma_clk); + return 0; } -static int tegra_dma_pm_resume(struct device *dev) +static int tegra_dma_runtime_resume(struct device *dev) { struct tegra_dma *tdma = dev_get_drvdata(dev); - int i; - int ret; + int i, ret; - /* Enable clock before accessing register */ - ret = pm_runtime_get_sync(dev); - if (ret < 0) + ret = clk_prepare_enable(tdma->dma_clk); + if (ret < 0) { + dev_err(dev, "clk_enable failed: %d\n", ret); return ret; + } tdma_write(tdma, TEGRA_APBDMA_GENERAL, tdma->reg_gen); tdma_write(tdma, TEGRA_APBDMA_CONTROL, 0); @@ -1582,16 +1554,14 @@ static int tegra_dma_pm_resume(struct device *dev) (ch_reg->csr & ~TEGRA_APBDMA_CSR_ENB)); } - /* Disable clock */ - pm_runtime_put(dev); return 0; } -#endif static const struct dev_pm_ops tegra_dma_dev_pm_ops = { SET_RUNTIME_PM_OPS(tegra_dma_runtime_suspend, tegra_dma_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(tegra_dma_pm_suspend, tegra_dma_pm_resume) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) }; static const struct of_device_id tegra_dma_of_match[] = { -- cgit v1.2.3 From 82474dade7f4c60206609e7274591d159db105bf Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Fri, 23 Jun 2017 11:05:49 +0200 Subject: dmaengine: zynqmp_dma: Remove max len check in zynqmp_dma_prep_memcpy Remove the check for "len > ZYNQMP_DMA_MAX_TRANS_LEN" as it's not needed. If the length is larger, the transfer is already split up into multiple parts, each limited to the maximum descriptor length.
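To illustrate the splitting that makes the check redundant, here is a minimal standalone sketch, not the driver's code: EXAMPLE_MAX_TRANS_LEN stands in for ZYNQMP_DMA_MAX_TRANS_LEN and its value below is an assumption for illustration only.

/*
 * Sketch only: same rounding as the driver's
 * DIV_ROUND_UP(len, ZYNQMP_DMA_MAX_TRANS_LEN) computation.
 */
#include <stddef.h>

#define EXAMPLE_MAX_TRANS_LEN (64 * 1024)	/* assumed per-descriptor limit */

static size_t example_desc_count(size_t len)
{
	/* Round up so the tail of the transfer gets its own descriptor. */
	return (len + EXAMPLE_MAX_TRANS_LEN - 1) / EXAMPLE_MAX_TRANS_LEN;
}

With this rounding, a 150 KiB request yields three descriptors of at most 64 KiB each, which is why rejecting long requests up front is unnecessary.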
Signed-off-by: Stefan Roese Cc: Kedareswara rao Appana Cc: Vinod Koul Signed-off-by: Vinod Koul --- drivers/dma/xilinx/zynqmp_dma.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/dma/xilinx/zynqmp_dma.c b/drivers/dma/xilinx/zynqmp_dma.c index 6d221e5c72ee..47f64192d2fd 100644 --- a/drivers/dma/xilinx/zynqmp_dma.c +++ b/drivers/dma/xilinx/zynqmp_dma.c @@ -794,9 +794,6 @@ static struct dma_async_tx_descriptor *zynqmp_dma_prep_memcpy( chan = to_chan(dchan); - if (len > ZYNQMP_DMA_MAX_TRANS_LEN) - return NULL; - desc_cnt = DIV_ROUND_UP(len, ZYNQMP_DMA_MAX_TRANS_LEN); spin_lock_bh(&chan->lock); -- cgit v1.2.3 From 99efdb3e48fb2fa84addb3102946d3eca341192b Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Fri, 30 Jun 2017 10:43:05 -0400 Subject: dmaengine: qcom_hidma: correct API violation for submit Current code violates the DMA Engine API by putting submitted requests directly into the HW queue. This causes queued transactions to be started by another thread as soon as the first one finishes. The DMA Engine documentation clearly states this: "dmaengine_submit() will not start the DMA operation". Move HW queuing of the requests into the issue_pending() routine to comply with the API requirements, and create a new queued state for temporarily holding the requests. A descriptor now goes through these transitions: free->prepared->queued->active->completed->free, as opposed to free->prepared->active->completed->free. Signed-off-by: Sinan Kaya Signed-off-by: Vinod Koul --- drivers/dma/qcom/hidma.c | 15 ++++++++++++--- drivers/dma/qcom/hidma.h | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/dma/qcom/hidma.c b/drivers/dma/qcom/hidma.c index 84e3699a19bd..34fb6afd229b 100644 --- a/drivers/dma/qcom/hidma.c +++ b/drivers/dma/qcom/hidma.c @@ -210,6 +210,7 @@ static int hidma_chan_init(struct hidma_dev *dmadev, u32 dma_sig) INIT_LIST_HEAD(&mchan->prepared); INIT_LIST_HEAD(&mchan->active); INIT_LIST_HEAD(&mchan->completed); + INIT_LIST_HEAD(&mchan->queued); spin_lock_init(&mchan->lock); list_add_tail(&mchan->chan.device_node, &ddev->channels); @@ -230,9 +231,15 @@ static void hidma_issue_pending(struct dma_chan *dmach) struct hidma_chan *mchan = to_hidma_chan(dmach); struct hidma_dev *dmadev = mchan->dmadev; unsigned long flags; + struct hidma_desc *qdesc, *next; int status; spin_lock_irqsave(&mchan->lock, flags); + list_for_each_entry_safe(qdesc, next, &mchan->queued, node) { + hidma_ll_queue_request(dmadev->lldev, qdesc->tre_ch); + list_move_tail(&qdesc->node, &mchan->active); + } + if (!mchan->running) { struct hidma_desc *desc = list_first_entry(&mchan->active, struct hidma_desc, @@ -315,17 +322,18 @@ static dma_cookie_t hidma_tx_submit(struct dma_async_tx_descriptor *txd) pm_runtime_put_autosuspend(dmadev->ddev.dev); return -ENODEV; } + pm_runtime_mark_last_busy(dmadev->ddev.dev); + pm_runtime_put_autosuspend(dmadev->ddev.dev); mdesc = container_of(txd, struct hidma_desc, desc); spin_lock_irqsave(&mchan->lock, irqflags); - /* Move descriptor to active */ - list_move_tail(&mdesc->node, &mchan->active); + /* Move descriptor to queued */ - list_move_tail(&mdesc->node, &mchan->queued); /* Update cookie */ cookie = dma_cookie_assign(txd); - hidma_ll_queue_request(dmadev->lldev, mdesc->tre_ch); spin_unlock_irqrestore(&mchan->lock, irqflags); return cookie; @@ -431,6 +439,7 @@ static int hidma_terminate_channel(struct dma_chan *chan) list_splice_init(&mchan->active, &list); list_splice_init(&mchan->prepared, &list); list_splice_init(&mchan->completed, &list); +
list_splice_init(&mchan->queued, &list); spin_unlock_irqrestore(&mchan->lock, irqflags); /* this suspends the existing transfer */ diff --git a/drivers/dma/qcom/hidma.h b/drivers/dma/qcom/hidma.h index c7d014235c32..41e0aa283828 100644 --- a/drivers/dma/qcom/hidma.h +++ b/drivers/dma/qcom/hidma.h @@ -104,6 +104,7 @@ struct hidma_chan { struct dma_chan chan; struct list_head free; struct list_head prepared; + struct list_head queued; struct list_head active; struct list_head completed; -- cgit v1.2.3
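To tie the hidma change above back to the client side of the API, here is a hedged sketch of how a dmaengine consumer is expected to drive submit and issue_pending; the function name and its arguments are illustrative placeholders, only the dmaengine calls themselves are the real API.

#include <linux/dmaengine.h>

/*
 * Sketch of a dmaengine client following the documented contract:
 * dmaengine_submit() only queues the descriptor (the queued state added
 * above), and hardware may start it only after dma_async_issue_pending().
 */
static int example_memcpy(struct dma_chan *chan, dma_addr_t dst,
			  dma_addr_t src, size_t len)
{
	struct dma_async_tx_descriptor *tx;
	dma_cookie_t cookie;

	/* Prepare a descriptor; nothing is handed to the HW queue yet. */
	tx = dmaengine_prep_dma_memcpy(chan, dst, src, len, DMA_PREP_INTERRUPT);
	if (!tx)
		return -ENOMEM;

	/* Submit: the descriptor only moves to the driver's pending list. */
	cookie = dmaengine_submit(tx);
	if (dma_submit_error(cookie))
		return -EIO;

	/* Only this call allows the driver to push pending work to HW. */
	dma_async_issue_pending(chan);
	return 0;
}

In this model, the queued list introduced by the patch is exactly where descriptors wait between dmaengine_submit() and dma_async_issue_pending().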