4 files changed, 2303 insertions, 0 deletions
diff --git a/target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch b/target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch
new file mode 100644
index 0000000000..924f7970a8
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch
@@ -0,0 +1,143 @@
+From 7bd903c5ca47fde5ad52370a47776491813c772e Mon Sep 17 00:00:00 2001
+From: Peter Ujfalusi <peter.ujfalusi@ti.com>
+Date: Mon, 14 Dec 2015 22:47:39 +0200
+Subject: [PATCH 1/3] dmaengine: core: Move and merge the code paths using
+ private_candidate
+
+Channel matching with private_candidate() is used in two paths, the error
+checking is slightly different in them and they are duplicating code also.
+Move the code under find_candidate() to provide consistent execution and
+to allow us to reuse this mode of channel lookup later.
+
+Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
+Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
+Reviewed-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Vinod Koul <vinod.koul@intel.com>
+---
+ drivers/dma/dmaengine.c | 81 +++++++++++++++++++++++++------------------------
+ 1 file changed, 42 insertions(+), 39 deletions(-)
+
+diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
+index f2cbff9..81a36fc 100644
+--- a/drivers/dma/dmaengine.c
++++ b/drivers/dma/dmaengine.c
+@@ -542,6 +542,42 @@ static struct dma_chan *private_candidate(const dma_cap_mask_t *mask,
+ 	return NULL;
+ }
+ 
++static struct dma_chan *find_candidate(struct dma_device *device,
++				       const dma_cap_mask_t *mask,
++				       dma_filter_fn fn, void *fn_param)
++{
++	struct dma_chan *chan = private_candidate(mask, device, fn, fn_param);
++	int err;
++
++	if (chan) {
++		/* Found a suitable channel, try to grab, prep, and return it.
++		 * We first set DMA_PRIVATE to disable balance_ref_count as this
++		 * channel will not be published in the general-purpose
++		 * allocator
++		 */
++		dma_cap_set(DMA_PRIVATE, device->cap_mask);
++		device->privatecnt++;
++		err = dma_chan_get(chan);
++
++		if (err) {
++			if (err == -ENODEV) {
++				pr_debug("%s: %s module removed\n", __func__,
++					 dma_chan_name(chan));
++				list_del_rcu(&device->global_node);
++			} else
++				pr_debug("%s: failed to get %s: (%d)\n",
++					 __func__, dma_chan_name(chan), err);
++
++			if (--device->privatecnt == 0)
++				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
++
++			chan = ERR_PTR(err);
++		}
++	}
++
++	return chan ? chan : ERR_PTR(-EPROBE_DEFER);
++}
++
+ /**
+  * dma_get_slave_channel - try to get specific channel exclusively
+  * @chan: target channel
+@@ -580,7 +616,6 @@ struct dma_chan *dma_get_any_slave_channel(struct dma_device *device)
+ {
+ 	dma_cap_mask_t mask;
+ 	struct dma_chan *chan;
+-	int err;
+ 
+ 	dma_cap_zero(mask);
+ 	dma_cap_set(DMA_SLAVE, mask);
+@@ -588,23 +623,11 @@ struct dma_chan *dma_get_any_slave_channel(struct dma_device *device)
+ 	/* lock against __dma_request_channel */
+ 	mutex_lock(&dma_list_mutex);
+ 
+-	chan = private_candidate(&mask, device, NULL, NULL);
+-	if (chan) {
+-		dma_cap_set(DMA_PRIVATE, device->cap_mask);
+-		device->privatecnt++;
+-		err = dma_chan_get(chan);
+-		if (err) {
+-			pr_debug("%s: failed to get %s: (%d)\n",
+-				__func__, dma_chan_name(chan), err);
+-			chan = NULL;
+-			if (--device->privatecnt == 0)
+-				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
+-		}
+-	}
++	chan = find_candidate(device, &mask, NULL, NULL);
+ 
+ 	mutex_unlock(&dma_list_mutex);
+ 
+-	return chan;
++	return IS_ERR(chan) ? NULL : chan;
+ }
+ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel);
+ 
+@@ -621,35 +644,15 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ {
+ 	struct dma_device *device, *_d;
+ 	struct dma_chan *chan = NULL;
+-	int err;
+ 
+ 	/* Find a channel */
+ 	mutex_lock(&dma_list_mutex);
+ 	list_for_each_entry_safe(device, _d, &dma_device_list, global_node) {
+-		chan = private_candidate(mask, device, fn, fn_param);
+-		if (chan) {
+-			/* Found a suitable channel, try to grab, prep, and
+-			 * return it.  We first set DMA_PRIVATE to disable
+-			 * balance_ref_count as this channel will not be
+-			 * published in the general-purpose allocator
+-			 */
+-			dma_cap_set(DMA_PRIVATE, device->cap_mask);
+-			device->privatecnt++;
+-			err = dma_chan_get(chan);
++		chan = find_candidate(device, mask, fn, fn_param);
++		if (!IS_ERR(chan))
++			break;
+ 
+-			if (err == -ENODEV) {
+-				pr_debug("%s: %s module removed\n",
+-					 __func__, dma_chan_name(chan));
+-				list_del_rcu(&device->global_node);
+-			} else if (err)
+-				pr_debug("%s: failed to get %s: (%d)\n",
+-					 __func__, dma_chan_name(chan), err);
+-			else
+-				break;
+-			if (--device->privatecnt == 0)
+-				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
+-			chan = NULL;
+-		}
++		chan = NULL;
+ 	}
+ 	mutex_unlock(&dma_list_mutex);
+ 
+-- 
+2.8.1
+
diff --git a/target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch b/target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch
new file mode 100644
index 0000000000..0296714639
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch
@@ -0,0 +1,345 @@
+From a8135d0d79e9d0ad3a4ff494fceeaae838becf38 Mon Sep 17 00:00:00 2001
+From: Peter Ujfalusi <peter.ujfalusi@ti.com>
+Date: Mon, 14 Dec 2015 22:47:40 +0200
+Subject: [PATCH 2/3] dmaengine: core: Introduce new, universal API to request
+ a channel
+
+The two API functions can cover most, if not all, current APIs used to
+request a channel. With minimal effort dmaengine drivers, platforms and
+dmaengine user drivers can be converted to use the two functions.
+
+struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask);
+
+To request any channel matching with the requested capabilities, can be
+used to request channel for memcpy, memset, xor, etc where no hardware
+synchronization is needed.
+
+struct dma_chan *dma_request_chan(struct device *dev, const char *name);
+To request a slave channel. The dma_request_chan() will try to find the
+channel via DT, ACPI or in case if the kernel booted in non DT/ACPI mode
+it will use a filter lookup table and retrieves the needed information from
+the dma_slave_map provided by the DMA drivers.
+This legacy mode needs changes in platform code, in dmaengine drivers and
+finally the dmaengine user drivers can be converted:
+
+For each dmaengine driver an array of DMA device, slave and the parameter
+for the filter function needs to be added:
+
+static const struct dma_slave_map da830_edma_map[] = {
+	{ "davinci-mcasp.0", "rx", EDMA_FILTER_PARAM(0, 0) },
+	{ "davinci-mcasp.0", "tx", EDMA_FILTER_PARAM(0, 1) },
+	{ "davinci-mcasp.1", "rx", EDMA_FILTER_PARAM(0, 2) },
+	{ "davinci-mcasp.1", "tx", EDMA_FILTER_PARAM(0, 3) },
+	{ "davinci-mcasp.2", "rx", EDMA_FILTER_PARAM(0, 4) },
+	{ "davinci-mcasp.2", "tx", EDMA_FILTER_PARAM(0, 5) },
+	{ "spi_davinci.0", "rx", EDMA_FILTER_PARAM(0, 14) },
+	{ "spi_davinci.0", "tx", EDMA_FILTER_PARAM(0, 15) },
+	{ "da830-mmc.0", "rx", EDMA_FILTER_PARAM(0, 16) },
+	{ "da830-mmc.0", "tx", EDMA_FILTER_PARAM(0, 17) },
+	{ "spi_davinci.1", "rx", EDMA_FILTER_PARAM(0, 18) },
+	{ "spi_davinci.1", "tx", EDMA_FILTER_PARAM(0, 19) },
+};
+
+This information is going to be needed by the dmaengine driver, so
+modification to the platform_data is needed, and the driver map should be
+added to the pdata of the DMA driver:
+
+da8xx_edma0_pdata.slave_map = da830_edma_map;
+da8xx_edma0_pdata.slavecnt = ARRAY_SIZE(da830_edma_map);
+
+The DMA driver then needs to configure the needed device -> filter_fn
+mapping before it registers with dma_async_device_register() :
+
+ecc->dma_slave.filter_map.map = info->slave_map;
+ecc->dma_slave.filter_map.mapcnt = info->slavecnt;
+ecc->dma_slave.filter_map.fn = edma_filter_fn;
+
+When neither DT nor ACPI lookup is available, dma_request_chan() will
+try to match the requester's device name with the filter_map's list of
+device names. When a match is found it will use the information from the
+dma_slave_map to get the channel with the dma_get_channel() internal
+function.
+
+Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
+Reviewed-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Vinod Koul <vinod.koul@intel.com>
+---
+ Documentation/dmaengine/client.txt | 23 +++-------
+ drivers/dma/dmaengine.c            | 89 +++++++++++++++++++++++++++++++++-----
+ include/linux/dmaengine.h          | 51 +++++++++++++++++++---
+ 3 files changed, 127 insertions(+), 36 deletions(-)
+
+diff --git a/Documentation/dmaengine/client.txt b/Documentation/dmaengine/client.txt
+index 11fb87f..4b04d89 100644
+--- a/Documentation/dmaengine/client.txt
++++ b/Documentation/dmaengine/client.txt
+@@ -22,25 +22,14 @@ The slave DMA usage consists of following steps:
+    Channel allocation is slightly different in the slave DMA context,
+    client drivers typically need a channel from a particular DMA
+    controller only and even in some cases a specific channel is desired.
+-   To request a channel dma_request_channel() API is used.
++   To request a channel dma_request_chan() API is used.
+ 
+    Interface:
+-	struct dma_chan *dma_request_channel(dma_cap_mask_t mask,
+-			dma_filter_fn filter_fn,
+-			void *filter_param);
+-   where dma_filter_fn is defined as:
+-	typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param);
+-
+-   The 'filter_fn' parameter is optional, but highly recommended for
+-   slave and cyclic channels as they typically need to obtain a specific
+-   DMA channel.
+-
+-   When the optional 'filter_fn' parameter is NULL, dma_request_channel()
+-   simply returns the first channel that satisfies the capability mask.
+-
+-   Otherwise, the 'filter_fn' routine will be called once for each free
+-   channel which has a capability in 'mask'.  'filter_fn' is expected to
+-   return 'true' when the desired DMA channel is found.
++	struct dma_chan *dma_request_chan(struct device *dev, const char *name);
++
++   Which will find and return the 'name' DMA channel associated with the 'dev'
++   device. The association is done via DT, ACPI or board file based
++   dma_slave_map matching table.
+ 
+    A channel allocated via this interface is exclusive to the caller,
+    until dma_release_channel() is called.
+diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
+index 81a36fc..a094dbb 100644
+--- a/drivers/dma/dmaengine.c
++++ b/drivers/dma/dmaengine.c
+@@ -43,6 +43,7 @@
+ 
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ 
++#include <linux/platform_device.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
+@@ -665,27 +666,73 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ }
+ EXPORT_SYMBOL_GPL(__dma_request_channel);
+ 
++static const struct dma_slave_map *dma_filter_match(struct dma_device *device,
++						    const char *name,
++						    struct device *dev)
++{
++	int i;
++
++	if (!device->filter.mapcnt)
++		return NULL;
++
++	for (i = 0; i < device->filter.mapcnt; i++) {
++		const struct dma_slave_map *map = &device->filter.map[i];
++
++		if (!strcmp(map->devname, dev_name(dev)) &&
++		    !strcmp(map->slave, name))
++			return map;
++	}
++
++	return NULL;
++}
++
+ /**
+- * dma_request_slave_channel_reason - try to allocate an exclusive slave channel
++ * dma_request_chan - try to allocate an exclusive slave channel
+  * @dev:	pointer to client device structure
+  * @name:	slave channel name
+  *
+  * Returns pointer to appropriate DMA channel on success or an error pointer.
+  */
+-struct dma_chan *dma_request_slave_channel_reason(struct device *dev,
+-						  const char *name)
++struct dma_chan *dma_request_chan(struct device *dev, const char *name)
+ {
++	struct dma_device *d, *_d;
++	struct dma_chan *chan = NULL;
++
+ 	/* If device-tree is present get slave info from here */
+ 	if (dev->of_node)
+-		return of_dma_request_slave_channel(dev->of_node, name);
++		chan = of_dma_request_slave_channel(dev->of_node, name);
+ 
+ 	/* If device was enumerated by ACPI get slave info from here */
+-	if (ACPI_HANDLE(dev))
+-		return acpi_dma_request_slave_chan_by_name(dev, name);
++	if (has_acpi_companion(dev) && !chan)
++		chan = acpi_dma_request_slave_chan_by_name(dev, name);
++
++	if (chan) {
++		/* Valid channel found or requester need to be deferred */
++		if (!IS_ERR(chan) || PTR_ERR(chan) == -EPROBE_DEFER)
++			return chan;
++	}
++
++	/* Try to find the channel via the DMA filter map(s) */
++	mutex_lock(&dma_list_mutex);
++	list_for_each_entry_safe(d, _d, &dma_device_list, global_node) {
++		dma_cap_mask_t mask;
++		const struct dma_slave_map *map = dma_filter_match(d, name, dev);
++
++		if (!map)
++			continue;
++
++		dma_cap_zero(mask);
++		dma_cap_set(DMA_SLAVE, mask);
+ 
+-	return ERR_PTR(-ENODEV);
++		chan = find_candidate(d, &mask, d->filter.fn, map->param);
++		if (!IS_ERR(chan))
++			break;
++	}
++	mutex_unlock(&dma_list_mutex);
++
++	return chan ? chan : ERR_PTR(-EPROBE_DEFER);
+ }
+-EXPORT_SYMBOL_GPL(dma_request_slave_channel_reason);
++EXPORT_SYMBOL_GPL(dma_request_chan);
+ 
+ /**
+  * dma_request_slave_channel - try to allocate an exclusive slave channel
+@@ -697,17 +744,35 @@ EXPORT_SYMBOL_GPL(dma_request_slave_channel_reason);
+ struct dma_chan *dma_request_slave_channel(struct device *dev,
+ 					   const char *name)
+ {
+-	struct dma_chan *ch = dma_request_slave_channel_reason(dev, name);
++	struct dma_chan *ch = dma_request_chan(dev, name);
+ 	if (IS_ERR(ch))
+ 		return NULL;
+ 
+-	dma_cap_set(DMA_PRIVATE, ch->device->cap_mask);
+-	ch->device->privatecnt++;
+-
+ 	return ch;
+ }
+ EXPORT_SYMBOL_GPL(dma_request_slave_channel);
+ 
++/**
++ * dma_request_chan_by_mask - allocate a channel satisfying certain capabilities
++ * @mask: capabilities that the channel must satisfy
++ *
++ * Returns pointer to appropriate DMA channel on success or an error pointer.
++ */
++struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask)
++{
++	struct dma_chan *chan;
++
++	if (!mask)
++		return ERR_PTR(-ENODEV);
++
++	chan = __dma_request_channel(mask, NULL, NULL);
++	if (!chan)
++		chan = ERR_PTR(-ENODEV);
++
++	return chan;
++}
++EXPORT_SYMBOL_GPL(dma_request_chan_by_mask);
++
+ void dma_release_channel(struct dma_chan *chan)
+ {
+ 	mutex_lock(&dma_list_mutex);
+diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
+index c47c68e..d50a6b51 100644
+--- a/include/linux/dmaengine.h
++++ b/include/linux/dmaengine.h
+@@ -607,11 +607,38 @@ enum dmaengine_alignment {
+ };
+ 
+ /**
++ * struct dma_slave_map - associates slave device and its slave channel with
++ * parameter to be used by a filter function
++ * @devname: name of the device
++ * @slave: slave channel name
++ * @param: opaque parameter to pass to struct dma_filter.fn
++ */
++struct dma_slave_map {
++	const char *devname;
++	const char *slave;
++	void *param;
++};
++
++/**
++ * struct dma_filter - information for slave device/channel to filter_fn/param
++ * mapping
++ * @fn: filter function callback
++ * @mapcnt: number of slave device/channel in the map
++ * @map: array of channel to filter mapping data
++ */
++struct dma_filter {
++	dma_filter_fn fn;
++	int mapcnt;
++	const struct dma_slave_map *map;
++};
++
++/**
+  * struct dma_device - info on the entity supplying DMA services
+  * @chancnt: how many DMA channels are supported
+  * @privatecnt: how many DMA channels are requested by dma_request_channel
+  * @channels: the list of struct dma_chan
+  * @global_node: list_head for global dma_device_list
++ * @filter: information for device/slave to filter function/param mapping
+  * @cap_mask: one or more dma_capability flags
+  * @max_xor: maximum number of xor sources, 0 if no capability
+  * @max_pq: maximum number of PQ sources and PQ-continue capability
+@@ -666,6 +693,7 @@ struct dma_device {
+ 	unsigned int privatecnt;
+ 	struct list_head channels;
+ 	struct list_head global_node;
++	struct dma_filter filter;
+ 	dma_cap_mask_t  cap_mask;
+ 	unsigned short max_xor;
+ 	unsigned short max_pq;
+@@ -1140,9 +1168,11 @@ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
+ void dma_issue_pending_all(void);
+ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ 					dma_filter_fn fn, void *fn_param);
+-struct dma_chan *dma_request_slave_channel_reason(struct device *dev,
+-						  const char *name);
+ struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name);
++
++struct dma_chan *dma_request_chan(struct device *dev, const char *name);
++struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask);
++
+ void dma_release_channel(struct dma_chan *chan);
+ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps);
+ #else
+@@ -1166,16 +1196,21 @@ static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ {
+ 	return NULL;
+ }
+-static inline struct dma_chan *dma_request_slave_channel_reason(
+-					struct device *dev, const char *name)
+-{
+-	return ERR_PTR(-ENODEV);
+-}
+ static inline struct dma_chan *dma_request_slave_channel(struct device *dev,
+ 							 const char *name)
+ {
+ 	return NULL;
+ }
++static inline struct dma_chan *dma_request_chan(struct device *dev,
++						const char *name)
++{
++	return ERR_PTR(-ENODEV);
++}
++static inline struct dma_chan *dma_request_chan_by_mask(
++						const dma_cap_mask_t *mask)
++{
++	return ERR_PTR(-ENODEV);
++}
+ static inline void dma_release_channel(struct dma_chan *chan)
+ {
+ }
+@@ -1186,6 +1221,8 @@ static inline int dma_get_slave_caps(struct dma_chan *chan,
+ }
+ #endif
+ 
++#define dma_request_slave_channel_reason(dev, name) dma_request_chan(dev, name)
++
+ static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx)
+ {
+ 	struct dma_slave_caps caps;
+-- 
+2.8.1
+
diff --git a/target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch b/target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch
new file mode 100644
index 0000000000..8fcf8caa8a
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch
@@ -0,0 +1,293 @@
+From b36f09c3c441a6e59eab9315032e7d546571de3f Mon Sep 17 00:00:00 2001
+From: Lars-Peter Clausen <lars@metafoo.de>
+Date: Tue, 20 Oct 2015 11:46:28 +0200
+Subject: [PATCH] dmaengine: Add transfer termination synchronization support
+
+The DMAengine API has a long standing race condition that is inherent to
+the API itself. Calling dmaengine_terminate_all() is supposed to stop and
+abort any pending or active transfers that have previously been submitted.
+Unfortunately it is possible that this operation races against a currently
+running (or with some drivers also scheduled) completion callback.
+
+Since the API allows dmaengine_terminate_all() to be called from atomic
+context as well as from within a completion callback it is not possible to
+synchronize to the execution of the completion callback from within
+dmaengine_terminate_all() itself.
+
+This means that a user of the DMAengine API does not know when it is safe
+to free resources used in the completion callback, which can result in a
+use-after-free race condition.
+
+This patch addresses the issue by introducing an explicit synchronization
+primitive to the DMAengine API called dmaengine_synchronize().
+
+The existing dmaengine_terminate_all() is deprecated in favor of
+dmaengine_terminate_sync() and dmaengine_terminate_async(). The former
+aborts all pending and active transfers and synchronizes to the current
+context, meaning it will wait until all running completion callbacks have
+finished. This means it is only possible to call this function from
+non-atomic context. The latter function does not synchronize, but can still
+be used in atomic context or from within a complete callback. It has to be
+followed up by dmaengine_synchronize() before a client can free the
+resources used in a completion callback.
+
+In addition to this the semantics of the device_terminate_all() callback
+are slightly relaxed by this patch. It is now OK for a driver to only
+schedule the termination of the active transfer, but does not necessarily
+have to wait until the DMA controller has completely stopped. The driver
+must ensure though that the controller has stopped and no longer accesses
+any memory when the device_synchronize() callback returns.
+
+This was in part done since most drivers do not pay attention to this
+anyway at the moment and to emphasize that this needs to be done when the
+device_synchronize() callback is implemented. But it also helps with
+implementing support for devices where stopping the controller can require
+operations that may sleep.
+
+Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
+Signed-off-by: Vinod Koul <vinod.koul@intel.com>
+---
+ Documentation/dmaengine/client.txt   | 38 ++++++++++++++-
+ Documentation/dmaengine/provider.txt | 20 +++++++-
+ drivers/dma/dmaengine.c              |  5 +-
+ include/linux/dmaengine.h            | 90 ++++++++++++++++++++++++++++++++++++
+ 4 files changed, 148 insertions(+), 5 deletions(-)
+
+diff --git a/Documentation/dmaengine/client.txt b/Documentation/dmaengine/client.txt
+index 11fb87f..d9f9f46 100644
+--- a/Documentation/dmaengine/client.txt
++++ b/Documentation/dmaengine/client.txt
+@@ -128,7 +128,7 @@ The slave DMA usage consists of following steps:
+ 	transaction.
+ 
+ 	For cyclic DMA, a callback function may wish to terminate the
+-	DMA via dmaengine_terminate_all().
++	DMA via dmaengine_terminate_async().
+ 
+ 	Therefore, it is important that DMA engine drivers drop any
+ 	locks before calling the callback function which may cause a
+@@ -166,12 +166,29 @@ The slave DMA usage consists of following steps:
+ 
+ Further APIs:
+ 
+-1. int dmaengine_terminate_all(struct dma_chan *chan)
++1. int dmaengine_terminate_sync(struct dma_chan *chan)
++   int dmaengine_terminate_async(struct dma_chan *chan)
++   int dmaengine_terminate_all(struct dma_chan *chan) /* DEPRECATED */
+ 
+    This causes all activity for the DMA channel to be stopped, and may
+    discard data in the DMA FIFO which hasn't been fully transferred.
+    No callback functions will be called for any incomplete transfers.
+ 
++   Two variants of this function are available.
++
++   dmaengine_terminate_async() might not wait until the DMA has been fully
++   stopped or until any running complete callbacks have finished. But it is
++   possible to call dmaengine_terminate_async() from atomic context or from
++   within a complete callback. dmaengine_synchronize() must be called before it
++   is safe to free the memory accessed by the DMA transfer or free resources
++   accessed from within the complete callback.
++
++   dmaengine_terminate_sync() will wait for the transfer and any running
++   complete callbacks to finish before it returns. But the function must not be
++   called from atomic context or from within a complete callback.
++
++   dmaengine_terminate_all() is deprecated and should not be used in new code.
++
+ 2. int dmaengine_pause(struct dma_chan *chan)
+ 
+    This pauses activity on the DMA channel without data loss.
+@@ -197,3 +214,20 @@ Further APIs:
+ 	a running DMA channel.  It is recommended that DMA engine users
+ 	pause or stop (via dmaengine_terminate_all()) the channel before
+ 	using this API.
++
++5. void dmaengine_synchronize(struct dma_chan *chan)
++
++  Synchronize the termination of the DMA channel to the current context.
++
++  This function should be used after dmaengine_terminate_async() to synchronize
++  the termination of the DMA channel to the current context. The function will
++  wait for the transfer and any running complete callbacks to finish before it
++  returns.
++
++  If dmaengine_terminate_async() is used to stop the DMA channel this function
++  must be called before it is safe to free memory accessed by previously
++  submitted descriptors or to free any resources accessed within the complete
++  callback of previously submitted descriptors.
++
++  The behavior of this function is undefined if dma_async_issue_pending() has
++  been called between dmaengine_terminate_async() and this function.
+diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt
+index 67d4ce4..122b7f4 100644
+--- a/Documentation/dmaengine/provider.txt
++++ b/Documentation/dmaengine/provider.txt
+@@ -327,8 +327,24 @@ supported.
+ 
+    * device_terminate_all
+      - Aborts all the pending and ongoing transfers on the channel
+-     - This command should operate synchronously on the channel,
+-       terminating right away all the channels
++     - For aborted transfers the complete callback should not be called
++     - Can be called from atomic context or from within a complete
++       callback of a descriptor. Must not sleep. Drivers must be able
++       to handle this correctly.
++     - Termination may be asynchronous. The driver does not have to
++       wait until the currently active transfer has completely stopped.
++       See device_synchronize.
++
++   * device_synchronize
++     - Must synchronize the termination of a channel to the current
++       context.
++     - Must make sure that memory for previously submitted
++       descriptors is no longer accessed by the DMA controller.
++     - Must make sure that all complete callbacks for previously
++       submitted descriptors have finished running and none are
++       scheduled to run.
++     - May sleep.
++
+ 
+ Misc notes (stuff that should be documented, but don't really know
+ where to put them)
+diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
+index 3ecec14..d6fc82e 100644
+--- a/drivers/dma/dmaengine.c
++++ b/drivers/dma/dmaengine.c
+@@ -265,8 +265,11 @@ static void dma_chan_put(struct dma_chan *chan)
+ 	module_put(dma_chan_to_owner(chan));
+ 
+ 	/* This channel is not in use anymore, free it */
+-	if (!chan->client_count && chan->device->device_free_chan_resources)
++	if (!chan->client_count && chan->device->device_free_chan_resources) {
++		/* Make sure all operations have completed */
++		dmaengine_synchronize(chan);
+ 		chan->device->device_free_chan_resources(chan);
++	}
+ 
+ 	/* If the channel is used via a DMA request router, free the mapping */
+ 	if (chan->router && chan->router->route_free) {
+diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
+index c47c68e..4662d9a 100644
+--- a/include/linux/dmaengine.h
++++ b/include/linux/dmaengine.h
+@@ -654,6 +654,8 @@ enum dmaengine_alignment {
+  *	paused. Returns 0 or an error code
+  * @device_terminate_all: Aborts all transfers on a channel. Returns 0
+  *	or an error code
++ * @device_synchronize: Synchronizes the termination of transfers to the
++ *  current context.
+  * @device_tx_status: poll for transaction completion, the optional
+  *	txstate parameter can be supplied with a pointer to get a
+  *	struct with auxiliary transfer status information, otherwise the call
+@@ -737,6 +739,7 @@ struct dma_device {
+ 	int (*device_pause)(struct dma_chan *chan);
+ 	int (*device_resume)(struct dma_chan *chan);
+ 	int (*device_terminate_all)(struct dma_chan *chan);
++	void (*device_synchronize)(struct dma_chan *chan);
+ 
+ 	enum dma_status (*device_tx_status)(struct dma_chan *chan,
+ 					    dma_cookie_t cookie,
+@@ -828,6 +831,13 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg(
+ 			src_sg, src_nents, flags);
+ }
+ 
++/**
++ * dmaengine_terminate_all() - Terminate all active DMA transfers
++ * @chan: The channel for which to terminate the transfers
++ *
++ * This function is DEPRECATED use either dmaengine_terminate_sync() or
++ * dmaengine_terminate_async() instead.
++ */
+ static inline int dmaengine_terminate_all(struct dma_chan *chan)
+ {
+ 	if (chan->device->device_terminate_all)
+@@ -836,6 +846,86 @@ static inline int dmaengine_terminate_all(struct dma_chan *chan)
+ 	return -ENOSYS;
+ }
+ 
++/**
++ * dmaengine_terminate_async() - Terminate all active DMA transfers
++ * @chan: The channel for which to terminate the transfers
++ *
++ * Calling this function will terminate all active and pending descriptors
++ * that have previously been submitted to the channel. It is not guaranteed
++ * though that the transfer for the active descriptor has stopped when the
++ * function returns. Furthermore it is possible the complete callback of a
++ * submitted transfer is still running when this function returns.
++ *
++ * dmaengine_synchronize() needs to be called before it is safe to free
++ * any memory that is accessed by previously submitted descriptors or before
++ * freeing any resources accessed from within the completion callback of any
++ * previously submitted descriptors.
++ *
++ * This function can be called from atomic context as well as from within a
++ * complete callback of a descriptor submitted on the same channel.
++ *
++ * If none of the two conditions above apply consider using
++ * dmaengine_terminate_sync() instead.
++ */
++static inline int dmaengine_terminate_async(struct dma_chan *chan)
++{
++	if (chan->device->device_terminate_all)
++		return chan->device->device_terminate_all(chan);
++
++	return -EINVAL;
++}
++
++/**
++ * dmaengine_synchronize() - Synchronize DMA channel termination
++ * @chan: The channel to synchronize
++ *
++ * Synchronizes the DMA channel termination to the current context. When this
++ * function returns it is guaranteed that all transfers for previously issued
++ * descriptors have stopped and it is safe to free the memory associated
++ * with them. Furthermore it is guaranteed that all complete callback functions
++ * for a previously submitted descriptor have finished running and it is safe to
++ * free resources accessed from within the complete callbacks.
++ *
++ * The behavior of this function is undefined if dma_async_issue_pending() has
++ * been called between dmaengine_terminate_async() and this function.
++ *
++ * This function must only be called from non-atomic context and must not be
++ * called from within a complete callback of a descriptor submitted on the same
++ * channel.
++ */
++static inline void dmaengine_synchronize(struct dma_chan *chan)
++{
++	if (chan->device->device_synchronize)
++		chan->device->device_synchronize(chan);
++}
++
++/**
++ * dmaengine_terminate_sync() - Terminate all active DMA transfers
++ * @chan: The channel for which to terminate the transfers
++ *
++ * Calling this function will terminate all active and pending transfers
++ * that have previously been submitted to the channel. It is similar to
++ * dmaengine_terminate_async() but guarantees that the DMA transfer has actually
++ * stopped and that all complete callbacks have finished running when the
++ * function returns.
++ *
++ * This function must only be called from non-atomic context and must not be
++ * called from within a complete callback of a descriptor submitted on the same
++ * channel.
++ */
++static inline int dmaengine_terminate_sync(struct dma_chan *chan)
++{
++	int ret;
++
++	ret = dmaengine_terminate_async(chan);
++	if (ret)
++		return ret;
++
++	dmaengine_synchronize(chan);
++
++	return 0;
++}
++
+ static inline int dmaengine_pause(struct dma_chan *chan)
+ {
+ 	if (chan->device->device_pause)
+-- 
+2.8.1
+
diff --git a/target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch b/target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch
new file mode 100644
index 0000000000..96b11a82b6
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch
@@ -0,0 +1,1522 @@
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Subject: [PATCH v6 0/4] Fixes / cleanups in dw_dmac (affects on few subsystems)
+Date: Mon, 25 Apr 2016 15:35:05 +0300
+
+This patch series (v3: http://www.spinics.net/lists/kernel/msg2215303.html)
+contains a number of mostly minor fixes and cleanups for the DW DMA driver. A
+couple of them affect the DT binding so these may need to be updated to
+maintain compatibility (old format is still supported though). The rest should
+be relatively straight-forward.
+
+This version has been tested on the following bare metal platforms:
+- ATNGW100 (avr32 based platform) with dmatest
+- Sam460ex (powerpc 44x based platform) with SATA
+- Intel Braswell with UART
+- Intel Galileo (Intel Quark based platform) with UART
+
+(SATA driver and Intel Galileo UART support are based on this series and just
+ published recently for a review)
+
+Vinod, there are a few patch sets developed on top of this one, so the idea is
+to keep this in an immutable branch / tag.
+
+Changes since v5:
+- fixed an issue found by kbuildbot
+
+Changes since v4:
+- send proper set of patches
+- add changelog
+
+Changes since v3:
+- add patch 1 to check value of dma-masters property
+- drop the upstreamed patches
+- update patch 2 to keep an array for data-width property as well
+
+Changes since v2:
+- add patch 1 to fix master selection which was broken for long time
+- remove "use field-by-field initialization" patch since, as Mans mentioned, it
+  has mostly no value and might even be more error-prone
+- rebase on top of recent linux-next
+- wide testing on several platforms
+
+Changes since v1:
+- zeroing struct dw_dma_slave before use
+- fall back to old data_width property if data-width is not found
+- append tags for few patches
+- correct title of cover letter
+- rebase on top of recent linux-next
+
+Andy Shevchenko (4):
+  dmaengine: dw: platform: check nr_masters to be non-zero
+  dmaengine: dw: revisit data_width property
+  dmaengine: dw: keep entire platform data in struct dw_dma
+  dmaengine: dw: pass platform data via struct dw_dma_chip
+
+ Documentation/devicetree/bindings/dma/snps-dma.txt |  6 +-
+ arch/arc/boot/dts/abilis_tb10x.dtsi                |  2 +-
+ arch/arm/boot/dts/spear13xx.dtsi                   |  4 +-
+ drivers/ata/sata_dwc_460ex.c                       |  2 +-
+ drivers/dma/dw/core.c                              | 75 ++++++++--------------
+ drivers/dma/dw/pci.c                               |  5 +-
+ drivers/dma/dw/platform.c                          | 32 +++++----
+ drivers/dma/dw/regs.h                              |  5 +-
+ include/linux/dma/dw.h                             |  5 +-
+ include/linux/platform_data/dma-dw.h               |  4 +-
+ sound/soc/intel/common/sst-firmware.c              |  2 +-
+ 11 files changed, 64 insertions(+), 78 deletions(-)
+
+--- a/drivers/dma/dw/core.c	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/core.c	2016-05-21 22:47:08.665465180 +0200
+@@ -45,22 +45,19 @@
+ 			DW_DMA_MSIZE_16;			\
+ 		u8 _dmsize = _is_slave ? _sconfig->dst_maxburst :	\
+ 			DW_DMA_MSIZE_16;			\
++		u8 _dms = (_dwc->direction == DMA_MEM_TO_DEV) ?		\
++			_dwc->p_master : _dwc->m_master;		\
++		u8 _sms = (_dwc->direction == DMA_DEV_TO_MEM) ?		\
++			_dwc->p_master : _dwc->m_master;		\
+ 								\
+ 		(DWC_CTLL_DST_MSIZE(_dmsize)			\
+ 		 | DWC_CTLL_SRC_MSIZE(_smsize)			\
+ 		 | DWC_CTLL_LLP_D_EN				\
+ 		 | DWC_CTLL_LLP_S_EN				\
+-		 | DWC_CTLL_DMS(_dwc->dst_master)		\
+-		 | DWC_CTLL_SMS(_dwc->src_master));		\
++		 | DWC_CTLL_DMS(_dms)				\
++		 | DWC_CTLL_SMS(_sms));				\
+ 	})
+ 
+-/*
+- * Number of descriptors to allocate for each channel. This should be
+- * made configurable somehow; preferably, the clients (at least the
+- * ones using slave transfers) should be able to give us a hint.
+- */
+-#define NR_DESCS_PER_CHANNEL	64
+-
+ /* The set of bus widths supported by the DMA controller */
+ #define DW_DMA_BUSWIDTHS			  \
+ 	BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED)	| \
+@@ -80,51 +77,65 @@ static struct dw_desc *dwc_first_active(
+ 	return to_dw_desc(dwc->active_list.next);
+ }
+ 
+-static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc)
++static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
+ {
+-	struct dw_desc *desc, *_desc;
+-	struct dw_desc *ret = NULL;
+-	unsigned int i = 0;
+-	unsigned long flags;
++	struct dw_desc		*desc = txd_to_dw_desc(tx);
++	struct dw_dma_chan	*dwc = to_dw_dma_chan(tx->chan);
++	dma_cookie_t		cookie;
++	unsigned long		flags;
+ 
+ 	spin_lock_irqsave(&dwc->lock, flags);
+-	list_for_each_entry_safe(desc, _desc, &dwc->free_list, desc_node) {
+-		i++;
+-		if (async_tx_test_ack(&desc->txd)) {
+-			list_del(&desc->desc_node);
+-			ret = desc;
+-			break;
+-		}
+-		dev_dbg(chan2dev(&dwc->chan), "desc %p not ACKed\n", desc);
+-	}
++	cookie = dma_cookie_assign(tx);
++
++	/*
++	 * REVISIT: We should attempt to chain as many descriptors as
++	 * possible, perhaps even appending to those already submitted
++	 * for DMA. But this is hard to do in a race-free manner.
++	 */
++
++	list_add_tail(&desc->desc_node, &dwc->queue);
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
++	dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n",
++		 __func__, desc->txd.cookie);
+ 
+-	dev_vdbg(chan2dev(&dwc->chan), "scanned %u descriptors on freelist\n", i);
++	return cookie;
++}
+ 
+-	return ret;
++static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc)
++{
++	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
++	struct dw_desc *desc;
++	dma_addr_t phys;
++
++	desc = dma_pool_zalloc(dw->desc_pool, GFP_ATOMIC, &phys);
++	if (!desc)
++		return NULL;
++
++	dwc->descs_allocated++;
++	INIT_LIST_HEAD(&desc->tx_list);
++	dma_async_tx_descriptor_init(&desc->txd, &dwc->chan);
++	desc->txd.tx_submit = dwc_tx_submit;
++	desc->txd.flags = DMA_CTRL_ACK;
++	desc->txd.phys = phys;
++	return desc;
+ }
+ 
+-/*
+- * Move a descriptor, including any children, to the free list.
+- * `desc' must not be on any lists.
+- */
+ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
+ {
+-	unsigned long flags;
++	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
++	struct dw_desc *child, *_next;
+ 
+-	if (desc) {
+-		struct dw_desc *child;
++	if (unlikely(!desc))
++		return;
+ 
+-		spin_lock_irqsave(&dwc->lock, flags);
+-		list_for_each_entry(child, &desc->tx_list, desc_node)
+-			dev_vdbg(chan2dev(&dwc->chan),
+-					"moving child desc %p to freelist\n",
+-					child);
+-		list_splice_init(&desc->tx_list, &dwc->free_list);
+-		dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc);
+-		list_add(&desc->desc_node, &dwc->free_list);
+-		spin_unlock_irqrestore(&dwc->lock, flags);
++	list_for_each_entry_safe(child, _next, &desc->tx_list, desc_node) {
++		list_del(&child->desc_node);
++		dma_pool_free(dw->desc_pool, child, child->txd.phys);
++		dwc->descs_allocated--;
+ 	}
++
++	dma_pool_free(dw->desc_pool, desc, desc->txd.phys);
++	dwc->descs_allocated--;
+ }
+ 
+ static void dwc_initialize(struct dw_dma_chan *dwc)
+@@ -133,7 +144,7 @@ static void dwc_initialize(struct dw_dma
+ 	u32 cfghi = DWC_CFGH_FIFO_MODE;
+ 	u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority);
+ 
+-	if (dwc->initialized == true)
++	if (test_bit(DW_DMA_IS_INITIALIZED, &dwc->flags))
+ 		return;
+ 
+ 	cfghi |= DWC_CFGH_DST_PER(dwc->dst_id);
+@@ -146,26 +157,11 @@ static void dwc_initialize(struct dw_dma
+ 	channel_set_bit(dw, MASK.XFER, dwc->mask);
+ 	channel_set_bit(dw, MASK.ERROR, dwc->mask);
+ 
+-	dwc->initialized = true;
++	set_bit(DW_DMA_IS_INITIALIZED, &dwc->flags);
+ }
+ 
+ /*----------------------------------------------------------------------*/
+ 
+-static inline unsigned int dwc_fast_ffs(unsigned long long v)
+-{
+-	/*
+-	 * We can be a lot more clever here, but this should take care
+-	 * of the most common optimization.
+-	 */
+-	if (!(v & 7))
+-		return 3;
+-	else if (!(v & 3))
+-		return 2;
+-	else if (!(v & 1))
+-		return 1;
+-	return 0;
+-}
+-
+ static inline void dwc_dump_chan_regs(struct dw_dma_chan *dwc)
+ {
+ 	dev_err(chan2dev(&dwc->chan),
+@@ -197,12 +193,12 @@ static inline void dwc_do_single_block(s
+ 	 * Software emulation of LLP mode relies on interrupts to continue
+ 	 * multi block transfer.
+ 	 */
+-	ctllo = desc->lli.ctllo | DWC_CTLL_INT_EN;
++	ctllo = lli_read(desc, ctllo) | DWC_CTLL_INT_EN;
+ 
+-	channel_writel(dwc, SAR, desc->lli.sar);
+-	channel_writel(dwc, DAR, desc->lli.dar);
++	channel_writel(dwc, SAR, lli_read(desc, sar));
++	channel_writel(dwc, DAR, lli_read(desc, dar));
+ 	channel_writel(dwc, CTL_LO, ctllo);
+-	channel_writel(dwc, CTL_HI, desc->lli.ctlhi);
++	channel_writel(dwc, CTL_HI, lli_read(desc, ctlhi));
+ 	channel_set_bit(dw, CH_EN, dwc->mask);
+ 
+ 	/* Move pointer to next descriptor */
+@@ -213,6 +209,7 @@ static inline void dwc_do_single_block(s
+ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first)
+ {
+ 	struct dw_dma	*dw = to_dw_dma(dwc->chan.device);
++	u8		lms = DWC_LLP_LMS(dwc->m_master);
+ 	unsigned long	was_soft_llp;
+ 
+ 	/* ASSERT:  channel is idle */
+@@ -237,7 +234,7 @@ static void dwc_dostart(struct dw_dma_ch
+ 
+ 		dwc_initialize(dwc);
+ 
+-		dwc->residue = first->total_len;
++		first->residue = first->total_len;
+ 		dwc->tx_node_active = &first->tx_list;
+ 
+ 		/* Submit first block */
+@@ -248,9 +245,8 @@ static void dwc_dostart(struct dw_dma_ch
+ 
+ 	dwc_initialize(dwc);
+ 
+-	channel_writel(dwc, LLP, first->txd.phys);
+-	channel_writel(dwc, CTL_LO,
+-			DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
++	channel_writel(dwc, LLP, first->txd.phys | lms);
++	channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+ 	channel_writel(dwc, CTL_HI, 0);
+ 	channel_set_bit(dw, CH_EN, dwc->mask);
+ }
+@@ -293,11 +289,7 @@ dwc_descriptor_complete(struct dw_dma_ch
+ 	list_for_each_entry(child, &desc->tx_list, desc_node)
+ 		async_tx_ack(&child->txd);
+ 	async_tx_ack(&desc->txd);
+-
+-	list_splice_init(&desc->tx_list, &dwc->free_list);
+-	list_move(&desc->desc_node, &dwc->free_list);
+-
+-	dma_descriptor_unmap(txd);
++	dwc_desc_put(dwc, desc);
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+ 	if (callback)
+@@ -368,11 +360,11 @@ static void dwc_scan_descriptors(struct
+ 
+ 			head = &desc->tx_list;
+ 			if (active != head) {
+-				/* Update desc to reflect last sent one */
+-				if (active != head->next)
+-					desc = to_dw_desc(active->prev);
+-
+-				dwc->residue -= desc->len;
++				/* Update residue to reflect last sent descriptor */
++				if (active == head->next)
++					desc->residue -= desc->len;
++				else
++					desc->residue -= to_dw_desc(active->prev)->len;
+ 
+ 				child = to_dw_desc(active);
+ 
+@@ -387,8 +379,6 @@ static void dwc_scan_descriptors(struct
+ 			clear_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags);
+ 		}
+ 
+-		dwc->residue = 0;
+-
+ 		spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+ 		dwc_complete_all(dw, dwc);
+@@ -396,7 +386,6 @@ static void dwc_scan_descriptors(struct
+ 	}
+ 
+ 	if (list_empty(&dwc->active_list)) {
+-		dwc->residue = 0;
+ 		spin_unlock_irqrestore(&dwc->lock, flags);
+ 		return;
+ 	}
+@@ -411,31 +400,31 @@ static void dwc_scan_descriptors(struct
+ 
+ 	list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) {
+ 		/* Initial residue value */
+-		dwc->residue = desc->total_len;
++		desc->residue = desc->total_len;
+ 
+ 		/* Check first descriptors addr */
+-		if (desc->txd.phys == llp) {
++		if (desc->txd.phys == DWC_LLP_LOC(llp)) {
+ 			spin_unlock_irqrestore(&dwc->lock, flags);
+ 			return;
+ 		}
+ 
+ 		/* Check first descriptors llp */
+-		if (desc->lli.llp == llp) {
++		if (lli_read(desc, llp) == llp) {
+ 			/* This one is currently in progress */
+-			dwc->residue -= dwc_get_sent(dwc);
++			desc->residue -= dwc_get_sent(dwc);
+ 			spin_unlock_irqrestore(&dwc->lock, flags);
+ 			return;
+ 		}
+ 
+-		dwc->residue -= desc->len;
++		desc->residue -= desc->len;
+ 		list_for_each_entry(child, &desc->tx_list, desc_node) {
+-			if (child->lli.llp == llp) {
++			if (lli_read(child, llp) == llp) {
+ 				/* Currently in progress */
+-				dwc->residue -= dwc_get_sent(dwc);
++				desc->residue -= dwc_get_sent(dwc);
+ 				spin_unlock_irqrestore(&dwc->lock, flags);
+ 				return;
+ 			}
+-			dwc->residue -= child->len;
++			desc->residue -= child->len;
+ 		}
+ 
+ 		/*
+@@ -457,10 +446,14 @@ static void dwc_scan_descriptors(struct
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ }
+ 
+-static inline void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_lli *lli)
++static inline void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_desc *desc)
+ {
+ 	dev_crit(chan2dev(&dwc->chan), "  desc: s0x%x d0x%x l0x%x c0x%x:%x\n",
+-		 lli->sar, lli->dar, lli->llp, lli->ctlhi, lli->ctllo);
++		 lli_read(desc, sar),
++		 lli_read(desc, dar),
++		 lli_read(desc, llp),
++		 lli_read(desc, ctlhi),
++		 lli_read(desc, ctllo));
+ }
+ 
+ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
+@@ -496,9 +489,9 @@ static void dwc_handle_error(struct dw_d
+ 	 */
+ 	dev_WARN(chan2dev(&dwc->chan), "Bad descriptor submitted for DMA!\n"
+ 				       "  cookie: %d\n", bad_desc->txd.cookie);
+-	dwc_dump_lli(dwc, &bad_desc->lli);
++	dwc_dump_lli(dwc, bad_desc);
+ 	list_for_each_entry(child, &bad_desc->tx_list, desc_node)
+-		dwc_dump_lli(dwc, &child->lli);
++		dwc_dump_lli(dwc, child);
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+@@ -549,7 +542,7 @@ static void dwc_handle_cyclic(struct dw_
+ 	 */
+ 	if (unlikely(status_err & dwc->mask) ||
+ 			unlikely(status_xfer & dwc->mask)) {
+-		int i;
++		unsigned int i;
+ 
+ 		dev_err(chan2dev(&dwc->chan),
+ 			"cyclic DMA unexpected %s interrupt, stopping DMA transfer\n",
+@@ -571,7 +564,7 @@ static void dwc_handle_cyclic(struct dw_
+ 		dma_writel(dw, CLEAR.XFER, dwc->mask);
+ 
+ 		for (i = 0; i < dwc->cdesc->periods; i++)
+-			dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli);
++			dwc_dump_lli(dwc, dwc->cdesc->desc[i]);
+ 
+ 		spin_unlock_irqrestore(&dwc->lock, flags);
+ 	}
+@@ -589,7 +582,7 @@ static void dw_dma_tasklet(unsigned long
+ 	u32 status_block;
+ 	u32 status_xfer;
+ 	u32 status_err;
+-	int i;
++	unsigned int i;
+ 
+ 	status_block = dma_readl(dw, RAW.BLOCK);
+ 	status_xfer = dma_readl(dw, RAW.XFER);
+@@ -616,12 +609,17 @@ static void dw_dma_tasklet(unsigned long
+ static irqreturn_t dw_dma_interrupt(int irq, void *dev_id)
+ {
+ 	struct dw_dma *dw = dev_id;
+-	u32 status = dma_readl(dw, STATUS_INT);
++	u32 status;
++
++	/* Check if we have any interrupt from the DMAC which is not in use */
++	if (!dw->in_use)
++		return IRQ_NONE;
+ 
++	status = dma_readl(dw, STATUS_INT);
+ 	dev_vdbg(dw->dma.dev, "%s: status=0x%x\n", __func__, status);
+ 
+ 	/* Check if we have any interrupt from the DMAC */
+-	if (!status || !dw->in_use)
++	if (!status)
+ 		return IRQ_NONE;
+ 
+ 	/*
+@@ -653,30 +651,6 @@ static irqreturn_t dw_dma_interrupt(int
+ 
+ /*----------------------------------------------------------------------*/
+ 
+-static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
+-{
+-	struct dw_desc		*desc = txd_to_dw_desc(tx);
+-	struct dw_dma_chan	*dwc = to_dw_dma_chan(tx->chan);
+-	dma_cookie_t		cookie;
+-	unsigned long		flags;
+-
+-	spin_lock_irqsave(&dwc->lock, flags);
+-	cookie = dma_cookie_assign(tx);
+-
+-	/*
+-	 * REVISIT: We should attempt to chain as many descriptors as
+-	 * possible, perhaps even appending to those already submitted
+-	 * for DMA. But this is hard to do in a race-free manner.
+-	 */
+-
+-	dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n", __func__, desc->txd.cookie);
+-	list_add_tail(&desc->desc_node, &dwc->queue);
+-
+-	spin_unlock_irqrestore(&dwc->lock, flags);
+-
+-	return cookie;
+-}
+-
+ static struct dma_async_tx_descriptor *
+ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+ 		size_t len, unsigned long flags)
+@@ -688,10 +662,12 @@ dwc_prep_dma_memcpy(struct dma_chan *cha
+ 	struct dw_desc		*prev;
+ 	size_t			xfer_count;
+ 	size_t			offset;
++	u8			m_master = dwc->m_master;
+ 	unsigned int		src_width;
+ 	unsigned int		dst_width;
+-	unsigned int		data_width;
++	unsigned int		data_width = dw->pdata->data_width[m_master];
+ 	u32			ctllo;
++	u8			lms = DWC_LLP_LMS(m_master);
+ 
+ 	dev_vdbg(chan2dev(chan),
+ 			"%s: d%pad s%pad l0x%zx f0x%lx\n", __func__,
+@@ -704,11 +680,7 @@ dwc_prep_dma_memcpy(struct dma_chan *cha
+ 
+ 	dwc->direction = DMA_MEM_TO_MEM;
+ 
+-	data_width = min_t(unsigned int, dw->data_width[dwc->src_master],
+-			   dw->data_width[dwc->dst_master]);
+-
+-	src_width = dst_width = min_t(unsigned int, data_width,
+-				      dwc_fast_ffs(src | dest | len));
++	src_width = dst_width = __ffs(data_width | src | dest | len);
+ 
+ 	ctllo = DWC_DEFAULT_CTLLO(chan)
+ 			| DWC_CTLL_DST_WIDTH(dst_width)
+@@ -726,27 +698,27 @@ dwc_prep_dma_memcpy(struct dma_chan *cha
+ 		if (!desc)
+ 			goto err_desc_get;
+ 
+-		desc->lli.sar = src + offset;
+-		desc->lli.dar = dest + offset;
+-		desc->lli.ctllo = ctllo;
+-		desc->lli.ctlhi = xfer_count;
++		lli_write(desc, sar, src + offset);
++		lli_write(desc, dar, dest + offset);
++		lli_write(desc, ctllo, ctllo);
++		lli_write(desc, ctlhi, xfer_count);
+ 		desc->len = xfer_count << src_width;
+ 
+ 		if (!first) {
+ 			first = desc;
+ 		} else {
+-			prev->lli.llp = desc->txd.phys;
+-			list_add_tail(&desc->desc_node,
+-					&first->tx_list);
++			lli_write(prev, llp, desc->txd.phys | lms);
++			list_add_tail(&desc->desc_node, &first->tx_list);
+ 		}
+ 		prev = desc;
+ 	}
+ 
+ 	if (flags & DMA_PREP_INTERRUPT)
+ 		/* Trigger interrupt after last block */
+-		prev->lli.ctllo |= DWC_CTLL_INT_EN;
++		lli_set(prev, ctllo, DWC_CTLL_INT_EN);
+ 
+ 	prev->lli.llp = 0;
++	lli_clear(prev, ctllo, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+ 	first->txd.flags = flags;
+ 	first->total_len = len;
+ 
+@@ -768,10 +740,12 @@ dwc_prep_slave_sg(struct dma_chan *chan,
+ 	struct dw_desc		*prev;
+ 	struct dw_desc		*first;
+ 	u32			ctllo;
++	u8			m_master = dwc->m_master;
++	u8			lms = DWC_LLP_LMS(m_master);
+ 	dma_addr_t		reg;
+ 	unsigned int		reg_width;
+ 	unsigned int		mem_width;
+-	unsigned int		data_width;
++	unsigned int		data_width = dw->pdata->data_width[m_master];
+ 	unsigned int		i;
+ 	struct scatterlist	*sg;
+ 	size_t			total_len = 0;
+@@ -797,8 +771,6 @@ dwc_prep_slave_sg(struct dma_chan *chan,
+ 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
+ 			DWC_CTLL_FC(DW_DMA_FC_D_M2P);
+ 
+-		data_width = dw->data_width[dwc->src_master];
+-
+ 		for_each_sg(sgl, sg, sg_len, i) {
+ 			struct dw_desc	*desc;
+ 			u32		len, dlen, mem;
+@@ -806,17 +778,16 @@ dwc_prep_slave_sg(struct dma_chan *chan,
+ 			mem = sg_dma_address(sg);
+ 			len = sg_dma_len(sg);
+ 
+-			mem_width = min_t(unsigned int,
+-					  data_width, dwc_fast_ffs(mem | len));
++			mem_width = __ffs(data_width | mem | len);
+ 
+ slave_sg_todev_fill_desc:
+ 			desc = dwc_desc_get(dwc);
+ 			if (!desc)
+ 				goto err_desc_get;
+ 
+-			desc->lli.sar = mem;
+-			desc->lli.dar = reg;
+-			desc->lli.ctllo = ctllo | DWC_CTLL_SRC_WIDTH(mem_width);
++			lli_write(desc, sar, mem);
++			lli_write(desc, dar, reg);
++			lli_write(desc, ctllo, ctllo | DWC_CTLL_SRC_WIDTH(mem_width));
+ 			if ((len >> mem_width) > dwc->block_size) {
+ 				dlen = dwc->block_size << mem_width;
+ 				mem += dlen;
+@@ -826,15 +797,14 @@ slave_sg_todev_fill_desc:
+ 				len = 0;
+ 			}
+ 
+-			desc->lli.ctlhi = dlen >> mem_width;
++			lli_write(desc, ctlhi, dlen >> mem_width);
+ 			desc->len = dlen;
+ 
+ 			if (!first) {
+ 				first = desc;
+ 			} else {
+-				prev->lli.llp = desc->txd.phys;
+-				list_add_tail(&desc->desc_node,
+-						&first->tx_list);
++				lli_write(prev, llp, desc->txd.phys | lms);
++				list_add_tail(&desc->desc_node, &first->tx_list);
+ 			}
+ 			prev = desc;
+ 			total_len += dlen;
+@@ -854,8 +824,6 @@ slave_sg_todev_fill_desc:
+ 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
+ 			DWC_CTLL_FC(DW_DMA_FC_D_P2M);
+ 
+-		data_width = dw->data_width[dwc->dst_master];
+-
+ 		for_each_sg(sgl, sg, sg_len, i) {
+ 			struct dw_desc	*desc;
+ 			u32		len, dlen, mem;
+@@ -863,17 +831,16 @@ slave_sg_todev_fill_desc:
+ 			mem = sg_dma_address(sg);
+ 			len = sg_dma_len(sg);
+ 
+-			mem_width = min_t(unsigned int,
+-					  data_width, dwc_fast_ffs(mem | len));
++			mem_width = __ffs(data_width | mem | len);
+ 
+ slave_sg_fromdev_fill_desc:
+ 			desc = dwc_desc_get(dwc);
+ 			if (!desc)
+ 				goto err_desc_get;
+ 
+-			desc->lli.sar = reg;
+-			desc->lli.dar = mem;
+-			desc->lli.ctllo = ctllo | DWC_CTLL_DST_WIDTH(mem_width);
++			lli_write(desc, sar, reg);
++			lli_write(desc, dar, mem);
++			lli_write(desc, ctllo, ctllo | DWC_CTLL_DST_WIDTH(mem_width));
+ 			if ((len >> reg_width) > dwc->block_size) {
+ 				dlen = dwc->block_size << reg_width;
+ 				mem += dlen;
+@@ -882,15 +849,14 @@ slave_sg_fromdev_fill_desc:
+ 				dlen = len;
+ 				len = 0;
+ 			}
+-			desc->lli.ctlhi = dlen >> reg_width;
++			lli_write(desc, ctlhi, dlen >> reg_width);
+ 			desc->len = dlen;
+ 
+ 			if (!first) {
+ 				first = desc;
+ 			} else {
+-				prev->lli.llp = desc->txd.phys;
+-				list_add_tail(&desc->desc_node,
+-						&first->tx_list);
++				lli_write(prev, llp, desc->txd.phys | lms);
++				list_add_tail(&desc->desc_node, &first->tx_list);
+ 			}
+ 			prev = desc;
+ 			total_len += dlen;
+@@ -905,9 +871,10 @@ slave_sg_fromdev_fill_desc:
+ 
+ 	if (flags & DMA_PREP_INTERRUPT)
+ 		/* Trigger interrupt after last block */
+-		prev->lli.ctllo |= DWC_CTLL_INT_EN;
++		lli_set(prev, ctllo, DWC_CTLL_INT_EN);
+ 
+ 	prev->lli.llp = 0;
++	lli_clear(prev, ctllo, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+ 	first->total_len = total_len;
+ 
+ 	return &first->txd;
+@@ -932,8 +899,8 @@ bool dw_dma_filter(struct dma_chan *chan
+ 	dwc->src_id = dws->src_id;
+ 	dwc->dst_id = dws->dst_id;
+ 
+-	dwc->src_master = dws->src_master;
+-	dwc->dst_master = dws->dst_master;
++	dwc->m_master = dws->m_master;
++	dwc->p_master = dws->p_master;
+ 
+ 	return true;
+ }
+@@ -986,7 +953,7 @@ static int dwc_pause(struct dma_chan *ch
+ 	while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY) && count--)
+ 		udelay(2);
+ 
+-	dwc->paused = true;
++	set_bit(DW_DMA_IS_PAUSED, &dwc->flags);
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+@@ -999,7 +966,7 @@ static inline void dwc_chan_resume(struc
+ 
+ 	channel_writel(dwc, CFG_LO, cfglo & ~DWC_CFGL_CH_SUSP);
+ 
+-	dwc->paused = false;
++	clear_bit(DW_DMA_IS_PAUSED, &dwc->flags);
+ }
+ 
+ static int dwc_resume(struct dma_chan *chan)
+@@ -1007,12 +974,10 @@ static int dwc_resume(struct dma_chan *c
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	unsigned long		flags;
+ 
+-	if (!dwc->paused)
+-		return 0;
+-
+ 	spin_lock_irqsave(&dwc->lock, flags);
+ 
+-	dwc_chan_resume(dwc);
++	if (test_bit(DW_DMA_IS_PAUSED, &dwc->flags))
++		dwc_chan_resume(dwc);
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+@@ -1048,16 +1013,37 @@ static int dwc_terminate_all(struct dma_
+ 	return 0;
+ }
+ 
+-static inline u32 dwc_get_residue(struct dw_dma_chan *dwc)
++static struct dw_desc *dwc_find_desc(struct dw_dma_chan *dwc, dma_cookie_t c)
++{
++	struct dw_desc *desc;
++
++	list_for_each_entry(desc, &dwc->active_list, desc_node)
++		if (desc->txd.cookie == c)
++			return desc;
++
++	return NULL;
++}
++
++static u32 dwc_get_residue(struct dw_dma_chan *dwc, dma_cookie_t cookie)
+ {
++	struct dw_desc *desc;
+ 	unsigned long flags;
+ 	u32 residue;
+ 
+ 	spin_lock_irqsave(&dwc->lock, flags);
+ 
+-	residue = dwc->residue;
+-	if (test_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags) && residue)
+-		residue -= dwc_get_sent(dwc);
++	desc = dwc_find_desc(dwc, cookie);
++	if (desc) {
++		if (desc == dwc_first_active(dwc)) {
++			residue = desc->residue;
++			if (test_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags) && residue)
++				residue -= dwc_get_sent(dwc);
++		} else {
++			residue = desc->total_len;
++		}
++	} else {
++		residue = 0;
++	}
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 	return residue;
+@@ -1078,10 +1064,12 @@ dwc_tx_status(struct dma_chan *chan,
+ 	dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
+ 
+ 	ret = dma_cookie_status(chan, cookie, txstate);
+-	if (ret != DMA_COMPLETE)
+-		dma_set_residue(txstate, dwc_get_residue(dwc));
++	if (ret == DMA_COMPLETE)
++		return ret;
++
++	dma_set_residue(txstate, dwc_get_residue(dwc, cookie));
+ 
+-	if (dwc->paused && ret == DMA_IN_PROGRESS)
++	if (test_bit(DW_DMA_IS_PAUSED, &dwc->flags) && ret == DMA_IN_PROGRESS)
+ 		return DMA_PAUSED;
+ 
+ 	return ret;
+@@ -1102,7 +1090,7 @@ static void dwc_issue_pending(struct dma
+ 
+ static void dw_dma_off(struct dw_dma *dw)
+ {
+-	int i;
++	unsigned int i;
+ 
+ 	dma_writel(dw, CFG, 0);
+ 
+@@ -1116,7 +1104,7 @@ static void dw_dma_off(struct dw_dma *dw
+ 		cpu_relax();
+ 
+ 	for (i = 0; i < dw->dma.chancnt; i++)
+-		dw->chan[i].initialized = false;
++		clear_bit(DW_DMA_IS_INITIALIZED, &dw->chan[i].flags);
+ }
+ 
+ static void dw_dma_on(struct dw_dma *dw)
+@@ -1128,9 +1116,6 @@ static int dwc_alloc_chan_resources(stru
+ {
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	struct dw_dma		*dw = to_dw_dma(chan->device);
+-	struct dw_desc		*desc;
+-	int			i;
+-	unsigned long		flags;
+ 
+ 	dev_vdbg(chan2dev(chan), "%s\n", __func__);
+ 
+@@ -1161,48 +1146,13 @@ static int dwc_alloc_chan_resources(stru
+ 		dw_dma_on(dw);
+ 	dw->in_use |= dwc->mask;
+ 
+-	spin_lock_irqsave(&dwc->lock, flags);
+-	i = dwc->descs_allocated;
+-	while (dwc->descs_allocated < NR_DESCS_PER_CHANNEL) {
+-		dma_addr_t phys;
+-
+-		spin_unlock_irqrestore(&dwc->lock, flags);
+-
+-		desc = dma_pool_alloc(dw->desc_pool, GFP_ATOMIC, &phys);
+-		if (!desc)
+-			goto err_desc_alloc;
+-
+-		memset(desc, 0, sizeof(struct dw_desc));
+-
+-		INIT_LIST_HEAD(&desc->tx_list);
+-		dma_async_tx_descriptor_init(&desc->txd, chan);
+-		desc->txd.tx_submit = dwc_tx_submit;
+-		desc->txd.flags = DMA_CTRL_ACK;
+-		desc->txd.phys = phys;
+-
+-		dwc_desc_put(dwc, desc);
+-
+-		spin_lock_irqsave(&dwc->lock, flags);
+-		i = ++dwc->descs_allocated;
+-	}
+-
+-	spin_unlock_irqrestore(&dwc->lock, flags);
+-
+-	dev_dbg(chan2dev(chan), "%s: allocated %d descriptors\n", __func__, i);
+-
+-	return i;
+-
+-err_desc_alloc:
+-	dev_info(chan2dev(chan), "only allocated %d descriptors\n", i);
+-
+-	return i;
++	return 0;
+ }
+ 
+ static void dwc_free_chan_resources(struct dma_chan *chan)
+ {
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	struct dw_dma		*dw = to_dw_dma(chan->device);
+-	struct dw_desc		*desc, *_desc;
+ 	unsigned long		flags;
+ 	LIST_HEAD(list);
+ 
+@@ -1215,17 +1165,15 @@ static void dwc_free_chan_resources(stru
+ 	BUG_ON(dma_readl(to_dw_dma(chan->device), CH_EN) & dwc->mask);
+ 
+ 	spin_lock_irqsave(&dwc->lock, flags);
+-	list_splice_init(&dwc->free_list, &list);
+-	dwc->descs_allocated = 0;
+ 
+ 	/* Clear custom channel configuration */
+ 	dwc->src_id = 0;
+ 	dwc->dst_id = 0;
+ 
+-	dwc->src_master = 0;
+-	dwc->dst_master = 0;
++	dwc->m_master = 0;
++	dwc->p_master = 0;
+ 
+-	dwc->initialized = false;
++	clear_bit(DW_DMA_IS_INITIALIZED, &dwc->flags);
+ 
+ 	/* Disable interrupts */
+ 	channel_clear_bit(dw, MASK.XFER, dwc->mask);
+@@ -1239,11 +1187,6 @@ static void dwc_free_chan_resources(stru
+ 	if (!dw->in_use)
+ 		dw_dma_off(dw);
+ 
+-	list_for_each_entry_safe(desc, _desc, &list, desc_node) {
+-		dev_vdbg(chan2dev(chan), "  freeing descriptor %p\n", desc);
+-		dma_pool_free(dw->desc_pool, desc, desc->txd.phys);
+-	}
+-
+ 	dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
+ }
+ 
+@@ -1321,6 +1264,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_pre
+ 	struct dw_cyclic_desc		*retval = NULL;
+ 	struct dw_desc			*desc;
+ 	struct dw_desc			*last = NULL;
++	u8				lms = DWC_LLP_LMS(dwc->m_master);
+ 	unsigned long			was_cyclic;
+ 	unsigned int			reg_width;
+ 	unsigned int			periods;
+@@ -1374,9 +1318,6 @@ struct dw_cyclic_desc *dw_dma_cyclic_pre
+ 
+ 	retval = ERR_PTR(-ENOMEM);
+ 
+-	if (periods > NR_DESCS_PER_CHANNEL)
+-		goto out_err;
+-
+ 	cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL);
+ 	if (!cdesc)
+ 		goto out_err;
+@@ -1392,50 +1333,50 @@ struct dw_cyclic_desc *dw_dma_cyclic_pre
+ 
+ 		switch (direction) {
+ 		case DMA_MEM_TO_DEV:
+-			desc->lli.dar = sconfig->dst_addr;
+-			desc->lli.sar = buf_addr + (period_len * i);
+-			desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan)
+-					| DWC_CTLL_DST_WIDTH(reg_width)
+-					| DWC_CTLL_SRC_WIDTH(reg_width)
+-					| DWC_CTLL_DST_FIX
+-					| DWC_CTLL_SRC_INC
+-					| DWC_CTLL_INT_EN);
+-
+-			desc->lli.ctllo |= sconfig->device_fc ?
+-				DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
+-				DWC_CTLL_FC(DW_DMA_FC_D_M2P);
++			lli_write(desc, dar, sconfig->dst_addr);
++			lli_write(desc, sar, buf_addr + period_len * i);
++			lli_write(desc, ctllo, (DWC_DEFAULT_CTLLO(chan)
++				| DWC_CTLL_DST_WIDTH(reg_width)
++				| DWC_CTLL_SRC_WIDTH(reg_width)
++				| DWC_CTLL_DST_FIX
++				| DWC_CTLL_SRC_INC
++				| DWC_CTLL_INT_EN));
++
++			lli_set(desc, ctllo, sconfig->device_fc ?
++					DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
++					DWC_CTLL_FC(DW_DMA_FC_D_M2P));
+ 
+ 			break;
+ 		case DMA_DEV_TO_MEM:
+-			desc->lli.dar = buf_addr + (period_len * i);
+-			desc->lli.sar = sconfig->src_addr;
+-			desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan)
+-					| DWC_CTLL_SRC_WIDTH(reg_width)
+-					| DWC_CTLL_DST_WIDTH(reg_width)
+-					| DWC_CTLL_DST_INC
+-					| DWC_CTLL_SRC_FIX
+-					| DWC_CTLL_INT_EN);
+-
+-			desc->lli.ctllo |= sconfig->device_fc ?
+-				DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
+-				DWC_CTLL_FC(DW_DMA_FC_D_P2M);
++			lli_write(desc, dar, buf_addr + period_len * i);
++			lli_write(desc, sar, sconfig->src_addr);
++			lli_write(desc, ctllo, (DWC_DEFAULT_CTLLO(chan)
++				| DWC_CTLL_SRC_WIDTH(reg_width)
++				| DWC_CTLL_DST_WIDTH(reg_width)
++				| DWC_CTLL_DST_INC
++				| DWC_CTLL_SRC_FIX
++				| DWC_CTLL_INT_EN));
++
++			lli_set(desc, ctllo, sconfig->device_fc ?
++					DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
++					DWC_CTLL_FC(DW_DMA_FC_D_P2M));
+ 
+ 			break;
+ 		default:
+ 			break;
+ 		}
+ 
+-		desc->lli.ctlhi = (period_len >> reg_width);
++		lli_write(desc, ctlhi, period_len >> reg_width);
+ 		cdesc->desc[i] = desc;
+ 
+ 		if (last)
+-			last->lli.llp = desc->txd.phys;
++			lli_write(last, llp, desc->txd.phys | lms);
+ 
+ 		last = desc;
+ 	}
+ 
+ 	/* Let's make a cyclic list */
+-	last->lli.llp = cdesc->desc[0]->txd.phys;
++	lli_write(last, llp, cdesc->desc[0]->txd.phys | lms);
+ 
+ 	dev_dbg(chan2dev(&dwc->chan),
+ 			"cyclic prepared buf %pad len %zu period %zu periods %d\n",
+@@ -1466,7 +1407,7 @@ void dw_dma_cyclic_free(struct dma_chan
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+ 	struct dw_cyclic_desc	*cdesc = dwc->cdesc;
+-	int			i;
++	unsigned int		i;
+ 	unsigned long		flags;
+ 
+ 	dev_dbg(chan2dev(&dwc->chan), "%s\n", __func__);
+@@ -1490,32 +1431,38 @@ void dw_dma_cyclic_free(struct dma_chan
+ 	kfree(cdesc->desc);
+ 	kfree(cdesc);
+ 
++	dwc->cdesc = NULL;
++
+ 	clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+ }
+ EXPORT_SYMBOL(dw_dma_cyclic_free);
+ 
+ /*----------------------------------------------------------------------*/
+ 
+-int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
++int dw_dma_probe(struct dw_dma_chip *chip)
+ {
++	struct dw_dma_platform_data *pdata;
+ 	struct dw_dma		*dw;
+ 	bool			autocfg = false;
+ 	unsigned int		dw_params;
+-	unsigned int		max_blk_size = 0;
++	unsigned int		i;
+ 	int			err;
+-	int			i;
+ 
+ 	dw = devm_kzalloc(chip->dev, sizeof(*dw), GFP_KERNEL);
+ 	if (!dw)
+ 		return -ENOMEM;
+ 
++	dw->pdata = devm_kzalloc(chip->dev, sizeof(*dw->pdata), GFP_KERNEL);
++	if (!dw->pdata)
++		return -ENOMEM;
++
+ 	dw->regs = chip->regs;
+ 	chip->dw = dw;
+ 
+ 	pm_runtime_get_sync(chip->dev);
+ 
+-	if (!pdata) {
+-		dw_params = dma_read_byaddr(chip->regs, DW_PARAMS);
++	if (!chip->pdata) {
++		dw_params = dma_readl(dw, DW_PARAMS);
+ 		dev_dbg(chip->dev, "DW_PARAMS: 0x%08x\n", dw_params);
+ 
+ 		autocfg = dw_params >> DW_PARAMS_EN & 1;
+@@ -1524,29 +1471,31 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 			goto err_pdata;
+ 		}
+ 
+-		pdata = devm_kzalloc(chip->dev, sizeof(*pdata), GFP_KERNEL);
+-		if (!pdata) {
+-			err = -ENOMEM;
+-			goto err_pdata;
+-		}
++		/* Reassign the platform data pointer */
++		pdata = dw->pdata;
+ 
+ 		/* Get hardware configuration parameters */
+ 		pdata->nr_channels = (dw_params >> DW_PARAMS_NR_CHAN & 7) + 1;
+ 		pdata->nr_masters = (dw_params >> DW_PARAMS_NR_MASTER & 3) + 1;
+ 		for (i = 0; i < pdata->nr_masters; i++) {
+ 			pdata->data_width[i] =
+-				(dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3) + 2;
++				4 << (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3);
+ 		}
+-		max_blk_size = dma_readl(dw, MAX_BLK_SIZE);
++		pdata->block_size = dma_readl(dw, MAX_BLK_SIZE);
+ 
+ 		/* Fill platform data with the default values */
+ 		pdata->is_private = true;
+ 		pdata->is_memcpy = true;
+ 		pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING;
+ 		pdata->chan_priority = CHAN_PRIORITY_ASCENDING;
+-	} else if (pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
++	} else if (chip->pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
+ 		err = -EINVAL;
+ 		goto err_pdata;
++	} else {
++		memcpy(dw->pdata, chip->pdata, sizeof(*dw->pdata));
++
++		/* Reassign the platform data pointer */
++		pdata = dw->pdata;
+ 	}
+ 
+ 	dw->chan = devm_kcalloc(chip->dev, pdata->nr_channels, sizeof(*dw->chan),
+@@ -1556,11 +1505,6 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 		goto err_pdata;
+ 	}
+ 
+-	/* Get hardware configuration parameters */
+-	dw->nr_masters = pdata->nr_masters;
+-	for (i = 0; i < dw->nr_masters; i++)
+-		dw->data_width[i] = pdata->data_width[i];
+-
+ 	/* Calculate all channel mask before DMA setup */
+ 	dw->all_chan_mask = (1 << pdata->nr_channels) - 1;
+ 
+@@ -1607,7 +1551,6 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 
+ 		INIT_LIST_HEAD(&dwc->active_list);
+ 		INIT_LIST_HEAD(&dwc->queue);
+-		INIT_LIST_HEAD(&dwc->free_list);
+ 
+ 		channel_clear_bit(dw, CH_EN, dwc->mask);
+ 
+@@ -1615,11 +1558,9 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 
+ 		/* Hardware configuration */
+ 		if (autocfg) {
+-			unsigned int dwc_params;
+ 			unsigned int r = DW_DMA_MAX_NR_CHANNELS - i - 1;
+-			void __iomem *addr = chip->regs + r * sizeof(u32);
+-
+-			dwc_params = dma_read_byaddr(addr, DWC_PARAMS);
++			void __iomem *addr = &__dw_regs(dw)->DWC_PARAMS[r];
++			unsigned int dwc_params = dma_readl_native(addr);
+ 
+ 			dev_dbg(chip->dev, "DWC_PARAMS[%d]: 0x%08x\n", i,
+ 					   dwc_params);
+@@ -1630,16 +1571,15 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 			 * up to 0x0a for 4095.
+ 			 */
+ 			dwc->block_size =
+-				(4 << ((max_blk_size >> 4 * i) & 0xf)) - 1;
++				(4 << ((pdata->block_size >> 4 * i) & 0xf)) - 1;
+ 			dwc->nollp =
+ 				(dwc_params >> DWC_PARAMS_MBLK_EN & 0x1) == 0;
+ 		} else {
+ 			dwc->block_size = pdata->block_size;
+ 
+ 			/* Check if channel supports multi block transfer */
+-			channel_writel(dwc, LLP, 0xfffffffc);
+-			dwc->nollp =
+-				(channel_readl(dwc, LLP) & 0xfffffffc) == 0;
++			channel_writel(dwc, LLP, DWC_LLP_LOC(0xffffffff));
++			dwc->nollp = DWC_LLP_LOC(channel_readl(dwc, LLP)) == 0;
+ 			channel_writel(dwc, LLP, 0);
+ 		}
+ 	}
+--- a/drivers/dma/dw/pci.c	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/pci.c	2016-05-21 22:47:08.665465180 +0200
+@@ -17,8 +17,8 @@
+ 
+ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
+ {
++	const struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
+ 	struct dw_dma_chip *chip;
+-	struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
+ 	int ret;
+ 
+ 	ret = pcim_enable_device(pdev);
+@@ -49,8 +49,9 @@ static int dw_pci_probe(struct pci_dev *
+ 	chip->dev = &pdev->dev;
+ 	chip->regs = pcim_iomap_table(pdev)[0];
+ 	chip->irq = pdev->irq;
++	chip->pdata = pdata;
+ 
+-	ret = dw_dma_probe(chip, pdata);
++	ret = dw_dma_probe(chip);
+ 	if (ret)
+ 		return ret;
+ 
+@@ -108,6 +109,10 @@ static const struct pci_device_id dw_pci
+ 
+ 	/* Haswell */
+ 	{ PCI_VDEVICE(INTEL, 0x9c60) },
++
++	/* Broadwell */
++	{ PCI_VDEVICE(INTEL, 0x9ce0) },
++
+ 	{ }
+ };
+ MODULE_DEVICE_TABLE(pci, dw_pci_id_table);
+--- a/drivers/dma/dw/platform.c	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/platform.c	2016-05-21 22:47:08.665465180 +0200
+@@ -42,13 +42,13 @@ static struct dma_chan *dw_dma_of_xlate(
+ 
+ 	slave.src_id = dma_spec->args[0];
+ 	slave.dst_id = dma_spec->args[0];
+-	slave.src_master = dma_spec->args[1];
+-	slave.dst_master = dma_spec->args[2];
++	slave.m_master = dma_spec->args[1];
++	slave.p_master = dma_spec->args[2];
+ 
+ 	if (WARN_ON(slave.src_id >= DW_DMA_MAX_NR_REQUESTS ||
+ 		    slave.dst_id >= DW_DMA_MAX_NR_REQUESTS ||
+-		    slave.src_master >= dw->nr_masters ||
+-		    slave.dst_master >= dw->nr_masters))
++		    slave.m_master >= dw->pdata->nr_masters ||
++		    slave.p_master >= dw->pdata->nr_masters))
+ 		return NULL;
+ 
+ 	dma_cap_zero(cap);
+@@ -66,8 +66,8 @@ static bool dw_dma_acpi_filter(struct dm
+ 		.dma_dev = dma_spec->dev,
+ 		.src_id = dma_spec->slave_id,
+ 		.dst_id = dma_spec->slave_id,
+-		.src_master = 1,
+-		.dst_master = 0,
++		.m_master = 0,
++		.p_master = 1,
+ 	};
+ 
+ 	return dw_dma_filter(chan, &slave);
+@@ -103,18 +103,28 @@ dw_dma_parse_dt(struct platform_device *
+ 	struct device_node *np = pdev->dev.of_node;
+ 	struct dw_dma_platform_data *pdata;
+ 	u32 tmp, arr[DW_DMA_MAX_NR_MASTERS];
++	u32 nr_masters;
++	u32 nr_channels;
+ 
+ 	if (!np) {
+ 		dev_err(&pdev->dev, "Missing DT data\n");
+ 		return NULL;
+ 	}
+ 
++	if (of_property_read_u32(np, "dma-masters", &nr_masters))
++		return NULL;
++	if (nr_masters < 1 || nr_masters > DW_DMA_MAX_NR_MASTERS)
++		return NULL;
++
++	if (of_property_read_u32(np, "dma-channels", &nr_channels))
++		return NULL;
++
+ 	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+ 	if (!pdata)
+ 		return NULL;
+ 
+-	if (of_property_read_u32(np, "dma-channels", &pdata->nr_channels))
+-		return NULL;
++	pdata->nr_masters = nr_masters;
++	pdata->nr_channels = nr_channels;
+ 
+ 	if (of_property_read_bool(np, "is_private"))
+ 		pdata->is_private = true;
+@@ -128,17 +138,13 @@ dw_dma_parse_dt(struct platform_device *
+ 	if (!of_property_read_u32(np, "block_size", &tmp))
+ 		pdata->block_size = tmp;
+ 
+-	if (!of_property_read_u32(np, "dma-masters", &tmp)) {
+-		if (tmp > DW_DMA_MAX_NR_MASTERS)
+-			return NULL;
+-
+-		pdata->nr_masters = tmp;
+-	}
+-
+-	if (!of_property_read_u32_array(np, "data_width", arr,
+-				pdata->nr_masters))
+-		for (tmp = 0; tmp < pdata->nr_masters; tmp++)
++	if (!of_property_read_u32_array(np, "data-width", arr, nr_masters)) {
++		for (tmp = 0; tmp < nr_masters; tmp++)
+ 			pdata->data_width[tmp] = arr[tmp];
++	} else if (!of_property_read_u32_array(np, "data_width", arr, nr_masters)) {
++		for (tmp = 0; tmp < nr_masters; tmp++)
++			pdata->data_width[tmp] = BIT(arr[tmp] & 0x07);
++	}
+ 
+ 	return pdata;
+ }
+@@ -155,8 +161,7 @@ static int dw_probe(struct platform_devi
+ 	struct dw_dma_chip *chip;
+ 	struct device *dev = &pdev->dev;
+ 	struct resource *mem;
+-	const struct acpi_device_id *id;
+-	struct dw_dma_platform_data *pdata;
++	const struct dw_dma_platform_data *pdata;
+ 	int err;
+ 
+ 	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
+@@ -179,13 +184,9 @@ static int dw_probe(struct platform_devi
+ 	pdata = dev_get_platdata(dev);
+ 	if (!pdata)
+ 		pdata = dw_dma_parse_dt(pdev);
+-	if (!pdata && has_acpi_companion(dev)) {
+-		id = acpi_match_device(dev->driver->acpi_match_table, dev);
+-		if (id)
+-			pdata = (struct dw_dma_platform_data *)id->driver_data;
+-	}
+ 
+ 	chip->dev = dev;
++	chip->pdata = pdata;
+ 
+ 	chip->clk = devm_clk_get(chip->dev, "hclk");
+ 	if (IS_ERR(chip->clk))
+@@ -196,7 +197,7 @@ static int dw_probe(struct platform_devi
+ 
+ 	pm_runtime_enable(&pdev->dev);
+ 
+-	err = dw_dma_probe(chip, pdata);
++	err = dw_dma_probe(chip);
+ 	if (err)
+ 		goto err_dw_dma_probe;
+ 
+@@ -239,7 +240,19 @@ static void dw_shutdown(struct platform_
+ {
+ 	struct dw_dma_chip *chip = platform_get_drvdata(pdev);
+ 
++	/*
++	 * We have to call dw_dma_disable() to stop any ongoing transfer. On
++	 * some platforms we can't do that since DMA device is powered off.
++	 * Moreover we have no way to check whether the platform is affected
++	 * or not. That's why we call pm_runtime_get_sync() and
++	 * pm_runtime_put_sync_suspend() unconditionally. On the other hand we
++	 * can't use pm_runtime_suspended() because runtime PM framework is not
++	 * fully used by the driver.
++	 */
++	pm_runtime_get_sync(chip->dev);
+ 	dw_dma_disable(chip);
++	pm_runtime_put_sync_suspend(chip->dev);
++
+ 	clk_disable_unprepare(chip->clk);
+ }
+ 
+@@ -252,17 +265,8 @@ MODULE_DEVICE_TABLE(of, dw_dma_of_id_tab
+ #endif
+ 
+ #ifdef CONFIG_ACPI
+-static struct dw_dma_platform_data dw_dma_acpi_pdata = {
+-	.nr_channels = 8,
+-	.is_private = true,
+-	.chan_allocation_order = CHAN_ALLOCATION_ASCENDING,
+-	.chan_priority = CHAN_PRIORITY_ASCENDING,
+-	.block_size = 4095,
+-	.nr_masters = 2,
+-};
+-
+ static const struct acpi_device_id dw_dma_acpi_id_table[] = {
+-	{ "INTL9C60", (kernel_ulong_t)&dw_dma_acpi_pdata },
++	{ "INTL9C60", 0 },
+ 	{ }
+ };
+ MODULE_DEVICE_TABLE(acpi, dw_dma_acpi_id_table);
+--- a/drivers/dma/dw/regs.h	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/regs.h	2016-05-21 22:47:08.665465180 +0200
+@@ -114,10 +114,6 @@ struct dw_dma_regs {
+ #define dma_writel_native writel
+ #endif
+ 
+-/* To access the registers in early stage of probe */
+-#define dma_read_byaddr(addr, name) \
+-	dma_readl_native((addr) + offsetof(struct dw_dma_regs, name))
+-
+ /* Bitfields in DW_PARAMS */
+ #define DW_PARAMS_NR_CHAN	8		/* number of channels */
+ #define DW_PARAMS_NR_MASTER	11		/* number of AHB masters */
+@@ -143,6 +139,10 @@ enum dw_dma_msize {
+ 	DW_DMA_MSIZE_256,
+ };
+ 
++/* Bitfields in LLP */
++#define DWC_LLP_LMS(x)		((x) & 3)	/* list master select */
++#define DWC_LLP_LOC(x)		((x) & ~3)	/* next lli */
++
+ /* Bitfields in CTL_LO */
+ #define DWC_CTLL_INT_EN		(1 << 0)	/* irqs enabled? */
+ #define DWC_CTLL_DST_WIDTH(n)	((n)<<1)	/* bytes per element */
+@@ -150,7 +150,7 @@ enum dw_dma_msize {
+ #define DWC_CTLL_DST_INC	(0<<7)		/* DAR update/not */
+ #define DWC_CTLL_DST_DEC	(1<<7)
+ #define DWC_CTLL_DST_FIX	(2<<7)
+-#define DWC_CTLL_SRC_INC	(0<<7)		/* SAR update/not */
++#define DWC_CTLL_SRC_INC	(0<<9)		/* SAR update/not */
+ #define DWC_CTLL_SRC_DEC	(1<<9)
+ #define DWC_CTLL_SRC_FIX	(2<<9)
+ #define DWC_CTLL_DST_MSIZE(n)	((n)<<11)	/* burst, #elements */
+@@ -216,6 +216,8 @@ enum dw_dma_msize {
+ enum dw_dmac_flags {
+ 	DW_DMA_IS_CYCLIC = 0,
+ 	DW_DMA_IS_SOFT_LLP = 1,
++	DW_DMA_IS_PAUSED = 2,
++	DW_DMA_IS_INITIALIZED = 3,
+ };
+ 
+ struct dw_dma_chan {
+@@ -224,8 +226,6 @@ struct dw_dma_chan {
+ 	u8				mask;
+ 	u8				priority;
+ 	enum dma_transfer_direction	direction;
+-	bool				paused;
+-	bool				initialized;
+ 
+ 	/* software emulation of the LLP transfers */
+ 	struct list_head	*tx_node_active;
+@@ -236,8 +236,6 @@ struct dw_dma_chan {
+ 	unsigned long		flags;
+ 	struct list_head	active_list;
+ 	struct list_head	queue;
+-	struct list_head	free_list;
+-	u32			residue;
+ 	struct dw_cyclic_desc	*cdesc;
+ 
+ 	unsigned int		descs_allocated;
+@@ -249,8 +247,8 @@ struct dw_dma_chan {
+ 	/* custom slave configuration */
+ 	u8			src_id;
+ 	u8			dst_id;
+-	u8			src_master;
+-	u8			dst_master;
++	u8			m_master;
++	u8			p_master;
+ 
+ 	/* configuration passed via .device_config */
+ 	struct dma_slave_config dma_sconfig;
+@@ -283,9 +281,8 @@ struct dw_dma {
+ 	u8			all_chan_mask;
+ 	u8			in_use;
+ 
+-	/* hardware configuration */
+-	unsigned char		nr_masters;
+-	unsigned char		data_width[DW_DMA_MAX_NR_MASTERS];
++	/* platform data */
++	struct dw_dma_platform_data	*pdata;
+ };
+ 
+ static inline struct dw_dma_regs __iomem *__dw_regs(struct dw_dma *dw)
+@@ -308,32 +305,51 @@ static inline struct dw_dma *to_dw_dma(s
+ 	return container_of(ddev, struct dw_dma, dma);
+ }
+ 
++#ifdef CONFIG_DW_DMAC_BIG_ENDIAN_IO
++typedef __be32 __dw32;
++#else
++typedef __le32 __dw32;
++#endif
++
+ /* LLI == Linked List Item; a.k.a. DMA block descriptor */
+ struct dw_lli {
+ 	/* values that are not changed by hardware */
+-	u32		sar;
+-	u32		dar;
+-	u32		llp;		/* chain to next lli */
+-	u32		ctllo;
++	__dw32		sar;
++	__dw32		dar;
++	__dw32		llp;		/* chain to next lli */
++	__dw32		ctllo;
+ 	/* values that may get written back: */
+-	u32		ctlhi;
++	__dw32		ctlhi;
+ 	/* sstat and dstat can snapshot peripheral register state.
+ 	 * silicon config may discard either or both...
+ 	 */
+-	u32		sstat;
+-	u32		dstat;
++	__dw32		sstat;
++	__dw32		dstat;
+ };
+ 
+ struct dw_desc {
+ 	/* FIRST values the hardware uses */
+ 	struct dw_lli			lli;
+ 
++#ifdef CONFIG_DW_DMAC_BIG_ENDIAN_IO
++#define lli_set(d, reg, v)		((d)->lli.reg |= cpu_to_be32(v))
++#define lli_clear(d, reg, v)		((d)->lli.reg &= ~cpu_to_be32(v))
++#define lli_read(d, reg)		be32_to_cpu((d)->lli.reg)
++#define lli_write(d, reg, v)		((d)->lli.reg = cpu_to_be32(v))
++#else
++#define lli_set(d, reg, v)		((d)->lli.reg |= cpu_to_le32(v))
++#define lli_clear(d, reg, v)		((d)->lli.reg &= ~cpu_to_le32(v))
++#define lli_read(d, reg)		le32_to_cpu((d)->lli.reg)
++#define lli_write(d, reg, v)		((d)->lli.reg = cpu_to_le32(v))
++#endif
++
+ 	/* THEN values for driver housekeeping */
+ 	struct list_head		desc_node;
+ 	struct list_head		tx_list;
+ 	struct dma_async_tx_descriptor	txd;
+ 	size_t				len;
+ 	size_t				total_len;
++	u32				residue;
+ };
+ 
+ #define to_dw_desc(h)	list_entry(h, struct dw_desc, desc_node)
+--- a/include/linux/dma/dw.h
++++ b/include/linux/dma/dw.h
+@@ -27,6 +27,7 @@ struct dw_dma;
+  * @regs:		memory mapped I/O space
+  * @clk:		hclk clock
+  * @dw:			struct dw_dma that is filed by dw_dma_probe()
++ * @pdata:		pointer to platform data
+  */
+ struct dw_dma_chip {
+ 	struct device	*dev;
+@@ -34,10 +35,12 @@ struct dw_dma_chip {
+ 	void __iomem	*regs;
+ 	struct clk	*clk;
+ 	struct dw_dma	*dw;
++
++	const struct dw_dma_platform_data	*pdata;
+ };
+ 
+ /* Export to the platform drivers */
+-int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata);
++int dw_dma_probe(struct dw_dma_chip *chip);
+ int dw_dma_remove(struct dw_dma_chip *chip);
+ 
+ /* DMA API extensions */
+diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
+index 03b6095..d15d8ba 100644
+--- a/include/linux/platform_data/dma-dw.h
++++ b/include/linux/platform_data/dma-dw.h
+@@ -21,15 +21,15 @@
+  * @dma_dev:	required DMA master device
+  * @src_id:	src request line
+  * @dst_id:	dst request line
+- * @src_master: src master for transfers on allocated channel.
+- * @dst_master: dest master for transfers on allocated channel.
++ * @m_master:	memory master for transfers on allocated channel
++ * @p_master:	peripheral master for transfers on allocated channel
+  */
+ struct dw_dma_slave {
+ 	struct device		*dma_dev;
+ 	u8			src_id;
+ 	u8			dst_id;
+-	u8			src_master;
+-	u8			dst_master;
++	u8			m_master;
++	u8			p_master;
+ };
+ 
+ /**
+@@ -43,7 +43,7 @@ struct dw_dma_slave {
+  * @block_size: Maximum block size supported by the controller
+  * @nr_masters: Number of AHB masters supported by the controller
+  * @data_width: Maximum data width supported by hardware per AHB master
+- *		(0 - 8bits, 1 - 16bits, ..., 5 - 256bits)
++ *		(in bytes, power of 2)
+  */
+ struct dw_dma_platform_data {
+ 	unsigned int	nr_channels;
+@@ -55,7 +55,7 @@ struct dw_dma_platform_data {
+ #define CHAN_PRIORITY_ASCENDING		0	/* chan0 highest */
+ #define CHAN_PRIORITY_DESCENDING	1	/* chan7 highest */
+ 	unsigned char	chan_priority;
+-	unsigned short	block_size;
++	unsigned int	block_size;
+ 	unsigned char	nr_masters;
+ 	unsigned char	data_width[DW_DMA_MAX_NR_MASTERS];
+ };
+-- 
+2.8.1
+