1 files changed, 802 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-4.19/950-0570-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch b/target/linux/brcm2708/patches-4.19/950-0570-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch
new file mode 100644
index 0000000000..53c760c4f0
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.19/950-0570-drm-v3d-Add-support-for-submitting-jobs-to-the-TFU.patch
@@ -0,0 +1,802 @@
+From 60c65dc612663be7136a19a117cee5d194530600 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <eric@anholt.net>
+Date: Wed, 28 Nov 2018 15:09:25 -0800
+Subject: [PATCH 570/703] drm/v3d: Add support for submitting jobs to the TFU.
+
+The TFU can copy from raster, UIF, and SAND input images to UIF output
+images, with optional mipmap generation.  This will certainly be
+useful for media EGL image input, but is also useful immediately for
+mipmap generation without bogging the V3D core down.
+
+For now we only run the queue 1 job deep, and don't have any hang
+recovery (though I don't think we should need it, with TFU).  Queuing
+multiple jobs in the HW will require synchronizing the YUV coefficient
+regs updates since they don't get FIFOed with the job.
+
+v2: Change the ioctl to IOW instead of IOWR, always set COEF0, explain
+    why TFU is AUTH, clarify the syncing docs, drop the unused TFU
+    interrupt regs (you're expected to use the hub's), don't take
+    &bo->base for NULL bos.
+v3: Fix a little whitespace alignment (noticed by checkpatch), rebase
+    on drm_sched_job_cleanup() changes.
+
+Signed-off-by: Eric Anholt <eric@anholt.net>
+Reviewed-by: Dave Emett <david.emett@broadcom.com> (v2)
+Link: https://patchwork.freedesktop.org/patch/264607/
+(cherry picked from commit 1584f16ca96ef124aad79efa3303cff5f3530e2c)
+---
+ drivers/gpu/drm/v3d/v3d_drv.c   |  15 ++-
+ drivers/gpu/drm/v3d/v3d_drv.h   |  32 +++++-
+ drivers/gpu/drm/v3d/v3d_gem.c   | 178 ++++++++++++++++++++++++++++----
+ drivers/gpu/drm/v3d/v3d_irq.c   |  12 ++-
+ drivers/gpu/drm/v3d/v3d_regs.h  |  49 +++++++++
+ drivers/gpu/drm/v3d/v3d_sched.c | 148 ++++++++++++++++++++++----
+ drivers/gpu/drm/v3d/v3d_trace.h |  20 ++++
+ include/uapi/drm/v3d_drm.h      |  25 +++++
+ 8 files changed, 426 insertions(+), 53 deletions(-)
+
+--- a/drivers/gpu/drm/v3d/v3d_drv.c
++++ b/drivers/gpu/drm/v3d/v3d_drv.c
+@@ -112,10 +112,15 @@ static int v3d_get_param_ioctl(struct dr
+ 		return 0;
+ 	}
+ 
+-	/* Any params that aren't just register reads would go here. */
+ 
+-	DRM_DEBUG("Unknown parameter %d\n", args->param);
+-	return -EINVAL;
++	switch (args->param) {
++	case DRM_V3D_PARAM_SUPPORTS_TFU:
++		args->value = 1;
++		return 0;
++	default:
++		DRM_DEBUG("Unknown parameter %d\n", args->param);
++		return -EINVAL;
++	}
+ }
+ 
+ static int
+@@ -170,7 +175,8 @@ static const struct file_operations v3d_
+ /* DRM_AUTH is required on SUBMIT_CL for now, while we don't have GMP
+  * protection between clients.  Note that render nodes would be be
+  * able to submit CLs that could access BOs from clients authenticated
+- * with the master node.
++ * with the master node.  The TFU doesn't use the GMP, so it would
++ * need to stay DRM_AUTH until we do buffer size/offset validation.
+  */
+ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
+ 	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CL, v3d_submit_cl_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
+@@ -179,6 +185,7 @@ static const struct drm_ioctl_desc v3d_d
+ 	DRM_IOCTL_DEF_DRV(V3D_MMAP_BO, v3d_mmap_bo_ioctl, DRM_RENDER_ALLOW),
+ 	DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
+ 	DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
++	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
+ };
+ 
+ static const struct vm_operations_struct v3d_vm_ops = {
+--- a/drivers/gpu/drm/v3d/v3d_drv.h
++++ b/drivers/gpu/drm/v3d/v3d_drv.h
+@@ -7,19 +7,18 @@
+ #include <drm/drm_encoder.h>
+ #include <drm/drm_gem.h>
+ #include <drm/gpu_scheduler.h>
++#include "uapi/drm/v3d_drm.h"
+ 
+ #define GMP_GRANULARITY (128 * 1024)
+ 
+-/* Enum for each of the V3D queues.  We maintain various queue
+- * tracking as an array because at some point we'll want to support
+- * the TFU (texture formatting unit) as another queue.
+- */
++/* Enum for each of the V3D queues. */
+ enum v3d_queue {
+ 	V3D_BIN,
+ 	V3D_RENDER,
++	V3D_TFU,
+ };
+ 
+-#define V3D_MAX_QUEUES (V3D_RENDER + 1)
++#define V3D_MAX_QUEUES (V3D_TFU + 1)
+ 
+ struct v3d_queue_state {
+ 	struct drm_gpu_scheduler sched;
+@@ -68,6 +67,7 @@ struct v3d_dev {
+ 
+ 	struct v3d_exec_info *bin_job;
+ 	struct v3d_exec_info *render_job;
++	struct v3d_tfu_job *tfu_job;
+ 
+ 	struct v3d_queue_state queue[V3D_MAX_QUEUES];
+ 
+@@ -218,6 +218,25 @@ struct v3d_exec_info {
+ 	u32 qma, qms, qts;
+ };
+ 
++struct v3d_tfu_job {
++	struct drm_sched_job base;
++	/* Copy of the submit ioctl arguments (TFU register values). */
++	struct drm_v3d_submit_tfu args;
++
++	/* An optional fence userspace can pass in for the job to depend on. */
++	struct dma_fence *in_fence;
++
++	/* v3d fence to be signaled by IRQ handler when the job is complete. */
++	struct dma_fence *done_fence;
++
++	struct v3d_dev *v3d;
++	/* The last v3d_tfu_job_put() frees the job. */
++	struct kref refcount;
++
++	/* BOs looked up from bo_handles at submit; unused slots are NULL. */
++	struct v3d_bo *bo[4];
++};
++
+ /**
+  * _wait_for - magic (register) wait macro
+  *
+@@ -281,9 +300,12 @@ int v3d_gem_init(struct drm_device *dev)
+ void v3d_gem_destroy(struct drm_device *dev);
+ int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
+ 			struct drm_file *file_priv);
++int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
++			 struct drm_file *file_priv);
+ int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
+ 		      struct drm_file *file_priv);
+ void v3d_exec_put(struct v3d_exec_info *exec);
++void v3d_tfu_job_put(struct v3d_tfu_job *job);
+ void v3d_reset(struct v3d_dev *v3d);
+ void v3d_invalidate_caches(struct v3d_dev *v3d);
+ void v3d_flush_caches(struct v3d_dev *v3d);
+--- a/drivers/gpu/drm/v3d/v3d_gem.c
++++ b/drivers/gpu/drm/v3d/v3d_gem.c
+@@ -207,26 +207,27 @@ v3d_flush_caches(struct v3d_dev *v3d)
+ }
+ 
+ static void
+-v3d_attach_object_fences(struct v3d_exec_info *exec)
++v3d_attach_object_fences(struct v3d_bo **bos, int bo_count,
++			 struct dma_fence *fence)
+ {
+-	struct dma_fence *out_fence = exec->render_done_fence;
+ 	int i;
+ 
+-	for (i = 0; i < exec->bo_count; i++) {
++	for (i = 0; i < bo_count; i++) {
+ 		/* XXX: Use shared fences for read-only objects. */
+-		reservation_object_add_excl_fence(exec->bo[i]->resv, out_fence);
++		reservation_object_add_excl_fence(bos[i]->resv, fence);
+ 	}
+ }
+ 
+ static void
+ v3d_unlock_bo_reservations(struct drm_device *dev,
+-			   struct v3d_exec_info *exec,
++			   struct v3d_bo **bos,
++			   int bo_count,
+ 			   struct ww_acquire_ctx *acquire_ctx)
+ {
+ 	int i;
+ 
+-	for (i = 0; i < exec->bo_count; i++)
+-		ww_mutex_unlock(&exec->bo[i]->resv->lock);
++	for (i = 0; i < bo_count; i++)
++		ww_mutex_unlock(&bos[i]->resv->lock);
+ 
+ 	ww_acquire_fini(acquire_ctx);
+ }
+@@ -240,7 +241,8 @@ v3d_unlock_bo_reservations(struct drm_de
+  */
+ static int
+ v3d_lock_bo_reservations(struct drm_device *dev,
+-			 struct v3d_exec_info *exec,
++			 struct v3d_bo **bos,
++			 int bo_count,
+ 			 struct ww_acquire_ctx *acquire_ctx)
+ {
+ 	int contended_lock = -1;
+@@ -250,7 +252,7 @@ v3d_lock_bo_reservations(struct drm_devi
+ 
+ retry:
+ 	if (contended_lock != -1) {
+-		struct v3d_bo *bo = exec->bo[contended_lock];
++		struct v3d_bo *bo = bos[contended_lock];
+ 
+ 		ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
+ 						       acquire_ctx);
+@@ -260,20 +262,20 @@ retry:
+ 		}
+ 	}
+ 
+-	for (i = 0; i < exec->bo_count; i++) {
++	for (i = 0; i < bo_count; i++) {
+ 		if (i == contended_lock)
+ 			continue;
+ 
+-		ret = ww_mutex_lock_interruptible(&exec->bo[i]->resv->lock,
++		ret = ww_mutex_lock_interruptible(&bos[i]->resv->lock,
+ 						  acquire_ctx);
+ 		if (ret) {
+ 			int j;
+ 
+ 			for (j = 0; j < i; j++)
+-				ww_mutex_unlock(&exec->bo[j]->resv->lock);
++				ww_mutex_unlock(&bos[j]->resv->lock);
+ 
+ 			if (contended_lock != -1 && contended_lock >= i) {
+-				struct v3d_bo *bo = exec->bo[contended_lock];
++				struct v3d_bo *bo = bos[contended_lock];
+ 
+ 				ww_mutex_unlock(&bo->resv->lock);
+ 			}
+@@ -293,10 +295,11 @@ retry:
+ 	/* Reserve space for our shared (read-only) fence references,
+ 	 * before we commit the CL to the hardware.
+ 	 */
+-	for (i = 0; i < exec->bo_count; i++) {
+-		ret = reservation_object_reserve_shared(exec->bo[i]->resv);
++	for (i = 0; i < bo_count; i++) {
++		ret = reservation_object_reserve_shared(bos[i]->resv);
+ 		if (ret) {
+-			v3d_unlock_bo_reservations(dev, exec, acquire_ctx);
++			v3d_unlock_bo_reservations(dev, bos, bo_count,
++						   acquire_ctx);
+ 			return ret;
+ 		}
+ 	}
+@@ -419,6 +422,33 @@ void v3d_exec_put(struct v3d_exec_info *
+ 	kref_put(&exec->refcount, v3d_exec_cleanup);
+ }
+ 
++static void
++v3d_tfu_job_cleanup(struct kref *ref)
++{
++	struct v3d_tfu_job *job = container_of(ref, struct v3d_tfu_job,
++					       refcount);
++	struct v3d_dev *v3d = job->v3d;
++	unsigned int i;
++
++	dma_fence_put(job->in_fence);
++	dma_fence_put(job->done_fence);
++	/* Drop the BO references taken at submit; unused slots are NULL. */
++	for (i = 0; i < ARRAY_SIZE(job->bo); i++) {
++		if (job->bo[i])
++			drm_gem_object_put_unlocked(&job->bo[i]->base);
++	}
++	/* Balance the pm_runtime_get_sync() from v3d_submit_tfu_ioctl(). */
++	pm_runtime_mark_last_busy(v3d->dev);
++	pm_runtime_put_autosuspend(v3d->dev);
++
++	kfree(job);
++}
++
++void v3d_tfu_job_put(struct v3d_tfu_job *job)
++{
++	kref_put(&job->refcount, v3d_tfu_job_cleanup);
++}
++
+ int
+ v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
+ 		  struct drm_file *file_priv)
+@@ -536,7 +566,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
+ 	if (ret)
+ 		goto fail;
+ 
+-	ret = v3d_lock_bo_reservations(dev, exec, &acquire_ctx);
++	ret = v3d_lock_bo_reservations(dev, exec->bo, exec->bo_count,
++				       &acquire_ctx);
+ 	if (ret)
+ 		goto fail;
+ 
+@@ -570,9 +601,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
+ 				  &v3d_priv->sched_entity[V3D_RENDER]);
+ 	mutex_unlock(&v3d->sched_lock);
+ 
+-	v3d_attach_object_fences(exec);
++	v3d_attach_object_fences(exec->bo, exec->bo_count,
++				 exec->render_done_fence);
+ 
+-	v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
++	v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
+ 
+ 	/* Update the return sync object for the */
+ 	sync_out = drm_syncobj_find(file_priv, args->out_sync);
+@@ -588,12 +620,118 @@ v3d_submit_cl_ioctl(struct drm_device *d
+ 
+ fail_unreserve:
+ 	mutex_unlock(&v3d->sched_lock);
+-	v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
++	v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
+ fail:
+ 	v3d_exec_put(exec);
+ 
+ 	return ret;
+ }
++
++/**
++ * v3d_submit_tfu_ioctl() - Submits a TFU (texture formatting) job to the V3D.
++ * @dev: DRM device
++ * @data: ioctl argument
++ * @file_priv: DRM file for this fd
++ *
++ * Userspace provides the register setup for the TFU, which we don't
++ * need to validate since the TFU is behind the MMU.
++ *
++ * Return: 0 on success, negative error code on failure.
++ */
++int
++v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
++		     struct drm_file *file_priv)
++{
++	struct v3d_dev *v3d = to_v3d_dev(dev);
++	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
++	struct drm_v3d_submit_tfu *args = data;
++	struct v3d_tfu_job *job;
++	struct ww_acquire_ctx acquire_ctx;
++	struct drm_syncobj *sync_out;
++	struct dma_fence *sched_done_fence;
++	int ret, bo_count;
++
++	job = kcalloc(1, sizeof(*job), GFP_KERNEL);
++	if (!job)
++		return -ENOMEM;
++
++	ret = pm_runtime_get_sync(v3d->dev);
++	if (ret < 0) {
++		/* get_sync() bumps the usage count even when it fails. */
++		pm_runtime_put_noidle(v3d->dev);
++		kfree(job);
++		return ret;
++	}
++
++	kref_init(&job->refcount);
++	/* v3d_tfu_job_cleanup() needs job->v3d on every "goto fail" path. */
++	job->v3d = v3d;
++	job->args = *args;
++
++	ret = drm_syncobj_find_fence(file_priv, args->in_sync,
++				     0, &job->in_fence);
++	if (ret == -EINVAL)
++		goto fail;
++
++	spin_lock(&file_priv->table_lock);
++	for (bo_count = 0; bo_count < ARRAY_SIZE(job->bo); bo_count++) {
++		struct drm_gem_object *bo;
++
++		if (!args->bo_handles[bo_count])
++			break;
++
++		bo = idr_find(&file_priv->object_idr,
++			      args->bo_handles[bo_count]);
++		if (!bo) {
++			DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
++				  bo_count, args->bo_handles[bo_count]);
++			ret = -ENOENT;
++			spin_unlock(&file_priv->table_lock);
++			goto fail;
++		}
++		drm_gem_object_get(bo);
++		job->bo[bo_count] = to_v3d_bo(bo);
++	}
++	spin_unlock(&file_priv->table_lock);
++
++	ret = v3d_lock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
++	if (ret)
++		goto fail;
++
++	mutex_lock(&v3d->sched_lock);
++	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[V3D_TFU],
++				 v3d_priv);
++	if (ret)
++		goto fail_unreserve;
++
++	sched_done_fence = dma_fence_get(&job->base.s_fence->finished);
++	kref_get(&job->refcount); /* put by scheduler job completion */
++	drm_sched_entity_push_job(&job->base, &v3d_priv->sched_entity[V3D_TFU]);
++	mutex_unlock(&v3d->sched_lock);
++
++	v3d_attach_object_fences(job->bo, bo_count, sched_done_fence);
++	v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
++
++	/* Update the return sync object */
++	sync_out = drm_syncobj_find(file_priv, args->out_sync);
++	if (sync_out) {
++		drm_syncobj_replace_fence(sync_out, sched_done_fence);
++		drm_syncobj_put(sync_out);
++	}
++	dma_fence_put(sched_done_fence);
++
++	v3d_tfu_job_put(job);
++
++	return 0;
++
++fail_unreserve:
++	mutex_unlock(&v3d->sched_lock);
++	v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
++fail:
++	v3d_tfu_job_put(job);
++
++	return ret;
++}
+ 
+ int
+ v3d_gem_init(struct drm_device *dev)
+--- a/drivers/gpu/drm/v3d/v3d_irq.c
++++ b/drivers/gpu/drm/v3d/v3d_irq.c
+@@ -4,8 +4,8 @@
+ /**
+  * DOC: Interrupt management for the V3D engine
+  *
+- * When we take a binning or rendering flush done interrupt, we need
+- * to signal the fence for that job so that the scheduler can queue up
++ * When we take a bin, render, or TFU done interrupt, we need to
++ * signal the fence for that job so that the scheduler can queue up
+  * the next one and unblock any waiters.
+  *
+  * When we take the binner out of memory interrupt, we need to
+@@ -23,7 +23,8 @@
+ 
+ #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV |	\
+ 			    V3D_HUB_INT_MMU_PTI |	\
+-			    V3D_HUB_INT_MMU_CAP))
++			    V3D_HUB_INT_MMU_CAP |	\
++			    V3D_HUB_INT_TFUC))
+ 
+ static void
+ v3d_overflow_mem_work(struct work_struct *work)
+@@ -117,6 +118,11 @@ v3d_hub_irq(int irq, void *arg)
+ 	/* Acknowledge the interrupts we're handling here. */
+ 	V3D_WRITE(V3D_HUB_INT_CLR, intsts);
+ 
++	if (intsts & V3D_HUB_INT_TFUC) {
++		dma_fence_signal(v3d->tfu_job->done_fence);
++		status = IRQ_HANDLED;
++	}
++
+ 	if (intsts & (V3D_HUB_INT_MMU_WRV |
+ 		      V3D_HUB_INT_MMU_PTI |
+ 		      V3D_HUB_INT_MMU_CAP)) {
+--- a/drivers/gpu/drm/v3d/v3d_regs.h
++++ b/drivers/gpu/drm/v3d/v3d_regs.h
+@@ -86,6 +86,55 @@
+ # define V3D_TOP_GR_BRIDGE_SW_INIT_1                   0x0000c
+ # define V3D_TOP_GR_BRIDGE_SW_INIT_1_V3D_CLK_108_SW_INIT BIT(0)
+ 
++#define V3D_TFU_CS                                     0x00400
++/* Stops current job, empties input fifo. */
++# define V3D_TFU_CS_TFURST                             BIT(31)
++# define V3D_TFU_CS_CVTCT_MASK                         V3D_MASK(23, 16)
++# define V3D_TFU_CS_CVTCT_SHIFT                        16
++# define V3D_TFU_CS_NFREE_MASK                         V3D_MASK(13, 8)
++# define V3D_TFU_CS_NFREE_SHIFT                        8
++# define V3D_TFU_CS_BUSY                               BIT(0)
++
++#define V3D_TFU_SU                                     0x00404
++/* Interrupt when FINTTHR input slots are free (0 = disabled) */
++# define V3D_TFU_SU_FINTTHR_MASK                       V3D_MASK(13, 8)
++# define V3D_TFU_SU_FINTTHR_SHIFT                      8
++/* Skips resetting the CRC at the start of CRC generation. */
++# define V3D_TFU_SU_CRCCHAIN                           BIT(4)
++/* skips writes, computes CRC of the image.  miplevels must be 0. */
++# define V3D_TFU_SU_CRC                                BIT(3)
++# define V3D_TFU_SU_THROTTLE_MASK                      V3D_MASK(1, 0)
++# define V3D_TFU_SU_THROTTLE_SHIFT                     0
++
++#define V3D_TFU_ICFG                                   0x00408
++/* Interrupt when the conversion is complete. */
++# define V3D_TFU_ICFG_IOC                              BIT(0)
++
++/* Input Image Address */
++#define V3D_TFU_IIA                                    0x0040c
++/* Input Chroma Address */
++#define V3D_TFU_ICA                                    0x00410
++/* Input Image Stride */
++#define V3D_TFU_IIS                                    0x00414
++/* Input Image U-Plane Address */
++#define V3D_TFU_IUA                                    0x00418
++/* Output Image Address */
++#define V3D_TFU_IOA                                    0x0041c
++/* Image Output Size */
++#define V3D_TFU_IOS                                    0x00420
++/* TFU YUV Coefficient 0 */
++#define V3D_TFU_COEF0                                  0x00424
++/* Use these regs instead of the defaults. */
++# define V3D_TFU_COEF0_USECOEF                         BIT(31)
++/* TFU YUV Coefficient 1 */
++#define V3D_TFU_COEF1                                  0x00428
++/* TFU YUV Coefficient 2 */
++#define V3D_TFU_COEF2                                  0x0042c
++/* TFU YUV Coefficient 3 */
++#define V3D_TFU_COEF3                                  0x00430
++
++#define V3D_TFU_CRC                                    0x00434
++
+ /* Per-MMU registers. */
+ 
+ #define V3D_MMUC_CONTROL                               0x01000
+--- a/drivers/gpu/drm/v3d/v3d_sched.c
++++ b/drivers/gpu/drm/v3d/v3d_sched.c
+@@ -30,6 +30,12 @@ to_v3d_job(struct drm_sched_job *sched_j
+ 	return container_of(sched_job, struct v3d_job, base);
+ }
+ 
++static struct v3d_tfu_job *
++to_tfu_job(struct drm_sched_job *sched_job)
++{
++	return container_of(sched_job, struct v3d_tfu_job, base);
++}
++
+ static void
+ v3d_job_free(struct drm_sched_job *sched_job)
+ {
+@@ -38,6 +44,14 @@ v3d_job_free(struct drm_sched_job *sched
+ 	v3d_exec_put(job->exec);
+ }
+ 
++static void
++v3d_tfu_job_free(struct drm_sched_job *sched_job)
++{
++	struct v3d_tfu_job *job = to_tfu_job(sched_job);
++
++	v3d_tfu_job_put(job);
++}
++
+ /**
+  * Returns the fences that the bin or render job depends on, one by one.
+  * v3d_job_run() won't be called until all of them have been signaled.
+@@ -76,6 +90,27 @@ v3d_job_dependency(struct drm_sched_job
+ 	return fence;
+ }
+ 
++/**
++ * Returns the fences that the TFU job depends on, one by one: the
++ * optional in_fence on the first call (job->in_fence is then cleared),
++ * NULL afterwards.  v3d_tfu_job_run() won't run until it has signaled.
++ */
++static struct dma_fence *
++v3d_tfu_job_dependency(struct drm_sched_job *sched_job,
++		       struct drm_sched_entity *s_entity)
++{
++	struct v3d_tfu_job *job = to_tfu_job(sched_job);
++	struct dma_fence *fence;
++
++	fence = job->in_fence;
++	if (fence) {
++		job->in_fence = NULL;
++		return fence;
++	}
++
++	return NULL;
++}
++
+ static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
+ {
+ 	struct v3d_job *job = to_v3d_job(sched_job);
+@@ -147,31 +182,47 @@ static struct dma_fence *v3d_job_run(str
+ 	return fence;
+ }
+ 
+-static void
+-v3d_job_timedout(struct drm_sched_job *sched_job)
++static struct dma_fence *
++v3d_tfu_job_run(struct drm_sched_job *sched_job)
+ {
+-	struct v3d_job *job = to_v3d_job(sched_job);
+-	struct v3d_exec_info *exec = job->exec;
+-	struct v3d_dev *v3d = exec->v3d;
+-	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
+-	enum v3d_queue q;
+-	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
+-	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
++	struct v3d_tfu_job *job = to_tfu_job(sched_job);
++	struct v3d_dev *v3d = job->v3d;
++	struct drm_device *dev = &v3d->drm;
++	struct dma_fence *fence;
+ 
+-	/* If the current address or return address have changed, then
+-	 * the GPU has probably made progress and we should delay the
+-	 * reset.  This could fail if the GPU got in an infinite loop
+-	 * in the CL, but that is pretty unlikely outside of an i-g-t
+-	 * testcase.
+-	 */
+-	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
+-		job->timedout_ctca = ctca;
+-		job->timedout_ctra = ctra;
++	fence = v3d_fence_create(v3d, V3D_TFU);
++	if (IS_ERR(fence))
++		return NULL;
+ 
+-		schedule_delayed_work(&job->base.work_tdr,
+-				      job->base.sched->timeout);
+-		return;
++	v3d->tfu_job = job;
++	if (job->done_fence)
++		dma_fence_put(job->done_fence);
++	job->done_fence = dma_fence_get(fence);
++
++	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
++
++	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
++	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
++	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
++	V3D_WRITE(V3D_TFU_IUA, job->args.iua);
++	V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
++	V3D_WRITE(V3D_TFU_IOS, job->args.ios);
++	V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
++	if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
++		V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
++		V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
++		V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
+ 	}
++	/* ICFG kicks off the job. */
++	V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
++
++	return fence;
++}
++
++static void
++v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
++{
++	enum v3d_queue q;
+ 
+ 	mutex_lock(&v3d->reset_lock);
+ 
+@@ -196,6 +247,41 @@ v3d_job_timedout(struct drm_sched_job *s
+ 	mutex_unlock(&v3d->reset_lock);
+ }
+ 
++static void
++v3d_job_timedout(struct drm_sched_job *sched_job)
++{
++	struct v3d_job *job = to_v3d_job(sched_job);
++	struct v3d_exec_info *exec = job->exec;
++	struct v3d_dev *v3d = exec->v3d;
++	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
++	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
++	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
++
++	/* If the current address or return address have changed, then
++	 * the GPU has probably made progress and we should delay the
++	 * reset.  This could fail if the GPU got in an infinite loop
++	 * in the CL, but that is pretty unlikely outside of an i-g-t
++	 * testcase.
++	 */
++	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
++		job->timedout_ctca = ctca;
++		job->timedout_ctra = ctra;
++		schedule_delayed_work(&job->base.work_tdr,
++				      job->base.sched->timeout);
++		return;
++	}
++
++	v3d_gpu_reset_for_timeout(v3d, sched_job);
++}
++
++static void
++v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
++{
++	struct v3d_tfu_job *job = to_tfu_job(sched_job);
++
++	v3d_gpu_reset_for_timeout(job->v3d, sched_job);
++}
++
+ static const struct drm_sched_backend_ops v3d_sched_ops = {
+ 	.dependency = v3d_job_dependency,
+ 	.run_job = v3d_job_run,
+@@ -203,6 +289,13 @@ static const struct drm_sched_backend_op
+ 	.free_job = v3d_job_free
+ };
+ 
++static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
++	.dependency = v3d_tfu_job_dependency,
++	.run_job = v3d_tfu_job_run,
++	.timedout_job = v3d_tfu_job_timedout,
++	.free_job = v3d_tfu_job_free
++};
++
+ int
+ v3d_sched_init(struct v3d_dev *v3d)
+ {
+@@ -232,6 +325,19 @@ v3d_sched_init(struct v3d_dev *v3d)
+ 		drm_sched_fini(&v3d->queue[V3D_BIN].sched);
+ 		return ret;
+ 	}
++
++	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
++			     &v3d_tfu_sched_ops,
++			     hw_jobs_limit, job_hang_limit,
++			     msecs_to_jiffies(hang_limit_ms),
++			     "v3d_tfu");
++	if (ret) {
++		dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
++			ret);
++		drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
++		drm_sched_fini(&v3d->queue[V3D_BIN].sched);
++		return ret;
++	}
+ 
+ 	return 0;
+ }
+--- a/drivers/gpu/drm/v3d/v3d_trace.h
++++ b/drivers/gpu/drm/v3d/v3d_trace.h
+@@ -42,6 +42,26 @@ TRACE_EVENT(v3d_submit_cl,
+ 		      __entry->ctnqea)
+ );
+ 
++TRACE_EVENT(v3d_submit_tfu,
++	    TP_PROTO(struct drm_device *dev,
++		     uint64_t seqno),
++	    TP_ARGS(dev, seqno),
++
++	    TP_STRUCT__entry(
++			     __field(u32, dev)
++			     __field(u64, seqno)
++			     ),
++
++	    TP_fast_assign(
++			   __entry->dev = dev->primary->index;
++			   __entry->seqno = seqno;
++			   ),
++
++	    TP_printk("dev=%u, seqno=%llu",
++		      __entry->dev,
++		      __entry->seqno)
++);
++
+ TRACE_EVENT(v3d_reset_begin,
+ 	    TP_PROTO(struct drm_device *dev),
+ 	    TP_ARGS(dev),
+--- a/include/uapi/drm/v3d_drm.h
++++ b/include/uapi/drm/v3d_drm.h
+@@ -36,6 +36,7 @@ extern "C" {
+ #define DRM_V3D_MMAP_BO                           0x03
+ #define DRM_V3D_GET_PARAM                         0x04
+ #define DRM_V3D_GET_BO_OFFSET                     0x05
++#define DRM_V3D_SUBMIT_TFU                        0x06
+ 
+ #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
+ #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
+@@ -43,6 +44,7 @@ extern "C" {
+ #define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
+ #define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
+ #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
++#define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
+ 
+ /**
+  * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
+@@ -169,6 +171,7 @@ enum drm_v3d_param {
+ 	DRM_V3D_PARAM_V3D_CORE0_IDENT0,
+ 	DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+ 	DRM_V3D_PARAM_V3D_CORE0_IDENT2,
++	DRM_V3D_PARAM_SUPPORTS_TFU,
+ };
+ 
+ struct drm_v3d_get_param {
+@@ -187,6 +190,28 @@ struct drm_v3d_get_bo_offset {
+ 	__u32 offset;
+ };
+ 
++struct drm_v3d_submit_tfu {
++	__u32 icfg;
++	__u32 iia;
++	__u32 iis;
++	__u32 ica;
++	__u32 iua;
++	__u32 ioa;
++	__u32 ios;
++	__u32 coef[4];
++	/* First handle is the output BO, following are other inputs.
++	 * 0 for unused.
++	 */
++	__u32 bo_handles[4];
++	/* sync object to block on before running the TFU job.  Each TFU
++	 * job will execute in the order submitted to its FD.  Synchronization
++	 * against rendering jobs requires using sync objects.
++	 */
++	__u32 in_sync;
++	/* Sync object to signal when the TFU job is done. */
++	__u32 out_sync;
++};
++
+ #if defined(__cplusplus)
+ }
+ #endif