From 2340c646e6a5da658e114be2f721ff90853588d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= <noltari@gmail.com>
Date: Fri, 9 Aug 2019 19:50:30 +0200
Subject: brcm2708: update to latest patches from RPi foundation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
---
 ...-v3d-Add-missing-implicit-synchronization.patch | 279 +++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 target/linux/brcm2708/patches-4.19/950-0587-drm-v3d-Add-missing-implicit-synchronization.patch

(limited to 'target/linux/brcm2708/patches-4.19/950-0587-drm-v3d-Add-missing-implicit-synchronization.patch')

diff --git a/target/linux/brcm2708/patches-4.19/950-0587-drm-v3d-Add-missing-implicit-synchronization.patch b/target/linux/brcm2708/patches-4.19/950-0587-drm-v3d-Add-missing-implicit-synchronization.patch
new file mode 100644
index 0000000000..f4fe613310
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.19/950-0587-drm-v3d-Add-missing-implicit-synchronization.patch
@@ -0,0 +1,279 @@
+From 9691f6245f13a92f49143602f47f6e7f0b6b5f55 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <eric@anholt.net>
+Date: Wed, 27 Mar 2019 17:44:40 -0700
+Subject: [PATCH 587/773] drm/v3d: Add missing implicit synchronization.
+
+It is the expectation of existing userspace (X11 + Mesa, in
+particular) that jobs submitted to the kernel against a shared BO will
+get implicitly synchronized by their submission order.  If we want to
+allow clever userspace to disable implicit synchronization, we should
+do that under its own submit flag (as amdgpu and lima do).
+
+Note that we currently only implicitly sync for the rendering pass,
+not binning -- if you texture-from-pixmap in the binning vertex shader
+(vertex coordinate generation), you'll miss out on synchronization.
+
+Fixes flickering when multiple clients are running in parallel,
+particularly GL apps and compositors.
+
+Signed-off-by: Eric Anholt <eric@anholt.net>
+---
+ drivers/gpu/drm/v3d/v3d_drv.h   | 10 +---
+ drivers/gpu/drm/v3d/v3d_gem.c   | 98 ++++++++++++++++++++++++++++++---
+ drivers/gpu/drm/v3d/v3d_sched.c | 45 ++-------------
+ 3 files changed, 96 insertions(+), 57 deletions(-)
+
+--- a/drivers/gpu/drm/v3d/v3d_drv.h
++++ b/drivers/gpu/drm/v3d/v3d_drv.h
+@@ -186,8 +186,9 @@ struct v3d_job {
+ 	struct v3d_bo **bo;
+ 	u32 bo_count;
+ 
+-	/* An optional fence userspace can pass in for the job to depend on. */
+-	struct dma_fence *in_fence;
++	struct dma_fence **deps;
++	int deps_count;
++	int deps_size;
+ 
+ 	/* v3d fence to be signaled by IRQ handler when the job is complete. */
+ 	struct dma_fence *irq_fence;
+@@ -219,11 +220,6 @@ struct v3d_bin_job {
+ struct v3d_render_job {
+ 	struct v3d_job base;
+ 
+-	/* Optional fence for the binner, to depend on before starting
+-	 * our job.
+-	 */
+-	struct dma_fence *bin_done_fence;
+-
+ 	/* GPU virtual addresses of the start/end of the CL job. */
+ 	u32 start, end;
+ 
+--- a/drivers/gpu/drm/v3d/v3d_gem.c
++++ b/drivers/gpu/drm/v3d/v3d_gem.c
+@@ -218,6 +218,71 @@ v3d_unlock_bo_reservations(struct v3d_bo
+ 	ww_acquire_fini(acquire_ctx);
+ }
+ 
++static int
++v3d_add_dep(struct v3d_job *job, struct dma_fence *fence)
++{
++	if (!fence)
++		return 0;
++
++	if (job->deps_size == job->deps_count) {
++		int new_deps_size = max(job->deps_size * 2, 4);
++		struct dma_fence **new_deps =
++			krealloc(job->deps, new_deps_size * sizeof(*new_deps),
++				 GFP_KERNEL);
++		if (!new_deps) {
++			dma_fence_put(fence);
++			return -ENOMEM;
++		}
++
++		job->deps = new_deps;
++		job->deps_size = new_deps_size;
++	}
++
++	job->deps[job->deps_count++] = fence;
++
++	return 0;
++}
++
++/**
++ * Adds the required implicit fences before executing the job
++ *
++ * Userspace (X11 + Mesa) requires that a job submitted against a shared BO
++ * from one fd will implicitly synchronize against previous jobs submitted
++ * against that BO from other fds.
++ *
++ * Currently we don't bother trying to track the shared BOs, and instead just
++ * sync everything.  However, our synchronization is only for the render pass
++ * -- the binning stage (VS coordinate calculations) ignores implicit sync,
++ * since using shared buffers for texture coordinates seems unlikely, and
++ * implicitly syncing them would break bin/render parallelism.  If we want to
++ * fix that, we should introduce a flag when VS texturing has been used in the
++ * binning stage, or a set of flags for which BOs are sampled during binning.
++ */
++static int
++v3d_add_implicit_fences(struct v3d_job *job, struct v3d_bo *bo)
++{
++	int i, ret, nr_fences;
++	struct dma_fence **fences;
++
++	ret = reservation_object_get_fences_rcu(bo->resv, NULL,
++						&nr_fences, &fences);
++	if (ret || !nr_fences)
++		return ret;
++
++	for (i = 0; i < nr_fences; i++) {
++		ret = v3d_add_dep(job, fences[i]);
++		if (ret)
++			break;
++	}
++
++	/* Free any remaining fences after error. */
++	for (; i < nr_fences; i++)
++		dma_fence_put(fences[i]);
++	kfree(fences);
++
++	return ret;
++}
++
+ /* Takes the reservation lock on all the BOs being referenced, so that
+  * at queue submit time we can update the reservations.
+  *
+@@ -226,10 +291,11 @@ v3d_unlock_bo_reservations(struct v3d_bo
+  * to v3d, so we don't attach dma-buf fences to them.
+  */
+ static int
+-v3d_lock_bo_reservations(struct v3d_bo **bos,
+-			 int bo_count,
++v3d_lock_bo_reservations(struct v3d_job *job,
+ 			 struct ww_acquire_ctx *acquire_ctx)
+ {
++	struct v3d_bo **bos = job->bo;
++	int bo_count = job->bo_count;
+ 	int contended_lock = -1;
+ 	int i, ret;
+ 
+@@ -281,6 +347,13 @@ retry:
+ 	 * before we commit the CL to the hardware.
+ 	 */
+ 	for (i = 0; i < bo_count; i++) {
++		ret = v3d_add_implicit_fences(job, bos[i]);
++		if (ret) {
++			v3d_unlock_bo_reservations(bos, bo_count,
++						   acquire_ctx);
++			return ret;
++		}
++
+ 		ret = reservation_object_reserve_shared(bos[i]->resv);
+ 		if (ret) {
+ 			v3d_unlock_bo_reservations(bos, bo_count,
+@@ -383,7 +456,10 @@ v3d_job_free(struct kref *ref)
+ 	}
+ 	kvfree(job->bo);
+ 
+-	dma_fence_put(job->in_fence);
++	for (i = 0; i < job->deps_count; i++)
++		dma_fence_put(job->deps[i]);
++	kfree(job->deps);
++
+ 	dma_fence_put(job->irq_fence);
+ 	dma_fence_put(job->done_fence);
+ 
+@@ -464,15 +540,20 @@ v3d_job_init(struct v3d_dev *v3d, struct
+ 	     struct v3d_job *job, void (*free)(struct kref *ref),
+ 	     u32 in_sync)
+ {
++	struct dma_fence *in_fence = NULL;
+ 	int ret;
+ 
+ 	job->v3d = v3d;
+ 	job->free = free;
+ 
+-	ret = drm_syncobj_find_fence(file_priv, in_sync, 0, &job->in_fence);
++	ret = drm_syncobj_find_fence(file_priv, in_sync, 0, &in_fence);
+ 	if (ret == -EINVAL)
+ 		return ret;
+ 
++	ret = v3d_add_dep(job, in_fence);
++	if (ret)
++		return ret;
++
+ 	kref_init(&job->refcount);
+ 
+ 	return 0;
+@@ -590,8 +671,7 @@ v3d_submit_cl_ioctl(struct drm_device *d
+ 	if (ret)
+ 		goto fail;
+ 
+-	ret = v3d_lock_bo_reservations(render->base.bo, render->base.bo_count,
+-				       &acquire_ctx);
++	ret = v3d_lock_bo_reservations(&render->base, &acquire_ctx);
+ 	if (ret)
+ 		goto fail;
+ 
+@@ -601,7 +681,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
+ 		if (ret)
+ 			goto fail_unreserve;
+ 
+-		render->bin_done_fence = dma_fence_get(bin->base.done_fence);
++		ret = v3d_add_dep(&render->base,
++				  dma_fence_get(bin->base.done_fence));
+ 	}
+ 
+ 	ret = v3d_push_job(v3d_priv, &render->base, V3D_RENDER);
+@@ -692,8 +773,7 @@ v3d_submit_tfu_ioctl(struct drm_device *
+ 	}
+ 	spin_unlock(&file_priv->table_lock);
+ 
+-	ret = v3d_lock_bo_reservations(job->base.bo, job->base.bo_count,
+-				       &acquire_ctx);
++	ret = v3d_lock_bo_reservations(&job->base, &acquire_ctx);
+ 	if (ret)
+ 		goto fail;
+ 
+--- a/drivers/gpu/drm/v3d/v3d_sched.c
++++ b/drivers/gpu/drm/v3d/v3d_sched.c
+@@ -67,47 +67,10 @@ v3d_job_dependency(struct drm_sched_job
+ 		   struct drm_sched_entity *s_entity)
+ {
+ 	struct v3d_job *job = to_v3d_job(sched_job);
+-	struct dma_fence *fence;
+-
+-	fence = job->in_fence;
+-	if (fence) {
+-		job->in_fence = NULL;
+-		return fence;
+-	}
+-
+-	return NULL;
+-}
+ 
+-/**
+- * Returns the fences that the render job depends on, one by one.
+- * v3d_job_run() won't be called until all of them have been signaled.
+- */
+-static struct dma_fence *
+-v3d_render_job_dependency(struct drm_sched_job *sched_job,
+-			  struct drm_sched_entity *s_entity)
+-{
+-	struct v3d_render_job *job = to_render_job(sched_job);
+-	struct dma_fence *fence;
+-
+-	fence = v3d_job_dependency(sched_job, s_entity);
+-	if (fence)
+-		return fence;
+-
+-	/* If we had a bin job, the render job definitely depends on
+-	 * it. We first have to wait for bin to be scheduled, so that
+-	 * its done_fence is created.
+-	 */
+-	fence = job->bin_done_fence;
+-	if (fence) {
+-		job->bin_done_fence = NULL;
+-		return fence;
+-	}
+-
+-	/* XXX: Wait on a fence for switching the GMP if necessary,
+-	 * and then do so.
+-	 */
+-
+-	return fence;
++	if (!job->deps_count)
++		return NULL;
++	return job->deps[--job->deps_count];
+ }
+ 
+ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
+@@ -329,7 +292,7 @@ static const struct drm_sched_backend_op
+ };
+ 
+ static const struct drm_sched_backend_ops v3d_render_sched_ops = {
+-	.dependency = v3d_render_job_dependency,
++	.dependency = v3d_job_dependency,
+ 	.run_job = v3d_render_job_run,
+ 	.timedout_job = v3d_render_job_timedout,
+ 	.free_job = v3d_job_free,
-- 
cgit v1.2.3