aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch
diff options
context:
space:
mode:
Diffstat (limited to 'target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch')
-rw-r--r-- target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch | 228
1 files changed, 228 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch b/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch
new file mode 100644
index 0000000000..ea0f4f1177
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch
@@ -0,0 +1,228 @@
+From 8f5722ac3e42a33345bfd82b7ad6a153134a4239 Mon Sep 17 00:00:00 2001
+From: Jonas Pfeil <pfeiljonas@gmx.de>
+Date: Tue, 8 Nov 2016 00:18:39 +0100
+Subject: [PATCH] drm/vc4: Add fragment shader threading support
+
+FS threading brings performance improvements of 0-20% in glmark2.
+
+The validation code checks for thread switch signals and ensures that
+the registers of the other thread are not touched, and that our clamps
+are not live across thread switches. It also checks that the
+threading and branching instructions do not interfere.
+
+(Original patch by Jonas, changes by anholt for style cleanup,
+removing validation the kernel doesn't need to do, and adding the flag
+for userspace).
+
+v2: Minor style fixes from checkpatch.
+
+Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
+Signed-off-by: Eric Anholt <eric@anholt.net>
+(cherry picked from commit c778cc5df944291dcdb1ca7a6bb781fbc22550c5)
+---
+ drivers/gpu/drm/vc4/vc4_drv.c | 1 +
+ drivers/gpu/drm/vc4/vc4_drv.h | 2 +
+ drivers/gpu/drm/vc4/vc4_validate.c | 17 +++++---
+ drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++++++++++
+ include/uapi/drm/vc4_drm.h | 1 +
+ 5 files changed, 79 insertions(+), 5 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_drv.c
++++ b/drivers/gpu/drm/vc4/vc4_drv.c
+@@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct dr
+ break;
+ case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
+ case DRM_VC4_PARAM_SUPPORTS_ETC1:
++ case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
+ args->value = true;
+ break;
+ default:
+--- a/drivers/gpu/drm/vc4/vc4_drv.h
++++ b/drivers/gpu/drm/vc4/vc4_drv.h
+@@ -384,6 +384,8 @@ struct vc4_validated_shader_info {
+
+ uint32_t num_uniform_addr_offsets;
+ uint32_t *uniform_addr_offsets;
++
++ bool is_threaded;
+ };
+
+ /**
+--- a/drivers/gpu/drm/vc4/vc4_validate.c
++++ b/drivers/gpu/drm/vc4/vc4_validate.c
+@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device
+ exec->shader_rec_v += roundup(packet_size, 16);
+ exec->shader_rec_size -= packet_size;
+
+- if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
+- DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
+- return -EINVAL;
+- }
+-
+ for (i = 0; i < shader_reloc_count; i++) {
+ if (src_handles[i] > exec->bo_count) {
+ DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
+@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device
+ return -EINVAL;
+ }
+
++ if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
++ to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
++ DRM_ERROR("Thread mode of CL and FS do not match\n");
++ return -EINVAL;
++ }
++
++ if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
++ to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
++ DRM_ERROR("cs and vs cannot be threaded\n");
++ return -EINVAL;
++ }
++
+ for (i = 0; i < shader_reloc_count; i++) {
+ struct vc4_validated_shader_info *validated_shader;
+ uint32_t o = shader_reloc_offsets[i];
+--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
++++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+@@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
+ * basic blocks.
+ */
+ bool needs_uniform_address_for_loop;
++
++ /* Set when we find an instruction writing the top half of the
++ * register files. If we allowed writing the unusable regs in
++ * a threaded shader, then the other shader running on our
++ * QPU's clamp validation would be invalid.
++ */
++ bool all_registers_used;
+ };
+
+ static uint32_t
+@@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t i
+ }
+
+ static bool
++live_reg_is_upper_half(uint32_t lri)
++{
++ return (lri >= 16 && lri < 32) ||
++ (lri >= 32 + 16 && lri < 32 + 32);
++}
++
++static bool
+ is_tmu_submit(uint32_t waddr)
+ {
+ return (waddr == QPU_W_TMU0_S ||
+@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_sha
+ } else {
+ validation_state->live_immediates[lri] = ~0;
+ }
++
++ if (live_reg_is_upper_half(lri))
++ validation_state->all_registers_used = true;
+ }
+
+ switch (waddr) {
+@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_valid
+ }
+ }
+
++ if ((raddr_a >= 16 && raddr_a < 32) ||
++ (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
++ validation_state->all_registers_used = true;
++ }
++
+ return true;
+ }
+
+@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_o
+ {
+ bool found_shader_end = false;
+ int shader_end_ip = 0;
++ uint32_t last_thread_switch_ip = -3;
+ uint32_t ip;
+ struct vc4_validated_shader_info *validated_shader = NULL;
+ struct vc4_shader_validation_state validation_state;
+@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_o
+ if (!vc4_handle_branch_target(&validation_state))
+ goto fail;
+
++ if (ip == last_thread_switch_ip + 3) {
++ /* Reset r0-r3 live clamp data */
++ int i;
++
++ for (i = 64; i < LIVE_REG_COUNT; i++) {
++ validation_state.live_min_clamp_offsets[i] = ~0;
++ validation_state.live_max_clamp_regs[i] = false;
++ validation_state.live_immediates[i] = ~0;
++ }
++ }
++
+ switch (sig) {
+ case QPU_SIG_NONE:
+ case QPU_SIG_WAIT_FOR_SCOREBOARD:
+@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_o
+ case QPU_SIG_LOAD_TMU1:
+ case QPU_SIG_PROG_END:
+ case QPU_SIG_SMALL_IMM:
++ case QPU_SIG_THREAD_SWITCH:
++ case QPU_SIG_LAST_THREAD_SWITCH:
+ if (!check_instruction_writes(validated_shader,
+ &validation_state)) {
+ DRM_ERROR("Bad write at ip %d\n", ip);
+@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_o
+ shader_end_ip = ip;
+ }
+
++ if (sig == QPU_SIG_THREAD_SWITCH ||
++ sig == QPU_SIG_LAST_THREAD_SWITCH) {
++ validated_shader->is_threaded = true;
++
++ if (ip < last_thread_switch_ip + 3) {
++ DRM_ERROR("Thread switch too soon after "
++ "last switch at ip %d\n", ip);
++ goto fail;
++ }
++ last_thread_switch_ip = ip;
++ }
++
+ break;
+
+ case QPU_SIG_LOAD_IMM:
+@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_o
+ if (!check_branch(inst, validated_shader,
+ &validation_state, ip))
+ goto fail;
++
++ if (ip < last_thread_switch_ip + 3) {
++ DRM_ERROR("Branch in thread switch at ip %d",
++ ip);
++ goto fail;
++ }
++
+ break;
+ default:
+ DRM_ERROR("Unsupported QPU signal %d at "
+@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_o
+ goto fail;
+ }
+
++ /* Might corrupt other thread */
++ if (validated_shader->is_threaded &&
++ validation_state.all_registers_used) {
++ DRM_ERROR("Shader uses threading, but uses the upper "
++ "half of the registers, too\n");
++ goto fail;
++ }
++
+ /* If we did a backwards branch and we haven't emitted a uniforms
+ * reset since then, we still need the uniforms stream to have the
+ * uniforms address available so that the backwards branch can do its
+--- a/include/uapi/drm/vc4_drm.h
++++ b/include/uapi/drm/vc4_drm.h
+@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
+ #define DRM_VC4_PARAM_V3D_IDENT2 2
+ #define DRM_VC4_PARAM_SUPPORTS_BRANCHES 3
+ #define DRM_VC4_PARAM_SUPPORTS_ETC1 4
++#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5
+
+ struct drm_vc4_get_param {
+ __u32 param;