diff options
Diffstat (limited to 'target/linux/brcm2708/patches-4.4/0545-drm-vc4-Add-fragment-shader-threading-support.patch')
-rw-r--r-- | target/linux/brcm2708/patches-4.4/0545-drm-vc4-Add-fragment-shader-threading-support.patch | 228 |
1 files changed, 0 insertions, 228 deletions
diff --git a/target/linux/brcm2708/patches-4.4/0545-drm-vc4-Add-fragment-shader-threading-support.patch b/target/linux/brcm2708/patches-4.4/0545-drm-vc4-Add-fragment-shader-threading-support.patch deleted file mode 100644 index db469b3957..0000000000 --- a/target/linux/brcm2708/patches-4.4/0545-drm-vc4-Add-fragment-shader-threading-support.patch +++ /dev/null @@ -1,228 +0,0 @@ -From dcd19269dea9ee867deaf0fc5e5840525767c68e Mon Sep 17 00:00:00 2001 -From: Jonas Pfeil <pfeiljonas@gmx.de> -Date: Tue, 8 Nov 2016 00:18:39 +0100 -Subject: [PATCH] drm/vc4: Add fragment shader threading support - -FS threading brings performance improvements of 0-20% in glmark2. - -The validation code checks for thread switch signals and ensures that -the registers of the other thread are not touched, and that our clamps -are not live across thread switches. It also checks that the -threading and branching instructions do not interfere. - -(Original patch by Jonas, changes by anholt for style cleanup, -removing validation the kernel doesn't need to do, and adding the flag -for userspace). - -v2: Minor style fixes from checkpatch. - -Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de> -Signed-off-by: Eric Anholt <eric@anholt.net> -(cherry picked from commit c778cc5df944291dcdb1ca7a6bb781fbc22550c5) ---- - drivers/gpu/drm/vc4/vc4_drv.c | 1 + - drivers/gpu/drm/vc4/vc4_drv.h | 2 + - drivers/gpu/drm/vc4/vc4_validate.c | 17 +++++--- - drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++++++++++ - include/uapi/drm/vc4_drm.h | 1 + - 5 files changed, 79 insertions(+), 5 deletions(-) - ---- a/drivers/gpu/drm/vc4/vc4_drv.c -+++ b/drivers/gpu/drm/vc4/vc4_drv.c -@@ -107,6 +107,7 @@ static int vc4_get_param_ioctl(struct dr - break; - case DRM_VC4_PARAM_SUPPORTS_BRANCHES: - case DRM_VC4_PARAM_SUPPORTS_ETC1: -+ case DRM_VC4_PARAM_SUPPORTS_THREADED_FS: - args->value = true; - break; - default: ---- a/drivers/gpu/drm/vc4/vc4_drv.h -+++ b/drivers/gpu/drm/vc4/vc4_drv.h -@@ -395,6 +395,8 @@ struct vc4_validated_shader_info { - - uint32_t num_uniform_addr_offsets; - uint32_t *uniform_addr_offsets; -+ -+ bool is_threaded; - }; - - /** ---- a/drivers/gpu/drm/vc4/vc4_validate.c -+++ b/drivers/gpu/drm/vc4/vc4_validate.c -@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device - exec->shader_rec_v += roundup(packet_size, 16); - exec->shader_rec_size -= packet_size; - -- if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { -- DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); -- return -EINVAL; -- } -- - for (i = 0; i < shader_reloc_count; i++) { - if (src_handles[i] > exec->bo_count) { - DRM_ERROR("Shader handle %d too big\n", src_handles[i]); -@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device - return -EINVAL; - } - -+ if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) != -+ to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) { -+ DRM_ERROR("Thread mode of CL and FS do not match\n"); -+ return -EINVAL; -+ } -+ -+ if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded || -+ to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) { -+ DRM_ERROR("cs and vs cannot be threaded\n"); -+ return -EINVAL; -+ } -+ - for (i = 0; i < shader_reloc_count; i++) { - struct vc4_validated_shader_info *validated_shader; - uint32_t o = shader_reloc_offsets[i]; ---- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c -+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c -@@ -83,6 +83,13 @@ struct vc4_shader_validation_state { - * basic blocks. - */ - bool needs_uniform_address_for_loop; -+ -+ /* Set when we find an instruction writing the top half of the -+ * register files. If we allowed writing the unusable regs in -+ * a threaded shader, then the other shader running on our -+ * QPU's clamp validation would be invalid. -+ */ -+ bool all_registers_used; - }; - - static uint32_t -@@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t i - } - - static bool -+live_reg_is_upper_half(uint32_t lri) -+{ -+ return (lri >= 16 && lri < 32) || -+ (lri >= 32 + 16 && lri < 32 + 32); -+} -+ -+static bool - is_tmu_submit(uint32_t waddr) - { - return (waddr == QPU_W_TMU0_S || -@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_sha - } else { - validation_state->live_immediates[lri] = ~0; - } -+ -+ if (live_reg_is_upper_half(lri)) -+ validation_state->all_registers_used = true; - } - - switch (waddr) { -@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_valid - } - } - -+ if ((raddr_a >= 16 && raddr_a < 32) || -+ (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) { -+ validation_state->all_registers_used = true; -+ } -+ - return true; - } - -@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_o - { - bool found_shader_end = false; - int shader_end_ip = 0; -+ uint32_t last_thread_switch_ip = -3; - uint32_t ip; - struct vc4_validated_shader_info *validated_shader = NULL; - struct vc4_shader_validation_state validation_state; -@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_o - if (!vc4_handle_branch_target(&validation_state)) - goto fail; - -+ if (ip == last_thread_switch_ip + 3) { -+ /* Reset r0-r3 live clamp data */ -+ int i; -+ -+ for (i = 64; i < LIVE_REG_COUNT; i++) { -+ validation_state.live_min_clamp_offsets[i] = ~0; -+ validation_state.live_max_clamp_regs[i] = false; -+ validation_state.live_immediates[i] = ~0; -+ } -+ } -+ - switch (sig) { - case QPU_SIG_NONE: - case QPU_SIG_WAIT_FOR_SCOREBOARD: -@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_o - case QPU_SIG_LOAD_TMU1: - case QPU_SIG_PROG_END: - case QPU_SIG_SMALL_IMM: -+ case QPU_SIG_THREAD_SWITCH: -+ case QPU_SIG_LAST_THREAD_SWITCH: - if (!check_instruction_writes(validated_shader, - &validation_state)) { - DRM_ERROR("Bad write at ip %d\n", ip); -@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_o - shader_end_ip = ip; - } - -+ if (sig == QPU_SIG_THREAD_SWITCH || -+ sig == QPU_SIG_LAST_THREAD_SWITCH) { -+ validated_shader->is_threaded = true; -+ -+ if (ip < last_thread_switch_ip + 3) { -+ DRM_ERROR("Thread switch too soon after " -+ "last switch at ip %d\n", ip); -+ goto fail; -+ } -+ last_thread_switch_ip = ip; -+ } -+ - break; - - case QPU_SIG_LOAD_IMM: -@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_o - if (!check_branch(inst, validated_shader, - &validation_state, ip)) - goto fail; -+ -+ if (ip < last_thread_switch_ip + 3) { -+ DRM_ERROR("Branch in thread switch at ip %d", -+ ip); -+ goto fail; -+ } -+ - break; - default: - DRM_ERROR("Unsupported QPU signal %d at " -@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_o - goto fail; - } - -+ /* Might corrupt other thread */ -+ if (validated_shader->is_threaded && -+ validation_state.all_registers_used) { -+ DRM_ERROR("Shader uses threading, but uses the upper " -+ "half of the registers, too\n"); -+ goto fail; -+ } -+ - /* If we did a backwards branch and we haven't emitted a uniforms - * reset since then, we still need the uniforms stream to have the - * uniforms address available so that the backwards branch can do its ---- a/include/uapi/drm/vc4_drm.h -+++ b/include/uapi/drm/vc4_drm.h -@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state { - #define DRM_VC4_PARAM_V3D_IDENT2 2 - #define DRM_VC4_PARAM_SUPPORTS_BRANCHES 3 - #define DRM_VC4_PARAM_SUPPORTS_ETC1 4 -+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5 - - struct drm_vc4_get_param { - __u32 param; |