diff options
Diffstat (limited to 'target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch')
-rw-r--r-- | target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch | 429 |
1 files changed, 0 insertions, 429 deletions
diff --git a/target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch b/target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch deleted file mode 100644 index addef40932..0000000000 --- a/target/linux/brcm2708/patches-4.4/0290-drm-vc4-improve-throughput-by-pipelining-binning-and.patch +++ /dev/null @@ -1,429 +0,0 @@ -From c661b60a4057da2d212fc5e8e2a56d463d912974 Mon Sep 17 00:00:00 2001 -From: Varad Gautam <varadgautam@gmail.com> -Date: Wed, 17 Feb 2016 19:08:21 +0530 -Subject: [PATCH] drm/vc4: improve throughput by pipelining binning and - rendering jobs - -The hardware provides us with separate threads for binning and -rendering, and the existing model waits for them both to complete -before submitting the next job. - -Splitting the binning and rendering submissions reduces idle time and -gives us approx 20-30% speedup with some x11perf tests such as -line10 -and -tilerect1. Improves openarena performance by 1.01897% +/- -0.247857% (n=16). - -Thanks to anholt for suggesting this. - -v2: Rebase on the spurious resets fix (change by anholt). - -Signed-off-by: Varad Gautam <varadgautam@gmail.com> -Reviewed-by: Eric Anholt <eric@anholt.net> -Signed-off-by: Eric Anholt <eric@anholt.net> -(cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207) ---- - drivers/gpu/drm/vc4/vc4_drv.h | 37 +++++++++---- - drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------ - drivers/gpu/drm/vc4/vc4_irq.c | 58 ++++++++++++++++---- - 3 files changed, 166 insertions(+), 52 deletions(-) - ---- a/drivers/gpu/drm/vc4/vc4_drv.h -+++ b/drivers/gpu/drm/vc4/vc4_drv.h -@@ -53,7 +53,7 @@ struct vc4_dev { - /* Protects bo_cache and the BO stats. */ - struct mutex bo_lock; - -- /* Sequence number for the last job queued in job_list. -+ /* Sequence number for the last job queued in bin_job_list. - * Starts at 0 (no jobs emitted). - */ - uint64_t emit_seqno; -@@ -63,11 +63,19 @@ struct vc4_dev { - */ - uint64_t finished_seqno; - -- /* List of all struct vc4_exec_info for jobs to be executed. -- * The first job in the list is the one currently programmed -- * into ct0ca/ct1ca for execution. -+ /* List of all struct vc4_exec_info for jobs to be executed in -+ * the binner. The first job in the list is the one currently -+ * programmed into ct0ca for execution. -+ */ -+ struct list_head bin_job_list; -+ -+ /* List of all struct vc4_exec_info for jobs that have -+ * completed binning and are ready for rendering. The first -+ * job in the list is the one currently programmed into ct1ca -+ * for execution. - */ -- struct list_head job_list; -+ struct list_head render_job_list; -+ - /* List of the finished vc4_exec_infos waiting to be freed by - * job_done_work. - */ -@@ -291,11 +299,20 @@ struct vc4_exec_info { - }; - - static inline struct vc4_exec_info * --vc4_first_job(struct vc4_dev *vc4) -+vc4_first_bin_job(struct vc4_dev *vc4) -+{ -+ if (list_empty(&vc4->bin_job_list)) -+ return NULL; -+ return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head); -+} -+ -+static inline struct vc4_exec_info * -+vc4_first_render_job(struct vc4_dev *vc4) - { -- if (list_empty(&vc4->job_list)) -+ if (list_empty(&vc4->render_job_list)) - return NULL; -- return list_first_entry(&vc4->job_list, struct vc4_exec_info, head); -+ return list_first_entry(&vc4->render_job_list, -+ struct vc4_exec_info, head); - } - - /** -@@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi - struct drm_file *file_priv); - int vc4_wait_bo_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv); --void vc4_submit_next_job(struct drm_device *dev); -+void vc4_submit_next_bin_job(struct drm_device *dev); -+void vc4_submit_next_render_job(struct drm_device *dev); -+void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec); - int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, - uint64_t timeout_ns, bool interruptible); - void vc4_job_handle_completed(struct vc4_dev *vc4); ---- a/drivers/gpu/drm/vc4/vc4_gem.c -+++ b/drivers/gpu/drm/vc4/vc4_gem.c -@@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d - struct vc4_dev *vc4 = to_vc4_dev(dev); - struct drm_vc4_get_hang_state *state; - struct vc4_hang_state *kernel_state; -- struct vc4_exec_info *exec; -+ struct vc4_exec_info *exec[2]; - struct vc4_bo *bo; - unsigned long irqflags; -- unsigned int i, unref_list_count; -+ unsigned int i, j, unref_list_count, prev_idx; - - kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL); - if (!kernel_state) -@@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d - state = &kernel_state->user_state; - - spin_lock_irqsave(&vc4->job_lock, irqflags); -- exec = vc4_first_job(vc4); -- if (!exec) { -+ exec[0] = vc4_first_bin_job(vc4); -+ exec[1] = vc4_first_render_job(vc4); -+ if (!exec[0] && !exec[1]) { - spin_unlock_irqrestore(&vc4->job_lock, irqflags); - return; - } - -- unref_list_count = 0; -- list_for_each_entry(bo, &exec->unref_list, unref_head) -- unref_list_count++; -- -- state->bo_count = exec->bo_count + unref_list_count; -- kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo), -- GFP_ATOMIC); -+ /* Get the bos from both binner and renderer into hang state. */ -+ state->bo_count = 0; -+ for (i = 0; i < 2; i++) { -+ if (!exec[i]) -+ continue; -+ -+ unref_list_count = 0; -+ list_for_each_entry(bo, &exec[i]->unref_list, unref_head) -+ unref_list_count++; -+ state->bo_count += exec[i]->bo_count + unref_list_count; -+ } -+ -+ kernel_state->bo = kcalloc(state->bo_count, -+ sizeof(*kernel_state->bo), GFP_ATOMIC); -+ - if (!kernel_state->bo) { - spin_unlock_irqrestore(&vc4->job_lock, irqflags); - return; - } - -- for (i = 0; i < exec->bo_count; i++) { -- drm_gem_object_reference(&exec->bo[i]->base); -- kernel_state->bo[i] = &exec->bo[i]->base; -- } -+ prev_idx = 0; -+ for (i = 0; i < 2; i++) { -+ if (!exec[i]) -+ continue; -+ -+ for (j = 0; j < exec[i]->bo_count; j++) { -+ drm_gem_object_reference(&exec[i]->bo[j]->base); -+ kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base; -+ } - -- list_for_each_entry(bo, &exec->unref_list, unref_head) { -- drm_gem_object_reference(&bo->base.base); -- kernel_state->bo[i] = &bo->base.base; -- i++; -+ list_for_each_entry(bo, &exec[i]->unref_list, unref_head) { -+ drm_gem_object_reference(&bo->base.base); -+ kernel_state->bo[j + prev_idx] = &bo->base.base; -+ j++; -+ } -+ prev_idx = j + 1; - } - -- state->start_bin = exec->ct0ca; -- state->start_render = exec->ct1ca; -+ if (exec[0]) -+ state->start_bin = exec[0]->ct0ca; -+ if (exec[1]) -+ state->start_render = exec[1]->ct1ca; - - spin_unlock_irqrestore(&vc4->job_lock, irqflags); - -@@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data - struct vc4_dev *vc4 = to_vc4_dev(dev); - uint32_t ct0ca, ct1ca; - unsigned long irqflags; -- struct vc4_exec_info *exec; -+ struct vc4_exec_info *bin_exec, *render_exec; - - spin_lock_irqsave(&vc4->job_lock, irqflags); -- exec = vc4_first_job(vc4); -+ -+ bin_exec = vc4_first_bin_job(vc4); -+ render_exec = vc4_first_render_job(vc4); - - /* If idle, we can stop watching for hangs. */ -- if (!exec) { -+ if (!bin_exec && !render_exec) { - spin_unlock_irqrestore(&vc4->job_lock, irqflags); - return; - } -@@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data - /* If we've made any progress in execution, rearm the timer - * and wait. - */ -- if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) { -- exec->last_ct0ca = ct0ca; -- exec->last_ct1ca = ct1ca; -+ if ((bin_exec && ct0ca != bin_exec->last_ct0ca) || -+ (render_exec && ct1ca != render_exec->last_ct1ca)) { -+ if (bin_exec) -+ bin_exec->last_ct0ca = ct0ca; -+ if (render_exec) -+ render_exec->last_ct1ca = ct1ca; - spin_unlock_irqrestore(&vc4->job_lock, irqflags); - vc4_queue_hangcheck(dev); - return; -@@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev) - * The job_lock should be held during this. - */ - void --vc4_submit_next_job(struct drm_device *dev) -+vc4_submit_next_bin_job(struct drm_device *dev) - { - struct vc4_dev *vc4 = to_vc4_dev(dev); -- struct vc4_exec_info *exec = vc4_first_job(vc4); -+ struct vc4_exec_info *exec; - -+again: -+ exec = vc4_first_bin_job(vc4); - if (!exec) - return; - -@@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d - V3D_WRITE(V3D_BPOA, 0); - V3D_WRITE(V3D_BPOS, 0); - -- if (exec->ct0ca != exec->ct0ea) -+ /* Either put the job in the binner if it uses the binner, or -+ * immediately move it to the to-be-rendered queue. -+ */ -+ if (exec->ct0ca != exec->ct0ea) { - submit_cl(dev, 0, exec->ct0ca, exec->ct0ea); -+ } else { -+ vc4_move_job_to_render(dev, exec); -+ goto again; -+ } -+} -+ -+void -+vc4_submit_next_render_job(struct drm_device *dev) -+{ -+ struct vc4_dev *vc4 = to_vc4_dev(dev); -+ struct vc4_exec_info *exec = vc4_first_render_job(vc4); -+ -+ if (!exec) -+ return; -+ - submit_cl(dev, 1, exec->ct1ca, exec->ct1ea); - } - -+void -+vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec) -+{ -+ struct vc4_dev *vc4 = to_vc4_dev(dev); -+ bool was_empty = list_empty(&vc4->render_job_list); -+ -+ list_move_tail(&exec->head, &vc4->render_job_list); -+ if (was_empty) -+ vc4_submit_next_render_job(dev); -+} -+ - static void - vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno) - { -@@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev, - exec->seqno = seqno; - vc4_update_bo_seqnos(exec, seqno); - -- list_add_tail(&exec->head, &vc4->job_list); -+ list_add_tail(&exec->head, &vc4->bin_job_list); - - /* If no job was executing, kick ours off. Otherwise, it'll -- * get started when the previous job's frame done interrupt -+ * get started when the previous job's flush done interrupt - * occurs. - */ -- if (vc4_first_job(vc4) == exec) { -- vc4_submit_next_job(dev); -+ if (vc4_first_bin_job(vc4) == exec) { -+ vc4_submit_next_bin_job(dev); - vc4_queue_hangcheck(dev); - } - -@@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev) - { - struct vc4_dev *vc4 = to_vc4_dev(dev); - -- INIT_LIST_HEAD(&vc4->job_list); -+ INIT_LIST_HEAD(&vc4->bin_job_list); -+ INIT_LIST_HEAD(&vc4->render_job_list); - INIT_LIST_HEAD(&vc4->job_done_list); - INIT_LIST_HEAD(&vc4->seqno_cb_list); - spin_lock_init(&vc4->job_lock); ---- a/drivers/gpu/drm/vc4/vc4_irq.c -+++ b/drivers/gpu/drm/vc4/vc4_irq.c -@@ -30,6 +30,10 @@ - * disables that specific interrupt, and 0s written are ignored - * (reading either one returns the set of enabled interrupts). - * -+ * When we take a binning flush done interrupt, we need to submit the -+ * next frame for binning and move the finished frame to the render -+ * thread. -+ * - * When we take a render frame interrupt, we need to wake the - * processes waiting for some frame to be done, and get the next frame - * submitted ASAP (so the hardware doesn't sit idle when there's work -@@ -44,6 +48,7 @@ - #include "vc4_regs.h" - - #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \ -+ V3D_INT_FLDONE | \ - V3D_INT_FRDONE) - - DECLARE_WAIT_QUEUE_HEAD(render_wait); -@@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct - unsigned long irqflags; - - spin_lock_irqsave(&vc4->job_lock, irqflags); -- current_exec = vc4_first_job(vc4); -+ current_exec = vc4_first_bin_job(vc4); - if (current_exec) { - vc4->overflow_mem->seqno = vc4->finished_seqno + 1; - list_add_tail(&vc4->overflow_mem->unref_head, -@@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct - } - - static void --vc4_irq_finish_job(struct drm_device *dev) -+vc4_irq_finish_bin_job(struct drm_device *dev) -+{ -+ struct vc4_dev *vc4 = to_vc4_dev(dev); -+ struct vc4_exec_info *exec = vc4_first_bin_job(vc4); -+ -+ if (!exec) -+ return; -+ -+ vc4_move_job_to_render(dev, exec); -+ vc4_submit_next_bin_job(dev); -+} -+ -+static void -+vc4_cancel_bin_job(struct drm_device *dev) -+{ -+ struct vc4_dev *vc4 = to_vc4_dev(dev); -+ struct vc4_exec_info *exec = vc4_first_bin_job(vc4); -+ -+ if (!exec) -+ return; -+ -+ list_move_tail(&exec->head, &vc4->bin_job_list); -+ vc4_submit_next_bin_job(dev); -+} -+ -+static void -+vc4_irq_finish_render_job(struct drm_device *dev) - { - struct vc4_dev *vc4 = to_vc4_dev(dev); -- struct vc4_exec_info *exec = vc4_first_job(vc4); -+ struct vc4_exec_info *exec = vc4_first_render_job(vc4); - - if (!exec) - return; - - vc4->finished_seqno++; - list_move_tail(&exec->head, &vc4->job_done_list); -- vc4_submit_next_job(dev); -+ vc4_submit_next_render_job(dev); - - wake_up_all(&vc4->job_wait_queue); - schedule_work(&vc4->job_done_work); -@@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg) - barrier(); - intctl = V3D_READ(V3D_INTCTL); - -- /* Acknowledge the interrupts we're handling here. The render -- * frame done interrupt will be cleared, while OUTOMEM will -- * stay high until the underlying cause is cleared. -+ /* Acknowledge the interrupts we're handling here. The binner -+ * last flush / render frame done interrupt will be cleared, -+ * while OUTOMEM will stay high until the underlying cause is -+ * cleared. - */ - V3D_WRITE(V3D_INTCTL, intctl); - -@@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg) - status = IRQ_HANDLED; - } - -+ if (intctl & V3D_INT_FLDONE) { -+ spin_lock(&vc4->job_lock); -+ vc4_irq_finish_bin_job(dev); -+ spin_unlock(&vc4->job_lock); -+ status = IRQ_HANDLED; -+ } -+ - if (intctl & V3D_INT_FRDONE) { - spin_lock(&vc4->job_lock); -- vc4_irq_finish_job(dev); -+ vc4_irq_finish_render_job(dev); - spin_unlock(&vc4->job_lock); - status = IRQ_HANDLED; - } -@@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de - V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS); - - spin_lock_irqsave(&vc4->job_lock, irqflags); -- vc4_irq_finish_job(dev); -+ vc4_cancel_bin_job(dev); -+ vc4_irq_finish_render_job(dev); - spin_unlock_irqrestore(&vc4->job_lock, irqflags); - } |