Diffstat (limited to 'target/linux/brcm2708/patches-4.4/0294-drm-vc4-improve-throughput-by-pipelining-binning-and.patch')
-rw-r--r-- | target/linux/brcm2708/patches-4.4/0294-drm-vc4-improve-throughput-by-pipelining-binning-and.patch | 429 |
1 file changed, 429 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-4.4/0294-drm-vc4-improve-throughput-by-pipelining-binning-and.patch b/target/linux/brcm2708/patches-4.4/0294-drm-vc4-improve-throughput-by-pipelining-binning-and.patch
new file mode 100644
index 0000000000..89c067b93e
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.4/0294-drm-vc4-improve-throughput-by-pipelining-binning-and.patch
@@ -0,0 +1,429 @@
+From a2c21b04f340f594c16f9c7235ec7b7f78a96a1f Mon Sep 17 00:00:00 2001
+From: Varad Gautam <varadgautam@gmail.com>
+Date: Wed, 17 Feb 2016 19:08:21 +0530
+Subject: [PATCH 294/304] drm/vc4: improve throughput by pipelining binning and
+ rendering jobs
+
+The hardware provides us with separate threads for binning and
+rendering, and the existing model waits for them both to complete
+before submitting the next job.
+
+Splitting the binning and rendering submissions reduces idle time and
+gives us approx 20-30% speedup with some x11perf tests such as -line10
+and -tilerect1. Improves openarena performance by 1.01897% +/-
+0.247857% (n=16).
+
+Thanks to anholt for suggesting this.
+
+v2: Rebase on the spurious resets fix (change by anholt).
+
+Signed-off-by: Varad Gautam <varadgautam@gmail.com>
+Reviewed-by: Eric Anholt <eric@anholt.net>
+Signed-off-by: Eric Anholt <eric@anholt.net>
+(cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207)
+---
+ drivers/gpu/drm/vc4/vc4_drv.h |  37 +++++++++----
+ drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------
+ drivers/gpu/drm/vc4/vc4_irq.c |  58 ++++++++++++++++----
+ 3 files changed, 166 insertions(+), 52 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_drv.h
++++ b/drivers/gpu/drm/vc4/vc4_drv.h
+@@ -53,7 +53,7 @@ struct vc4_dev {
+ 	/* Protects bo_cache and the BO stats. */
+ 	struct mutex bo_lock;
+ 
+-	/* Sequence number for the last job queued in job_list.
++	/* Sequence number for the last job queued in bin_job_list.
+ 	 * Starts at 0 (no jobs emitted).
+ 	 */
+ 	uint64_t emit_seqno;
+@@ -63,11 +63,19 @@ struct vc4_dev {
+ 	 */
+ 	uint64_t finished_seqno;
+ 
+-	/* List of all struct vc4_exec_info for jobs to be executed.
+-	 * The first job in the list is the one currently programmed
+-	 * into ct0ca/ct1ca for execution.
++	/* List of all struct vc4_exec_info for jobs to be executed in
++	 * the binner. The first job in the list is the one currently
++	 * programmed into ct0ca for execution.
++	 */
++	struct list_head bin_job_list;
++
++	/* List of all struct vc4_exec_info for jobs that have
++	 * completed binning and are ready for rendering. The first
++	 * job in the list is the one currently programmed into ct1ca
++	 * for execution.
+ 	 */
+-	struct list_head job_list;
++	struct list_head render_job_list;
++
+ 	/* List of the finished vc4_exec_infos waiting to be freed by
+ 	 * job_done_work.
+ 	 */
+@@ -291,11 +299,20 @@ struct vc4_exec_info {
+ };
+ 
+ static inline struct vc4_exec_info *
+-vc4_first_job(struct vc4_dev *vc4)
++vc4_first_bin_job(struct vc4_dev *vc4)
++{
++	if (list_empty(&vc4->bin_job_list))
++		return NULL;
++	return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head);
++}
++
++static inline struct vc4_exec_info *
++vc4_first_render_job(struct vc4_dev *vc4)
+ {
+-	if (list_empty(&vc4->job_list))
++	if (list_empty(&vc4->render_job_list))
+ 		return NULL;
+-	return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
++	return list_first_entry(&vc4->render_job_list,
++				struct vc4_exec_info, head);
+ }
+ 
+ /**
+@@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi
+ 			 struct drm_file *file_priv);
+ int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
+ 		      struct drm_file *file_priv);
+-void vc4_submit_next_job(struct drm_device *dev);
++void vc4_submit_next_bin_job(struct drm_device *dev);
++void vc4_submit_next_render_job(struct drm_device *dev);
++void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec);
+ int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
+ 		       uint64_t timeout_ns, bool interruptible);
+ void vc4_job_handle_completed(struct vc4_dev *vc4);
+--- a/drivers/gpu/drm/vc4/vc4_gem.c
++++ b/drivers/gpu/drm/vc4/vc4_gem.c
+@@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+ 	struct drm_vc4_get_hang_state *state;
+ 	struct vc4_hang_state *kernel_state;
+-	struct vc4_exec_info *exec;
++	struct vc4_exec_info *exec[2];
+ 	struct vc4_bo *bo;
+ 	unsigned long irqflags;
+-	unsigned int i, unref_list_count;
++	unsigned int i, j, unref_list_count, prev_idx;
+ 
+ 	kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
+ 	if (!kernel_state)
+@@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d
+ 	state = &kernel_state->user_state;
+ 
+ 	spin_lock_irqsave(&vc4->job_lock, irqflags);
+-	exec = vc4_first_job(vc4);
+-	if (!exec) {
++	exec[0] = vc4_first_bin_job(vc4);
++	exec[1] = vc4_first_render_job(vc4);
++	if (!exec[0] && !exec[1]) {
+ 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ 		return;
+ 	}
+ 
+-	unref_list_count = 0;
+-	list_for_each_entry(bo, &exec->unref_list, unref_head)
+-		unref_list_count++;
+-
+-	state->bo_count = exec->bo_count + unref_list_count;
+-	kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
+-				   GFP_ATOMIC);
++	/* Get the bos from both binner and renderer into hang state. */
++	state->bo_count = 0;
++	for (i = 0; i < 2; i++) {
++		if (!exec[i])
++			continue;
++
++		unref_list_count = 0;
++		list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
++			unref_list_count++;
++		state->bo_count += exec[i]->bo_count + unref_list_count;
++	}
++
++	kernel_state->bo = kcalloc(state->bo_count,
++				   sizeof(*kernel_state->bo), GFP_ATOMIC);
++
+ 	if (!kernel_state->bo) {
+ 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ 		return;
+ 	}
+ 
+-	for (i = 0; i < exec->bo_count; i++) {
+-		drm_gem_object_reference(&exec->bo[i]->base);
+-		kernel_state->bo[i] = &exec->bo[i]->base;
+-	}
++	prev_idx = 0;
++	for (i = 0; i < 2; i++) {
++		if (!exec[i])
++			continue;
++
++		for (j = 0; j < exec[i]->bo_count; j++) {
++			drm_gem_object_reference(&exec[i]->bo[j]->base);
++			kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
++		}
+ 
+-	list_for_each_entry(bo, &exec->unref_list, unref_head) {
+-		drm_gem_object_reference(&bo->base.base);
+-		kernel_state->bo[i] = &bo->base.base;
+-		i++;
++		list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
++			drm_gem_object_reference(&bo->base.base);
++			kernel_state->bo[j + prev_idx] = &bo->base.base;
++			j++;
++		}
++		prev_idx = j + 1;
+ 	}
+ 
+-	state->start_bin = exec->ct0ca;
+-	state->start_render = exec->ct1ca;
++	if (exec[0])
++		state->start_bin = exec[0]->ct0ca;
++	if (exec[1])
++		state->start_render = exec[1]->ct1ca;
+ 
+ 	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ 
+@@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+ 	uint32_t ct0ca, ct1ca;
+ 	unsigned long irqflags;
+-	struct vc4_exec_info *exec;
++	struct vc4_exec_info *bin_exec, *render_exec;
+ 
+ 	spin_lock_irqsave(&vc4->job_lock, irqflags);
+-	exec = vc4_first_job(vc4);
++
++	bin_exec = vc4_first_bin_job(vc4);
++	render_exec = vc4_first_render_job(vc4);
+ 
+ 	/* If idle, we can stop watching for hangs. */
+-	if (!exec) {
++	if (!bin_exec && !render_exec) {
+ 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ 		return;
+ 	}
+@@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data
+ 	/* If we've made any progress in execution, rearm the timer
+ 	 * and wait.
+ 	 */
+-	if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
+-		exec->last_ct0ca = ct0ca;
+-		exec->last_ct1ca = ct1ca;
++	if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
++	    (render_exec && ct1ca != render_exec->last_ct1ca)) {
++		if (bin_exec)
++			bin_exec->last_ct0ca = ct0ca;
++		if (render_exec)
++			render_exec->last_ct1ca = ct1ca;
+ 		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ 		vc4_queue_hangcheck(dev);
+ 		return;
+@@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev)
+  * The job_lock should be held during this.
+  */
+ void
+-vc4_submit_next_job(struct drm_device *dev)
++vc4_submit_next_bin_job(struct drm_device *dev)
+ {
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+-	struct vc4_exec_info *exec = vc4_first_job(vc4);
++	struct vc4_exec_info *exec;
+ 
++again:
++	exec = vc4_first_bin_job(vc4);
+ 	if (!exec)
+ 		return;
+ 
+@@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d
+ 	V3D_WRITE(V3D_BPOA, 0);
+ 	V3D_WRITE(V3D_BPOS, 0);
+ 
+-	if (exec->ct0ca != exec->ct0ea)
++	/* Either put the job in the binner if it uses the binner, or
++	 * immediately move it to the to-be-rendered queue.
++	 */
++	if (exec->ct0ca != exec->ct0ea) {
+ 		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
++	} else {
++		vc4_move_job_to_render(dev, exec);
++		goto again;
++	}
++}
++
++void
++vc4_submit_next_render_job(struct drm_device *dev)
++{
++	struct vc4_dev *vc4 = to_vc4_dev(dev);
++	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
++
++	if (!exec)
++		return;
++
+ 	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
+ }
+ 
++void
++vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
++{
++	struct vc4_dev *vc4 = to_vc4_dev(dev);
++	bool was_empty = list_empty(&vc4->render_job_list);
++
++	list_move_tail(&exec->head, &vc4->render_job_list);
++	if (was_empty)
++		vc4_submit_next_render_job(dev);
++}
++
+ static void
+ vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
+ {
+@@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev,
+ 	exec->seqno = seqno;
+ 	vc4_update_bo_seqnos(exec, seqno);
+ 
+-	list_add_tail(&exec->head, &vc4->job_list);
++	list_add_tail(&exec->head, &vc4->bin_job_list);
+ 
+ 	/* If no job was executing, kick ours off. Otherwise, it'll
+-	 * get started when the previous job's frame done interrupt
++	 * get started when the previous job's flush done interrupt
+ 	 * occurs.
+ 	 */
+-	if (vc4_first_job(vc4) == exec) {
+-		vc4_submit_next_job(dev);
++	if (vc4_first_bin_job(vc4) == exec) {
++		vc4_submit_next_bin_job(dev);
+ 		vc4_queue_hangcheck(dev);
+ 	}
+ 
+@@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev)
+ {
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+ 
+-	INIT_LIST_HEAD(&vc4->job_list);
++	INIT_LIST_HEAD(&vc4->bin_job_list);
++	INIT_LIST_HEAD(&vc4->render_job_list);
+ 	INIT_LIST_HEAD(&vc4->job_done_list);
+ 	INIT_LIST_HEAD(&vc4->seqno_cb_list);
+ 	spin_lock_init(&vc4->job_lock);
+--- a/drivers/gpu/drm/vc4/vc4_irq.c
++++ b/drivers/gpu/drm/vc4/vc4_irq.c
+@@ -30,6 +30,10 @@
+  * disables that specific interrupt, and 0s written are ignored
+  * (reading either one returns the set of enabled interrupts).
+  *
++ * When we take a binning flush done interrupt, we need to submit the
++ * next frame for binning and move the finished frame to the render
++ * thread.
++ *
+  * When we take a render frame interrupt, we need to wake the
+  * processes waiting for some frame to be done, and get the next frame
+  * submitted ASAP (so the hardware doesn't sit idle when there's work
+@@ -44,6 +48,7 @@
+ #include "vc4_regs.h"
+ 
+ #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
++			 V3D_INT_FLDONE | \
+ 			 V3D_INT_FRDONE)
+ 
+ DECLARE_WAIT_QUEUE_HEAD(render_wait);
+@@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct
+ 	unsigned long irqflags;
+ 
+ 	spin_lock_irqsave(&vc4->job_lock, irqflags);
+-	current_exec = vc4_first_job(vc4);
++	current_exec = vc4_first_bin_job(vc4);
+ 	if (current_exec) {
+ 		vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
+ 		list_add_tail(&vc4->overflow_mem->unref_head,
+@@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct
+ }
+ 
+ static void
+-vc4_irq_finish_job(struct drm_device *dev)
++vc4_irq_finish_bin_job(struct drm_device *dev)
++{
++	struct vc4_dev *vc4 = to_vc4_dev(dev);
++	struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
++
++	if (!exec)
++		return;
++
++	vc4_move_job_to_render(dev, exec);
++	vc4_submit_next_bin_job(dev);
++}
++
++static void
++vc4_cancel_bin_job(struct drm_device *dev)
++{
++	struct vc4_dev *vc4 = to_vc4_dev(dev);
++	struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
++
++	if (!exec)
++		return;
++
++	list_move_tail(&exec->head, &vc4->bin_job_list);
++	vc4_submit_next_bin_job(dev);
++}
++
++static void
++vc4_irq_finish_render_job(struct drm_device *dev)
+ {
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+-	struct vc4_exec_info *exec = vc4_first_job(vc4);
++	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
+ 
+ 	if (!exec)
+ 		return;
+ 
+ 	vc4->finished_seqno++;
+ 	list_move_tail(&exec->head, &vc4->job_done_list);
+-	vc4_submit_next_job(dev);
++	vc4_submit_next_render_job(dev);
+ 
+ 	wake_up_all(&vc4->job_wait_queue);
+ 	schedule_work(&vc4->job_done_work);
+@@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg)
+ 	barrier();
+ 	intctl = V3D_READ(V3D_INTCTL);
+ 
+-	/* Acknowledge the interrupts we're handling here. The render
+-	 * frame done interrupt will be cleared, while OUTOMEM will
+-	 * stay high until the underlying cause is cleared.
++	/* Acknowledge the interrupts we're handling here. The binner
++	 * last flush / render frame done interrupt will be cleared,
++	 * while OUTOMEM will stay high until the underlying cause is
++	 * cleared.
+ 	 */
+ 	V3D_WRITE(V3D_INTCTL, intctl);
+ 
+@@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg)
+ 		status = IRQ_HANDLED;
+ 	}
+ 
++	if (intctl & V3D_INT_FLDONE) {
++		spin_lock(&vc4->job_lock);
++		vc4_irq_finish_bin_job(dev);
++		spin_unlock(&vc4->job_lock);
++		status = IRQ_HANDLED;
++	}
++
+ 	if (intctl & V3D_INT_FRDONE) {
+ 		spin_lock(&vc4->job_lock);
+-		vc4_irq_finish_job(dev);
++		vc4_irq_finish_render_job(dev);
+ 		spin_unlock(&vc4->job_lock);
+ 		status = IRQ_HANDLED;
+ 	}
+@@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de
+ 	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+ 
+ 	spin_lock_irqsave(&vc4->job_lock, irqflags);
+-	vc4_irq_finish_job(dev);
++	vc4_cancel_bin_job(dev);
++	vc4_irq_finish_render_job(dev);
+ 	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ }
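Editor's note: the control flow the patch introduces is easier to follow when modeled outside the kernel. Below is a minimal userspace C sketch of the pipeline it builds: two FIFO queues whose heads stand for the jobs currently on the binner and renderer threads, a flush-done handler that hands the finished bin job to the render queue and kicks the next bin job, and a frame-done handler that retires the rendered job. Only the queue names mirror the patch; struct job, push, pop, and main are invented here for illustration and are not driver code.

/* Userspace model of the two-queue pipeline: the head of bin_q stands
 * for the job programmed into ct0ca, the head of render_q for the job
 * programmed into ct1ca. Illustrative only, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct job {
	int id;
	struct job *next;
};

static struct job *bin_q;     /* models vc4->bin_job_list */
static struct job *render_q;  /* models vc4->render_job_list */

static void push(struct job **q, struct job *j)
{
	j->next = NULL;
	while (*q)
		q = &(*q)->next;  /* walk to the tail */
	*q = j;
}

static struct job *pop(struct job **q)
{
	struct job *j = *q;

	if (j)
		*q = j->next;
	return j;
}

/* Models vc4_irq_finish_bin_job(): on FLDONE, the finished bin job is
 * handed to the render queue and the next bin job starts, so binning
 * of job N+1 overlaps rendering of job N. */
static void flush_done(void)
{
	struct job *j = pop(&bin_q);

	if (!j)
		return;
	push(&render_q, j);
	printf("job %d: binning done, queued for render\n", j->id);
	if (bin_q)
		printf("job %d: binning starts\n", bin_q->id);
}

/* Models vc4_irq_finish_render_job(): on FRDONE, the job retires. */
static void frame_done(void)
{
	struct job *j = pop(&render_q);

	if (j) {
		printf("job %d: frame done\n", j->id);
		free(j);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct job *j = malloc(sizeof(*j));

		j->id = i;
		push(&bin_q, j);
	}
	/* Fire the handlers the way the hardware would interleave them:
	 * each flush-done lets the next bin job run while the previous
	 * one renders, instead of waiting for frame-done. */
	flush_done();
	flush_done();
	frame_done();
	flush_done();
	frame_done();
	frame_done();
	return 0;
}

One detail the model gets for free: in the patch, vc4_move_job_to_render() only submits to the render thread when render_job_list was empty, because otherwise the hardware is already busy with the queue head; here the head of render_q implicitly plays the role of the running job.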