summaryrefslogtreecommitdiffstats
path: root/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
diff options
context:
space:
mode:
Diffstat (limited to 'target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch')
-rw-r--r--target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch241
1 files changed, 241 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch b/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
new file mode 100644
index 0000000000..f7a3051b2c
--- /dev/null
+++ b/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
@@ -0,0 +1,241 @@
+From ba65074e39e6aee492bd3c077f640b29a0a89c05 Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
+Date: Mon, 17 Jun 2013 16:00:25 +0300
+Subject: [PATCH 070/174] bcm2708_fb: DMA acceleration for fb_copyarea
+
+Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
+Also used Simon's dmaer_master module as a reference for tweaking DMA
+settings for better performance.
+
+For now busylooping only. IRQ support might be added later.
+With non-overclocked Raspberry Pi, the performance is ~360 MB/s
+for simple copy or ~260 MB/s for two-pass copy (used when dragging
+windows to the right).
+
+In the case of using DMA channel 0, the performance improves
+to ~440 MB/s.
+
+For comparison, VFP optimized CPU copy can only do ~114 MB/s in
+the same conditions (hindered by reading uncached source buffer).
+
+Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
+---
+ drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 159 insertions(+), 3 deletions(-)
+
+--- a/drivers/video/bcm2708_fb.c
++++ b/drivers/video/bcm2708_fb.c
+@@ -28,6 +28,7 @@
+ #include <linux/printk.h>
+ #include <linux/console.h>
+
++#include <mach/dma.h>
+ #include <mach/platform.h>
+ #include <mach/vcio.h>
+
+@@ -63,6 +64,11 @@ struct bcm2708_fb {
+ struct fbinfo_s *info;
+ dma_addr_t dma;
+ u32 cmap[16];
++ int dma_chan;
++ int dma_irq;
++ void __iomem *dma_chan_base;
++ void *cb_base; /* DMA control blocks */
++ dma_addr_t cb_handle;
+ };
+
+ #define to_bcm2708(info) container_of(info, struct bcm2708_fb, fb)
+@@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct f
+ cfb_fillrect(info, rect);
+ }
+
++/* A helper function for configuring dma control block */
++static void set_dma_cb(struct bcm2708_dma_cb *cb,
++ int burst_size,
++ dma_addr_t dst,
++ int dst_stride,
++ dma_addr_t src,
++ int src_stride,
++ int w,
++ int h)
++{
++ cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
++ BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
++ BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
++ cb->dst = dst;
++ cb->src = src;
++ /*
++ * This is not really obvious from the DMA documentation,
++ * but the top 16 bits must be programmmed to "height -1"
++ * and not "height" in 2D mode.
++ */
++ cb->length = ((h - 1) << 16) | w;
++ cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
++ cb->pad[0] = 0;
++ cb->pad[1] = 0;
++}
++
+ static void bcm2708_fb_copyarea(struct fb_info *info,
+ const struct fb_copyarea *region)
+ {
+- /*print_debug("bcm2708_fb_copyarea\n"); */
+- cfb_copyarea(info, region);
++ struct bcm2708_fb *fb = to_bcm2708(info);
++ struct bcm2708_dma_cb *cb = fb->cb_base;
++ int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
++ /* Channel 0 supports larger bursts and is a bit faster */
++ int burst_size = (fb->dma_chan == 0) ? 8 : 2;
++
++ /* Fallback to cfb_copyarea() if we don't like something */
++ if (bytes_per_pixel > 4 ||
++ info->var.xres > 1920 || info->var.yres > 1200 ||
++ region->width <= 0 || region->width > info->var.xres ||
++ region->height <= 0 || region->height > info->var.yres ||
++ region->sx < 0 || region->sx >= info->var.xres ||
++ region->sy < 0 || region->sy >= info->var.yres ||
++ region->dx < 0 || region->dx >= info->var.xres ||
++ region->dy < 0 || region->dy >= info->var.yres ||
++ region->sx + region->width > info->var.xres ||
++ region->dx + region->width > info->var.xres ||
++ region->sy + region->height > info->var.yres ||
++ region->dy + region->height > info->var.yres) {
++ cfb_copyarea(info, region);
++ return;
++ }
++
++ if (region->dy == region->sy && region->dx > region->sx) {
++ /*
++ * A difficult case of overlapped copy. Because DMA can't
++ * copy individual scanlines in backwards direction, we need
++ * two-pass processing. We do it by programming a chain of dma
++ * control blocks in the first 16K part of the buffer and use
++ * the remaining 48K as the intermediate temporary scratch
++ * buffer. The buffer size is sufficient to handle up to
++ * 1920x1200 resolution at 32bpp pixel depth.
++ */
++ int y;
++ dma_addr_t control_block_pa = fb->cb_handle;
++ dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
++ int scanline_size = bytes_per_pixel * region->width;
++ int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
++
++ for (y = 0; y < region->height; y += scanlines_per_cb) {
++ dma_addr_t src =
++ fb->fb.fix.smem_start +
++ bytes_per_pixel * region->sx +
++ (region->sy + y) * fb->fb.fix.line_length;
++ dma_addr_t dst =
++ fb->fb.fix.smem_start +
++ bytes_per_pixel * region->dx +
++ (region->dy + y) * fb->fb.fix.line_length;
++
++ if (region->height - y < scanlines_per_cb)
++ scanlines_per_cb = region->height - y;
++
++ set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
++ src, fb->fb.fix.line_length,
++ scanline_size, scanlines_per_cb);
++ control_block_pa += sizeof(struct bcm2708_dma_cb);
++ cb->next = control_block_pa;
++ cb++;
++
++ set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
++ scratchbuf, scanline_size,
++ scanline_size, scanlines_per_cb);
++ control_block_pa += sizeof(struct bcm2708_dma_cb);
++ cb->next = control_block_pa;
++ cb++;
++ }
++ /* move the pointer back to the last dma control block */
++ cb--;
++ } else {
++ /* A single dma control block is enough. */
++ int sy, dy, stride;
++ if (region->dy <= region->sy) {
++ /* processing from top to bottom */
++ dy = region->dy;
++ sy = region->sy;
++ stride = fb->fb.fix.line_length;
++ } else {
++ /* processing from bottom to top */
++ dy = region->dy + region->height - 1;
++ sy = region->sy + region->height - 1;
++ stride = -fb->fb.fix.line_length;
++ }
++ set_dma_cb(cb, burst_size,
++ fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
++ bytes_per_pixel * region->dx,
++ stride,
++ fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
++ bytes_per_pixel * region->sx,
++ stride,
++ region->width * bytes_per_pixel,
++ region->height);
++ }
++
++ /* end of dma control blocks chain */
++ cb->next = 0;
++
++ bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
++ bcm_dma_wait_idle(fb->dma_chan_base);
+ }
+
+ static void bcm2708_fb_imageblit(struct fb_info *info,
+@@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bc
+ fb->dma = dma;
+ }
+ fb->fb.fbops = &bcm2708_fb_ops;
+- fb->fb.flags = FBINFO_FLAG_DEFAULT;
++ fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
+ fb->fb.pseudo_palette = fb->cmap;
+
+ strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
+@@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platf
+ }
+ memset(fb, 0, sizeof(struct bcm2708_fb));
+
++ fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
++ &fb->cb_handle, GFP_KERNEL);
++ if (!fb->cb_base) {
++ dev_err(&dev->dev, "cannot allocate DMA CBs\n");
++ ret = -ENOMEM;
++ goto free_fb;
++ }
++
++ pr_info("BCM2708FB: allocated DMA memory %08x\n",
++ fb->cb_handle);
++
++ ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
++ &fb->dma_chan_base, &fb->dma_irq);
++ if (ret < 0) {
++ dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
++ goto free_cb;
++ }
++ fb->dma_chan = ret;
++
++ pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
++ fb->dma_chan, fb->dma_chan_base);
++
+ fb->dev = dev;
+
+ ret = bcm2708_fb_register(fb);
+@@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platf
+ goto out;
+ }
+
++free_cb:
++ dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
++free_fb:
+ kfree(fb);
+ free_region:
+ dev_err(&dev->dev, "probe failed, err %d\n", ret);
+@@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct plat
+ iounmap(fb->fb.screen_base);
+ unregister_framebuffer(&fb->fb);
+
++ dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
++ bcm_dma_chan_free(fb->dma_chan);
++
+ dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
+ fb->dma);
+ kfree(fb);