diff options
Diffstat (limited to 'target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch')
-rw-r--r-- | target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch | 241 |
1 files changed, 241 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch b/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch new file mode 100644 index 0000000000..f7a3051b2c --- /dev/null +++ b/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch @@ -0,0 +1,241 @@ +From ba65074e39e6aee492bd3c077f640b29a0a89c05 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@gmail.com> +Date: Mon, 17 Jun 2013 16:00:25 +0300 +Subject: [PATCH 070/174] bcm2708_fb: DMA acceleration for fb_copyarea + +Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425 +Also used Simon's dmaer_master module as a reference for tweaking DMA +settings for better performance. + +For now busylooping only. IRQ support might be added later. +With non-overclocked Raspberry Pi, the performance is ~360 MB/s +for simple copy or ~260 MB/s for two-pass copy (used when dragging +windows to the right). + +In the case of using DMA channel 0, the performance improves +to ~440 MB/s. + +For comparison, VFP optimized CPU copy can only do ~114 MB/s in +the same conditions (hindered by reading uncached source buffer). + +Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com> +--- + drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 159 insertions(+), 3 deletions(-) + +--- a/drivers/video/bcm2708_fb.c ++++ b/drivers/video/bcm2708_fb.c +@@ -28,6 +28,7 @@ + #include <linux/printk.h> + #include <linux/console.h> + ++#include <mach/dma.h> + #include <mach/platform.h> + #include <mach/vcio.h> + +@@ -63,6 +64,11 @@ struct bcm2708_fb { + struct fbinfo_s *info; + dma_addr_t dma; + u32 cmap[16]; ++ int dma_chan; ++ int dma_irq; ++ void __iomem *dma_chan_base; ++ void *cb_base; /* DMA control blocks */ ++ dma_addr_t cb_handle; + }; + + #define to_bcm2708(info) container_of(info, struct bcm2708_fb, fb) +@@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct f + cfb_fillrect(info, rect); + } + ++/* A helper function for configuring dma control block */ ++static void set_dma_cb(struct bcm2708_dma_cb *cb, ++ int burst_size, ++ dma_addr_t dst, ++ int dst_stride, ++ dma_addr_t src, ++ int src_stride, ++ int w, ++ int h) ++{ ++ cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH | ++ BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH | ++ BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE; ++ cb->dst = dst; ++ cb->src = src; ++ /* ++ * This is not really obvious from the DMA documentation, ++ * but the top 16 bits must be programmmed to "height -1" ++ * and not "height" in 2D mode. ++ */ ++ cb->length = ((h - 1) << 16) | w; ++ cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w); ++ cb->pad[0] = 0; ++ cb->pad[1] = 0; ++} ++ + static void bcm2708_fb_copyarea(struct fb_info *info, + const struct fb_copyarea *region) + { +- /*print_debug("bcm2708_fb_copyarea\n"); */ +- cfb_copyarea(info, region); ++ struct bcm2708_fb *fb = to_bcm2708(info); ++ struct bcm2708_dma_cb *cb = fb->cb_base; ++ int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3; ++ /* Channel 0 supports larger bursts and is a bit faster */ ++ int burst_size = (fb->dma_chan == 0) ? 8 : 2; ++ ++ /* Fallback to cfb_copyarea() if we don't like something */ ++ if (bytes_per_pixel > 4 || ++ info->var.xres > 1920 || info->var.yres > 1200 || ++ region->width <= 0 || region->width > info->var.xres || ++ region->height <= 0 || region->height > info->var.yres || ++ region->sx < 0 || region->sx >= info->var.xres || ++ region->sy < 0 || region->sy >= info->var.yres || ++ region->dx < 0 || region->dx >= info->var.xres || ++ region->dy < 0 || region->dy >= info->var.yres || ++ region->sx + region->width > info->var.xres || ++ region->dx + region->width > info->var.xres || ++ region->sy + region->height > info->var.yres || ++ region->dy + region->height > info->var.yres) { ++ cfb_copyarea(info, region); ++ return; ++ } ++ ++ if (region->dy == region->sy && region->dx > region->sx) { ++ /* ++ * A difficult case of overlapped copy. Because DMA can't ++ * copy individual scanlines in backwards direction, we need ++ * two-pass processing. We do it by programming a chain of dma ++ * control blocks in the first 16K part of the buffer and use ++ * the remaining 48K as the intermediate temporary scratch ++ * buffer. The buffer size is sufficient to handle up to ++ * 1920x1200 resolution at 32bpp pixel depth. ++ */ ++ int y; ++ dma_addr_t control_block_pa = fb->cb_handle; ++ dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024; ++ int scanline_size = bytes_per_pixel * region->width; ++ int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size; ++ ++ for (y = 0; y < region->height; y += scanlines_per_cb) { ++ dma_addr_t src = ++ fb->fb.fix.smem_start + ++ bytes_per_pixel * region->sx + ++ (region->sy + y) * fb->fb.fix.line_length; ++ dma_addr_t dst = ++ fb->fb.fix.smem_start + ++ bytes_per_pixel * region->dx + ++ (region->dy + y) * fb->fb.fix.line_length; ++ ++ if (region->height - y < scanlines_per_cb) ++ scanlines_per_cb = region->height - y; ++ ++ set_dma_cb(cb, burst_size, scratchbuf, scanline_size, ++ src, fb->fb.fix.line_length, ++ scanline_size, scanlines_per_cb); ++ control_block_pa += sizeof(struct bcm2708_dma_cb); ++ cb->next = control_block_pa; ++ cb++; ++ ++ set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length, ++ scratchbuf, scanline_size, ++ scanline_size, scanlines_per_cb); ++ control_block_pa += sizeof(struct bcm2708_dma_cb); ++ cb->next = control_block_pa; ++ cb++; ++ } ++ /* move the pointer back to the last dma control block */ ++ cb--; ++ } else { ++ /* A single dma control block is enough. */ ++ int sy, dy, stride; ++ if (region->dy <= region->sy) { ++ /* processing from top to bottom */ ++ dy = region->dy; ++ sy = region->sy; ++ stride = fb->fb.fix.line_length; ++ } else { ++ /* processing from bottom to top */ ++ dy = region->dy + region->height - 1; ++ sy = region->sy + region->height - 1; ++ stride = -fb->fb.fix.line_length; ++ } ++ set_dma_cb(cb, burst_size, ++ fb->fb.fix.smem_start + dy * fb->fb.fix.line_length + ++ bytes_per_pixel * region->dx, ++ stride, ++ fb->fb.fix.smem_start + sy * fb->fb.fix.line_length + ++ bytes_per_pixel * region->sx, ++ stride, ++ region->width * bytes_per_pixel, ++ region->height); ++ } ++ ++ /* end of dma control blocks chain */ ++ cb->next = 0; ++ ++ bcm_dma_start(fb->dma_chan_base, fb->cb_handle); ++ bcm_dma_wait_idle(fb->dma_chan_base); + } + + static void bcm2708_fb_imageblit(struct fb_info *info, +@@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bc + fb->dma = dma; + } + fb->fb.fbops = &bcm2708_fb_ops; +- fb->fb.flags = FBINFO_FLAG_DEFAULT; ++ fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA; + fb->fb.pseudo_palette = fb->cmap; + + strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id)); +@@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platf + } + memset(fb, 0, sizeof(struct bcm2708_fb)); + ++ fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K, ++ &fb->cb_handle, GFP_KERNEL); ++ if (!fb->cb_base) { ++ dev_err(&dev->dev, "cannot allocate DMA CBs\n"); ++ ret = -ENOMEM; ++ goto free_fb; ++ } ++ ++ pr_info("BCM2708FB: allocated DMA memory %08x\n", ++ fb->cb_handle); ++ ++ ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK, ++ &fb->dma_chan_base, &fb->dma_irq); ++ if (ret < 0) { ++ dev_err(&dev->dev, "couldn't allocate a DMA channel\n"); ++ goto free_cb; ++ } ++ fb->dma_chan = ret; ++ ++ pr_info("BCM2708FB: allocated DMA channel %d @ %p\n", ++ fb->dma_chan, fb->dma_chan_base); ++ + fb->dev = dev; + + ret = bcm2708_fb_register(fb); +@@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platf + goto out; + } + ++free_cb: ++ dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle); ++free_fb: + kfree(fb); + free_region: + dev_err(&dev->dev, "probe failed, err %d\n", ret); +@@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct plat + iounmap(fb->fb.screen_base); + unregister_framebuffer(&fb->fb); + ++ dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle); ++ bcm_dma_chan_free(fb->dma_chan); ++ + dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info, + fb->dma); + kfree(fb); |