aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/brcm2708/patches-3.14/0026-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
blob: 5707137ab14925c9dfa27a0c7147c639063aecd6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
From c2731f282848af32425043a2df88c1289538983e Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Mon, 17 Jun 2013 16:00:25 +0300
Subject: [PATCH 26/54] bcm2708_fb: DMA acceleration for fb_copyarea

Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
Also used Simon's dmaer_master module as a reference for tweaking DMA
settings for better performance.

For now busylooping only. IRQ support might be added later.
With non-overclocked Raspberry Pi, the performance is ~360 MB/s
for simple copy or ~260 MB/s for two-pass copy (used when dragging
windows to the right).

In the case of using DMA channel 0, the performance improves
to ~440 MB/s.

For comparison, VFP optimized CPU copy can only do ~114 MB/s in
the same conditions (hindered by reading uncached source buffer).

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>

bcm2708_fb: report number of dma copies

Add a counter (exported via debugfs) reporting the
number of dma copies that the framebuffer driver
has done, in order to help evaluate different
optimization strategies.

Signed-off-by: Luke Diamand <luked@broadcom.com>

bcm2708_fb: use IRQ for DMA copies

The copyarea ioctl() uses DMA to speed things along. This
was busy-waiting for completion. This change supports using
an interrupt instead for larger transfers. For small
transfers, busy-waiting is still likely to be faster.

Signed-off-by: Luke Diamand <luke@diamand.org>
---
 arch/arm/mach-bcm2708/dma.c              |   8 +
 arch/arm/mach-bcm2708/include/mach/dma.h |   2 +
 drivers/video/bcm2708_fb.c               | 273 ++++++++++++++++++++++++++++++-
 3 files changed, 278 insertions(+), 5 deletions(-)

--- a/arch/arm/mach-bcm2708/dma.c
+++ b/arch/arm/mach-bcm2708/dma.c
@@ -83,6 +83,14 @@ extern void bcm_dma_wait_idle(void __iom
 
 EXPORT_SYMBOL_GPL(bcm_dma_start);
 
+extern bool bcm_dma_is_busy(void __iomem *dma_chan_base)
+{
+	dsb();
+
+	return readl(dma_chan_base + BCM2708_DMA_CS) & BCM2708_DMA_ACTIVE;
+}
+EXPORT_SYMBOL_GPL(bcm_dma_is_busy);
+
 /* Complete an ongoing DMA (assuming its results are to be ignored)
    Does nothing if there is no DMA in progress.
    This routine waits for the current AXI transfer to complete before
--- a/arch/arm/mach-bcm2708/include/mach/dma.h
+++ b/arch/arm/mach-bcm2708/include/mach/dma.h
@@ -62,11 +62,13 @@ struct bcm2708_dma_cb {
 	unsigned long next;
 	unsigned long pad[2];
 };
+struct scatterlist;
 
 extern int bcm_sg_suitable_for_dma(struct scatterlist *sg_ptr, int sg_len);
 extern void bcm_dma_start(void __iomem *dma_chan_base,
 			  dma_addr_t control_block);
 extern void bcm_dma_wait_idle(void __iomem *dma_chan_base);
+extern bool bcm_dma_is_busy(void __iomem *dma_chan_base);
 extern int /*rc*/ bcm_dma_abort(void __iomem *dma_chan_base);
 
 /* When listing features we can ask for when allocating DMA channels give
--- a/drivers/video/bcm2708_fb.c
+++ b/drivers/video/bcm2708_fb.c
@@ -21,13 +21,16 @@
 #include <linux/mm.h>
 #include <linux/fb.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/list.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/printk.h>
 #include <linux/console.h>
+#include <linux/debugfs.h>
 
+#include <mach/dma.h>
 #include <mach/platform.h>
 #include <mach/vcio.h>
 
@@ -51,6 +54,10 @@ static int fbheight = 480; /* module par
 static int fbdepth = 16;   /* module parameter */
 static int fbswap = 0;     /* module parameter */
 
+static u32 dma_busy_wait_threshold = 1<<15;
+module_param(dma_busy_wait_threshold, int, 0644);
+MODULE_PARM_DESC(dma_busy_wait_threshold, "Busy-wait for DMA completion below this area");
+
 /* this data structure describes each frame buffer device we find */
 
 struct fbinfo_s {
@@ -62,16 +69,73 @@ struct fbinfo_s {
 	u16 cmap[256];
 };
 
+struct bcm2708_fb_stats {
+	struct debugfs_regset32 regset;
+	u32 dma_copies;
+	u32 dma_irqs;
+};
+
 struct bcm2708_fb {
 	struct fb_info fb;
 	struct platform_device *dev;
 	struct fbinfo_s *info;
 	dma_addr_t dma;
 	u32 cmap[16];
+	int dma_chan;
+	int dma_irq;
+	void __iomem *dma_chan_base;
+	void *cb_base;		/* DMA control blocks */
+	dma_addr_t cb_handle;
+	struct dentry *debugfs_dir;
+	wait_queue_head_t dma_waitq;
+	struct bcm2708_fb_stats stats;
 };
 
 #define to_bcm2708(info)	container_of(info, struct bcm2708_fb, fb)
 
+static void bcm2708_fb_debugfs_deinit(struct bcm2708_fb *fb)
+{
+	debugfs_remove_recursive(fb->debugfs_dir);
+	fb->debugfs_dir = NULL;
+}
+
+static int bcm2708_fb_debugfs_init(struct bcm2708_fb *fb)
+{
+	static struct debugfs_reg32 stats_registers[] = {
+		{
+			"dma_copies",
+			offsetof(struct bcm2708_fb_stats, dma_copies)
+		},
+		{
+			"dma_irqs",
+			offsetof(struct bcm2708_fb_stats, dma_irqs)
+		},
+	};
+
+	fb->debugfs_dir = debugfs_create_dir(DRIVER_NAME, NULL);
+	if (!fb->debugfs_dir) {
+		pr_warn("%s: could not create debugfs entry\n",
+			__func__);
+		return -EFAULT;
+	}
+
+	fb->stats.regset.regs = stats_registers;
+	fb->stats.regset.nregs = ARRAY_SIZE(stats_registers);
+	fb->stats.regset.base = &fb->stats;
+
+	if (!debugfs_create_regset32(
+		"stats", 0444, fb->debugfs_dir, &fb->stats.regset)) {
+		pr_warn("%s: could not create statistics registers\n",
+			__func__);
+		goto fail;
+	}
+	return 0;
+
+fail:
+	bcm2708_fb_debugfs_deinit(fb);
+	return -EFAULT;
+}
+
 static int bcm2708_fb_set_bitfields(struct fb_var_screeninfo *var)
 {
 	int ret = 0;
@@ -322,11 +386,148 @@ static void bcm2708_fb_fillrect(struct f
 	cfb_fillrect(info, rect);
 }
 
+/* A helper function for configuring dma control block */
+static void set_dma_cb(struct bcm2708_dma_cb *cb,
+		       int        burst_size,
+		       dma_addr_t dst,
+		       int        dst_stride,
+		       dma_addr_t src,
+		       int        src_stride,
+		       int        w,
+		       int        h)
+{
+	cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
+		   BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
+		   BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
+	cb->dst = dst;
+	cb->src = src;
+	/*
+	 * This is not really obvious from the DMA documentation,
+	 * but the top 16 bits must be programmmed to "height -1"
+	 * and not "height" in 2D mode.
+	 */
+	cb->length = ((h - 1) << 16) | w;
+	cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
+	cb->pad[0] = 0;
+	cb->pad[1] = 0;
+}
+
 static void bcm2708_fb_copyarea(struct fb_info *info,
 				const struct fb_copyarea *region)
 {
-	/*print_debug("bcm2708_fb_copyarea\n"); */
-	cfb_copyarea(info, region);
+	struct bcm2708_fb *fb = to_bcm2708(info);
+	struct bcm2708_dma_cb *cb = fb->cb_base;
+	int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
+	/* Channel 0 supports larger bursts and is a bit faster */
+	int burst_size = (fb->dma_chan == 0) ? 8 : 2;
+	int pixels = region->width * region->height;
+
+	/* Fallback to cfb_copyarea() if we don't like something */
+	if (bytes_per_pixel > 4 ||
+	    info->var.xres * info->var.yres > 1920 * 1200 ||
+	    region->width <= 0 || region->width > info->var.xres ||
+	    region->height <= 0 || region->height > info->var.yres ||
+	    region->sx < 0 || region->sx >= info->var.xres ||
+	    region->sy < 0 || region->sy >= info->var.yres ||
+	    region->dx < 0 || region->dx >= info->var.xres ||
+	    region->dy < 0 || region->dy >= info->var.yres ||
+	    region->sx + region->width > info->var.xres ||
+	    region->dx + region->width > info->var.xres ||
+	    region->sy + region->height > info->var.yres ||
+	    region->dy + region->height > info->var.yres) {
+		cfb_copyarea(info, region);
+		return;
+	}
+
+	if (region->dy == region->sy && region->dx > region->sx) {
+		/*
+		 * A difficult case of overlapped copy. Because DMA can't
+		 * copy individual scanlines in backwards direction, we need
+		 * two-pass processing. We do it by programming a chain of dma
+		 * control blocks in the first 16K part of the buffer and use
+		 * the remaining 48K as the intermediate temporary scratch
+		 * buffer. The buffer size is sufficient to handle up to
+		 * 1920x1200 resolution at 32bpp pixel depth.
+		 */
+		int y;
+		dma_addr_t control_block_pa = fb->cb_handle;
+		dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
+		int scanline_size = bytes_per_pixel * region->width;
+		int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
+
+		for (y = 0; y < region->height; y += scanlines_per_cb) {
+			dma_addr_t src =
+				fb->fb.fix.smem_start +
+				bytes_per_pixel * region->sx +
+				(region->sy + y) * fb->fb.fix.line_length;
+			dma_addr_t dst =
+				fb->fb.fix.smem_start +
+				bytes_per_pixel * region->dx +
+				(region->dy + y) * fb->fb.fix.line_length;
+
+			if (region->height - y < scanlines_per_cb)
+				scanlines_per_cb = region->height - y;
+
+			set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
+				   src, fb->fb.fix.line_length,
+				   scanline_size, scanlines_per_cb);
+			control_block_pa += sizeof(struct bcm2708_dma_cb);
+			cb->next = control_block_pa;
+			cb++;
+
+			set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
+				   scratchbuf, scanline_size,
+				   scanline_size, scanlines_per_cb);
+			control_block_pa += sizeof(struct bcm2708_dma_cb);
+			cb->next = control_block_pa;
+			cb++;
+		}
+		/* move the pointer back to the last dma control block */
+		cb--;
+	} else {
+		/* A single dma control block is enough. */
+		int sy, dy, stride;
+		if (region->dy <= region->sy) {
+			/* processing from top to bottom */
+			dy = region->dy;
+			sy = region->sy;
+			stride = fb->fb.fix.line_length;
+		} else {
+			/* processing from bottom to top */
+			dy = region->dy + region->height - 1;
+			sy = region->sy + region->height - 1;
+			stride = -fb->fb.fix.line_length;
+		}
+		set_dma_cb(cb, burst_size,
+			   fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
+						   bytes_per_pixel * region->dx,
+			   stride,
+			   fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
+						   bytes_per_pixel * region->sx,
+			   stride,
+			   region->width * bytes_per_pixel,
+			   region->height);
+	}
+
+	/* end of dma control blocks chain */
+	cb->next = 0;
+
+
+	if (pixels < dma_busy_wait_threshold) {
+		bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
+		bcm_dma_wait_idle(fb->dma_chan_base);
+	} else {
+		void __iomem *dma_chan = fb->dma_chan_base;
+		cb->info |= BCM2708_DMA_INT_EN;
+		bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
+		while (bcm_dma_is_busy(dma_chan)) {
+			wait_event_interruptible(
+				fb->dma_waitq,
+				!bcm_dma_is_busy(dma_chan));
+		}
+		fb->stats.dma_irqs++;
+	}
+	fb->stats.dma_copies++;
 }
 
 static void bcm2708_fb_imageblit(struct fb_info *info,
@@ -336,6 +537,24 @@ static void bcm2708_fb_imageblit(struct 
 	cfb_imageblit(info, image);
 }
 
+static irqreturn_t bcm2708_fb_dma_irq(int irq, void *cxt)
+{
+	struct bcm2708_fb *fb = cxt;
+
+	/* FIXME: should read status register to check if this is
+	 * actually interrupting us or not, in case this interrupt
+	 * ever becomes shared amongst several DMA channels
+	 *
+	 * readl(dma_chan_base + BCM2708_DMA_CS) & BCM2708_DMA_IRQ;
+	 */
+
+	/* acknowledge the interrupt */
+	writel(BCM2708_DMA_INT, fb->dma_chan_base + BCM2708_DMA_CS);
+
+	wake_up(&fb->dma_waitq);
+	return IRQ_HANDLED;
+}
+
 static struct fb_ops bcm2708_fb_ops = {
 	.owner = THIS_MODULE,
 	.fb_check_var = bcm2708_fb_check_var,
@@ -365,7 +584,7 @@ static int bcm2708_fb_register(struct bc
 		fb->dma = dma;
 	}
 	fb->fb.fbops = &bcm2708_fb_ops;
-	fb->fb.flags = FBINFO_FLAG_DEFAULT;
+	fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
 	fb->fb.pseudo_palette = fb->cmap;
 
 	strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
@@ -396,6 +615,7 @@ static int bcm2708_fb_register(struct bc
 	fb->fb.monspecs.dclkmax = 100000000;
 
 	bcm2708_fb_set_bitfields(&fb->fb.var);
+	init_waitqueue_head(&fb->dma_waitq);
 
 	/*
 	 * Allocate colourmap.
@@ -421,14 +641,45 @@ static int bcm2708_fb_probe(struct platf
 	struct bcm2708_fb *fb;
 	int ret;
 
-	fb = kmalloc(sizeof(struct bcm2708_fb), GFP_KERNEL);
+	fb = kzalloc(sizeof(struct bcm2708_fb), GFP_KERNEL);
 	if (!fb) {
 		dev_err(&dev->dev,
 			"could not allocate new bcm2708_fb struct\n");
 		ret = -ENOMEM;
 		goto free_region;
 	}
-	memset(fb, 0, sizeof(struct bcm2708_fb));
+
+	bcm2708_fb_debugfs_init(fb);
+
+	fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
+					     &fb->cb_handle, GFP_KERNEL);
+	if (!fb->cb_base) {
+		dev_err(&dev->dev, "cannot allocate DMA CBs\n");
+		ret = -ENOMEM;
+		goto free_fb;
+	}
+
+	pr_info("BCM2708FB: allocated DMA memory %08x\n",
+	       fb->cb_handle);
+
+	ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
+				 &fb->dma_chan_base, &fb->dma_irq);
+	if (ret < 0) {
+		dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
+		goto free_cb;
+	}
+	fb->dma_chan = ret;
+
+	ret = request_irq(fb->dma_irq, bcm2708_fb_dma_irq,
+			  0, "bcm2708_fb dma", fb);
+	if (ret) {
+		pr_err("%s: failed to request DMA irq\n", __func__);
+		goto free_dma_chan;
+	}
+
+
+	pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
+	       fb->dma_chan, fb->dma_chan_base);
 
 	fb->dev = dev;
 
@@ -438,6 +689,11 @@ static int bcm2708_fb_probe(struct platf
 		goto out;
 	}
 
+free_dma_chan:
+	bcm_dma_chan_free(fb->dma_chan);
+free_cb:
+	dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
+free_fb:
 	kfree(fb);
 free_region:
 	dev_err(&dev->dev, "probe failed, err %d\n", ret);
@@ -455,8 +711,15 @@ static int bcm2708_fb_remove(struct plat
 		iounmap(fb->fb.screen_base);
 	unregister_framebuffer(&fb->fb);
 
+	dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
+	bcm_dma_chan_free(fb->dma_chan);
+
 	dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
 			  fb->dma);
+	bcm2708_fb_debugfs_deinit(fb);
+
+	free_irq(fb->dma_irq, fb);
+
 	kfree(fb);
 
 	return 0;