target/linux/brcm2708/patches-4.1/0029-Speed-up-console-framebuffer-imageblit-function.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209

From 00637200a8710623f0514c98765e2c6bc3a34a37 Mon Sep 17 00:00:00 2001
From: Harm Hanemaaijer <fgenfb@yahoo.com>
Date: Thu, 20 Jun 2013 20:21:39 +0200
Subject: [PATCH 029/203] Speed up console framebuffer imageblit function

Especially on platforms with a slower CPU but a relatively high
framebuffer fill bandwidth, like current ARM devices, the existing
console monochrome imageblit function used to draw console text is
suboptimal for common pixel depths such as 16bpp and 32bpp. The existing
code is quite general and can deal with several pixel depths. By creating
special case functions for 16bpp and 32bpp, by far the most common pixel
formats used on modern systems, a significant speed-up is attained
which can be readily felt on ARM-based devices like the Raspberry Pi
and the Allwinner platform, but should help any platform using the
fb layer.

The special case functions allow constant folding, eliminating a number
of instructions including divide operations, and allow the use of an
unrolled loop, eliminating instructions with a variable shift size,
reducing source memory access instructions, and eliminating excessive
branching. These unrolled loops also allow much better code optimization
by the C compiler. The code that selects which optimized variant is used
is also simplified, eliminating integer divide instructions.

The speed-up, measured by timing 'cat file.txt' in the console, varies
between 40% and 70%, when testing on the Raspberry Pi and Allwinner
ARM-based platforms, depending on font size and the pixel depth, with
the greater benefit for 32bpp.

Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
---
 drivers/video/fbdev/core/cfbimgblt.c | 152 +++++++++++++++++++++++++++++++++--
 1 file changed, 147 insertions(+), 5 deletions(-)

--- a/drivers/video/fbdev/core/cfbimgblt.c
+++ b/drivers/video/fbdev/core/cfbimgblt.c
@@ -28,6 +28,11 @@
  *
  *  Also need to add code to deal with cards endians that are different than
  *  the native cpu endians. I also need to deal with MSB position in the word.
+ *  Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013:
+ *  - Provide optimized versions of fast_imageblit for 16 and 32bpp that are
+ *    significantly faster than the previous implementation.
+ *  - Simplify the fast/slow_imageblit selection code, avoiding integer
+ *    divides.
  */
 #include <linux/module.h>
 #include <linux/string.h>
@@ -262,6 +267,133 @@ static inline void fast_imageblit(const
 	}
 }	
 	
+/*
+ * Optimized fast_imageblit for bpp == 16. ppw = 2, bit_mask = 3 folded
+ * into the code, main loop unrolled.
+ */
+
+static inline void fast_imageblit16(const struct fb_image *image,
+				    struct fb_info *p, u8 __iomem * dst1,
+				    u32 fgcolor, u32 bgcolor)
+{
+	u32 fgx = fgcolor, bgx = bgcolor;
+	u32 spitch = (image->width + 7) / 8;
+	u32 end_mask, eorx;
+	const char *s = image->data, *src;
+	u32 __iomem *dst;
+	const u32 *tab = NULL;
+	int i, j, k;
+
+	tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
+
+	fgx <<= 16;
+	bgx <<= 16;
+	fgx |= fgcolor;
+	bgx |= bgcolor;
+
+	eorx = fgx ^ bgx;
+	k = image->width / 2;
+
+	for (i = image->height; i--;) {
+		dst = (u32 __iomem *) dst1;
+		src = s;
+
+		j = k;
+		while (j >= 4) {
+			u8 bits = *src;
+			end_mask = tab[(bits >> 6) & 3];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 4) & 3];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 2) & 3];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[bits & 3];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			src++;
+			j -= 4;
+		}
+		if (j != 0) {
+			u8 bits = *src;
+			end_mask = tab[(bits >> 6) & 3];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			if (j >= 2) {
+				end_mask = tab[(bits >> 4) & 3];
+				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+				if (j == 3) {
+					end_mask = tab[(bits >> 2) & 3];
+					FB_WRITEL((end_mask & eorx) ^ bgx, dst);
+				}
+			}
+		}
+		dst1 += p->fix.line_length;
+		s += spitch;
+	}
+}
+
+/*
+ * Optimized fast_imageblit for bpp == 32. ppw = 1, bit_mask = 1 folded
+ * into the code, main loop unrolled.
+ */
+
+static inline void fast_imageblit32(const struct fb_image *image,
+				    struct fb_info *p, u8 __iomem * dst1,
+				    u32 fgcolor, u32 bgcolor)
+{
+	u32 fgx = fgcolor, bgx = bgcolor;
+	u32 spitch = (image->width + 7) / 8;
+	u32 end_mask, eorx;
+	const char *s = image->data, *src;
+	u32 __iomem *dst;
+	const u32 *tab = NULL;
+	int i, j, k;
+
+	tab = cfb_tab32;
+
+	eorx = fgx ^ bgx;
+	k = image->width;
+
+	for (i = image->height; i--;) {
+		dst = (u32 __iomem *) dst1;
+		src = s;
+
+		j = k;
+		while (j >= 8) {
+			u8 bits = *src;
+			end_mask = tab[(bits >> 7) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 6) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 5) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 4) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 3) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 2) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[(bits >> 1) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			end_mask = tab[bits & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+			src++;
+			j -= 8;
+		}
+		if (j != 0) {
+			u32 bits = (u32) * src;
+			while (j > 1) {
+				end_mask = tab[(bits >> 7) & 1];
+				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
+				bits <<= 1;
+				j--;
+			}
+			end_mask = tab[(bits >> 7) & 1];
+			FB_WRITEL((end_mask & eorx) ^ bgx, dst);
+		}
+		dst1 += p->fix.line_length;
+		s += spitch;
+	}
+}
+
 void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
 {
 	u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0;
@@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co
 			bgcolor = image->bg_color;
 		}	
 		
-		if (32 % bpp == 0 && !start_index && !pitch_index && 
-		    ((width & (32/bpp-1)) == 0) &&
-		    bpp >= 8 && bpp <= 32) 			
-			fast_imageblit(image, p, dst1, fgcolor, bgcolor);
-		else 
+		if (!start_index && !pitch_index) {
+			if (bpp == 32)
+				fast_imageblit32(image, p, dst1, fgcolor,
+						 bgcolor);
+			else if (bpp == 16 && (width & 1) == 0)
+				fast_imageblit16(image, p, dst1, fgcolor,
+						 bgcolor);
+			else if (bpp == 8 && (width & 3) == 0)
+				fast_imageblit(image, p, dst1, fgcolor,
+					       bgcolor);
+			else
+				slow_imageblit(image, p, dst1, fgcolor,
+					       bgcolor,
+					       start_index, pitch_index);
+		} else
 			slow_imageblit(image, p, dst1, fgcolor, bgcolor,
 					start_index, pitch_index);
 	} else