/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros which should implement the basic code chunks responsible for
 * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
 * examples.
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option for reverse scanline processing
 */

/*
 * Bit flags for the 'generate_composite_function' macro, used to tune
 * the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2
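
/*
 * For example (a sketch based on typical usage in pixman-arm-neon-asm.S),
 * an OVER-type operation on 32bpp data would pass
 * 'FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP', while a simple SRC
 * copy only needs 'FLAG_DST_WRITEONLY'.
 */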

/*
 * Offset in the stack where the mask and source pointer/stride can be
 * accessed from the 'init' macro. This is useful for doing special
 * handling for solid masks.
 */
.set ARGS_STACK_OFFSET,        40

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
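
/*
 * The file that includes this template is expected to choose a default,
 * e.g. (a sketch):
 *
 *     .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
 *
 * 'generate_composite_function' then picks PREFETCH_TYPE_DEFAULT up as
 * the starting value for PREFETCH_TYPE_CURRENT.
 */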

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm
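
/*
 * For instance (a sketch), 'pixldst 16, vld1, 16, 4, SRC, 0' selects the
 * two-register case above and expands to a post-incremented load:
 *
 *     vld1.16 {d6, d7}, [SRC]!
 */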

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm
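
/*
 * Example (a sketch): 'pixld_a 8, 16, 4, DST_R' loads eight 16bpp pixels
 * (16 bytes, so the alignment hint is 128 bits) and expands to:
 *
 *     vld1.16 {d6, d7}, [DST_R, :128]!
 */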

/*
 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X and
 * SRC_WIDTH_FIXED register aliases to be defined).
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[0]}, [TMP1, :16]
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[1]}, [TMP2, :16]
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[2]}, [TMP1, :16]
    vld1.16 {d&reg1&[3]}, [TMP2, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    vld1.32 {d&reg1&[1]}, [TMP2, :32]
.else
    .error "unsupported"
.endif
.endm
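
/*
 * A sketch of what one step of the address arithmetic above computes
 * (16.16 fixed-point stepping along the source scanline):
 *
 *     pixel_addr = mem_operand + (VX >> 16) * bytes_per_pixel
 *     VX += UNIT_X, then wrap VX by SRC_WIDTH_FIXED (for repeating
 *     sources) while the result stays non-negative
 *
 * The vector loads are interleaved with the index updates to hide
 * their latency.
 */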

.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* elem_size == 32 */
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    sub     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg2&[0]}, [TMP2, :32]
    mov     TMP2, VX, asr #16
    add     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[1]}, [TMP1, :32]
    vld1.32 {d&reg2&[1]}, [TMP2, :32]
.else
    pixld1_s elem_size, reg1, mem_operand
    pixld1_s elem_size, reg2, mem_operand
.endif
.endm

.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
.endif
.endm

.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if numbytes == 32
    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    pixdeinterleave elem_size, %(basereg+4)
.elseif numbytes == 16
    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
.elseif numbytes == 8
    pixld1_s elem_size, %(basereg+1), mem_operand
.elseif numbytes == 4
    .if elem_size == 32
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .elseif elem_size == 16
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 4, mem_operand
        pixld0_s elem_size, %(basereg+0), 5, mem_operand
        pixld0_s elem_size, %(basereg+0), 6, mem_operand
        pixld0_s elem_size, %(basereg+0), 7, mem_operand
    .endif
.elseif numbytes == 2
    .if elem_size == 16
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .endif
.elseif numbytes == 1
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld_s numpix, bpp, basereg, mem_operand
.if bpp > 0
    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
.endif
.endm

.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent of the rest of the pixel
 * processing code. It starts at the top left pixel and moves forward
 * across pixels, and it can jump across scanlines. The prefetch distance
 * is handled in an 'incremental' way: it starts from 0 and advances to
 * the optimal distance over time. After reaching the optimal prefetch
 * distance, it is kept constant. There are some checks which prevent
 * prefetching unneeded pixel lines below the image (but it still can
 * prefetch a bit more data on the right side of the image - not a big
 * issue, and it may actually be helpful when rendering text glyphs). An
 * additional trick is the use of an LDR instruction for prefetch instead
 * of PLD when moving to the next line; the point is that we have a high
 * chance of getting a TLB miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working
 * with fully cached data). But in reality, due to having a separate
 * pipeline and instruction queue for the NEON unit in ARM Cortex-A8,
 * normal ARM code can execute simultaneously with NEON code and be
 * completely shadowed by it. Thus we get no performance overhead at
 * all (*). This looks like a very nice feature of Cortex-A8, if used
 * wisely. We don't have a hardware prefetcher, but we can still
 * implement some rather advanced prefetch logic in software for almost
 * zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some
 * trivial pixel processing like a simple copy. Anyway, having prefetch
 * is a must when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm
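
/*
 * E.g. 'PF add PF_X, PF_X, #8' assembles to 'add PF_X, PF_X, #8' when
 * PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_ADVANCED, and to nothing at
 * all otherwise.
 */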

.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

.macro fetch_mask_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm
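
/*
 * Example (a sketch): with pixblock_size = 8, mask_bpp = 8 and
 * mask_basereg = 24, this reduces to 'pixld 8, 8, 23, MASK' and
 * expands to:
 *
 *     vld1.8 {d24}, [MASK]!
 */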

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (on a 16-byte boundary). When the
 * destination buffer uses a 24bpp format, this alignment cannot be
 * done on whole-pixel boundaries and is skipped.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst         DST_R, #0xF
    beq         2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #lowbit
    beq         1f
.endif
    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add         DST_R, DST_R, #lowbit
.endif
    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #lowbit
    beq         1f
.endif
    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs an operation on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally stay unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - prefetch is suppressed when set to 0
 * dst_aligned_flag   - selects whether the destination buffer
 *                      is aligned
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst         W, #(pixblock_size - 1)
    beq         2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
    pixld_src   chunk_size, src_bpp, src_basereg, SRC
    pixld       chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add      PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
.if dst_aligned_flag != 0
    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm

/*
 * Macro which performs all the needed operations to switch to the next
 * scanline and start the next loop iteration unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd        W, [sp] /* load W and H (width and height) from stack */
.else
    mov         W, ORIG_W
.endif
    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs        H, H, #1
    mov         DST_R, DST_W
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.endif
    bge         start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24

    pixman_asm_function fname

    push        {r4-r12, lr}        /* save all registers */

/*
 * Select the prefetch type for this function. If the prefetch distance
 * is set to 0, prefetch is disabled entirely. If one of the color
 * formats is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

    .macro pixld_src x:vararg
        pixld x
    .endm
    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm
/*
 * Assign symbolic names to registers
 */
    W           .req        r0      /* width (is updated during processing) */
    H           .req        r1      /* height (is updated during processing) */
    DST_W       .req        r2      /* destination buffer pointer for writes */
    DST_STRIDE  .req        r3      /* destination image stride */
    SRC         .req        r4      /* source buffer pointer */
    SRC_STRIDE  .req        r5      /* source image stride */
    DST_R       .req        r6      /* destination buffer pointer for reads */

    MASK        .req        r7      /* mask pointer */
    MASK_STRIDE .req        r8      /* mask stride */

    PF_CTL      .req        r9      /* combined lines counter and prefetch */
                                    /* distance increment counter */
    PF_X        .req        r10     /* pixel index in a scanline for current */
                                    /* prefetch position */
    PF_SRC      .req        r11     /* pointer to source scanline start */
                                    /* for prefetch purposes */
    PF_DST      .req        r12     /* pointer to destination scanline start */
                                    /* for prefetch purposes */
    PF_MASK     .req        r14     /* pointer to mask scanline start */
                                    /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether the source or mask is unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req        r10     /* saved original width */
    DUMMY       .req        r12     /* temporary register */
    .set        regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req        r7      /* saved original width */
    DUMMY       .req        r8      /* temporary register */
    .set        regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req        r4      /* saved original width */
    DUMMY       .req        r5      /* temporary register */
    .set        regs_shortage, 0
.else
    ORIG_W      .req        r1      /* saved original width */
    DUMMY       .req        r1      /* temporary register */
    .set        regs_shortage, 1
.endif

    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr         SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr         MASK, [sp, #48]
.endif
    PF mov      PF_X, #0
.if src_bpp > 0
    ldr         SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr         MASK_STRIDE, [sp, #52]
.endif
    mov         DST_R, DST_W

.if src_bpp == 24
    sub         SRC_STRIDE, SRC_STRIDE, W
    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub         MASK_STRIDE, MASK_STRIDE, W
    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub         DST_STRIDE, DST_STRIDE, W
    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov      PF_SRC, SRC
    PF mov      PF_DST, DST_R
    PF mov      PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov      PF_CTL, H, lsl #4
    PF add      PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push        {r0, r1}
.endif
    subs        H, H, #1
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.else
    mov         ORIG_W, W
.endif
    blt         9f
    cmp         W, #(pixblock_size * 2)
    blt         8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
/*
 * This is the start of the loop designed to process images with small
 * width (less than pixblock_size * 2 pixels). In this case neither
 * pipelining nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         1f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm
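
/*
 * A hypothetical invocation (loosely modeled on the ones in
 * pixman-arm-neon-asm.S; the 'pixman_composite_over_8888_8888_*' helper
 * macros are assumed to be defined by the user):
 *
 * generate_composite_function \
 *     pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
 *     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *     8,  (pixblock size) \
 *     5,  (prefetch distance) \
 *     default_init, \
 *     default_cleanup, \
 *     pixman_composite_over_8888_8888_process_pixblock_head, \
 *     pixman_composite_over_8888_8888_process_pixblock_tail, \
 *     pixman_composite_over_8888_8888_process_pixblock_tail_head
 */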

/*
 * A simplified variant of the function generation template for
 * single scanline processing (for implementing pixman combine
 * functions).
 */
.macro generate_composite_function_scanline        use_nearest_scaling, \
                                                   fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_   = 0, \
                                                   mask_basereg_  = 24

    pixman_asm_function fname

    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

.if use_nearest_scaling != 0
    /*
     * Assign symbolic names to registers for nearest scaling
     */
    W           .req        r0
    DST_W       .req        r1
    SRC         .req        r2
    VX          .req        r3
    UNIT_X      .req        ip
    MASK        .req        lr
    TMP1        .req        r4
    TMP2        .req        r5
    DST_R       .req        r6
    SRC_WIDTH_FIXED .req        r7

    .macro pixld_src x:vararg
        pixld_s x
    .endm

    ldr         UNIT_X, [sp]
    push        {r4-r8, lr}
    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
    .if mask_bpp != 0
    ldr         MASK, [sp, #(24 + 8)]
    .endif
.else
    /*
     * Assign symbolic names to registers
     */
    W           .req        r0      /* width (is updated during processing) */
    DST_W       .req        r1      /* destination buffer pointer for writes */
    SRC         .req        r2      /* source buffer pointer */
    DST_R       .req        ip      /* destination buffer pointer for reads */
    MASK        .req        r3      /* mask pointer */

    .macro pixld_src x:vararg
        pixld x
    .endm
.endif
997 
998 .if (((flags) & FLAG_DST_READWRITE) != 0)
999     .set dst_r_bpp, dst_w_bpp
1000 .else
1001     .set dst_r_bpp, 0
1002 .endif
1003 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
1004     .set DEINTERLEAVE_32BPP_ENABLED, 1
1005 .else
1006     .set DEINTERLEAVE_32BPP_ENABLED, 0
1007 .endif
1008 
1009     .macro fetch_src_pixblock
1010         pixld_src   pixblock_size, src_bpp, \
1011                     (src_basereg - pixblock_size * src_bpp / 64), SRC
1012     .endm
1013 
1014     init
1015     mov         DST_R, DST_W
1016 
1017     cmp         W, #pixblock_size
1018     blt         8f
1019 
1020     ensure_destination_ptr_alignment process_pixblock_head, \
1021                                      process_pixblock_tail, \
1022                                      process_pixblock_tail_head
1023 
1024     subs        W, W, #pixblock_size
1025     blt         7f
1026 
1027     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
1028     pixld_a     pixblock_size, dst_r_bpp, \
1029                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
1030     fetch_src_pixblock
1031     pixld       pixblock_size, mask_bpp, \
1032                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
1033     process_pixblock_head
1034     subs        W, W, #pixblock_size
1035     blt         2f
1036 1:
1037     process_pixblock_tail_head
1038     subs        W, W, #pixblock_size
1039     bge         1b
1040 2:
1041     process_pixblock_tail
1042     pixst_a     pixblock_size, dst_w_bpp, \
1043                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1044 7:
1045     /* Process the remaining trailing pixels in the scanline (dst aligned) */
1046     process_trailing_pixels 0, 1, \
1047                             process_pixblock_head, \
1048                             process_pixblock_tail, \
1049                             process_pixblock_tail_head
1050 
1051     cleanup
1052 .if use_nearest_scaling != 0
1053     pop         {r4-r8, pc}  /* exit */
1054 .else
1055     bx          lr  /* exit */
1056 .endif
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup

.if use_nearest_scaling != 0
    pop         {r4-r8, pc}  /* exit */

    .unreq      DST_R
    .unreq      SRC
    .unreq      W
    .unreq      VX
    .unreq      UNIT_X
    .unreq      TMP1
    .unreq      TMP2
    .unreq      DST_W
    .unreq      MASK
    .unreq      SRC_WIDTH_FIXED

.else
    bx          lr  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
.endif

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .endfunc
.endm

.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, x
.endm

.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, x
.endm
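
/*
 * A hypothetical single scanline combiner (the names are assumptions,
 * sketched after the patterns used in pixman-arm-neon-asm.S):
 *
 * generate_composite_function_single_scanline \
 *     pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *     8, \
 *     default_init, \
 *     default_cleanup, \
 *     pixman_composite_over_8888_8888_process_pixblock_head, \
 *     pixman_composite_over_8888_8888_process_pixblock_tail, \
 *     pixman_composite_over_8888_8888_process_pixblock_tail_head
 */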

/* Default prologue/epilogue, nothing special needs to be done */

.macro default_init
.endm

.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant which additionally saves/restores the d8-d15
 * registers (they need to be saved/restored by the callee according to
 * the ABI). This is required if the code needs to use all the NEON
 * registers.
 */

.macro default_init_need_all_regs
    vpush       {d8-d15}
.endm

.macro default_cleanup_need_all_regs
    vpop        {d8-d15}
.endm

/******************************************************************************/

/*
 * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
 * into a planar a8r8g8b8 format (with the a, r, g, b color components
 * stored in the 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 *          value (in) is lost.
 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vmov.u8     out_a, #255
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm
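
/*
 * How the conversion above works, lane by lane (a sketch; each 16-bit
 * lane of 'in' holds rrrrrggggggbbbbb):
 *
 *     vshrn.u16 out_r, in, #8     ->  rrrrrggg
 *     vsri.u8   out_r, out_r, #5  ->  rrrrrrrr (top bits replicated down)
 *     vshrn.u16 out_g, in, #3     ->  ggggggbb
 *     vsri.u8   out_g, out_g, #6  ->  gggggggg
 *     vsli.u16  in, in, #5        ->  ggggggbbbbbbbbbb (b shifted up)
 *     vshrn.u16 out_b, in, #2     ->  bbbbbbbb
 *
 * Replicating the top bits of each component into the freed low bits
 * expands the 5- and 6-bit values to the full 0-255 range.
 */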

.macro convert_0565_to_x888 in, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm

/*
 * Conversion from a planar a8r8g8b8 format (with the r, g, b color
 * components in the 64-bit registers in_r, in_g, in_b respectively; the
 * alpha component is not needed) into 8 r5g6b5 pixels packed in a
 * 128-bit register (out). Requires two temporary 128-bit registers
 * (tmp1, tmp2).
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    vshll.u8    tmp1, in_g, #8
    vshll.u8    out, in_r, #8
    vshll.u8    tmp2, in_b, #8
    vsri.u16    out, tmp1, #5
    vsri.u16    out, tmp2, #11
.endm

/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in the (out0, out1) register pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
 * value from 'in' is lost.
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    vshl.u16    out0, in,   #5  /* G top 6 bits */
    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
    vsri.u16    in,   in,   #5  /* R is ready in top bits */
    vsri.u16    out0, out0, #6  /* G is ready in top bits */
    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
    vshr.u16    out1, in,   #8  /* R is in place */
    vsri.u16    out0, tmp,  #8  /* G & B is in place */
    vzip.u16    out0, out1      /* everything is in place */
.endm
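
/*
 * Usage sketch: 'convert_four_0565_to_x888_packed d0, d2, d3, d1'
 * unpacks four r5g6b5 pixels from d0 into four x8r8g8b8 pixels in the
 * d2/d3 pair, clobbering d0 and using d1 as scratch.
 */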