bfin: Refactor duplicated assembly-related macros
[libav.git] / libswscale / bfin / internal_bfin.S
1 /*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 * April 20, 2007
4 *
5 * Blackfin video color space converter operations
6 * convert I420 YV12 to RGB in various formats
7 *
8 * This file is part of Libav.
9 *
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25
26 /*
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29
30
31 The following calculation is used for the conversion:
32
33 r = clipz((y - oy) * cy + crv * (v - 128))
34 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
35 b = clipz((y - oy) * cy + cbu * (u - 128))
36
37 y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38
39
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
42
43
44 1) Use byteop16m to subtract quad bytes. We use it on U8 data,
45 so the offsets need to be renormalized to 8 bits.
46
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
49
50 3) Compute into the accumulators cy * yx0, cy * yx1.
51
52 4) Compute each of the linear equations:
53 r = clipz((y - oy) * cy + crv * (v - 128))
54
55 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
56
57 b = clipz((y - oy) * cy + cbu * (u - 128))
58
59 Reuse of the accumulators requires that we actually multiply
60 twice once with addition and the second time with a subtraction.
61
62 Because of this we need to compute the equations in the order R B
63 then G saving the writes for B in the case of 24/32 bit color
64 formats.
65
66 API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
68
69 A B
70 --- ---
71 i2 = cb i3 = cr
72 i1 = coeff i0 = y
73
74 Where coeffs have the following layout in memory.
75
76 uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;
77
78 coeffs is a pointer to oy.
79
80 The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
81 that data replication is used to simplify the internal algorithms for the
82 dual-MAC architecture of Blackfin.
83
84 All routines are exported with _ff_bfin_ as a symbol prefix.
85
86 Rough performance gain compared against -O3:
87
88 2779809/1484290 187.28%
89
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation on the reference codes on Blackfin I guess it must have
93 to do with the memory system.
94 */
95
96 #include "libavutil/bfin/asm.h"
97
/* Second argument passed to DEFUN by the RGB line converters below.
 * NOTE(review): presumably selects the L1 memory section -- confirm against
 * libavutil/bfin/asm.h. */
98 #define MEM mL1
99
100
101 .text
102
/* Length in bytes of the coefficient table (11 32-bit entries). It is loaded
 * into l1 so that i1 addresses the table as a circular buffer. */
103 #define COEFF_LEN 11*4
/* Post-modify step in bytes (loaded into m0) applied to i1 after the first
 * gmask load of each loop pass; together with the circular wrap it moves i1
 * from the end of the table back to the cy entry, skipping oy/oc/zero. */
104 #define COEFF_REL_CY_OFF 4*4
105
/* Frame-pointer-relative offsets used to load the stack-passed arguments
 * out, dW and coeffs of the yuv2rgb*_line routines. */
106 #define ARG_OUT 20
107 #define ARG_W 24
108 #define ARG_COEFF 28
109
/*
 * yuv2rgb565_line: convert one line of planar YUV 4:2:0 samples to packed
 * 16-bit pixels (5/6/5 bit fields), four pixels per hardware-loop pass.
 *
 * Register arguments: r0 = Y, r1 = U, r2 = V.
 * Stack arguments:    out (ARG_OUT), dW (ARG_W), coeffs (ARG_COEFF).
 *
 * The loop runs dW >> 2 times. Each pass consumes one 32-bit word of Y and
 * one 16-bit word each of U and V, and stores two 32-bit words (two packed
 * pixels each) through p1. i1 walks the coefficient table as a circular
 * buffer of COEFF_LEN bytes, so the table is re-read from the start for
 * every pass without resetting the pointer.
 */
110 DEFUN(yuv2rgb565_line,MEM,
111 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
112 link 0;
113 [--sp] = (r7:4); // save r4-r7, restored before return
114 p1 = [fp+ARG_OUT]; // p1 = output pointer
115 r3 = [fp+ARG_W]; // r3 = dW
116
117 i0 = r0; // i0 = Y
118 i2 = r1; // i2 = U
119 i3 = r2; // i3 = V
120
121 r0 = [fp+ARG_COEFF];
122 i1 = r0; // i1 = coefficient table
123 b1 = i1; // circular buffer base = start of table
124 l1 = COEFF_LEN; // circular buffer length = whole table
125 m0 = COEFF_REL_CY_OFF; // step used to jump from gmask back to cy
126 p0 = r3;
127
128 r0 = [i0++]; // 4Y
129 r1.l = w[i2++]; // 2u
130 r1.h = w[i3++]; // 2v
131 p0 = p0>>2; // loop count: 4 pixels per pass
132
133 lsetup (.L0565, .L1565) lc0 = p0;
134
135 /*
136 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
137 r0 -- used to load 4ys
138 r1 -- used to load 2us,2vs
139 r4 -- y3,y2
140 r5 -- y1,y0
141 r6 -- u1,u0
142 r7 -- v1,v0
143 */
144 r2=[i1++]; // oy
145 .L0565:
146 /*
147 rrrrrrrr gggggggg bbbbbbbb
148 5432109876543210
149 bbbbb >>3
150 gggggggg <<3
151 rrrrrrrr <<8
152 rrrrrggggggbbbbb
153 */
/* NOTE(review): the code below shifts the value it labels R by >>3 and the
 * value it labels B by <<8, i.e. the reverse of the diagram above. Which
 * channel ends up in which bit field presumably depends on the coefficient
 * and mask tables supplied by the caller -- confirm. */
154 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
155 (r7,r6) = byteop16m (r1:0, r3:2) (r);
156 r5 = r5 << 2 (v); // y1,y0
157 r4 = r4 << 2 (v); // y3,y2
158 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
159 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* ---- first pixel pair: y1,y0 with u0,v0 ---- */
160 /* Y' = y*cy */
161 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
162
163 /* R = Y+ crv*(Cr-128) */
164 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
165 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask; accumulators back to Y'
166 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
167 r2 = r2 >> 3 (v);
168 r3 = r2 & r5; // r3 starts the packed result with the R field
169
170 /* B = Y+ cbu*(Cb-128) */
171 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
172 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask; accumulators back to Y'
173 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
174 r2 = r2 << 8 (v);
175 r2 = r2 & r5;
176 r3 = r3 | r2; // merge B field
177
178 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
179 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
180 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
181 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, then step i1 (wrapping) to cy
182 r2 = r2 << 3 (v);
183 r2 = r2 & r5;
184 r3 = r3 | r2; // merge G field
185 [p1++]=r3 || r1=[i1++]; // cy; store the first two packed pixels
186
/* ---- second pixel pair: y3,y2 with u1,v1 ---- */
187 /* Y' = y*cy */
188
189 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
190
191 /* R = Y+ crv*(Cr-128) */
192 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
193 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
194 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
195 r2 = r2 >> 3 (v);
196 r3 = r2 & r5;
197
198 /* B = Y+ cbu*(Cb-128) */
199 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
200 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
201 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
202 r2 = r2 << 8 (v);
203 r2 = r2 & r5;
204 r3 = r3 | r2;
205
206 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
207 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
208 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask; i1 wraps to oy
209 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y for the next pass
210 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
211 r2 = r2 & r5;
212 r3 = r3 | r2;
213 [p1++]=r3 || r1.h = w[i3++]; // 2v; store the second two packed pixels
214 .L1565: r2=[i1++]; // oy
215
216 l1 = 0; // turn circular addressing on i1 back off
217
218 (r7:4) = [sp++];
219 unlink;
220 rts;
221 DEFUN_END(yuv2rgb565_line)
222
/*
 * yuv2rgb555_line: convert one line of planar YUV 4:2:0 samples to packed
 * 16-bit pixels (5/5/5 bit fields), four pixels per hardware-loop pass.
 *
 * Register arguments: r0 = Y, r1 = U, r2 = V.
 * Stack arguments:    out (ARG_OUT), dW (ARG_W), coeffs (ARG_COEFF).
 *
 * Same structure as yuv2rgb565_line; only the shift amounts applied before
 * masking differ (>>3, <<7, <<2 instead of >>3, <<8, <<3). The loop runs
 * dW >> 2 times and stores two 32-bit words (two packed pixels each) per
 * pass through p1. i1 walks the coefficient table as a circular buffer of
 * COEFF_LEN bytes.
 */
223 DEFUN(yuv2rgb555_line,MEM,
224 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
225 link 0;
226 [--sp] = (r7:4); // save r4-r7, restored before return
227 p1 = [fp+ARG_OUT]; // p1 = output pointer
228 r3 = [fp+ARG_W]; // r3 = dW
229
230 i0 = r0; // i0 = Y
231 i2 = r1; // i2 = U
232 i3 = r2; // i3 = V
233
234 r0 = [fp+ARG_COEFF];
235 i1 = r0; // i1 = coefficient table
236 b1 = i1; // circular buffer base = start of table
237 l1 = COEFF_LEN; // circular buffer length = whole table
238 m0 = COEFF_REL_CY_OFF; // step used to jump from gmask back to cy
239 p0 = r3;
240
241 r0 = [i0++]; // 4Y
242 r1.l = w[i2++]; // 2u
243 r1.h = w[i3++]; // 2v
244 p0 = p0>>2; // loop count: 4 pixels per pass
245
246 lsetup (.L0555, .L1555) lc0 = p0;
247
248 /*
249 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
250 r0 -- used to load 4ys
251 r1 -- used to load 2us,2vs
252 r4 -- y3,y2
253 r5 -- y1,y0
254 r6 -- u1,u0
255 r7 -- v1,v0
256 */
257 r2=[i1++]; // oy
258 .L0555:
259 /*
260 rrrrrrrr gggggggg bbbbbbbb
261 5432109876543210
262 bbbbb >>3
263 gggggggg <<2
264 rrrrrrrr <<7
265 xrrrrrgggggbbbbb
266 */
/* NOTE(review): the code below shifts the value it labels R by >>3 and the
 * value it labels B by <<7, i.e. the reverse of the diagram above. Which
 * channel ends up in which bit field presumably depends on the coefficient
 * and mask tables supplied by the caller -- confirm. */
267
268 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
269 (r7,r6) = byteop16m (r1:0, r3:2) (r);
270 r5 = r5 << 2 (v); // y1,y0
271 r4 = r4 << 2 (v); // y3,y2
272 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
273 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* ---- first pixel pair: y1,y0 with u0,v0 ---- */
274 /* Y' = y*cy */
275 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
276
277 /* R = Y+ crv*(Cr-128) */
278 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
279 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask; accumulators back to Y'
280 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
281 r2 = r2 >> 3 (v);
282 r3 = r2 & r5; // r3 starts the packed result with the R field
283
284 /* B = Y+ cbu*(Cb-128) */
285 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
286 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask; accumulators back to Y'
287 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
288 r2 = r2 << 7 (v);
289 r2 = r2 & r5;
290 r3 = r3 | r2; // merge B field
291
292 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
293 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
294 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
295 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, then step i1 (wrapping) to cy
296 r2 = r2 << 2 (v);
297 r2 = r2 & r5;
298 r3 = r3 | r2; // merge G field
299 [p1++]=r3 || r1=[i1++]; // cy; store the first two packed pixels
300
/* ---- second pixel pair: y3,y2 with u1,v1 ---- */
301 /* Y' = y*cy */
302
303 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
304
305 /* R = Y+ crv*(Cr-128) */
306 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
307 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
308 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
309 r2 = r2 >> 3 (v);
310 r3 = r2 & r5;
311
312 /* B = Y+ cbu*(Cb-128) */
313 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
314 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
315 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
316 r2 = r2 << 7 (v);
317 r2 = r2 & r5;
318 r3 = r3 | r2;
319
320 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
321 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
322 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask; i1 wraps to oy
323 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y for the next pass
324 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
325 r2 = r2 & r5;
326 r3 = r3 | r2;
327 [p1++]=r3 || r1.h=w[i3++]; // 2v; store the second two packed pixels
328
329 .L1555: r2=[i1++]; // oy
330
331 l1 = 0; // turn circular addressing on i1 back off
332
333 (r7:4) = [sp++];
334 unlink;
335 rts;
336 DEFUN_END(yuv2rgb555_line)
337
/*
 * yuv2rgb24_line: convert one line of planar YUV 4:2:0 samples to packed
 * 3-byte pixels, four pixels per hardware-loop pass.
 *
 * Register arguments: r0 = Y, r1 = U, r2 = V.
 * Stack arguments:    out (ARG_OUT), dW (ARG_W), coeffs (ARG_COEFF).
 *
 * Every computation produces one channel for two neighbouring pixels at once
 * (low and high half of the result register). p1 stores the low-half byte
 * and p2, which starts 3 bytes (one pixel) after p1, stores the high-half
 * byte after a >>16. Both pointers then skip 3 bytes to step over the pixel
 * written through the other pointer. Bytes are stored in the order R, G, B
 * (as labelled in the code); B is held in r3 until G has been written.
 *
 * The mask entries of the coefficient table are still loaded into r5 to keep
 * i1 in step with the table, but they are not applied here.
 *
 * NOTE(review): "rX=rX>>16 || B[p1++]=rX" relies on the parallel store
 * seeing rX before the shift -- confirm against the Blackfin multi-issue
 * rules.
 */
338 DEFUN(yuv2rgb24_line,MEM,
339 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
340 link 0;
341 [--sp] = (r7:4); // save r4-r7, restored before return
342 p1 = [fp+ARG_OUT]; // p1 = output pointer (first pixel of each pair)
343 r3 = [fp+ARG_W]; // r3 = dW
344 p2 = p1;
345 p2 += 3; // p2 = output pointer for the second pixel of each pair
346
347 i0 = r0; // i0 = Y
348 i2 = r1; // i2 = U
349 i3 = r2; // i3 = V
350
351 r0 = [fp+ARG_COEFF]; // coeff buffer
352 i1 = r0;
353 b1 = i1; // circular buffer base = start of table
354 l1 = COEFF_LEN; // circular buffer length = whole table
355 m0 = COEFF_REL_CY_OFF; // step used to jump from gmask back to cy
356 p0 = r3;
357
358 r0 = [i0++]; // 4Y
359 r1.l = w[i2++]; // 2u
360 r1.h = w[i3++]; // 2v
361 p0 = p0>>2; // loop count: 4 pixels per pass
362
363 lsetup (.L0888, .L1888) lc0 = p0;
364
365 /*
366 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
367 r0 -- used to load 4ys
368 r1 -- used to load 2us,2vs
369 r4 -- y3,y2
370 r5 -- y1,y0
371 r6 -- u1,u0
372 r7 -- v1,v0
373 */
374 r2=[i1++]; // oy
375 .L0888:
376 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
377 (r7,r6) = byteop16m (r1:0, r3:2) (r);
378 r5 = r5 << 2 (v); // y1,y0
379 r4 = r4 << 2 (v); // y3,y2
380 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
381 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
382
/* ---- first pixel pair: y1,y0 with u0,v0 ---- */
383 /* Y' = y*cy */
384 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
385
386 /* R = Y+ crv*(Cr-128) */
387 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
388 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask (loaded, unused); accumulators back to Y'
389 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
390 r2=r2>>16 || B[p1++]=r2; // R byte of the low-half pixel
391 B[p2++]=r2; // R byte of the high-half pixel
392
393 /* B = Y+ cbu*(Cb-128) */
394 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
395 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (loaded, unused); accumulators back to Y'
396 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu; B kept in r3 until G is stored
397
398 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
399 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
400 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
401 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
402
403 r2=r2>>16 || B[p1++]=r2; // G bytes
404 B[p2++]=r2;
405
406 r3=r3>>16 || B[p1++]=r3; // B bytes
407 B[p2++]=r3 || r1=[i1++]; // cy
408
409 p1+=3; // skip the pixel written through p2
410 p2+=3; // skip the pixel written through p1
/* ---- second pixel pair: y3,y2 with u1,v1 ---- */
411 /* Y' = y*cy */
412 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
413
414 /* R = Y+ crv*(Cr-128) */
415 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
416 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
417 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
418 r2=r2>>16 || B[p1++]=r2;
419 B[p2++]=r2;
420
421 /* B = Y+ cbu*(Cb-128) */
422 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
423 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
424 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
425
426 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
427 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
428 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
429 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask; i1 wraps to oy
430 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
431 B[p2++]=r2 || r1.l = w[i2++]; // 2u
432 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
433 B[p2++]=r3 || r2=[i1++]; // oy
434
435 p1+=3;
436 .L1888: p2+=3;
437
438 l1 = 0; // turn circular addressing on i1 back off
439
440 (r7:4) = [sp++];
441 unlink;
442 rts;
443 DEFUN_END(yuv2rgb24_line)
444
445
446
/* Frame-pointer-relative offsets used by uyvytoyv12 and yuyvtoyv12 to load
 * their stack-passed arguments (vdst, width, height, lumStride, chromStride,
 * srcStride). */
447 #define ARG_vdst 20
448 #define ARG_width 24
449 #define ARG_height 28
450 #define ARG_lumStride 32
451 #define ARG_chromStride 36
452 #define ARG_srcStride 40
453
/*
 * uyvytoyv12: convert packed UYVY to planar Y/U/V with vertically
 * subsampled chroma.
 *
 * Register arguments: r0 = src, r1 = ydst, r2 = udst.
 * Stack arguments:    vdst, width, height, lumStride, chromStride, srcStride.
 *
 * Two source rows are processed together (outer loop: height >> 1 passes).
 * The inner loop runs width >> 2 times; each pass reads 8 bytes from each of
 * the two rows, writes 4 luma bytes to each of the two luma rows, and writes
 * 2 U and 2 V bytes obtained by averaging the chroma of the two rows
 * (byteop1p).
 *
 * The source words for the next pass are pre-read at the end of every pass,
 * so i0/i1 overshoot each row by 8 bytes; m0 compensates for that.
 *
 * Row stepping for luma: the inner loop already advances p0/p1 by width
 * bytes, so the end-of-row step must be 2*lumStride - width to land on the
 * next row pair. p2 is set to that value once p1 has been derived; this
 * equals lumStride (the previous step) when lumStride == width and corrects
 * the output position when the luma rows are padded.
 */
454 DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
455 int width, int height,
456 int lumStride, int chromStride, int srcStride)):
457 link 0;
458 [--sp] = (r7:4,p5:4); // save r4-r7 and p4-p5, restored before return
459
460 p0 = r1; // Y top even
461
462 i2 = r2; // *u
463 r2 = [fp + ARG_vdst];
464 i3 = r2; // *v
465
466 r1 = [fp + ARG_srcStride];
467 r2 = r0 + r1;
468 i0 = r0; // uyvy_T even
469 i1 = r2; // uyvy_B odd
470
471 p2 = [fp + ARG_lumStride];
472 p1 = p0 + p2; // Y bot odd
473
474 p5 = [fp + ARG_width];
475 p4 = [fp + ARG_height];
476 r0 = p5;
p2 = p2 << 1; // 2*lumStride
p2 -= p5; // 2*lumStride - width: luma step to the next row pair
477 p4 = p4 >> 1; // outer loop count: two rows per pass
478 p5 = p5 >> 2; // inner loop count: four pixels per pass
479
480 r2 = r0 << 1;
481 r1 = r1 << 1;
482 r1 = r1 - r2; // srcStride + (srcStride - 2*width)
483 r1 += -8; // i0,i1 is pre read need to correct
484 m0 = r1;
485
486 r2 = [fp + ARG_chromStride];
487 r0 = r0 >> 1;
488 r2 = r2 - r0; // chromStride - width/2: chroma step to the next row
489 m1 = r2;
490
491 /* I0,I1 - src input line pointers
492 * p0,p1 - luma output line pointers
493 * I2 - dstU
494 * I3 - dstV
495 */
496
497 lsetup (0f, 1f) lc1 = p4; // H/2
498 0: r0 = [i0++] || r2 = [i1++];
499 r1 = [i0++] || r3 = [i1++];
500 r4 = byteop1p(r1:0, r3:2); // average of top/bottom rows, first word
501 r5 = byteop1p(r1:0, r3:2) (r); // average of top/bottom rows, second word
502 lsetup (2f, 3f) lc0 = p5; // W/4
503 2: r0 = r0 >> 8(v); // move the Y byte of each half-word to the low byte
504 r1 = r1 >> 8(v);
505 r2 = r2 >> 8(v);
506 r3 = r3 >> 8(v);
507 r0 = bytepack(r0, r1);
508 r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
509 r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
510 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
511 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
512 r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
513 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
514
515 i0 += m0;
516 i1 += m0;
517 i2 += m1;
518 i3 += m1;
519 p0 = p0 + p2;
520 1: p1 = p1 + p2;
521
522 (r7:4,p5:4) = [sp++];
523 unlink;
524 rts;
525 DEFUN_END(uyvytoyv12)
526
/*
 * yuyvtoyv12: convert packed YUYV to planar Y/U/V with vertically
 * subsampled chroma.
 *
 * Register arguments: r0 = src, r1 = ydst, r2 = udst.
 * Stack arguments:    vdst, width, height, lumStride, chromStride, srcStride.
 *
 * Two source rows are processed together (outer loop: height >> 1 passes).
 * The inner loop runs width >> 2 times; each pass reads 8 bytes from each of
 * the two rows, writes 4 luma bytes to each of the two luma rows, and writes
 * 2 U and 2 V bytes obtained by averaging the chroma of the two rows
 * (byteop1p). Luma is taken from the low byte of every half-word
 * (bytepack); chroma is exposed by shifting every half-word right by 8.
 *
 * The source words for the next pass are pre-read at the end of every pass,
 * so i0/i1 overshoot each row by 8 bytes; m0 compensates for that.
 *
 * Row stepping for luma: the inner loop already advances p0/p1 by width
 * bytes, so the end-of-row step must be 2*lumStride - width to land on the
 * next row pair. p2 is set to that value once p1 has been derived; this
 * equals lumStride (the previous step) when lumStride == width and corrects
 * the output position when the luma rows are padded.
 */
527 DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
528 int width, int height,
529 int lumStride, int chromStride, int srcStride)):
530 link 0;
531 [--sp] = (r7:4,p5:4); // save r4-r7 and p4-p5, restored before return
532
533 p0 = r1; // Y top even
534
535 i2 = r2; // *u
536 r2 = [fp + ARG_vdst];
537 i3 = r2; // *v
538
539 r1 = [fp + ARG_srcStride];
540 r2 = r0 + r1;
541
542 i0 = r0; // yuyv_T even
543 i1 = r2; // yuyv_B odd
544
545 p2 = [fp + ARG_lumStride];
546 p1 = p0 + p2; // Y bot odd
547
548 p5 = [fp + ARG_width];
549 p4 = [fp + ARG_height];
550 r0 = p5;
p2 = p2 << 1; // 2*lumStride
p2 -= p5; // 2*lumStride - width: luma step to the next row pair
551 p4 = p4 >> 1; // outer loop count: two rows per pass
552 p5 = p5 >> 2; // inner loop count: four pixels per pass
553
554 r2 = r0 << 1;
555 r1 = r1 << 1;
556 r1 = r1 - r2; // srcStride + (srcStride - 2*width)
557 r1 += -8; // i0,i1 is pre read need to correct
558 m0 = r1;
559
560 r2 = [fp + ARG_chromStride];
561 r0 = r0 >> 1;
562 r2 = r2 - r0; // chromStride - width/2: chroma step to the next row
563 m1 = r2;
564
565 /* I0,I1 - src input line pointers
566 * p0,p1 - luma output line pointers
567 * I2 - dstU
568 * I3 - dstV
569 */
570
571 lsetup (0f, 1f) lc1 = p4; // H/2
572 0: r0 = [i0++] || r2 = [i1++];
573 r1 = [i0++] || r3 = [i1++];
574 r4 = bytepack(r0, r1); // four luma bytes of the top row
575 r5 = bytepack(r2, r3); // four luma bytes of the bottom row
576 lsetup (2f, 3f) lc0 = p5; // W/4
577 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
578 r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
579 r2 = r2 >> 8(v);
580 r3 = r3 >> 8(v);
581 r4 = byteop1p(r1:0, r3:2); // average of top/bottom chroma, first word
582 r5 = byteop1p(r1:0, r3:2) (r); // average of top/bottom chroma, second word
583 r6 = pack(r5.l, r4.l);
584 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
585 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
586 r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
587 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
588
589 i0 += m0;
590 i1 += m0;
591 i2 += m1;
592 i3 += m1;
593 p0 = p0 + p2;
594 1: p1 = p1 + p2;
595
596 (r7:4,p5:4) = [sp++];
597 unlink;
598 rts;
599 DEFUN_END(yuyvtoyv12)