b007f07f533e0be53a36fe9505889ca0752d5901
[libav.git] / libswscale / bfin / internal_bfin.S
1 /*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 * April 20, 2007
4 *
5 * Blackfin video color space converter operations
6 * convert I420 YV12 to RGB in various formats
7 *
8 * This file is part of Libav.
9 *
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25
26 /*
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29
30
31 The following calculation is used for the conversion:
32
33 r = clipz((y - oy) * cy + crv * (v - 128))
34 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
35 b = clipz((y - oy) * cy + cbu * (u - 128))
36
37 y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38
39
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
42
43
44 1) Use byteop16m to subtract quad bytes. It operates on U8 data,
45 so the offsets need to be renormalized to 8 bits.
46
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
49
50 3) Compute into the accumulators cy * yx0, cy * yx1.
51
52 4) Compute each of the linear equations:
53 r = clipz((y - oy) * cy + crv * (v - 128))
54
55 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
56
57 b = clipz((y - oy) * cy + cbu * (u - 128))
58
59 Reuse of the accumulators requires that we actually multiply
60 twice once with addition and the second time with a subtraction.
61
62 Because of this we need to compute the equations in the order R B
63 then G saving the writes for B in the case of 24/32 bit color
64 formats.
65
66 API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
68
69 A B
70 --- ---
71 i2 = cb i3 = cr
72 i1 = coeff i0 = y
73
74 Where coeffs have the following layout in memory.
75
76 uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;
77
78 coeffs is a pointer to oy.
79
80 The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
81 that data replication is used to simplify the internal algorithms for the
82 dual MAC architecture of Blackfin.
83
84 All routines are exported with _ff_bfin_ as a symbol prefix.
85
86 Rough performance gain compared against -O3:
87
88 2779809/1484290 187.28%
89
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation on the reference codes on Blackfin; I guess it must have
93 to do with the memory system.
94 */
95
96 #define mL3 .text /* default code section */
97 #if defined(__FDPIC__) && CONFIG_SRAM
98 #define mL1 .l1.text /* FDPIC build with CONFIG_SRAM: use the .l1.text section */
99 #else
100 #define mL1 mL3 /* otherwise mL1 is simply .text */
101 #endif
102 #define MEM mL1 /* section passed to DEFUN by the yuv2rgb*_line routines */
103 /* DEFUN(fname, where, interface): switch to section `where`, export _ff_bfin_<fname> as a function symbol, align, and emit the label name (the use site appends the ':'). `interface` is not used in the expansion; it only documents the C prototype at the use site. */
104 #define DEFUN(fname,where,interface) \
105 .section where; \
106 .global _ff_bfin_ ## fname; \
107 .type _ff_bfin_ ## fname, STT_FUNC; \
108 .align 8; \
109 _ff_bfin_ ## fname
110 /* DEFUN_END(fname): record the symbol size as the distance from the _ff_bfin_<fname> label to the current location. */
111 #define DEFUN_END(fname) \
112 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
113
114
115 .text
116 /* Coefficient table geometry for the yuv2rgb*_line routines (byte units). */
117 #define COEFF_LEN 11*4 /* 11 32-bit words: oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask; used as circular length L1 */
118 #define COEFF_REL_CY_OFF 4*4 /* post-modify applied on the first gmask load: 40 + 16 wraps modulo 44 to byte 12, i.e. word 3 = cy */
119 /* FP-relative offsets (after `link 0`) from which the routines below load their 4th..6th arguments. */
120 #define ARG_OUT 20 /* int *out */
121 #define ARG_W 24 /* int dW */
122 #define ARG_COEFF 28 /* uint32_t *coeffs */
123
124 DEFUN(yuv2rgb565_line,MEM,
125 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
126 link 0; // convert one line of planar YUV to 16-bit packed pixels; r0=Y, r1=U, r2=V, the rest is read via fp
127 [--sp] = (r7:4); // r4-r7 are used as y/u/v working registers and restored on exit
128 p1 = [fp+ARG_OUT]; // p1 = out, the store pointer for packed pixels
129 r3 = [fp+ARG_W]; // r3 = dW
130 // Each loop pass consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words (four 16-bit pixels) through p1.
131 i0 = r0; // i0 walks Y
132 i2 = r1; // i2 walks U (Cb)
133 i3 = r2; // i3 walks V (Cr)
134
135 r0 = [fp+ARG_COEFF];
136 i1 = r0; // i1 walks the coefficient table
137 b1 = i1; // circular buffer base = coeffs
138 l1 = COEFF_LEN; // circular length: i1 wraps after the 11 coefficient words
139 m0 = COEFF_REL_CY_OFF; // extra step used on the first gmask load so that i1 wraps round to cy
140 p0 = r3;
141
142 r0 = [i0++]; // 4Y (32-bit pre-read for the first pass)
143 r1.l = w[i2++]; // 2u
144 r1.h = w[i3++]; // 2v
145 p0 = p0>>2; // loop count = dW/4; a remainder of dW%4 pixels is not converted
146
147 lsetup (.L0565, .L1565) lc0 = p0;
148
149 /*
150 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
151 r0 -- used to load 4ys
152 r1 -- used to load 2us,2vs
153 r4 -- y3,y2
154 r5 -- y1,y0
155 r6 -- u1,u0
156 r7 -- v1,v0
157 */
158 r2=[i1++]; // oy
159 .L0565:
160 /*
161 rrrrrrrr gggggggg bbbbbbbb
162 5432109876543210
163 bbbbb >>3
164 gggggggg <<3
165 rrrrrrrr <<8
166 rrrrrggggggbbbbb
167 */
168 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // r4,r5 = y bytes minus oy as 16-bit values; r3 = oc
169 (r7,r6) = byteop16m (r1:0, r3:2) (r); // r7,r6 = v and u bytes minus oc as 16-bit values
170 r5 = r5 << 2 (v); // y1,y0 (prescale by 4)
171 r4 = r4 << 2 (v); // y3,y2
172 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
173 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
174 /* Y' = y*cy */
175 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // a1 = pixel 1, a0 = pixel 0; r1 = crv
176
177 /* R = Y+ crv*(Cr-128) */
178 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // pixels 0 and 1 share v0
179 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the chroma term so a1/a0 hold Y' again; r5 = rmask
180 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // clip step of the formula (r0 holds the zero word); r1 = cbu
181 r2 = r2 >> 3 (v); // NOTE(review): the diagram above labels the >>3 field 'b' while this path is labelled R/rmask; the colour that lands here depends on the coefficient table - confirm against the caller
182 r3 = r2 & r5; // r3 starts accumulating the two packed pixels
183
184 /* B = Y+ cbu*(Cb-128) */
185 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); // pixels 0 and 1 share u0
186 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
187 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
188 r2 = r2 << 8 (v);
189 r2 = r2 & r5;
190 r3 = r3 | r2;
191
192 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
193 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
194 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
195 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; the m0 step makes the next i1 load fetch cy
196 r2 = r2 << 3 (v);
197 r2 = r2 & r5;
198 r3 = r3 | r2;
199 [p1++]=r3 || r1=[i1++]; // store pixels 0,1; r1 = cy
200
201 /* Y' = y*cy */
202 // second half: same sequence for pixels 2,3 using r4 (y3,y2) and the high halves u1/v1
203 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
204
205 /* R = Y+ crv*(Cr-128) */
206 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
208 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
209 r2 = r2 >> 3 (v);
210 r3 = r2 & r5;
211
212 /* B = Y+ cbu*(Cb-128) */
213 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
215 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
216 r2 = r2 << 8 (v);
217 r2 = r2 & r5;
218 r3 = r3 | r2;
219
220 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
222 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask (last table word; i1 wraps back to oy)
223 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y pre-read for the next pass; also executed on the final pass, so 4 Y bytes past the converted range are read
224 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u (pre-read, likewise 2 bytes past the end on the final pass)
225 r2 = r2 & r5;
226 r3 = r3 | r2;
227 [p1++]=r3 || r1.h = w[i3++]; // store pixels 2,3; 2v pre-read
228 .L1565: r2=[i1++]; // oy
229
230 l1 = 0; // restore linear (non-circular) addressing for i1 before returning
231
232 (r7:4) = [sp++];
233 unlink;
234 rts;
235 DEFUN_END(yuv2rgb565_line)
236
237 DEFUN(yuv2rgb555_line,MEM,
238 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
239 link 0; // convert one line of planar YUV to 16-bit packed pixels (555 layout); r0=Y, r1=U, r2=V, the rest is read via fp
240 [--sp] = (r7:4); // r4-r7 are used as y/u/v working registers and restored on exit
241 p1 = [fp+ARG_OUT]; // p1 = out, the store pointer for packed pixels
242 r3 = [fp+ARG_W]; // r3 = dW
243 // Each loop pass consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words (four 16-bit pixels) through p1.
244 i0 = r0; // i0 walks Y
245 i2 = r1; // i2 walks U (Cb)
246 i3 = r2; // i3 walks V (Cr)
247
248 r0 = [fp+ARG_COEFF];
249 i1 = r0; // i1 walks the coefficient table
250 b1 = i1; // circular buffer base = coeffs
251 l1 = COEFF_LEN; // circular length: i1 wraps after the 11 coefficient words
252 m0 = COEFF_REL_CY_OFF; // extra step used on the first gmask load so that i1 wraps round to cy
253 p0 = r3;
254
255 r0 = [i0++]; // 4Y (32-bit pre-read for the first pass)
256 r1.l = w[i2++]; // 2u
257 r1.h = w[i3++]; // 2v
258 p0 = p0>>2; // loop count = dW/4; a remainder of dW%4 pixels is not converted
259
260 lsetup (.L0555, .L1555) lc0 = p0;
261
262 /*
263 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
264 r0 -- used to load 4ys
265 r1 -- used to load 2us,2vs
266 r4 -- y3,y2
267 r5 -- y1,y0
268 r6 -- u1,u0
269 r7 -- v1,v0
270 */
271 r2=[i1++]; // oy
272 .L0555:
273 /*
274 rrrrrrrr gggggggg bbbbbbbb
275 5432109876543210
276 bbbbb >>3
277 gggggggg <<2
278 rrrrrrrr <<7
279 xrrrrrgggggbbbbb
280 */
281
282 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // r4,r5 = y bytes minus oy as 16-bit values; r3 = oc
283 (r7,r6) = byteop16m (r1:0, r3:2) (r); // r7,r6 = v and u bytes minus oc as 16-bit values
284 r5 = r5 << 2 (v); // y1,y0 (prescale by 4)
285 r4 = r4 << 2 (v); // y3,y2
286 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
287 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
288 /* Y' = y*cy */
289 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // a1 = pixel 1, a0 = pixel 0; r1 = crv
290
291 /* R = Y+ crv*(Cr-128) */
292 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // pixels 0 and 1 share v0
293 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the chroma term so a1/a0 hold Y' again; r5 = rmask
294 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // clip step of the formula (r0 holds the zero word); r1 = cbu
295 r2 = r2 >> 3 (v); // NOTE(review): the diagram above labels the >>3 field 'b' while this path is labelled R/rmask; the colour that lands here depends on the coefficient table - confirm against the caller
296 r3 = r2 & r5; // r3 starts accumulating the two packed pixels
297
298 /* B = Y+ cbu*(Cb-128) */
299 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); // pixels 0 and 1 share u0
300 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
301 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
302 r2 = r2 << 7 (v); // 555: top field sits one bit lower than in the 565 routine
303 r2 = r2 & r5;
304 r3 = r3 | r2;
305
306 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
307 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
308 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
309 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; the m0 step makes the next i1 load fetch cy
310 r2 = r2 << 2 (v); // 555: middle field is shifted by 2 (3 in the 565 routine)
311 r2 = r2 & r5;
312 r3 = r3 | r2;
313 [p1++]=r3 || r1=[i1++]; // store pixels 0,1; r1 = cy
314
315 /* Y' = y*cy */
316 // second half: same sequence for pixels 2,3 using r4 (y3,y2) and the high halves u1/v1
317 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
318
319 /* R = Y+ crv*(Cr-128) */
320 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
322 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
323 r2 = r2 >> 3 (v);
324 r3 = r2 & r5;
325
326 /* B = Y+ cbu*(Cb-128) */
327 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
329 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
330 r2 = r2 << 7 (v);
331 r2 = r2 & r5;
332 r3 = r3 | r2;
333
334 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
336 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask (last table word; i1 wraps back to oy)
337 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y pre-read for the next pass; also executed on the final pass, so 4 Y bytes past the converted range are read
338 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u (pre-read, likewise 2 bytes past the end on the final pass)
339 r2 = r2 & r5;
340 r3 = r3 | r2;
341 [p1++]=r3 || r1.h=w[i3++]; // store pixels 2,3; 2v pre-read
342
343 .L1555: r2=[i1++]; // oy
344
345 l1 = 0; // restore linear (non-circular) addressing for i1 before returning
346
347 (r7:4) = [sp++];
348 unlink;
349 rts;
350 DEFUN_END(yuv2rgb555_line)
351
352 DEFUN(yuv2rgb24_line,MEM,
353 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
354 link 0; // convert one line of planar YUV to 3-byte packed pixels; r0=Y, r1=U, r2=V, the rest is read via fp
355 [--sp] = (r7:4); // r4-r7 are used as y/u/v working registers and restored on exit
356 p1 = [fp+ARG_OUT]; // p1 = byte store pointer for the even pixel of each pair
357 r3 = [fp+ARG_W]; // r3 = dW
358 p2 = p1;
359 p2 += 3; // p2 = byte store pointer for the odd pixel, one 3-byte pixel after p1
360 // Each loop pass consumes 4 Y, 2 U and 2 V bytes and stores 12 bytes (four 3-byte pixels).
361 i0 = r0; // i0 walks Y
362 i2 = r1; // i2 walks U (Cb)
363 i3 = r2; // i3 walks V (Cr)
364
365 r0 = [fp+ARG_COEFF]; // coeff buffer
366 i1 = r0; // i1 walks the coefficient table
367 b1 = i1; // circular buffer base = coeffs
368 l1 = COEFF_LEN; // circular length: i1 wraps after the 11 coefficient words
369 m0 = COEFF_REL_CY_OFF; // extra step used on the first gmask load so that i1 wraps round to cy
370 p0 = r3;
371
372 r0 = [i0++]; // 4Y (32-bit pre-read for the first pass)
373 r1.l = w[i2++]; // 2u
374 r1.h = w[i3++]; // 2v
375 p0 = p0>>2; // loop count = dW/4; a remainder of dW%4 pixels is not converted
376
377 lsetup (.L0888, .L1888) lc0 = p0;
378
379 /*
380 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
381 r0 -- used to load 4ys
382 r1 -- used to load 2us,2vs
383 r4 -- y3,y2
384 r5 -- y1,y0
385 r6 -- u1,u0
386 r7 -- v1,v0
387 */
388 r2=[i1++]; // oy
389 .L0888:
390 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // r4,r5 = y bytes minus oy as 16-bit values; r3 = oc
391 (r7,r6) = byteop16m (r1:0, r3:2) (r); // r7,r6 = v and u bytes minus oc as 16-bit values
392 r5 = r5 << 2 (v); // y1,y0 (prescale by 4)
393 r4 = r4 << 2 (v); // y3,y2
394 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
395 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
396
397 /* Y' = y*cy */
398 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // a1 = pixel 1, a0 = pixel 0; r1 = crv
399
400 /* R = Y+ crv*(Cr-128) */
401 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // pixels 0 and 1 share v0
402 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask: r5 is not read afterwards in this routine, the load only steps i1
403 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // clip step of the formula (r0 holds the zero word); r1 = cbu
404 r2=r2>>16 || B[p1++]=r2; // low byte (pixel 0) goes out via p1 while the high half is shifted down
405 B[p2++]=r2; // pixel 1 byte goes out via p2
406
407 /* B = Y+ cbu*(Cb-128) */
408 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (unused, steps i1)
410 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // B is parked in r3 and written after G; r1 = cgu
411
412 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
413 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
414 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
415 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
416
417 r2=r2>>16 || B[p1++]=r2; // second byte of pixel 0 (G)
418 B[p2++]=r2; // second byte of pixel 1
419
420 r3=r3>>16 || B[p1++]=r3; // third byte of pixel 0 (the parked B)
421 B[p2++]=r3 || r1=[i1++]; // third byte of pixel 1; r1 = cy
422
423 p1+=3; // skip over the pixel just written through p2
424 p2+=3; // skip over the next pixel that p1 will write
425 /* Y' = y*cy */
426 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // pixels 2,3 (r4 = y3,y2); r1 = crv
427
428 /* R = Y+ crv*(Cr-128) */
429 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); // pixels 2 and 3 share v1
430 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
431 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
432 r2=r2>>16 || B[p1++]=r2;
433 B[p2++]=r2;
434
435 /* B = Y+ cbu*(Cb-128) */
436 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
438 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
439
440 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
442 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
443 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask (last table word; i1 wraps back to oy)
444 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y pre-read for the next pass; also executed on the final pass, so 4 Y bytes past the converted range are read
445 B[p2++]=r2 || r1.l = w[i2++]; // 2u
446 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
447 B[p2++]=r3 || r2=[i1++]; // oy
448
449 p1+=3;
450 .L1888: p2+=3;
451
452 l1 = 0; // restore linear (non-circular) addressing for i1 before returning
453
454 (r7:4) = [sp++];
455 unlink;
456 rts;
457 DEFUN_END(yuv2rgb24_line)
458
459
460
461 #define ARG_vdst 20 /* FP-relative offsets (after `link 0`) of the 4th..9th arguments of uyvytoyv12 / yuyvtoyv12 */
462 #define ARG_width 24 /* int width */
463 #define ARG_height 28 /* int height */
464 #define ARG_lumStride 32 /* int lumStride */
465 #define ARG_chromStride 36 /* int chromStride */
466 #define ARG_srcStride 40 /* int srcStride */
467
468 DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469 int width, int height,
470 int lumStride, int chromStride, int srcStride)):
471 link 0; // r0=src, r1=ydst, r2=udst; vdst, width, height and the strides are read via fp
472 [--sp] = (r7:4,p5:4);
473 // Packed UYVY -> planar YV12, two source lines per outer pass: luma of both lines is copied out, chroma is averaged between the two lines with byteop1p. Handles width/4 groups of 4 pixels and height/2 line pairs; a remainder (width%4 pixels, an odd last line) is not converted.
474 p0 = r1; // Y top even
475
476 i2 = r2; // *u
477 r2 = [fp + ARG_vdst];
478 i3 = r2; // *v
479
480 r1 = [fp + ARG_srcStride];
481 r2 = r0 + r1;
482 i0 = r0; // uyvy_T even
483 i1 = r2; // uyvy_B odd
484
485 p2 = [fp + ARG_lumStride];
486 p1 = p0 + p2; // Y bot odd
487
488 p5 = [fp + ARG_width];
489 p4 = [fp + ARG_height];
490 r0 = p5; p2 = p2 << 1; p2 -= p5; // p2 = 2*lumStride - width: p0/p1 have already advanced by width when it is added, so they land exactly two luma rows further on (the old lumStride step was only right when lumStride == width)
491 p4 = p4 >> 1; // outer count: line pairs
492 p5 = p5 >> 2; // inner count: groups of 4 pixels (8 source bytes per line)
493
494 r2 = r0 << 1;
495 r1 = r1 << 1;
496 r1 = r1 - r2; // srcStride + (srcStride - 2*width)
497 r1 += -8; // i0,i1 is pre read need to correct
498 m0 = r1; // source step from the end of one line pair to the start of the next
499
500 r2 = [fp + ARG_chromStride];
501 r0 = r0 >> 1;
502 r2 = r2 - r0;
503 m1 = r2; // chroma step at end of row: chromStride - width/2
504
505 /* I0,I1 - src input line pointers
506 * p0,p1 - luma output line pointers
507 * I2 - dstU
508 * I3 - dstV
509 */
510
511 lsetup (0f, 1f) lc1 = p4; // H/2
512 0: r0 = [i0++] || r2 = [i1++]; // pre-read 8 bytes (4 pixels) of the top line into r1:0 and of the bottom line into r3:2
513 r1 = [i0++] || r3 = [i1++];
514 r4 = byteop1p(r1:0, r3:2); // byte-wise top/bottom average, first word; only the chroma bytes are used later
515 r5 = byteop1p(r1:0, r3:2) (r); // same for the second word
516 lsetup (2f, 3f) lc0 = p5; // W/4
517 2: r0 = r0 >> 8(v); // bring the luma byte of each halfword down to the low byte
518 r1 = r1 >> 8(v);
519 r2 = r2 >> 8(v);
520 r3 = r3 >> 8(v);
521 r0 = bytepack(r0, r1); // four top-line luma bytes
522 r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
523 r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
524 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // pre-read the next group; on the last pass this reads 8 bytes beyond the row (m0 compensates with -8)
525 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6.l = two averaged U bytes, r6.h = two averaged V bytes
526 r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
527 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
528
529 i0 += m0;
530 i1 += m0;
531 i2 += m1;
532 i3 += m1;
533 p0 = p0 + p2; // advance to the next even luma row (p2 = 2*lumStride - width)
534 1: p1 = p1 + p2; // advance to the next odd luma row
535
536 (r7:4,p5:4) = [sp++];
537 unlink;
538 rts;
539 DEFUN_END(uyvytoyv12)
540
541 DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
542 int width, int height,
543 int lumStride, int chromStride, int srcStride)):
544 link 0; // r0=src, r1=ydst, r2=udst; vdst, width, height and the strides are read via fp
545 [--sp] = (r7:4,p5:4);
546 // Packed YUYV -> planar YV12, two source lines per outer pass: luma of both lines is copied out, chroma is averaged between the two lines with byteop1p. Handles width/4 groups of 4 pixels and height/2 line pairs; a remainder (width%4 pixels, an odd last line) is not converted.
547 p0 = r1; // Y top even
548
549 i2 = r2; // *u
550 r2 = [fp + ARG_vdst];
551 i3 = r2; // *v
552
553 r1 = [fp + ARG_srcStride];
554 r2 = r0 + r1;
555
556 i0 = r0; // yuyv_T even
557 i1 = r2; // yuyv_B odd
558
559 p2 = [fp + ARG_lumStride];
560 p1 = p0 + p2; // Y bot odd
561
562 p5 = [fp + ARG_width];
563 p4 = [fp + ARG_height];
564 r0 = p5; p2 = p2 << 1; p2 -= p5; // p2 = 2*lumStride - width: p0/p1 have already advanced by width when it is added, so they land exactly two luma rows further on (the old lumStride step was only right when lumStride == width)
565 p4 = p4 >> 1; // outer count: line pairs
566 p5 = p5 >> 2; // inner count: groups of 4 pixels (8 source bytes per line)
567
568 r2 = r0 << 1;
569 r1 = r1 << 1;
570 r1 = r1 - r2; // srcStride + (srcStride - 2*width)
571 r1 += -8; // i0,i1 is pre read need to correct
572 m0 = r1; // source step from the end of one line pair to the start of the next
573
574 r2 = [fp + ARG_chromStride];
575 r0 = r0 >> 1;
576 r2 = r2 - r0;
577 m1 = r2; // chroma step at end of row: chromStride - width/2
578
579 /* I0,I1 - src input line pointers
580 * p0,p1 - luma output line pointers
581 * I2 - dstU
582 * I3 - dstV
583 */
584
585 lsetup (0f, 1f) lc1 = p4; // H/2
586 0: r0 = [i0++] || r2 = [i1++]; // pre-read 8 bytes (4 pixels) of the top line into r1:0 and of the bottom line into r3:2
587 r1 = [i0++] || r3 = [i1++];
588 r4 = bytepack(r0, r1); // luma sits in the low byte of each halfword: four top-line luma bytes
589 r5 = bytepack(r2, r3); // four bottom-line luma bytes
590 lsetup (2f, 3f) lc0 = p5; // W/4
591 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
592 r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
593 r2 = r2 >> 8(v); // after the shifts the low byte of each halfword is a chroma byte
594 r3 = r3 >> 8(v);
595 r4 = byteop1p(r1:0, r3:2); // byte-wise top/bottom chroma average, first word
596 r5 = byteop1p(r1:0, r3:2) (r); // same for the second word
597 r6 = pack(r5.l, r4.l);
598 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // pre-read the next group; on the last pass this reads 8 bytes beyond the row (m0 compensates with -8)
599 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6.l = two averaged U bytes, r6.h = two averaged V bytes
600 r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
601 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
602
603 i0 += m0;
604 i1 += m0;
605 i2 += m1;
606 i3 += m1;
607 p0 = p0 + p2; // advance to the next even luma row (p2 = 2*lumStride - width)
608 1: p1 = p1 + p2; // advance to the next odd luma row
609
610 (r7:4,p5:4) = [sp++];
611 unlink;
612 rts;
613 DEFUN_END(yuyvtoyv12)