Add missing const qualifiers to AltiVec function parameters where appropriate.
[libav.git] / libswscale / ppc / yuv2rgb_altivec.c
CommitLineData
a31de956 1/*
7a4d5e17
DB
2 * AltiVec acceleration for colorspace conversion
3 *
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5 *
6 * This file is part of FFmpeg.
7 *
ee8ee340
DB
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
7a4d5e17
DB
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ee8ee340
DB
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
7a4d5e17 17 *
ee8ee340
DB
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
7a4d5e17
DB
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
a31de956 22
7a4d5e17 23/*
8a322796
DB
24Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28Reject defers to C simulation code.
a31de956 29
8a322796 30Lots of optimizations to be done here.
a31de956 31
8a322796
DB
321. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
a31de956 34
8a322796 352. The inefficient use of chroma loading needs a bit of brushing up.
a31de956 36
8a322796
DB
373. Analysis of pipeline stalls needs to be done. Use shark to identify
38 pipeline stalls.
a31de956
MN
39
40
4bdc44c7 41MODIFIED to calculate coeffs from currently selected color space.
8a322796
DB
42MODIFIED core to be a macro where you specify the output format.
43ADDED UYVY conversion which is never called due to something in swscale.
4bdc44c7 44CORRECTED algorithm selection to be strict on input formats.
8a322796 45ADDED runtime detection of AltiVec.
a31de956 46
4bdc44c7 47ADDED altivec_yuv2packedX vertical scl + RGB converter
a31de956 48
4bdc44c7
DB
49March 27,2004
50PERFORMANCE ANALYSIS
a31de956 51
8a322796
DB
52The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53used as test.
54The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55same sequence.
a31de956 56
8a322796 57720 * 480 * 30 ~10MPS
a31de956 58
8a322796
DB
59so we have roughly 10 clocks per pixel. This is too high, something has
60to be wrong.
a31de956 61
8a322796
DB
62OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63need for vec_min.
a31de956 64
8a322796
DB
65OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66the input video frame, it was just decompressed so it probably resides in L1
67caches. However, we are creating the output video stream. This needs to use the
68DSTST instruction to optimize for the cache. We couple this with the fact that
69we are not going to be visiting the input buffer again so we mark it Least
70Recently Used. This shaves 25% of the processor cycles off.
a31de956 71
8a322796 72Now memcpy is the largest mips consumer in the system, probably due
4bdc44c7 73to the inefficient X11 stuff.
a31de956 74
4bdc44c7
DB
75GL libraries seem to be very slow on this machine 1.33Ghz PB running
76Jaguar, this is not the case for my 1Ghz PB. I thought it might be
8a322796
DB
77a versioning issue, however I have libGL.1.2.dylib for both
78machines. (We need to figure this out now.)
a31de956 79
8a322796 80GL2 libraries work now with patch for RGB32.
a31de956 81
8a322796 82NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
a31de956 83
8a322796
DB
84Integrated luma prescaling adjustment for saturation/contrast/brightness
85adjustment.
d026b45e 86*/
a31de956 87
a31de956
MN
88#include <stdio.h>
89#include <stdlib.h>
84fdd642 90#include <string.h>
a31de956
MN
91#include <inttypes.h>
92#include <assert.h>
93#include "config.h"
befa8e66
RP
94#include "libswscale/rgb2rgb.h"
95#include "libswscale/swscale.h"
96#include "libswscale/swscale_internal.h"
a31de956
MN
97
98#undef PROFILE_THE_BEAST
99#undef INC_SCALING
100
101typedef unsigned char ubyte;
102typedef signed char sbyte;
103
104
105/* RGB interleaver, 16 planar pels 8-bit samples per channel in
106 homogeneous vector registers x0,x1,x2 are interleaved with the
107 following technique:
108
109 o0 = vec_mergeh (x0,x1);
110 o1 = vec_perm (o0, x2, perm_rgb_0);
111 o2 = vec_perm (o0, x2, perm_rgb_1);
112 o3 = vec_mergel (x0,x1);
113 o4 = vec_perm (o3,o2,perm_rgb_2);
114 o5 = vec_perm (o3,o2,perm_rgb_3);
115
116 perm_rgb_0: o0(RG).h v1(B) --> o1*
117 0 1 2 3 4
118 rgbr|gbrg|brgb|rgbr
119 0010 0100 1001 0010
120 0102 3145 2673 894A
121
122 perm_rgb_1: o0(RG).h v1(B) --> o2
123 0 1 2 3 4
124 gbrg|brgb|bbbb|bbbb
125 0100 1001 1111 1111
126 B5CD 6EF7 89AB CDEF
127
128 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
129 0 1 2 3 4
130 gbrg|brgb|rgbr|gbrg
131 1111 1111 0010 0100
132 89AB CDEF 0182 3945
133
134 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
135 0 1 2 3 4
136 brgb|rgbr|gbrg|brgb
137 1001 0010 0100 1001
138 a67b 89cA BdCD eEFf
139
140*/
141static
142const vector unsigned char
f22e5e22
DB
143 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
144 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
145 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
146 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
147 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
148 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
149 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
150 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
42809816
DB
151
/* Interleave three 16-byte planar channel vectors into three packed vectors.
   The bytes come out in the order x0,x1,x2,x0,x1,x2,... (note the reversed
   parameter list); y0, y1 and y2 receive the first, second and third 16
   bytes of the 48-byte result.  Uses the perm_rgb_* tables defined above. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                     \
do {                                                      \
    __typeof__(x0) pair_hi, carry, pair_lo;               \
    pair_hi = vec_mergeh ((x0), (x1));                    \
    (y0)    = vec_perm (pair_hi, (x2), perm_rgb_0);       \
    carry   = vec_perm (pair_hi, (x2), perm_rgb_1);       \
    pair_lo = vec_mergel ((x0), (x1));                    \
    (y1)    = vec_perm (pair_lo, carry, perm_rgb_2);      \
    (y2)    = vec_perm (pair_lo, carry, perm_rgb_3);      \
} while(0)
162
42809816
DB
/* Interleave x0,x1,x2 with vec_merge3 and store the three resulting vectors
   at ptr, advancing ptr by three vectors.  ptr must be a modifiable lvalue.
   No trailing semicolon after while (0): the caller supplies it, so the
   macro behaves as a single statement (e.g. inside if/else). */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, (ptr)++);            \
    vec_st (_1, 0, (ptr)++);            \
    vec_st (_2, 0, (ptr)++);            \
} while (0)
171
42809816
DB
/* Same as vec_mstbgr24 but passes the channels to vec_merge3 in reverse
   order (x2,x1,x0), then stores three vectors at ptr and advances ptr by
   three vectors.  ptr must be a modifiable lvalue.
   No trailing semicolon after while (0): the caller supplies it, so the
   macro behaves as a single statement (e.g. inside if/else). */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, (ptr)++);            \
    vec_st (_1, 0, (ptr)++);            \
    vec_st (_2, 0, (ptr)++);            \
} while (0)
180
181/* pack the pixels in rgb0 format
182 msb R
183 lsb 0
184*/
42809816
DB
/* Interleave four 16-byte channel vectors x0..x3 (each of type T) into 16
   four-byte pixels x0,x1,x2,x3 and store them as four vectors at ptr, then
   advance ptr by four vectors.  ptr must be a modifiable lvalue.
   No trailing semicolon after while (0): the caller supplies it, so the
   macro behaves as a single statement (e.g. inside if/else). */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* first eight pixels: byte pairs, then 16-bit pairs */                   \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)(ptr));                                            \
    vec_st (_3, 1*16, (T *)(ptr));                                            \
    /* last eight pixels */                                                   \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)(ptr));                                            \
    vec_st (_3, 3*16, (T *)(ptr));                                            \
    (ptr) += 4;                                                               \
} while (0)
202
203/*
204
205 | 1 0 1.4021 | | Y |
206 | 1 -0.3441 -0.7142 |x| Cb|
42809816 207 | 1 1.7718 0 | | Cr|
a31de956
MN
208
209
210 Y: [-128 127]
211 Cb/Cr : [-128 127]
212
213 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
214
215*/
216
217
218
219
/* vec_unh(x): widen bytes 0..7 of x to eight 16-bit lanes.  Permute index
   0x10 selects byte 0 of the second operand, which is the all-zero vector,
   so each result lane is a zero byte followed by one byte of x. */
220#define vec_unh(x) \
42809816 221 (vector signed short) \
9655ffb5 222 vec_perm(x,(__typeof__(x)){0}, \
14b83f9a
GP
223 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
224 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
/* vec_unl(x): same widening as vec_unh, applied to bytes 8..15 of x. */
a31de956 225#define vec_unl(x) \
42809816 226 (vector signed short) \
9655ffb5 227 vec_perm(x,(__typeof__(x)){0}, \
14b83f9a
GP
228 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
229 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
a31de956 230
/* vec_clip_s16(x): clamp every signed 16-bit lane of x to the range [16,235]. */
cbddd5df 231#define vec_clip_s16(x) \
14b83f9a
GP
232 vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
233 ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
a31de956
MN
234
/* vec_packclp(x,y): raise negative lanes of x and y to 0 with vec_max, then
   pack the sixteen lanes into one vector of unsigned bytes with vec_packs
   (a saturating pack, so values above 255 become 255). */
235#define vec_packclp(x,y) \
42809816 236 (vector unsigned char)vec_packs \
14b83f9a
GP
237 ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
238 (vector unsigned short)vec_max (y,((vector signed short) {0})))
a31de956 239
68363b69 240//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
a31de956
MN
241
242
84fdd642 243static inline void cvtyuvtoRGB (SwsContext *c,
42809816
DB
244 vector signed short Y, vector signed short U, vector signed short V,
245 vector signed short *R, vector signed short *G, vector signed short *B)
a31de956 246{
42809816 247 vector signed short vx,ux,uvx;
a31de956 248
42809816
DB
249 Y = vec_mradds (Y, c->CY, c->OY);
250 U = vec_sub (U,(vector signed short)
f22e5e22 251 vec_splat((vector signed short){128},0));
42809816 252 V = vec_sub (V,(vector signed short)
f22e5e22 253 vec_splat((vector signed short){128},0));
a31de956 254
42809816
DB
255 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
256 ux = vec_sl (U, c->CSHIFT);
257 *B = vec_mradds (ux, c->CBU, Y);
a31de956 258
42809816
DB
259 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
260 vx = vec_sl (V, c->CSHIFT);
261 *R = vec_mradds (vx, c->CRV, Y);
a31de956 262
42809816
DB
263 // uvx = ((CGU*u) + (CGV*v))>>15;
264 uvx = vec_mradds (U, c->CGU, Y);
265 *G = vec_mradds (V, c->CGV, uvx);
a31de956
MN
266}
267
268
269/*
270 ------------------------------------------------------------------------------
271 CS converters
272 ------------------------------------------------------------------------------
273*/
274
275
42809816
DB
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *     static int altivec_<name>(SwsContext *c, unsigned char **in,
 *                               int *instrides, int srcSliceY, int srcSliceH,
 *                               unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar 4:2:0 input (in[0]=Y, in[1]=U, in[2]=V)
 * to packed output written with the out_pixels(R,G,B,ptr) store macro.
 * Two rows are produced per outer iteration (they share one chroma row) and
 * 16 pixels per inner iteration, so only c->srcW/16 groups and srcSliceH/2
 * row pairs are processed.  Returns srcSliceH.
 *
 * The output row pointers are recomputed from outstrides[0] for every row
 * pair.  Advancing the running pointers by one stride after the row has been
 * written (as was done before) is only correct when the stride equals the
 * number of bytes written per row.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                           unsigned char **in, int *instrides,             \
                           int srcSliceY, int srcSliceH,                   \
                           unsigned char **oplanes, int *outstrides)       \
{                                                                          \
    int w = c->srcW;                                                       \
    int h = srcSliceH;                                                     \
    int i,j;                                                               \
    int instrides_scl[3];                                                  \
    vector unsigned char y0,y1;                                            \
                                                                           \
    vector signed char  u,v;                                               \
                                                                           \
    vector signed short Y0,Y1,Y2,Y3;                                       \
    vector signed short U,V;                                               \
    vector signed short vx,ux,uvx;                                         \
    vector signed short vx0,ux0,uvx0;                                      \
    vector signed short vx1,ux1,uvx1;                                      \
    vector signed short R0,G0,B0;                                          \
    vector signed short R1,G1,B1;                                          \
    vector unsigned char R,G,B;                                            \
                                                                           \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                     \
    vector unsigned char align_perm;                                       \
    vector unsigned char *oute, *outo;                                     \
                                                                           \
    vector signed short                                                    \
        lCY  = c->CY,                                                      \
        lOY  = c->OY,                                                      \
        lCRV = c->CRV,                                                     \
        lCBU = c->CBU,                                                     \
        lCGU = c->CGU,                                                     \
        lCGV = c->CGV;                                                     \
                                                                           \
    vector unsigned short lCSHIFT = c->CSHIFT;                             \
                                                                           \
    ubyte *y1i = in[0];                                                    \
    ubyte *y2i = in[0]+instrides[0];                                       \
    ubyte *ui  = in[1];                                                    \
    ubyte *vi  = in[2];                                                    \
                                                                           \
    instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */    \
                                                                           \
    for (i=0;i<h/2;i++) {                                                  \
        /* even and odd output rows of this pair, from the real stride */  \
        oute = (vector unsigned char *)                                    \
               (oplanes[0]+(srcSliceY+2*i)*outstrides[0]);                 \
        outo = (vector unsigned char *)                                    \
               (oplanes[0]+(srcSliceY+2*i+1)*outstrides[0]);               \
                                                                           \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);             \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);             \
                                                                           \
        for (j=0;j<w/16;j++) {                                             \
                                                                           \
            y1ivP = (vector unsigned char *)y1i;                           \
            y2ivP = (vector unsigned char *)y2i;                           \
            uivP  = (vector unsigned char *)ui;                            \
            vivP  = (vector unsigned char *)vi;                            \
                                                                           \
            /* unaligned loads: two aligned vectors + lvsl permute */      \
            align_perm = vec_lvsl (0, y1i);                                \
            y0 = (vector unsigned char)                                    \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);                \
                                                                           \
            align_perm = vec_lvsl (0, y2i);                                \
            y1 = (vector unsigned char)                                    \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);                \
                                                                           \
            align_perm = vec_lvsl (0, ui);                                 \
            u = (vector signed char)                                       \
                vec_perm (uivP[0], uivP[1], align_perm);                   \
                                                                           \
            align_perm = vec_lvsl (0, vi);                                 \
            v = (vector signed char)                                       \
                vec_perm (vivP[0], vivP[1], align_perm);                   \
                                                                           \
            u = (vector signed char)                                       \
                vec_sub (u,(vector signed char)                            \
                         vec_splat((vector signed char){128},0));          \
            v = (vector signed char)                                       \
                vec_sub (v,(vector signed char)                            \
                         vec_splat((vector signed char){128},0));          \
                                                                           \
            /* only the first 8 chroma samples are needed for 16 pixels */ \
            U = vec_unpackh (u);                                           \
            V = vec_unpackh (v);                                           \
                                                                           \
            Y0 = vec_unh (y0);                                             \
            Y1 = vec_unl (y0);                                             \
            Y2 = vec_unh (y1);                                             \
            Y3 = vec_unl (y1);                                             \
                                                                           \
            Y0 = vec_mradds (Y0, lCY, lOY);                                \
            Y1 = vec_mradds (Y1, lCY, lOY);                                \
            Y2 = vec_mradds (Y2, lCY, lOY);                                \
            Y3 = vec_mradds (Y3, lCY, lOY);                                \
                                                                           \
            /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */                        \
            ux = vec_sl (U, lCSHIFT);                                      \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});          \
            ux0 = vec_mergeh (ux,ux);                                      \
            ux1 = vec_mergel (ux,ux);                                      \
                                                                           \
            /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */                       \
            vx = vec_sl (V, lCSHIFT);                                      \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});          \
            vx0 = vec_mergeh (vx,vx);                                      \
            vx1 = vec_mergel (vx,vx);                                      \
                                                                           \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                            \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});          \
            uvx = vec_mradds (V, lCGV, uvx);                               \
            uvx0 = vec_mergeh (uvx,uvx);                                   \
            uvx1 = vec_mergel (uvx,uvx);                                   \
                                                                           \
            /* even row */                                                 \
            R0 = vec_add (Y0,vx0);                                         \
            G0 = vec_add (Y0,uvx0);                                        \
            B0 = vec_add (Y0,ux0);                                         \
            R1 = vec_add (Y1,vx1);                                         \
            G1 = vec_add (Y1,uvx1);                                        \
            B1 = vec_add (Y1,ux1);                                         \
                                                                           \
            R = vec_packclp (R0,R1);                                       \
            G = vec_packclp (G0,G1);                                       \
            B = vec_packclp (B0,B1);                                       \
                                                                           \
            out_pixels(R,G,B,oute);                                        \
                                                                           \
            /* odd row, same chroma */                                     \
            R0 = vec_add (Y2,vx0);                                         \
            G0 = vec_add (Y2,uvx0);                                        \
            B0 = vec_add (Y2,ux0);                                         \
            R1 = vec_add (Y3,vx1);                                         \
            G1 = vec_add (Y3,uvx1);                                        \
            B1 = vec_add (Y3,ux1);                                         \
            R = vec_packclp (R0,R1);                                       \
            G = vec_packclp (G0,G1);                                       \
            B = vec_packclp (B0,B1);                                       \
                                                                           \
            out_pixels(R,G,B,outo);                                        \
                                                                           \
            y1i += 16;                                                     \
            y2i += 16;                                                     \
            ui  += 8;                                                      \
            vi  += 8;                                                      \
                                                                           \
        }                                                                  \
                                                                           \
        ui  += instrides_scl[1];                                           \
        vi  += instrides_scl[2];                                           \
        y1i += instrides_scl[0];                                           \
        y2i += instrides_scl[0];                                           \
    }                                                                      \
    return srcSliceH;                                                      \
}
439
440
68363b69
RD
/* Store macros usable as the out_pixels argument of DEFCSP420_CVT: a,b,c are
   the R,G,B byte vectors and ptr is the output pointer, which is advanced.
   The constant alpha channel is splatted to all 16 lanes; a bare
   (__typeof__(a)){255} literal would set only lane 0 and zero the rest. */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
a31de956 447
340ea251 448DEFCSP420_CVT (yuv2_abgr, out_abgr)
582552fb 449#if 1
340ea251 450DEFCSP420_CVT (yuv2_bgra, out_bgra)
582552fb 451#else
6a4970ab 452static int altivec_yuv2_bgra32 (SwsContext *c,
42809816
DB
453 unsigned char **in, int *instrides,
454 int srcSliceY, int srcSliceH,
455 unsigned char **oplanes, int *outstrides)
6a4970ab 456{
42809816
DB
457 int w = c->srcW;
458 int h = srcSliceH;
459 int i,j;
460 int instrides_scl[3];
461 vector unsigned char y0,y1;
462
463 vector signed char u,v;
464
465 vector signed short Y0,Y1,Y2,Y3;
466 vector signed short U,V;
467 vector signed short vx,ux,uvx;
468 vector signed short vx0,ux0,uvx0;
469 vector signed short vx1,ux1,uvx1;
470 vector signed short R0,G0,B0;
471 vector signed short R1,G1,B1;
472 vector unsigned char R,G,B;
473
474 vector unsigned char *uivP, *vivP;
475 vector unsigned char align_perm;
476
477 vector signed short
478 lCY = c->CY,
479 lOY = c->OY,
480 lCRV = c->CRV,
481 lCBU = c->CBU,
482 lCGU = c->CGU,
483 lCGV = c->CGV;
484
485 vector unsigned short lCSHIFT = c->CSHIFT;
486
487 ubyte *y1i = in[0];
488 ubyte *y2i = in[0]+w;
489 ubyte *ui = in[1];
490 ubyte *vi = in[2];
491
492 vector unsigned char *oute
493 = (vector unsigned char *)
494 (oplanes[0]+srcSliceY*outstrides[0]);
495 vector unsigned char *outo
496 = (vector unsigned char *)
497 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
498
499
500 instrides_scl[0] = instrides[0];
501 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
502 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
503
504
505 for (i=0;i<h/2;i++) {
506 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
507 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
508
509 for (j=0;j<w/16;j++) {
510
511 y0 = vec_ldl (0,y1i);
512 y1 = vec_ldl (0,y2i);
513 uivP = (vector unsigned char *)ui;
514 vivP = (vector unsigned char *)vi;
515
516 align_perm = vec_lvsl (0, ui);
517 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
518
519 align_perm = vec_lvsl (0, vi);
520 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
521 u = (vector signed char)
522 vec_sub (u,(vector signed char)
f22e5e22 523 vec_splat((vector signed char){128},0));
42809816
DB
524
525 v = (vector signed char)
526 vec_sub (v, (vector signed char)
f22e5e22 527 vec_splat((vector signed char){128},0));
42809816
DB
528
529 U = vec_unpackh (u);
530 V = vec_unpackh (v);
531
532
533 Y0 = vec_unh (y0);
534 Y1 = vec_unl (y0);
535 Y2 = vec_unh (y1);
536 Y3 = vec_unl (y1);
537
538 Y0 = vec_mradds (Y0, lCY, lOY);
539 Y1 = vec_mradds (Y1, lCY, lOY);
540 Y2 = vec_mradds (Y2, lCY, lOY);
541 Y3 = vec_mradds (Y3, lCY, lOY);
542
543 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
544 ux = vec_sl (U, lCSHIFT);
f22e5e22 545 ux = vec_mradds (ux, lCBU, (vector signed short){0});
42809816
DB
546 ux0 = vec_mergeh (ux,ux);
547 ux1 = vec_mergel (ux,ux);
548
549 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
550 vx = vec_sl (V, lCSHIFT);
f22e5e22 551 vx = vec_mradds (vx, lCRV, (vector signed short){0});
42809816
DB
552 vx0 = vec_mergeh (vx,vx);
553 vx1 = vec_mergel (vx,vx);
554 /* uvx = ((CGU*u) + (CGV*v))>>15 */
f22e5e22 555 uvx = vec_mradds (U, lCGU, (vector signed short){0});
42809816
DB
556 uvx = vec_mradds (V, lCGV, uvx);
557 uvx0 = vec_mergeh (uvx,uvx);
558 uvx1 = vec_mergel (uvx,uvx);
559 R0 = vec_add (Y0,vx0);
560 G0 = vec_add (Y0,uvx0);
561 B0 = vec_add (Y0,ux0);
562 R1 = vec_add (Y1,vx1);
563 G1 = vec_add (Y1,uvx1);
564 B1 = vec_add (Y1,ux1);
565 R = vec_packclp (R0,R1);
566 G = vec_packclp (G0,G1);
567 B = vec_packclp (B0,B1);
568
569 out_argb(R,G,B,oute);
570 R0 = vec_add (Y2,vx0);
571 G0 = vec_add (Y2,uvx0);
572 B0 = vec_add (Y2,ux0);
573 R1 = vec_add (Y3,vx1);
574 G1 = vec_add (Y3,uvx1);
575 B1 = vec_add (Y3,ux1);
576 R = vec_packclp (R0,R1);
577 G = vec_packclp (G0,G1);
578 B = vec_packclp (B0,B1);
579
580 out_argb(R,G,B,outo);
581 y1i += 16;
582 y2i += 16;
583 ui += 8;
584 vi += 8;
6a4970ab 585
42809816 586 }
6a4970ab 587
42809816
DB
588 outo += (outstrides[0])>>4;
589 oute += (outstrides[0])>>4;
6a4970ab 590
42809816
DB
591 ui += instrides_scl[1];
592 vi += instrides_scl[2];
593 y1i += instrides_scl[0];
594 y2i += instrides_scl[0];
595 }
596 return srcSliceH;
582552fb
LB
597}
598
599#endif
600
601
340ea251
AC
602DEFCSP420_CVT (yuv2_rgba, out_rgba)
603DEFCSP420_CVT (yuv2_argb, out_argb)
a31de956
MN
604DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
605DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
606
607
608// uyvy|uyvy|uyvy|uyvy
609// 0123 4567 89ab cdef
610static
611const vector unsigned char
f22e5e22 612 demux_u = {0x10,0x00,0x10,0x00,
6b83bb1e
DB
613 0x10,0x04,0x10,0x04,
614 0x10,0x08,0x10,0x08,
f22e5e22
DB
615 0x10,0x0c,0x10,0x0c},
616 demux_v = {0x10,0x02,0x10,0x02,
6b83bb1e
DB
617 0x10,0x06,0x10,0x06,
618 0x10,0x0A,0x10,0x0A,
f22e5e22
DB
619 0x10,0x0E,0x10,0x0E},
620 demux_y = {0x10,0x01,0x10,0x03,
6b83bb1e
DB
621 0x10,0x05,0x10,0x07,
622 0x10,0x09,0x10,0x0B,
f22e5e22 623 0x10,0x0D,0x10,0x0F};
a31de956
MN
624
625/*
626 this is so I can play live CCIR raw video
627*/
628static int altivec_uyvy_rgb32 (SwsContext *c,
42809816
DB
629 unsigned char **in, int *instrides,
630 int srcSliceY, int srcSliceH,
631 unsigned char **oplanes, int *outstrides)
a31de956 632{
42809816
DB
633 int w = c->srcW;
634 int h = srcSliceH;
635 int i,j;
636 vector unsigned char uyvy;
637 vector signed short Y,U,V;
638 vector signed short R0,G0,B0,R1,G1,B1;
639 vector unsigned char R,G,B;
640 vector unsigned char *out;
641 ubyte *img;
a31de956 642
42809816
DB
643 img = in[0];
644 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
a31de956 645
42809816
DB
646 for (i=0;i<h;i++) {
647 for (j=0;j<w/16;j++) {
648 uyvy = vec_ld (0, img);
649 U = (vector signed short)
f22e5e22 650 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
a31de956 651
42809816 652 V = (vector signed short)
f22e5e22 653 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
a31de956 654
42809816 655 Y = (vector signed short)
f22e5e22 656 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
a31de956 657
42809816 658 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
a31de956 659
42809816
DB
660 uyvy = vec_ld (16, img);
661 U = (vector signed short)
f22e5e22 662 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
a31de956 663
42809816 664 V = (vector signed short)
f22e5e22 665 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
a31de956 666
42809816 667 Y = (vector signed short)
f22e5e22 668 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
a31de956 669
42809816 670 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
a31de956 671
42809816
DB
672 R = vec_packclp (R0,R1);
673 G = vec_packclp (G0,G1);
674 B = vec_packclp (B0,B1);
a31de956 675
42809816
DB
676 // vec_mstbgr24 (R,G,B, out);
677 out_rgba (R,G,B,out);
a31de956 678
42809816
DB
679 img += 32;
680 }
a31de956 681 }
42809816 682 return srcSliceH;
a31de956
MN
683}
684
685
686
687/* Ok currently the acceleration routine only supports
688 inputs of widths a multiple of 16
689 and heights a multiple of 2
690
691 So we just fall back to the C codes for this.
692*/
780daf2b 693SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
a31de956 694{
42809816
DB
695 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
696 return NULL;
a31de956 697
42809816
DB
698 /*
699 and this seems not to matter too much I tried a bunch of
bee972ee 700 videos with abnormal widths and MPlayer crashes elsewhere.
42809816
DB
701 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
702 boom with X11 bad match.
a31de956 703
42809816
DB
704 */
705 if ((c->srcW & 0xf) != 0) return NULL;
706
707 switch (c->srcFormat) {
708 case PIX_FMT_YUV410P:
709 case PIX_FMT_YUV420P:
710 /*case IMGFMT_CLPL: ??? */
711 case PIX_FMT_GRAY8:
712 case PIX_FMT_NV12:
713 case PIX_FMT_NV21:
714 if ((c->srcH & 0x1) != 0)
715 return NULL;
716
717 switch(c->dstFormat){
718 case PIX_FMT_RGB24:
719 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
720 return altivec_yuv2_rgb24;
721 case PIX_FMT_BGR24:
722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
723 return altivec_yuv2_bgr24;
724 case PIX_FMT_ARGB:
725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
726 return altivec_yuv2_argb;
727 case PIX_FMT_ABGR:
728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
729 return altivec_yuv2_abgr;
730 case PIX_FMT_RGBA:
731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
732 return altivec_yuv2_rgba;
733 case PIX_FMT_BGRA:
734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
735 return altivec_yuv2_bgra;
736 default: return NULL;
737 }
738 break;
739
740 case PIX_FMT_UYVY422:
741 switch(c->dstFormat){
742 case PIX_FMT_BGR32:
743 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
744 return altivec_uyvy_rgb32;
745 default: return NULL;
746 }
747 break;
748
749 }
750 return NULL;
a31de956
MN
751}
752
780daf2b 753void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
582552fb 754{
42809816
DB
755 union {
756 signed short tmp[8] __attribute__ ((aligned(16)));
757 vector signed short vec;
758 } buf;
759
e5091488 760 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
42809816
DB
761 buf.tmp[1] = -256*brightness; //oy
762 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
763 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
764 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
765 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
766
767
768 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
769 c->CY = vec_splat ((vector signed short)buf.vec, 0);
770 c->OY = vec_splat ((vector signed short)buf.vec, 1);
771 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
772 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
773 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
774 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
84fdd642 775#if 0
42809816
DB
776 {
777 int i;
778 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
779 for (i=0; i<6; i++)
780 printf("%s %d ", v[i],buf.tmp[i] );
781 printf("\n");
782 }
a31de956 783#endif
42809816 784 return;
a31de956
MN
785}
786
787
788void
780daf2b 789ff_yuv2packedX_altivec(SwsContext *c,
f1933e43
DB
790 const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
791 const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
42809816 792 uint8_t *dest, int dstW, int dstY)
a31de956 793{
42809816
DB
794 int i,j;
795 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
796 vector signed short R0,G0,B0,R1,G1,B1;
582552fb 797
42809816
DB
798 vector unsigned char R,G,B;
799 vector unsigned char *out,*nout;
a31de956 800
42809816
DB
801 vector signed short RND = vec_splat_s16(1<<3);
802 vector unsigned short SCL = vec_splat_u16(4);
803 unsigned long scratch[16] __attribute__ ((aligned (16)));
a31de956 804
42809816 805 vector signed short *YCoeffs, *CCoeffs;
a31de956 806
42809816
DB
807 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
808 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
a31de956 809
42809816 810 out = (vector unsigned char *)dest;
a31de956 811
42809816
DB
812 for (i=0; i<dstW; i+=16){
813 Y0 = RND;
814 Y1 = RND;
815 /* extract 16 coeffs from lumSrc */
816 for (j=0; j<lumFilterSize; j++) {
817 X0 = vec_ld (0, &lumSrc[j][i]);
818 X1 = vec_ld (16, &lumSrc[j][i]);
819 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
820 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
821 }
a31de956 822
42809816
DB
823 U = RND;
824 V = RND;
825 /* extract 8 coeffs from U,V */
826 for (j=0; j<chrFilterSize; j++) {
827 X = vec_ld (0, &chrSrc[j][i/2]);
828 U = vec_mradds (X, CCoeffs[j], U);
829 X = vec_ld (0, &chrSrc[j][i/2+2048]);
830 V = vec_mradds (X, CCoeffs[j], V);
3845b56d 831 }
a31de956 832
42809816
DB
833 /* scale and clip signals */
834 Y0 = vec_sra (Y0, SCL);
835 Y1 = vec_sra (Y1, SCL);
836 U = vec_sra (U, SCL);
837 V = vec_sra (V, SCL);
838
839 Y0 = vec_clip_s16 (Y0);
840 Y1 = vec_clip_s16 (Y1);
841 U = vec_clip_s16 (U);
842 V = vec_clip_s16 (V);
843
844 /* now we have
845 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
846 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
847
848 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
849 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
850 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
851 */
852
853 U0 = vec_mergeh (U,U);
854 V0 = vec_mergeh (V,V);
855
856 U1 = vec_mergel (U,U);
857 V1 = vec_mergel (V,V);
858
859 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
860 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
861
862 R = vec_packclp (R0,R1);
863 G = vec_packclp (G0,G1);
864 B = vec_packclp (B0,B1);
865
866 switch(c->dstFormat) {
867 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
868 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
869 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
870 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
871 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
872 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
873 default:
874 {
875 /* If this is reached, the caller should have called yuv2packedXinC
876 instead. */
877 static int printed_error_message;
878 if (!printed_error_message) {
879 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
880 sws_format_name(c->dstFormat));
881 printed_error_message=1;
882 }
883 return;
884 }
885 }
a31de956
MN
886 }
887
42809816
DB
888 if (i < dstW) {
889 i -= 16;
890
891 Y0 = RND;
892 Y1 = RND;
893 /* extract 16 coeffs from lumSrc */
894 for (j=0; j<lumFilterSize; j++) {
895 X0 = vec_ld (0, &lumSrc[j][i]);
896 X1 = vec_ld (16, &lumSrc[j][i]);
897 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
898 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
899 }
a31de956 900
42809816
DB
901 U = RND;
902 V = RND;
903 /* extract 8 coeffs from U,V */
904 for (j=0; j<chrFilterSize; j++) {
905 X = vec_ld (0, &chrSrc[j][i/2]);
906 U = vec_mradds (X, CCoeffs[j], U);
907 X = vec_ld (0, &chrSrc[j][i/2+2048]);
908 V = vec_mradds (X, CCoeffs[j], V);
909 }
a31de956 910
42809816
DB
911 /* scale and clip signals */
912 Y0 = vec_sra (Y0, SCL);
913 Y1 = vec_sra (Y1, SCL);
914 U = vec_sra (U, SCL);
915 V = vec_sra (V, SCL);
916
917 Y0 = vec_clip_s16 (Y0);
918 Y1 = vec_clip_s16 (Y1);
919 U = vec_clip_s16 (U);
920 V = vec_clip_s16 (V);
921
922 /* now we have
923 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
924 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
925
926 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
927 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
928 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
929 */
930
931 U0 = vec_mergeh (U,U);
932 V0 = vec_mergeh (V,V);
933
934 U1 = vec_mergel (U,U);
935 V1 = vec_mergel (V,V);
936
937 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
938 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
939
940 R = vec_packclp (R0,R1);
941 G = vec_packclp (G0,G1);
942 B = vec_packclp (B0,B1);
943
944 nout = (vector unsigned char *)scratch;
945 switch(c->dstFormat) {
946 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
947 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
948 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
949 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
950 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
951 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
952 default:
953 /* Unreachable, I think. */
954 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
955 sws_format_name(c->dstFormat));
956 return;
957 }
a31de956 958
42809816 959 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
3845b56d 960 }
a31de956 961
a31de956 962}