/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/**
 * High quality image resampling with polyphase filters.
 */
#include <math.h>

#include "avcodec.h"
#include "dsputil.h"

#ifdef USE_FASTMEMCPY
#include "fastmemcpy.h"
#endif
#define NB_COMPONENTS 3

#define PHASE_BITS 4
#define NB_PHASES  (1 << PHASE_BITS)
#define NB_TAPS    4
#define FCENTER    1  /* index of the center of the filter */
//#define TEST    1  /* Test it */

#define POS_FRAC_BITS 16
#define POS_FRAC      (1 << POS_FRAC_BITS)
/* 6 bits precision is needed for MMX */
#define FILTER_BITS   8

#define LINE_BUF_HEIGHT (NB_TAPS * 4)
struct ImgReSampleContext {
    int iwidth, iheight, owidth, oheight;
    int topBand, bottomBand, leftBand, rightBand;
    int padtop, padbottom, padleft, padright;
    int pad_owidth, pad_oheight;

    int h_incr, v_incr;
    int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
    int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
    uint8_t *line_buf;
};
static inline int get_phase(int pos)
{
    return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
}
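/*
 * Worked example with the values above (POS_FRAC_BITS = 16, PHASE_BITS = 4):
 * source positions are carried in 16.16 fixed point, and the top PHASE_BITS
 * bits of the fractional part select one of the NB_PHASES sub-pixel filter
 * phases.  A position of 0x18000 (1.5 pixels) gives
 * (0x18000 >> 12) & 0xf = 8, i.e. phase 8 of the 16 phases.
 */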
/* This function must be optimized */
static void h_resample_fast(uint8_t *dst, int dst_width, const uint8_t *src,
                            int src_width, int src_start, int src_incr,
                            int16_t *filters)
{
    int src_pos, phase, sum, i;
    const uint8_t *s;
    int16_t *filter;

    src_pos = src_start;
    for(i=0;i<dst_width;i++) {
#ifdef TEST
        /* check that the source window stays inside the image */
        if ((src_pos >> POS_FRAC_BITS) < 0 ||
            (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
            av_abort();
#endif
        s = src + (src_pos >> POS_FRAC_BITS);
        phase = get_phase(src_pos);
        filter = filters + phase * NB_TAPS;
#if NB_TAPS == 4
        sum = s[0] * filter[0] +
              s[1] * filter[1] +
              s[2] * filter[2] +
              s[3] * filter[3];
#else
        {
            int j;
            sum = 0;
            for(j=0;j<NB_TAPS;j++)
                sum += s[j] * filter[j];
        }
#endif
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        src_pos += src_incr;
        dst++;
    }
}
/* This function must be optimized */
static void v_resample(uint8_t *dst, int dst_width, const uint8_t *src,
                       int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;

    s = src;
    for(i=0;i<dst_width;i++) {
#if NB_TAPS == 4
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
#else
        {
            int j;
            const uint8_t *s1 = s;

            sum = 0;
            for(j=0;j<NB_TAPS;j++) {
                sum += s1[0] * filter[j];
                s1 += wrap;
            }
        }
#endif
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        dst++;
        s++;
    }
}
#ifdef HAVE_MMX

#include "i386/mmx.h"

#define FILTER4(reg) \
{\
        s = src + (src_pos >> POS_FRAC_BITS);\
        phase = get_phase(src_pos);\
        filter = filters + phase * NB_TAPS;\
        movq_m2r(*s, reg);\
        punpcklbw_r2r(mm7, reg);\
        movq_m2r(*filter, mm6);\
        pmaddwd_r2r(reg, mm6);\
        movq_r2r(mm6, reg);\
        psrlq_i2r(32, reg);\
        paddd_r2r(mm6, reg);\
        psrad_i2r(FILTER_BITS, reg);\
        src_pos += src_incr;\
}

#define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
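/*
 * How FILTER4 computes one sample: pmaddwd multiplies the four zero-extended
 * pixels by the four 16-bit filter taps and leaves two 32-bit partial sums
 * in mm6; the movq/psrlq/paddd sequence folds the high partial sum onto the
 * low one, so after the psrad the low 32 bits of "reg" hold the filtered
 * (not yet clamped) sample.
 */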
/* XXX: do four pixels at a time */
static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
                                 const uint8_t *src, int src_width,
                                 int src_start, int src_incr, int16_t *filters)
{
    int src_pos, phase;
    const uint8_t *s;
    int16_t *filter;
    mmx_t tmp;

    src_pos = src_start;
    pxor_r2r(mm7, mm7);

    while (dst_width >= 4) {

        FILTER4(mm0);
        FILTER4(mm1);
        FILTER4(mm2);
        FILTER4(mm3);

        packuswb_r2r(mm7, mm0);
        packuswb_r2r(mm7, mm1);
        packuswb_r2r(mm7, mm3);
        packuswb_r2r(mm7, mm2);
        movq_r2m(mm0, tmp);
        dst[0] = tmp.ub[0];
        movq_r2m(mm1, tmp);
        dst[1] = tmp.ub[0];
        movq_r2m(mm2, tmp);
        dst[2] = tmp.ub[0];
        movq_r2m(mm3, tmp);
        dst[3] = tmp.ub[0];
        dst += 4;
        dst_width -= 4;
    }
    while (dst_width > 0) {
        FILTER4(mm0);
        packuswb_r2r(mm7, mm0);
        movq_r2m(mm0, tmp);
        dst[0] = tmp.ub[0];
        dst++;
        dst_width--;
    }
    emms();
}
static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
                            int wrap, int16_t *filter)
{
    int sum, i, v;
    const uint8_t *s;
    mmx_t tmp;
    mmx_t coefs[4];

    /* broadcast each filter tap into a packed-word constant */
    for(i=0;i<4;i++) {
        v = filter[i];
        coefs[i].uw[0] = v;
        coefs[i].uw[1] = v;
        coefs[i].uw[2] = v;
        coefs[i].uw[3] = v;
    }

    pxor_r2r(mm7, mm7);
    s = src;
    while (dst_width >= 4) {
        movq_m2r(s[0 * wrap], mm0);
        punpcklbw_r2r(mm7, mm0);
        movq_m2r(s[1 * wrap], mm1);
        punpcklbw_r2r(mm7, mm1);
        movq_m2r(s[2 * wrap], mm2);
        punpcklbw_r2r(mm7, mm2);
        movq_m2r(s[3 * wrap], mm3);
        punpcklbw_r2r(mm7, mm3);

        pmullw_m2r(coefs[0], mm0);
        pmullw_m2r(coefs[1], mm1);
        pmullw_m2r(coefs[2], mm2);
        pmullw_m2r(coefs[3], mm3);

        paddw_r2r(mm1, mm0);
        paddw_r2r(mm3, mm2);
        paddw_r2r(mm2, mm0);
        psraw_i2r(FILTER_BITS, mm0);

        packuswb_r2r(mm7, mm0);
        movq_r2m(mm0, tmp);

        *(uint32_t *)dst = tmp.ud[0];
        dst += 4;
        s += 4;
        dst_width -= 4;
    }
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
    emms();
}

#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
typedef union {
    vector unsigned char v;
    unsigned char c[16];
} vec_uc_t;

typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss_t;
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss_t srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;

    s = src;

    for(i=0;i<4;i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
           a signed short, we have just enough bits to pre-shift our
           filter constants <<7 to compensate for vec_madds.
        */
        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);
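    /*
       Worked example of the <<7 pre-shift above: a filter tap of 128
       (0.5 in the 8-bit FILTER_BITS fixed point used by the scalar code)
       is stored as 128 << 7 = 16384, so vec_madds produces
       (pixel * 16384) >> 15 = pixel / 2 -- the same result the scalar
       code gets from (pixel * 128) >> FILTER_BITS.
    */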
    /*
       When we're resampling, we'd ideally like both our input buffers
       and output buffers to be 16-byte aligned, so we can do both aligned
       reads and writes. Sadly we can't always have this at the moment, so
       we opt for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
    i = (-(int)dst) & 0xf;
    while (i > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }
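    /*
       Alignment math example: if dst == 0x1009, (-(int)dst) & 0xf == 7, so
       seven pixels are produced by the scalar loop above and the vector
       loop below starts writing at 0x1010, a 16-byte boundary.  (The cast
       through int assumes a 32-bit pointer, as on the original PPC target.)
    */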
    /* Do our altivec resampling on 16 pixels at once. */
    while (dst_width >= 16) {
        /*
           Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3 cycle latency
           the vec_madds have.
        */
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);

        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /*
           Pack the results into our destination vector,
           and do an aligned write of that back to memory.
        */
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);

        dst += 16;
        s += 16;
        dst_width -= 16;
    }
    /*
       If there are any leftover pixels, resample them
       with the slow scalar method.
    */
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}
#endif /* HAVE_ALTIVEC */
/* slow version to handle limit cases. Does not need optimisation */
static void h_resample_slow(uint8_t *dst, int dst_width,
                            const uint8_t *src, int src_width,
                            int src_start, int src_incr, int16_t *filters)
{
    int src_pos, phase, sum, j, v, i;
    const uint8_t *s, *src_end;
    int16_t *filter;

    src_end = src + src_width;
    src_pos = src_start;
    for(i=0;i<dst_width;i++) {
        s = src + (src_pos >> POS_FRAC_BITS);
        phase = get_phase(src_pos);
        filter = filters + phase * NB_TAPS;
        sum = 0;
        for(j=0;j<NB_TAPS;j++) {
            /* replicate the edge pixels outside the source image */
            if (s < src)
                v = src[0];
            else if (s >= src_end)
                v = src_end[-1];
            else
                v = s[0];
            sum += v * filter[j];
            s++;
        }
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        src_pos += src_incr;
        dst++;
    }
}
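/*
 * h_resample() below splits each output row into up to three ranges: pixels
 * whose filter window starts before the left edge (handled by
 * h_resample_slow() with edge replication), a middle range whose window
 * stays fully inside the source (handled by the fast path), and pixels
 * whose window would run past the right edge (slow path again).  As a
 * worked example, with src_start = -FCENTER * POS_FRAC (= -0x10000, the
 * value component_resample() passes) and a 2:1 downscale
 * (src_incr = 2 * POS_FRAC), the first range is
 * n = (0x10000 + 0x20000 - 1) / 0x20000 = 1 output pixel.
 */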
static void h_resample(uint8_t *dst, int dst_width, const uint8_t *src,
                       int src_width, int src_start, int src_incr,
                       int16_t *filters)
{
    int n, src_end;

    if (src_start < 0) {
        n = (0 - src_start + src_incr - 1) / src_incr;
        h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
        dst += n;
        dst_width -= n;
        src_start += n * src_incr;
    }
    src_end = src_start + dst_width * src_incr;
    if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
        n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
            src_incr;
    } else {
        n = dst_width;
    }
#ifdef HAVE_MMX
    if ((mm_flags & MM_MMX) && NB_TAPS == 4)
        h_resample_fast4_mmx(dst, n,
                             src, src_width, src_start, src_incr, filters);
    else
#endif
        h_resample_fast(dst, n,
                        src, src_width, src_start, src_incr, filters);
    if (n < dst_width) {
        dst += n;
        dst_width -= n;
        src_start += n * src_incr;
        h_resample_slow(dst, dst_width,
                        src, src_width, src_start, src_incr, filters);
    }
}
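/*
 * component_resample() below horizontally resamples source lines on demand
 * into s->line_buf, which holds LINE_BUF_HEIGHT + NB_TAPS rows of owidth
 * bytes and is used as a ring: new lines go to rows
 * NB_TAPS..LINE_BUF_HEIGHT+NB_TAPS-1, and any line stored at row
 * ring_y >= LINE_BUF_HEIGHT is also mirrored down to row
 * ring_y - LINE_BUF_HEIGHT.  Thanks to that mirror, the vertical filter can
 * always read its NB_TAPS input lines from consecutive rows starting at
 * ring_y - NB_TAPS + 1, without handling wrap-around in the inner loop.
 */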
static void component_resample(ImgReSampleContext *s,
                               uint8_t *output, int owrap, int owidth, int oheight,
                               uint8_t *input, int iwrap, int iwidth, int iheight)
{
    int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
    uint8_t *new_line, *src_line;

    last_src_y = - FCENTER - 1;
    /* position of the bottom of the filter in the source image */
    src_y = (last_src_y + NB_TAPS) * POS_FRAC;
    ring_y = NB_TAPS; /* position in ring buffer */
    for(y=0;y<oheight;y++) {
        /* apply horizontal filter on new lines from input if needed */
        src_y1 = src_y >> POS_FRAC_BITS;
        while (last_src_y < src_y1) {
            if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
                ring_y = NB_TAPS;
            last_src_y++;
            /* handle limit conditions: replicate line (slightly
               inefficient because we filter multiple times) */
            y1 = last_src_y;
            if (y1 < 0) {
                y1 = 0;
            } else if (y1 >= iheight) {
                y1 = iheight - 1;
            }
            src_line = input + y1 * iwrap;
            new_line = s->line_buf + ring_y * owidth;
            /* apply filter and handle limit cases correctly */
            h_resample(new_line, owidth,
                       src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
                       &s->h_filters[0][0]);
            /* handle ring buffer wrapping */
            if (ring_y >= LINE_BUF_HEIGHT) {
                memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
                       new_line, owidth);
            }
        }
        /* apply vertical filter */
        phase_y = get_phase(src_y);
#ifdef HAVE_MMX
        /* deactivated MMX because of loss of precision */
        if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
            v_resample4_mmx(output, owidth,
                            s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                            &s->v_filters[phase_y][0]);
        else
#endif
#ifdef HAVE_ALTIVEC
        if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
            v_resample16_altivec(output, owidth,
                                 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                                 &s->v_filters[phase_y][0]);
        else
#endif
            v_resample(output, owidth,
                       s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                       &s->v_filters[phase_y][0]);

        src_y += s->v_incr;
        output += owrap;
    }
}
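/*
 * Background for build_filter() below: the piecewise cubic used there is the
 * standard cubic-convolution (Catmull-Rom style) kernel with edge-slope
 * parameter d = -0.5:
 *     y = (d+2)|x|^3 - (d+3)|x|^2 + 1        for |x| < 1
 *     y = d*(|x|^3 - 5|x|^2 + 8|x| - 4)      for 1 <= |x| < 2
 * which expands to exactly the two expressions in the code.  The final
 * normalization step scales each phase so that its taps sum (up to rounding)
 * to 1 << FILTER_BITS, which is why a flat region passes through unchanged:
 * the resampling loops divide by exactly that factor with ">> FILTER_BITS".
 */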
/* XXX: the following filter is quite naive, but it seems to suffice
   for 4 taps */
static void build_filter(int16_t *filter, float factor)
{
    int ph, i, v;
    float x, y, tab[NB_TAPS], norm, mult, target;

    /* if upsampling, only need to interpolate, no filter */
    if (factor > 1.0)
        factor = 1.0;

    for(ph=0;ph<NB_PHASES;ph++) {
        norm = 0;
        for(i=0;i<NB_TAPS;i++) {
#if 1
            /* bicubic interpolation kernel */
            const float d = -0.5; //first order derivative = -0.5
            x = fabs(((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor);
            if (x<1.0) y = 1 - 3*x*x + 2*x*x*x + d*( -x*x + x*x*x);
            else       y = d*(-4 + 8*x - 5*x*x + x*x*x);
#else
            /* windowed sinc alternative */
            x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
            if (x == 0)
                y = 1.0;
            else
                y = sin(x) / x;
#endif
            tab[i] = y;
            norm += y;
        }

        /* normalize so that a uniform color remains the same */
        target = 1 << FILTER_BITS;
        for(i=0;i<NB_TAPS;i++) {
            mult = target / norm;
            v = lrintf(tab[i] * mult);
            filter[ph * NB_TAPS + i] = v;
        }
    }
}
ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                      int iwidth, int iheight)
{
    return img_resample_full_init(owidth, oheight, iwidth, iheight,
                                  0, 0, 0, 0, 0, 0, 0, 0);
}
ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
                                           int iwidth, int iheight,
                                           int topBand, int bottomBand,
                                           int leftBand, int rightBand,
                                           int padtop, int padbottom,
                                           int padleft, int padright)
{
    ImgReSampleContext *s;

    s = av_mallocz(sizeof(ImgReSampleContext));
    if (!s)
        return NULL;
    s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
    if (!s->line_buf)
        goto fail;

    s->owidth = owidth;
    s->oheight = oheight;
    s->iwidth = iwidth;
    s->iheight = iheight;

    s->topBand = topBand;
    s->bottomBand = bottomBand;
    s->leftBand = leftBand;
    s->rightBand = rightBand;

    s->padtop = padtop;
    s->padbottom = padbottom;
    s->padleft = padleft;
    s->padright = padright;

    s->pad_owidth = owidth - (padleft + padright);
    s->pad_oheight = oheight - (padtop + padbottom);

    s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth;
    s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / s->pad_oheight;
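    /* Example: scaling a 720-pixel-wide source to a 360-pixel output with no
       cropping or padding gives h_incr = (720 * POS_FRAC) / 360 = 2 * POS_FRAC,
       i.e. the fixed-point source position advances by exactly two source
       pixels per output pixel. */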
    build_filter(&s->h_filters[0][0], (float) s->pad_owidth /
                 (float) (iwidth - leftBand - rightBand));
    build_filter(&s->v_filters[0][0], (float) s->pad_oheight /
                 (float) (iheight - topBand - bottomBand));

    return s;
fail:
    av_free(s);
    return NULL;
}
void img_resample(ImgReSampleContext *s,
                  AVPicture *output, const AVPicture *input)
{
    int i, shift;
    uint8_t *optr;

    for (i=0;i<NB_COMPONENTS;i++) {
        /* the two chroma planes are assumed to be subsampled by two
           both horizontally and vertically */
        shift = (i == 0) ? 0 : 1;

        optr = output->data[i] + (((output->linesize[i] *
                        s->padtop) + s->padleft) >> shift);

        component_resample(s, optr, output->linesize[i],
                           s->pad_owidth >> shift, s->pad_oheight >> shift,
                           input->data[i] + (input->linesize[i] *
                               (s->topBand >> shift)) + (s->leftBand >> shift),
                           input->linesize[i],
                           ((s->iwidth - s->leftBand - s->rightBand) >> shift),
                           (s->iheight - s->topBand - s->bottomBand) >> shift);
    }
}
void img_resample_close(ImgReSampleContext *s)
{
    av_free(s->line_buf);
    av_free(s);
}
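/*
 * Typical use of the public API above, as a rough sketch (error handling and
 * picture allocation omitted; in_pic/out_pic stand for caller-provided
 * AVPicture structures at the source and destination sizes):
 *
 *     ImgReSampleContext *ctx = img_resample_init(out_w, out_h, in_w, in_h);
 *     img_resample(ctx, &out_pic, &in_pic);
 *     img_resample_close(ctx);
 */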
#ifdef TEST

#include <stdio.h>
#include <stdlib.h>

void *av_mallocz(int size)
{
    void *ptr;
    ptr = malloc(size);
    memset(ptr, 0, size);
    return ptr;
}

void av_free(void *ptr)
{
    /* XXX: this test should not be needed on most libcs */
    if (ptr)
        free(ptr);
}

/* input */
#define XSIZE 256
#define YSIZE 256
uint8_t img[XSIZE * YSIZE];

/* output */
#define XSIZE1 512
#define YSIZE1 512
uint8_t img1[XSIZE1 * YSIZE1];
uint8_t img2[XSIZE1 * YSIZE1];
void save_pgm(const char *filename, uint8_t *img, int xsize, int ysize)
{
    FILE *f;

    f = fopen(filename, "w");
    fprintf(f, "P5\n%d %d\n%d\n", xsize, ysize, 255);
    fwrite(img, 1, xsize * ysize, f);
    fclose(f);
}
static void dump_filter(int16_t *filter)
{
    int i, ph;

    for(ph=0;ph<NB_PHASES;ph++) {
        for(i=0;i<NB_TAPS;i++) {
            printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
        }
        printf("\n");
    }
}
int main(int argc, char **argv)
{
    int x, y, v, i, xsize, ysize;
    ImgReSampleContext *s;
    float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
    char buf[256];

    /* build test image */
    for(y=0;y<YSIZE;y++) {
        for(x=0;x<XSIZE;x++) {
            if (x < XSIZE/2 && y < YSIZE/2) {
                if (x < XSIZE/4 && y < YSIZE/4) {
                    if ((x % 10) <= 6 &&
                        (y % 10) <= 6)
                        v = 0xff;
                    else
                        v = 0x00;
                } else if (x < XSIZE/4) {
                    if (x % 2 == 0)
                        v = 0xff;
                    else
                        v = 0x00;
                } else if (y < XSIZE/4) {
                    if (y % 2 == 0)
                        v = 0xff;
                    else
                        v = 0x00;
                } else {
                    if (((x+3) % 4) <= 1 &&
                        ((y+3) % 4) <= 1)
                        v = 0xff;
                    else
                        v = 0x00;
                }
            } else if (x < XSIZE/2) {
                v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
            } else if (y < XSIZE/2) {
                v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
            } else {
                v = ((x + y - XSIZE) * 255) / XSIZE;
            }
            img[(YSIZE - 1 - y) * XSIZE + (XSIZE - 1 - x)] = v;
        }
    }
    save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
    for(i=0;i<sizeof(factors)/sizeof(float);i++) {
        fact = factors[i];
        xsize = (int)(XSIZE * fact);
        ysize = (int)((YSIZE - 100) * fact);
        s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE,
                                   50, 50, 0, 0, 0, 0, 0, 0);
        printf("Factor=%0.2f\n", fact);
        dump_filter(&s->h_filters[0][0]);
        component_resample(s, img1, xsize, xsize, ysize,
                           img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
        img_resample_close(s);

        sprintf(buf, "/tmp/out%d.pgm", i);
        save_pgm(buf, img1, xsize, ysize);
    }
    /* compare the MMX and the C code paths on the same input */
    printf("MMX test\n");
    xsize = (int)(XSIZE * fact);
    ysize = (int)(YSIZE * fact);
    mm_flags = MM_MMX;
    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
    component_resample(s, img1, xsize, xsize, ysize,
                       img, XSIZE, XSIZE, YSIZE);

    mm_flags = 0;
    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
    component_resample(s, img2, xsize, xsize, ysize,
                       img, XSIZE, XSIZE, YSIZE);
    if (memcmp(img1, img2, xsize * ysize) != 0) {
        fprintf(stderr, "mmx error\n");