fix linking on systems with a function name prefix (10l in r20287)
[libav.git] / libavcodec / x86 / dsputil_yasm.asm
CommitLineData
7ca7d5fa
LM
1;******************************************************************************
2;* MMX optimized DSP utils
3;* Copyright (c) 2008 Loren Merritt
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "x86inc.asm"
23
SECTION_RODATA
; Byte constants and pshufb control masks used by the add_hfyu_left_prediction
; kernels in this file. In a pshufb control mask, -1 (top bit set) zeroes the
; destination byte ("z" in the label names); any other value selects that
; source byte index.
; NOTE(review): the 16-byte aligned loads of these tables rely on
; SECTION_RODATA (from x86inc.asm) providing at least 16-byte alignment - confirm.

; Every control byte = 15: broadcasts byte 15 of a 16-byte register.
pb_f: times 16 db 15
; The next two labels must stay adjacent and in this order: the XMM code loads
; 16 bytes from pb_zzzzzzzz77777777, i.e. these 8 bytes of -1 followed by the
; 8 bytes of pb_7. pb_7 on its own is the 8-byte "broadcast byte 7" mask.
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
; Copies byte 3 into bytes 4..7 and byte 11 into bytes 12..15, zero elsewhere.
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
; Copies byte 1 into bytes 2..3, byte 5 into 6..7, byte 9 into 10..11 and
; byte 13 into 14..15, zero elsewhere.
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
30
; Code section; everything below is executable code, section start 16-byte aligned.
section .text align=16
32
; PSWAPD_SSE dst, src
; Stand-in for the pswapd instruction built on pshufw: writes src with its
; two 32-bit halves exchanged into dst. Shuffle immediate 0x4e selects the
; 16-bit words in the order 2,3,0,1.
%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
; PSWAPD_3DN1 dst, src
; Stand-in for the pswapd instruction using only plain MMX operations: writes
; src with its two 32-bit halves exchanged into dst.
; dst and src must be different registers: src is read again by the final
; punpckldq after dst has already been overwritten.
%macro PSWAPD_3DN1 2
    movq      %1, %2        ; dst = src
    psrlq     %1, 32        ; dst.lo = src.hi, dst.hi = 0
    punpckldq %1, %2        ; dst = { src.hi, src.lo }
%endmacro
41
; FLOAT_TO_INT16_INTERLEAVE6 suffix
; Emits float_to_int16_interleave6_<suffix>: converts six planar float
; channels to interleaved, signed-saturated int16 samples.
;
; The body is written against two overridable names, cvtps2pi and pswapd;
; the instantiation site %defines them to pick the instruction set variant.
;
; Each loop iteration consumes 2 floats from every channel and writes
; 12 int16 (24 bytes) to dst. The loop body always runs at least once and
; len is decremented by 2 per pass, so an odd len still processes a full pair.
; NOTE(review): callers presumably pass an even len > 0 - confirm.
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
    ; len arrives as the third argument (r2), but r2 is also named src1 above
    ; and is overwritten below. Keep len in r10d on x86-64; on x86-32 use it
    ; straight from its argument memory slot.
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    ; Fetch the six channel pointers from the src[] array ...
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    ; ... and turn channels 1..5 into offsets relative to channel 0, so that
    ; advancing srcq alone steps through all six channels.
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
.loop:
    ; Two samples per channel, float -> int32. Naming below: a..f are
    ; channels 0..5, the digit is the sample index within this iteration.
    cvtps2pi  mm0, [srcq]           ; a0 a1
    cvtps2pi  mm1, [srcq+src1q]     ; b0 b1
    cvtps2pi  mm2, [srcq+src2q]     ; c0 c1
    cvtps2pi  mm3, [srcq+src3q]     ; d0 d1
    cvtps2pi  mm4, [srcq+src4q]     ; e0 e1
    cvtps2pi  mm5, [srcq+src5q]     ; f0 f1
    ; int32 -> int16 with signed saturation, two channels per register.
    packssdw  mm0, mm3              ; a0 a1 d0 d1
    packssdw  mm1, mm4              ; b0 b1 e0 e1
    packssdw  mm2, mm5              ; c0 c1 f0 f1
    ; Transpose into interleaved order.
    pswapd    mm3, mm0              ; d0 d1 a0 a1
    punpcklwd mm0, mm1              ; a0 b0 a1 b1
    punpckhwd mm1, mm2              ; e0 f0 e1 f1
    punpcklwd mm2, mm3              ; c0 d0 c1 d1
    pswapd    mm3, mm0              ; a1 b1 a0 b0
    punpckldq mm0, mm2              ; a0 b0 c0 d0
    punpckhdq mm2, mm1              ; c1 d1 e1 f1
    punpckldq mm1, mm3              ; e0 f0 a1 b1
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add       srcq, 8               ; 2 floats consumed per channel
    add       dstq, 24              ; 12 int16 written
    sub       lend, 2
    jg .loop
    emms                            ; leave the MMX state clean for x87 code
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
90
; Instantiate float_to_int16_interleave6 once per instruction set.
; The order of the %define/%undef lines matters: each instantiation sees
; whatever cvtps2pi/pswapd mean at that point.

; sse: native cvtps2pi, pswapd emulated with pshufw.
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
; 3dnow: pf2id for the conversion, pswapd emulated with plain MMX ops.
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
; 3dn2: still pf2id, but pswapd is now the real instruction.
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
99
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
;
; For each byte i: dst[i] = median(l, t, l + t - tl) + diff[i]  (mod 256),
; where t = top[i], tl = top[i-1] (or *left_top for i == 0) and l = dst[i-1]
; (or *left for i == 0). On return *left = dst[w-1] and *left_top = top[w-1].
;
; Works in blocks of 8 bytes and always runs at least one block, so it reads
; top/diff and writes dst up to the next multiple of 8 bytes.
; NOTE(review): only the low byte of *left and *left_top is expected to be
; non-zero on entry (both are stored back zero-extended from a byte) - confirm.
; No emms is executed here.
;
; Register roles inside the loop:
;   mm0 = t - tl   mm1 = t   mm2 = diff   mm3 = l (low byte is the live value)
;   mm7 = output bytes being collected    mm4..mm6 = scratch
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
%ifdef ARCH_X86_64
    ; w is a C int: the upper 32 bits of its register are not guaranteed to be
    ; defined, yet wq is used in 64-bit pointer arithmetic below.
    movsxd  wq, wd
%endif
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2        ; tl for the first block: left_top, top[0..6]
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    ; Point all three buffers at their end and count wq up from -w to 0.
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1        ; mm1 has been shifted down to the previous block's last top byte
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
    ; The 8 bytes of a block depend on each other through l, so they are
    ; produced one at a time; each step works on the low byte of mm0..mm3.
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    ; Insert the new byte at the top of mm7; after 8 steps the first byte
    ; produced has been shifted down to byte 0.
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
    ; Bring the next input byte into the low position (not needed after the
    ; last step).
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    ; dstq/topq point one past the end; r2 (diff) is free to use as scratch.
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
; ADD_HFYU_LEFT_LOOP is_aligned
; Shared body of the add_hfyu_left_prediction functions: a running byte-wise
; sum, dst[i] = left + src[0] + ... + src[i] (mod 256), mmsize bytes per pass.
; Ends the function (RET).
;
; Expected on entry:
;   m0 = initial left value in its topmost byte (byte mmsize-1)
;   m3 = pb_zz11zz55zz99zzdd, m4 = pb_zzzz3333zzzzbbbb
;   m5 = mask broadcasting the topmost byte (pb_7 for MMX, pb_f for XMM)
;   m6 = pb_zzzzzzzz77777777 (16-byte registers only)
; %1 = 1: store with mova; %1 = 0: store as two 8-byte halves (movq/movhps,
; for a dst that is not 16-byte aligned). src is always loaded with mova.
;
; Returns in eax the last output byte dst[w-1]; only the low byte of the
; return value is meaningful, the upper bytes hold copies of another output
; byte. Whole registers are processed, so up to mmsize-1 bytes past w are
; read and written.
; No emms is issued.
; NOTE(review): in MMX mode the caller is presumably responsible for emms - confirm.
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
%ifdef ARCH_X86_64
    ; w is a C int: the upper 32 bits of its register are not guaranteed to be
    ; defined, yet wq is used in 64-bit pointer arithmetic below.
    movsxd  wq, wd
%endif
    ; Point src/dst at their end and count wq up from -w.
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    ; In-register prefix sum, doubling the span each step.
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2          ; sums over byte pairs
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2          ; sums over groups of 4
    pshufb  m0, m5          ; broadcast last output byte of the previous pass (or left)
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2          ; sums over groups of 8
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2          ; sums over all 16 bytes
%endif
    paddb   m0, m1          ; add the carried-in left value
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    ; wq is now the overshoot past w (0..mmsize-1); byte mmsize-1-wq of m0 is
    ; the output for index w-1. Move it to byte 0 and return it.
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro
199
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;
; SSSE3 version working on 8-byte MMX registers (INIT_MMX): running byte sum
; of src seeded with left, written to dst; the loop and the return value are
; provided by ADD_HFYU_LEFT_LOOP.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
; Secondary entry point: add_hfyu_left_prediction_sse4 jumps here, after its
; own prologue, when src is not 16-byte aligned. Everything from this label on
; must therefore set up all of its own state.
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]               ; broadcast byte 7
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL] ; only the first 8 bytes are loaded
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL] ; only the first 8 bytes are loaded
    movd    m0, leftm
    psllq   m0, 56                          ; move left's low byte into byte 7
    ADD_HFYU_LEFT_LOOP 1
210
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;
; Version working on 16-byte XMM registers (INIT_XMM). Dispatches on pointer
; alignment:
;   src not 16-byte aligned -> hand over to the 8-byte-register ssse3 code
;   dst not 16-byte aligned -> XMM loop with split 8-byte stores
;   both aligned            -> XMM loop with aligned stores
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]                ; broadcast byte 15
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL] ; 8 x -1 followed by the 8 bytes of pb_7
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15                           ; move left's low byte into byte 15
    test    srcq, 15
    ; The ssse3 entry point reloads its own constants and left value, so the
    ; XMM setup above is simply discarded on this path.
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1                     ; ends with RET, does not fall through
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
226