Commit | Line | Data |
---|---|---|
a9b3f630 NK |
1 | /* |
2 | * The simplest mpeg encoder (well, it was the simplest!) | |
ff4ec49e | 3 | * Copyright (c) 2000,2001 Fabrice Bellard. |
a9b3f630 | 4 | * |
7b94177e DB |
5 | * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> |
6 | * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> | |
7 | * | |
b78e7197 DB |
8 | * This file is part of FFmpeg. |
9 | * | |
10 | * FFmpeg is free software; you can redistribute it and/or | |
ff4ec49e FB |
11 | * modify it under the terms of the GNU Lesser General Public |
12 | * License as published by the Free Software Foundation; either | |
b78e7197 | 13 | * version 2.1 of the License, or (at your option) any later version. |
a9b3f630 | 14 | * |
b78e7197 | 15 | * FFmpeg is distributed in the hope that it will be useful, |
a9b3f630 | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ff4ec49e FB |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * Lesser General Public License for more details. | |
a9b3f630 | 19 | * |
ff4ec49e | 20 | * You should have received a copy of the GNU Lesser General Public |
b78e7197 | 21 | * License along with FFmpeg; if not, write to the Free Software |
5509bffa | 22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
a9b3f630 NK |
23 | */ |
24 | ||
b550bfaa | 25 | #include "dsputil.h" |
182f56cb | 26 | #include "dsputil_mmx.h" |
b550bfaa RB |
27 | #include "mpegvideo.h" |
28 | #include "avcodec.h" | |
9c39071d | 29 | #include "x86_cpu.h" |
2ad1516a | 30 | |
486497e0 | 31 | extern uint16_t inv_zigzag_direct16[64]; |
badaf88e | 32 | |
7f3f5ec8 | 33 | |
/**
 * In-place MMX dequantizer for one H.263-style intra block.
 *
 * The DC coefficient is computed in C (block[0] scaled by y_dc_scale for
 * n < 4, by c_dc_scale otherwise, or left unchanged when s->h263_aic is set)
 * and written back after the asm loop, because the loop starts at block[0]
 * and overwrites it. Every coefficient c handled by the loop becomes
 *     c > 0:  c*qmul + qadd
 *     c < 0:  c*qmul - qadd
 *     c == 0: 0
 * with qmul = 2*qscale and qadd = (qscale-1)|1 (qadd = 0 with h263_aic).
 * Products are taken with pmullw, i.e. only the low 16 bits are kept.
 *
 * The loop handles 8 coefficients (16 bytes) per iteration, from block[0]
 * up to and including the 8-coefficient chunk that contains index nCoeffs.
 *
 * NOTE(review): operand %3 is declared as an input but is modified by
 * "add $16, %3"; GCC documents input operands as read-only. The MMX
 * registers used are not listed as clobbers either -- confirm this is
 * acceptable for the compilers in use.
 * NOTE(review): when s->h263_aic is set the assert allows
 * block_last_index[n] < 0, which would index raster_end[] with a negative
 * value if s->ac_pred is 0 -- verify against callers.
 *
 * @param s      codec context; reads h263_aic, ac_pred, y_dc_scale,
 *               c_dc_scale, block_last_index[] and inter_scantable
 * @param block  coefficient block, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
d50635cd | 34 | static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
7f3f5ec8 A |
35 | DCTELEM *block, int n, int qscale) |
36 | { | |
053dea12 | 37 | long level, qmul, qadd, nCoeffs; |
2ad1516a MN |
38 | |
39 | qmul = qscale << 1; | |
7f3f5ec8 | 40 | |
ba58dabc | 41 | assert(s->block_last_index[n]>=0 || s->h263_aic); |
115329f1 | 42 | |
/* DC is handled here in C; "level" is stored into block[0] after the asm. */
d50635cd MN |
43 | if (!s->h263_aic) { |
44 | if (n < 4) | |
45 | level = block[0] * s->y_dc_scale; | |
46 | else | |
47 | level = block[0] * s->c_dc_scale; | |
48 | qadd = (qscale - 1) | 1; | |
49 | }else{ | |
50 | qadd = 0; | |
51 | level= block[0]; | |
7f3f5ec8 | 52 | } |
/* nCoeffs is the index of the last coefficient to process (63 = whole block). */
d50635cd MN |
53 | if(s->ac_pred) |
54 | nCoeffs=63; | |
55 | else | |
56 | nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
57 | //printf("%d %d ", qmul, qadd); | |
58 | asm volatile( | |
/* Setup: mm6 = qmul in all 4 words, mm7 = -qadd in all 4 words, mm4 = 0. */
bb270c08 DB |
59 | "movd %1, %%mm6 \n\t" //qmul |
60 | "packssdw %%mm6, %%mm6 \n\t" | |
61 | "packssdw %%mm6, %%mm6 \n\t" | |
62 | "movd %2, %%mm5 \n\t" //qadd | |
63 | "pxor %%mm7, %%mm7 \n\t" | |
64 | "packssdw %%mm5, %%mm5 \n\t" | |
65 | "packssdw %%mm5, %%mm5 \n\t" | |
66 | "psubw %%mm5, %%mm7 \n\t" | |
67 | "pxor %%mm4, %%mm4 \n\t" | |
4454dc1b | 68 | ASMALIGN(4) |
/* Loop: %0 = block+nCoeffs, %3 = negative byte offset, so (%0,%3) starts at block[0]. */
bb270c08 DB |
69 | "1: \n\t" |
70 | "movq (%0, %3), %%mm0 \n\t" | |
71 | "movq 8(%0, %3), %%mm1 \n\t" | |
72 | ||
73 | "pmullw %%mm6, %%mm0 \n\t" | |
74 | "pmullw %%mm6, %%mm1 \n\t" | |
75 | ||
76 | "movq (%0, %3), %%mm2 \n\t" | |
77 | "movq 8(%0, %3), %%mm3 \n\t" | |
78 | ||
79 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 | |
80 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 | |
81 | ||
/* Conditional complement, add -qadd, complement back: yields +qadd for c > 0 and -qadd otherwise. */
82 | "pxor %%mm2, %%mm0 \n\t" | |
83 | "pxor %%mm3, %%mm1 \n\t" | |
84 | ||
85 | "paddw %%mm7, %%mm0 \n\t" | |
86 | "paddw %%mm7, %%mm1 \n\t" | |
87 | ||
88 | "pxor %%mm0, %%mm2 \n\t" | |
89 | "pxor %%mm1, %%mm3 \n\t" | |
90 | ||
/* A lane equal to -qadd here means the product was 0, i.e. the input coefficient was 0. */
91 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
92 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
93 | ||
/* Force lanes whose input was 0 back to 0. */
94 | "pandn %%mm2, %%mm0 \n\t" | |
95 | "pandn %%mm3, %%mm1 \n\t" | |
96 | ||
97 | "movq %%mm0, (%0, %3) \n\t" | |
98 | "movq %%mm1, 8(%0, %3) \n\t" | |
99 | ||
/* Advance 16 bytes (8 coefficients); continue while the offset is still <= 0. */
100 | "add $16, %3 \n\t" | |
101 | "jng 1b \n\t" | |
102 | ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) | |
103 | : "memory" | |
104 | ); | |
/* Restore the separately computed DC value that the loop overwrote. */
d50635cd MN |
105 | block[0]= level; |
106 | } | |
107 | ||
108 | ||
/**
 * In-place MMX dequantizer for one H.263-style inter block.
 *
 * Every coefficient c handled by the loop (block[0] included; there is no
 * separate DC path here) becomes
 *     c > 0:  c*qmul + qadd
 *     c < 0:  c*qmul - qadd
 *     c == 0: 0
 * with qmul = 2*qscale and qadd = (qscale-1)|1. Products are taken with
 * pmullw, i.e. only the low 16 bits are kept.
 *
 * The loop handles 8 coefficients (16 bytes) per iteration, from block[0]
 * up to and including the 8-coefficient chunk that contains index nCoeffs.
 *
 * NOTE(review): operand %3 is declared as an input but is modified by
 * "add $16, %3"; GCC documents input operands as read-only. The MMX
 * registers used are not listed as clobbers either -- confirm this is
 * acceptable for the compilers in use.
 *
 * @param s      codec context; reads block_last_index[] and inter_scantable
 *               (h263_aic only inside the assert)
 * @param block  coefficient block, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantizer scale
 */
109 | static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
110 | DCTELEM *block, int n, int qscale) | |
111 | { | |
053dea12 | 112 | long qmul, qadd, nCoeffs; |
d50635cd MN |
113 | |
114 | qmul = qscale << 1; | |
115 | qadd = (qscale - 1) | 1; | |
116 | ||
117 | assert(s->block_last_index[n]>=0 || s->h263_aic); | |
115329f1 | 118 | |
/* nCoeffs is the index of the last coefficient to process. */
d50635cd | 119 | nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; |
badaf88e | 120 | //printf("%d %d ", qmul, qadd); |
7f3f5ec8 | 121 | asm volatile( |
/* Setup: mm6 = qmul in all 4 words, mm7 = -qadd in all 4 words, mm4 = 0. */
bb270c08 DB |
122 | "movd %1, %%mm6 \n\t" //qmul |
123 | "packssdw %%mm6, %%mm6 \n\t" | |
124 | "packssdw %%mm6, %%mm6 \n\t" | |
125 | "movd %2, %%mm5 \n\t" //qadd | |
126 | "pxor %%mm7, %%mm7 \n\t" | |
127 | "packssdw %%mm5, %%mm5 \n\t" | |
128 | "packssdw %%mm5, %%mm5 \n\t" | |
129 | "psubw %%mm5, %%mm7 \n\t" | |
130 | "pxor %%mm4, %%mm4 \n\t" | |
4454dc1b | 131 | ASMALIGN(4) |
/* Loop: %0 = block+nCoeffs, %3 = negative byte offset, so (%0,%3) starts at block[0]. */
bb270c08 DB |
132 | "1: \n\t" |
133 | "movq (%0, %3), %%mm0 \n\t" | |
134 | "movq 8(%0, %3), %%mm1 \n\t" | |
135 | ||
136 | "pmullw %%mm6, %%mm0 \n\t" | |
137 | "pmullw %%mm6, %%mm1 \n\t" | |
138 | ||
139 | "movq (%0, %3), %%mm2 \n\t" | |
140 | "movq 8(%0, %3), %%mm3 \n\t" | |
141 | ||
142 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 | |
143 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 | |
144 | ||
/* Conditional complement, add -qadd, complement back: yields +qadd for c > 0 and -qadd otherwise. */
145 | "pxor %%mm2, %%mm0 \n\t" | |
146 | "pxor %%mm3, %%mm1 \n\t" | |
147 | ||
148 | "paddw %%mm7, %%mm0 \n\t" | |
149 | "paddw %%mm7, %%mm1 \n\t" | |
150 | ||
151 | "pxor %%mm0, %%mm2 \n\t" | |
152 | "pxor %%mm1, %%mm3 \n\t" | |
153 | ||
/* A lane equal to -qadd here means the product was 0, i.e. the input coefficient was 0. */
154 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
155 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
156 | ||
/* Force lanes whose input was 0 back to 0. */
157 | "pandn %%mm2, %%mm0 \n\t" | |
158 | "pandn %%mm3, %%mm1 \n\t" | |
159 | ||
160 | "movq %%mm0, (%0, %3) \n\t" | |
161 | "movq %%mm1, 8(%0, %3) \n\t" | |
162 | ||
/* Advance 16 bytes (8 coefficients); continue while the offset is still <= 0. */
163 | "add $16, %3 \n\t" | |
164 | "jng 1b \n\t" | |
165 | ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) | |
166 | : "memory" | |
167 | ); | |
7f3f5ec8 A |
168 | } |
169 | ||
170 | ||
a9b3f630 NK |
171 | /* |
172 | NK: | |
173 | Note: looking at PARANOID: | |
174 | "enable all paranoid tests for rounding, overflows, etc..." | |
175 | ||
176 | #ifdef PARANOID | |
177 | if (level < -2048 || level > 2047) | |
178 | fprintf(stderr, "unquant error %d %d\n", i, level); | |
179 | #endif | |
52b541ad | 180 | We can assume that the result of the two multiplications never exceeds 0xFFFF, |
a9b3f630 NK |
181 | i.e. it fits in 16 bits, so only the PMULLW instruction is used here and | |
182 | a full multi-word multiplication can be avoided. | |
183 | ===================================================== | |
184 | Full formula for the multiplication of 2 integer numbers | |
185 | which are represented as high:low words: | |
186 | input: value1 = high1:low1 | |
187 | value2 = high2:low2 | |
188 | output: value3 = value1*value2 | |
189 | value3=high3:low3 (on overflow: modulus 2^32 wrap-around) | |
190 | this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4 | |
191 | but this algorithm will compute only 0x66cb0ce4 | |
192 | this is limited by the 16-bit size of the operands | |
193 | --------------------------------- | |
194 | tlow1 = high1*low2 | |
195 | tlow2 = high2*low1 | |
196 | tlow1 = tlow1 + tlow2 | |
197 | high3:low3 = low1*low2 | |
198 | high3 += tlow1 | |
199 | */ | |
/**
 * In-place MMX dequantizer for one MPEG-1 intra block.
 *
 * The DC value is computed in C (block[0] scaled by y_dc_scale for n < 4,
 * by c_dc_scale otherwise) and stored into block[0] after the asm loop,
 * because the loop starts at block[0] and overwrites it.
 * For every coefficient c handled by the loop, with
 * q = qscale * s->intra_matrix[i]:
 *     v = (|c| * q) >> 3
 *     v = (v - 1) | 1          (result forced to be odd)
 *     result = c < 0 ? -v : v, and 0 when c == 0
 * All products use pmullw, i.e. only the low 16 bits are kept.
 *
 * nCoeffs is raster_end[...] + 1 here (a count); the loop runs over
 * 8-coefficient (16-byte) chunks while the byte offset in REG_a is negative.
 *
 * @param s      codec context; reads block_last_index[], intra_scantable,
 *               y_dc_scale, c_dc_scale and intra_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
d50635cd | 200 | static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
35b2a786 | 201 | DCTELEM *block, int n, int qscale) |
a9b3f630 | 202 | { |
053dea12 | 203 | long nCoeffs; |
0c1a9eda | 204 | const uint16_t *quant_matrix; |
d50635cd | 205 | int block0; |
2ad1516a MN |
206 | |
207 | assert(s->block_last_index[n]>=0); | |
208 | ||
209 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
badaf88e | 210 | |
115329f1 | 211 | if (n < 4) |
d50635cd MN |
212 | block0 = block[0] * s->y_dc_scale; |
213 | else | |
214 | block0 = block[0] * s->c_dc_scale; | |
215 | /* XXX: only mpeg1 */ | |
216 | quant_matrix = s->intra_matrix; | |
9dbf1ddd | 217 | asm volatile( |
/* Setup: mm7 = 1 in all 4 words, mm6 = qscale in all 4 words, REG_a = -2*nCoeffs (byte offset). */
bb270c08 DB |
218 | "pcmpeqw %%mm7, %%mm7 \n\t" |
219 | "psrlw $15, %%mm7 \n\t" | |
220 | "movd %2, %%mm6 \n\t" | |
221 | "packssdw %%mm6, %%mm6 \n\t" | |
222 | "packssdw %%mm6, %%mm6 \n\t" | |
223 | "mov %3, %%"REG_a" \n\t" | |
4454dc1b | 224 | ASMALIGN(4) |
/* Loop: %0 and %1 point nCoeffs elements past the start of block / quant_matrix. */
bb270c08 DB |
225 | "1: \n\t" |
226 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
227 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
228 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
229 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
230 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
231 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
232 | "pxor %%mm2, %%mm2 \n\t" | |
233 | "pxor %%mm3, %%mm3 \n\t" | |
234 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
235 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
236 | "pxor %%mm2, %%mm0 \n\t" | |
237 | "pxor %%mm3, %%mm1 \n\t" | |
238 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
239 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
240 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
241 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
242 | "pxor %%mm4, %%mm4 \n\t" | |
243 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
244 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
245 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
246 | "psraw $3, %%mm0 \n\t" | |
247 | "psraw $3, %%mm1 \n\t" | |
/* (v - 1) | 1: force the magnitude to be odd. */
248 | "psubw %%mm7, %%mm0 \n\t" | |
249 | "psubw %%mm7, %%mm1 \n\t" | |
250 | "por %%mm7, %%mm0 \n\t" | |
251 | "por %%mm7, %%mm1 \n\t" | |
/* Re-apply the original sign (xor with the sign mask, then subtract it). */
252 | "pxor %%mm2, %%mm0 \n\t" | |
253 | "pxor %%mm3, %%mm1 \n\t" | |
254 | "psubw %%mm2, %%mm0 \n\t" | |
255 | "psubw %%mm3, %%mm1 \n\t" | |
/* Keep the result only where the input coefficient was non-zero. */
256 | "pandn %%mm0, %%mm4 \n\t" | |
257 | "pandn %%mm1, %%mm5 \n\t" | |
258 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
259 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
260 | ||
/* Advance 16 bytes (8 coefficients); continue while the offset is still negative. */
261 | "add $16, %%"REG_a" \n\t" | |
262 | "js 1b \n\t" | |
263 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
264 | : "%"REG_a, "memory" | |
265 | ); | |
/* Restore the separately computed DC value that the loop overwrote. */
d50635cd MN |
266 | block[0]= block0; |
267 | } | |
268 | ||
/**
 * In-place MMX dequantizer for one MPEG-1 inter block.
 *
 * For every coefficient c handled by the loop (block[0] included; there is
 * no separate DC path), with q = qscale * s->inter_matrix[i]:
 *     v = ((2*|c| + 1) * q) >> 4
 *     v = (v - 1) | 1          (result forced to be odd)
 *     result = c < 0 ? -v : v, and 0 when c == 0
 * All products use pmullw, i.e. only the low 16 bits are kept.
 *
 * nCoeffs is raster_end[...] + 1 (a count); the loop runs over
 * 8-coefficient (16-byte) chunks while the byte offset in REG_a is negative.
 *
 * NOTE(review): the coefficient count is taken from s->intra_scantable even
 * though this is the inter path -- presumably intentional, confirm.
 *
 * @param s      codec context; reads block_last_index[], intra_scantable
 *               and inter_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantizer scale
 */
269 | static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, | |
270 | DCTELEM *block, int n, int qscale) | |
271 | { | |
053dea12 | 272 | long nCoeffs; |
d50635cd MN |
273 | const uint16_t *quant_matrix; |
274 | ||
275 | assert(s->block_last_index[n]>=0); | |
276 | ||
277 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
9dbf1ddd | 278 | |
d7e9533a | 279 | quant_matrix = s->inter_matrix; |
7f3f5ec8 | 280 | asm volatile( |
/* Setup: mm7 = 1 in all 4 words, mm6 = qscale in all 4 words, REG_a = -2*nCoeffs (byte offset). */
bb270c08 DB |
281 | "pcmpeqw %%mm7, %%mm7 \n\t" |
282 | "psrlw $15, %%mm7 \n\t" | |
283 | "movd %2, %%mm6 \n\t" | |
284 | "packssdw %%mm6, %%mm6 \n\t" | |
285 | "packssdw %%mm6, %%mm6 \n\t" | |
286 | "mov %3, %%"REG_a" \n\t" | |
4454dc1b | 287 | ASMALIGN(4) |
/* Loop: %0 and %1 point nCoeffs elements past the start of block / quant_matrix. */
bb270c08 DB |
288 | "1: \n\t" |
289 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
290 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
291 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
292 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
293 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
294 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
295 | "pxor %%mm2, %%mm2 \n\t" | |
296 | "pxor %%mm3, %%mm3 \n\t" | |
297 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
298 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
299 | "pxor %%mm2, %%mm0 \n\t" | |
300 | "pxor %%mm3, %%mm1 \n\t" | |
301 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
302 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
303 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
304 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
305 | "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 | |
306 | "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 | |
307 | "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
308 | "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
309 | "pxor %%mm4, %%mm4 \n\t" | |
310 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
311 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
312 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
313 | "psraw $4, %%mm0 \n\t" | |
314 | "psraw $4, %%mm1 \n\t" | |
/* (v - 1) | 1: force the magnitude to be odd. */
315 | "psubw %%mm7, %%mm0 \n\t" | |
316 | "psubw %%mm7, %%mm1 \n\t" | |
317 | "por %%mm7, %%mm0 \n\t" | |
318 | "por %%mm7, %%mm1 \n\t" | |
/* Re-apply the original sign (xor with the sign mask, then subtract it). */
319 | "pxor %%mm2, %%mm0 \n\t" | |
320 | "pxor %%mm3, %%mm1 \n\t" | |
321 | "psubw %%mm2, %%mm0 \n\t" | |
322 | "psubw %%mm3, %%mm1 \n\t" | |
/* Keep the result only where the input coefficient was non-zero. */
323 | "pandn %%mm0, %%mm4 \n\t" | |
324 | "pandn %%mm1, %%mm5 \n\t" | |
325 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
326 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
327 | ||
/* Advance 16 bytes (8 coefficients); continue while the offset is still negative. */
328 | "add $16, %%"REG_a" \n\t" | |
329 | "js 1b \n\t" | |
330 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
331 | : "%"REG_a, "memory" | |
332 | ); | |
9dbf1ddd MN |
333 | } |
334 | ||
/**
 * In-place MMX dequantizer for one MPEG-2 intra block.
 *
 * The DC value is computed in C (block[0] scaled by y_dc_scale for n < 4,
 * by c_dc_scale otherwise) and stored into block[0] after the asm loop,
 * because the loop starts at block[0] and overwrites it.
 * For every coefficient c handled by the loop, with
 * q = qscale * s->intra_matrix[i]:
 *     v = (|c| * q) >> 3
 *     result = c < 0 ? -v : v, and 0 when c == 0
 * Unlike the MPEG-1 variants there is no "force odd" step. All products use
 * pmullw, i.e. only the low 16 bits are kept.
 *
 * nCoeffs is the index of the last coefficient (63 when s->alternate_scan is
 * set); the loop runs over 8-coefficient (16-byte) chunks while the byte
 * offset in REG_a is <= 0.
 *
 * @param s      codec context; reads alternate_scan, block_last_index[],
 *               intra_scantable, y_dc_scale, c_dc_scale and intra_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
d50635cd | 335 | static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
9dbf1ddd MN |
336 | DCTELEM *block, int n, int qscale) |
337 | { | |
053dea12 | 338 | long nCoeffs; |
0c1a9eda | 339 | const uint16_t *quant_matrix; |
d50635cd | 340 | int block0; |
115329f1 | 341 | |
2ad1516a MN |
342 | assert(s->block_last_index[n]>=0); |
343 | ||
344 | if(s->alternate_scan) nCoeffs= 63; //FIXME | |
345 | else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
9dbf1ddd | 346 | |
115329f1 | 347 | if (n < 4) |
d50635cd MN |
348 | block0 = block[0] * s->y_dc_scale; |
349 | else | |
350 | block0 = block[0] * s->c_dc_scale; | |
351 | quant_matrix = s->intra_matrix; | |
9dbf1ddd | 352 | asm volatile( |
/* Setup: mm7 = 1 in all 4 words (not used in the loop below), mm6 = qscale in all 4 words, REG_a = -2*nCoeffs. */
bb270c08 DB |
353 | "pcmpeqw %%mm7, %%mm7 \n\t" |
354 | "psrlw $15, %%mm7 \n\t" | |
355 | "movd %2, %%mm6 \n\t" | |
356 | "packssdw %%mm6, %%mm6 \n\t" | |
357 | "packssdw %%mm6, %%mm6 \n\t" | |
358 | "mov %3, %%"REG_a" \n\t" | |
4454dc1b | 359 | ASMALIGN(4) |
/* Loop: %0 and %1 point nCoeffs elements past the start of block / quant_matrix. */
bb270c08 DB |
360 | "1: \n\t" |
361 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
362 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
363 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
364 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
365 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
366 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
367 | "pxor %%mm2, %%mm2 \n\t" | |
368 | "pxor %%mm3, %%mm3 \n\t" | |
369 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
370 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
371 | "pxor %%mm2, %%mm0 \n\t" | |
372 | "pxor %%mm3, %%mm1 \n\t" | |
373 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
374 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
375 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
376 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
377 | "pxor %%mm4, %%mm4 \n\t" | |
378 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
379 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
380 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
381 | "psraw $3, %%mm0 \n\t" | |
382 | "psraw $3, %%mm1 \n\t" | |
/* Re-apply the original sign (xor with the sign mask, then subtract it). */
383 | "pxor %%mm2, %%mm0 \n\t" | |
384 | "pxor %%mm3, %%mm1 \n\t" | |
385 | "psubw %%mm2, %%mm0 \n\t" | |
386 | "psubw %%mm3, %%mm1 \n\t" | |
/* Keep the result only where the input coefficient was non-zero. */
387 | "pandn %%mm0, %%mm4 \n\t" | |
388 | "pandn %%mm1, %%mm5 \n\t" | |
389 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
390 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
391 | ||
/* Advance 16 bytes (8 coefficients); continue while the offset is still <= 0. */
392 | "add $16, %%"REG_a" \n\t" | |
393 | "jng 1b \n\t" | |
394 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
395 | : "%"REG_a, "memory" | |
396 | ); | |
/* Restore the separately computed DC value that the loop overwrote. */
d50635cd | 397 | block[0]= block0; |
755bfeab | 398 | //Note, we do not do mismatch control for intra as errors cannot accumulate |
d50635cd MN |
399 | } |
400 | ||
/**
 * In-place MMX dequantizer for one MPEG-2 inter block, followed by a parity
 * fix-up on block[63].
 *
 * For every coefficient c handled by the loop (block[0] included), with
 * q = qscale * s->inter_matrix[i]:
 *     v = ((2*|c|) * q + q) >> 4      (logical shift, psrlw)
 *     result = c < 0 ? -v : v, and 0 when c == 0
 * All products use pmullw, i.e. only the low 16 bits are kept.
 *
 * While looping, mm7 (initialised with 0xFFFF in its lowest word) is XORed
 * with every stored output quadword. After the loop the four words of mm7
 * are folded together and bit 0 of the fold is XORed into bit 0 of
 * block[63] (the upper word of the 32-bit value at byte offset 124). The
 * effect is that the LSB of block[63] is toggled when the XOR of the LSBs
 * of all processed outputs is 0 -- presumably the MPEG-2 mismatch control.
 *
 * nCoeffs is the index of the last coefficient (63 when s->alternate_scan is
 * set); the loop runs over 8-coefficient (16-byte) chunks while the byte
 * offset in REG_a is <= 0. Operand %3 (-2*nCoeffs) is only read, both to
 * seed REG_a and to address block+124 bytes as 124(%0, %3).
 *
 * NOTE(review): the coefficient count is taken from s->intra_scantable even
 * though this is the inter path -- presumably intentional, confirm.
 *
 * @param s      codec context; reads alternate_scan, block_last_index[],
 *               intra_scantable and inter_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantizer scale
 */
401 | static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, | |
402 | DCTELEM *block, int n, int qscale) | |
403 | { | |
053dea12 | 404 | long nCoeffs; |
d50635cd | 405 | const uint16_t *quant_matrix; |
115329f1 | 406 | |
d50635cd MN |
407 | assert(s->block_last_index[n]>=0); |
408 | ||
409 | if(s->alternate_scan) nCoeffs= 63; //FIXME | |
410 | else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
9dbf1ddd | 411 | |
d7e9533a | 412 | quant_matrix = s->inter_matrix; |
9dbf1ddd | 413 | asm volatile( |
/* Setup: mm7 = 0x000000000000FFFF (parity accumulator seed), mm6 = qscale in all 4 words. */
bb270c08 DB |
414 | "pcmpeqw %%mm7, %%mm7 \n\t" |
415 | "psrlq $48, %%mm7 \n\t" | |
416 | "movd %2, %%mm6 \n\t" | |
417 | "packssdw %%mm6, %%mm6 \n\t" | |
418 | "packssdw %%mm6, %%mm6 \n\t" | |
419 | "mov %3, %%"REG_a" \n\t" | |
4454dc1b | 420 | ASMALIGN(4) |
/* Loop: %0 and %1 point nCoeffs elements past the start of block / quant_matrix. */
bb270c08 DB |
421 | "1: \n\t" |
422 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
423 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
424 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
425 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
426 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
427 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
428 | "pxor %%mm2, %%mm2 \n\t" | |
429 | "pxor %%mm3, %%mm3 \n\t" | |
430 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
431 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
432 | "pxor %%mm2, %%mm0 \n\t" | |
433 | "pxor %%mm3, %%mm1 \n\t" | |
434 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
435 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
436 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
437 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
438 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | |
439 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | |
440 | "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
441 | "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
442 | "pxor %%mm4, %%mm4 \n\t" | |
443 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
444 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
445 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
446 | "psrlw $4, %%mm0 \n\t" | |
447 | "psrlw $4, %%mm1 \n\t" | |
/* Re-apply the original sign (xor with the sign mask, then subtract it). */
448 | "pxor %%mm2, %%mm0 \n\t" | |
449 | "pxor %%mm3, %%mm1 \n\t" | |
450 | "psubw %%mm2, %%mm0 \n\t" | |
451 | "psubw %%mm3, %%mm1 \n\t" | |
/* Keep the result only where the input coefficient was non-zero. */
452 | "pandn %%mm0, %%mm4 \n\t" | |
453 | "pandn %%mm1, %%mm5 \n\t" | |
/* Accumulate the XOR of all output words for the parity fix-up below. */
454 | "pxor %%mm4, %%mm7 \n\t" | |
455 | "pxor %%mm5, %%mm7 \n\t" | |
456 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
457 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
458 | ||
/* Advance 16 bytes (8 coefficients); continue while the offset is still <= 0. */
459 | "add $16, %%"REG_a" \n\t" | |
460 | "jng 1b \n\t" | |
/* Load block[62..63] (byte offset 124 from the start of block). */
461 | "movd 124(%0, %3), %%mm0 \n\t" | |
/* Fold the 4 accumulated words into the lowest word. */
462 | "movq %%mm7, %%mm6 \n\t" | |
463 | "psrlq $32, %%mm7 \n\t" | |
464 | "pxor %%mm6, %%mm7 \n\t" | |
465 | "movq %%mm7, %%mm6 \n\t" | |
466 | "psrlq $16, %%mm7 \n\t" | |
467 | "pxor %%mm6, %%mm7 \n\t" | |
/* Isolate bit 0 of the fold and move it to bit 16, i.e. bit 0 of block[63]. */
468 | "pslld $31, %%mm7 \n\t" | |
469 | "psrlq $15, %%mm7 \n\t" | |
470 | "pxor %%mm7, %%mm0 \n\t" | |
471 | "movd %%mm0, 124(%0, %3) \n\t" | |
472 | ||
473 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) | |
474 | : "%"REG_a, "memory" | |
475 | ); | |
a9b3f630 NK |
476 | } |
477 | ||
115329f1 | 478 | /* draw the edges of width 'w' of an image of size width, height |
3d9fccbf | 479 | this mmx version can only handle w==8 || w==16 */ |
/**
 * Replicate the border pixels of an image plane into the padding around it.
 *
 * Step 1 (left/right): for every row from buf up to buf + wrap*height, the
 * first byte of the row is copied into the w bytes left of the row and the
 * byte at column width-1 into the w bytes right of it. The w == 8 branch
 * writes 8 bytes per side; the other branch writes 16 bytes per side
 * regardless of the actual value of w.
 *
 * Step 2 (top/bottom): for i = 0, 4, 8, ... < w, the already extended first
 * row (starting at buf - w, width + 2*w bytes, 8 bytes at a time) is copied
 * to the four rows -(i+1) .. -(i+4) above the image, and the extended last
 * row likewise to the four rows +(i+1) .. +(i+4) below it. Because the
 * source rows include the left/right padding, the corners are filled too.
 *
 * NOTE(review): none of the asm statements here lists a "memory" clobber
 * although they store through the pointer operands; they rely on
 * "asm volatile" only -- confirm this is sufficient for the compilers used.
 *
 * @param buf    pointer to the top-left pixel of the image area
 * @param wrap   distance in bytes between the starts of consecutive rows
 * @param width  image width in bytes
 * @param height image height in rows
 * @param w      edge width; see the branch note above
 */
0c1a9eda | 480 | static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) |
3d9fccbf | 481 | { |
0c1a9eda | 482 | uint8_t *ptr, *last_line; |
3d9fccbf MN |
483 | int i; |
484 | ||
485 | last_line = buf + (height - 1) * wrap; | |
486 | /* left and right */ | |
487 | ptr = buf; | |
488 | if(w==8) | |
489 | { | |
/* Per row: broadcast byte 0 to the 8 bytes before the row, and byte width-1 to the 8 bytes after it. */
bb270c08 DB |
490 | asm volatile( |
491 | "1: \n\t" | |
492 | "movd (%0), %%mm0 \n\t" | |
493 | "punpcklbw %%mm0, %%mm0 \n\t" | |
494 | "punpcklwd %%mm0, %%mm0 \n\t" | |
495 | "punpckldq %%mm0, %%mm0 \n\t" | |
496 | "movq %%mm0, -8(%0) \n\t" | |
497 | "movq -8(%0, %2), %%mm1 \n\t" | |
498 | "punpckhbw %%mm1, %%mm1 \n\t" | |
499 | "punpckhwd %%mm1, %%mm1 \n\t" | |
500 | "punpckhdq %%mm1, %%mm1 \n\t" | |
501 | "movq %%mm1, (%0, %2) \n\t" | |
502 | "add %1, %0 \n\t" | |
503 | "cmp %3, %0 \n\t" | |
504 | " jb 1b \n\t" | |
505 | : "+r" (ptr) | |
506 | : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) | |
507 | ); | |
3d9fccbf MN |
508 | } |
509 | else | |
510 | { | |
/* Same as above but 16 bytes are written on each side (two movq stores per side). */
bb270c08 DB |
511 | asm volatile( |
512 | "1: \n\t" | |
513 | "movd (%0), %%mm0 \n\t" | |
514 | "punpcklbw %%mm0, %%mm0 \n\t" | |
515 | "punpcklwd %%mm0, %%mm0 \n\t" | |
516 | "punpckldq %%mm0, %%mm0 \n\t" | |
517 | "movq %%mm0, -8(%0) \n\t" | |
518 | "movq %%mm0, -16(%0) \n\t" | |
519 | "movq -8(%0, %2), %%mm1 \n\t" | |
520 | "punpckhbw %%mm1, %%mm1 \n\t" | |
521 | "punpckhwd %%mm1, %%mm1 \n\t" | |
522 | "punpckhdq %%mm1, %%mm1 \n\t" | |
523 | "movq %%mm1, (%0, %2) \n\t" | |
524 | "movq %%mm1, 8(%0, %2) \n\t" | |
525 | "add %1, %0 \n\t" | |
526 | "cmp %3, %0 \n\t" | |
527 | " jb 1b \n\t" | |
528 | : "+r" (ptr) | |
529 | : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) | |
530 | ); | |
3d9fccbf | 531 | } |
115329f1 | 532 | |
3d9fccbf MN |
533 | for(i=0;i<w;i+=4) { |
534 | /* top and bottom (and hopefully also the corners) */ | |
/* ptr = row -(i+1), starting w bytes left of the image; %1 is the distance from ptr back to row 0. */
bb270c08 DB |
535 | ptr= buf - (i + 1) * wrap - w; |
536 | asm volatile( | |
537 | "1: \n\t" | |
538 | "movq (%1, %0), %%mm0 \n\t" | |
539 | "movq %%mm0, (%0) \n\t" | |
540 | "movq %%mm0, (%0, %2) \n\t" | |
541 | "movq %%mm0, (%0, %2, 2) \n\t" | |
542 | "movq %%mm0, (%0, %3) \n\t" | |
543 | "add $8, %0 \n\t" | |
544 | "cmp %4, %0 \n\t" | |
545 | " jb 1b \n\t" | |
546 | : "+r" (ptr) | |
547 | : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w) | |
548 | ); | |
/* Mirror image of the above for the bottom: ptr = row +(i+1) below last_line, strides are positive. */
549 | ptr= last_line + (i + 1) * wrap - w; | |
550 | asm volatile( | |
551 | "1: \n\t" | |
552 | "movq (%1, %0), %%mm0 \n\t" | |
553 | "movq %%mm0, (%0) \n\t" | |
554 | "movq %%mm0, (%0, %2) \n\t" | |
555 | "movq %%mm0, (%0, %2, 2) \n\t" | |
556 | "movq %%mm0, (%0, %3) \n\t" | |
557 | "add $8, %0 \n\t" | |
558 | "cmp %4, %0 \n\t" | |
559 | " jb 1b \n\t" | |
560 | : "+r" (ptr) | |
561 | : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) | |
562 | ); | |
3d9fccbf MN |
563 | } |
564 | } | |
565 | ||
783df5f3 MN |
/**
 * MMX DCT-domain denoiser: shrinks each of the 64 coefficients toward zero
 * and accumulates their magnitudes.
 *
 * For every coefficient c = block[i], with a = |c|:
 *     block[i]  = sign(c) * max(a - offset[i], 0)   (psubusw saturates at 0)
 *     sum[i]   += a                                 (a zero-extended to 32 bits)
 * where sum = s->dct_error_sum[intra] and offset = s->dct_offset[intra],
 * intra = s->mb_intra. s->dct_count[intra] is incremented once per call.
 *
 * Each iteration handles 8 coefficients: 16 bytes of block and offset,
 * 32 bytes of sum. The loop ends when the block pointer reaches block+64.
 *
 * NOTE(review): the asm stores to block[] and sum[] but declares no
 * "memory" clobber -- confirm this is acceptable for the compilers used.
 *
 * @param s     codec context; reads mb_intra, dct_offset; updates
 *              dct_error_sum and dct_count
 * @param block 64-coefficient block, modified in place
 */
566 | static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ | |
567 | const int intra= s->mb_intra; | |
568 | int *sum= s->dct_error_sum[intra]; | |
569 | uint16_t *offset= s->dct_offset[intra]; | |
570 | ||
571 | s->dct_count[intra]++; | |
572 | ||
573 | asm volatile( | |
/* mm7 = 0, used to zero-extend words to dwords. */
bb270c08 DB |
574 | "pxor %%mm7, %%mm7 \n\t" |
575 | "1: \n\t" | |
/* mm0/mm1 = sign masks (0 > c), mm2/mm3 = |c|. */
576 | "pxor %%mm0, %%mm0 \n\t" | |
577 | "pxor %%mm1, %%mm1 \n\t" | |
578 | "movq (%0), %%mm2 \n\t" | |
579 | "movq 8(%0), %%mm3 \n\t" | |
580 | "pcmpgtw %%mm2, %%mm0 \n\t" | |
581 | "pcmpgtw %%mm3, %%mm1 \n\t" | |
582 | "pxor %%mm0, %%mm2 \n\t" | |
583 | "pxor %%mm1, %%mm3 \n\t" | |
584 | "psubw %%mm0, %%mm2 \n\t" | |
585 | "psubw %%mm1, %%mm3 \n\t" | |
/* Keep a copy of |c| in mm4/mm5 for the sum update. */
586 | "movq %%mm2, %%mm4 \n\t" | |
587 | "movq %%mm3, %%mm5 \n\t" | |
/* Saturating subtract of the offset, then restore the sign and store back. */
588 | "psubusw (%2), %%mm2 \n\t" | |
589 | "psubusw 8(%2), %%mm3 \n\t" | |
590 | "pxor %%mm0, %%mm2 \n\t" | |
591 | "pxor %%mm1, %%mm3 \n\t" | |
592 | "psubw %%mm0, %%mm2 \n\t" | |
593 | "psubw %%mm1, %%mm3 \n\t" | |
594 | "movq %%mm2, (%0) \n\t" | |
595 | "movq %%mm3, 8(%0) \n\t" | |
/* Widen the 8 magnitudes to 32 bits and add them to sum[]. */
596 | "movq %%mm4, %%mm2 \n\t" | |
597 | "movq %%mm5, %%mm3 \n\t" | |
598 | "punpcklwd %%mm7, %%mm4 \n\t" | |
599 | "punpckhwd %%mm7, %%mm2 \n\t" | |
600 | "punpcklwd %%mm7, %%mm5 \n\t" | |
601 | "punpckhwd %%mm7, %%mm3 \n\t" | |
602 | "paddd (%1), %%mm4 \n\t" | |
603 | "paddd 8(%1), %%mm2 \n\t" | |
604 | "paddd 16(%1), %%mm5 \n\t" | |
605 | "paddd 24(%1), %%mm3 \n\t" | |
606 | "movq %%mm4, (%1) \n\t" | |
607 | "movq %%mm2, 8(%1) \n\t" | |
608 | "movq %%mm5, 16(%1) \n\t" | |
609 | "movq %%mm3, 24(%1) \n\t" | |
610 | "add $16, %0 \n\t" | |
611 | "add $32, %1 \n\t" | |
612 | "add $16, %2 \n\t" | |
613 | "cmp %3, %0 \n\t" | |
614 | " jb 1b \n\t" | |
783df5f3 MN |
615 | : "+r" (block), "+r" (sum), "+r" (offset) |
616 | : "r"(block+64) | |
617 | ); | |
618 | } | |
619 | ||
c953e797 DB |
/**
 * SSE2 DCT-domain denoiser: same operation as the MMX variant, 16
 * coefficients per iteration.
 *
 * For every coefficient c = block[i], with a = |c|:
 *     block[i]  = sign(c) * max(a - offset[i], 0)   (psubusw saturates at 0)
 *     sum[i]   += a                                 (a zero-extended to 32 bits)
 * where sum = s->dct_error_sum[intra] and offset = s->dct_offset[intra],
 * intra = s->mb_intra. s->dct_count[intra] is incremented once per call.
 *
 * Each iteration handles 32 bytes of block and offset and 64 bytes of sum.
 * The loop ends when the block pointer reaches block+64.
 *
 * Alignment: movdqa and the memory operands of psubusw/paddd require
 * 16-byte aligned addresses, so block, sum and offset must all be 16-byte
 * aligned.
 *
 * NOTE(review): the asm stores to block[] and sum[] but declares no
 * "memory" clobber, and the xmm registers are not listed as clobbers --
 * confirm this is acceptable for the compilers used.
 *
 * @param s     codec context; reads mb_intra, dct_offset; updates
 *              dct_error_sum and dct_count
 * @param block 64-coefficient block, modified in place
 */
620 | static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ | |
621 | const int intra= s->mb_intra; | |
622 | int *sum= s->dct_error_sum[intra]; | |
623 | uint16_t *offset= s->dct_offset[intra]; | |
624 | ||
625 | s->dct_count[intra]++; | |
626 | ||
627 | asm volatile( | |
/* xmm7 = 0, used to zero-extend words to dwords. */
bb270c08 DB |
628 | "pxor %%xmm7, %%xmm7 \n\t" |
629 | "1: \n\t" | |
/* xmm0/xmm1 = sign masks (0 > c), xmm2/xmm3 = |c|. */
630 | "pxor %%xmm0, %%xmm0 \n\t" | |
631 | "pxor %%xmm1, %%xmm1 \n\t" | |
632 | "movdqa (%0), %%xmm2 \n\t" | |
633 | "movdqa 16(%0), %%xmm3 \n\t" | |
634 | "pcmpgtw %%xmm2, %%xmm0 \n\t" | |
635 | "pcmpgtw %%xmm3, %%xmm1 \n\t" | |
636 | "pxor %%xmm0, %%xmm2 \n\t" | |
637 | "pxor %%xmm1, %%xmm3 \n\t" | |
638 | "psubw %%xmm0, %%xmm2 \n\t" | |
639 | "psubw %%xmm1, %%xmm3 \n\t" | |
/* Keep a copy of |c| in xmm4/xmm5 for the sum update. */
640 | "movdqa %%xmm2, %%xmm4 \n\t" | |
641 | "movdqa %%xmm3, %%xmm5 \n\t" | |
/* Saturating subtract of the offset, then restore the sign and store back. */
642 | "psubusw (%2), %%xmm2 \n\t" | |
643 | "psubusw 16(%2), %%xmm3 \n\t" | |
644 | "pxor %%xmm0, %%xmm2 \n\t" | |
645 | "pxor %%xmm1, %%xmm3 \n\t" | |
646 | "psubw %%xmm0, %%xmm2 \n\t" | |
647 | "psubw %%xmm1, %%xmm3 \n\t" | |
648 | "movdqa %%xmm2, (%0) \n\t" | |
649 | "movdqa %%xmm3, 16(%0) \n\t" | |
/* Widen the 16 magnitudes to 32 bits (xmm6 and xmm0 hold the high halves; the sign mask in xmm0 is no longer needed) and add them to sum[]. */
650 | "movdqa %%xmm4, %%xmm6 \n\t" | |
651 | "movdqa %%xmm5, %%xmm0 \n\t" | |
652 | "punpcklwd %%xmm7, %%xmm4 \n\t" | |
653 | "punpckhwd %%xmm7, %%xmm6 \n\t" | |
654 | "punpcklwd %%xmm7, %%xmm5 \n\t" | |
655 | "punpckhwd %%xmm7, %%xmm0 \n\t" | |
656 | "paddd (%1), %%xmm4 \n\t" | |
657 | "paddd 16(%1), %%xmm6 \n\t" | |
658 | "paddd 32(%1), %%xmm5 \n\t" | |
659 | "paddd 48(%1), %%xmm0 \n\t" | |
660 | "movdqa %%xmm4, (%1) \n\t" | |
661 | "movdqa %%xmm6, 16(%1) \n\t" | |
662 | "movdqa %%xmm5, 32(%1) \n\t" | |
663 | "movdqa %%xmm0, 48(%1) \n\t" | |
664 | "add $32, %0 \n\t" | |
665 | "add $64, %1 \n\t" | |
666 | "add $32, %2 \n\t" | |
667 | "cmp %3, %0 \n\t" | |
668 | " jb 1b \n\t" | |
c953e797 DB |
669 | : "+r" (block), "+r" (sum), "+r" (offset) |
670 | : "r"(block+64) | |
671 | ); | |
672 | } | |
673 | ||
ff506a90 LM |
/*
 * Instantiate mpegvideo_mmx_template.c once per instruction-set level.
 * RENAME()/RENAMEl() append a per-level suffix to the names the template
 * defines, and the HAVE_* macros select the code paths inside the template.
 * Presumably the template defines RENAME(dct_quantize), since
 * dct_quantize_MMX/_MMX2/_SSE2/_SSSE3 are referenced further down in this
 * file -- confirm against the template.
 *
 * The order of the #define/#undef lines is significant: each #include sees
 * exactly the HAVE_* set that is active at that point.
 */
/* Remember whether the build has SSSE3, then start from a plain-MMX configuration. */
674 | #ifdef HAVE_SSSE3 |
675 | #define HAVE_SSSE3_BAK | |
676 | #endif | |
677 | #undef HAVE_SSSE3 | |
678 | ||
679 | #undef HAVE_SSE2 | |
/* Pass 1: plain MMX (suffixes _MMX / _mmx). */
2f349de2 MN |
680 | #undef HAVE_MMX2 |
681 | #define RENAME(a) a ## _MMX | |
cf3bf5bb | 682 | #define RENAMEl(a) a ## _mmx |
2f349de2 MN |
683 | #include "mpegvideo_mmx_template.c" |
684 | ||
/* Pass 2: MMX2 (suffixes _MMX2 / _mmx2). */
685 | #define HAVE_MMX2 | |
686 | #undef RENAME | |
821cb11f | 687 | #undef RENAMEl |
2f349de2 | 688 | #define RENAME(a) a ## _MMX2 |
cf3bf5bb | 689 | #define RENAMEl(a) a ## _mmx2 |
2f349de2 | 690 | #include "mpegvideo_mmx_template.c" |
3d9fccbf | 691 | |
/* Pass 3: SSE2 (suffixes _SSE2 / _sse2); HAVE_MMX2 stays defined from pass 2. */
ff506a90 | 692 | #define HAVE_SSE2 |
8fd19ab2 MN |
693 | #undef RENAME |
694 | #undef RENAMEl | |
695 | #define RENAME(a) a ## _SSE2 | |
696 | #define RENAMEl(a) a ## _sse2 | |
697 | #include "mpegvideo_mmx_template.c" | |
698 | ||
/* Pass 4, only if the build had SSSE3: suffix _SSSE3; RENAMEl deliberately or accidentally reuses _sse2 -- NOTE(review): confirm. */
ff506a90 LM |
699 | #ifdef HAVE_SSSE3_BAK |
700 | #define HAVE_SSSE3 | |
701 | #undef RENAME | |
702 | #undef RENAMEl | |
703 | #define RENAME(a) a ## _SSSE3 | |
704 | #define RENAMEl(a) a ## _sse2 | |
705 | #include "mpegvideo_mmx_template.c" | |
706 | #endif | |
707 | ||
/**
 * Install the MMX/SSE2/SSSE3 implementations from this file into the codec
 * context. Does nothing unless mm_flags has MM_MMX set.
 *
 * - The H.263 and MPEG-1 dequantizers and the MPEG-2 inter dequantizer are
 *   always replaced; the MPEG-2 intra dequantizer only when
 *   CODEC_FLAG_BITEXACT is not set in s->flags.
 * - draw_edges (declared outside this view) is pointed at draw_edges_mmx.
 * - s->denoise_dct uses the SSE2 version when MM_SSE2 is set, else MMX.
 * - s->dct_quantize is replaced only when s->avctx->dct_algo is FF_DCT_AUTO
 *   or FF_DCT_MMX, choosing SSSE3 (if compiled in and MM_SSSE3 is set),
 *   then SSE2, then MMX2 (MM_MMXEXT), then plain MMX.
 *
 * @param s codec context whose function pointers are updated
 */
35b2a786 | 708 | void MPV_common_init_mmx(MpegEncContext *s) |
a9b3f630 | 709 | { |
486497e0 | 710 | if (mm_flags & MM_MMX) { |
2ad1516a | 711 | const int dct_algo = s->avctx->dct_algo; |
115329f1 | 712 | |
d50635cd MN |
713 | s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; |
714 | s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
715 | s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
716 | s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
/* The MPEG-2 intra MMX version is skipped when bit-exact output is requested. */
e27b6e62 MN |
717 | if(!(s->flags & CODEC_FLAG_BITEXACT)) |
718 | s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; | |
d50635cd | 719 | s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; |
3bf43d42 | 720 | |
ef5b1b5a | 721 | draw_edges = draw_edges_mmx; |
115329f1 | 722 | |
486497e0 | 723 | if (mm_flags & MM_SSE2) { |
bb270c08 DB |
724 | s->denoise_dct= denoise_dct_sse2; |
725 | } else { | |
726 | s->denoise_dct= denoise_dct_mmx; | |
727 | } | |
2f349de2 | 728 | |
28db7fce | 729 | if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
/* With HAVE_SSSE3 the trailing "else" below binds to the SSE2/MMX2/MMX chain that follows the #endif. */
ff506a90 LM |
730 | #ifdef HAVE_SSSE3 |
731 | if(mm_flags & MM_SSSE3){ | |
732 | s->dct_quantize= dct_quantize_SSSE3; | |
733 | } else | |
734 | #endif | |
486497e0 | 735 | if(mm_flags & MM_SSE2){ |
8fd19ab2 | 736 | s->dct_quantize= dct_quantize_SSE2; |
486497e0 | 737 | } else if(mm_flags & MM_MMXEXT){ |
28db7fce MN |
738 | s->dct_quantize= dct_quantize_MMX2; |
739 | } else { | |
740 | s->dct_quantize= dct_quantize_MMX; | |
741 | } | |
ef5b1b5a | 742 | } |
35b2a786 | 743 | } |
a9b3f630 | 744 | } |