;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

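; pshufb mask that reverses the byte order within each 32-bit lane
; (used by the SSSE3 bswap32_buf below)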
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
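;
; A scalar C sketch of what this routine computes (for reference only,
; not part of the build):
;
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum;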
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
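; each pmaddwd below forms mmsize/2 signed 16-bit products and sums adjacent
; pairs into 32-bit lanes; the lanes are accumulated across the loop and
; reduced to a single sum at the end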
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
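;
; A scalar C sketch of the operation (for reference only, not part of the
; build; av_clip() is the libavutil helper):
;
;     for (unsigned int i = 0; i < len; i++)
;         dst[i] = av_clip(src[i], min, max);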
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = 1 if the CLIPD implementation takes min/max as floats rather than ints
;      (as CLIPD_SSE2 does)
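; %5 = optional suffix appended to the function name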
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova     m0,  [srcq+mmsize*0*%%i]
    mova     m1,  [srcq+mmsize*1*%%i]
    mova     m2,  [srcq+mmsize*2*%%i]
    mova     m3,  [srcq+mmsize*3*%%i]
    mova     m7,  [srcq+mmsize*4*%%i]
    mova     m8,  [srcq+mmsize*5*%%i]
    mova     m9,  [srcq+mmsize*6*%%i]
    mova     m10, [srcq+mmsize*7*%%i]
    CLIPD    m10, m4, m5, m6
    mova     [dstq+mmsize*0*%%i], m0
    mova     [dstq+mmsize*1*%%i], m1
    mova     [dstq+mmsize*2*%%i], m2
    mova     [dstq+mmsize*3*%%i], m3
    mova     [dstq+mmsize*4*%%i], m7
    mova     [dstq+mmsize*5*%%i], m8
    mova     [dstq+mmsize*6*%%i], m9
    mova     [dstq+mmsize*7*%%i], m10
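; the pointers advance by the number of bytes handled per iteration, while
; lend counts int32 elements, so it drops by a quarter of that amount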
    add      srcq, mmsize*4*(%2+%3)
    add      dstq, mmsize*4*(%2+%3)
    sub      lend, mmsize*(%2+%3)

%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0

; %1 = aligned/unaligned
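; the 10110001b immediate reorders the 16-bit words of each dword as 2,3,0,1,
; i.e. it swaps the two words of every dword; the bytes within each word are
; swapped in a separate step, completing the 32-bit byteswap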
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b

    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
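;
; A scalar C sketch of the operation (for reference only, not part of the
; build; av_bswap32() is the libavutil helper):
;
;     for (int i = 0; i < w; i++)
;         dst[i] = av_bswap32(src[i]);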
cglobal bswap32_buf, 3,4,3
    mova     m2, [pb_bswap32]
cglobal bswap32_buf, 3,4,5