Commit | Line | Data |
---|---|---|
1e98dffb NK |
1 | /* |
2 | * Alpha optimized DSP utils | |
3 | * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |
4 | * | |
ff4ec49e FB |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public | |
7 | * License as published by the Free Software Foundation; either | |
8 | * version 2 of the License, or (at your option) any later version. | |
1e98dffb | 9 | * |
ff4ec49e | 10 | * This library is distributed in the hope that it will be useful, |
1e98dffb | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ff4ec49e FB |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. | |
1e98dffb | 14 | * |
ff4ec49e FB |
15 | * You should have received a copy of the GNU Lesser General Public |
16 | * License along with this library; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1e98dffb NK |
18 | */ |
19 | ||
20 | #include "asm.h" | |
21 | #include "../dsputil.h" | |
22 | #include "../mpegvideo.h" | |
23 | ||
2393e654 | 24 | static void dct_unquantize_h263_intra_axp(MpegEncContext *s, DCTELEM *block, |
e0580f8c | 25 | int n, int qscale) |
1e98dffb | 26 | { |
e0580f8c FH |
27 | int i, n_coeffs; |
28 | uint64_t qmul, qadd; | |
29 | uint64_t correction; | |
30 | DCTELEM *orig_block = block; | |
7e491fa5 | 31 | DCTELEM block0; /* might not be used uninitialized */ |
db42e13b | 32 | |
acd2e05c FH |
33 | qadd = WORD_VEC((qscale - 1) | 1); |
34 | qmul = qscale << 1; | |
35 | /* This mask kills spill from negative subwords to the next subword. */ | |
36 | correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */ | |
37 | ||
2393e654 MN |
38 | if (!s->h263_aic) { |
39 | if (n < 4) | |
40 | block0 = block[0] * s->y_dc_scale; | |
41 | else | |
42 | block0 = block[0] * s->c_dc_scale; | |
1e98dffb | 43 | } else { |
2393e654 | 44 | qadd = 0; |
1e98dffb | 45 | } |
2393e654 | 46 | n_coeffs = 63; // does not always use zigzag table |
e0580f8c | 47 | |
acd2e05c | 48 | for(i = 0; i <= n_coeffs; block += 4, i += 4) { |
e0580f8c FH |
49 | uint64_t levels, negmask, zeros, add; |
50 | ||
51 | levels = ldq(block); | |
52 | if (levels == 0) | |
53 | continue; | |
54 | ||
2dbe7ec8 FH |
55 | #ifdef __alpha_max__ |
56 | /* I don't think the speed difference justifies runtime | |
57 | detection. */ | |
e0580f8c FH |
58 | negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */ |
59 | negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */ | |
2dbe7ec8 FH |
60 | #else |
61 | negmask = cmpbge(WORD_VEC(0x7fff), levels); | |
62 | negmask &= (negmask >> 1) | (1 << 7); | |
63 | negmask = zap(-1, negmask); | |
64 | #endif | |
e0580f8c FH |
65 | |
66 | zeros = cmpbge(0, levels); | |
67 | zeros &= zeros >> 1; | |
68 | /* zeros |= zeros << 1 is not needed since qadd <= 255, so | |
69 | zapping the lower byte suffices. */ | |
70 | ||
71 | levels *= qmul; | |
72 | levels -= correction & (negmask << 16); | |
73 | ||
74 | /* Negate qadd for negative levels. */ | |
75 | add = qadd ^ negmask; | |
76 | add += WORD_VEC(0x0001) & negmask; | |
77 | /* Set qadd to 0 for levels == 0. */ | |
78 | add = zap(add, zeros); | |
79 | ||
80 | levels += add; | |
81 | ||
82 | stq(levels, block); | |
83 | } | |
84 | ||
85 | if (s->mb_intra && !s->h263_aic) | |
86 | orig_block[0] = block0; | |
1e98dffb NK |
87 | } |
88 | ||
2393e654 MN |
89 | static void dct_unquantize_h263_inter_axp(MpegEncContext *s, DCTELEM *block, |
90 | int n, int qscale) | |
91 | { | |
92 | int i, n_coeffs; | |
93 | uint64_t qmul, qadd; | |
94 | uint64_t correction; | |
2393e654 MN |
95 | |
96 | qadd = WORD_VEC((qscale - 1) | 1); | |
97 | qmul = qscale << 1; | |
98 | /* This mask kills spill from negative subwords to the next subword. */ | |
99 | correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */ | |
100 | ||
101 | n_coeffs = s->intra_scantable.raster_end[s->block_last_index[n]]; | |
102 | ||
103 | for(i = 0; i <= n_coeffs; block += 4, i += 4) { | |
104 | uint64_t levels, negmask, zeros, add; | |
105 | ||
106 | levels = ldq(block); | |
107 | if (levels == 0) | |
108 | continue; | |
109 | ||
110 | #ifdef __alpha_max__ | |
111 | /* I don't think the speed difference justifies runtime | |
112 | detection. */ | |
113 | negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */ | |
114 | negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */ | |
115 | #else | |
116 | negmask = cmpbge(WORD_VEC(0x7fff), levels); | |
117 | negmask &= (negmask >> 1) | (1 << 7); | |
118 | negmask = zap(-1, negmask); | |
119 | #endif | |
120 | ||
121 | zeros = cmpbge(0, levels); | |
122 | zeros &= zeros >> 1; | |
123 | /* zeros |= zeros << 1 is not needed since qadd <= 255, so | |
124 | zapping the lower byte suffices. */ | |
125 | ||
126 | levels *= qmul; | |
127 | levels -= correction & (negmask << 16); | |
128 | ||
129 | /* Negate qadd for negative levels. */ | |
130 | add = qadd ^ negmask; | |
131 | add += WORD_VEC(0x0001) & negmask; | |
132 | /* Set qadd to 0 for levels == 0. */ | |
133 | add = zap(add, zeros); | |
134 | ||
135 | levels += add; | |
136 | ||
137 | stq(levels, block); | |
138 | } | |
139 | } | |
140 | ||
1e98dffb NK |
141 | void MPV_common_init_axp(MpegEncContext *s) |
142 | { | |
2393e654 MN |
143 | s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_axp; |
144 | s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_axp; | |
1e98dffb | 145 | } |