+ Compute partial IDCT of half row.
+ shift = left-shift amount
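+ a3 = row[2,0] (row[2] in the top halfword, row[0] in the bottom)
+ a4 = row[3,1] (row[3] in the top halfword, row[1] in the bottom)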
+
+ Output in registers v1--v8 (v8 is written as fp); clobbers a2, a3, ip, lr
+*/
+ .macro idct_row4 shift
+ ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
+ ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */
+ ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */
+ mov a2, #(1<<(\shift-1)) /* a2 = 1 << (shift-1), accumulated into v1-v4 below */
+ smlad v1, a3, ip, a2 /* v1 = W4*row[0] + W2*row[2] + a2 */
+ smlsd v4, a3, ip, a2 /* v4 = W4*row[0] - W2*row[2] + a2 */
+ ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
+ smlad v2, a3, lr, a2 /* v2 = W4*row[0] + W6*row[2] + a2 */
+ smlsd v3, a3, lr, a2 /* v3 = W4*row[0] - W6*row[2] + a2 */
+ smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */
+ smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */
+ pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16); the row[2,0] input in a3 is overwritten */
+ pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16); the 1 << (shift-1) term is no longer needed */
+ smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */
+ smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */
+ .endm
+
+/*