aarch64: vp9itxfm: Restructure the idct32 store macros
author: Martin Storsjö <martin@martin.st>
Thu, 1 Dec 2016 09:10:19 +0000 (11:10 +0200)
committer: Martin Storsjö <martin@martin.st>
Sun, 5 Feb 2017 11:05:32 +0000 (13:05 +0200)
This avoids concatenation, which can't be used if the whole macro
is wrapped within another macro.

This is also arguably more readable.

Signed-off-by: Martin Storsjö <martin@martin.st>
libavcodec/aarch64/vp9itxfm_neon.S

index 7ce6df0..c14c5f9 100644 (file)
@@ -935,23 +935,23 @@ function idct32_1d_8x32_pass1_neon
 .macro store_rev a, b
         // There's no rev128 instruction, but we reverse each 64 bit
         // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v1.8h, v\b\().8h
-        st1             {v\a\().8h},  [x0], #16
-        rev64           v0.8h, v\a\().8h
+        rev64           v1.8h, \b
+        st1             {\a},  [x0], #16
+        rev64           v0.8h, \a
         ext             v1.16b, v1.16b, v1.16b, #8
-        st1             {v\b\().8h},  [x0], #16
+        st1             {\b},  [x0], #16
         ext             v0.16b, v0.16b, v0.16b, #8
         st1             {v1.8h},  [x0], #16
         st1             {v0.8h},  [x0], #16
 .endm
-        store_rev       16, 24
-        store_rev       17, 25
-        store_rev       18, 26
-        store_rev       19, 27
-        store_rev       20, 28
-        store_rev       21, 29
-        store_rev       22, 30
-        store_rev       23, 31
+        store_rev       v16.8h, v24.8h
+        store_rev       v17.8h, v25.8h
+        store_rev       v18.8h, v26.8h
+        store_rev       v19.8h, v27.8h
+        store_rev       v20.8h, v28.8h
+        store_rev       v21.8h, v29.8h
+        store_rev       v22.8h, v30.8h
+        store_rev       v23.8h, v31.8h
         sub             x0,  x0,  #512
 .purgem store_rev
 
@@ -977,14 +977,14 @@ function idct32_1d_8x32_pass1_neon
         // subtracted from the output.
 .macro store_rev a, b
         ld1             {v4.8h},  [x0]
-        rev64           v1.8h, v\b\().8h
-        add             v4.8h, v4.8h, v\a\().8h
-        rev64           v0.8h, v\a\().8h
+        rev64           v1.8h, \b
+        add             v4.8h, v4.8h, \a
+        rev64           v0.8h, \a
         st1             {v4.8h},  [x0], #16
         ext             v1.16b, v1.16b, v1.16b, #8
         ld1             {v5.8h},  [x0]
         ext             v0.16b, v0.16b, v0.16b, #8
-        add             v5.8h, v5.8h, v\b\().8h
+        add             v5.8h, v5.8h, \b
         st1             {v5.8h},  [x0], #16
         ld1             {v6.8h},  [x0]
         sub             v6.8h, v6.8h, v1.8h
@@ -994,14 +994,14 @@ function idct32_1d_8x32_pass1_neon
         st1             {v7.8h},  [x0], #16
 .endm
 
-        store_rev       31, 23
-        store_rev       30, 22
-        store_rev       29, 21
-        store_rev       28, 20
-        store_rev       27, 19
-        store_rev       26, 18
-        store_rev       25, 17
-        store_rev       24, 16
+        store_rev       v31.8h, v23.8h
+        store_rev       v30.8h, v22.8h
+        store_rev       v29.8h, v21.8h
+        store_rev       v28.8h, v20.8h
+        store_rev       v27.8h, v19.8h
+        store_rev       v26.8h, v18.8h
+        store_rev       v25.8h, v17.8h
+        store_rev       v24.8h, v16.8h
 .purgem store_rev
         ret
 endfunc
@@ -1047,21 +1047,21 @@ function idct32_1d_8x32_pass2_neon
 .if \neg == 0
         ld1             {v4.8h},  [x2], x9
         ld1             {v5.8h},  [x2], x9
-        add             v4.8h, v4.8h, v\a\().8h
+        add             v4.8h, v4.8h, \a
         ld1             {v6.8h},  [x2], x9
-        add             v5.8h, v5.8h, v\b\().8h
+        add             v5.8h, v5.8h, \b
         ld1             {v7.8h},  [x2], x9
-        add             v6.8h, v6.8h, v\c\().8h
-        add             v7.8h, v7.8h, v\d\().8h
+        add             v6.8h, v6.8h, \c
+        add             v7.8h, v7.8h, \d
 .else
         ld1             {v4.8h},  [x2], x7
         ld1             {v5.8h},  [x2], x7
-        sub             v4.8h, v4.8h, v\a\().8h
+        sub             v4.8h, v4.8h, \a
         ld1             {v6.8h},  [x2], x7
-        sub             v5.8h, v5.8h, v\b\().8h
+        sub             v5.8h, v5.8h, \b
         ld1             {v7.8h},  [x2], x7
-        sub             v6.8h, v6.8h, v\c\().8h
-        sub             v7.8h, v7.8h, v\d\().8h
+        sub             v6.8h, v6.8h, \c
+        sub             v7.8h, v7.8h, \d
 .endif
         ld1             {v0.8b}, [x0], x1
         ld1             {v1.8b}, [x0], x1
@@ -1085,15 +1085,15 @@ function idct32_1d_8x32_pass2_neon
         st1             {v6.8b}, [x0], x1
         st1             {v7.8b}, [x0], x1
 .endm
-        load_acc_store  31, 30, 29, 28
-        load_acc_store  27, 26, 25, 24
-        load_acc_store  23, 22, 21, 20
-        load_acc_store  19, 18, 17, 16
+        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
+        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
+        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
+        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
         sub             x2,  x2,  x9
-        load_acc_store  16, 17, 18, 19, 1
-        load_acc_store  20, 21, 22, 23, 1
-        load_acc_store  24, 25, 26, 27, 1
-        load_acc_store  28, 29, 30, 31, 1
+        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
+        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
+        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
+        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
 .purgem load_acc_store
         ret
 endfunc