aarch64: vp8: Fix assembling with clang
authorMartin Storsjö <martin@martin.st>
Thu, 31 Jan 2019 21:43:45 +0000 (23:43 +0200)
committerMartin Storsjö <martin@martin.st>
Tue, 19 Feb 2019 09:45:41 +0000 (11:45 +0200)
This also partially fixes assembling with MS armasm64 (via
gas-preprocessor).

The movrel macro invocations need to pass the offset via a separate
parameter. Mach-o and COFF relocations don't allow a negative
offset to a symbol, which is handled properly if the offset is passed
via the parameter. If no offset parameter is given, the macro
evaluates to something like "adrp x17, subpel_filters-16+(0)", which
older clang versions also fail to parse (the older clang versions
only support one single offset term, although it can be a parenthesized
expression).

Signed-off-by: Martin Storsjö <martin@martin.st>
libavcodec/aarch64/vp8dsp_neon.S

index 771877c..be70a68 100644 (file)
@@ -31,10 +31,10 @@ function ff_vp8_idct_add_neon, export=1
         movk            w4,  #35468/2, lsl 16
         dup             v4.2s, w4
 
-        smull           v26.4s, v1.4h,  v4.4h[0]
-        smull           v27.4s, v3.4h,  v4.4h[0]
-        sqdmulh         v20.4h, v1.4h,  v4.4h[1]
-        sqdmulh         v23.4h, v3.4h,  v4.4h[1]
+        smull           v26.4s, v1.4h,  v4.h[0]
+        smull           v27.4s, v3.4h,  v4.h[0]
+        sqdmulh         v20.4h, v1.4h,  v4.h[1]
+        sqdmulh         v23.4h, v3.4h,  v4.h[1]
         sqshrn          v21.4h, v26.4s, #16
         sqshrn          v22.4h, v27.4s, #16
         add             v21.4h, v21.4h, v1.4h
@@ -54,12 +54,12 @@ function ff_vp8_idct_add_neon, export=1
         transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7
 
         movi            v29.8h, #0
-        smull           v26.4s,     v1.4h,  v4.4h[0]
+        smull           v26.4s,     v1.4h,  v4.h[0]
         st1             {v29.8h},   [x1],   #16
-        smull           v27.4s,     v3.4h,  v4.4h[0]
+        smull           v27.4s,     v3.4h,  v4.h[0]
         st1             {v29.16b},  [x1]
-        sqdmulh         v21.4h,     v1.4h,  v4.4h[1]
-        sqdmulh         v23.4h,     v3.4h,  v4.4h[1]
+        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
+        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
         sqshrn          v20.4h,     v26.4s, #16
         sqshrn          v22.4h,     v27.4s, #16
         add             v20.4h,     v20.4h, v1.4h
@@ -469,7 +469,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
         ld1             {v6.d}[1], [x0], x1
         ld1             {v7.d}[1], [x0], x1
 
-        transpose_8x16b   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         dup             v22.16b, w2                 // flim_E
     .if !\simple
@@ -480,7 +480,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
 
         sub             x0,  x0,  x1, lsl #4    // backup 16 rows
 
-        transpose_8x16b   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         // Store pixels:
         st1             {v0.d}[0], [x0], x1
@@ -531,7 +531,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
         ld1          {v7.d}[0],     [x0], x2
         ld1          {v7.d}[1],     [x1], x2
 
-        transpose_8x16b   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         dup             v22.16b, w3                 // flim_E
         dup             v23.16b, w4                 // flim_I
@@ -541,7 +541,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
         sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
         sub             x1,  x1,  x2, lsl #3    // backup v 8 rows
 
-        transpose_8x16b   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         // Store pixels:
         st1          {v0.d}[0],     [x0], x2 // load u
@@ -613,13 +613,13 @@ endfunc
         uxtl            v22.8h, v24.8b
         ext             v26.8b, \s0\().8b,  \s1\().8b,  #5
         uxtl            v25.8h, v25.8b
-        mul             v21.8h, v21.8h, v0.8h[2]
+        mul             v21.8h, v21.8h, v0.h[2]
         uxtl            v26.8h, v26.8b
-        mul             v22.8h, v22.8h, v0.8h[3]
-        mls             v21.8h, v19.8h, v0.8h[1]
-        mls             v22.8h, v25.8h, v0.8h[4]
-        mla             v21.8h, v18.8h, v0.8h[0]
-        mla             v22.8h, v26.8h, v0.8h[5]
+        mul             v22.8h, v22.8h, v0.h[3]
+        mls             v21.8h, v19.8h, v0.h[1]
+        mls             v22.8h, v25.8h, v0.h[4]
+        mla             v21.8h, v18.8h, v0.h[0]
+        mla             v22.8h, v26.8h, v0.h[5]
         sqadd           v22.8h, v21.8h, v22.8h
         sqrshrun        \d\().8b, v22.8h, #7
 .endm
@@ -640,20 +640,20 @@ endfunc
         uxtl2           v2.8h,   v2.16b
         uxtl            v17.8h,  v16.8b
         uxtl2           v16.8h,  v16.16b
-        mul             v19.8h,  v19.8h, v0.8h[3]
-        mul             v18.8h,  v18.8h, v0.8h[2]
-        mul             v3.8h,   v3.8h,  v0.8h[2]
-        mul             v22.8h,  v22.8h, v0.8h[3]
-        mls             v19.8h,  v20.8h, v0.8h[4]
+        mul             v19.8h,  v19.8h, v0.h[3]
+        mul             v18.8h,  v18.8h, v0.h[2]
+        mul             v3.8h,   v3.8h,  v0.h[2]
+        mul             v22.8h,  v22.8h, v0.h[3]
+        mls             v19.8h,  v20.8h, v0.h[4]
         uxtl            v20.8h,  \v0\().8b
         uxtl2           v1.8h,   \v0\().16b
-        mls             v18.8h,  v17.8h, v0.8h[1]
-        mls             v3.8h,   v16.8h, v0.8h[1]
-        mls             v22.8h,  v23.8h, v0.8h[4]
-        mla             v18.8h,  v20.8h, v0.8h[0]
-        mla             v19.8h,  v21.8h, v0.8h[5]
-        mla             v3.8h,   v1.8h,  v0.8h[0]
-        mla             v22.8h,  v2.8h,  v0.8h[5]
+        mls             v18.8h,  v17.8h, v0.h[1]
+        mls             v3.8h,   v16.8h, v0.h[1]
+        mls             v22.8h,  v23.8h, v0.h[4]
+        mla             v18.8h,  v20.8h, v0.h[0]
+        mla             v19.8h,  v21.8h, v0.h[5]
+        mla             v3.8h,   v1.8h,  v0.h[0]
+        mla             v22.8h,  v2.8h,  v0.h[5]
         sqadd           v19.8h,  v18.8h, v19.8h
         sqadd           v22.8h,  v3.8h,  v22.8h
         sqrshrun        \d0\().8b,  v19.8h, #7
@@ -667,12 +667,12 @@ endfunc
         uxtl            \s4\().8h, \s4\().8b
         uxtl            \s0\().8h, \s0\().8b
         uxtl            \s5\().8h, \s5\().8b
-        mul             \s2\().8h, \s2\().8h, v0.8h[2]
-        mul             \s3\().8h, \s3\().8h, v0.8h[3]
-        mls             \s2\().8h, \s1\().8h, v0.8h[1]
-        mls             \s3\().8h, \s4\().8h, v0.8h[4]
-        mla             \s2\().8h, \s0\().8h, v0.8h[0]
-        mla             \s3\().8h, \s5\().8h, v0.8h[5]
+        mul             \s2\().8h, \s2\().8h, v0.h[2]
+        mul             \s3\().8h, \s3\().8h, v0.h[3]
+        mls             \s2\().8h, \s1\().8h, v0.h[1]
+        mls             \s3\().8h, \s4\().8h, v0.h[4]
+        mla             \s2\().8h, \s0\().8h, v0.h[0]
+        mla             \s3\().8h, \s5\().8h, v0.h[5]
         sqadd           \s3\().8h, \s2\().8h, \s3\().8h
         sqrshrun        \d0\().8b, \s3\().8h, #7
 .endm
@@ -685,20 +685,20 @@ endfunc
         uxtl            \s4\().8h, \s4\().8b
         uxtl            \s2\().8h, \s2\().8b
         uxtl            \s5\().8h, \s5\().8b
-        mul             \s0\().8h, \s0\().8h, v0.8h[0]
-        mul             v31.8h   , \s3\().8h, v0.8h[3]
-        mul             \s3\().8h, \s3\().8h, v0.8h[2]
-        mul             \s6\().8h, \s6\().8h, v0.8h[5]
-
-        mls             \s0\().8h, \s1\().8h, v0.8h[1]
-        mls             v31.8h   , \s4\().8h, v0.8h[4]
-        mls             \s3\().8h, \s2\().8h, v0.8h[1]
-        mls             \s6\().8h, \s5\().8h, v0.8h[4]
-
-        mla             \s0\().8h, \s2\().8h, v0.8h[2]
-        mla             v31.8h   , \s5\().8h, v0.8h[5]
-        mla             \s3\().8h, \s1\().8h, v0.8h[0]
-        mla             \s6\().8h, \s4\().8h, v0.8h[3]
+        mul             \s0\().8h, \s0\().8h, v0.h[0]
+        mul             v31.8h   , \s3\().8h, v0.h[3]
+        mul             \s3\().8h, \s3\().8h, v0.h[2]
+        mul             \s6\().8h, \s6\().8h, v0.h[5]
+
+        mls             \s0\().8h, \s1\().8h, v0.h[1]
+        mls             v31.8h   , \s4\().8h, v0.h[4]
+        mls             \s3\().8h, \s2\().8h, v0.h[1]
+        mls             \s6\().8h, \s5\().8h, v0.h[4]
+
+        mla             \s0\().8h, \s2\().8h, v0.h[2]
+        mla             v31.8h   , \s5\().8h, v0.h[5]
+        mla             \s3\().8h, \s1\().8h, v0.h[0]
+        mla             \s6\().8h, \s4\().8h, v0.h[3]
         sqadd           v31.8h   , \s0\().8h, v31.8h
         sqadd           \s6\().8h, \s3\().8h, \s6\().8h
         sqrshrun        \d0\().8b, v31.8h,    #7
@@ -713,10 +713,10 @@ endfunc
         ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
         uxtl            v22.8h, v23.8b
         uxtl            v25.8h, v25.8b
-        mul             v20.8h, v20.8h, v0.8h[2]
-        mul             v22.8h, v22.8h, v0.8h[3]
-        mls             v20.8h, v19.8h, v0.8h[1]
-        mls             v22.8h, v25.8h, v0.8h[4]
+        mul             v20.8h, v20.8h, v0.h[2]
+        mul             v22.8h, v22.8h, v0.h[3]
+        mls             v20.8h, v19.8h, v0.h[1]
+        mls             v22.8h, v25.8h, v0.h[4]
         sqadd           v22.8h, v20.8h, v22.8h
         sqrshrun        \d\().8b, v22.8h, #7
 .endm
@@ -727,14 +727,14 @@ endfunc
         uxtl            \s2\().8h,  \s2\().8b
         uxtl            \s3\().8h,  \s3\().8b
         uxtl            \s4\().8h,  \s4\().8b
-        mul             v21.8h,     \s1\().8h, v0.8h[2]
-        mul             v23.8h,     \s2\().8h, v0.8h[3]
-        mul             \s2\().8h,  \s2\().8h, v0.8h[2]
-        mul             v22.8h,     \s3\().8h, v0.8h[3]
-        mls             v21.8h,     \s0\().8h, v0.8h[1]
-        mls             v23.8h,     \s3\().8h, v0.8h[4]
-        mls             \s2\().8h,  \s1\().8h, v0.8h[1]
-        mls             v22.8h,     \s4\().8h, v0.8h[4]
+        mul             v21.8h,     \s1\().8h, v0.h[2]
+        mul             v23.8h,     \s2\().8h, v0.h[3]
+        mul             \s2\().8h,  \s2\().8h, v0.h[2]
+        mul             v22.8h,     \s3\().8h, v0.h[3]
+        mls             v21.8h,     \s0\().8h, v0.h[1]
+        mls             v23.8h,     \s3\().8h, v0.h[4]
+        mls             \s2\().8h,  \s1\().8h, v0.h[1]
+        mls             v22.8h,     \s4\().8h, v0.h[4]
         sqadd           v21.8h,     v21.8h,    v23.8h
         sqadd           \s2\().8h,  \s2\().8h, v22.8h
         sqrshrun        \d0\().8b,  v21.8h,    #7
@@ -759,7 +759,7 @@ function ff_put_vp8_epel16_v6_neon, export=1
 
         sxtw            x4,  w4
         sxtw            x6,  w6
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         add             x6,  x17,  x6, lsl #4  // y
         ld1             {v0.8h},     [x6]
 1:
@@ -788,7 +788,7 @@ function ff_put_vp8_epel16_h6_neon, export=1
         sxtw            x5,  w5 // x
 
         // first pass (horizontal):
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         add             x5,  x17,  x5, lsl #4 // x
         ld1             {v0.8h},  [x5]
 1:
@@ -807,7 +807,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
         sub             x2,  x2,  #2
 
         // first pass (horizontal):
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         sxtw            x5,  w5 // x
         add             x16,  x17,  x5, lsl #4 // x
         sub             sp,  sp,  #336+16
@@ -854,7 +854,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1
         sxtw            x4,  w4
 
         // first pass (horizontal):
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         sxtw            x5,  w5
         add             x5,  x17,  x5, lsl #4 // x
         sub             sp,  sp,  #168+16
@@ -900,7 +900,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1
         sxtw            x4,  w4
 
         // first pass (horizontal):
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         sxtw            x5,  w5
         add             x5,  x17,  x5, lsl #4 // x
         sub             sp,  sp,  #168+16
@@ -947,7 +947,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1
 
 
         // first pass (horizontal):
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         sxtw            x5,  w5
         add             x5,  x17,  x5, lsl #4 // x
         sub             sp,  sp,  #168+16
@@ -992,7 +992,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1
 
 
         // first pass (horizontal):
-        movrel          x17,  subpel_filters-16
+        movrel          x17,  subpel_filters, -16
         sxtw            x5,  w5
         add             x5,  x17,  x5, lsl #4 // x
         sub             sp,  sp,  #168+16