aarch64: vp8: Port vp8_luma_dc_wht and vp8_idct_dc_add4uv from arm version
authorMartin Storsjö <martin@martin.st>
Fri, 1 Feb 2019 08:04:56 +0000 (10:04 +0200)
committerMartin Storsjö <martin@martin.st>
Tue, 19 Feb 2019 09:46:04 +0000 (11:46 +0200)
                     Cortex A53    A72    A73
vp8_luma_dc_wht_c:        115.7   75.7   90.7
vp8_luma_dc_wht_neon:      60.7   41.2   45.7
vp8_idct_dc_add4uv_c:     376.1  262.9  282.5
vp8_idct_dc_add4uv_neon:   52.0   29.0   37.0

Signed-off-by: Martin Storsjö <martin@martin.st>
libavcodec/aarch64/vp8dsp_init_aarch64.c
libavcodec/aarch64/vp8dsp_neon.S

index 20bb607..e296ccd 100644 (file)
@@ -28,6 +28,7 @@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
 
 VP8_LF(neon);
 
@@ -55,10 +56,12 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
 {
     if (!have_neon(av_get_cpu_flags()))
         return;
+    dsp->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_neon;
 
     dsp->vp8_idct_add       = ff_vp8_idct_add_neon;
     dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_neon;
     dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_neon;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
 
     dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
     dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
index 2b5b049..4ea62c0 100644 (file)
@@ -4,6 +4,7 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
  *
  * This file is part of Libav.
  *
 #include "libavutil/aarch64/asm.S"
 #include "neon.S"
 
+function ff_vp8_luma_dc_wht_neon, export=1
+        ld1             {v0.4h - v3.4h}, [x1]
+        movi            v30.8h, #0
+
+        add             v4.4h,  v0.4h,  v3.4h
+        add             v6.4h,  v1.4h,  v2.4h
+        st1             {v30.8h}, [x1], #16
+        sub             v7.4h,  v1.4h,  v2.4h
+        sub             v5.4h,  v0.4h,  v3.4h
+        st1             {v30.8h}, [x1]
+        add             v0.4h,  v4.4h,  v6.4h
+        add             v1.4h,  v5.4h,  v7.4h
+        sub             v2.4h,  v4.4h,  v6.4h
+        sub             v3.4h,  v5.4h,  v7.4h
+
+        movi            v16.4h, #3
+
+        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
+
+        add             v0.4h,  v0.4h,  v16.4h
+
+        add             v4.4h,  v0.4h,  v3.4h
+        add             v6.4h,  v1.4h,  v2.4h
+        sub             v7.4h,  v1.4h,  v2.4h
+        sub             v5.4h,  v0.4h,  v3.4h
+        add             v0.4h,  v4.4h,  v6.4h
+        add             v1.4h,  v5.4h,  v7.4h
+        sub             v2.4h,  v4.4h,  v6.4h
+        sub             v3.4h,  v5.4h,  v7.4h
+
+        sshr            v0.4h,  v0.4h,  #3
+        sshr            v1.4h,  v1.4h,  #3
+        sshr            v2.4h,  v2.4h,  #3
+        sshr            v3.4h,  v3.4h,  #3
+
+        mov             x3,  #32
+        st1             {v0.h}[0],  [x0], x3
+        st1             {v1.h}[0],  [x0], x3
+        st1             {v2.h}[0],  [x0], x3
+        st1             {v3.h}[0],  [x0], x3
+        st1             {v0.h}[1],  [x0], x3
+        st1             {v1.h}[1],  [x0], x3
+        st1             {v2.h}[1],  [x0], x3
+        st1             {v3.h}[1],  [x0], x3
+        st1             {v0.h}[2],  [x0], x3
+        st1             {v1.h}[2],  [x0], x3
+        st1             {v2.h}[2],  [x0], x3
+        st1             {v3.h}[2],  [x0], x3
+        st1             {v0.h}[3],  [x0], x3
+        st1             {v1.h}[3],  [x0], x3
+        st1             {v2.h}[3],  [x0], x3
+        st1             {v3.h}[3],  [x0], x3
+
+        ret
+endfunc
+
 function ff_vp8_idct_add_neon, export=1
         ld1             {v0.8b - v3.8b},  [x1]
         mov             w4,  #20091
@@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1
         ret
 endfunc
 
+function ff_vp8_idct_dc_add4uv_neon, export=1
+        movi            v0.4h,  #0
+        mov             x3,     #32
+        ld1r            {v16.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v17.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v18.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v19.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ins             v16.d[1],  v17.d[0]
+        ins             v18.d[1],  v19.d[0]
+        mov             x3,  x0
+        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
+        ld1             {v0.8b},   [x0], x2
+        srshr           v18.8h,    v18.8h,  #3
+        ld1             {v1.8b},   [x0], x2
+        uaddw           v20.8h,    v16.8h, v0.8b
+        ld1             {v2.8b},   [x0], x2
+        uaddw           v0.8h,     v16.8h, v1.8b
+        ld1             {v3.8b},   [x0], x2
+        uaddw           v22.8h,    v16.8h, v2.8b
+        ld1             {v4.8b},   [x0], x2
+        uaddw           v2.8h,     v16.8h, v3.8b
+        ld1             {v5.8b},   [x0], x2
+        uaddw           v24.8h,    v18.8h, v4.8b
+        ld1             {v6.8b},   [x0], x2
+        uaddw           v4.8h,     v18.8h, v5.8b
+        ld1             {v7.8b},   [x0], x2
+        uaddw           v26.8h,    v18.8h, v6.8b
+        sqxtun          v20.8b,    v20.8h
+        uaddw           v6.8h,     v18.8h, v7.8b
+        sqxtun          v21.8b,    v0.8h
+        sqxtun          v22.8b,    v22.8h
+        st1             {v20.8b},  [x3], x2
+        sqxtun          v23.8b,    v2.8h
+        st1             {v21.8b},  [x3], x2
+        sqxtun          v24.8b,    v24.8h
+        st1             {v22.8b},  [x3], x2
+        sqxtun          v25.8b,    v4.8h
+        st1             {v23.8b},  [x3], x2
+        sqxtun          v26.8b,    v26.8h
+        st1             {v24.8b},  [x3], x2
+        sqxtun          v27.8b,    v6.8h
+        st1             {v25.8b},  [x3], x2
+        st1             {v26.8b},  [x3], x2
+        st1             {v27.8b},  [x3], x2
+
+        ret
+endfunc
+
 function ff_vp8_idct_dc_add4y_neon, export=1
         movi            v0.16b,  #0
         mov             x3,  #32