|
@@ -0,0 +1,12454 @@
|
|
|
|
+# HG changeset patch
|
|
|
|
+# User Dan Minor <dminor@mozilla.com>
|
|
|
|
+# Date 1556751985 0
|
|
|
|
+# Node ID 742b7c0a4bdbbe5f4004b038b4b5b4467ef4484b
|
|
|
|
+# Parent f40ae51578ac27c6ea38af1e2818a12ac0b93dbd
|
|
|
|
+Bug 1540760 - Add missing aarch64 files for ffvpx; r=jya
|
|
|
|
+
|
|
|
|
+Differential Revision: https://phabricator.services.mozilla.com/D27789
|
|
|
|
+
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,50 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++
|
|
|
|
++#include "libavcodec/fft.h"
|
|
|
|
++
|
|
|
|
++void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
|
|
|
|
++void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
|
|
|
|
++
|
|
|
|
++void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
|
|
|
++void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
|
|
|
++void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
|
|
|
++
|
|
|
|
++av_cold void ff_fft_init_aarch64(FFTContext *s) /* Point the function pointers in s at the NEON routines declared above, but only when the CPU flags report NEON. */
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags(); /* CPU feature flags as reported by av_get_cpu_flags() */
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) { /* without NEON nothing in s is modified */
|
|
|
|
++ s->fft_permute = ff_fft_permute_neon;
|
|
|
|
++ s->fft_calc = ff_fft_calc_neon;
|
|
|
|
++#if CONFIG_MDCT /* the MDCT entry points are only installed when CONFIG_MDCT is non-zero */
|
|
|
|
++ s->imdct_calc = ff_imdct_calc_neon;
|
|
|
|
++ s->imdct_half = ff_imdct_half_neon;
|
|
|
|
++ s->mdct_calc = ff_mdct_calc_neon;
|
|
|
|
++ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; /* NOTE(review): presumably selects the coefficient ordering the NEON MDCT routines expect - confirm against fft.h */
|
|
|
|
++#endif
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/fft_neon.S b/media/ffvpx/libavcodec/aarch64/fft_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/fft_neon.S
|
|
|
|
+@@ -0,0 +1,442 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON optimised FFT
|
|
|
|
++ *
|
|
|
|
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2009 Naotoshi Nojiri
|
|
|
|
++ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This algorithm (though not any of the implementation details) is
|
|
|
|
++ * based on libdjbfft by D. J. Bernstein.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++#define M_SQRT1_2 0.70710678118654752440
|
|
|
|
++
|
|
|
|
++.macro transpose d0, d1, s0, s1
|
|
|
|
++ trn1 \d0, \s0, \s1 // d0 = even-numbered elements of s0 and s1, interleaved
|
|
|
|
++ trn2 \d1, \s0, \s1 // d1 = odd-numbered elements of s0 and s1, interleaved
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++function fft4_neon
|
|
|
|
++ ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] // load four {re,im} float pairs from x0; the transform is done in place
|
|
|
|
++
|
|
|
|
++ fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
|
|
|
|
++ fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
|
|
|
|
++
|
|
|
|
++ ext v16.8b, v2.8b, v3.8b, #4 // v16 = {i2,r3}
|
|
|
|
++ ext v17.8b, v3.8b, v2.8b, #4 // v17 = {i3,r2}
|
|
|
|
++
|
|
|
|
++ fadd v5.2s, v2.2s, v3.2s // r2+r3,i2+i3 (lane order)
|
|
|
|
++ fsub v7.2s, v16.2s, v17.2s // i2-i3,r3-r2 (lane order)
|
|
|
|
++
|
|
|
|
++ fadd v0.2s, v4.2s, v5.2s // output pair 0 = v4 + v5
|
|
|
|
++ fsub v2.2s, v4.2s, v5.2s // output pair 2 = v4 - v5
|
|
|
|
++ fadd v1.2s, v6.2s, v7.2s // output pair 1 = v6 + v7
|
|
|
|
++ fsub v3.2s, v6.2s, v7.2s // output pair 3 = v6 - v7
|
|
|
|
++
|
|
|
|
++ st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] // write the four results back over the input
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function fft8_neon
|
|
|
|
++ mov x1, x0 // x1 keeps the base pointer; x0 is advanced by the post-indexed load below
|
|
|
|
++ ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 // pairs 0-3
|
|
|
|
++ ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] // pairs 4-7
|
|
|
|
++ ext v22.8b, v2.8b, v3.8b, #4
|
|
|
|
++ ext v23.8b, v3.8b, v2.8b, #4
|
|
|
|
++ fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
|
|
|
|
++ fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
|
|
|
|
++ fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
|
|
|
|
++ fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
|
|
|
|
++ rev64 v27.2s, v28.2s // v27 = {v28.s[1],v28.s[0]}; v28 holds mppm (loaded in ff_fft_calc_neon), so v27 = {+w,-w} with w = M_SQRT1_2
|
|
|
|
++ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
|
|
|
|
++ fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
|
|
|
|
++ fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
|
|
|
|
++ ext v6.8b, v4.8b, v5.8b, #4
|
|
|
|
++ ext v7.8b, v5.8b, v4.8b, #4
|
|
|
|
++ fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
|
|
|
|
++ fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
|
|
|
|
++ fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
|
|
|
|
++ fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
|
|
|
|
++ fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
|
|
|
|
++ fadd v0.2s, v20.2s, v21.2s
|
|
|
|
++ fsub v2.2s, v20.2s, v21.2s
|
|
|
|
++ fadd v1.2s, v22.2s, v23.2s
|
|
|
|
++ rev64 v26.2s, v26.2s // swap lanes: a2i*w,-a2r*w
|
|
|
|
++ rev64 v27.2s, v27.2s // swap lanes: -a3i*w,a3r*w
|
|
|
|
++ fsub v3.2s, v22.2s, v23.2s
|
|
|
|
++ fsub v6.2s, v6.2s, v7.2s
|
|
|
|
++ fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
|
|
|
|
++ fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
|
|
|
|
++ fadd v7.2s, v4.2s, v5.2s
|
|
|
|
++ fsub v18.2s, v2.2s, v6.2s
|
|
|
|
++ ext v26.8b, v24.8b, v25.8b, #4
|
|
|
|
++ ext v27.8b, v25.8b, v24.8b, #4
|
|
|
|
++ fadd v2.2s, v2.2s, v6.2s
|
|
|
|
++ fsub v16.2s, v0.2s, v7.2s
|
|
|
|
++ fadd v5.2s, v25.2s, v24.2s
|
|
|
|
++ fsub v4.2s, v26.2s, v27.2s
|
|
|
|
++ fadd v0.2s, v0.2s, v7.2s
|
|
|
|
++ fsub v17.2s, v1.2s, v5.2s
|
|
|
|
++ fsub v19.2s, v3.2s, v4.2s
|
|
|
|
++ fadd v3.2s, v3.2s, v4.2s
|
|
|
|
++ fadd v1.2s, v1.2s, v5.2s
|
|
|
|
++
|
|
|
|
++ st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] // x0 = base + 32: output pairs 4-7
|
|
|
|
++ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1] // x1 = base: output pairs 0-3
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function fft16_neon
|
|
|
|
++ mov x1, x0 // x1 keeps the base pointer for the stores at the end
|
|
|
|
++ ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 // pairs 0-3
|
|
|
|
++ ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32 // pairs 4-7
|
|
|
|
++ ext v22.8b, v2.8b, v3.8b, #4
|
|
|
|
++ ext v23.8b, v3.8b, v2.8b, #4
|
|
|
|
++ fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
|
|
|
|
++ fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
|
|
|
|
++ fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
|
|
|
|
++ fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
|
|
|
|
++ rev64 v27.2s, v28.2s // v27 = {v28.s[1],v28.s[0]}; v28 holds mppm (loaded in ff_fft_calc_neon), so v27 = {+w,-w} with w = M_SQRT1_2
|
|
|
|
++ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
|
|
|
|
++ fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
|
|
|
|
++ fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
|
|
|
|
++ ext v6.8b, v4.8b, v5.8b, #4
|
|
|
|
++ ext v7.8b, v5.8b, v4.8b, #4
|
|
|
|
++ fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
|
|
|
|
++ fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
|
|
|
|
++ fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
|
|
|
|
++ fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
|
|
|
|
++ fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
|
|
|
|
++ fadd v0.2s, v20.2s, v21.2s
|
|
|
|
++ fsub v2.2s, v20.2s, v21.2s
|
|
|
|
++ fadd v1.2s, v22.2s, v23.2s
|
|
|
|
++ rev64 v26.2s, v26.2s // swap lanes: a2i*w,-a2r*w
|
|
|
|
++ rev64 v27.2s, v27.2s // swap lanes: -a3i*w,a3r*w
|
|
|
|
++ fsub v3.2s, v22.2s, v23.2s
|
|
|
|
++ fsub v6.2s, v6.2s, v7.2s
|
|
|
|
++ fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
|
|
|
|
++ fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
|
|
|
|
++ fadd v7.2s, v4.2s, v5.2s
|
|
|
|
++ fsub v18.2s, v2.2s, v6.2s
|
|
|
|
++ ld1 {v20.4s,v21.4s}, [x0], #32 // pairs 8-11, two pairs per register
|
|
|
|
++ ld1 {v22.4s,v23.4s}, [x0], #32 // pairs 12-15, two pairs per register
|
|
|
|
++ ext v26.8b, v24.8b, v25.8b, #4
|
|
|
|
++ ext v27.8b, v25.8b, v24.8b, #4
|
|
|
|
++ fadd v2.2s, v2.2s, v6.2s
|
|
|
|
++ fsub v16.2s, v0.2s, v7.2s
|
|
|
|
++ fadd v5.2s, v25.2s, v24.2s
|
|
|
|
++ fsub v4.2s, v26.2s, v27.2s
|
|
|
|
++ transpose v24.2d, v25.2d, v20.2d, v22.2d
|
|
|
|
++ transpose v26.2d, v27.2d, v21.2d, v23.2d
|
|
|
|
++ fadd v0.2s, v0.2s, v7.2s
|
|
|
|
++ fsub v17.2s, v1.2s, v5.2s
|
|
|
|
++ fsub v19.2s, v3.2s, v4.2s
|
|
|
|
++ fadd v3.2s, v3.2s, v4.2s
|
|
|
|
++ fadd v1.2s, v1.2s, v5.2s
|
|
|
|
++ ext v20.16b, v21.16b, v21.16b, #4 // rotate v21 by one 32-bit element
|
|
|
|
++ ext v21.16b, v23.16b, v23.16b, #4 // rotate v23 by one 32-bit element
|
|
|
|
++
|
|
|
|
++ zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
|
|
|
|
++ zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
|
|
|
|
++ zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
|
|
|
|
++ zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
|
|
|
|
++
|
|
|
|
++ // 2 x fft4
|
|
|
|
++ transpose v22.2d, v23.2d, v20.2d, v21.2d
|
|
|
|
++
|
|
|
|
++ fadd v4.4s, v24.4s, v25.4s
|
|
|
|
++ fadd v5.4s, v26.4s, v27.4s
|
|
|
|
++ fsub v6.4s, v24.4s, v25.4s
|
|
|
|
++ fsub v7.4s, v22.4s, v23.4s
|
|
|
|
++
|
|
|
|
++ ld1 {v23.4s}, [x14] // first four floats of ff_cos_16; x14 is set up in ff_fft_calc_neon
|
|
|
|
++
|
|
|
|
++ fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
|
|
|
|
++ fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
|
|
|
|
++ fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
|
|
|
|
++ fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
|
|
|
|
++
|
|
|
|
++ //fft_pass_neon_16
|
|
|
|
++ rev64 v7.4s, v25.4s // swap re/im within each pair
|
|
|
|
++ fmul v25.4s, v25.4s, v23.s[1]
|
|
|
|
++ fmul v7.4s, v7.4s, v29.4s // v29 = pmmp sign pattern {+1,-1,-1,+1}, loaded in ff_fft_calc_neon
|
|
|
|
++ fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
|
|
|
|
++
|
|
|
|
++ zip1 v20.4s, v24.4s, v25.4s
|
|
|
|
++ zip2 v21.4s, v24.4s, v25.4s
|
|
|
|
++ fneg v22.4s, v20.4s
|
|
|
|
++ fadd v4.4s, v21.4s, v20.4s
|
|
|
|
++ fsub v6.4s, v20.4s, v21.4s // just the second half
|
|
|
|
++ fadd v5.4s, v21.4s, v22.4s // just the first half
|
|
|
|
++
|
|
|
|
++ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
|
|
|
++ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
|
|
|
++
|
|
|
|
++ fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
|
|
|
|
++ fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
|
|
|
|
++ fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
|
|
|
|
++ fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
|
|
|
|
++
|
|
|
|
++//second half
|
|
|
|
++ rev64 v6.4s, v26.4s
|
|
|
|
++ fmul v26.4s, v26.4s, v23.s[2]
|
|
|
|
++ rev64 v7.4s, v27.4s
|
|
|
|
++ fmul v27.4s, v27.4s, v23.s[3]
|
|
|
|
++ fmul v6.4s, v6.4s, v29.4s
|
|
|
|
++ fmul v7.4s, v7.4s, v29.4s
|
|
|
|
++ fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
|
|
|
|
++ fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
|
|
|
|
++
|
|
|
|
++ zip1 v24.4s, v26.4s, v27.4s
|
|
|
|
++ zip2 v25.4s, v26.4s, v27.4s
|
|
|
|
++ fneg v26.4s, v24.4s
|
|
|
|
++ fadd v4.4s, v25.4s, v24.4s
|
|
|
|
++ fsub v6.4s, v24.4s, v25.4s // just the second half
|
|
|
|
++ fadd v5.4s, v25.4s, v26.4s // just the first half
|
|
|
|
++
|
|
|
|
++ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
|
|
|
++ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
|
|
|
++
|
|
|
|
++ fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
|
|
|
|
++ fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
|
|
|
|
++ fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
|
|
|
|
++ fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}
|
|
|
|
++
|
|
|
|
++ st1 {v16.4s,v17.4s}, [x1], #32 // bytes 0-31 of the buffer
|
|
|
|
++ st1 {v18.4s,v19.4s}, [x1], #32 // bytes 32-63
|
|
|
|
++ st1 {v20.4s,v21.4s}, [x1], #32 // bytes 64-95
|
|
|
|
++ st1 {v22.4s,v23.4s}, [x1], #32 // bytes 96-127
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++const trans4_float, align=4
|
|
|
|
++ .byte 0, 1, 2, 3 // tbl byte indices (loaded into v30): result float 0 = source float 0
|
|
|
|
++ .byte 8, 9, 10, 11 // result float 1 = source float 2
|
|
|
|
++ .byte 4, 5, 6, 7 // result float 2 = source float 1
|
|
|
|
++ .byte 12, 13, 14, 15 // result float 3 = source float 3
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const trans8_float, align=4
|
|
|
|
++ .byte 24, 25, 26, 27 // tbl byte indices (loaded into v31) for a two-register table: result float 0 = float 2 of the second register
|
|
|
|
++ .byte 0, 1, 2, 3 // result float 1 = float 0 of the first register
|
|
|
|
++ .byte 28, 29, 30, 31 // result float 2 = float 3 of the second register
|
|
|
|
++ .byte 4, 5, 6, 7 // result float 3 = float 1 of the first register
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++function fft_pass_neon
|
|
|
|
++ sub x6, x2, #1 // n - 1, loop counter (n arrives in x2; the first iteration is done separately before 1:)
|
|
|
|
++ lsl x5, x2, #3 // 2 * n * sizeof FFTSample
|
|
|
|
++ lsl x1, x2, #4 // 2 * n * sizeof FFTComplex
|
|
|
|
++ add x5, x4, x5 // wim = x4 + 2 * n * sizeof FFTSample (x4 = cosine table set by the def_fft stubs)
|
|
|
|
++ add x3, x1, x2, lsl #5 // x1 + 4 * n * sizeof FFTComplex = 6 * n * sizeof FFTComplex (byte offset of o3)
|
|
|
|
++ add x2, x0, x2, lsl #5 // &z[o2]
|
|
|
|
++ add x3, x0, x3 // &z[o3]
|
|
|
|
++ add x1, x0, x1 // &z[o1]
|
|
|
|
++ ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
|
|
|
|
++ ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
|
|
|
|
++ ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
|
|
|
|
++ trn2 v25.2d, v20.2d, v22.2d
|
|
|
|
++ sub x5, x5, #4 // wim--
|
|
|
|
++ trn1 v24.2d, v20.2d, v22.2d
|
|
|
|
++ ld1 {v5.s}[0], [x5], x7 // v5.s[0] = wim[-1]; x7 = -8 (set in ff_fft_calc_neon) steps wim back two floats
|
|
|
|
++ rev64 v7.4s, v25.4s
|
|
|
|
++ fmul v25.4s, v25.4s, v4.s[1]
|
|
|
|
++ ld1 {v16.4s}, [x0] // {z[0],z[1]}
|
|
|
|
++ fmul v7.4s, v7.4s, v29.4s // v29 = pmmp sign pattern {+1,-1,-1,+1}, loaded in ff_fft_calc_neon
|
|
|
|
++ ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
|
|
|
|
++ prfm pldl1keep, [x2, #16]
|
|
|
|
++ prfm pldl1keep, [x3, #16]
|
|
|
|
++ fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
|
|
|
|
++ prfm pldl1keep, [x0, #16]
|
|
|
|
++ prfm pldl1keep, [x1, #16]
|
|
|
|
++
|
|
|
|
++ zip1 v20.4s, v24.4s, v25.4s
|
|
|
|
++ zip2 v21.4s, v24.4s, v25.4s
|
|
|
|
++ fneg v22.4s, v20.4s
|
|
|
|
++ fadd v4.4s, v21.4s, v20.4s
|
|
|
|
++ fsub v6.4s, v20.4s, v21.4s // just the second half
|
|
|
|
++ fadd v5.4s, v21.4s, v22.4s // just the first half
|
|
|
|
++
|
|
|
|
++ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
|
|
|
++ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
|
|
|
++
|
|
|
|
++ fadd v20.4s, v16.4s, v4.4s
|
|
|
|
++ fsub v22.4s, v16.4s, v4.4s
|
|
|
|
++ fadd v21.4s, v17.4s, v5.4s
|
|
|
|
++ st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
|
|
|
|
++ fsub v23.4s, v17.4s, v5.4s
|
|
|
|
++
|
|
|
|
++ st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
|
|
|
|
++ st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
|
|
|
|
++ st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
|
|
|
|
++1:
|
|
|
|
++ ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
|
|
|
|
++ ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
|
|
|
|
++ ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
|
|
|
|
++ transpose v26.2d, v27.2d, v20.2d, v22.2d
|
|
|
|
++ ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
|
|
|
|
++ rev64 v6.4s, v26.4s
|
|
|
|
++ fmul v26.4s, v26.4s, v4.s[0]
|
|
|
|
++ rev64 v7.4s, v27.4s
|
|
|
|
++ fmul v27.4s, v27.4s, v4.s[1]
|
|
|
|
++ fmul v6.4s, v6.4s, v29.4s
|
|
|
|
++ fmul v7.4s, v7.4s, v29.4s
|
|
|
|
++ ld1 {v16.4s},[x0] // {z[0],z[1]}
|
|
|
|
++ fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
|
|
|
|
++ fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
|
|
|
|
++ ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #1 // n--
|
|
|
|
++
|
|
|
|
++ zip1 v20.4s, v26.4s, v27.4s
|
|
|
|
++ zip2 v21.4s, v26.4s, v27.4s
|
|
|
|
++ fneg v22.4s, v20.4s
|
|
|
|
++ fadd v4.4s, v21.4s, v20.4s
|
|
|
|
++ fsub v6.4s, v20.4s, v21.4s // just the second half
|
|
|
|
++ fadd v5.4s, v21.4s, v22.4s // just the first half
|
|
|
|
++
|
|
|
|
++ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
|
|
|
++ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
|
|
|
++
|
|
|
|
++ fadd v20.4s, v16.4s, v4.4s
|
|
|
|
++ fsub v22.4s, v16.4s, v4.4s
|
|
|
|
++ fadd v21.4s, v17.4s, v5.4s
|
|
|
|
++ st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
|
|
|
|
++ fsub v23.4s, v17.4s, v5.4s
|
|
|
|
++
|
|
|
|
++ st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
|
|
|
|
++ st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
|
|
|
|
++ st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
|
|
|
|
++ b.ne 1b // tests the flags set by the subs on x6 above; nothing in between writes the flags
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro def_fft n, n2, n4
|
|
|
|
++function fft\n\()_neon, align=6
|
|
|
|
++ sub sp, sp, #16 // 16-byte stack slot for x28 and the link register
|
|
|
|
++ stp x28, x30, [sp] // both are overwritten below (x28 as scratch, x30 by bl)
|
|
|
|
++ add x28, x0, #\n4*2*8 // x28 = x0 + 2 * n4 elements of 8 bytes each
|
|
|
|
++ bl fft\n2\()_neon // n2-point transform at the start of the buffer
|
|
|
|
++ mov x0, x28
|
|
|
|
++ bl fft\n4\()_neon // n4-point transform at x28
|
|
|
|
++ add x0, x28, #\n4*1*8
|
|
|
|
++ bl fft\n4\()_neon // n4-point transform n4 elements after x28
|
|
|
|
++ sub x0, x28, #\n4*2*8 // x0 = original buffer pointer again
|
|
|
|
++ ldp x28, x30, [sp], #16 // restore and release the stack slot before the tail call
|
|
|
|
++ movrel x4, X(ff_cos_\n) // cosine table argument for fft_pass_neon
|
|
|
|
++ mov x2, #\n4>>1 // n argument for fft_pass_neon = n4 / 2
|
|
|
|
++ b fft_pass_neon // tail call: fft_pass_neon returns straight to our caller
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++ def_fft 32, 16, 8
|
|
|
|
++ def_fft 64, 32, 16
|
|
|
|
++ def_fft 128, 64, 32
|
|
|
|
++ def_fft 256, 128, 64
|
|
|
|
++ def_fft 512, 256, 128
|
|
|
|
++ def_fft 1024, 512, 256
|
|
|
|
++ def_fft 2048, 1024, 512
|
|
|
|
++ def_fft 4096, 2048, 1024
|
|
|
|
++ def_fft 8192, 4096, 2048
|
|
|
|
++ def_fft 16384, 8192, 4096
|
|
|
|
++ def_fft 32768, 16384, 8192
|
|
|
|
++ def_fft 65536, 32768, 16384
|
|
|
|
++
|
|
|
|
++function ff_fft_calc_neon, export=1
|
|
|
|
++ prfm pldl1keep, [x1] // x1 = z (second argument of the C prototype); start fetching it
|
|
|
|
++ movrel x10, trans4_float
|
|
|
|
++ ldr w2, [x0] // first 32-bit word of the context (labelled nbits in ff_fft_permute_neon)
|
|
|
|
++ movrel x11, trans8_float
|
|
|
|
++ sub w2, w2, #2 // table index = that value - 2 (entry 0 of fft_tab_neon is fft4_neon)
|
|
|
|
++ movrel x3, fft_tab_neon
|
|
|
|
++ ld1 {v30.16b}, [x10] // v30 = trans4_float tbl indices, used by the kernels
|
|
|
|
++ mov x7, #-8 // post-index step used for the wim loads in fft_pass_neon
|
|
|
|
++ movrel x12, pmmp
|
|
|
|
++ ldr x3, [x3, x2, lsl #3] // x3 = selected 8-byte entry of fft_tab_neon
|
|
|
|
++ movrel x13, mppm
|
|
|
|
++ movrel x14, X(ff_cos_16) // x14 is read by fft16_neon
|
|
|
|
++ ld1 {v31.16b}, [x11] // v31 = trans8_float tbl indices
|
|
|
|
++ mov x0, x1 // the kernels take the data pointer in x0
|
|
|
|
++ ld1 {v29.4s}, [x12] // pmmp
|
|
|
|
++ ld1 {v28.4s}, [x13] // mppm
|
|
|
|
++ br x3 // jump (not call) to the kernel, so its ret returns to our caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_fft_permute_neon, export=1
|
|
|
|
++ mov x6, #1
|
|
|
|
++ ldr w2, [x0] // nbits
|
|
|
|
++ ldr x3, [x0, #16] // tmp_buf
|
|
|
|
++ ldr x0, [x0, #8] // revtab
|
|
|
|
++ lsl x6, x6, x2 // x6 = 1 << nbits, the element count
|
|
|
|
++ mov x2, x6 // second copy of the count for the copy-back loop
|
|
|
|
++1:
|
|
|
|
++ ld1 {v0.2s,v1.2s}, [x1], #16 // next two 8-byte elements of z (x1)
|
|
|
|
++ ldr w4, [x0], #4 // two 16-bit revtab entries in one 32-bit load
|
|
|
|
++ uxth w5, w4 // low 16 bits: destination index for v0
|
|
|
|
++ lsr w4, w4, #16 // high 16 bits: destination index for v1
|
|
|
|
++ add x5, x3, x5, lsl #3 // tmp_buf + index * 8
|
|
|
|
++ add x4, x3, x4, lsl #3 // tmp_buf + index * 8
|
|
|
|
++ st1 {v0.2s}, [x5]
|
|
|
|
++ st1 {v1.2s}, [x4]
|
|
|
|
++ subs x6, x6, #2 // two elements handled per iteration
|
|
|
|
++ b.gt 1b
|
|
|
|
++
|
|
|
|
++ sub x1, x1, x2, lsl #3 // rewind x1 by count * 8 bytes to the start of z
|
|
|
|
++1:
|
|
|
|
++ ld1 {v0.4s,v1.4s}, [x3], #32 // copy tmp_buf back over z, 32 bytes (four elements) at a time
|
|
|
|
++ st1 {v0.4s,v1.4s}, [x1], #32
|
|
|
|
++ subs x2, x2, #4
|
|
|
|
++ b.gt 1b
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++const fft_tab_neon, relocate=1
|
|
|
|
++ .quad fft4_neon // entry 0; ff_fft_calc_neon indexes this table with (first context word - 2)
|
|
|
|
++ .quad fft8_neon
|
|
|
|
++ .quad fft16_neon
|
|
|
|
++ .quad fft32_neon // this and all later entries are generated by the def_fft macro
|
|
|
|
++ .quad fft64_neon
|
|
|
|
++ .quad fft128_neon
|
|
|
|
++ .quad fft256_neon
|
|
|
|
++ .quad fft512_neon
|
|
|
|
++ .quad fft1024_neon
|
|
|
|
++ .quad fft2048_neon
|
|
|
|
++ .quad fft4096_neon
|
|
|
|
++ .quad fft8192_neon
|
|
|
|
++ .quad fft16384_neon
|
|
|
|
++ .quad fft32768_neon
|
|
|
|
++ .quad fft65536_neon // entry 14, the last one
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const pmmp, align=4
|
|
|
|
++ .float +1.0, -1.0, -1.0, +1.0 // plus-minus-minus-plus sign pattern; ff_fft_calc_neon loads it into v29
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const mppm, align=4
|
|
|
|
++ .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 // minus-plus-plus-minus pattern scaled by M_SQRT1_2; ff_fft_calc_neon loads it into v28
|
|
|
|
++endconst
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,59 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON optimised H.264 chroma functions
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/h264chroma.h"
|
|
|
|
++
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++
|
|
|
|
++void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++
|
|
|
|
++av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth) /* Install the NEON put/avg chroma MC routines into c when NEON is available and bit_depth is at most 8. */
|
|
|
|
++{
|
|
|
|
++ const int high_bit_depth = bit_depth > 8; /* the NEON routines declared above take uint8_t pixels */
|
|
|
|
++ int cpu_flags = av_get_cpu_flags(); /* CPU feature flags as reported by av_get_cpu_flags() */
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags) && !high_bit_depth) { /* otherwise c is left exactly as the caller set it up */
|
|
|
|
++ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; /* slot 0: mc8 */
|
|
|
|
++ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; /* slot 1: mc4 */
|
|
|
|
++ c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; /* slot 2: mc2 */
|
|
|
|
++
|
|
|
|
++ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; /* same slot layout for the avg variants */
|
|
|
|
++ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
|
|
|
|
++ c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
|
|
|
|
+@@ -0,0 +1,450 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
|
|
|
|
++.macro h264_chroma_mc8 type, codec=h264
|
|
|
|
++function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ mov x8, x0 // avg only: x8 is a second dst pointer used to read the existing pixels
|
|
|
|
++ .endif
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \codec,rv40
|
|
|
|
++ movrel x6, rv40bias
|
|
|
|
++ lsr w9, w5, #1
|
|
|
|
++ lsr w10, w4, #1
|
|
|
|
++ lsl w9, w9, #3
|
|
|
|
++ lsl w10, w10, #1
|
|
|
|
++ add w9, w9, w10 // byte offset into rv40bias = (y >> 1) * 8 + (x >> 1) * 2
|
|
|
|
++ add x6, x6, w9, UXTW
|
|
|
|
++ ld1r {v22.8H}, [x6] // v22 = that bias replicated to all eight halfwords
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \codec,vc1
|
|
|
|
++ movi v22.8H, #28 // vc1 uses a fixed bias of 28 in place of rounding
|
|
|
|
++ .endif
|
|
|
|
++ mul w7, w4, w5 // w7 = x*y (x in w4, y in w5)
|
|
|
|
++ lsl w14, w5, #3 // w14 = 8*y
|
|
|
|
++ lsl w13, w4, #3 // w13 = 8*x
|
|
|
|
++ cmp w7, #0 // flags are consumed by the b.eq 2f below; the sub/add in between do not set flags
|
|
|
|
++ sub w6, w14, w7 // w6 = 8*y - x*y = (8-x)*y
|
|
|
|
++ sub w12, w13, w7 // w12 = 8*x - x*y = x*(8-y)
|
|
|
|
++ sub w4, w7, w13
|
|
|
|
++ sub w4, w4, w14
|
|
|
|
++ add w4, w4, #64 // w4 = x*y - 8*x - 8*y + 64 = (8-x)*(8-y)
|
|
|
|
++ b.eq 2f // x*y == 0: take the reduced filters starting at 2:
|
|
|
|
++
|
|
|
|
++ dup v0.8B, w4 // v0..v3 = the four weights, one per 2x2 neighbour
|
|
|
|
++ dup v1.8B, w12
|
|
|
|
++ ld1 {v4.8B, v5.8B}, [x1], x2
|
|
|
|
++ dup v2.8B, w6
|
|
|
|
++ dup v3.8B, w7
|
|
|
|
++ ext v5.8B, v4.8B, v5.8B, #1 // v5 = the same row shifted by one pixel
|
|
|
|
++1: ld1 {v6.8B, v7.8B}, [x1], x2
|
|
|
|
++ umull v16.8H, v4.8B, v0.8B
|
|
|
|
++ umlal v16.8H, v5.8B, v1.8B
|
|
|
|
++ ext v7.8B, v6.8B, v7.8B, #1
|
|
|
|
++ ld1 {v4.8B, v5.8B}, [x1], x2
|
|
|
|
++ umlal v16.8H, v6.8B, v2.8B
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ ext v5.8B, v4.8B, v5.8B, #1
|
|
|
|
++ umlal v16.8H, v7.8B, v3.8B
|
|
|
|
++ umull v17.8H, v6.8B, v0.8B
|
|
|
|
++ subs w3, w3, #2 // two output rows per iteration; flags are used by the b.gt at the end of the loop
|
|
|
|
++ umlal v17.8H, v7.8B, v1.8B
|
|
|
|
++ umlal v17.8H, v4.8B, v2.8B
|
|
|
|
++ umlal v17.8H, v5.8B, v3.8B
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v16.8H, #6 // h264: rounding shift right by 6
|
|
|
|
++ rshrn v17.8B, v17.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v16.8H, v16.8H, v22.8H // other codecs: add the bias in v22, then a truncating shift by 6
|
|
|
|
++ add v17.8H, v17.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v16.8H, #6
|
|
|
|
++ shrn v17.8B, v17.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.8B}, [x8], x2
|
|
|
|
++ ld1 {v21.8B}, [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B // avg: rounding average with the pixels already in dst
|
|
|
|
++ urhadd v17.8B, v17.8B, v21.8B
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v16.8B}, [x0], x2
|
|
|
|
++ st1 {v17.8B}, [x0], x2
|
|
|
|
++ b.gt 1b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++2: adds w12, w12, w6 // here x*y == 0, so w12 = 8*x + 8*y
|
|
|
|
++ dup v0.8B, w4
|
|
|
|
++ b.eq 5f // sum is zero: no interpolation needed, go to 5:
|
|
|
|
++ tst w6, w6 // w6 = 8*y here
|
|
|
|
++ dup v1.8B, w12
|
|
|
|
++ b.eq 4f // y == 0: horizontal-only filter at 4:
|
|
|
|
++
|
|
|
|
++ ld1 {v4.8B}, [x1], x2 // fall through: vertical-only filter, two taps per output row
|
|
|
|
++3: ld1 {v6.8B}, [x1], x2
|
|
|
|
++ umull v16.8H, v4.8B, v0.8B
|
|
|
|
++ umlal v16.8H, v6.8B, v1.8B
|
|
|
|
++ ld1 {v4.8B}, [x1], x2
|
|
|
|
++ umull v17.8H, v6.8B, v0.8B
|
|
|
|
++ umlal v17.8H, v4.8B, v1.8B
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v16.8H, #6
|
|
|
|
++ rshrn v17.8B, v17.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v16.8H, v16.8H, v22.8H
|
|
|
|
++ add v17.8H, v17.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v16.8H, #6
|
|
|
|
++ shrn v17.8B, v17.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.8B}, [x8], x2
|
|
|
|
++ ld1 {v21.8B}, [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ urhadd v17.8B, v17.8B, v21.8B
|
|
|
|
++ .endif
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ st1 {v16.8B}, [x0], x2
|
|
|
|
++ st1 {v17.8B}, [x0], x2
|
|
|
|
++ b.gt 3b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++4: ld1 {v4.8B, v5.8B}, [x1], x2 // horizontal-only filter: each row is combined with itself shifted by one pixel
|
|
|
|
++ ld1 {v6.8B, v7.8B}, [x1], x2
|
|
|
|
++ ext v5.8B, v4.8B, v5.8B, #1
|
|
|
|
++ ext v7.8B, v6.8B, v7.8B, #1
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ umull v16.8H, v4.8B, v0.8B
|
|
|
|
++ umlal v16.8H, v5.8B, v1.8B
|
|
|
|
++ umull v17.8H, v6.8B, v0.8B
|
|
|
|
++ umlal v17.8H, v7.8B, v1.8B
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v16.8H, #6
|
|
|
|
++ rshrn v17.8B, v17.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v16.8H, v16.8H, v22.8H
|
|
|
|
++ add v17.8H, v17.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v16.8H, #6
|
|
|
|
++ shrn v17.8B, v17.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.8B}, [x8], x2
|
|
|
|
++ ld1 {v21.8B}, [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ urhadd v17.8B, v17.8B, v21.8B
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v16.8B}, [x0], x2
|
|
|
|
++ st1 {v17.8B}, [x0], x2
|
|
|
|
++ b.gt 4b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++5: ld1 {v4.8B}, [x1], x2 // no interpolation: every pixel is scaled by the single weight in v0
|
|
|
|
++ ld1 {v5.8B}, [x1], x2
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ umull v16.8H, v4.8B, v0.8B // for x == y == 0 the weight is 64, so for h264 the rounding shift by 6 below returns the source pixel
|
|
|
|
++ umull v17.8H, v5.8B, v0.8B
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v16.8H, #6
|
|
|
|
++ rshrn v17.8B, v17.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v16.8H, v16.8H, v22.8H
|
|
|
|
++ add v17.8H, v17.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v16.8H, #6
|
|
|
|
++ shrn v17.8B, v17.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.8B}, [x8], x2
|
|
|
|
++ ld1 {v21.8B}, [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ urhadd v17.8B, v17.8B, v21.8B
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v16.8B}, [x0], x2
|
|
|
|
++ st1 {v17.8B}, [x0], x2
|
|
|
|
++ b.gt 5b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
|
|
|
|
++.macro h264_chroma_mc4 type, codec=h264
|
|
|
|
++function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ mov x8, x0
|
|
|
|
++ .endif
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \codec,rv40
|
|
|
|
++ movrel x6, rv40bias
|
|
|
|
++ lsr w9, w5, #1
|
|
|
|
++ lsr w10, w4, #1
|
|
|
|
++ lsl w9, w9, #3
|
|
|
|
++ lsl w10, w10, #1
|
|
|
|
++ add w9, w9, w10
|
|
|
|
++ add x6, x6, w9, UXTW
|
|
|
|
++ ld1r {v22.8H}, [x6]
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \codec,vc1
|
|
|
|
++ movi v22.8H, #28
|
|
|
|
++ .endif
|
|
|
|
++ mul w7, w4, w5
|
|
|
|
++ lsl w14, w5, #3
|
|
|
|
++ lsl w13, w4, #3
|
|
|
|
++ cmp w7, #0
|
|
|
|
++ sub w6, w14, w7
|
|
|
|
++ sub w12, w13, w7
|
|
|
|
++ sub w4, w7, w13
|
|
|
|
++ sub w4, w4, w14
|
|
|
|
++ add w4, w4, #64
|
|
|
|
++ b.eq 2f
|
|
|
|
++
|
|
|
|
++ dup v24.8B, w4
|
|
|
|
++ dup v25.8B, w12
|
|
|
|
++ ld1 {v4.8B}, [x1], x2
|
|
|
|
++ dup v26.8B, w6
|
|
|
|
++ dup v27.8B, w7
|
|
|
|
++ ext v5.8B, v4.8B, v5.8B, #1
|
|
|
|
++ trn1 v0.2S, v24.2S, v25.2S
|
|
|
|
++ trn1 v2.2S, v26.2S, v27.2S
|
|
|
|
++ trn1 v4.2S, v4.2S, v5.2S
|
|
|
|
++1: ld1 {v6.8B}, [x1], x2
|
|
|
|
++ ext v7.8B, v6.8B, v7.8B, #1
|
|
|
|
++ trn1 v6.2S, v6.2S, v7.2S
|
|
|
|
++ umull v18.8H, v4.8B, v0.8B
|
|
|
|
++ umlal v18.8H, v6.8B, v2.8B
|
|
|
|
++ ld1 {v4.8B}, [x1], x2
|
|
|
|
++ ext v5.8B, v4.8B, v5.8B, #1
|
|
|
|
++ trn1 v4.2S, v4.2S, v5.2S
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ umull v19.8H, v6.8B, v0.8B
|
|
|
|
++ umlal v19.8H, v4.8B, v2.8B
|
|
|
|
++ trn1 v30.2D, v18.2D, v19.2D
|
|
|
|
++ trn2 v31.2D, v18.2D, v19.2D
|
|
|
|
++ add v18.8H, v30.8H, v31.8H
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v18.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v18.8H, v18.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v18.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.S}[0], [x8], x2
|
|
|
|
++ ld1 {v20.S}[1], [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v16.S}[0], [x0], x2
|
|
|
|
++ st1 {v16.S}[1], [x0], x2
|
|
|
|
++ b.gt 1b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++2: adds w12, w12, w6
|
|
|
|
++ dup v30.8B, w4
|
|
|
|
++ b.eq 5f
|
|
|
|
++ tst w6, w6
|
|
|
|
++ dup v31.8B, w12
|
|
|
|
++ trn1 v0.2S, v30.2S, v31.2S
|
|
|
|
++ trn2 v1.2S, v30.2S, v31.2S
|
|
|
|
++ b.eq 4f
|
|
|
|
++
|
|
|
|
++ ext v1.8B, v0.8B, v1.8B, #4
|
|
|
|
++ ld1 {v4.S}[0], [x1], x2
|
|
|
|
++3: ld1 {v4.S}[1], [x1], x2
|
|
|
|
++ umull v18.8H, v4.8B, v0.8B
|
|
|
|
++ ld1 {v4.S}[0], [x1], x2
|
|
|
|
++ umull v19.8H, v4.8B, v1.8B
|
|
|
|
++ trn1 v30.2D, v18.2D, v19.2D
|
|
|
|
++ trn2 v31.2D, v18.2D, v19.2D
|
|
|
|
++ add v18.8H, v30.8H, v31.8H
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v18.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v18.8H, v18.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v18.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.S}[0], [x8], x2
|
|
|
|
++ ld1 {v20.S}[1], [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ .endif
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ prfm pldl1strm, [x1, x2]
|
|
|
|
++ st1 {v16.S}[0], [x0], x2
|
|
|
|
++ st1 {v16.S}[1], [x0], x2
|
|
|
|
++ b.gt 3b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++4: ld1 {v4.8B}, [x1], x2
|
|
|
|
++ ld1 {v6.8B}, [x1], x2
|
|
|
|
++ ext v5.8B, v4.8B, v5.8B, #1
|
|
|
|
++ ext v7.8B, v6.8B, v7.8B, #1
|
|
|
|
++ trn1 v4.2S, v4.2S, v5.2S
|
|
|
|
++ trn1 v6.2S, v6.2S, v7.2S
|
|
|
|
++ umull v18.8H, v4.8B, v0.8B
|
|
|
|
++ umull v19.8H, v6.8B, v0.8B
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ trn1 v30.2D, v18.2D, v19.2D
|
|
|
|
++ trn2 v31.2D, v18.2D, v19.2D
|
|
|
|
++ add v18.8H, v30.8H, v31.8H
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v18.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v18.8H, v18.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v18.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.S}[0], [x8], x2
|
|
|
|
++ ld1 {v20.S}[1], [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ .endif
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ st1 {v16.S}[0], [x0], x2
|
|
|
|
++ st1 {v16.S}[1], [x0], x2
|
|
|
|
++ b.gt 4b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++5: ld1 {v4.S}[0], [x1], x2
|
|
|
|
++ ld1 {v4.S}[1], [x1], x2
|
|
|
|
++ umull v18.8H, v4.8B, v30.8B
|
|
|
|
++ subs w3, w3, #2
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ .ifc \codec,h264
|
|
|
|
++ rshrn v16.8B, v18.8H, #6
|
|
|
|
++ .else
|
|
|
|
++ add v18.8H, v18.8H, v22.8H
|
|
|
|
++ shrn v16.8B, v18.8H, #6
|
|
|
|
++ .endif
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v20.S}[0], [x8], x2
|
|
|
|
++ ld1 {v20.S}[1], [x8], x2
|
|
|
|
++ urhadd v16.8B, v16.8B, v20.8B
|
|
|
|
++ .endif
|
|
|
|
++ prfm pldl1strm, [x1]
|
|
|
|
++ st1 {v16.S}[0], [x0], x2
|
|
|
|
++ st1 {v16.S}[1], [x0], x2
|
|
|
|
++ b.gt 5b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro h264_chroma_mc2 type // emits ff_<type>_h264_chroma_mc2_neon; type "avg" additionally averages the result with the destination
|
|
|
|
++function ff_\type\()_h264_chroma_mc2_neon, export=1
|
|
|
|
++ prfm pldl1strm, [x1] // prefetch the first source row
|
|
|
|
++ prfm pldl1strm, [x1, x2] // ... and the row one stride (x2) further
|
|
|
|
++ orr w7, w4, w5 // w4/w5: presumably the sub-pel x/y offsets - confirm against the C prototype
|
|
|
|
++ cbz w7, 2f // both zero: no interpolation needed, use the copy/average loop at 2:
|
|
|
|
++
|
|
|
|
++ mul w7, w4, w5 // w7 = w4*w5
|
|
|
|
++ lsl w14, w5, #3 // w14 = 8*w5
|
|
|
|
++ lsl w13, w4, #3 // w13 = 8*w4
|
|
|
|
++ sub w6, w14, w7 // w6 = 8*w5 - w4*w5
|
|
|
|
++ sub w12, w13, w7 // w12 = 8*w4 - w4*w5
|
|
|
|
++ sub w4, w7, w13
|
|
|
|
++ sub w4, w4, w14
|
|
|
|
++ add w4, w4, #64 // w4 = w4*w5 - 8*w4 - 8*w5 + 64; the four weights sum to 64
|
|
|
|
++ dup v0.8B, w4
|
|
|
|
++ dup v2.8B, w12
|
|
|
|
++ dup v1.8B, w6
|
|
|
|
++ dup v3.8B, w7
|
|
|
|
++ trn1 v0.4H, v0.4H, v2.4H // v0 = 16-bit pairs alternating the w4 and w12 weights
|
|
|
|
++ trn1 v1.4H, v1.4H, v3.4H // v1 = 16-bit pairs alternating the w6 and w7 weights
|
|
|
|
++1:
|
|
|
|
++ ld1 {v4.S}[0], [x1], x2 // 4 source bytes of row n
|
|
|
|
++ ld1 {v4.S}[1], [x1], x2 // 4 source bytes of row n+1
|
|
|
|
++ rev64 v5.2S, v4.2S // v5 = {row n+1, row n}
|
|
|
|
++ ld1 {v5.S}[1], [x1] // v5 = {row n+1, row n+2}; x1 is not advanced here
|
|
|
|
++ ext v6.8B, v4.8B, v5.8B, #1 // v4 shifted by one byte (right-hand neighbours)
|
|
|
|
++ ext v7.8B, v5.8B, v4.8B, #1 // v5 shifted by one byte
|
|
|
|
++ trn1 v4.4H, v4.4H, v6.4H
|
|
|
|
++ trn1 v5.4H, v5.4H, v7.4H
|
|
|
|
++ umull v16.8H, v4.8B, v0.8B // upper-row pixels times the v0 weights
|
|
|
|
++ umlal v16.8H, v5.8B, v1.8B // plus lower-row pixels times the v1 weights
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v18.H}[0], [x0], x2 // existing destination pixels, row n
|
|
|
|
++ ld1 {v18.H}[2], [x0] // existing destination pixels, row n+1
|
|
|
|
++ sub x0, x0, x2 // rewind x0 to row n for the stores below
|
|
|
|
++ .endif
|
|
|
|
++ rev64 v17.4S, v16.4S
|
|
|
|
++ add v16.8H, v16.8H, v17.8H // add the two 32-bit halves of every 64-bit lane
|
|
|
|
++ rshrn v16.8B, v16.8H, #6 // rounding narrow: (sum + 32) >> 6
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ urhadd v16.8B, v16.8B, v18.8B // rounding average with the destination
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v16.H}[0], [x0], x2 // store 2 pixels, row n
|
|
|
|
++ st1 {v16.H}[2], [x0], x2 // store 2 pixels, row n+1
|
|
|
|
++ subs w3, w3, #2 // two rows done per iteration; w3 is the remaining row count
|
|
|
|
++ b.gt 1b
|
|
|
|
++ ret
|
|
|
|
++
|
|
|
|
++2:
|
|
|
|
++ ld1 {v16.H}[0], [x1], x2 // 2 source pixels, row n
|
|
|
|
++ ld1 {v16.H}[1], [x1], x2 // 2 source pixels, row n+1
|
|
|
|
++ .ifc \type,avg
|
|
|
|
++ ld1 {v18.H}[0], [x0], x2
|
|
|
|
++ ld1 {v18.H}[1], [x0]
|
|
|
|
++ sub x0, x0, x2 // rewind x0 to row n
|
|
|
|
++ urhadd v16.8B, v16.8B, v18.8B // rounding average with the destination
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v16.H}[0], [x0], x2
|
|
|
|
++ st1 {v16.H}[1], [x0], x2
|
|
|
|
++ subs w3, w3, #2 // two rows per iteration
|
|
|
|
++ b.gt 2b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++ h264_chroma_mc8 put // H.264 variants: only the type argument is given (mc8/mc4 macro headers lie outside this excerpt)
|
|
|
|
++ h264_chroma_mc8 avg
|
|
|
|
++ h264_chroma_mc4 put
|
|
|
|
++ h264_chroma_mc4 avg
|
|
|
|
++ h264_chroma_mc2 put // defines ff_put_h264_chroma_mc2_neon
|
|
|
|
++ h264_chroma_mc2 avg // defines ff_avg_h264_chroma_mc2_neon
|
|
|
|
++
|
|
|
|
++#if CONFIG_RV40_DECODER
|
|
|
|
++const rv40bias // 4x4 table of 16-bit bias values; presumably selected by sub-pel position inside the mc8/mc4 macros - TODO confirm
|
|
|
|
++ .short 0, 16, 32, 16
|
|
|
|
++ .short 32, 28, 32, 28
|
|
|
|
++ .short 0, 32, 16, 32
|
|
|
|
++ .short 32, 28, 32, 28
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++ h264_chroma_mc8 put, rv40 // codec argument "rv40" selects the non-h264 .ifc branches (explicit bias add + truncating shrn)
|
|
|
|
++ h264_chroma_mc8 avg, rv40
|
|
|
|
++ h264_chroma_mc4 put, rv40
|
|
|
|
++ h264_chroma_mc4 avg, rv40
|
|
|
|
++#endif
|
|
|
|
++
|
|
|
|
++#if CONFIG_VC1DSP
|
|
|
|
++ h264_chroma_mc8 put, vc1 // codec argument "vc1" likewise takes the non-h264 .ifc branches
|
|
|
|
++ h264_chroma_mc8 avg, vc1
|
|
|
|
++ h264_chroma_mc4 put, vc1
|
|
|
|
++ h264_chroma_mc4 avg, vc1
|
|
|
|
++#endif
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,102 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/h264dsp.h"
|
|
|
|
++
|
|
|
|
++void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
|
|
|
|
++ int beta, int8_t *tc0);
|
|
|
|
++void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
|
|
|
|
++ int beta, int8_t *tc0);
|
|
|
|
++void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
|
|
|
++ int beta, int8_t *tc0);
|
|
|
|
++void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
|
|
|
++ int beta, int8_t *tc0);
|
|
|
|
++
|
|
|
|
++void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
|
|
|
|
++ int log2_den, int weight, int offset);
|
|
|
|
++void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
|
|
|
|
++ int log2_den, int weight, int offset);
|
|
|
|
++void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
|
|
|
|
++ int log2_den, int weight, int offset);
|
|
|
|
++
|
|
|
|
++void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
|
|
|
|
++ int height, int log2_den, int weightd,
|
|
|
|
++ int weights, int offset);
|
|
|
|
++void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
|
|
|
|
++ int height, int log2_den, int weightd,
|
|
|
|
++ int weights, int offset);
|
|
|
|
++void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
|
|
|
|
++ int height, int log2_den, int weightd,
|
|
|
|
++ int weights, int offset);
|
|
|
|
++
|
|
|
|
++void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
|
|
|
|
++void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
|
|
|
++void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
|
|
|
++ int16_t *block, int stride,
|
|
|
|
++ const uint8_t nnzc[6*8]);
|
|
|
|
++void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
|
|
|
|
++ int16_t *block, int stride,
|
|
|
|
++ const uint8_t nnzc[6*8]);
|
|
|
|
++void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
|
|
|
|
++ int16_t *block, int stride,
|
|
|
|
++ const uint8_t nnzc[6*8]);
|
|
|
|
++
|
|
|
|
++void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
|
|
|
|
++void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
|
|
|
++void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
|
|
|
|
++ int16_t *block, int stride,
|
|
|
|
++ const uint8_t nnzc[6*8]);
|
|
|
|
++
|
|
|
|
++av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, /* Installs the NEON H.264 DSP routines declared above into *c. */
|
|
|
|
++ const int chroma_format_idc) /* gates the two chroma-specific entries below */
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags) && bit_depth == 8) { /* nothing is installed unless NEON is available and samples are 8-bit */
|
|
|
|
++ c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
|
|
|
|
++ c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
|
|
|
|
++ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
|
|
|
|
++ if (chroma_format_idc <= 1) /* horizontal chroma filter only replaced for chroma_format_idc 0 or 1 */
|
|
|
|
++ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
|
|
|
|
++
|
|
|
|
++ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; /* table index 0/1/2 <-> 16/8/4 pixel variants */
|
|
|
|
++ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
|
|
|
|
++ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
|
|
|
|
++
|
|
|
|
++ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; /* same 16/8/4 ordering as above */
|
|
|
|
++ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
|
|
|
|
++ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
|
|
|
|
++
|
|
|
|
++ c->h264_idct_add = ff_h264_idct_add_neon;
|
|
|
|
++ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
|
|
|
++ c->h264_idct_add16 = ff_h264_idct_add16_neon;
|
|
|
|
++ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
|
|
|
|
++ if (chroma_format_idc <= 1) /* idct_add8 only replaced for chroma_format_idc 0 or 1 */
|
|
|
|
++ c->h264_idct_add8 = ff_h264_idct_add8_neon;
|
|
|
|
++ c->h264_idct8_add = ff_h264_idct8_add_neon;
|
|
|
|
++ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
|
|
|
|
++ c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
|
|
|
|
+@@ -0,0 +1,498 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++#include "neon.S"
|
|
|
|
++
|
|
|
|
++.macro h264_loop_filter_start // common prologue: early-out checks; leaves the 4 tc0 bytes in v24.S[0]; clobbers w6 and the flags
|
|
|
|
++ cmp w2, #0 // w2 = alpha
|
|
|
|
++ ldr w6, [x4] // load 4 bytes from x4 (the tc0 argument per the C prototypes)
|
|
|
|
++ ccmp w3, #0, #0, ne // if alpha != 0: flags = cmp(beta, 0); otherwise NZCV is forced to 0000
|
|
|
|
++ mov v24.S[0], w6
|
|
|
|
++ and w6, w6, w6, lsl #16 // does not touch the flags
|
|
|
|
++ b.eq 1f // taken when alpha != 0 and beta == 0
|
|
|
|
++ ands w6, w6, w6, lsl #8 // top byte = AND of all four tc0 bytes; N = its sign bit
|
|
|
|
++ b.ge 2f // continue unless every tc0 byte has its sign bit set
|
|
|
|
++1:
|
|
|
|
++ ret // early return from the function that expanded this macro
|
|
|
|
++2:
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro h264_loop_filter_luma // in: v20/v18/v16 = p2/p1/p0, v0/v2/v4 = q0/q1/q2, v24.S[0] = tc0, w2/w3 = alpha/beta; out: v17/v16/v0/v19 = p1/p0/q0/q1
|
|
|
|
++ dup v22.16B, w2 // alpha
|
|
|
|
++ uxtl v24.8H, v24.8B
|
|
|
|
++ uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
|
|
|
|
++ uxtl v24.4S, v24.4H
|
|
|
|
++ uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
|
|
|
|
++ sli v24.8H, v24.8H, #8
|
|
|
|
++ uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
|
|
|
|
++ sli v24.4S, v24.4S, #16 // each of the 4 tc0 bytes is now replicated over 4 consecutive byte lanes
|
|
|
|
++ cmhi v21.16B, v22.16B, v21.16B // < alpha
|
|
|
|
++ dup v22.16B, w3 // beta
|
|
|
|
++ cmlt v23.16B, v24.16B, #0 // lanes whose tc0 is negative
|
|
|
|
++ cmhi v28.16B, v22.16B, v28.16B // < beta
|
|
|
|
++ cmhi v30.16B, v22.16B, v30.16B // < beta
|
|
|
|
++ bic v21.16B, v21.16B, v23.16B // exclude negative-tc0 lanes from the filter mask
|
|
|
|
++ uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
|
|
|
|
++ and v21.16B, v21.16B, v28.16B
|
|
|
|
++ uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
|
|
|
|
++ cmhi v17.16B, v22.16B, v17.16B // < beta
|
|
|
|
++ and v21.16B, v21.16B, v30.16B // v21 = final per-lane filter mask
|
|
|
|
++ cmhi v19.16B, v22.16B, v19.16B // < beta
|
|
|
|
++ and v17.16B, v17.16B, v21.16B // v17 = mask for updating p1
|
|
|
|
++ and v19.16B, v19.16B, v21.16B // v19 = mask for updating q1
|
|
|
|
++ and v24.16B, v24.16B, v21.16B // tc0 zeroed in unfiltered lanes
|
|
|
|
++ urhadd v28.16B, v16.16B, v0.16B // (p0 + q0 + 1) >> 1
|
|
|
|
++ sub v21.16B, v24.16B, v17.16B // masks are 0 or -1, so subtracting one adds 1: tc = tc0 + (p1 mask set)
|
|
|
|
++ uqadd v23.16B, v18.16B, v24.16B // p1 + tc0 (saturating)
|
|
|
|
++ uhadd v20.16B, v20.16B, v28.16B // (p2 + avg(p0, q0)) >> 1
|
|
|
|
++ sub v21.16B, v21.16B, v19.16B // tc += (q1 mask set)
|
|
|
|
++ uhadd v28.16B, v4.16B, v28.16B // (q2 + avg(p0, q0)) >> 1
|
|
|
|
++ umin v23.16B, v23.16B, v20.16B
|
|
|
|
++ uqsub v22.16B, v18.16B, v24.16B // p1 - tc0 (saturating)
|
|
|
|
++ uqadd v4.16B, v2.16B, v24.16B // q1 + tc0 (saturating)
|
|
|
|
++ umax v23.16B, v23.16B, v22.16B // v23 = candidate p1 clamped to [p1 - tc0, p1 + tc0]
|
|
|
|
++ uqsub v22.16B, v2.16B, v24.16B // q1 - tc0 (saturating)
|
|
|
|
++ umin v28.16B, v4.16B, v28.16B
|
|
|
|
++ uxtl v4.8H, v0.8B
|
|
|
|
++ umax v28.16B, v28.16B, v22.16B // v28 = candidate q1 clamped to [q1 - tc0, q1 + tc0]
|
|
|
|
++ uxtl2 v20.8H, v0.16B
|
|
|
|
++ usubw v4.8H, v4.8H, v16.8B // q0 - p0 (low 8 lanes)
|
|
|
|
++ usubw2 v20.8H, v20.8H, v16.16B // q0 - p0 (high 8 lanes)
|
|
|
|
++ shl v4.8H, v4.8H, #2 // 4 * (q0 - p0)
|
|
|
|
++ shl v20.8H, v20.8H, #2
|
|
|
|
++ uaddw v4.8H, v4.8H, v18.8B // + p1
|
|
|
|
++ uaddw2 v20.8H, v20.8H, v18.16B
|
|
|
|
++ usubw v4.8H, v4.8H, v2.8B // - q1
|
|
|
|
++ usubw2 v20.8H, v20.8H, v2.16B
|
|
|
|
++ rshrn v4.8B, v4.8H, #3 // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
|
|
|
|
++ rshrn2 v4.16B, v20.8H, #3
|
|
|
|
++ bsl v17.16B, v23.16B, v18.16B // v17 = mask ? new p1 : original p1
|
|
|
|
++ bsl v19.16B, v28.16B, v2.16B // v19 = mask ? new q1 : original q1
|
|
|
|
++ neg v23.16B, v21.16B // -tc
|
|
|
|
++ uxtl v28.8H, v16.8B
|
|
|
|
++ smin v4.16B, v4.16B, v21.16B // clamp delta to <= tc
|
|
|
|
++ uxtl2 v21.8H, v16.16B
|
|
|
|
++ smax v4.16B, v4.16B, v23.16B // clamp delta to >= -tc
|
|
|
|
++ uxtl v22.8H, v0.8B
|
|
|
|
++ uxtl2 v24.8H, v0.16B
|
|
|
|
++ saddw v28.8H, v28.8H, v4.8B // p0 + delta
|
|
|
|
++ saddw2 v21.8H, v21.8H, v4.16B
|
|
|
|
++ ssubw v22.8H, v22.8H, v4.8B // q0 - delta
|
|
|
|
++ ssubw2 v24.8H, v24.8H, v4.16B
|
|
|
|
++ sqxtun v16.8B, v28.8H // saturate to unsigned 8 bit: new p0
|
|
|
|
++ sqxtun2 v16.16B, v21.8H
|
|
|
|
++ sqxtun v0.8B, v22.8H // saturate to unsigned 8 bit: new q0
|
|
|
|
++ sqxtun2 v0.16B, v24.8H
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function ff_h264_v_loop_filter_luma_neon, export=1 // x0 = pix, w1 = stride, w2 = alpha, w3 = beta, x4 = tc0 (per the C prototype); filters across a horizontal edge, 16 pixels wide
|
|
|
|
++ h264_loop_filter_start
|
|
|
|
++ sxtw x1, w1 // stride is an int: sign-extend before using x1 in address arithmetic
|
|
|
|
++
|
|
|
|
++ ld1 {v0.16B}, [x0], x1 // q0
|
|
|
|
++ ld1 {v2.16B}, [x0], x1 // q1
|
|
|
|
++ ld1 {v4.16B}, [x0], x1 // q2
|
|
|
|
++ sub x0, x0, x1, lsl #2
|
|
|
|
++ sub x0, x0, x1, lsl #1 // x0 = pix - 3*stride (advanced 3 rows, rewound 6)
|
|
|
|
++ ld1 {v20.16B}, [x0], x1 // p2
|
|
|
|
++ ld1 {v18.16B}, [x0], x1 // p1
|
|
|
|
++ ld1 {v16.16B}, [x0], x1 // p0
|
|
|
|
++
|
|
|
|
++ h264_loop_filter_luma
|
|
|
|
++
|
|
|
|
++ sub x0, x0, x1, lsl #1 // x0 = pix - 2*stride
|
|
|
|
++ st1 {v17.16B}, [x0], x1 // new p1
|
|
|
|
++ st1 {v16.16B}, [x0], x1 // new p0
|
|
|
|
++ st1 {v0.16B}, [x0], x1 // new q0
|
|
|
|
++ st1 {v19.16B}, [x0] // new q1
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_h_loop_filter_luma_neon, export=1 // x0 = pix, w1 = stride, w2 = alpha, w3 = beta, x4 = tc0 (per the C prototype); filters across a vertical edge, 16 rows tall
|
|
|
|
++ h264_loop_filter_start
|
|
|
|
++ sxtw x1, w1 // stride is declared int: the upper half of x1 is unspecified on entry, so sign-extend before address arithmetic (as the vertical variant does)
|
|
|
|
++ sub x0, x0, #4 // start 4 pixels left of the edge
|
|
|
|
++ ld1 {v6.8B}, [x0], x1 // rows 0-7: 8 bytes each into the low halves
|
|
|
|
++ ld1 {v20.8B}, [x0], x1
|
|
|
|
++ ld1 {v18.8B}, [x0], x1
|
|
|
|
++ ld1 {v16.8B}, [x0], x1
|
|
|
|
++ ld1 {v0.8B}, [x0], x1
|
|
|
|
++ ld1 {v2.8B}, [x0], x1
|
|
|
|
++ ld1 {v4.8B}, [x0], x1
|
|
|
|
++ ld1 {v26.8B}, [x0], x1
|
|
|
|
++ ld1 {v6.D}[1], [x0], x1 // rows 8-15: 8 bytes each into the high halves
|
|
|
|
++ ld1 {v20.D}[1], [x0], x1
|
|
|
|
++ ld1 {v18.D}[1], [x0], x1
|
|
|
|
++ ld1 {v16.D}[1], [x0], x1
|
|
|
|
++ ld1 {v0.D}[1], [x0], x1
|
|
|
|
++ ld1 {v2.D}[1], [x0], x1
|
|
|
|
++ ld1 {v4.D}[1], [x0], x1
|
|
|
|
++ ld1 {v26.D}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23 // macro from neon.S; afterwards v20/v18/v16 = p2/p1/p0 and v0/v2/v4 = q0/q1/q2 as the filter macro expects
|
|
|
|
++
|
|
|
|
++ h264_loop_filter_luma
|
|
|
|
++
|
|
|
|
++ transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27 // back to row order: p1, p0, q0, q1
|
|
|
|
++
|
|
|
|
++ sub x0, x0, x1, lsl #4 // rewind the 16 rows consumed by the loads
|
|
|
|
++ add x0, x0, #2 // x0 = pix - 2: only p1..q1 are written back
|
|
|
|
++ st1 {v17.S}[0], [x0], x1 // 4 bytes per row, 16 rows
|
|
|
|
++ st1 {v16.S}[0], [x0], x1
|
|
|
|
++ st1 {v0.S}[0], [x0], x1
|
|
|
|
++ st1 {v19.S}[0], [x0], x1
|
|
|
|
++ st1 {v17.S}[1], [x0], x1
|
|
|
|
++ st1 {v16.S}[1], [x0], x1
|
|
|
|
++ st1 {v0.S}[1], [x0], x1
|
|
|
|
++ st1 {v19.S}[1], [x0], x1
|
|
|
|
++ st1 {v17.S}[2], [x0], x1
|
|
|
|
++ st1 {v16.S}[2], [x0], x1
|
|
|
|
++ st1 {v0.S}[2], [x0], x1
|
|
|
|
++ st1 {v19.S}[2], [x0], x1
|
|
|
|
++ st1 {v17.S}[3], [x0], x1
|
|
|
|
++ st1 {v16.S}[3], [x0], x1
|
|
|
|
++ st1 {v0.S}[3], [x0], x1
|
|
|
|
++ st1 {v19.S}[3], [x0], x1
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro h264_loop_filter_chroma // in: v18/v16 = p1/p0, v0/v2 = q0/q1, v24.S[0] = tc0, w2/w3 = alpha/beta; out: v16/v0 = new p0/q0
|
|
|
|
++ dup v22.8B, w2 // alpha
|
|
|
|
++ uxtl v24.8H, v24.8B
|
|
|
|
++ uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
|
|
|
|
++ uxtl v4.8H, v0.8B
|
|
|
|
++ uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
|
|
|
|
++ usubw v4.8H, v4.8H, v16.8B // q0 - p0
|
|
|
|
++ sli v24.8H, v24.8H, #8 // each of the 4 tc0 bytes now covers 2 consecutive byte lanes
|
|
|
|
++ shl v4.8H, v4.8H, #2 // 4 * (q0 - p0)
|
|
|
|
++ uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
|
|
|
|
++ uaddw v4.8H, v4.8H, v18.8B // + p1
|
|
|
|
++ cmhi v26.8B, v22.8B, v26.8B // < alpha
|
|
|
|
++ usubw v4.8H, v4.8H, v2.8B // - q1
|
|
|
|
++ dup v22.8B, w3 // beta
|
|
|
|
++ rshrn v4.8B, v4.8H, #3 // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
|
|
|
|
++ cmhi v28.8B, v22.8B, v28.8B // < beta
|
|
|
|
++ cmhi v30.8B, v22.8B, v30.8B // < beta
|
|
|
|
++ smin v4.8B, v4.8B, v24.8B // clamp delta to <= tc0
|
|
|
|
++ neg v25.8B, v24.8B // -tc0
|
|
|
|
++ and v26.8B, v26.8B, v28.8B
|
|
|
|
++ smax v4.8B, v4.8B, v25.8B // clamp delta to >= -tc0
|
|
|
|
++ and v26.8B, v26.8B, v30.8B // v26 = final per-lane filter mask
|
|
|
|
++ uxtl v22.8H, v0.8B
|
|
|
|
++ and v4.8B, v4.8B, v26.8B // delta = 0 in unfiltered lanes
|
|
|
|
++ uxtl v28.8H, v16.8B
|
|
|
|
++ saddw v28.8H, v28.8H, v4.8B // p0 + delta
|
|
|
|
++ ssubw v22.8H, v22.8H, v4.8B // q0 - delta
|
|
|
|
++ sqxtun v16.8B, v28.8H // saturate to unsigned 8 bit: new p0
|
|
|
|
++ sqxtun v0.8B, v22.8H // saturate to unsigned 8 bit: new q0
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function ff_h264_v_loop_filter_chroma_neon, export=1 // x0 = pix, w1 = stride, w2 = alpha, w3 = beta, x4 = tc0 (per the C prototype); filters across a horizontal edge, 8 pixels wide
|
|
|
|
++ h264_loop_filter_start
|
|
|
|
++ sxtw x1, w1 // stride is declared int: the upper half of x1 is unspecified on entry, so sign-extend before address arithmetic
|
|
|
|
++ sub x0, x0, x1, lsl #1 // x0 = pix - 2*stride
|
|
|
|
++ ld1 {v18.8B}, [x0], x1 // p1
|
|
|
|
++ ld1 {v16.8B}, [x0], x1 // p0
|
|
|
|
++ ld1 {v0.8B}, [x0], x1 // q0
|
|
|
|
++ ld1 {v2.8B}, [x0] // q1 (x0 left at pix + stride)
|
|
|
|
++
|
|
|
|
++ h264_loop_filter_chroma
|
|
|
|
++
|
|
|
|
++ sub x0, x0, x1, lsl #1 // x0 = pix - stride
|
|
|
|
++ st1 {v16.8B}, [x0], x1 // new p0
|
|
|
|
++ st1 {v0.8B}, [x0], x1 // new q0
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_h_loop_filter_chroma_neon, export=1 // x0 = pix, w1 = stride, w2 = alpha, w3 = beta, x4 = tc0 (per the C prototype); filters across a vertical edge, 8 rows tall
|
|
|
|
++ h264_loop_filter_start
|
|
|
|
++ sxtw x1, w1 // stride is declared int: the upper half of x1 is unspecified on entry, so sign-extend before address arithmetic
|
|
|
|
++ sub x0, x0, #2 // start 2 pixels left of the edge
|
|
|
|
++ ld1 {v18.S}[0], [x0], x1 // 4 bytes per row, 8 rows
|
|
|
|
++ ld1 {v16.S}[0], [x0], x1
|
|
|
|
++ ld1 {v0.S}[0], [x0], x1
|
|
|
|
++ ld1 {v2.S}[0], [x0], x1
|
|
|
|
++ ld1 {v18.S}[1], [x0], x1
|
|
|
|
++ ld1 {v16.S}[1], [x0], x1
|
|
|
|
++ ld1 {v0.S}[1], [x0], x1
|
|
|
|
++ ld1 {v2.S}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 // macro from neon.S; afterwards v18/v16/v0/v2 = p1/p0/q0/q1 as the filter macro expects
|
|
|
|
++
|
|
|
|
++ h264_loop_filter_chroma
|
|
|
|
++
|
|
|
|
++ transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 // back to row order
|
|
|
|
++
|
|
|
|
++ sub x0, x0, x1, lsl #3 // rewind the 8 rows consumed by the loads
|
|
|
|
++ st1 {v18.S}[0], [x0], x1 // write all 4 bytes of each row back
|
|
|
|
++ st1 {v16.S}[0], [x0], x1
|
|
|
|
++ st1 {v0.S}[0], [x0], x1
|
|
|
|
++ st1 {v2.S}[0], [x0], x1
|
|
|
|
++ st1 {v18.S}[1], [x0], x1
|
|
|
|
++ st1 {v16.S}[1], [x0], x1
|
|
|
|
++ st1 {v0.S}[1], [x0], x1
|
|
|
|
++ st1 {v2.S}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro biweight_16 macs, macd // 16-wide bi-weighting loop; macd/macs = multiply-accumulate mnemonics applied to the x0 / x1 pixels; expects v16 (initial accumulator), v18 (shift), x7 (store pointer)
|
|
|
|
++ dup v0.16B, w5 // weight applied to the pixels read through x0
|
|
|
|
++ dup v1.16B, w6 // weight applied to the pixels read through x1
|
|
|
|
++ mov v4.16B, v16.16B // accumulators start from v16
|
|
|
|
++ mov v6.16B, v16.16B
|
|
|
|
++1: subs w3, w3, #2 // two rows per iteration; flags are consumed by the b.ne at the bottom
|
|
|
|
++ ld1 {v20.16B}, [x0], x2
|
|
|
|
++ \macd v4.8H, v0.8B, v20.8B
|
|
|
|
++ \macd\()2 v6.8H, v0.16B, v20.16B
|
|
|
|
++ ld1 {v22.16B}, [x1], x2
|
|
|
|
++ \macs v4.8H, v1.8B, v22.8B
|
|
|
|
++ \macs\()2 v6.8H, v1.16B, v22.16B
|
|
|
|
++ mov v24.16B, v16.16B // accumulators for the second row
|
|
|
|
++ ld1 {v28.16B}, [x0], x2
|
|
|
|
++ mov v26.16B, v16.16B
|
|
|
|
++ \macd v24.8H, v0.8B, v28.8B
|
|
|
|
++ \macd\()2 v26.8H, v0.16B, v28.16B
|
|
|
|
++ ld1 {v30.16B}, [x1], x2
|
|
|
|
++ \macs v24.8H, v1.8B, v30.8B
|
|
|
|
++ \macs\()2 v26.8H, v1.16B, v30.16B
|
|
|
|
++ sshl v4.8H, v4.8H, v18.8H // v18 is negative, so this is an arithmetic right shift
|
|
|
|
++ sshl v6.8H, v6.8H, v18.8H
|
|
|
|
++ sqxtun v4.8B, v4.8H // saturate to unsigned 8 bit
|
|
|
|
++ sqxtun2 v4.16B, v6.8H
|
|
|
|
++ sshl v24.8H, v24.8H, v18.8H
|
|
|
|
++ sshl v26.8H, v26.8H, v18.8H
|
|
|
|
++ sqxtun v24.8B, v24.8H
|
|
|
|
++ sqxtun2 v24.16B, v26.8H
|
|
|
|
++ mov v6.16B, v16.16B // re-seed accumulators for the next iteration
|
|
|
|
++ st1 {v4.16B}, [x7], x2
|
|
|
|
++ mov v4.16B, v16.16B
|
|
|
|
++ st1 {v24.16B}, [x7], x2
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro biweight_8 macs, macd // 8-wide bi-weighting loop; same register contract as biweight_16 (v16 seed, v18 shift, x7 store pointer)
|
|
|
|
++ dup v0.8B, w5 // weight applied to the pixels read through x0
|
|
|
|
++ dup v1.8B, w6 // weight applied to the pixels read through x1
|
|
|
|
++ mov v2.16B, v16.16B // accumulators start from v16
|
|
|
|
++ mov v20.16B, v16.16B
|
|
|
|
++1: subs w3, w3, #2 // two rows per iteration
|
|
|
|
++ ld1 {v4.8B}, [x0], x2
|
|
|
|
++ \macd v2.8H, v0.8B, v4.8B
|
|
|
|
++ ld1 {v5.8B}, [x1], x2
|
|
|
|
++ \macs v2.8H, v1.8B, v5.8B
|
|
|
|
++ ld1 {v6.8B}, [x0], x2
|
|
|
|
++ \macd v20.8H, v0.8B, v6.8B
|
|
|
|
++ ld1 {v7.8B}, [x1], x2
|
|
|
|
++ \macs v20.8H, v1.8B, v7.8B
|
|
|
|
++ sshl v2.8H, v2.8H, v18.8H // v18 is negative: arithmetic right shift
|
|
|
|
++ sqxtun v2.8B, v2.8H // saturate to unsigned 8 bit
|
|
|
|
++ sshl v20.8H, v20.8H, v18.8H
|
|
|
|
++ sqxtun v4.8B, v20.8H
|
|
|
|
++ mov v20.16B, v16.16B // re-seed accumulators for the next iteration
|
|
|
|
++ st1 {v2.8B}, [x7], x2
|
|
|
|
++ mov v2.16B, v16.16B
|
|
|
|
++ st1 {v4.8B}, [x7], x2
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro biweight_4 macs, macd // 4-wide bi-weighting loop, two rows packed per 8-byte vector; same register contract as biweight_16
|
|
|
|
++ dup v0.8B, w5 // weight applied to the pixels read through x0
|
|
|
|
++ dup v1.8B, w6 // weight applied to the pixels read through x1
|
|
|
|
++ mov v2.16B, v16.16B // accumulators start from v16
|
|
|
|
++ mov v20.16B,v16.16B
|
|
|
|
++1: subs w3, w3, #4 // up to four rows per iteration
|
|
|
|
++ ld1 {v4.S}[0], [x0], x2
|
|
|
|
++ ld1 {v4.S}[1], [x0], x2
|
|
|
|
++ \macd v2.8H, v0.8B, v4.8B
|
|
|
|
++ ld1 {v5.S}[0], [x1], x2
|
|
|
|
++ ld1 {v5.S}[1], [x1], x2
|
|
|
|
++ \macs v2.8H, v1.8B, v5.8B
|
|
|
|
++ b.lt 2f // fewer than 4 rows were left: finish with just the 2 rows already accumulated
|
|
|
|
++ ld1 {v6.S}[0], [x0], x2
|
|
|
|
++ ld1 {v6.S}[1], [x0], x2
|
|
|
|
++ \macd v20.8H, v0.8B, v6.8B
|
|
|
|
++ ld1 {v7.S}[0], [x1], x2
|
|
|
|
++ ld1 {v7.S}[1], [x1], x2
|
|
|
|
++ \macs v20.8H, v1.8B, v7.8B
|
|
|
|
++ sshl v2.8H, v2.8H, v18.8H // v18 is negative: arithmetic right shift
|
|
|
|
++ sqxtun v2.8B, v2.8H // saturate to unsigned 8 bit
|
|
|
|
++ sshl v20.8H, v20.8H, v18.8H
|
|
|
|
++ sqxtun v4.8B, v20.8H
|
|
|
|
++ mov v20.16B, v16.16B // re-seed accumulators for the next iteration
|
|
|
|
++ st1 {v2.S}[0], [x7], x2
|
|
|
|
++ st1 {v2.S}[1], [x7], x2
|
|
|
|
++ mov v2.16B, v16.16B
|
|
|
|
++ st1 {v4.S}[0], [x7], x2
|
|
|
|
++ st1 {v4.S}[1], [x7], x2
|
|
|
|
++ b.ne 1b // flags still come from the subs at 1: (vector ops do not touch them)
|
|
|
|
++ ret
|
|
|
|
++2: sshl v2.8H, v2.8H, v18.8H // tail: only two rows
|
|
|
|
++ sqxtun v2.8B, v2.8H
|
|
|
|
++ st1 {v2.S}[0], [x7], x2
|
|
|
|
++ st1 {v2.S}[1], [x7], x2
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro biweight_func w // emits ff_biweight_h264_pixels_<w>_neon: shared setup, then dispatch on the signs of the two weights
|
|
|
|
++function ff_biweight_h264_pixels_\w\()_neon, export=1 // per the C prototype: x0 = dst, x1 = src, w2 = stride, w3 = height, w4 = log2_den, w5 = weightd, w6 = weights, w7 = offset
|
|
|
|
++ sxtw x2, w2 // stride is an int: sign-extend for address arithmetic
|
|
|
|
++ lsr w8, w5, #31 // w8 = sign bit of weightd
|
|
|
|
++ add w7, w7, #1
|
|
|
|
++ eor w8, w8, w6, lsr #30 // fold in the top two bits of weights; selector 0..3, assuming bits 30 and 31 of weights agree
|
|
|
|
++ orr w7, w7, #1 // w7 = (offset + 1) | 1
|
|
|
|
++ dup v18.8H, w4
|
|
|
|
++ lsl w7, w7, w4 // w7 = ((offset + 1) | 1) << log2_den
|
|
|
|
++ not v18.16B, v18.16B // v18 = ~log2_den = -(log2_den + 1): shift count for sshl (right shift by log2_den + 1)
|
|
|
|
++ dup v16.8H, w7 // v16 = initial accumulator value
|
|
|
|
++ mov x7, x0 // x7 = store pointer; x0 keeps advancing as the dst load pointer
|
|
|
|
++ cbz w8, 10f // selector 0: both weights non-negative
|
|
|
|
++ subs w8, w8, #1
|
|
|
|
++ b.eq 20f // selector 1: only weightd negative
|
|
|
|
++ subs w8, w8, #1
|
|
|
|
++ b.eq 30f // selector 2: both negative
|
|
|
|
++ b 40f // selector 3: only weights negative
|
|
|
|
++10: biweight_\w umlal, umlal // each biweight_<w> expansion ends in ret, so there is no fall-through between cases
|
|
|
|
++20: neg w5, w5 // use |weightd| and subtract its product instead
|
|
|
|
++ biweight_\w umlal, umlsl
|
|
|
|
++30: neg w5, w5
|
|
|
|
++ neg w6, w6
|
|
|
|
++ biweight_\w umlsl, umlsl
|
|
|
|
++40: neg w6, w6 // use |weights| and subtract its product instead
|
|
|
|
++ biweight_\w umlsl, umlal
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++ biweight_func 16 // ff_biweight_h264_pixels_16_neon
|
|
|
|
++ biweight_func 8 // ff_biweight_h264_pixels_8_neon
|
|
|
|
++ biweight_func 4 // ff_biweight_h264_pixels_4_neon
|
|
|
|
++
|
|
|
|
++.macro weight_16 add // 16-wide weighting loop; \add = mnemonic combining v16 with the weighted pixels; expects v16, v18 (shift) and x5 (store pointer)
|
|
|
|
++ dup v0.16B, w4 // weight
|
|
|
|
++1: subs w2, w2, #2 // two rows per iteration; flags are consumed by the b.ne at the bottom
|
|
|
|
++ ld1 {v20.16B}, [x0], x1
|
|
|
|
++ umull v4.8H, v0.8B, v20.8B // weight * pixel, low 8 lanes
|
|
|
|
++ umull2 v6.8H, v0.16B, v20.16B // weight * pixel, high 8 lanes
|
|
|
|
++ ld1 {v28.16B}, [x0], x1
|
|
|
|
++ umull v24.8H, v0.8B, v28.8B
|
|
|
|
++ umull2 v26.8H, v0.16B, v28.16B
|
|
|
|
++ \add v4.8H, v16.8H, v4.8H // v16 combined with the product (v16 is the first operand)
|
|
|
|
++ srshl v4.8H, v4.8H, v18.8H // rounding shift by v18 (negative count = right shift)
|
|
|
|
++ \add v6.8H, v16.8H, v6.8H
|
|
|
|
++ srshl v6.8H, v6.8H, v18.8H
|
|
|
|
++ sqxtun v4.8B, v4.8H // saturate to unsigned 8 bit
|
|
|
|
++ sqxtun2 v4.16B, v6.8H
|
|
|
|
++ \add v24.8H, v16.8H, v24.8H
|
|
|
|
++ srshl v24.8H, v24.8H, v18.8H
|
|
|
|
++ \add v26.8H, v16.8H, v26.8H
|
|
|
|
++ srshl v26.8H, v26.8H, v18.8H
|
|
|
|
++ sqxtun v24.8B, v24.8H
|
|
|
|
++ sqxtun2 v24.16B, v26.8H
|
|
|
|
++ st1 {v4.16B}, [x5], x1
|
|
|
|
++ st1 {v24.16B}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro weight_8 add // 8-wide weighting loop; same register contract as weight_16 (v16, v18 shift, x5 store pointer)
|
|
|
|
++ dup v0.8B, w4 // weight
|
|
|
|
++1: subs w2, w2, #2 // two rows per iteration
|
|
|
|
++ ld1 {v4.8B}, [x0], x1
|
|
|
|
++ umull v2.8H, v0.8B, v4.8B // weight * pixel
|
|
|
|
++ ld1 {v6.8B}, [x0], x1
|
|
|
|
++ umull v20.8H, v0.8B, v6.8B
|
|
|
|
++ \add v2.8H, v16.8H, v2.8H // v16 combined with the product
|
|
|
|
++ srshl v2.8H, v2.8H, v18.8H // rounding shift by v18 (negative count = right shift)
|
|
|
|
++ sqxtun v2.8B, v2.8H // saturate to unsigned 8 bit
|
|
|
|
++ \add v20.8H, v16.8H, v20.8H
|
|
|
|
++ srshl v20.8H, v20.8H, v18.8H
|
|
|
|
++ sqxtun v4.8B, v20.8H
|
|
|
|
++ st1 {v2.8B}, [x5], x1
|
|
|
|
++ st1 {v4.8B}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro weight_4 add // 4-wide weighting loop, two rows packed per 8-byte vector; same register contract as weight_16
|
|
|
|
++ dup v0.8B, w4 // weight
|
|
|
|
++1: subs w2, w2, #4 // up to four rows per iteration
|
|
|
|
++ ld1 {v4.S}[0], [x0], x1
|
|
|
|
++ ld1 {v4.S}[1], [x0], x1
|
|
|
|
++ umull v2.8H, v0.8B, v4.8B // weight * pixel for rows n, n+1
|
|
|
|
++ b.lt 2f // fewer than 4 rows were left: finish with just these 2 rows
|
|
|
|
++ ld1 {v6.S}[0], [x0], x1
|
|
|
|
++ ld1 {v6.S}[1], [x0], x1
|
|
|
|
++ umull v20.8H, v0.8B, v6.8B // weight * pixel for rows n+2, n+3
|
|
|
|
++ \add v2.8H, v16.8H, v2.8H // v16 combined with the product
|
|
|
|
++ srshl v2.8H, v2.8H, v18.8H // rounding shift by v18 (negative count = right shift)
|
|
|
|
++ sqxtun v2.8B, v2.8H // saturate to unsigned 8 bit
|
|
|
|
++ \add v20.8H, v16.8H, v20.8H
|
|
|
|
++ srshl v20.8H, v20.8h, v18.8H
|
|
|
|
++ sqxtun v4.8B, v20.8H
|
|
|
|
++ st1 {v2.S}[0], [x5], x1
|
|
|
|
++ st1 {v2.S}[1], [x5], x1
|
|
|
|
++ st1 {v4.S}[0], [x5], x1
|
|
|
|
++ st1 {v4.S}[1], [x5], x1
|
|
|
|
++ b.ne 1b // flags still come from the subs at 1:
|
|
|
|
++ ret
|
|
|
|
++2: \add v2.8H, v16.8H, v2.8H // tail: only two rows
|
|
|
|
++ srshl v2.8H, v2.8H, v18.8H
|
|
|
|
++ sqxtun v2.8B, v2.8H
|
|
|
|
++ st1 {v2.S}[0], [x5], x1
|
|
|
|
++ st1 {v2.S}[1], [x5], x1
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro weight_func w // emits ff_weight_h264_pixels_<w>_neon: shared setup, then one of four weight_<w> variants
|
|
|
|
++function ff_weight_h264_pixels_\w\()_neon, export=1 // per the C prototype: x0 = dst, w1 = stride, w2 = height, w3 = log2_den, w4 = weight, w5 = offset
|
|
|
|
++ sxtw x1, w1 // stride is an int: sign-extend for address arithmetic
|
|
|
|
++ cmp w3, #1 // flags consumed by the b.le below (the instructions in between leave them alone)
|
|
|
|
++ mov w6, #1
|
|
|
|
++ lsl w5, w5, w3 // offset << log2_den
|
|
|
|
++ dup v16.8H, w5
|
|
|
|
++ mov x5, x0 // x5 = store pointer; x0 keeps advancing as the load pointer
|
|
|
|
++ b.le 20f // log2_den <= 1: use the plain add/sub variants
|
|
|
|
++ sub w6, w6, w3 // 1 - log2_den: shift count used together with the halving shadd/shsub
|
|
|
|
++ dup v18.8H, w6
|
|
|
|
++ cmp w4, #0
|
|
|
|
++ b.lt 10f // negative weight: negate it and subtract instead
|
|
|
|
++ weight_\w shadd // each weight_<w> expansion ends in ret, so there is no fall-through
|
|
|
|
++10: neg w4, w4
|
|
|
|
++ weight_\w shsub
|
|
|
|
++20: neg w6, w3 // shift count = -log2_den
|
|
|
|
++ dup v18.8H, w6
|
|
|
|
++ cmp w4, #0
|
|
|
|
++ b.lt 10f // forward reference: resolves to the second 10: label below
|
|
|
|
++ weight_\w add
|
|
|
|
++10: neg w4, w4
|
|
|
|
++ weight_\w sub
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++ weight_func 16 // ff_weight_h264_pixels_16_neon
|
|
|
|
++ weight_func 8 // ff_weight_h264_pixels_8_neon
|
|
|
|
++ weight_func 4 // ff_weight_h264_pixels_4_neon
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264idct_neon.S b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
|
|
|
|
+@@ -0,0 +1,409 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++#include "neon.S"
|
|
|
|
++
|
|
|
|
++function ff_h264_idct_add_neon, export=1 // x0 = dst, x1 = block (16 int16 coefficients), w2 = stride; adds the 4x4 inverse transform to dst and zeroes block
|
|
|
|
++ ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] // load all 16 coefficients, 4 per register
|
|
|
|
++ sxtw x2, w2 // stride is an int: sign-extend for address arithmetic
|
|
|
|
++ movi v30.8H, #0
|
|
|
|
++
|
|
|
|
++ add v4.4H, v0.4H, v2.4H // first pass butterflies
|
|
|
|
++ sshr v16.4H, v1.4H, #1
|
|
|
|
++ st1 {v30.8H}, [x1], #16 // clear the first 16 bytes of block
|
|
|
|
++ sshr v17.4H, v3.4H, #1
|
|
|
|
++ st1 {v30.8H}, [x1], #16 // clear the second 16 bytes of block
|
|
|
|
++ sub v5.4H, v0.4H, v2.4H
|
|
|
|
++ sub v6.4H, v16.4H, v3.4H
|
|
|
|
++ add v7.4H, v1.4H, v17.4H
|
|
|
|
++ add v0.4H, v4.4H, v7.4H
|
|
|
|
++ add v1.4H, v5.4H, v6.4H
|
|
|
|
++ sub v2.4H, v5.4H, v6.4H
|
|
|
|
++ sub v3.4H, v4.4H, v7.4H
|
|
|
|
++
|
|
|
|
++ transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 // macro from neon.S
|
|
|
|
++
|
|
|
|
++ add v4.4H, v0.4H, v2.4H // second pass, interleaved with the dst loads
|
|
|
|
++ ld1 {v18.S}[0], [x0], x2 // dst row 0
|
|
|
|
++ sshr v16.4H, v3.4H, #1
|
|
|
|
++ sshr v17.4H, v1.4H, #1
|
|
|
|
++ ld1 {v18.S}[1], [x0], x2 // dst row 1
|
|
|
|
++ sub v5.4H, v0.4H, v2.4H
|
|
|
|
++ ld1 {v19.S}[1], [x0], x2 // dst row 2 goes into the HIGH half to match v1's layout below
|
|
|
|
++ add v6.4H, v16.4H, v1.4H
|
|
|
|
++ ins v4.D[1], v5.D[0] // pack two 4-lane intermediates into one 8-lane register
|
|
|
|
++ sub v7.4H, v17.4H, v3.4H
|
|
|
|
++ ld1 {v19.S}[0], [x0], x2 // dst row 3 goes into the LOW half
|
|
|
|
++ ins v6.D[1], v7.D[0]
|
|
|
|
++ sub x0, x0, x2, lsl #2 // rewind dst by 4 rows
|
|
|
|
++ add v0.8H, v4.8H, v6.8H // v0 = output rows 0 (low) and 1 (high)
|
|
|
|
++ sub v1.8H, v4.8H, v6.8H // v1 = output rows 3 (low) and 2 (high)
|
|
|
|
++
|
|
|
|
++ srshr v0.8H, v0.8H, #6 // rounding shift: (x + 32) >> 6
|
|
|
|
++ srshr v1.8H, v1.8H, #6
|
|
|
|
++
|
|
|
|
++ uaddw v0.8H, v0.8H, v18.8B // add the existing dst pixels
|
|
|
|
++ uaddw v1.8H, v1.8H, v19.8B
|
|
|
|
++
|
|
|
|
++ sqxtun v0.8B, v0.8H // saturate to unsigned 8 bit
|
|
|
|
++ sqxtun v1.8B, v1.8H
|
|
|
|
++
|
|
|
|
++ st1 {v0.S}[0], [x0], x2 // row 0
|
|
|
|
++ st1 {v0.S}[1], [x0], x2 // row 1
|
|
|
|
++ st1 {v1.S}[1], [x0], x2 // row 2 (high half of v1)
|
|
|
|
++ st1 {v1.S}[0], [x0], x2 // row 3 (low half of v1)
|
|
|
|
++
|
|
|
|
++ sub x1, x1, #32 // restore x1 to the start of block (callers in this file rely on it)
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_idct_dc_add_neon, export=1 // x0 = dst, x1 = block, w2 = stride; adds the rounded DC value to a 4x4 dst area and clears block[0]; clobbers w3
|
|
|
|
++ sxtw x2, w2 // stride is an int: sign-extend for address arithmetic
|
|
|
|
++ mov w3, #0
|
|
|
|
++ ld1r {v2.8H}, [x1] // broadcast block[0] to all 8 lanes
|
|
|
|
++ strh w3, [x1] // block[0] = 0
|
|
|
|
++ srshr v2.8H, v2.8H, #6 // (dc + 32) >> 6
|
|
|
|
++ ld1 {v0.S}[0], [x0], x2 // dst rows 0 and 1
|
|
|
|
++ ld1 {v0.S}[1], [x0], x2
|
|
|
|
++ uaddw v3.8H, v2.8H, v0.8B
|
|
|
|
++ ld1 {v1.S}[0], [x0], x2 // dst rows 2 and 3
|
|
|
|
++ ld1 {v1.S}[1], [x0], x2
|
|
|
|
++ uaddw v4.8H, v2.8H, v1.8B
|
|
|
|
++ sqxtun v0.8B, v3.8H // saturate to unsigned 8 bit
|
|
|
|
++ sqxtun v1.8B, v4.8H
|
|
|
|
++ sub x0, x0, x2, lsl #2 // rewind dst by 4 rows
|
|
|
|
++ st1 {v0.S}[0], [x0], x2
|
|
|
|
++ st1 {v0.S}[1], [x0], x2
|
|
|
|
++ st1 {v1.S}[0], [x0], x2
|
|
|
|
++ st1 {v1.S}[1], [x0], x2
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_idct_add16_neon, export=1 // x0 = dst, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc; loops over 16 4x4 blocks
|
|
|
|
++ mov x12, x30 // keep the return address: blr below overwrites x30
|
|
|
|
++ mov x6, x0 // dest
|
|
|
|
++ mov x5, x1 // block_offset
|
|
|
|
++ mov x1, x2 // block
|
|
|
|
++ mov w9, w3 // stride
|
|
|
|
++ movrel x7, scan8 // scan8 table is defined outside this excerpt
|
|
|
|
++ mov x10, #16 // loop counter: 16 blocks
|
|
|
|
++ movrel x13, X(ff_h264_idct_dc_add_neon)
|
|
|
|
++ movrel x14, X(ff_h264_idct_add_neon)
|
|
|
|
++1: mov w2, w9 // stride argument for the callee
|
|
|
|
++ ldrb w3, [x7], #1 // scan8[i]
|
|
|
|
++ ldrsw x0, [x5], #4 // block_offset[i]
|
|
|
|
++ ldrb w3, [x4, w3, uxtw] // nnzc[scan8[i]]
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.lt 2f // nnzc == 0: nothing to do for this block
|
|
|
|
++ ldrsh w3, [x1] // block[0] (does not alter the flags)
|
|
|
|
++ add x0, x0, x6 // dst + block_offset[i]
|
|
|
|
++ ccmp w3, #0, #4, eq // if nnzc == 1: compare block[0] with 0; otherwise force Z = 1
|
|
|
|
++ csel x15, x13, x14, ne // DC-only routine iff nnzc == 1 and block[0] != 0, else the full IDCT
|
|
|
|
++ blr x15
|
|
|
|
++2: subs x10, x10, #1
|
|
|
|
++ add x1, x1, #32 // next block: 16 coefficients of 2 bytes
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret x12
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_idct_add16intra_neon, export=1 // x0 = dst, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc; loops over 16 4x4 blocks
|
|
|
|
++ mov x12, x30 // keep the return address: blr below overwrites x30
|
|
|
|
++ mov x6, x0 // dest
|
|
|
|
++ mov x5, x1 // block_offset
|
|
|
|
++ mov x1, x2 // block
|
|
|
|
++ mov w9, w3 // stride
|
|
|
|
++ movrel x7, scan8 // scan8 table is defined outside this excerpt
|
|
|
|
++ mov x10, #16 // loop counter: 16 blocks
|
|
|
|
++ movrel x13, X(ff_h264_idct_dc_add_neon)
|
|
|
|
++ movrel x14, X(ff_h264_idct_add_neon)
|
|
|
|
++1: mov w2, w9 // stride argument for the callee
|
|
|
|
++ ldrb w3, [x7], #1 // scan8[i]
|
|
|
|
++ ldrsw x0, [x5], #4 // block_offset[i]
|
|
|
|
++ ldrb w3, [x4, w3, uxtw] // nnzc[scan8[i]]
|
|
|
|
++ add x0, x0, x6 // dst + block_offset[i]
|
|
|
|
++ cmp w3, #0
|
|
|
|
++ ldrsh w3, [x1] // block[0] (does not alter the flags)
|
|
|
|
++ csel x15, x13, x14, eq // nnzc == 0: DC-only routine, otherwise the full IDCT
|
|
|
|
++ ccmp w3, #0, #0, eq // if nnzc == 0: compare block[0] with 0; otherwise force Z = 0
|
|
|
|
++ b.eq 2f // skip only when nnzc == 0 and block[0] == 0
|
|
|
|
++ blr x15
|
|
|
|
++2: subs x10, x10, #1
|
|
|
|
++ add x1, x1, #32 // next block: 16 coefficients of 2 bytes
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret x12
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_idct_add8_neon, export=1
|
|
|
|
++ sub sp, sp, #0x40
|
|
|
|
++ stp x19, x20, [sp]
|
|
|
|
++ mov x12, x30
|
|
|
|
++ ldp x6, x15, [x0] // dest[0], dest[1]
|
|
|
|
++ add x5, x1, #16*4 // block_offset
|
|
|
|
++ add x9, x2, #16*32 // block
|
|
|
|
++ mov w19, w3 // stride
|
|
|
|
++ movrel x13, X(ff_h264_idct_dc_add_neon)
|
|
|
|
++ movrel x14, X(ff_h264_idct_add_neon)
|
|
|
|
++ movrel x7, scan8, 16
|
|
|
|
++ mov x10, #0
|
|
|
|
++ mov x11, #16
|
|
|
|
++1: mov w2, w19
|
|
|
|
++ ldrb w3, [x7, x10] // scan8[i]
|
|
|
|
++ ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
|
|
|
|
++ ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
|
|
|
|
++ add x0, x0, x6 // block_offset[i] + dst[j-1]
|
|
|
|
++ add x1, x9, x10, lsl #5 // block + i * 16
|
|
|
|
++ cmp w3, #0
|
|
|
|
++ ldrsh w3, [x1] // block[i*16]
|
|
|
|
++ csel x20, x13, x14, eq
|
|
|
|
++ ccmp w3, #0, #0, eq
|
|
|
|
++ b.eq 2f
|
|
|
|
++ blr x20
|
|
|
|
++2: add x10, x10, #1
|
|
|
|
++ cmp x10, #4
|
|
|
|
++ csel x10, x11, x10, eq // mov x10, #16
|
|
|
|
++ csel x6, x15, x6, eq
|
|
|
|
++ cmp x10, #20
|
|
|
|
++ b.lt 1b
|
|
|
|
++ ldp x19, x20, [sp]
|
|
|
|
++ add sp, sp, #0x40
|
|
|
|
++ ret x12
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct8x8_cols pass
|
|
|
|
++ .if \pass == 0
|
|
|
|
++ va .req v18
|
|
|
|
++ vb .req v30
|
|
|
|
++ sshr v18.8H, v26.8H, #1
|
|
|
|
++ add v16.8H, v24.8H, v28.8H
|
|
|
|
++ ld1 {v30.8H, v31.8H}, [x1]
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ sub v17.8H, v24.8H, v28.8H
|
|
|
|
++ sshr v19.8H, v30.8H, #1
|
|
|
|
++ sub v18.8H, v18.8H, v30.8H
|
|
|
|
++ add v19.8H, v19.8H, v26.8H
|
|
|
|
++ .else
|
|
|
|
++ va .req v30
|
|
|
|
++ vb .req v18
|
|
|
|
++ sshr v30.8H, v26.8H, #1
|
|
|
|
++ sshr v19.8H, v18.8H, #1
|
|
|
|
++ add v16.8H, v24.8H, v28.8H
|
|
|
|
++ sub v17.8H, v24.8H, v28.8H
|
|
|
|
++ sub v30.8H, v30.8H, v18.8H
|
|
|
|
++ add v19.8H, v19.8H, v26.8H
|
|
|
|
++ .endif
|
|
|
|
++ add v26.8H, v17.8H, va.8H
|
|
|
|
++ sub v28.8H, v17.8H, va.8H
|
|
|
|
++ add v24.8H, v16.8H, v19.8H
|
|
|
|
++ sub vb.8H, v16.8H, v19.8H
|
|
|
|
++ sub v16.8H, v29.8H, v27.8H
|
|
|
|
++ add v17.8H, v31.8H, v25.8H
|
|
|
|
++ sub va.8H, v31.8H, v25.8H
|
|
|
|
++ add v19.8H, v29.8H, v27.8H
|
|
|
|
++ sub v16.8H, v16.8H, v31.8H
|
|
|
|
++ sub v17.8H, v17.8H, v27.8H
|
|
|
|
++ add va.8H, va.8H, v29.8H
|
|
|
|
++ add v19.8H, v19.8H, v25.8H
|
|
|
|
++ sshr v25.8H, v25.8H, #1
|
|
|
|
++ sshr v27.8H, v27.8H, #1
|
|
|
|
++ sshr v29.8H, v29.8H, #1
|
|
|
|
++ sshr v31.8H, v31.8H, #1
|
|
|
|
++ sub v16.8H, v16.8H, v31.8H
|
|
|
|
++ sub v17.8H, v17.8H, v27.8H
|
|
|
|
++ add va.8H, va.8H, v29.8H
|
|
|
|
++ add v19.8H, v19.8H, v25.8H
|
|
|
|
++ sshr v25.8H, v16.8H, #2
|
|
|
|
++ sshr v27.8H, v17.8H, #2
|
|
|
|
++ sshr v29.8H, va.8H, #2
|
|
|
|
++ sshr v31.8H, v19.8H, #2
|
|
|
|
++ sub v19.8H, v19.8H, v25.8H
|
|
|
|
++ sub va.8H, v27.8H, va.8H
|
|
|
|
++ add v17.8H, v17.8H, v29.8H
|
|
|
|
++ add v16.8H, v16.8H, v31.8H
|
|
|
|
++ .if \pass == 0
|
|
|
|
++ sub v31.8H, v24.8H, v19.8H
|
|
|
|
++ add v24.8H, v24.8H, v19.8H
|
|
|
|
++ add v25.8H, v26.8H, v18.8H
|
|
|
|
++ sub v18.8H, v26.8H, v18.8H
|
|
|
|
++ add v26.8H, v28.8H, v17.8H
|
|
|
|
++ add v27.8H, v30.8H, v16.8H
|
|
|
|
++ sub v29.8H, v28.8H, v17.8H
|
|
|
|
++ sub v28.8H, v30.8H, v16.8H
|
|
|
|
++ .else
|
|
|
|
++ sub v31.8H, v24.8H, v19.8H
|
|
|
|
++ add v24.8H, v24.8H, v19.8H
|
|
|
|
++ add v25.8H, v26.8H, v30.8H
|
|
|
|
++ sub v30.8H, v26.8H, v30.8H
|
|
|
|
++ add v26.8H, v28.8H, v17.8H
|
|
|
|
++ sub v29.8H, v28.8H, v17.8H
|
|
|
|
++ add v27.8H, v18.8H, v16.8H
|
|
|
|
++ sub v28.8H, v18.8H, v16.8H
|
|
|
|
++ .endif
|
|
|
|
++ .unreq va
|
|
|
|
++ .unreq vb
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function ff_h264_idct8_add_neon, export=1
|
|
|
|
++ movi v19.8H, #0
|
|
|
|
++ sxtw x2, w2
|
|
|
|
++ ld1 {v24.8H, v25.8H}, [x1]
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ ld1 {v26.8H, v27.8H}, [x1]
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ ld1 {v28.8H, v29.8H}, [x1]
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++ st1 {v19.8H}, [x1], #16
|
|
|
|
++
|
|
|
|
++ idct8x8_cols 0
|
|
|
|
++ transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
|
|
|
|
++ idct8x8_cols 1
|
|
|
|
++
|
|
|
|
++ mov x3, x0
|
|
|
|
++ srshr v24.8H, v24.8H, #6
|
|
|
|
++ ld1 {v0.8B}, [x0], x2
|
|
|
|
++ srshr v25.8H, v25.8H, #6
|
|
|
|
++ ld1 {v1.8B}, [x0], x2
|
|
|
|
++ srshr v26.8H, v26.8H, #6
|
|
|
|
++ ld1 {v2.8B}, [x0], x2
|
|
|
|
++ srshr v27.8H, v27.8H, #6
|
|
|
|
++ ld1 {v3.8B}, [x0], x2
|
|
|
|
++ srshr v28.8H, v28.8H, #6
|
|
|
|
++ ld1 {v4.8B}, [x0], x2
|
|
|
|
++ srshr v29.8H, v29.8H, #6
|
|
|
|
++ ld1 {v5.8B}, [x0], x2
|
|
|
|
++ srshr v30.8H, v30.8H, #6
|
|
|
|
++ ld1 {v6.8B}, [x0], x2
|
|
|
|
++ srshr v31.8H, v31.8H, #6
|
|
|
|
++ ld1 {v7.8B}, [x0], x2
|
|
|
|
++ uaddw v24.8H, v24.8H, v0.8B
|
|
|
|
++ uaddw v25.8H, v25.8H, v1.8B
|
|
|
|
++ uaddw v26.8H, v26.8H, v2.8B
|
|
|
|
++ sqxtun v0.8B, v24.8H
|
|
|
|
++ uaddw v27.8H, v27.8H, v3.8B
|
|
|
|
++ sqxtun v1.8B, v25.8H
|
|
|
|
++ uaddw v28.8H, v28.8H, v4.8B
|
|
|
|
++ sqxtun v2.8B, v26.8H
|
|
|
|
++ st1 {v0.8B}, [x3], x2
|
|
|
|
++ uaddw v29.8H, v29.8H, v5.8B
|
|
|
|
++ sqxtun v3.8B, v27.8H
|
|
|
|
++ st1 {v1.8B}, [x3], x2
|
|
|
|
++ uaddw v30.8H, v30.8H, v6.8B
|
|
|
|
++ sqxtun v4.8B, v28.8H
|
|
|
|
++ st1 {v2.8B}, [x3], x2
|
|
|
|
++ uaddw v31.8H, v31.8H, v7.8B
|
|
|
|
++ sqxtun v5.8B, v29.8H
|
|
|
|
++ st1 {v3.8B}, [x3], x2
|
|
|
|
++ sqxtun v6.8B, v30.8H
|
|
|
|
++ sqxtun v7.8B, v31.8H
|
|
|
|
++ st1 {v4.8B}, [x3], x2
|
|
|
|
++ st1 {v5.8B}, [x3], x2
|
|
|
|
++ st1 {v6.8B}, [x3], x2
|
|
|
|
++ st1 {v7.8B}, [x3], x2
|
|
|
|
++
|
|
|
|
++ sub x1, x1, #128
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_idct8_dc_add_neon, export=1
|
|
|
|
++ mov w3, #0
|
|
|
|
++ sxtw x2, w2
|
|
|
|
++ ld1r {v31.8H}, [x1]
|
|
|
|
++ strh w3, [x1]
|
|
|
|
++ ld1 {v0.8B}, [x0], x2
|
|
|
|
++ srshr v31.8H, v31.8H, #6
|
|
|
|
++ ld1 {v1.8B}, [x0], x2
|
|
|
|
++ ld1 {v2.8B}, [x0], x2
|
|
|
|
++ uaddw v24.8H, v31.8H, v0.8B
|
|
|
|
++ ld1 {v3.8B}, [x0], x2
|
|
|
|
++ uaddw v25.8H, v31.8H, v1.8B
|
|
|
|
++ ld1 {v4.8B}, [x0], x2
|
|
|
|
++ uaddw v26.8H, v31.8H, v2.8B
|
|
|
|
++ ld1 {v5.8B}, [x0], x2
|
|
|
|
++ uaddw v27.8H, v31.8H, v3.8B
|
|
|
|
++ ld1 {v6.8B}, [x0], x2
|
|
|
|
++ uaddw v28.8H, v31.8H, v4.8B
|
|
|
|
++ ld1 {v7.8B}, [x0], x2
|
|
|
|
++ uaddw v29.8H, v31.8H, v5.8B
|
|
|
|
++ uaddw v30.8H, v31.8H, v6.8B
|
|
|
|
++ uaddw v31.8H, v31.8H, v7.8B
|
|
|
|
++ sqxtun v0.8B, v24.8H
|
|
|
|
++ sqxtun v1.8B, v25.8H
|
|
|
|
++ sqxtun v2.8B, v26.8H
|
|
|
|
++ sqxtun v3.8B, v27.8H
|
|
|
|
++ sub x0, x0, x2, lsl #3
|
|
|
|
++ st1 {v0.8B}, [x0], x2
|
|
|
|
++ sqxtun v4.8B, v28.8H
|
|
|
|
++ st1 {v1.8B}, [x0], x2
|
|
|
|
++ sqxtun v5.8B, v29.8H
|
|
|
|
++ st1 {v2.8B}, [x0], x2
|
|
|
|
++ sqxtun v6.8B, v30.8H
|
|
|
|
++ st1 {v3.8B}, [x0], x2
|
|
|
|
++ sqxtun v7.8B, v31.8H
|
|
|
|
++ st1 {v4.8B}, [x0], x2
|
|
|
|
++ st1 {v5.8B}, [x0], x2
|
|
|
|
++ st1 {v6.8B}, [x0], x2
|
|
|
|
++ st1 {v7.8B}, [x0], x2
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_h264_idct8_add4_neon, export=1
|
|
|
|
++ mov x12, x30
|
|
|
|
++ mov x6, x0
|
|
|
|
++ mov x5, x1
|
|
|
|
++ mov x1, x2
|
|
|
|
++ mov w2, w3
|
|
|
|
++ movrel x7, scan8
|
|
|
|
++ mov w10, #16
|
|
|
|
++ movrel x13, X(ff_h264_idct8_dc_add_neon)
|
|
|
|
++ movrel x14, X(ff_h264_idct8_add_neon)
|
|
|
|
++1: ldrb w9, [x7], #4
|
|
|
|
++ ldrsw x0, [x5], #16
|
|
|
|
++ ldrb w9, [x4, w9, UXTW]
|
|
|
|
++ subs w9, w9, #1
|
|
|
|
++ b.lt 2f
|
|
|
|
++ ldrsh w11, [x1]
|
|
|
|
++ add x0, x6, x0
|
|
|
|
++ ccmp w11, #0, #4, eq
|
|
|
|
++ csel x15, x13, x14, ne
|
|
|
|
++ blr x15
|
|
|
|
++2: subs w10, w10, #4
|
|
|
|
++ add x1, x1, #128
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret x12
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++const scan8
|
|
|
|
++ .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
|
|
|
|
++ .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
|
|
|
|
++ .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
|
|
|
|
++ .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
|
|
|
|
++ .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
|
|
|
|
++ .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
|
|
|
|
++ .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
|
|
|
|
++ .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
|
|
|
|
++ .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
|
|
|
|
++ .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
|
|
|
|
++ .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
|
|
|
|
++ .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
|
|
|
|
++endconst
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_init.c b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
|
|
|
|
+@@ -0,0 +1,93 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/avcodec.h"
|
|
|
|
++#include "libavcodec/h264pred.h"
|
|
|
|
++
|
|
|
|
++void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++
|
|
|
|
++void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
|
|
|
|
++
|
|
|
|
++static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
|
|
|
|
++ const int bit_depth,
|
|
|
|
++ const int chroma_format_idc)
|
|
|
|
++{
|
|
|
|
++ const int high_depth = bit_depth > 8;
|
|
|
|
++
|
|
|
|
++ if (high_depth)
|
|
|
|
++ return;
|
|
|
|
++
|
|
|
|
++ if (chroma_format_idc <= 1) {
|
|
|
|
++ h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
|
|
|
|
++ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon;
|
|
|
|
++ if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
|
|
|
|
++ h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
|
|
|
|
++ h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
|
|
|
|
++ if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
|
|
|
|
++ codec_id != AV_CODEC_ID_VP8) {
|
|
|
|
++ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon;
|
|
|
|
++ h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
|
|
|
|
++ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
|
|
|
|
++ h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
|
|
|
|
++ h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
|
|
|
|
++ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
|
|
|
|
++ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
|
|
|
|
++ }
|
|
|
|
++ }
|
|
|
|
++
|
|
|
|
++ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon;
|
|
|
|
++ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon;
|
|
|
|
++ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon;
|
|
|
|
++ h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
|
|
|
|
++ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
|
|
|
|
++ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
|
|
|
|
++ if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
|
|
|
|
++ codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
|
|
|
|
++ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
|
|
|
|
++ int bit_depth, const int chroma_format_idc)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags))
|
|
|
|
++ h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_neon.S b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
|
|
|
|
+@@ -0,0 +1,361 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++.macro ldcol.8 rd, rs, rt, n=8, hi=0
|
|
|
|
++.if \n >= 8 || \hi == 0
|
|
|
|
++ ld1 {\rd\().b}[0], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[1], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[2], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[3], [\rs], \rt
|
|
|
|
++.endif
|
|
|
|
++.if \n >= 8 || \hi == 1
|
|
|
|
++ ld1 {\rd\().b}[4], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[5], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[6], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[7], [\rs], \rt
|
|
|
|
++.endif
|
|
|
|
++.if \n == 16
|
|
|
|
++ ld1 {\rd\().b}[8], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[9], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[10], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[11], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[12], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[13], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[14], [\rs], \rt
|
|
|
|
++ ld1 {\rd\().b}[15], [\rs], \rt
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_128_dc_neon, export=1
|
|
|
|
++ movi v0.16b, #128
|
|
|
|
++ b .L_pred16x16_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_top_dc_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ ld1 {v0.16b}, [x2]
|
|
|
|
++ uaddlv h0, v0.16b
|
|
|
|
++ rshrn v0.8b, v0.8h, #4
|
|
|
|
++ dup v0.16b, v0.b[0]
|
|
|
|
++ b .L_pred16x16_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_left_dc_neon, export=1
|
|
|
|
++ sub x2, x0, #1
|
|
|
|
++ ldcol.8 v0, x2, x1, 16
|
|
|
|
++ uaddlv h0, v0.16b
|
|
|
|
++ rshrn v0.8b, v0.8h, #4
|
|
|
|
++ dup v0.16b, v0.b[0]
|
|
|
|
++ b .L_pred16x16_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_dc_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ sub x3, x0, #1
|
|
|
|
++ ld1 {v0.16b}, [x2]
|
|
|
|
++ ldcol.8 v1, x3, x1, 16
|
|
|
|
++ uaddlv h0, v0.16b
|
|
|
|
++ uaddlv h1, v1.16b
|
|
|
|
++ add v0.4h, v0.4h, v1.4h
|
|
|
|
++ rshrn v0.8b, v0.8h, #5
|
|
|
|
++ dup v0.16b, v0.b[0]
|
|
|
|
++.L_pred16x16_dc_end:
|
|
|
|
++ mov w3, #8
|
|
|
|
++6: st1 {v0.16b}, [x0], x1
|
|
|
|
++ st1 {v0.16b}, [x0], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 6b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_hor_neon, export=1
|
|
|
|
++ sub x2, x0, #1
|
|
|
|
++ mov w3, #16
|
|
|
|
++1: ld1r {v0.16b}, [x2], x1
|
|
|
|
++ st1 {v0.16b}, [x0], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_vert_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ add x1, x1, x1
|
|
|
|
++ ld1 {v0.16b}, [x2], x1
|
|
|
|
++ mov w3, #8
|
|
|
|
++1: st1 {v0.16b}, [x0], x1
|
|
|
|
++ st1 {v0.16b}, [x2], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred16x16_plane_neon, export=1
|
|
|
|
++ sub x3, x0, x1
|
|
|
|
++ movrel x4, p16weight
|
|
|
|
++ add x2, x3, #8
|
|
|
|
++ sub x3, x3, #1
|
|
|
|
++ ld1 {v0.8b}, [x3]
|
|
|
|
++ ld1 {v2.8b}, [x2], x1
|
|
|
|
++ ldcol.8 v1, x3, x1
|
|
|
|
++ add x3, x3, x1
|
|
|
|
++ ldcol.8 v3, x3, x1
|
|
|
|
++ rev64 v0.8b, v0.8b
|
|
|
|
++ rev64 v1.8b, v1.8b
|
|
|
|
++ uaddl v7.8h, v2.8b, v3.8b
|
|
|
|
++ usubl v2.8h, v2.8b, v0.8b
|
|
|
|
++ usubl v3.8h, v3.8b, v1.8b
|
|
|
|
++ ld1 {v0.8h}, [x4]
|
|
|
|
++ mul v2.8h, v2.8h, v0.8h
|
|
|
|
++ mul v3.8h, v3.8h, v0.8h
|
|
|
|
++ addp v2.8h, v2.8h, v3.8h
|
|
|
|
++ addp v2.8h, v2.8h, v2.8h
|
|
|
|
++ addp v2.4h, v2.4h, v2.4h
|
|
|
|
++ sshll v3.4s, v2.4h, #2
|
|
|
|
++ saddw v2.4s, v3.4s, v2.4h
|
|
|
|
++ rshrn v4.4h, v2.4s, #6
|
|
|
|
++ trn2 v5.4h, v4.4h, v4.4h
|
|
|
|
++ add v2.4h, v4.4h, v5.4h
|
|
|
|
++ shl v3.4h, v2.4h, #3
|
|
|
|
++ ext v7.16b, v7.16b, v7.16b, #14
|
|
|
|
++ sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
|
|
|
|
++ add v7.4h, v7.4h, v0.4h
|
|
|
|
++ shl v2.4h, v7.4h, #4
|
|
|
|
++ sub v2.4h, v2.4h, v3.4h
|
|
|
|
++ shl v3.4h, v4.4h, #4
|
|
|
|
++ ext v0.16b, v0.16b, v0.16b, #14
|
|
|
|
++ sub v6.4h, v5.4h, v3.4h
|
|
|
|
++ mov v0.h[0], wzr
|
|
|
|
++ mul v0.8h, v0.8h, v4.h[0]
|
|
|
|
++ dup v1.8h, v2.h[0]
|
|
|
|
++ dup v2.8h, v4.h[0]
|
|
|
|
++ dup v3.8h, v6.h[0]
|
|
|
|
++ shl v2.8h, v2.8h, #3
|
|
|
|
++ add v1.8h, v1.8h, v0.8h
|
|
|
|
++ add v3.8h, v3.8h, v2.8h
|
|
|
|
++ mov w3, #16
|
|
|
|
++1:
|
|
|
|
++ sqshrun v0.8b, v1.8h, #5
|
|
|
|
++ add v1.8h, v1.8h, v2.8h
|
|
|
|
++ sqshrun2 v0.16b, v1.8h, #5
|
|
|
|
++ add v1.8h, v1.8h, v3.8h
|
|
|
|
++ st1 {v0.16b}, [x0], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++const p16weight, align=4
|
|
|
|
++ .short 1,2,3,4,5,6,7,8
|
|
|
|
++endconst
|
|
|
|
++const p8weight, align=4
|
|
|
|
++ .short 1,2,3,4,1,2,3,4
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_hor_neon, export=1
|
|
|
|
++ sub x2, x0, #1
|
|
|
|
++ mov w3, #8
|
|
|
|
++1: ld1r {v0.8b}, [x2], x1
|
|
|
|
++ st1 {v0.8b}, [x0], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_vert_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ ld1 {v0.8b}, [x2], x1
|
|
|
|
++ mov w3, #4
|
|
|
|
++1: st1 {v0.8b}, [x0], x1
|
|
|
|
++ st1 {v0.8b}, [x2], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_plane_neon, export=1
|
|
|
|
++ sub x3, x0, x1
|
|
|
|
++ movrel x4, p8weight
|
|
|
|
++ movrel x5, p16weight
|
|
|
|
++ add x2, x3, #4
|
|
|
|
++ sub x3, x3, #1
|
|
|
|
++ ld1 {v0.s}[0], [x3]
|
|
|
|
++ ld1 {v2.s}[0], [x2], x1
|
|
|
|
++ ldcol.8 v0, x3, x1, 4, hi=1
|
|
|
|
++ add x3, x3, x1
|
|
|
|
++ ldcol.8 v3, x3, x1, 4
|
|
|
|
++ uaddl v7.8h, v2.8b, v3.8b
|
|
|
|
++ rev32 v0.8b, v0.8b
|
|
|
|
++ trn1 v2.2s, v2.2s, v3.2s
|
|
|
|
++ usubl v2.8h, v2.8b, v0.8b
|
|
|
|
++ ld1 {v6.8h}, [x4]
|
|
|
|
++ mul v2.8h, v2.8h, v6.8h
|
|
|
|
++ ld1 {v0.8h}, [x5]
|
|
|
|
++ saddlp v2.4s, v2.8h
|
|
|
|
++ addp v2.4s, v2.4s, v2.4s
|
|
|
|
++ shl v3.4s, v2.4s, #4
|
|
|
|
++ add v2.4s, v3.4s, v2.4s
|
|
|
|
++ rshrn v5.4h, v2.4s, #5
|
|
|
|
++ addp v2.4h, v5.4h, v5.4h
|
|
|
|
++ shl v3.4h, v2.4h, #1
|
|
|
|
++ add v3.4h, v3.4h, v2.4h
|
|
|
|
++ rev64 v7.4h, v7.4h
|
|
|
|
++ add v7.4h, v7.4h, v0.4h
|
|
|
|
++ shl v2.4h, v7.4h, #4
|
|
|
|
++ sub v2.4h, v2.4h, v3.4h
|
|
|
|
++ ext v0.16b, v0.16b, v0.16b, #14
|
|
|
|
++ mov v0.h[0], wzr
|
|
|
|
++ mul v0.8h, v0.8h, v5.h[0]
|
|
|
|
++ dup v1.8h, v2.h[0]
|
|
|
|
++ dup v2.8h, v5.h[1]
|
|
|
|
++ add v1.8h, v1.8h, v0.8h
|
|
|
|
++ mov w3, #8
|
|
|
|
++1:
|
|
|
|
++ sqshrun v0.8b, v1.8h, #5
|
|
|
|
++ add v1.8h, v1.8h, v2.8h
|
|
|
|
++ st1 {v0.8b}, [x0], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_128_dc_neon, export=1
|
|
|
|
++ movi v0.8b, #128
|
|
|
|
++ movi v1.8b, #128
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_top_dc_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ ld1 {v0.8b}, [x2]
|
|
|
|
++ uaddlp v0.4h, v0.8b
|
|
|
|
++ addp v0.4h, v0.4h, v0.4h
|
|
|
|
++ zip1 v0.8h, v0.8h, v0.8h
|
|
|
|
++ rshrn v2.8b, v0.8h, #2
|
|
|
|
++ zip1 v0.8b, v2.8b, v2.8b
|
|
|
|
++ zip1 v1.8b, v2.8b, v2.8b
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_left_dc_neon, export=1
|
|
|
|
++ sub x2, x0, #1
|
|
|
|
++ ldcol.8 v0, x2, x1
|
|
|
|
++ uaddlp v0.4h, v0.8b
|
|
|
|
++ addp v0.4h, v0.4h, v0.4h
|
|
|
|
++ rshrn v2.8b, v0.8h, #2
|
|
|
|
++ dup v1.8b, v2.b[1]
|
|
|
|
++ dup v0.8b, v2.b[0]
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_dc_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ sub x3, x0, #1
|
|
|
|
++ ld1 {v0.8b}, [x2]
|
|
|
|
++ ldcol.8 v1, x3, x1
|
|
|
|
++ uaddlp v0.4h, v0.8b
|
|
|
|
++ uaddlp v1.4h, v1.8b
|
|
|
|
++ trn1 v2.2s, v0.2s, v1.2s
|
|
|
|
++ trn2 v3.2s, v0.2s, v1.2s
|
|
|
|
++ addp v4.4h, v2.4h, v3.4h
|
|
|
|
++ addp v5.4h, v4.4h, v4.4h
|
|
|
|
++ rshrn v6.8b, v5.8h, #3
|
|
|
|
++ rshrn v7.8b, v4.8h, #2
|
|
|
|
++ dup v0.8b, v6.b[0]
|
|
|
|
++ dup v2.8b, v7.b[2]
|
|
|
|
++ dup v1.8b, v7.b[3]
|
|
|
|
++ dup v3.8b, v6.b[1]
|
|
|
|
++ zip1 v0.2s, v0.2s, v2.2s
|
|
|
|
++ zip1 v1.2s, v1.2s, v3.2s
|
|
|
|
++.L_pred8x8_dc_end:
|
|
|
|
++ mov w3, #4
|
|
|
|
++ add x2, x0, x1, lsl #2
|
|
|
|
++6: st1 {v0.8b}, [x0], x1
|
|
|
|
++ st1 {v1.8b}, [x2], x1
|
|
|
|
++ subs w3, w3, #1
|
|
|
|
++ b.ne 6b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_l0t_dc_neon, export=1
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ sub x3, x0, #1
|
|
|
|
++ ld1 {v0.8b}, [x2]
|
|
|
|
++ ldcol.8 v1, x3, x1, 4
|
|
|
|
++ zip1 v0.4s, v0.4s, v1.4s
|
|
|
|
++ uaddlp v0.8h, v0.16b
|
|
|
|
++ addp v0.8h, v0.8h, v0.8h
|
|
|
|
++ addp v1.4h, v0.4h, v0.4h
|
|
|
|
++ rshrn v2.8b, v0.8h, #2
|
|
|
|
++ rshrn v3.8b, v1.8h, #3
|
|
|
|
++ dup v4.8b, v3.b[0]
|
|
|
|
++ dup v6.8b, v2.b[2]
|
|
|
|
++ dup v5.8b, v2.b[0]
|
|
|
|
++ zip1 v0.2s, v4.2s, v6.2s
|
|
|
|
++ zip1 v1.2s, v5.2s, v6.2s
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_l00_dc_neon, export=1
|
|
|
|
++ sub x2, x0, #1
|
|
|
|
++ ldcol.8 v0, x2, x1, 4
|
|
|
|
++ uaddlp v0.4h, v0.8b
|
|
|
|
++ addp v0.4h, v0.4h, v0.4h
|
|
|
|
++ rshrn v0.8b, v0.8h, #2
|
|
|
|
++ movi v1.8b, #128
|
|
|
|
++ dup v0.8b, v0.b[0]
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_0lt_dc_neon, export=1
|
|
|
|
++ add x3, x0, x1, lsl #2
|
|
|
|
++ sub x2, x0, x1
|
|
|
|
++ sub x3, x3, #1
|
|
|
|
++ ld1 {v0.8b}, [x2]
|
|
|
|
++ ldcol.8 v1, x3, x1, 4, hi=1
|
|
|
|
++ zip1 v0.4s, v0.4s, v1.4s
|
|
|
|
++ uaddlp v0.8h, v0.16b
|
|
|
|
++ addp v0.8h, v0.8h, v0.8h
|
|
|
|
++ addp v1.4h, v0.4h, v0.4h
|
|
|
|
++ rshrn v2.8b, v0.8h, #2
|
|
|
|
++ rshrn v3.8b, v1.8h, #3
|
|
|
|
++ dup v4.8b, v2.b[0]
|
|
|
|
++ dup v5.8b, v2.b[3]
|
|
|
|
++ dup v6.8b, v2.b[2]
|
|
|
|
++ dup v7.8b, v3.b[1]
|
|
|
|
++ zip1 v0.2s, v4.2s, v6.2s
|
|
|
|
++ zip1 v1.2s, v5.2s, v7.2s
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_pred8x8_0l0_dc_neon, export=1
|
|
|
|
++ add x2, x0, x1, lsl #2
|
|
|
|
++ sub x2, x2, #1
|
|
|
|
++ ldcol.8 v1, x2, x1, 4
|
|
|
|
++ uaddlp v2.4h, v1.8b
|
|
|
|
++ addp v2.4h, v2.4h, v2.4h
|
|
|
|
++ rshrn v1.8b, v2.8h, #2
|
|
|
|
++ movi v0.8b, #128
|
|
|
|
++ dup v1.8b, v1.b[0]
|
|
|
|
++ b .L_pred8x8_dc_end
|
|
|
|
++endfunc
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,123 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON optimised DSP functions
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stddef.h>
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/hpeldsp.h"
|
|
|
|
++
|
|
|
|
++void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++
|
|
|
|
++void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++
|
|
|
|
++void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++
|
|
|
|
++void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
|
|
|
|
++ ptrdiff_t line_size, int h);
|
|
|
|
++
|
|
|
|
++av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags) /* Installs the NEON half-pel routines into the tables of c; flags is not read in this body. */
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) { /* no else branch: without NEON the tables in c are left as they were */
|
|
|
|
++ c->put_pixels_tab[0][0] = ff_put_pixels16_neon; /* row [0] gets the pixels16 routines, row [1] the pixels8 ones; columns are plain, x2, y2, xy2 */
|
|
|
|
++ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
|
|
|
|
++ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
|
|
|
|
++ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
|
|
|
|
++ c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
|
|
|
|
++ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
|
|
|
|
++ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
|
|
|
|
++ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
|
|
|
|
++
|
|
|
|
++ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; /* column 0 reuses the same routine as put_pixels_tab[0][0] */
|
|
|
|
++ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
|
|
|
|
++ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
|
|
|
|
++ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
|
|
|
|
++ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; /* likewise shared with put_pixels_tab[1][0] */
|
|
|
|
++ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
|
|
|
|
++ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
|
|
|
|
++ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
|
|
|
|
++
|
|
|
|
++ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
|
|
|
|
++ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
|
|
|
|
++ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
|
|
|
|
++ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
|
|
|
|
++ c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
|
|
|
|
++ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
|
|
|
|
++ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
|
|
|
|
++ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
|
|
|
|
++
|
|
|
|
++ c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon; /* single-index table: only pixels16 routines are installed here */
|
|
|
|
++ c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
|
|
|
|
++ c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
|
|
|
|
++ c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
|
|
|
|
+@@ -0,0 +1,397 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON optimised DSP functions
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++.macro pixels16 rnd=1, avg=0 // Copies 16-byte rows from [x1] to [x0], stepping both by x2, four rows per pass, w3 rows in total; rnd is not referenced in this body
|
|
|
|
++ .if \avg
|
|
|
|
++ mov x12, x0 // x12 = read cursor over the destination, so x0 can remain the write cursor
|
|
|
|
++ .endif
|
|
|
|
++1: ld1 {v0.16B}, [x1], x2
|
|
|
|
++ ld1 {v1.16B}, [x1], x2
|
|
|
|
++ ld1 {v2.16B}, [x1], x2
|
|
|
|
++ ld1 {v3.16B}, [x1], x2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v4.16B}, [x12], x2 // avg=1: merge each source row with the row already at the destination
|
|
|
|
++ urhadd v0.16B, v0.16B, v4.16B
|
|
|
|
++ ld1 {v5.16B}, [x12], x2
|
|
|
|
++ urhadd v1.16B, v1.16B, v5.16B
|
|
|
|
++ ld1 {v6.16B}, [x12], x2
|
|
|
|
++ urhadd v2.16B, v2.16B, v6.16B
|
|
|
|
++ ld1 {v7.16B}, [x12], x2
|
|
|
|
++ urhadd v3.16B, v3.16B, v7.16B
|
|
|
|
++ .endif
|
|
|
|
++ subs w3, w3, #4 // four rows consumed; the flags set here drive the b.ne below
|
|
|
|
++ st1 {v0.16B}, [x0], x2
|
|
|
|
++ st1 {v1.16B}, [x0], x2
|
|
|
|
++ st1 {v2.16B}, [x0], x2
|
|
|
|
++ st1 {v3.16B}, [x0], x2
|
|
|
|
++ b.ne 1b // loops until w3 is exactly zero; presumably w3 is always a multiple of 4 on entry
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels16_x2 rnd=1, avg=0 // Horizontal half-pel, 16 wide: each output byte averages a source byte with its right neighbour; two rows per pass
|
|
|
|
++1: ld1 {v0.16B, v1.16B}, [x1], x2 // 32 bytes loaded so that the byte just past the 16-byte row is available to ext
|
|
|
|
++ ld1 {v2.16B, v3.16B}, [x1], x2
|
|
|
|
++ subs w3, w3, #2 // flags consumed by the b.ne at the bottom; nothing in between sets flags
|
|
|
|
++ ext v1.16B, v0.16B, v1.16B, #1 // v1 = the same row shifted by one byte
|
|
|
|
++ avg v0.16B, v0.16B, v1.16B // avg is bound by pixfunc to urhadd (rnd=1) or uhadd (rnd=0)
|
|
|
|
++ ext v3.16B, v2.16B, v3.16B, #1
|
|
|
|
++ avg v2.16B, v2.16B, v3.16B
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v1.16B}, [x0], x2 // avg=1: fetch the two destination rows about to be overwritten
|
|
|
|
++ ld1 {v3.16B}, [x0]
|
|
|
|
++ urhadd v0.16B, v0.16B, v1.16B // the merge with the destination always uses urhadd, whatever rnd is
|
|
|
|
++ urhadd v2.16B, v2.16B, v3.16B
|
|
|
|
++ sub x0, x0, x2 // rewind x0 to the first of the two rows before storing
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v0.16B}, [x0], x2
|
|
|
|
++ st1 {v2.16B}, [x0], x2
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels16_y2 rnd=1, avg=0 // Vertical half-pel, 16 wide: each output row averages two consecutive source rows; reads one more source row than it writes
|
|
|
|
++ sub w3, w3, #2 // the last two output rows are produced by the tail after the loop
|
|
|
|
++ ld1 {v0.16B}, [x1], x2
|
|
|
|
++ ld1 {v1.16B}, [x1], x2
|
|
|
|
++1: subs w3, w3, #2
|
|
|
|
++ avg v2.16B, v0.16B, v1.16B // avg is bound by pixfunc to urhadd (rnd=1) or uhadd (rnd=0)
|
|
|
|
++ ld1 {v0.16B}, [x1], x2 // v0 and v1 alternate as the newest row
|
|
|
|
++ avg v3.16B, v0.16B, v1.16B
|
|
|
|
++ ld1 {v1.16B}, [x1], x2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v4.16B}, [x0], x2
|
|
|
|
++ ld1 {v5.16B}, [x0]
|
|
|
|
++ urhadd v2.16B, v2.16B, v4.16B // the merge with the destination always uses urhadd, whatever rnd is
|
|
|
|
++ urhadd v3.16B, v3.16B, v5.16B
|
|
|
|
++ sub x0, x0, x2 // rewind x0 to the first of the two rows before storing
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v2.16B}, [x0], x2
|
|
|
|
++ st1 {v3.16B}, [x0], x2
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ avg v2.16B, v0.16B, v1.16B // tail: same work as one loop pass, minus the second look-ahead load
|
|
|
|
++ ld1 {v0.16B}, [x1], x2
|
|
|
|
++ avg v3.16B, v0.16B, v1.16B
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v4.16B}, [x0], x2
|
|
|
|
++ ld1 {v5.16B}, [x0]
|
|
|
|
++ urhadd v2.16B, v2.16B, v4.16B
|
|
|
|
++ urhadd v3.16B, v3.16B, v5.16B
|
|
|
|
++ sub x0, x0, x2
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v2.16B}, [x0], x2
|
|
|
|
++ st1 {v3.16B}, [x0], x2
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels16_xy2 rnd=1, avg=0 // 2-D half-pel, 16 wide: each output byte is the sum of a 2x2 source neighbourhood shifted right by 2 (rounding shift for rnd=1; add 1 then truncating shift for rnd=0)
|
|
|
|
++ sub w3, w3, #2 // the last two output rows are produced by the tail after the loop
|
|
|
|
++ ld1 {v0.16B, v1.16B}, [x1], x2
|
|
|
|
++ ld1 {v4.16B, v5.16B}, [x1], x2
|
|
|
|
++NRND movi v26.8H, #1 // NRND lines are assembled only when rnd=0: v26 is the bias added before shrn
|
|
|
|
++ ext v1.16B, v0.16B, v1.16B, #1 // row shifted by one byte, i.e. the right-hand neighbours
|
|
|
|
++ ext v5.16B, v4.16B, v5.16B, #1
|
|
|
|
++ uaddl v16.8H, v0.8B, v1.8B // widened horizontal pair sums: v16/v20 = low/high half of one row
|
|
|
|
++ uaddl2 v20.8H, v0.16B, v1.16B
|
|
|
|
++ uaddl v18.8H, v4.8B, v5.8B // v18/v22 = low/high half of the following row
|
|
|
|
++ uaddl2 v22.8H, v4.16B, v5.16B
|
|
|
|
++1: subs w3, w3, #2 // flags consumed by the b.gt at the bottom; nothing in between sets flags
|
|
|
|
++ ld1 {v0.16B, v1.16B}, [x1], x2
|
|
|
|
++ add v24.8H, v16.8H, v18.8H // vertical add of two row sums = 2x2 neighbourhood sum
|
|
|
|
++NRND add v24.8H, v24.8H, v26.8H
|
|
|
|
++ ext v30.16B, v0.16B, v1.16B, #1
|
|
|
|
++ add v1.8H, v20.8H, v22.8H
|
|
|
|
++ mshrn v28.8B, v24.8H, #2 // mshrn/mshrn2 are bound by pixfunc to rshrn/rshrn2 (rnd=1) or shrn/shrn2 (rnd=0)
|
|
|
|
++NRND add v1.8H, v1.8H, v26.8H
|
|
|
|
++ mshrn2 v28.16B, v1.8H, #2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v16.16B}, [x0] // v16 is free here: its row sum was already folded into v24
|
|
|
|
++ urhadd v28.16B, v28.16B, v16.16B
|
|
|
|
++ .endif
|
|
|
|
++ uaddl v16.8H, v0.8B, v30.8B
|
|
|
|
++ ld1 {v2.16B, v3.16B}, [x1], x2
|
|
|
|
++ uaddl2 v20.8H, v0.16B, v30.16B
|
|
|
|
++ st1 {v28.16B}, [x0], x2
|
|
|
|
++ add v24.8H, v16.8H, v18.8H
|
|
|
|
++NRND add v24.8H, v24.8H, v26.8H
|
|
|
|
++ ext v3.16B, v2.16B, v3.16B, #1
|
|
|
|
++ add v0.8H, v20.8H, v22.8H
|
|
|
|
++ mshrn v30.8B, v24.8H, #2
|
|
|
|
++NRND add v0.8H, v0.8H, v26.8H
|
|
|
|
++ mshrn2 v30.16B, v0.8H, #2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v18.16B}, [x0] // v18 is free here for the same reason; it is recomputed just below
|
|
|
|
++ urhadd v30.16B, v30.16B, v18.16B
|
|
|
|
++ .endif
|
|
|
|
++ uaddl v18.8H, v2.8B, v3.8B
|
|
|
|
++ uaddl2 v22.8H, v2.16B, v3.16B
|
|
|
|
++ st1 {v30.16B}, [x0], x2
|
|
|
|
++ b.gt 1b
|
|
|
|
++
|
|
|
|
++ ld1 {v0.16B, v1.16B}, [x1], x2 // tail: one more source row is read to finish the last two output rows
|
|
|
|
++ add v24.8H, v16.8H, v18.8H
|
|
|
|
++NRND add v24.8H, v24.8H, v26.8H
|
|
|
|
++ ext v30.16B, v0.16B, v1.16B, #1
|
|
|
|
++ add v1.8H, v20.8H, v22.8H
|
|
|
|
++ mshrn v28.8B, v24.8H, #2
|
|
|
|
++NRND add v1.8H, v1.8H, v26.8H
|
|
|
|
++ mshrn2 v28.16B, v1.8H, #2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v16.16B}, [x0]
|
|
|
|
++ urhadd v28.16B, v28.16B, v16.16B
|
|
|
|
++ .endif
|
|
|
|
++ uaddl v16.8H, v0.8B, v30.8B
|
|
|
|
++ uaddl2 v20.8H, v0.16B, v30.16B
|
|
|
|
++ st1 {v28.16B}, [x0], x2
|
|
|
|
++ add v24.8H, v16.8H, v18.8H
|
|
|
|
++NRND add v24.8H, v24.8H, v26.8H
|
|
|
|
++ add v0.8H, v20.8H, v22.8H
|
|
|
|
++ mshrn v30.8B, v24.8H, #2
|
|
|
|
++NRND add v0.8H, v0.8H, v26.8H
|
|
|
|
++ mshrn2 v30.16B, v0.8H, #2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v18.16B}, [x0]
|
|
|
|
++ urhadd v30.16B, v30.16B, v18.16B
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v30.16B}, [x0], x2
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels8 rnd=1, avg=0 // Copies 8-byte rows from [x1] to [x0], stepping both by x2, four rows per pass, w3 rows in total; rnd is not referenced in this body
|
|
|
|
++1: ld1 {v0.8B}, [x1], x2
|
|
|
|
++ ld1 {v1.8B}, [x1], x2
|
|
|
|
++ ld1 {v2.8B}, [x1], x2
|
|
|
|
++ ld1 {v3.8B}, [x1], x2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v4.8B}, [x0], x2 // avg=1: merge each source row with the row already at the destination
|
|
|
|
++ urhadd v0.8B, v0.8B, v4.8B
|
|
|
|
++ ld1 {v5.8B}, [x0], x2
|
|
|
|
++ urhadd v1.8B, v1.8B, v5.8B
|
|
|
|
++ ld1 {v6.8B}, [x0], x2
|
|
|
|
++ urhadd v2.8B, v2.8B, v6.8B
|
|
|
|
++ ld1 {v7.8B}, [x0], x2
|
|
|
|
++ urhadd v3.8B, v3.8B, v7.8B
|
|
|
|
++ sub x0, x0, x2, lsl #2 // the reads advanced x0 by four rows; rewind by 4 * x2 before storing
|
|
|
|
++ .endif
|
|
|
|
++ subs w3, w3, #4 // four rows consumed; the flags set here drive the b.ne below
|
|
|
|
++ st1 {v0.8B}, [x0], x2
|
|
|
|
++ st1 {v1.8B}, [x0], x2
|
|
|
|
++ st1 {v2.8B}, [x0], x2
|
|
|
|
++ st1 {v3.8B}, [x0], x2
|
|
|
|
++ b.ne 1b // loops until w3 is exactly zero; presumably w3 is always a multiple of 4 on entry
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels8_x2 rnd=1, avg=0 // Horizontal half-pel, 8 wide: each output byte averages a source byte with its right neighbour; two rows per pass
|
|
|
|
++1: ld1 {v0.8B, v1.8B}, [x1], x2 // 16 bytes loaded so that the byte just past the 8-byte row is available to ext
|
|
|
|
++ ext v1.8B, v0.8B, v1.8B, #1 // v1 = the same row shifted by one byte
|
|
|
|
++ ld1 {v2.8B, v3.8B}, [x1], x2
|
|
|
|
++ ext v3.8B, v2.8B, v3.8B, #1
|
|
|
|
++ subs w3, w3, #2 // flags consumed by the b.ne at the bottom; nothing in between sets flags
|
|
|
|
++ avg v0.8B, v0.8B, v1.8B // avg is bound by pixfunc to urhadd (rnd=1) or uhadd (rnd=0)
|
|
|
|
++ avg v2.8B, v2.8B, v3.8B
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v4.8B}, [x0], x2 // avg=1: fetch the two destination rows about to be overwritten
|
|
|
|
++ ld1 {v5.8B}, [x0]
|
|
|
|
++ urhadd v0.8B, v0.8B, v4.8B // the merge with the destination always uses urhadd, whatever rnd is
|
|
|
|
++ urhadd v2.8B, v2.8B, v5.8B
|
|
|
|
++ sub x0, x0, x2 // rewind x0 to the first of the two rows before storing
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v0.8B}, [x0], x2
|
|
|
|
++ st1 {v2.8B}, [x0], x2
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels8_y2 rnd=1, avg=0 // Vertical half-pel, 8 wide: each output row averages two consecutive source rows; reads one more source row than it writes
|
|
|
|
++ sub w3, w3, #2 // the last two output rows are produced by the tail after the loop
|
|
|
|
++ ld1 {v0.8B}, [x1], x2
|
|
|
|
++ ld1 {v1.8B}, [x1], x2
|
|
|
|
++1: subs w3, w3, #2
|
|
|
|
++ avg v4.8B, v0.8B, v1.8B // avg is bound by pixfunc to urhadd (rnd=1) or uhadd (rnd=0)
|
|
|
|
++ ld1 {v0.8B}, [x1], x2 // v0 and v1 alternate as the newest row
|
|
|
|
++ avg v5.8B, v0.8B, v1.8B
|
|
|
|
++ ld1 {v1.8B}, [x1], x2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v2.8B}, [x0], x2
|
|
|
|
++ ld1 {v3.8B}, [x0]
|
|
|
|
++ urhadd v4.8B, v4.8B, v2.8B // the merge with the destination always uses urhadd, whatever rnd is
|
|
|
|
++ urhadd v5.8B, v5.8B, v3.8B
|
|
|
|
++ sub x0, x0, x2 // rewind x0 to the first of the two rows before storing
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v4.8B}, [x0], x2
|
|
|
|
++ st1 {v5.8B}, [x0], x2
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ avg v4.8B, v0.8B, v1.8B // tail: same work as one loop pass, minus the second look-ahead load
|
|
|
|
++ ld1 {v0.8B}, [x1], x2
|
|
|
|
++ avg v5.8B, v0.8B, v1.8B
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v2.8B}, [x0], x2
|
|
|
|
++ ld1 {v3.8B}, [x0]
|
|
|
|
++ urhadd v4.8B, v4.8B, v2.8B
|
|
|
|
++ urhadd v5.8B, v5.8B, v3.8B
|
|
|
|
++ sub x0, x0, x2
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v4.8B}, [x0], x2
|
|
|
|
++ st1 {v5.8B}, [x0], x2
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixels8_xy2 rnd=1, avg=0 // 2-D half-pel, 8 wide: each output byte is the sum of a 2x2 source neighbourhood shifted right by 2 (rounding shift for rnd=1; add 1 then truncating shift for rnd=0)
|
|
|
|
++ sub w3, w3, #2 // the last two output rows are produced by the tail after the loop
|
|
|
|
++ ld1 {v0.16B}, [x1], x2 // 16 bytes loaded although only 9 are needed for an 8-wide row plus its right neighbour
|
|
|
|
++ ld1 {v1.16B}, [x1], x2
|
|
|
|
++NRND movi v19.8H, #1 // NRND lines are assembled only when rnd=0: v19 is the bias added before shrn
|
|
|
|
++ ext v4.16B, v0.16B, v4.16B, #1 // top byte comes from whatever v4 held, but only the low 8 bytes are consumed by uaddl below
|
|
|
|
++ ext v6.16B, v1.16B, v6.16B, #1
|
|
|
|
++ uaddl v16.8H, v0.8B, v4.8B // widened horizontal pair sums: v16 and v17 hold two consecutive rows
|
|
|
|
++ uaddl v17.8H, v1.8B, v6.8B
|
|
|
|
++1: subs w3, w3, #2 // flags consumed by the b.gt at the bottom; nothing in between sets flags
|
|
|
|
++ ld1 {v0.16B}, [x1], x2
|
|
|
|
++ add v18.8H, v16.8H, v17.8H // vertical add of two row sums = 2x2 neighbourhood sum
|
|
|
|
++ ext v4.16B, v0.16B, v4.16B, #1
|
|
|
|
++NRND add v18.8H, v18.8H, v19.8H
|
|
|
|
++ uaddl v16.8H, v0.8B, v4.8B
|
|
|
|
++ mshrn v5.8B, v18.8H, #2 // mshrn is bound by pixfunc to rshrn (rnd=1) or shrn (rnd=0)
|
|
|
|
++ ld1 {v1.16B}, [x1], x2
|
|
|
|
++ add v18.8H, v16.8H, v17.8H
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v7.8B}, [x0]
|
|
|
|
++ urhadd v5.8B, v5.8B, v7.8B // the merge with the destination always uses urhadd, whatever rnd is
|
|
|
|
++ .endif
|
|
|
|
++NRND add v18.8H, v18.8H, v19.8H
|
|
|
|
++ st1 {v5.8B}, [x0], x2
|
|
|
|
++ mshrn v7.8B, v18.8H, #2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v5.8B}, [x0] // v5 is free again: its row was stored just above
|
|
|
|
++ urhadd v7.8B, v7.8B, v5.8B
|
|
|
|
++ .endif
|
|
|
|
++ ext v6.16B, v1.16B, v6.16B, #1
|
|
|
|
++ uaddl v17.8H, v1.8B, v6.8B
|
|
|
|
++ st1 {v7.8B}, [x0], x2
|
|
|
|
++ b.gt 1b
|
|
|
|
++
|
|
|
|
++ ld1 {v0.16B}, [x1], x2 // tail: one more source row is read to finish the last two output rows
|
|
|
|
++ add v18.8H, v16.8H, v17.8H
|
|
|
|
++ ext v4.16B, v0.16B, v4.16B, #1
|
|
|
|
++NRND add v18.8H, v18.8H, v19.8H
|
|
|
|
++ uaddl v16.8H, v0.8B, v4.8B
|
|
|
|
++ mshrn v5.8B, v18.8H, #2
|
|
|
|
++ add v18.8H, v16.8H, v17.8H
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v7.8B}, [x0]
|
|
|
|
++ urhadd v5.8B, v5.8B, v7.8B
|
|
|
|
++ .endif
|
|
|
|
++NRND add v18.8H, v18.8H, v19.8H
|
|
|
|
++ st1 {v5.8B}, [x0], x2
|
|
|
|
++ mshrn v7.8B, v18.8H, #2
|
|
|
|
++ .if \avg
|
|
|
|
++ ld1 {v5.8B}, [x0]
|
|
|
|
++ urhadd v7.8B, v7.8B, v5.8B
|
|
|
|
++ .endif
|
|
|
|
++ st1 {v7.8B}, [x0], x2
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixfunc pfx, name, suf, rnd=1, avg=0 // Emits one exported function named ff_ + pfx + name + suf + _neon whose body is the macro called name; helper macros are bound to rounding or non-rounding forms first
|
|
|
|
++ .if \rnd
|
|
|
|
++ .macro avg rd, rn, rm // rounding variant: halving add with rounding
|
|
|
|
++ urhadd \rd, \rn, \rm
|
|
|
|
++ .endm
|
|
|
|
++ .macro mshrn rd, rn, rm // rounding variant: rounding narrowing shift
|
|
|
|
++ rshrn \rd, \rn, \rm
|
|
|
|
++ .endm
|
|
|
|
++ .macro mshrn2 rd, rn, rm
|
|
|
|
++ rshrn2 \rd, \rn, \rm
|
|
|
|
++ .endm
|
|
|
|
++ .macro NRND insn:vararg // rounding variant: NRND-prefixed lines expand to nothing
|
|
|
|
++ .endm
|
|
|
|
++ .else
|
|
|
|
++ .macro avg rd, rn, rm // no-rounding variant: truncating halving add
|
|
|
|
++ uhadd \rd, \rn, \rm
|
|
|
|
++ .endm
|
|
|
|
++ .macro mshrn rd, rn, rm // no-rounding variant: truncating narrowing shift
|
|
|
|
++ shrn \rd, \rn, \rm
|
|
|
|
++ .endm
|
|
|
|
++ .macro mshrn2 rd, rn, rm
|
|
|
|
++ shrn2 \rd, \rn, \rm
|
|
|
|
++ .endm
|
|
|
|
++ .macro NRND insn:vararg // no-rounding variant: NRND-prefixed lines are emitted as written
|
|
|
|
++ \insn
|
|
|
|
++ .endm
|
|
|
|
++ .endif
|
|
|
|
++function ff_\pfx\name\suf\()_neon, export=1
|
|
|
|
++ \name \rnd, \avg // expand the body macro, passing rnd and avg positionally
|
|
|
|
++endfunc
|
|
|
|
++ .purgem avg // drop the helpers so the next pixfunc expansion can define them again
|
|
|
|
++ .purgem mshrn
|
|
|
|
++ .purgem mshrn2
|
|
|
|
++ .purgem NRND
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro pixfunc2 pfx, name, avg=0 // Emits two functions from one body macro: the rounding one and its _no_rnd counterpart
|
|
|
|
++ pixfunc \pfx, \name, rnd=1, avg=\avg // suf is left empty here, so the name carries no suffix
|
|
|
|
++ pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function ff_put_h264_qpel16_mc00_neon, export=1 // stub with no ret: presumably falls through into ff_put_pixels16_neon emitted right below (depends on what endfunc expands to)
|
|
|
|
++ mov w3, #16 // preset the row count that the pixels16 body reads from w3
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++ pixfunc put_, pixels16, avg=0 // plain copy: a single variant, no _no_rnd twin
|
|
|
|
++ pixfunc2 put_, pixels16_x2, avg=0
|
|
|
|
++ pixfunc2 put_, pixels16_y2, avg=0
|
|
|
|
++ pixfunc2 put_, pixels16_xy2, avg=0
|
|
|
|
++
|
|
|
|
++function ff_avg_h264_qpel16_mc00_neon, export=1 // same stub pattern, in front of ff_avg_pixels16_neon
|
|
|
|
++ mov w3, #16
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++ pixfunc avg_, pixels16, avg=1
|
|
|
|
++ pixfunc2 avg_, pixels16_x2, avg=1
|
|
|
|
++ pixfunc2 avg_, pixels16_y2, avg=1
|
|
|
|
++ pixfunc2 avg_, pixels16_xy2, avg=1
|
|
|
|
++
|
|
|
|
++function ff_put_h264_qpel8_mc00_neon, export=1 // same stub pattern, in front of ff_put_pixels8_neon
|
|
|
|
++ mov w3, #8
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++ pixfunc put_, pixels8, avg=0
|
|
|
|
++ pixfunc2 put_, pixels8_x2, avg=0
|
|
|
|
++ pixfunc2 put_, pixels8_y2, avg=0
|
|
|
|
++ pixfunc2 put_, pixels8_xy2, avg=0
|
|
|
|
++
|
|
|
|
++function ff_avg_h264_qpel8_mc00_neon, export=1 // same stub pattern, in front of ff_avg_pixels8_neon
|
|
|
|
++ mov w3, #8
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++ pixfunc avg_, pixels8, avg=1
|
|
|
|
++ pixfunc avg_, pixels8_x2, avg=1 // pixfunc, not pixfunc2: no _no_rnd variants are emitted for the 8-wide avg functions
|
|
|
|
++ pixfunc avg_, pixels8_y2, avg=1
|
|
|
|
++ pixfunc avg_, pixels8_xy2, avg=1
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/idct.h b/media/ffvpx/libavcodec/aarch64/idct.h
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/idct.h
|
|
|
|
+@@ -0,0 +1,28 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#ifndef AVCODEC_AARCH64_IDCT_H
|
|
|
|
++#define AVCODEC_AARCH64_IDCT_H
|
|
|
|
++
|
|
|
|
++#include <stdint.h> /* NOTE(review): ptrdiff_t is used below but <stddef.h> is not included; this header relies on an earlier include to provide it */
|
|
|
|
++
|
|
|
|
++void ff_simple_idct_neon(int16_t *data); /* prototypes only: the three routines are defined outside this header */
|
|
|
|
++void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
|
|
|
++void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
|
|
|
|
++
|
|
|
|
++#endif /* AVCODEC_AARCH64_IDCT_H */
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,41 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM-NEON-optimized IDCT functions
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavcodec/avcodec.h"
|
|
|
|
++#include "libavcodec/idctdsp.h"
|
|
|
|
++#include "idct.h"
|
|
|
|
++
|
|
|
|
++av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
|
|
|
|
++ unsigned high_bit_depth) /* Points the IDCT hooks of c at the NEON simple-IDCT routines when the conditions below hold; otherwise c is left untouched. */
|
|
|
|
++{
|
|
|
|
++ if (!avctx->lowres && !high_bit_depth) { /* NOTE(review): no have_neon() check is made here, unlike ff_hpeldsp_init_aarch64 - confirm that is intended */
|
|
|
|
++ if (avctx->idct_algo == FF_IDCT_AUTO || /* only when the caller left the choice open or asked for this implementation */
|
|
|
|
++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
|
|
|
++ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
|
|
|
|
++ c->idct_put = ff_simple_idct_put_neon;
|
|
|
|
++ c->idct_add = ff_simple_idct_add_neon;
|
|
|
|
++ c->idct = ff_simple_idct_neon;
|
|
|
|
++ c->perm_type = FF_IDCT_PERM_PARTTRANS; /* presumably the coefficient order the NEON routines expect - TODO confirm against the asm */
|
|
|
|
++ }
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/mdct_neon.S b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
|
|
|
|
+@@ -0,0 +1,323 @@
|
|
|
|
++/*
|
|
|
|
++ * AArch64 NEON optimised MDCT
|
|
|
|
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++function ff_imdct_half_neon, export=1 // x0 = context (presumably FFTContext; fields read at fixed offsets), x1 = output, x2 = input: pre-rotation, ff_fft_calc_neon, then post-rotation over the output
|
|
|
|
++ sub sp, sp, #32 // 32-byte frame: x19/x20 at [sp], x30 at [sp, #16]
|
|
|
|
++ stp x19, x20, [sp]
|
|
|
|
++ str x30, [sp, #16]
|
|
|
|
++ mov x12, #1
|
|
|
|
++ ldr w14, [x0, #28] // mdct_bits
|
|
|
|
++ ldr x4, [x0, #32] // tcos
|
|
|
|
++ ldr x3, [x0, #8] // revtab
|
|
|
|
++ lsl x12, x12, x14 // n = 1 << nbits
|
|
|
|
++ lsr x14, x12, #2 // n4 = n >> 2
|
|
|
|
++ add x7, x2, x12, lsl #1 // x7 = input + 2*n bytes ...
|
|
|
|
++ mov x12, #-16
|
|
|
|
++ sub x7, x7, #16 // ... minus 16: x7 walks the input backwards while x2 walks it forwards
|
|
|
|
++
|
|
|
|
++ ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
|
|
|
|
++ ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x
|
|
|
|
++ rev64 v17.2s, v17.2s
|
|
|
|
++ ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s1
|
|
|
|
++ fmul v6.2s, v17.2s, v2.2s
|
|
|
|
++ fmul v7.2s, v0.2s, v2.2s
|
|
|
|
++1:
|
|
|
|
++ subs x14, x14, #2 // two outputs per pass; these flags are tested by the b.eq below
|
|
|
|
++ ldr w6, [x3], #4 // one 32-bit revtab word carries two 16-bit output indices
|
|
|
|
++ fmul v4.2s, v0.2s, v3.2s
|
|
|
|
++ fmul v5.2s, v17.2s, v3.2s
|
|
|
|
++ fsub v4.2s, v6.2s, v4.2s
|
|
|
|
++ fadd v5.2s, v5.2s, v7.2s
|
|
|
|
++ ubfm x8, x6, #16, #31 // x8 = upper 16 bits of the revtab word
|
|
|
|
++ ubfm x6, x6, #0, #15 // x6 = lower 16 bits
|
|
|
|
++ add x8, x1, x8, lsl #3 // index * 8 bytes: each output slot is a pair of floats
|
|
|
|
++ add x6, x1, x6, lsl #3
|
|
|
|
++ b.eq 2f // flags still come from the subs at the loop top; nothing in between sets them
|
|
|
|
++ ld2 {v16.2s,v17.2s}, [x7], x12
|
|
|
|
++ ld2 {v0.2s,v1.2s}, [x2], #16
|
|
|
|
++ rev64 v17.2s, v17.2s
|
|
|
|
++ ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s1
|
|
|
|
++ fmul v6.2s, v17.2s, v2.2s
|
|
|
|
++ fmul v7.2s, v0.2s, v2.2s
|
|
|
|
++ st2 {v4.s,v5.s}[0], [x6]
|
|
|
|
++ st2 {v4.s,v5.s}[1], [x8]
|
|
|
|
++ b 1b
|
|
|
|
++2:
|
|
|
|
++ st2 {v4.s,v5.s}[0], [x6]
|
|
|
|
++ st2 {v4.s,v5.s}[1], [x8]
|
|
|
|
++
|
|
|
|
++ mov x19, x0 // keep context and output pointer in callee-saved registers across the call
|
|
|
|
++ mov x20, x1
|
|
|
|
++ bl X(ff_fft_calc_neon) // called with x0 and x1 unchanged since entry
|
|
|
|
++
|
|
|
|
++ mov x12, #1
|
|
|
|
++ ldr w14, [x19, #28] // mdct_bits
|
|
|
|
++ ldr x4, [x19, #32] // tcos
|
|
|
|
++ lsl x12, x12, x14 // n = 1 << nbits
|
|
|
|
++ lsr x14, x12, #3 // n8 = n >> 3
|
|
|
|
++
|
|
|
|
++ add x4, x4, x14, lsl #3 // x4 and x6 start n8 * 8 bytes into tcos and the output and move upwards
|
|
|
|
++ add x6, x20, x14, lsl #3
|
|
|
|
++ sub x1, x4, #16 // x1 and x3 start 16 bytes below them and move downwards
|
|
|
|
++ sub x3, x6, #16
|
|
|
|
++
|
|
|
|
++ mov x7, #-16
|
|
|
|
++ mov x8, x6 // x8 and x0 are the store cursors matching the x6 and x3 load cursors
|
|
|
|
++ mov x0, x3
|
|
|
|
++
|
|
|
|
++ ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0
|
|
|
|
++ ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
|
|
|
|
++ ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d17=s1,s0
|
|
|
|
++3:
|
|
|
|
++ subs x14, x14, #2 // these flags are tested by the b.eq 4f below
|
|
|
|
++ fmul v7.2s, v0.2s, v17.2s
|
|
|
|
++ ld2 {v18.2s,v19.2s},[x4], #16 // d18=c2,c3 d19=s2,s3
|
|
|
|
++ fmul v4.2s, v1.2s, v17.2s
|
|
|
|
++ fmul v6.2s, v21.2s, v19.2s
|
|
|
|
++ fmul v5.2s, v20.2s, v19.2s
|
|
|
|
++ fmul v22.2s, v1.2s, v16.2s
|
|
|
|
++ fmul v23.2s, v21.2s, v18.2s
|
|
|
|
++ fmul v24.2s, v0.2s, v16.2s
|
|
|
|
++ fmul v25.2s, v20.2s, v18.2s
|
|
|
|
++ fadd v7.2s, v7.2s, v22.2s
|
|
|
|
++ fadd v5.2s, v5.2s, v23.2s
|
|
|
|
++ fsub v4.2s, v4.2s, v24.2s
|
|
|
|
++ fsub v6.2s, v6.2s, v25.2s
|
|
|
|
++ b.eq 4f
|
|
|
|
++ ld2 {v0.2s,v1.2s}, [x3], x7
|
|
|
|
++ ld2 {v20.2s,v21.2s},[x6], #16
|
|
|
|
++ ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d17=s1,s0
|
|
|
|
++ rev64 v5.2s, v5.2s
|
|
|
|
++ rev64 v7.2s, v7.2s
|
|
|
|
++ st2 {v4.2s,v5.2s}, [x0], x7
|
|
|
|
++ st2 {v6.2s,v7.2s}, [x8], #16
|
|
|
|
++ b 3b
|
|
|
|
++4:
|
|
|
|
++ rev64 v5.2s, v5.2s
|
|
|
|
++ rev64 v7.2s, v7.2s
|
|
|
|
++ st2 {v4.2s,v5.2s}, [x0]
|
|
|
|
++ st2 {v6.2s,v7.2s}, [x8]
|
|
|
|
++
|
|
|
|
++ ldp x19, x20, [sp]
|
|
|
|
++ ldr x30, [sp, #16]
|
|
|
|
++ add sp, sp, #32
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_imdct_calc_neon, export=1 // Runs ff_imdct_half_neon into the middle of the output (x1), then fills the front and the back from that result by reversal (front also negated)
|
|
|
|
++ sub sp, sp, #32 // 32-byte frame: x19/x20 at [sp], x30 at [sp, #16]
|
|
|
|
++ stp x19, x20, [sp]
|
|
|
|
++ str x30, [sp, #16]
|
|
|
|
++ ldr w3, [x0, #28] // mdct_bits
|
|
|
|
++ mov x19, #1
|
|
|
|
++ mov x20, x1 // x20 = start of the output, kept across the call
|
|
|
|
++ lsl x19, x19, x3 // x19 = 1 << mdct_bits
|
|
|
|
++ add x1, x1, x19 // the half transform writes starting x19 bytes into the output
|
|
|
|
++
|
|
|
|
++ bl X(ff_imdct_half_neon)
|
|
|
|
++
|
|
|
|
++ add x0, x20, x19, lsl #2 // x0 = output + 4 * x19 bytes ...
|
|
|
|
++ add x1, x20, x19, lsl #1 // x1 = output + 2 * x19 bytes, read upwards
|
|
|
|
++ sub x0, x0, #8 // ... minus 8: written downwards, 8 bytes at a time
|
|
|
|
++ sub x2, x1, #16 // x2 = 16 bytes below x1, read downwards
|
|
|
|
++ mov x3, #-16
|
|
|
|
++ mov x6, #-8
|
|
|
|
++1:
|
|
|
|
++ ld1 {v0.4s}, [x2], x3
|
|
|
|
++ prfum pldl1keep, [x0, #-16]
|
|
|
|
++ rev64 v0.4s, v0.4s // rev64 here plus the ext #8 below reverse all four floats
|
|
|
|
++ ld1 {v2.2s,v3.2s}, [x1], #16
|
|
|
|
++ fneg v4.4s, v0.4s
|
|
|
|
++ prfum pldl1keep, [x2, #-16]
|
|
|
|
++ rev64 v2.2s, v2.2s
|
|
|
|
++ rev64 v3.2s, v3.2s
|
|
|
|
++ ext v4.16b, v4.16b, v4.16b, #8
|
|
|
|
++ st1 {v2.2s}, [x0], x6 // back part: reversed copy, written from the end downwards
|
|
|
|
++ st1 {v3.2s}, [x0], x6
|
|
|
|
++ st1 {v4.4s}, [x20], #16 // front part: reversed and negated copy, written from the start upwards
|
|
|
|
++ subs x19, x19, #16 // 16 bytes of the front are written per pass
|
|
|
|
++ b.gt 1b
|
|
|
|
++
|
|
|
|
++ ldp x19, x20, [sp], #16 // post-indexed pops: 16 + 16 bytes release the 32-byte frame
|
|
|
|
++ ldr x30, [sp], #16
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++function ff_mdct_calc_neon, export=1 // x0 = context (presumably FFTContext; fields read at fixed offsets), x1 = output, x2 = input: folding pre-rotation, ff_fft_calc_neon, then post-rotation; the local label 1 is reused, so 1f/1b always mean the nearest one
|
|
|
|
++ sub sp, sp, #32 // 32-byte frame: x19/x20 at [sp], x30 at [sp, #16]
|
|
|
|
++ stp x19, x20, [sp]
|
|
|
|
++ str x30, [sp, #16]
|
|
|
|
++
|
|
|
|
++ mov x12, #1
|
|
|
|
++ ldr w14, [x0, #28] // mdct_bits
|
|
|
|
++ ldr x4, [x0, #32] // tcos
|
|
|
|
++ ldr x3, [x0, #8] // revtab
|
|
|
|
++ lsl x14, x12, x14 // n = 1 << nbits
|
|
|
|
++ add x7, x2, x14 // in4u
|
|
|
|
++ sub x9, x7, #16 // in4d
|
|
|
|
++ add x2, x7, x14, lsl #1 // in3u
|
|
|
|
++ add x8, x9, x14, lsl #1 // in3d
|
|
|
|
++ add x5, x4, x14, lsl #1 // x5 = tcos + 2*n bytes ...
|
|
|
|
++ sub x5, x5, #16 // ... minus 16: read downwards while x4 is read upwards
|
|
|
|
++ sub x3, x3, #4 // bias by one word so the pre-indexed load in the loop starts at revtab[0]
|
|
|
|
++ mov x12, #-16
|
|
|
|
++ lsr x13, x14, #1 // x13 = n/2: byte offset used for the second revtab read
|
|
|
|
++
|
|
|
|
++ ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
|
|
|
|
++ ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
|
|
|
|
++ ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
|
|
|
|
++ rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
|
|
|
|
++ rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
|
|
|
|
++ ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
|
|
|
|
++ fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
|
|
|
|
++ ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
|
|
|
|
++ rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
|
|
|
|
++ rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
|
|
|
|
++ ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
|
|
|
|
++ fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
|
|
|
|
++ fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
|
|
|
|
++ fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
|
|
|
|
++1:
|
|
|
|
++ fmul v7.2s, v0.2s, v21.2s // I*s
|
|
|
|
++ ldr w10, [x3, x13] // second revtab word; its address moves down 4 bytes per pass (x13 shrinks by 8, x3 grows by 4)
|
|
|
|
++ fmul v6.2s, v2.2s, v20.2s // -R*c
|
|
|
|
++ ldr w6, [x3, #4]! // first revtab word, walking upwards
|
|
|
|
++ fmul v4.2s, v2.2s, v21.2s // -R*s
|
|
|
|
++ fmul v5.2s, v0.2s, v20.2s // I*c
|
|
|
|
++ fmul v24.2s, v16.2s, v30.2s // R*c
|
|
|
|
++ fmul v25.2s, v18.2s, v31.2s // -I*s
|
|
|
|
++ fmul v22.2s, v16.2s, v31.2s // R*s
|
|
|
|
++ fmul v23.2s, v18.2s, v30.2s // I*c
|
|
|
|
++ subs x14, x14, #16
|
|
|
|
++ subs x13, x13, #8 // the b.eq below sees the flags of this later subs; x13 started at x14/2, so both reach zero on the same pass
|
|
|
|
++ fsub v6.2s, v6.2s, v7.2s // -R*c-I*s
|
|
|
|
++ fadd v7.2s, v4.2s, v5.2s // -R*s+I*c
|
|
|
|
++ fsub v24.2s, v25.2s, v24.2s // I*s-R*c
|
|
|
|
++ fadd v25.2s, v22.2s, v23.2s // R*s-I*c
|
|
|
|
++ b.eq 1f
|
|
|
|
++ mov x12, #-16 // reloaded every pass because x12 is reused as an address scratch further down
|
|
|
|
++ ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
|
|
|
|
++ ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
|
|
|
|
++ fneg v7.2s, v7.2s // R*s-I*c
|
|
|
|
++ ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
|
|
|
|
++ rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
|
|
|
|
++ rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
|
|
|
|
++ ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
|
|
|
|
++ fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
|
|
|
|
++ ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
|
|
|
|
++ rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
|
|
|
|
++ rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
|
|
|
|
++ ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
|
|
|
|
++ fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
|
|
|
|
++ fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
|
|
|
|
++ fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
|
|
|
|
++ ubfm x12, x6, #16, #31 // upper 16-bit index of the first revtab word
|
|
|
|
++ ubfm x6, x6, #0, #15 // lower 16-bit index
|
|
|
|
++ add x12, x1, x12, lsl #3 // index * 8 bytes: each output slot is a pair of floats
|
|
|
|
++ add x6, x1, x6, lsl #3
|
|
|
|
++ st2 {v6.s,v7.s}[0], [x6]
|
|
|
|
++ st2 {v6.s,v7.s}[1], [x12]
|
|
|
|
++ ubfm x6, x10, #16, #31 // same split for the second revtab word
|
|
|
|
++ ubfm x10, x10, #0, #15
|
|
|
|
++ add x6 , x1, x6, lsl #3
|
|
|
|
++ add x10, x1, x10, lsl #3
|
|
|
|
++ st2 {v24.s,v25.s}[0], [x10]
|
|
|
|
++ st2 {v24.s,v25.s}[1], [x6]
|
|
|
|
++ b 1b
|
|
|
|
++1:
|
|
|
|
++ fneg v7.2s, v7.2s // R*s-I*c
|
|
|
|
++ ubfm x12, x6, #16, #31
|
|
|
|
++ ubfm x6, x6, #0, #15
|
|
|
|
++ add x12, x1, x12, lsl #3
|
|
|
|
++ add x6, x1, x6, lsl #3
|
|
|
|
++ st2 {v6.s,v7.s}[0], [x6]
|
|
|
|
++ st2 {v6.s,v7.s}[1], [x12]
|
|
|
|
++ ubfm x6, x10, #16, #31
|
|
|
|
++ ubfm x10, x10, #0, #15
|
|
|
|
++ add x6 , x1, x6, lsl #3
|
|
|
|
++ add x10, x1, x10, lsl #3
|
|
|
|
++ st2 {v24.s,v25.s}[0], [x10]
|
|
|
|
++ st2 {v24.s,v25.s}[1], [x6]
|
|
|
|
++
|
|
|
|
++ mov x19, x0 // keep context and output pointer in callee-saved registers across the call
|
|
|
|
++ mov x20, x1
|
|
|
|
++ bl X(ff_fft_calc_neon) // called with x0 and x1 unchanged since entry
|
|
|
|
++
|
|
|
|
++ mov x12, #1
|
|
|
|
++ ldr w14, [x19, #28] // mdct_bits
|
|
|
|
++ ldr x4, [x19, #32] // tcos
|
|
|
|
++ lsl x12, x12, x14 // n = 1 << nbits
|
|
|
|
++ lsr x14, x12, #3 // n8 = n >> 3
|
|
|
|
++
|
|
|
|
++ add x4, x4, x14, lsl #3 // x4 and x6 start n8 * 8 bytes into tcos and the output and move upwards
|
|
|
|
++ add x6, x20, x14, lsl #3
|
|
|
|
++ sub x1, x4, #16 // x1 and x3 start 16 bytes below them and move downwards
|
|
|
|
++ sub x3, x6, #16
|
|
|
|
++
|
|
|
|
++ mov x7, #-16
|
|
|
|
++ mov x8, x6 // x8 and x0 are the store cursors matching the x6 and x3 load cursors
|
|
|
|
++ mov x0, x3
|
|
|
|
++
|
|
|
|
++ ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0
|
|
|
|
++ ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
|
|
|
|
++ ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
|
|
|
|
++1:
|
|
|
|
++ subs x14, x14, #2 // these flags are tested by the b.eq 1f below
|
|
|
|
++ fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0
|
|
|
|
++ ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
|
|
|
|
++ fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0
|
|
|
|
++ fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3
|
|
|
|
++ fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3
|
|
|
|
++ fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0
|
|
|
|
++ fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3
|
|
|
|
++ fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3
|
|
|
|
++ fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0
|
|
|
|
++ fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0
|
|
|
|
++ fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3
|
|
|
|
++ fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3
|
|
|
|
++ fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0
|
|
|
|
++ fneg v4.2s, v4.2s
|
|
|
|
++ fneg v6.2s, v6.2s
|
|
|
|
++ b.eq 1f
|
|
|
|
++ ld2 {v0.2s, v1.2s}, [x3], x7
|
|
|
|
++ ld2 {v20.2s,v21.2s}, [x6], #16
|
|
|
|
++ ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
|
|
|
|
++ rev64 v5.2s, v5.2s
|
|
|
|
++ rev64 v7.2s, v7.2s
|
|
|
|
++ st2 {v4.2s,v5.2s}, [x0], x7
|
|
|
|
++ st2 {v6.2s,v7.2s}, [x8], #16
|
|
|
|
++ b 1b
|
|
|
|
++1:
|
|
|
|
++ rev64 v5.2s, v5.2s
|
|
|
|
++ rev64 v7.2s, v7.2s
|
|
|
|
++ st2 {v4.2s,v5.2s}, [x0]
|
|
|
|
++ st2 {v6.2s,v7.2s}, [x8]
|
|
|
|
++
|
|
|
|
++ ldp x19, x20, [sp], #16 // post-indexed pops: 16 + 16 bytes release the 32-byte frame
|
|
|
|
++ ldr x30, [sp], #16
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/neon.S b/media/ffvpx/libavcodec/aarch64/neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/neon.S
|
|
|
|
+@@ -0,0 +1,149 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 // Transposes the 8x8 byte matrix held in the low 64 bits of r0-r7; result is left in r0-r7, r8 and r9 are clobbered as scratch
|
|
|
|
++ trn1 \r8\().8B, \r0\().8B, \r1\().8B // stage 1: interleave at byte granularity
|
|
|
|
++ trn2 \r9\().8B, \r0\().8B, \r1\().8B
|
|
|
|
++ trn1 \r1\().8B, \r2\().8B, \r3\().8B
|
|
|
|
++ trn2 \r3\().8B, \r2\().8B, \r3\().8B
|
|
|
|
++ trn1 \r0\().8B, \r4\().8B, \r5\().8B
|
|
|
|
++ trn2 \r5\().8B, \r4\().8B, \r5\().8B
|
|
|
|
++ trn1 \r2\().8B, \r6\().8B, \r7\().8B
|
|
|
|
++ trn2 \r7\().8B, \r6\().8B, \r7\().8B
|
|
|
|
++
|
|
|
|
++ trn1 \r4\().4H, \r0\().4H, \r2\().4H // stage 2: interleave at 16-bit granularity
|
|
|
|
++ trn2 \r2\().4H, \r0\().4H, \r2\().4H
|
|
|
|
++ trn1 \r6\().4H, \r5\().4H, \r7\().4H
|
|
|
|
++ trn2 \r7\().4H, \r5\().4H, \r7\().4H
|
|
|
|
++ trn1 \r5\().4H, \r9\().4H, \r3\().4H
|
|
|
|
++ trn2 \r9\().4H, \r9\().4H, \r3\().4H
|
|
|
|
++ trn1 \r3\().4H, \r8\().4H, \r1\().4H
|
|
|
|
++ trn2 \r8\().4H, \r8\().4H, \r1\().4H
|
|
|
|
++
|
|
|
|
++ trn1 \r0\().2S, \r3\().2S, \r4\().2S // stage 3: interleave at 32-bit granularity
|
|
|
|
++ trn2 \r4\().2S, \r3\().2S, \r4\().2S
|
|
|
|
++
|
|
|
|
++ trn1 \r1\().2S, \r5\().2S, \r6\().2S
|
|
|
|
++ trn2 \r5\().2S, \r5\().2S, \r6\().2S
|
|
|
|
++
|
|
|
|
++ trn2 \r6\().2S, \r8\().2S, \r2\().2S // trn2 first: r2 is still needed as an input before trn1 overwrites it
|
|
|
|
++ trn1 \r2\().2S, \r8\().2S, \r2\().2S
|
|
|
|
++
|
|
|
|
++ trn1 \r3\().2S, \r9\().2S, \r7\().2S
|
|
|
|
++ trn2 \r7\().2S, \r9\().2S, \r7\().2S
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 // Same three trn stages as transpose_8x8B on full 128-bit registers; there is no 64-bit stage, so each 64-bit half is transposed on its own; t0 and t1 are clobbered as scratch
|
|
|
|
++ trn1 \t0\().16B, \r0\().16B, \r1\().16B // stage 1: interleave at byte granularity
|
|
|
|
++ trn2 \t1\().16B, \r0\().16B, \r1\().16B
|
|
|
|
++ trn1 \r1\().16B, \r2\().16B, \r3\().16B
|
|
|
|
++ trn2 \r3\().16B, \r2\().16B, \r3\().16B
|
|
|
|
++ trn1 \r0\().16B, \r4\().16B, \r5\().16B
|
|
|
|
++ trn2 \r5\().16B, \r4\().16B, \r5\().16B
|
|
|
|
++ trn1 \r2\().16B, \r6\().16B, \r7\().16B
|
|
|
|
++ trn2 \r7\().16B, \r6\().16B, \r7\().16B
|
|
|
|
++
|
|
|
|
++ trn1 \r4\().8H, \r0\().8H, \r2\().8H // stage 2: interleave at 16-bit granularity
|
|
|
|
++ trn2 \r2\().8H, \r0\().8H, \r2\().8H
|
|
|
|
++ trn1 \r6\().8H, \r5\().8H, \r7\().8H
|
|
|
|
++ trn2 \r7\().8H, \r5\().8H, \r7\().8H
|
|
|
|
++ trn1 \r5\().8H, \t1\().8H, \r3\().8H
|
|
|
|
++ trn2 \t1\().8H, \t1\().8H, \r3\().8H
|
|
|
|
++ trn1 \r3\().8H, \t0\().8H, \r1\().8H
|
|
|
|
++ trn2 \t0\().8H, \t0\().8H, \r1\().8H
|
|
|
|
++
|
|
|
|
++ trn1 \r0\().4S, \r3\().4S, \r4\().4S // stage 3: interleave at 32-bit granularity
|
|
|
|
++ trn2 \r4\().4S, \r3\().4S, \r4\().4S
|
|
|
|
++
|
|
|
|
++ trn1 \r1\().4S, \r5\().4S, \r6\().4S
|
|
|
|
++ trn2 \r5\().4S, \r5\().4S, \r6\().4S
|
|
|
|
++
|
|
|
|
++ trn2 \r6\().4S, \t0\().4S, \r2\().4S // trn2 first: r2 is still needed as an input before trn1 overwrites it
|
|
|
|
++ trn1 \r2\().4S, \t0\().4S, \r2\().4S
|
|
|
|
++
|
|
|
|
++ trn1 \r3\().4S, \t1\().4S, \r7\().4S
|
|
|
|
++ trn2 \r7\().4S, \t1\().4S, \r7\().4S
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
|
|
|
|
++ trn1 \t4\().16B, \r0\().16B, \r1\().16B
|
|
|
|
++ trn2 \t5\().16B, \r0\().16B, \r1\().16B
|
|
|
|
++ trn1 \t6\().16B, \r2\().16B, \r3\().16B
|
|
|
|
++ trn2 \t7\().16B, \r2\().16B, \r3\().16B
|
|
|
|
++
|
|
|
|
++ trn1 \r0\().8H, \t4\().8H, \t6\().8H
|
|
|
|
++ trn2 \r2\().8H, \t4\().8H, \t6\().8H
|
|
|
|
++ trn1 \r1\().8H, \t5\().8H, \t7\().8H
|
|
|
|
++ trn2 \r3\().8H, \t5\().8H, \t7\().8H
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
|
|
|
|
++ trn1 \t4\().8B, \r0\().8B, \r1\().8B
|
|
|
|
++ trn2 \t5\().8B, \r0\().8B, \r1\().8B
|
|
|
|
++ trn1 \t6\().8B, \r2\().8B, \r3\().8B
|
|
|
|
++ trn2 \t7\().8B, \r2\().8B, \r3\().8B
|
|
|
|
++
|
|
|
|
++ trn1 \r0\().4H, \t4\().4H, \t6\().4H
|
|
|
|
++ trn2 \r2\().4H, \t4\().4H, \t6\().4H
|
|
|
|
++ trn1 \r1\().4H, \t5\().4H, \t7\().4H
|
|
|
|
++ trn2 \r3\().4H, \t5\().4H, \t7\().4H
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
|
|
|
|
++ trn1 \r4\().4H, \r0\().4H, \r1\().4H
|
|
|
|
++ trn2 \r5\().4H, \r0\().4H, \r1\().4H
|
|
|
|
++ trn1 \r6\().4H, \r2\().4H, \r3\().4H
|
|
|
|
++ trn2 \r7\().4H, \r2\().4H, \r3\().4H
|
|
|
|
++ trn1 \r0\().2S, \r4\().2S, \r6\().2S
|
|
|
|
++ trn2 \r2\().2S, \r4\().2S, \r6\().2S
|
|
|
|
++ trn1 \r1\().2S, \r5\().2S, \r7\().2S
|
|
|
|
++ trn2 \r3\().2S, \r5\().2S, \r7\().2S
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
|
|
|
++ trn1 \r8\().8H, \r0\().8H, \r1\().8H
|
|
|
|
++ trn2 \r9\().8H, \r0\().8H, \r1\().8H
|
|
|
|
++ trn1 \r1\().8H, \r2\().8H, \r3\().8H
|
|
|
|
++ trn2 \r3\().8H, \r2\().8H, \r3\().8H
|
|
|
|
++ trn1 \r0\().8H, \r4\().8H, \r5\().8H
|
|
|
|
++ trn2 \r5\().8H, \r4\().8H, \r5\().8H
|
|
|
|
++ trn1 \r2\().8H, \r6\().8H, \r7\().8H
|
|
|
|
++ trn2 \r7\().8H, \r6\().8H, \r7\().8H
|
|
|
|
++
|
|
|
|
++ trn1 \r4\().4S, \r0\().4S, \r2\().4S
|
|
|
|
++ trn2 \r2\().4S, \r0\().4S, \r2\().4S
|
|
|
|
++ trn1 \r6\().4S, \r5\().4S, \r7\().4S
|
|
|
|
++ trn2 \r7\().4S, \r5\().4S, \r7\().4S
|
|
|
|
++ trn1 \r5\().4S, \r9\().4S, \r3\().4S
|
|
|
|
++ trn2 \r9\().4S, \r9\().4S, \r3\().4S
|
|
|
|
++ trn1 \r3\().4S, \r8\().4S, \r1\().4S
|
|
|
|
++ trn2 \r8\().4S, \r8\().4S, \r1\().4S
|
|
|
|
++
|
|
|
|
++ trn1 \r0\().2D, \r3\().2D, \r4\().2D
|
|
|
|
++ trn2 \r4\().2D, \r3\().2D, \r4\().2D
|
|
|
|
++
|
|
|
|
++ trn1 \r1\().2D, \r5\().2D, \r6\().2D
|
|
|
|
++ trn2 \r5\().2D, \r5\().2D, \r6\().2D
|
|
|
|
++
|
|
|
|
++ trn2 \r6\().2D, \r8\().2D, \r2\().2D
|
|
|
|
++ trn1 \r2\().2D, \r8\().2D, \r2\().2D
|
|
|
|
++
|
|
|
|
++ trn1 \r3\().2D, \r9\().2D, \r7\().2D
|
|
|
|
++ trn2 \r7\().2D, \r9\().2D, \r7\().2D
|
|
|
|
++
|
|
|
|
++.endm
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
|
|
|
|
+@@ -0,0 +1,362 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON IDCT
|
|
|
|
++ *
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
|
|
|
|
++ *
|
|
|
|
++ * Based on Simple IDCT
|
|
|
|
++ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
|
|
|
++#define Z4c ((1<<(COL_SHIFT-1))/Z4)
|
|
|
|
++#define ROW_SHIFT 11
|
|
|
|
++#define COL_SHIFT 20
|
|
|
|
++
|
|
|
|
++#define z1 v0.H[0]
|
|
|
|
++#define z2 v0.H[1]
|
|
|
|
++#define z3 v0.H[2]
|
|
|
|
++#define z4 v0.H[3]
|
|
|
|
++#define z5 v0.H[4]
|
|
|
|
++#define z6 v0.H[5]
|
|
|
|
++#define z7 v0.H[6]
|
|
|
|
++#define z4c v0.H[7]
|
|
|
|
++
|
|
|
|
++const idct_coeff_neon, align=4
|
|
|
|
++ .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++.macro idct_start data
|
|
|
|
++ prfm pldl1keep, [\data]
|
|
|
|
++ mov x10, x30
|
|
|
|
++ movrel x3, idct_coeff_neon
|
|
|
|
++ ld1 {v0.2D}, [x3]
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro idct_end
|
|
|
|
++ br x10
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro smull1 a, b, c
|
|
|
|
++ smull \a, \b, \c
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro smlal1 a, b, c
|
|
|
|
++ smlal \a, \b, \c
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro smlsl1 a, b, c
|
|
|
|
++ smlsl \a, \b, \c
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro idct_col4_top y1, y2, y3, y4, i, l
|
|
|
|
++ smull\i v7.4S, \y3\l, z2
|
|
|
|
++ smull\i v16.4S, \y3\l, z6
|
|
|
|
++ smull\i v17.4S, \y2\l, z1
|
|
|
|
++ add v19.4S, v23.4S, v7.4S
|
|
|
|
++ smull\i v18.4S, \y2\l, z3
|
|
|
|
++ add v20.4S, v23.4S, v16.4S
|
|
|
|
++ smull\i v5.4S, \y2\l, z5
|
|
|
|
++ sub v21.4S, v23.4S, v16.4S
|
|
|
|
++ smull\i v6.4S, \y2\l, z7
|
|
|
|
++ sub v22.4S, v23.4S, v7.4S
|
|
|
|
++
|
|
|
|
++ smlal\i v17.4S, \y4\l, z3
|
|
|
|
++ smlsl\i v18.4S, \y4\l, z7
|
|
|
|
++ smlsl\i v5.4S, \y4\l, z1
|
|
|
|
++ smlsl\i v6.4S, \y4\l, z5
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro idct_row4_neon y1, y2, y3, y4, pass
|
|
|
|
++ ld1 {\y1\().2D,\y2\().2D}, [x2], #32
|
|
|
|
++ movi v23.4S, #1<<2, lsl #8
|
|
|
|
++ orr v5.16B, \y1\().16B, \y2\().16B
|
|
|
|
++ ld1 {\y3\().2D,\y4\().2D}, [x2], #32
|
|
|
|
++ orr v6.16B, \y3\().16B, \y4\().16B
|
|
|
|
++ orr v5.16B, v5.16B, v6.16B
|
|
|
|
++ mov x3, v5.D[1]
|
|
|
|
++ smlal v23.4S, \y1\().4H, z4
|
|
|
|
++
|
|
|
|
++ idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
|
|
|
|
++
|
|
|
|
++ cmp x3, #0
|
|
|
|
++ b.eq \pass\()f
|
|
|
|
++
|
|
|
|
++ smull2 v7.4S, \y1\().8H, z4
|
|
|
|
++ smlal2 v17.4S, \y2\().8H, z5
|
|
|
|
++ smlsl2 v18.4S, \y2\().8H, z1
|
|
|
|
++ smull2 v16.4S, \y3\().8H, z2
|
|
|
|
++ smlal2 v5.4S, \y2\().8H, z7
|
|
|
|
++ add v19.4S, v19.4S, v7.4S
|
|
|
|
++ sub v20.4S, v20.4S, v7.4S
|
|
|
|
++ sub v21.4S, v21.4S, v7.4S
|
|
|
|
++ add v22.4S, v22.4S, v7.4S
|
|
|
|
++ smlal2 v6.4S, \y2\().8H, z3
|
|
|
|
++ smull2 v7.4S, \y3\().8H, z6
|
|
|
|
++ smlal2 v17.4S, \y4\().8H, z7
|
|
|
|
++ smlsl2 v18.4S, \y4\().8H, z5
|
|
|
|
++ smlal2 v5.4S, \y4\().8H, z3
|
|
|
|
++ smlsl2 v6.4S, \y4\().8H, z1
|
|
|
|
++ add v19.4S, v19.4S, v7.4S
|
|
|
|
++ sub v20.4S, v20.4S, v16.4S
|
|
|
|
++ add v21.4S, v21.4S, v16.4S
|
|
|
|
++ sub v22.4S, v22.4S, v7.4S
|
|
|
|
++
|
|
|
|
++\pass: add \y3\().4S, v19.4S, v17.4S
|
|
|
|
++ add \y4\().4S, v20.4S, v18.4S
|
|
|
|
++ shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
|
|
|
|
++ shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
|
|
|
|
++ add v7.4S, v21.4S, v5.4S
|
|
|
|
++ add v16.4S, v22.4S, v6.4S
|
|
|
|
++ shrn \y3\().4H, v7.4S, #ROW_SHIFT
|
|
|
|
++ shrn \y4\().4H, v16.4S, #ROW_SHIFT
|
|
|
|
++ sub v22.4S, v22.4S, v6.4S
|
|
|
|
++ sub v19.4S, v19.4S, v17.4S
|
|
|
|
++ sub v21.4S, v21.4S, v5.4S
|
|
|
|
++ shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
|
|
|
|
++ sub v20.4S, v20.4S, v18.4S
|
|
|
|
++ shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
|
|
|
|
++ shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
|
|
|
|
++ shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
|
|
|
|
++
|
|
|
|
++ trn1 v16.8H, \y1\().8H, \y2\().8H
|
|
|
|
++ trn2 v17.8H, \y1\().8H, \y2\().8H
|
|
|
|
++ trn1 v18.8H, \y3\().8H, \y4\().8H
|
|
|
|
++ trn2 v19.8H, \y3\().8H, \y4\().8H
|
|
|
|
++ trn1 \y1\().4S, v16.4S, v18.4S
|
|
|
|
++ trn1 \y2\().4S, v17.4S, v19.4S
|
|
|
|
++ trn2 \y3\().4S, v16.4S, v18.4S
|
|
|
|
++ trn2 \y4\().4S, v17.4S, v19.4S
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro declare_idct_col4_neon i, l
|
|
|
|
++function idct_col4_neon\i
|
|
|
|
++ dup v23.4H, z4c
|
|
|
|
++.if \i == 1
|
|
|
|
++ add v23.4H, v23.4H, v24.4H
|
|
|
|
++.else
|
|
|
|
++ mov v5.D[0], v24.D[1]
|
|
|
|
++ add v23.4H, v23.4H, v5.4H
|
|
|
|
++.endif
|
|
|
|
++ smull v23.4S, v23.4H, z4
|
|
|
|
++
|
|
|
|
++ idct_col4_top v24, v25, v26, v27, \i, \l
|
|
|
|
++
|
|
|
|
++ mov x4, v28.D[\i - 1]
|
|
|
|
++ mov x5, v29.D[\i - 1]
|
|
|
|
++ cmp x4, #0
|
|
|
|
++ b.eq 1f
|
|
|
|
++
|
|
|
|
++ smull\i v7.4S, v28\l, z4
|
|
|
|
++ add v19.4S, v19.4S, v7.4S
|
|
|
|
++ sub v20.4S, v20.4S, v7.4S
|
|
|
|
++ sub v21.4S, v21.4S, v7.4S
|
|
|
|
++ add v22.4S, v22.4S, v7.4S
|
|
|
|
++
|
|
|
|
++1: mov x4, v30.D[\i - 1]
|
|
|
|
++ cmp x5, #0
|
|
|
|
++ b.eq 2f
|
|
|
|
++
|
|
|
|
++ smlal\i v17.4S, v29\l, z5
|
|
|
|
++ smlsl\i v18.4S, v29\l, z1
|
|
|
|
++ smlal\i v5.4S, v29\l, z7
|
|
|
|
++ smlal\i v6.4S, v29\l, z3
|
|
|
|
++
|
|
|
|
++2: mov x5, v31.D[\i - 1]
|
|
|
|
++ cmp x4, #0
|
|
|
|
++ b.eq 3f
|
|
|
|
++
|
|
|
|
++ smull\i v7.4S, v30\l, z6
|
|
|
|
++ smull\i v16.4S, v30\l, z2
|
|
|
|
++ add v19.4S, v19.4S, v7.4S
|
|
|
|
++ sub v22.4S, v22.4S, v7.4S
|
|
|
|
++ sub v20.4S, v20.4S, v16.4S
|
|
|
|
++ add v21.4S, v21.4S, v16.4S
|
|
|
|
++
|
|
|
|
++3: cmp x5, #0
|
|
|
|
++ b.eq 4f
|
|
|
|
++
|
|
|
|
++ smlal\i v17.4S, v31\l, z7
|
|
|
|
++ smlsl\i v18.4S, v31\l, z5
|
|
|
|
++ smlal\i v5.4S, v31\l, z3
|
|
|
|
++ smlsl\i v6.4S, v31\l, z1
|
|
|
|
++
|
|
|
|
++4: addhn v7.4H, v19.4S, v17.4S
|
|
|
|
++ addhn2 v7.8H, v20.4S, v18.4S
|
|
|
|
++ subhn v18.4H, v20.4S, v18.4S
|
|
|
|
++ subhn2 v18.8H, v19.4S, v17.4S
|
|
|
|
++
|
|
|
|
++ addhn v16.4H, v21.4S, v5.4S
|
|
|
|
++ addhn2 v16.8H, v22.4S, v6.4S
|
|
|
|
++ subhn v17.4H, v22.4S, v6.4S
|
|
|
|
++ subhn2 v17.8H, v21.4S, v5.4S
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++declare_idct_col4_neon 1, .4H
|
|
|
|
++declare_idct_col4_neon 2, .8H
|
|
|
|
++
|
|
|
|
++function ff_simple_idct_put_neon, export=1
|
|
|
|
++ idct_start x2
|
|
|
|
++
|
|
|
|
++ idct_row4_neon v24, v25, v26, v27, 1
|
|
|
|
++ idct_row4_neon v28, v29, v30, v31, 2
|
|
|
|
++ bl idct_col4_neon1
|
|
|
|
++
|
|
|
|
++ sqshrun v1.8B, v7.8H, #COL_SHIFT-16
|
|
|
|
++ sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
|
|
|
|
++ sqshrun v3.8B, v17.8H, #COL_SHIFT-16
|
|
|
|
++ sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
|
|
|
|
++
|
|
|
|
++ bl idct_col4_neon2
|
|
|
|
++
|
|
|
|
++ sqshrun v2.8B, v7.8H, #COL_SHIFT-16
|
|
|
|
++ sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
|
|
|
|
++ sqshrun v4.8B, v17.8H, #COL_SHIFT-16
|
|
|
|
++ sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
|
|
|
|
++
|
|
|
|
++ zip1 v16.4S, v1.4S, v2.4S
|
|
|
|
++ zip2 v17.4S, v1.4S, v2.4S
|
|
|
|
++
|
|
|
|
++ st1 {v16.D}[0], [x0], x1
|
|
|
|
++ st1 {v16.D}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ zip1 v18.4S, v3.4S, v4.4S
|
|
|
|
++ zip2 v19.4S, v3.4S, v4.4S
|
|
|
|
++
|
|
|
|
++ st1 {v17.D}[0], [x0], x1
|
|
|
|
++ st1 {v17.D}[1], [x0], x1
|
|
|
|
++ st1 {v18.D}[0], [x0], x1
|
|
|
|
++ st1 {v18.D}[1], [x0], x1
|
|
|
|
++ st1 {v19.D}[0], [x0], x1
|
|
|
|
++ st1 {v19.D}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ idct_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_simple_idct_add_neon, export=1
|
|
|
|
++ idct_start x2
|
|
|
|
++
|
|
|
|
++ idct_row4_neon v24, v25, v26, v27, 1
|
|
|
|
++ idct_row4_neon v28, v29, v30, v31, 2
|
|
|
|
++ bl idct_col4_neon1
|
|
|
|
++
|
|
|
|
++ sshr v1.8H, v7.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v2.8H, v16.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v3.8H, v17.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v4.8H, v18.8H, #COL_SHIFT-16
|
|
|
|
++
|
|
|
|
++ bl idct_col4_neon2
|
|
|
|
++
|
|
|
|
++ sshr v7.8H, v7.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v16.8H, v16.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v17.8H, v17.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v18.8H, v18.8H, #COL_SHIFT-16
|
|
|
|
++
|
|
|
|
++ mov x9, x0
|
|
|
|
++ ld1 {v19.D}[0], [x0], x1
|
|
|
|
++ zip1 v23.2D, v1.2D, v7.2D
|
|
|
|
++ zip2 v24.2D, v1.2D, v7.2D
|
|
|
|
++ ld1 {v19.D}[1], [x0], x1
|
|
|
|
++ zip1 v25.2D, v2.2D, v16.2D
|
|
|
|
++ zip2 v26.2D, v2.2D, v16.2D
|
|
|
|
++ ld1 {v20.D}[0], [x0], x1
|
|
|
|
++ zip1 v27.2D, v3.2D, v17.2D
|
|
|
|
++ zip2 v28.2D, v3.2D, v17.2D
|
|
|
|
++ ld1 {v20.D}[1], [x0], x1
|
|
|
|
++ zip1 v29.2D, v4.2D, v18.2D
|
|
|
|
++ zip2 v30.2D, v4.2D, v18.2D
|
|
|
|
++ ld1 {v21.D}[0], [x0], x1
|
|
|
|
++ uaddw v23.8H, v23.8H, v19.8B
|
|
|
|
++ uaddw2 v24.8H, v24.8H, v19.16B
|
|
|
|
++ ld1 {v21.D}[1], [x0], x1
|
|
|
|
++ sqxtun v23.8B, v23.8H
|
|
|
|
++ sqxtun2 v23.16B, v24.8H
|
|
|
|
++ ld1 {v22.D}[0], [x0], x1
|
|
|
|
++ uaddw v24.8H, v25.8H, v20.8B
|
|
|
|
++ uaddw2 v25.8H, v26.8H, v20.16B
|
|
|
|
++ ld1 {v22.D}[1], [x0], x1
|
|
|
|
++ sqxtun v24.8B, v24.8H
|
|
|
|
++ sqxtun2 v24.16B, v25.8H
|
|
|
|
++ st1 {v23.D}[0], [x9], x1
|
|
|
|
++ uaddw v25.8H, v27.8H, v21.8B
|
|
|
|
++ uaddw2 v26.8H, v28.8H, v21.16B
|
|
|
|
++ st1 {v23.D}[1], [x9], x1
|
|
|
|
++ sqxtun v25.8B, v25.8H
|
|
|
|
++ sqxtun2 v25.16B, v26.8H
|
|
|
|
++ st1 {v24.D}[0], [x9], x1
|
|
|
|
++ uaddw v26.8H, v29.8H, v22.8B
|
|
|
|
++ uaddw2 v27.8H, v30.8H, v22.16B
|
|
|
|
++ st1 {v24.D}[1], [x9], x1
|
|
|
|
++ sqxtun v26.8B, v26.8H
|
|
|
|
++ sqxtun2 v26.16B, v27.8H
|
|
|
|
++ st1 {v25.D}[0], [x9], x1
|
|
|
|
++ st1 {v25.D}[1], [x9], x1
|
|
|
|
++ st1 {v26.D}[0], [x9], x1
|
|
|
|
++ st1 {v26.D}[1], [x9], x1
|
|
|
|
++
|
|
|
|
++ idct_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_simple_idct_neon, export=1
|
|
|
|
++ idct_start x0
|
|
|
|
++
|
|
|
|
++ mov x2, x0
|
|
|
|
++ idct_row4_neon v24, v25, v26, v27, 1
|
|
|
|
++ idct_row4_neon v28, v29, v30, v31, 2
|
|
|
|
++ sub x2, x2, #128
|
|
|
|
++ bl idct_col4_neon1
|
|
|
|
++
|
|
|
|
++ sshr v1.8H, v7.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v2.8H, v16.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v3.8H, v17.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v4.8H, v18.8H, #COL_SHIFT-16
|
|
|
|
++
|
|
|
|
++ bl idct_col4_neon2
|
|
|
|
++
|
|
|
|
++ sshr v7.8H, v7.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v16.8H, v16.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v17.8H, v17.8H, #COL_SHIFT-16
|
|
|
|
++ sshr v18.8H, v18.8H, #COL_SHIFT-16
|
|
|
|
++
|
|
|
|
++ zip1 v23.2D, v1.2D, v7.2D
|
|
|
|
++ zip2 v24.2D, v1.2D, v7.2D
|
|
|
|
++ st1 {v23.2D,v24.2D}, [x2], #32
|
|
|
|
++ zip1 v25.2D, v2.2D, v16.2D
|
|
|
|
++ zip2 v26.2D, v2.2D, v16.2D
|
|
|
|
++ st1 {v25.2D,v26.2D}, [x2], #32
|
|
|
|
++ zip1 v27.2D, v3.2D, v17.2D
|
|
|
|
++ zip2 v28.2D, v3.2D, v17.2D
|
|
|
|
++ st1 {v27.2D,v28.2D}, [x2], #32
|
|
|
|
++ zip1 v29.2D, v4.2D, v18.2D
|
|
|
|
++ zip2 v30.2D, v4.2D, v18.2D
|
|
|
|
++ st1 {v29.2D,v30.2D}, [x2], #32
|
|
|
|
++
|
|
|
|
++ idct_end
|
|
|
|
++endfunc
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,47 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/vc1dsp.h"
|
|
|
|
++
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
|
|
|
++ int h, int x, int y);
|
|
|
|
++
|
|
|
|
++av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
|
|
|
|
++ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
|
|
|
|
++ dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
|
|
|
|
++ dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/videodsp.S b/media/ffvpx/libavcodec/aarch64/videodsp.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/videodsp.S
|
|
|
|
+@@ -0,0 +1,28 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++function ff_prefetch_aarch64, export=1
|
|
|
|
++ subs w2, w2, #2
|
|
|
|
++ prfm pldl1strm, [x0]
|
|
|
|
++ prfm pldl1strm, [x0, x1]
|
|
|
|
++ add x0, x0, x1, lsl #1
|
|
|
|
++ b.gt X(ff_prefetch_aarch64)
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/videodsp_init.c b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
|
|
|
|
+@@ -0,0 +1,32 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/videodsp.h"
|
|
|
|
++
|
|
|
|
++void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);
|
|
|
|
++
|
|
|
|
++av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_armv8(cpu_flags))
|
|
|
|
++ ctx->prefetch = ff_prefetch_aarch64;
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
|
|
|
|
+@@ -0,0 +1,29 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
|
|
|
|
++#define AVCODEC_AARCH64_VP9DSP_INIT_H
|
|
|
|
++
|
|
|
|
++#include "libavcodec/vp9dsp.h"
|
|
|
|
++
|
|
|
|
++void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
|
|
|
|
++void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
|
|
|
|
++
|
|
|
|
++#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
|
|
|
|
+@@ -0,0 +1,23 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#define BPP 10
|
|
|
|
++#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
|
|
|
|
++#include "vp9dsp_init_16bpp_aarch64_template.c"
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
|
|
|
|
+@@ -0,0 +1,23 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#define BPP 12
|
|
|
|
++#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
|
|
|
|
++#include "vp9dsp_init_16bpp_aarch64_template.c"
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
|
|
|
|
+@@ -0,0 +1,273 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/internal.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "vp9dsp_init.h"
|
|
|
|
++
|
|
|
|
++#define declare_fpel(type, sz, suffix) \
|
|
|
|
++void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
|
|
|
++ const uint8_t *src, ptrdiff_t src_stride, \
|
|
|
|
++ int h, int mx, int my)
|
|
|
|
++
|
|
|
|
++#define decl_mc_func(op, filter, dir, sz, bpp) \
|
|
|
|
++void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
|
|
|
++ const uint8_t *src, ptrdiff_t src_stride, \
|
|
|
|
++ int h, int mx, int my)
|
|
|
|
++
|
|
|
|
++#define define_8tap_2d_fn(op, filter, sz, bpp) \
|
|
|
|
++static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
|
|
|
++ const uint8_t *src, \
|
|
|
|
++ ptrdiff_t src_stride, \
|
|
|
|
++ int h, int mx, int my) \
|
|
|
|
++{ \
|
|
|
|
++ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
|
|
|
|
++ /* We only need h + 7 lines, but the horizontal filter assumes an \
|
|
|
|
++ * even number of rows, so filter h + 8 lines here. */ \
|
|
|
|
++ ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
|
|
|
|
++ src - 3 * src_stride, src_stride, \
|
|
|
|
++ h + 8, mx, 0); \
|
|
|
|
++ ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
|
|
|
|
++ temp + 3 * 2 * sz, 2 * sz, \
|
|
|
|
++ h, 0, my); \
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define decl_filter_funcs(op, dir, sz, bpp) \
|
|
|
|
++ decl_mc_func(op, regular, dir, sz, bpp); \
|
|
|
|
++ decl_mc_func(op, sharp, dir, sz, bpp); \
|
|
|
|
++ decl_mc_func(op, smooth, dir, sz, bpp)
|
|
|
|
++
|
|
|
|
++#define decl_mc_funcs(sz, bpp) \
|
|
|
|
++ decl_filter_funcs(put, h, sz, bpp); \
|
|
|
|
++ decl_filter_funcs(avg, h, sz, bpp); \
|
|
|
|
++ decl_filter_funcs(put, v, sz, bpp); \
|
|
|
|
++ decl_filter_funcs(avg, v, sz, bpp); \
|
|
|
|
++ decl_filter_funcs(put, hv, sz, bpp); \
|
|
|
|
++ decl_filter_funcs(avg, hv, sz, bpp)
|
|
|
|
++
|
|
|
|
++#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
|
|
|
|
++#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
|
|
|
|
++#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
|
|
|
|
++
|
|
|
|
++declare_fpel(copy, 128, );
|
|
|
|
++declare_fpel(copy, 64, );
|
|
|
|
++declare_fpel(copy, 32, );
|
|
|
|
++declare_fpel(copy, 16, );
|
|
|
|
++declare_fpel(copy, 8, );
|
|
|
|
++declare_fpel(avg, 64, _16);
|
|
|
|
++declare_fpel(avg, 32, _16);
|
|
|
|
++declare_fpel(avg, 16, _16);
|
|
|
|
++declare_fpel(avg, 8, _16);
|
|
|
|
++declare_fpel(avg, 4, _16);
|
|
|
|
++
|
|
|
|
++decl_mc_funcs(64, BPP);
|
|
|
|
++decl_mc_funcs(32, BPP);
|
|
|
|
++decl_mc_funcs(16, BPP);
|
|
|
|
++decl_mc_funcs(8, BPP);
|
|
|
|
++decl_mc_funcs(4, BPP);
|
|
|
|
++
|
|
|
|
++#define define_8tap_2d_funcs(sz, bpp) \
|
|
|
|
++ define_8tap_2d_fn(put, regular, sz, bpp) \
|
|
|
|
++ define_8tap_2d_fn(put, sharp, sz, bpp) \
|
|
|
|
++ define_8tap_2d_fn(put, smooth, sz, bpp) \
|
|
|
|
++ define_8tap_2d_fn(avg, regular, sz, bpp) \
|
|
|
|
++ define_8tap_2d_fn(avg, sharp, sz, bpp) \
|
|
|
|
++ define_8tap_2d_fn(avg, smooth, sz, bpp)
|
|
|
|
++
|
|
|
|
++define_8tap_2d_funcs(64, BPP)
|
|
|
|
++define_8tap_2d_funcs(32, BPP)
|
|
|
|
++define_8tap_2d_funcs(16, BPP)
|
|
|
|
++define_8tap_2d_funcs(8, BPP)
|
|
|
|
++define_8tap_2d_funcs(4, BPP)
|
|
|
|
++
|
|
|
|
++static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++#define init_fpel(idx1, idx2, sz, type, suffix) \
|
|
|
|
++ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
|
|
|
|
++ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
|
|
|
|
++ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
|
|
|
|
++ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
|
|
|
|
++
|
|
|
|
++#define init_copy(idx, sz, suffix) \
|
|
|
|
++ init_fpel(idx, 0, sz, copy, suffix)
|
|
|
|
++
|
|
|
|
++#define init_avg(idx, sz, suffix) \
|
|
|
|
++ init_fpel(idx, 1, sz, avg, suffix)
|
|
|
|
++
|
|
|
|
++#define init_copy_avg(idx, sz1, sz2) \
|
|
|
|
++ init_copy(idx, sz2, _neon); \
|
|
|
|
++ init_avg (idx, sz1, _16_neon)
|
|
|
|
++
|
|
|
|
++ if (have_armv8(cpu_flags)) {
|
|
|
|
++ init_copy(0, 128, _aarch64);
|
|
|
|
++ init_copy(1, 64, _aarch64);
|
|
|
|
++ init_copy(2, 32, _aarch64);
|
|
|
|
++ }
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
|
|
|
|
++ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
|
|
|
|
++
|
|
|
|
++#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
|
|
|
|
++ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
|
|
|
|
++ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
|
|
|
|
++ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
|
|
|
|
++ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
|
|
|
|
++ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
|
|
|
|
++ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
|
|
|
|
++
|
|
|
|
++#define init_mc_funcs_dirs(idx, sz, bpp) \
|
|
|
|
++ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
|
|
|
|
++ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
|
|
|
|
++ init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++ init_avg(0, 64, _16_neon);
|
|
|
|
++ init_avg(1, 32, _16_neon);
|
|
|
|
++ init_avg(2, 16, _16_neon);
|
|
|
|
++ init_copy_avg(3, 8, 16);
|
|
|
|
++ init_copy_avg(4, 4, 8);
|
|
|
|
++
|
|
|
|
++ init_mc_funcs_dirs(0, 64, BPP);
|
|
|
|
++ init_mc_funcs_dirs(1, 32, BPP);
|
|
|
|
++ init_mc_funcs_dirs(2, 16, BPP);
|
|
|
|
++ init_mc_funcs_dirs(3, 8, BPP);
|
|
|
|
++ init_mc_funcs_dirs(4, 4, BPP);
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define define_itxfm2(type_a, type_b, sz, bpp) \
|
|
|
|
++void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
|
|
|
|
++ ptrdiff_t stride, \
|
|
|
|
++ int16_t *_block, int eob)
|
|
|
|
++#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
|
|
|
|
++
|
|
|
|
++#define define_itxfm_funcs(sz, bpp) \
|
|
|
|
++ define_itxfm(idct, idct, sz, bpp); \
|
|
|
|
++ define_itxfm(iadst, idct, sz, bpp); \
|
|
|
|
++ define_itxfm(idct, iadst, sz, bpp); \
|
|
|
|
++ define_itxfm(iadst, iadst, sz, bpp)
|
|
|
|
++
|
|
|
|
++define_itxfm_funcs(4, BPP);
|
|
|
|
++define_itxfm_funcs(8, BPP);
|
|
|
|
++define_itxfm_funcs(16, BPP);
|
|
|
|
++define_itxfm(idct, idct, 32, BPP);
|
|
|
|
++define_itxfm(iwht, iwht, 4, BPP);
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++#define init_itxfm2(tx, sz, bpp) \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
|
|
|
|
++#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
|
|
|
|
++
|
|
|
|
++#define init_idct2(tx, nm, bpp) \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_DCT] = \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_DCT] = \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_ADST] = \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
|
|
|
|
++#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
|
|
|
|
++
|
|
|
|
++ init_itxfm(TX_4X4, 4x4, BPP);
|
|
|
|
++ init_itxfm(TX_8X8, 8x8, BPP);
|
|
|
|
++ init_itxfm(TX_16X16, 16x16, BPP);
|
|
|
|
++ init_idct(TX_32X32, idct_idct_32x32, BPP);
|
|
|
|
++ init_idct(4, iwht_iwht_4x4, BPP);
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define define_loop_filter(dir, wd, size, bpp) \
|
|
|
|
++void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
|
|
|
|
++
|
|
|
|
++#define define_loop_filters(wd, size, bpp) \
|
|
|
|
++ define_loop_filter(h, wd, size, bpp); \
|
|
|
|
++ define_loop_filter(v, wd, size, bpp)
|
|
|
|
++
|
|
|
|
++define_loop_filters(4, 8, BPP);
|
|
|
|
++define_loop_filters(8, 8, BPP);
|
|
|
|
++define_loop_filters(16, 8, BPP);
|
|
|
|
++
|
|
|
|
++define_loop_filters(16, 16, BPP);
|
|
|
|
++
|
|
|
|
++define_loop_filters(44, 16, BPP);
|
|
|
|
++define_loop_filters(48, 16, BPP);
|
|
|
|
++define_loop_filters(84, 16, BPP);
|
|
|
|
++define_loop_filters(88, 16, BPP);
|
|
|
|
++
|
|
|
|
++static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
|
|
|
|
++ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
|
|
|
|
++
|
|
|
|
++#define init_lpf_func_16(idx, dir, bpp) \
|
|
|
|
++ dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
|
|
|
|
++
|
|
|
|
++#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
|
|
|
|
++ dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
|
|
|
|
++
|
|
|
|
++#define init_lpf_funcs_8_wd(idx, wd, bpp) \
|
|
|
|
++ init_lpf_func_8(idx, 0, h, wd, bpp); \
|
|
|
|
++ init_lpf_func_8(idx, 1, v, wd, bpp)
|
|
|
|
++
|
|
|
|
++#define init_lpf_funcs_16(bpp) \
|
|
|
|
++ init_lpf_func_16(0, h, bpp); \
|
|
|
|
++ init_lpf_func_16(1, v, bpp)
|
|
|
|
++
|
|
|
|
++#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
|
|
|
|
++ init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
|
|
|
|
++ init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
|
|
|
|
++
|
|
|
|
++#define init_lpf_funcs_8(bpp) \
|
|
|
|
++ init_lpf_funcs_8_wd(0, 4, bpp); \
|
|
|
|
++ init_lpf_funcs_8_wd(1, 8, bpp); \
|
|
|
|
++ init_lpf_funcs_8_wd(2, 16, bpp)
|
|
|
|
++
|
|
|
|
++#define init_lpf_funcs_mix2(bpp) \
|
|
|
|
++ init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
|
|
|
|
++ init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
|
|
|
|
++ init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
|
|
|
|
++ init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
|
|
|
|
++
|
|
|
|
++ init_lpf_funcs_8(BPP);
|
|
|
|
++ init_lpf_funcs_16(BPP);
|
|
|
|
++ init_lpf_funcs_mix2(BPP);
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++av_cold void INIT_FUNC(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ vp9dsp_mc_init_aarch64(dsp);
|
|
|
|
++ vp9dsp_loopfilter_init_aarch64(dsp);
|
|
|
|
++ vp9dsp_itxfm_init_aarch64(dsp);
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
|
|
|
|
+@@ -0,0 +1,258 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2016 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/internal.h"
|
|
|
|
++#include "libavutil/aarch64/cpu.h"
|
|
|
|
++#include "libavcodec/vp9dsp.h"
|
|
|
|
++#include "vp9dsp_init.h"
|
|
|
|
++
|
|
|
|
++#define declare_fpel(type, sz) \
|
|
|
|
++void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
|
|
|
++ const uint8_t *src, ptrdiff_t src_stride, \
|
|
|
|
++ int h, int mx, int my)
|
|
|
|
++
|
|
|
|
++#define declare_copy_avg(sz) \
|
|
|
|
++ declare_fpel(copy, sz); \
|
|
|
|
++ declare_fpel(avg , sz)
|
|
|
|
++
|
|
|
|
++#define decl_mc_func(op, filter, dir, sz) \
|
|
|
|
++void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
|
|
|
++ const uint8_t *src, ptrdiff_t src_stride, \
|
|
|
|
++ int h, int mx, int my)
|
|
|
|
++
|
|
|
|
++#define define_8tap_2d_fn(op, filter, sz) \
|
|
|
|
++static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
|
|
|
|
++ const uint8_t *src, ptrdiff_t src_stride, \
|
|
|
|
++ int h, int mx, int my) \
|
|
|
|
++{ \
|
|
|
|
++ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
|
|
|
|
++ /* We only need h + 7 lines, but the horizontal filter assumes an \
|
|
|
|
++ * even number of rows, so filter h + 8 lines here. */ \
|
|
|
|
++ ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
|
|
|
|
++ src - 3 * src_stride, src_stride, \
|
|
|
|
++ h + 8, mx, 0); \
|
|
|
|
++ ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
|
|
|
|
++ temp + 3 * sz, sz, \
|
|
|
|
++ h, 0, my); \
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define decl_filter_funcs(op, dir, sz) \
|
|
|
|
++ decl_mc_func(op, regular, dir, sz); \
|
|
|
|
++ decl_mc_func(op, sharp, dir, sz); \
|
|
|
|
++ decl_mc_func(op, smooth, dir, sz)
|
|
|
|
++
|
|
|
|
++#define decl_mc_funcs(sz) \
|
|
|
|
++ decl_filter_funcs(put, h, sz); \
|
|
|
|
++ decl_filter_funcs(avg, h, sz); \
|
|
|
|
++ decl_filter_funcs(put, v, sz); \
|
|
|
|
++ decl_filter_funcs(avg, v, sz); \
|
|
|
|
++ decl_filter_funcs(put, hv, sz); \
|
|
|
|
++ decl_filter_funcs(avg, hv, sz)
|
|
|
|
++
|
|
|
|
++#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
|
|
|
|
++#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
|
|
|
|
++
|
|
|
|
++declare_copy_avg(64);
|
|
|
|
++declare_copy_avg(32);
|
|
|
|
++declare_copy_avg(16);
|
|
|
|
++declare_copy_avg(8);
|
|
|
|
++declare_copy_avg(4);
|
|
|
|
++
|
|
|
|
++decl_mc_funcs(64);
|
|
|
|
++decl_mc_funcs(32);
|
|
|
|
++decl_mc_funcs(16);
|
|
|
|
++decl_mc_funcs(8);
|
|
|
|
++decl_mc_funcs(4);
|
|
|
|
++
|
|
|
|
++#define define_8tap_2d_funcs(sz) \
|
|
|
|
++ define_8tap_2d_fn(put, regular, sz) \
|
|
|
|
++ define_8tap_2d_fn(put, sharp, sz) \
|
|
|
|
++ define_8tap_2d_fn(put, smooth, sz) \
|
|
|
|
++ define_8tap_2d_fn(avg, regular, sz) \
|
|
|
|
++ define_8tap_2d_fn(avg, sharp, sz) \
|
|
|
|
++ define_8tap_2d_fn(avg, smooth, sz)
|
|
|
|
++
|
|
|
|
++define_8tap_2d_funcs(64)
|
|
|
|
++define_8tap_2d_funcs(32)
|
|
|
|
++define_8tap_2d_funcs(16)
|
|
|
|
++define_8tap_2d_funcs(8)
|
|
|
|
++define_8tap_2d_funcs(4)
|
|
|
|
++
|
|
|
|
++static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++#define init_fpel(idx1, idx2, sz, type, suffix) \
|
|
|
|
++ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
|
|
|
|
++ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
|
|
|
|
++ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
|
|
|
|
++ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
|
|
|
|
++
|
|
|
|
++#define init_copy(idx, sz, suffix) \
|
|
|
|
++ init_fpel(idx, 0, sz, copy, suffix)
|
|
|
|
++
|
|
|
|
++#define init_avg(idx, sz, suffix) \
|
|
|
|
++ init_fpel(idx, 1, sz, avg, suffix)
|
|
|
|
++
|
|
|
|
++#define init_copy_avg(idx, sz) \
|
|
|
|
++ init_copy(idx, sz, _neon); \
|
|
|
|
++ init_avg (idx, sz, _neon)
|
|
|
|
++
|
|
|
|
++ if (have_armv8(cpu_flags)) {
|
|
|
|
++ init_copy(0, 64, _aarch64);
|
|
|
|
++ init_copy(1, 32, _aarch64);
|
|
|
|
++ }
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
|
|
|
|
++ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
|
|
|
|
++
|
|
|
|
++#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
|
|
|
|
++ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
|
|
|
|
++ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
|
|
|
|
++ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
|
|
|
|
++ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
|
|
|
|
++ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
|
|
|
|
++ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
|
|
|
|
++
|
|
|
|
++#define init_mc_funcs_dirs(idx, sz) \
|
|
|
|
++ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
|
|
|
|
++ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
|
|
|
|
++ init_mc_funcs(idx, hv, 1, 1, sz,)
|
|
|
|
++
|
|
|
|
++ init_avg(0, 64, _neon);
|
|
|
|
++ init_avg(1, 32, _neon);
|
|
|
|
++ init_copy_avg(2, 16);
|
|
|
|
++ init_copy_avg(3, 8);
|
|
|
|
++ init_copy_avg(4, 4);
|
|
|
|
++
|
|
|
|
++ init_mc_funcs_dirs(0, 64);
|
|
|
|
++ init_mc_funcs_dirs(1, 32);
|
|
|
|
++ init_mc_funcs_dirs(2, 16);
|
|
|
|
++ init_mc_funcs_dirs(3, 8);
|
|
|
|
++ init_mc_funcs_dirs(4, 4);
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define define_itxfm(type_a, type_b, sz) \
|
|
|
|
++void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
|
|
|
|
++ ptrdiff_t stride, \
|
|
|
|
++ int16_t *_block, int eob)
|
|
|
|
++
|
|
|
|
++#define define_itxfm_funcs(sz) \
|
|
|
|
++ define_itxfm(idct, idct, sz); \
|
|
|
|
++ define_itxfm(iadst, idct, sz); \
|
|
|
|
++ define_itxfm(idct, iadst, sz); \
|
|
|
|
++ define_itxfm(iadst, iadst, sz)
|
|
|
|
++
|
|
|
|
++define_itxfm_funcs(4);
|
|
|
|
++define_itxfm_funcs(8);
|
|
|
|
++define_itxfm_funcs(16);
|
|
|
|
++define_itxfm(idct, idct, 32);
|
|
|
|
++define_itxfm(iwht, iwht, 4);
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++#define init_itxfm(tx, sz) \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
|
|
|
|
++
|
|
|
|
++#define init_idct(tx, nm) \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_DCT] = \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_DCT] = \
|
|
|
|
++ dsp->itxfm_add[tx][DCT_ADST] = \
|
|
|
|
++ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
|
|
|
|
++
|
|
|
|
++ init_itxfm(TX_4X4, 4x4);
|
|
|
|
++ init_itxfm(TX_8X8, 8x8);
|
|
|
|
++ init_itxfm(TX_16X16, 16x16);
|
|
|
|
++ init_idct(TX_32X32, idct_idct_32x32);
|
|
|
|
++ init_idct(4, iwht_iwht_4x4);
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define define_loop_filter(dir, wd, len) \
|
|
|
|
++void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
|
|
|
|
++
|
|
|
|
++#define define_loop_filters(wd, len) \
|
|
|
|
++ define_loop_filter(h, wd, len); \
|
|
|
|
++ define_loop_filter(v, wd, len)
|
|
|
|
++
|
|
|
|
++define_loop_filters(4, 8);
|
|
|
|
++define_loop_filters(8, 8);
|
|
|
|
++define_loop_filters(16, 8);
|
|
|
|
++
|
|
|
|
++define_loop_filters(16, 16);
|
|
|
|
++
|
|
|
|
++define_loop_filters(44, 16);
|
|
|
|
++define_loop_filters(48, 16);
|
|
|
|
++define_loop_filters(84, 16);
|
|
|
|
++define_loop_filters(88, 16);
|
|
|
|
++
|
|
|
|
++static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
|
|
|
|
++ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
|
|
|
|
++ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
|
|
|
|
++ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
|
|
|
|
++ dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
|
|
|
|
++ dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
|
|
|
|
++
|
|
|
|
++ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
|
|
|
|
++ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
|
|
|
|
++
|
|
|
|
++ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
|
|
|
|
++ dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
|
|
|
|
++{
|
|
|
|
++ if (bpp == 10) {
|
|
|
|
++ ff_vp9dsp_init_10bpp_aarch64(dsp);
|
|
|
|
++ return;
|
|
|
|
++ } else if (bpp == 12) {
|
|
|
|
++ ff_vp9dsp_init_12bpp_aarch64(dsp);
|
|
|
|
++ return;
|
|
|
|
++ } else if (bpp != 8)
|
|
|
|
++ return;
|
|
|
|
++
|
|
|
|
++ vp9dsp_mc_init_aarch64(dsp);
|
|
|
|
++ vp9dsp_loopfilter_init_aarch64(dsp);
|
|
|
|
++ vp9dsp_itxfm_init_aarch64(dsp);
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
|
|
|
|
+@@ -0,0 +1,2017 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++#include "neon.S"
|
|
|
|
++
|
|
|
|
++const itxfm4_coeffs, align=4
|
|
|
|
++ .short 11585, 0, 6270, 15137
|
|
|
|
++iadst4_coeffs:
|
|
|
|
++ .short 5283, 15212, 9929, 13377
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const iadst8_coeffs, align=4
|
|
|
|
++ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
|
|
|
|
++idct_coeffs:
|
|
|
|
++ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
|
|
|
|
++ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
|
|
|
|
++ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
|
|
|
|
++ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const iadst16_coeffs, align=4
|
|
|
|
++ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
|
|
|
|
++ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
|
|
|
|
++ trn1 \r4\().4s, \r0\().4s, \r1\().4s
|
|
|
|
++ trn2 \r5\().4s, \r0\().4s, \r1\().4s
|
|
|
|
++ trn1 \r6\().4s, \r2\().4s, \r3\().4s
|
|
|
|
++ trn2 \r7\().4s, \r2\().4s, \r3\().4s
|
|
|
|
++ trn1 \r0\().2d, \r4\().2d, \r6\().2d
|
|
|
|
++ trn2 \r2\().2d, \r4\().2d, \r6\().2d
|
|
|
|
++ trn1 \r1\().2d, \r5\().2d, \r7\().2d
|
|
|
|
++ trn2 \r3\().2d, \r5\().2d, \r7\().2d
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
|
|
|
|
++// over two registers.
|
|
|
|
++.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
|
|
|
|
++ transpose_4x4s \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
|
|
|
|
++ transpose_4x4s \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3
|
|
|
|
++
|
|
|
|
++ // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
|
|
|
|
++ // while swapping the two 4x4 matrices between each other
|
|
|
|
++
|
|
|
|
++ // First step of the 4x4 transpose of r1-r7, into t0-t3
|
|
|
|
++ trn1 \t0\().4s, \r1\().4s, \r3\().4s
|
|
|
|
++ trn2 \t1\().4s, \r1\().4s, \r3\().4s
|
|
|
|
++ trn1 \t2\().4s, \r5\().4s, \r7\().4s
|
|
|
|
++ trn2 \t3\().4s, \r5\().4s, \r7\().4s
|
|
|
|
++
|
|
|
|
++ // First step of the 4x4 transpose of r8-r14, into r1-r7
|
|
|
|
++ trn1 \r1\().4s, \r8\().4s, \r10\().4s
|
|
|
|
++ trn2 \r3\().4s, \r8\().4s, \r10\().4s
|
|
|
|
++ trn1 \r5\().4s, \r12\().4s, \r14\().4s
|
|
|
|
++ trn2 \r7\().4s, \r12\().4s, \r14\().4s
|
|
|
|
++
|
|
|
|
++ // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
|
|
|
|
++ trn1 \r8\().2d, \t0\().2d, \t2\().2d
|
|
|
|
++ trn2 \r12\().2d, \t0\().2d, \t2\().2d
|
|
|
|
++ trn1 \r10\().2d, \t1\().2d, \t3\().2d
|
|
|
|
++ trn2 \r14\().2d, \t1\().2d, \t3\().2d
|
|
|
|
++
|
|
|
|
++ // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
|
|
|
|
++ trn1 \t0\().2d, \r1\().2d, \r5\().2d
|
|
|
|
++ trn2 \r5\().2d, \r1\().2d, \r5\().2d
|
|
|
|
++ trn1 \t1\().2d, \r3\().2d, \r7\().2d
|
|
|
|
++ trn2 \r7\().2d, \r3\().2d, \r7\().2d
|
|
|
|
++
|
|
|
|
++ // Move the outputs of trn1 back in place
|
|
|
|
++ mov \r1\().16b, \t0\().16b
|
|
|
|
++ mov \r3\().16b, \t1\().16b
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14 (uses -v0.s[0] when neg > 0)
|
|
|
|
++// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
|
|
|
|
++// in/out are .4s registers; this can do with 4 temp registers, but is
|
|
|
|
++// more efficient if 6 temp registers are available.
|
|
|
|
++.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
|
|
|
|
++.if \neg > 0
|
|
|
|
++ neg \tmp4\().4s, v0.4s
|
|
|
|
++.endif
|
|
|
|
++ add \tmp1\().4s, \in1\().4s, \in2\().4s
|
|
|
|
++ sub \tmp2\().4s, \in1\().4s, \in2\().4s
|
|
|
|
++.if \neg > 0
|
|
|
|
++ smull \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
|
|
|
|
++ smull2 \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
|
|
|
|
++.else
|
|
|
|
++ smull \tmp3\().2d, \tmp1\().2s, v0.s[0]
|
|
|
|
++ smull2 \tmp4\().2d, \tmp1\().4s, v0.s[0]
|
|
|
|
++.endif
|
|
|
|
++.ifb \tmp5
|
|
|
|
++ rshrn \out1\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \out1\().4s, \tmp4\().2d, #14
|
|
|
|
++ smull \tmp3\().2d, \tmp2\().2s, v0.s[0]
|
|
|
|
++ smull2 \tmp4\().2d, \tmp2\().4s, v0.s[0]
|
|
|
|
++ rshrn \out2\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \out2\().4s, \tmp4\().2d, #14
|
|
|
|
++.else
|
|
|
|
++ smull \tmp5\().2d, \tmp2\().2s, v0.s[0]
|
|
|
|
++ smull2 \tmp6\().2d, \tmp2\().4s, v0.s[0]
|
|
|
|
++ rshrn \out1\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \out1\().4s, \tmp4\().2d, #14
|
|
|
|
++ rshrn \out2\().2s, \tmp5\().2d, #14
|
|
|
|
++ rshrn2 \out2\().4s, \tmp6\().2d, #14
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Same as dmbutterfly0 above, but treating the input in in2 as zero,
|
|
|
|
++// writing the same output into both out1 and out2.
|
|
|
|
++.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
|
|
|
|
++ smull \tmp1\().2d, \in1\().2s, v0.s[0]
|
|
|
|
++ smull2 \tmp2\().2d, \in1\().4s, v0.s[0]
|
|
|
|
++ rshrn \out1\().2s, \tmp1\().2d, #14
|
|
|
|
++ rshrn2 \out1\().4s, \tmp2\().2d, #14
|
|
|
|
++ rshrn \out2\().2s, \tmp1\().2d, #14
|
|
|
|
++ rshrn2 \out2\().4s, \tmp2\().2d, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1,out2 = in1 * coef1 - in2 * coef2
|
|
|
|
++// out3,out4 = in1 * coef2 + in2 * coef1
|
|
|
|
++// out are 4 x .2d registers, in are 2 x .4s registers
|
|
|
|
++.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
|
|
|
|
++ smull \out1\().2d, \in1\().2s, \coef1
|
|
|
|
++ smull2 \out2\().2d, \in1\().4s, \coef1
|
|
|
|
++ smull \out3\().2d, \in1\().2s, \coef2
|
|
|
|
++ smull2 \out4\().2d, \in1\().4s, \coef2
|
|
|
|
++ smlsl \out1\().2d, \in2\().2s, \coef2
|
|
|
|
++ smlsl2 \out2\().2d, \in2\().4s, \coef2
|
|
|
|
++ smlal \out3\().2d, \in2\().2s, \coef1
|
|
|
|
++ smlal2 \out4\().2d, \in2\().4s, \coef1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
|
|
|
|
++// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 (sum negated before rounding when neg > 0)
|
|
|
|
++// inout are 2 x .4s registers
|
|
|
|
++.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
|
|
|
|
++ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
|
|
|
|
++.if \neg > 0
|
|
|
|
++ neg \tmp3\().2d, \tmp3\().2d
|
|
|
|
++ neg \tmp4\().2d, \tmp4\().2d
|
|
|
|
++.endif
|
|
|
|
++ rshrn \inout1\().2s, \tmp1\().2d, #14
|
|
|
|
++ rshrn2 \inout1\().4s, \tmp2\().2d, #14
|
|
|
|
++ rshrn \inout2\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \inout2\().4s, \tmp4\().2d, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Same as dmbutterfly above, but treating the input in inout2 as zero
|
|
|
|
++.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
|
|
|
|
++ smull \tmp1\().2d, \inout1\().2s, \coef1
|
|
|
|
++ smull2 \tmp2\().2d, \inout1\().4s, \coef1
|
|
|
|
++ smull \tmp3\().2d, \inout1\().2s, \coef2
|
|
|
|
++ smull2 \tmp4\().2d, \inout1\().4s, \coef2
|
|
|
|
++ rshrn \inout1\().2s, \tmp1\().2d, #14
|
|
|
|
++ rshrn2 \inout1\().4s, \tmp2\().2d, #14
|
|
|
|
++ rshrn \inout2\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \inout2\().4s, \tmp4\().2d, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Same as dmbutterfly above, but treating the input in inout1 as zero
|
|
|
|
++.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
|
|
|
|
++ smull \tmp1\().2d, \inout2\().2s, \coef2
|
|
|
|
++ smull2 \tmp2\().2d, \inout2\().4s, \coef2
|
|
|
|
++ smull \tmp3\().2d, \inout2\().2s, \coef1
|
|
|
|
++ smull2 \tmp4\().2d, \inout2\().4s, \coef1
|
|
|
|
++ neg \tmp1\().2d, \tmp1\().2d
|
|
|
|
++ neg \tmp2\().2d, \tmp2\().2d
|
|
|
|
++ rshrn \inout2\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \inout2\().4s, \tmp4\().2d, #14
|
|
|
|
++ rshrn \inout1\().2s, \tmp1\().2d, #14
|
|
|
|
++ rshrn2 \inout1\().4s, \tmp2\().2d, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro dsmull_h out1, out2, in, coef
|
|
|
|
++ smull \out1\().2d, \in\().2s, \coef
|
|
|
|
++ smull2 \out2\().2d, \in\().4s, \coef
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro drshrn_h out, in1, in2, shift
|
|
|
|
++ rshrn \out\().2s, \in1\().2d, \shift
|
|
|
|
++ rshrn2 \out\().4s, \in2\().2d, \shift
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// out1 = in1 + in2
|
|
|
|
++// out2 = in1 - in2
|
|
|
|
++.macro butterfly_4s out1, out2, in1, in2
|
|
|
|
++ add \out1\().4s, \in1\().4s, \in2\().4s
|
|
|
|
++ sub \out2\().4s, \in1\().4s, \in2\().4s
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1 = in1 - in2
|
|
|
|
++// out2 = in1 + in2
|
|
|
|
++.macro butterfly_4s_r out1, out2, in1, in2
|
|
|
|
++ sub \out1\().4s, \in1\().4s, \in2\().4s
|
|
|
|
++ add \out2\().4s, \in1\().4s, \in2\().4s
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
|
|
|
|
++// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
|
|
|
|
++// out are 2 x .4s registers, in are 4 x .2d registers
|
|
|
|
++.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
|
|
|
|
++ add \tmp1\().2d, \in1\().2d, \in3\().2d
|
|
|
|
++ add \tmp2\().2d, \in2\().2d, \in4\().2d
|
|
|
|
++ sub \tmp3\().2d, \in1\().2d, \in3\().2d
|
|
|
|
++ sub \tmp4\().2d, \in2\().2d, \in4\().2d
|
|
|
|
++ rshrn \out1\().2s, \tmp1\().2d, #14
|
|
|
|
++ rshrn2 \out1\().4s, \tmp2\().2d, #14
|
|
|
|
++ rshrn \out2\().2s, \tmp3\().2d, #14
|
|
|
|
++ rshrn2 \out2\().4s, \tmp4\().2d, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iwht4_10 c0, c1, c2, c3
|
|
|
|
++ add \c0\().4s, \c0\().4s, \c1\().4s
|
|
|
|
++ sub v17.4s, \c2\().4s, \c3\().4s
|
|
|
|
++ sub v16.4s, \c0\().4s, v17.4s
|
|
|
|
++ sshr v16.4s, v16.4s, #1
|
|
|
|
++ sub \c2\().4s, v16.4s, \c1\().4s
|
|
|
|
++ sub \c1\().4s, v16.4s, \c3\().4s
|
|
|
|
++ add \c3\().4s, v17.4s, \c2\().4s
|
|
|
|
++ sub \c0\().4s, \c0\().4s, \c1\().4s
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iwht4_12 c0, c1, c2, c3
|
|
|
|
++ iwht4_10 \c0, \c1, \c2, \c3
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro idct4_10 c0, c1, c2, c3
|
|
|
|
++ mul v22.4s, \c1\().4s, v0.s[3]
|
|
|
|
++ mul v20.4s, \c1\().4s, v0.s[2]
|
|
|
|
++ add v16.4s, \c0\().4s, \c2\().4s
|
|
|
|
++ sub v17.4s, \c0\().4s, \c2\().4s
|
|
|
|
++ mla v22.4s, \c3\().4s, v0.s[2]
|
|
|
|
++ mul v18.4s, v16.4s, v0.s[0]
|
|
|
|
++ mul v24.4s, v17.4s, v0.s[0]
|
|
|
|
++ mls v20.4s, \c3\().4s, v0.s[3]
|
|
|
|
++ srshr v22.4s, v22.4s, #14
|
|
|
|
++ srshr v18.4s, v18.4s, #14
|
|
|
|
++ srshr v24.4s, v24.4s, #14
|
|
|
|
++ srshr v20.4s, v20.4s, #14
|
|
|
|
++ add \c0\().4s, v18.4s, v22.4s
|
|
|
|
++ sub \c3\().4s, v18.4s, v22.4s
|
|
|
|
++ add \c1\().4s, v24.4s, v20.4s
|
|
|
|
++ sub \c2\().4s, v24.4s, v20.4s
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro idct4_12 c0, c1, c2, c3
|
|
|
|
++ smull v22.2d, \c1\().2s, v0.s[3]
|
|
|
|
++ smull2 v23.2d, \c1\().4s, v0.s[3]
|
|
|
|
++ smull v20.2d, \c1\().2s, v0.s[2]
|
|
|
|
++ smull2 v21.2d, \c1\().4s, v0.s[2]
|
|
|
|
++ add v16.4s, \c0\().4s, \c2\().4s
|
|
|
|
++ sub v17.4s, \c0\().4s, \c2\().4s
|
|
|
|
++ smlal v22.2d, \c3\().2s, v0.s[2]
|
|
|
|
++ smlal2 v23.2d, \c3\().4s, v0.s[2]
|
|
|
|
++ smull v18.2d, v16.2s, v0.s[0]
|
|
|
|
++ smull2 v19.2d, v16.4s, v0.s[0]
|
|
|
|
++ smull v24.2d, v17.2s, v0.s[0]
|
|
|
|
++ smull2 v25.2d, v17.4s, v0.s[0]
|
|
|
|
++ smlsl v20.2d, \c3\().2s, v0.s[3]
|
|
|
|
++ smlsl2 v21.2d, \c3\().4s, v0.s[3]
|
|
|
|
++ rshrn v22.2s, v22.2d, #14
|
|
|
|
++ rshrn2 v22.4s, v23.2d, #14
|
|
|
|
++ rshrn v18.2s, v18.2d, #14
|
|
|
|
++ rshrn2 v18.4s, v19.2d, #14
|
|
|
|
++ rshrn v24.2s, v24.2d, #14
|
|
|
|
++ rshrn2 v24.4s, v25.2d, #14
|
|
|
|
++ rshrn v20.2s, v20.2d, #14
|
|
|
|
++ rshrn2 v20.4s, v21.2d, #14
|
|
|
|
++ add \c0\().4s, v18.4s, v22.4s
|
|
|
|
++ sub \c3\().4s, v18.4s, v22.4s
|
|
|
|
++ add \c1\().4s, v24.4s, v20.4s
|
|
|
|
++ sub \c2\().4s, v24.4s, v20.4s
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iadst4_10 c0, c1, c2, c3
|
|
|
|
++ mul v16.4s, \c0\().4s, v1.s[0]
|
|
|
|
++ mla v16.4s, \c2\().4s, v1.s[1]
|
|
|
|
++ mla v16.4s, \c3\().4s, v1.s[2]
|
|
|
|
++ mul v18.4s, \c0\().4s, v1.s[2]
|
|
|
|
++ mls v18.4s, \c2\().4s, v1.s[0]
|
|
|
|
++ sub \c0\().4s, \c0\().4s, \c2\().4s
|
|
|
|
++ mls v18.4s, \c3\().4s, v1.s[1]
|
|
|
|
++ add \c0\().4s, \c0\().4s, \c3\().4s
|
|
|
|
++ mul v22.4s, \c1\().4s, v1.s[3]
|
|
|
|
++ mul v20.4s, \c0\().4s, v1.s[3]
|
|
|
|
++ add v24.4s, v16.4s, v22.4s
|
|
|
|
++ add v26.4s, v18.4s, v22.4s
|
|
|
|
++ srshr \c0\().4s, v24.4s, #14
|
|
|
|
++ add v16.4s, v16.4s, v18.4s
|
|
|
|
++ srshr \c1\().4s, v26.4s, #14
|
|
|
|
++ sub v16.4s, v16.4s, v22.4s
|
|
|
|
++ srshr \c2\().4s, v20.4s, #14
|
|
|
|
++ srshr \c3\().4s, v16.4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iadst4_12 c0, c1, c2, c3 // 4-point iadst on c0-c3 in place, products widened to 64 bit; coefficients in v1.s[0-3]; clobbers v16-v27
|
|
|
|
++ smull v16.2d, \c0\().2s, v1.s[0] // v16/v17 (low/high lanes) = c0 * v1.s[0]
|
|
|
|
++ smull2 v17.2d, \c0\().4s, v1.s[0]
|
|
|
|
++ smlal v16.2d, \c2\().2s, v1.s[1] // v16/v17 += c2 * v1.s[1]
|
|
|
|
++ smlal2 v17.2d, \c2\().4s, v1.s[1]
|
|
|
|
++ smlal v16.2d, \c3\().2s, v1.s[2] // v16/v17 += c3 * v1.s[2]
|
|
|
|
++ smlal2 v17.2d, \c3\().4s, v1.s[2]
|
|
|
|
++ smull v18.2d, \c0\().2s, v1.s[2] // v18/v19 = c0 * v1.s[2]
|
|
|
|
++ smull2 v19.2d, \c0\().4s, v1.s[2]
|
|
|
|
++ smlsl v18.2d, \c2\().2s, v1.s[0] // v18/v19 -= c2 * v1.s[0]
|
|
|
|
++ smlsl2 v19.2d, \c2\().4s, v1.s[0]
|
|
|
|
++ sub \c0\().4s, \c0\().4s, \c2\().4s
|
|
|
|
++ smlsl v18.2d, \c3\().2s, v1.s[1] // v18/v19 -= c3 * v1.s[1]
|
|
|
|
++ smlsl2 v19.2d, \c3\().4s, v1.s[1]
|
|
|
|
++ add \c0\().4s, \c0\().4s, \c3\().4s // c0 = c0 - c2 + c3
|
|
|
|
++ smull v22.2d, \c1\().2s, v1.s[3] // v22/v23 = c1 * v1.s[3]
|
|
|
|
++ smull2 v23.2d, \c1\().4s, v1.s[3]
|
|
|
|
++ smull v20.2d, \c0\().2s, v1.s[3] // v20/v21 = (c0 - c2 + c3) * v1.s[3]
|
|
|
|
++ smull2 v21.2d, \c0\().4s, v1.s[3]
|
|
|
|
++ add v24.2d, v16.2d, v22.2d
|
|
|
|
++ add v25.2d, v17.2d, v23.2d
|
|
|
|
++ add v26.2d, v18.2d, v22.2d
|
|
|
|
++ add v27.2d, v19.2d, v23.2d
|
|
|
|
++ rshrn \c0\().2s, v24.2d, #14 // out c0 = (v16/v17 + v22/v23), rounding shift by 14 and narrow to 32 bit
|
|
|
|
++ rshrn2 \c0\().4s, v25.2d, #14
|
|
|
|
++ add v16.2d, v16.2d, v18.2d
|
|
|
|
++ add v17.2d, v17.2d, v19.2d
|
|
|
|
++ rshrn \c1\().2s, v26.2d, #14 // out c1 = (v18/v19 + v22/v23), rounding shift by 14 and narrow
|
|
|
|
++ rshrn2 \c1\().4s, v27.2d, #14
|
|
|
|
++ sub v16.2d, v16.2d, v22.2d
|
|
|
|
++ sub v17.2d, v17.2d, v23.2d
|
|
|
|
++ rshrn \c2\().2s, v20.2d, #14 // out c2 = v20/v21, rounding shift by 14 and narrow
|
|
|
|
++ rshrn2 \c2\().4s, v21.2d, #14
|
|
|
|
++ rshrn \c3\().2s, v16.2d, #14 // out c3 = (v16/v17 + v18/v19 - v22/v23), rounding shift by 14 and narrow
|
|
|
|
++ rshrn2 \c3\().4s, v17.2d, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// The public functions in this file have the following signature:
|
|
|
|
++// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
|
|
|
++
|
|
|
|
++.macro itxfm_func4x4 txfm1, txfm2, bpp // Emits one exported 4x4 inverse transform + add function for the txfm1/txfm2 pair at bit depth bpp
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
|
|
|
|
++.ifc \txfm1,\txfm2
|
|
|
|
++.ifc \txfm1,idct
|
|
|
|
++ movrel x4, itxfm4_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4] // idct/idct: load four 16 bit coefficients
|
|
|
|
++ sxtl v0.4s, v0.4h // widen them to 32 bit lanes in v0
|
|
|
|
++.endif
|
|
|
|
++.ifc \txfm1,iadst
|
|
|
|
++ movrel x4, iadst4_coeffs
|
|
|
|
++ ld1 {v0.d}[1], [x4] // iadst/iadst: load four 16 bit coefficients into the upper half of v0
|
|
|
|
++ sxtl2 v1.4s, v0.8h // widen the upper half to 32 bit lanes in v1
|
|
|
|
++.endif
|
|
|
|
++.else
|
|
|
|
++ movrel x4, itxfm4_coeffs
|
|
|
|
++ ld1 {v0.8h}, [x4] // mixed pair: load eight 16 bit coefficients
|
|
|
|
++ sxtl2 v1.4s, v0.8h // upper four widened into v1
|
|
|
|
++ sxtl v0.4s, v0.4h // lower four widened into v0
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ movi v30.4s, #0 // v30/v31 = zero, stored below to clear the coefficient block
|
|
|
|
++ movi v31.4s, #0
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #1 // w3 = eob (fourth argument in the signature comment above)
|
|
|
|
++ b.ne 1f
|
|
|
|
++ // DC-only for idct/idct
|
|
|
|
++ ld1 {v2.s}[0], [x2] // load the first 32 bit coefficient
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0] // first scaling by v0.s[0] in 64 bit
|
|
|
|
++ rshrn v2.2s, v2.2d, #14 // rounding shift by 14, narrow back to 32 bit
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0] // second scaling by v0.s[0]
|
|
|
|
++ rshrn v2.2s, v2.2d, #14
|
|
|
|
++ st1 {v31.s}[0], [x2] // clear the coefficient that was just consumed
|
|
|
|
++ dup v4.4s, v2.s[0] // broadcast the result into all lanes of v4-v7
|
|
|
|
++ mov v5.16b, v4.16b
|
|
|
|
++ mov v6.16b, v4.16b
|
|
|
|
++ mov v7.16b, v4.16b
|
|
|
|
++ b 2f
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2] // load the 4x4 block of 32 bit coefficients
|
|
|
|
++ st1 {v30.4s,v31.4s}, [x2], #32 // clear the first 32 bytes of the block
|
|
|
|
++
|
|
|
|
++.ifc \txfm1,iwht
|
|
|
|
++ sshr v4.4s, v4.4s, #2 // iwht only: arithmetic shift of the input right by 2
|
|
|
|
++ sshr v5.4s, v5.4s, #2
|
|
|
|
++ sshr v6.4s, v6.4s, #2
|
|
|
|
++ sshr v7.4s, v7.4s, #2
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ \txfm1\()4_\bpp v4, v5, v6, v7 // first 1-D pass using the 4-point macro for txfm1 at this bit depth
|
|
|
|
++
|
|
|
|
++ st1 {v30.4s,v31.4s}, [x2], #32 // clear the remaining 32 bytes of the block
|
|
|
|
++ // Transpose 4x4 with 32 bit elements
|
|
|
|
++ transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19
|
|
|
|
++
|
|
|
|
++ \txfm2\()4_\bpp v4, v5, v6, v7 // second 1-D pass using the 4-point macro for txfm2
|
|
|
|
++2:
|
|
|
|
++ mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 // v31 = maximum pixel value per 16 bit lane (0x03ff for bpp 10, 0x0fff for bpp 12)
|
|
|
|
++ ld1 {v0.4h}, [x0], x1 // load four 16 bit destination pixels per row, advancing by the stride in x1
|
|
|
|
++ ld1 {v1.4h}, [x0], x1
|
|
|
|
++.ifnc \txfm1,iwht
|
|
|
|
++ srshr v4.4s, v4.4s, #4 // final rounding shift by 4 (not applied for iwht)
|
|
|
|
++ srshr v5.4s, v5.4s, #4
|
|
|
|
++ srshr v6.4s, v6.4s, #4
|
|
|
|
++ srshr v7.4s, v7.4s, #4
|
|
|
|
++.endif
|
|
|
|
++ uaddw v4.4s, v4.4s, v0.4h // add the zero-extended pixels to the residual
|
|
|
|
++ uaddw v5.4s, v5.4s, v1.4h
|
|
|
|
++ ld1 {v2.4h}, [x0], x1
|
|
|
|
++ ld1 {v3.4h}, [x0], x1
|
|
|
|
++ sqxtun v0.4h, v4.4s // saturating narrow of signed 32 bit to unsigned 16 bit
|
|
|
|
++ sqxtun2 v0.8h, v5.4s
|
|
|
|
++ sub x0, x0, x1, lsl #2 // rewind dst by the four rows just loaded
|
|
|
|
++
|
|
|
|
++ uaddw v6.4s, v6.4s, v2.4h
|
|
|
|
++ umin v0.8h, v0.8h, v31.8h // clamp to the maximum pixel value
|
|
|
|
++ uaddw v7.4s, v7.4s, v3.4h
|
|
|
|
++ st1 {v0.4h}, [x0], x1 // store row 0
|
|
|
|
++ sqxtun v2.4h, v6.4s
|
|
|
|
++ sqxtun2 v2.8h, v7.4s
|
|
|
|
++ umin v2.8h, v2.8h, v31.8h
|
|
|
|
++
|
|
|
|
++ st1 {v0.d}[1], [x0], x1 // store row 1 (upper half of v0)
|
|
|
|
++ st1 {v2.4h}, [x0], x1 // store row 2
|
|
|
|
++ st1 {v2.d}[1], [x0], x1 // store row 3
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro itxfm_funcs4x4 bpp // Instantiates the five 4x4 transform pair functions for one bit depth
|
|
|
|
++itxfm_func4x4 idct, idct, \bpp
|
|
|
|
++itxfm_func4x4 iadst, idct, \bpp
|
|
|
|
++itxfm_func4x4 idct, iadst, \bpp
|
|
|
|
++itxfm_func4x4 iadst, iadst, \bpp
|
|
|
|
++itxfm_func4x4 iwht, iwht, \bpp
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm_funcs4x4 10 // the ff_vp9_*_4x4_add_10_neon functions
|
|
|
|
++itxfm_funcs4x4 12 // the ff_vp9_*_4x4_add_12_neon functions
|
|
|
|
++
|
|
|
|
++function idct8x8_dc_add_neon // DC-only 8x8 idct/idct path; x0 = dst, x1 = stride, x2 = block, w5 = maximum pixel value
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4] // only lane 0 of the coefficients is used below
|
|
|
|
++
|
|
|
|
++ movi v1.4h, #0 // zero, used to clear the consumed coefficient
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++
|
|
|
|
++ ld1 {v2.s}[0], [x2] // load the first 32 bit coefficient
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0] // first scaling by v0.s[0] in 64 bit
|
|
|
|
++ rshrn v2.2s, v2.2d, #14 // rounding shift by 14, narrow back to 32 bit
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0] // second scaling by v0.s[0]
|
|
|
|
++ rshrn v2.2s, v2.2d, #14
|
|
|
|
++ st1 {v1.s}[0], [x2] // clear the coefficient in the block
|
|
|
|
++ dup v2.4s, v2.s[0] // broadcast to all four lanes
|
|
|
|
++
|
|
|
|
++ srshr v2.4s, v2.4s, #5 // final rounding shift by 5, same as the full 8x8 path
|
|
|
|
++
|
|
|
|
++ mov x4, #8 // row counter
|
|
|
|
++ mov x3, x0 // x0 is the read pointer, x3 the write pointer
|
|
|
|
++ dup v31.8h, w5 // pixel maximum; x5 is set by the 8x8 _10/_12 entry points before branching here
|
|
|
|
++1:
|
|
|
|
++ // Loop to add the constant from v2 into all 8x8 outputs
|
|
|
|
++ subs x4, x4, #2 // two rows per iteration
|
|
|
|
++ ld1 {v3.8h}, [x0], x1
|
|
|
|
++ ld1 {v4.8h}, [x0], x1
|
|
|
|
++ uaddw v16.4s, v2.4s, v3.4h // constant + zero-extended pixels
|
|
|
|
++ uaddw2 v17.4s, v2.4s, v3.8h
|
|
|
|
++ uaddw v18.4s, v2.4s, v4.4h
|
|
|
|
++ uaddw2 v19.4s, v2.4s, v4.8h
|
|
|
|
++ sqxtun v3.4h, v16.4s // saturating narrow to unsigned 16 bit
|
|
|
|
++ sqxtun2 v3.8h, v17.4s
|
|
|
|
++ sqxtun v4.4h, v18.4s
|
|
|
|
++ sqxtun2 v4.8h, v19.4s
|
|
|
|
++ umin v3.8h, v3.8h, v31.8h // clamp to the pixel maximum
|
|
|
|
++ umin v4.8h, v4.8h, v31.8h
|
|
|
|
++ st1 {v3.8h}, [x3], x1
|
|
|
|
++ st1 {v4.8h}, [x3], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 // 8-point idct on r0-r7 in place; t0-t5 are scratch; reads coefficient lanes v0.s[2-3] and v1.s[0-3] directly
|
|
|
|
++ dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
|
|
|
|
++ dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3 // r2 = t2a, r6 = t3a
|
|
|
|
++ dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3 // r1 = t4a, r7 = t7a
|
|
|
|
++ dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3 // r5 = t5a, r3 = t6a
|
|
|
|
++
|
|
|
|
++ butterfly_4s \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
|
|
|
|
++ butterfly_4s \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
|
|
|
|
++ butterfly_4s \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
|
|
|
|
++ butterfly_4s \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
|
|
|
|
++
|
|
|
|
++ butterfly_4s \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
|
|
|
|
++ butterfly_4s \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
|
|
|
|
++ butterfly_4s \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
|
|
|
|
++ butterfly_4s \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 // 8-point iadst on r0-r7 in place; t0-t5 are scratch; reads coefficient lanes v2.s[0-3], v3.s[0-3] and v0.s[2-3] directly
|
|
|
|
++ dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a
|
|
|
|
++ dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a
|
|
|
|
++
|
|
|
|
++ dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
|
|
|
|
++ dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a
|
|
|
|
++ dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a
|
|
|
|
++
|
|
|
|
++ dbutterfly_n \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
|
|
|
|
++ dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
|
|
|
|
++
|
|
|
|
++ butterfly_4s \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3
|
|
|
|
++ neg \r7\().4s, \r7\().4s // r7 = out[7]
|
|
|
|
++ butterfly_4s \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a
|
|
|
|
++ dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a
|
|
|
|
++
|
|
|
|
++ dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4]
|
|
|
|
++ neg \r3\().4s, \r3\().4s // r3 = out[3]
|
|
|
|
++
|
|
|
|
++ dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
|
|
|
|
++ neg \r1\().4s, \r1\().4s // r1 = out[1]
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5]
|
|
|
|
++ neg \r5\().4s, \r5\().4s // r5 = out[5]
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro itxfm_func8x8 txfm1, txfm2 // Emits the shared 8x8 transform + add body plus the exported _10 and _12 entry points that set the pixel maximum in x5
|
|
|
|
++function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #1 // w3 = eob; exactly one coefficient takes the DC-only path
|
|
|
|
++ b.eq idct8x8_dc_add_neon
|
|
|
|
++.endif
|
|
|
|
++ // The iadst also uses a few coefficients from
|
|
|
|
++ // idct, so those always need to be loaded.
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++.else
|
|
|
|
++ movrel x4, iadst8_coeffs
|
|
|
|
++ ld1 {v1.8h}, [x4], #16 // load eight 16 bit coefficients and advance x4 by 16 bytes
|
|
|
|
++ stp d8, d9, [sp, #-0x10]! // v8/v9 are used as scratch in the non idct/idct variants; restored before ret
|
|
|
|
++ sxtl2 v3.4s, v1.8h // widen them into v2 (low four) and v3 (high four)
|
|
|
|
++ sxtl v2.4s, v1.4h
|
|
|
|
++.endif
|
|
|
|
++ ld1 {v0.8h}, [x4] // load eight more 16 bit coefficients from x4
|
|
|
|
++ sxtl2 v1.4s, v0.8h // widen them into v0 (low four) and v1 (high four)
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++
|
|
|
|
++ movi v4.4s, #0 // v4-v7 = zero, stored below to clear the coefficient block
|
|
|
|
++ movi v5.4s, #0
|
|
|
|
++ movi v6.4s, #0
|
|
|
|
++ movi v7.4s, #0
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64 // load the 8x8 block of 32 bit coefficients (256 bytes) into v16-v31
|
|
|
|
++ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
|
|
|
|
++ ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
|
|
|
|
++ ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
|
|
|
|
++ sub x2, x2, #256 // rewind to the start of the block
|
|
|
|
++ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 // clear the whole block
|
|
|
|
++ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
|
|
|
|
++ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
|
|
|
|
++ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
|
|
|
|
++ idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
|
|
|
|
++.else
|
|
|
|
++ \txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
|
|
|
|
++ \txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ // Transpose 8x8 with 32 bit elements
|
|
|
|
++ transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
|
|
|
|
++ idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
|
|
|
|
++.else
|
|
|
|
++ \txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
|
|
|
|
++ \txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
|
|
|
|
++.endif
|
|
|
|
++2:
|
|
|
|
++ mov x3, x0 // x0 is the read pointer, x3 the write pointer
|
|
|
|
++ // Add into the destination
|
|
|
|
++ ld1 {v0.8h}, [x0], x1 // one row of eight 16 bit pixels
|
|
|
|
++ srshr v16.4s, v16.4s, #5 // final rounding shift by 5
|
|
|
|
++ srshr v17.4s, v17.4s, #5
|
|
|
|
++ ld1 {v1.8h}, [x0], x1
|
|
|
|
++ srshr v18.4s, v18.4s, #5
|
|
|
|
++ srshr v19.4s, v19.4s, #5
|
|
|
|
++ ld1 {v2.8h}, [x0], x1
|
|
|
|
++ srshr v20.4s, v20.4s, #5
|
|
|
|
++ srshr v21.4s, v21.4s, #5
|
|
|
|
++ uaddw v16.4s, v16.4s, v0.4h // add the zero-extended pixels to the residual
|
|
|
|
++ uaddw2 v17.4s, v17.4s, v0.8h
|
|
|
|
++ ld1 {v3.8h}, [x0], x1
|
|
|
|
++ srshr v22.4s, v22.4s, #5
|
|
|
|
++ srshr v23.4s, v23.4s, #5
|
|
|
|
++ uaddw v18.4s, v18.4s, v1.4h
|
|
|
|
++ uaddw2 v19.4s, v19.4s, v1.8h
|
|
|
|
++ ld1 {v4.8h}, [x0], x1
|
|
|
|
++ srshr v24.4s, v24.4s, #5
|
|
|
|
++ srshr v25.4s, v25.4s, #5
|
|
|
|
++ uaddw v20.4s, v20.4s, v2.4h
|
|
|
|
++ uaddw2 v21.4s, v21.4s, v2.8h
|
|
|
|
++ sqxtun v0.4h, v16.4s // saturating narrow to unsigned 16 bit
|
|
|
|
++ sqxtun2 v0.8h, v17.4s
|
|
|
|
++ dup v16.8h, w5 // v16 is free now; reuse it for the pixel maximum passed in x5
|
|
|
|
++ ld1 {v5.8h}, [x0], x1
|
|
|
|
++ srshr v26.4s, v26.4s, #5
|
|
|
|
++ srshr v27.4s, v27.4s, #5
|
|
|
|
++ uaddw v22.4s, v22.4s, v3.4h
|
|
|
|
++ uaddw2 v23.4s, v23.4s, v3.8h
|
|
|
|
++ sqxtun v1.4h, v18.4s
|
|
|
|
++ sqxtun2 v1.8h, v19.4s
|
|
|
|
++ umin v0.8h, v0.8h, v16.8h // clamp to the pixel maximum
|
|
|
|
++ ld1 {v6.8h}, [x0], x1
|
|
|
|
++ srshr v28.4s, v28.4s, #5
|
|
|
|
++ srshr v29.4s, v29.4s, #5
|
|
|
|
++ uaddw v24.4s, v24.4s, v4.4h
|
|
|
|
++ uaddw2 v25.4s, v25.4s, v4.8h
|
|
|
|
++ sqxtun v2.4h, v20.4s
|
|
|
|
++ sqxtun2 v2.8h, v21.4s
|
|
|
|
++ umin v1.8h, v1.8h, v16.8h
|
|
|
|
++ ld1 {v7.8h}, [x0], x1
|
|
|
|
++ srshr v30.4s, v30.4s, #5
|
|
|
|
++ srshr v31.4s, v31.4s, #5
|
|
|
|
++ uaddw v26.4s, v26.4s, v5.4h
|
|
|
|
++ uaddw2 v27.4s, v27.4s, v5.8h
|
|
|
|
++ sqxtun v3.4h, v22.4s
|
|
|
|
++ sqxtun2 v3.8h, v23.4s
|
|
|
|
++ umin v2.8h, v2.8h, v16.8h
|
|
|
|
++
|
|
|
|
++ st1 {v0.8h}, [x3], x1
|
|
|
|
++ uaddw v28.4s, v28.4s, v6.4h
|
|
|
|
++ uaddw2 v29.4s, v29.4s, v6.8h
|
|
|
|
++ st1 {v1.8h}, [x3], x1
|
|
|
|
++ sqxtun v4.4h, v24.4s
|
|
|
|
++ sqxtun2 v4.8h, v25.4s
|
|
|
|
++ umin v3.8h, v3.8h, v16.8h
|
|
|
|
++ st1 {v2.8h}, [x3], x1
|
|
|
|
++ uaddw v30.4s, v30.4s, v7.4h
|
|
|
|
++ uaddw2 v31.4s, v31.4s, v7.8h
|
|
|
|
++ st1 {v3.8h}, [x3], x1
|
|
|
|
++ sqxtun v5.4h, v26.4s
|
|
|
|
++ sqxtun2 v5.8h, v27.4s
|
|
|
|
++ umin v4.8h, v4.8h, v16.8h
|
|
|
|
++ st1 {v4.8h}, [x3], x1
|
|
|
|
++ sqxtun v6.4h, v28.4s
|
|
|
|
++ sqxtun2 v6.8h, v29.4s
|
|
|
|
++ umin v5.8h, v5.8h, v16.8h
|
|
|
|
++ st1 {v5.8h}, [x3], x1
|
|
|
|
++ sqxtun v7.4h, v30.4s
|
|
|
|
++ sqxtun2 v7.8h, v31.4s
|
|
|
|
++ umin v6.8h, v6.8h, v16.8h
|
|
|
|
++
|
|
|
|
++ st1 {v6.8h}, [x3], x1
|
|
|
|
++ umin v7.8h, v7.8h, v16.8h
|
|
|
|
++ st1 {v7.8h}, [x3], x1
|
|
|
|
++
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ ldp d8, d9, [sp], 0x10 // restore d8/d9 saved at function entry
|
|
|
|
++.endif
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
|
|
|
|
++ mov x5, #0x03ff // maximum 10 bit pixel value, consumed as w5 by the shared body
|
|
|
|
++ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
|
|
|
|
++ mov x5, #0x0fff // maximum 12 bit pixel value, consumed as w5 by the shared body
|
|
|
|
++ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm_func8x8 idct, idct
|
|
|
|
++itxfm_func8x8 iadst, idct
|
|
|
|
++itxfm_func8x8 idct, iadst
|
|
|
|
++itxfm_func8x8 iadst, iadst
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++function idct16x16_dc_add_neon // DC-only 16x16 idct/idct path; x0 = dst, x1 = stride, x2 = block, w13 = maximum pixel value
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4] // only lane 0 of the coefficients is used below
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++
|
|
|
|
++ movi v1.4h, #0 // zero, used to clear the consumed coefficient
|
|
|
|
++
|
|
|
|
++ ld1 {v2.s}[0], [x2] // load the first 32 bit coefficient
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0] // first scaling by v0.s[0] in 64 bit
|
|
|
|
++ rshrn v2.2s, v2.2d, #14 // rounding shift by 14, narrow back to 32 bit
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0] // second scaling by v0.s[0]
|
|
|
|
++ rshrn v2.2s, v2.2d, #14
|
|
|
|
++ st1 {v1.s}[0], [x2] // clear the coefficient in the block
|
|
|
|
++ dup v2.4s, v2.s[0]
|
|
|
|
++
|
|
|
|
++ srshr v0.4s, v2.4s, #6 // final rounding shift by 6; the constant now lives in v0
|
|
|
|
++
|
|
|
|
++ mov x3, x0 // x0 is the read pointer, x3 the write pointer
|
|
|
|
++ mov x4, #16 // row counter
|
|
|
|
++ dup v31.8h, w13 // pixel maximum; NOTE(review): x13 is set outside this chunk, presumably by the exported 16x16 entry points - confirm
|
|
|
|
++1:
|
|
|
|
++ // Loop to add the constant from v0 into all 16x16 outputs
|
|
|
|
++ subs x4, x4, #2 // two rows per iteration
|
|
|
|
++ ld1 {v1.8h,v2.8h}, [x0], x1 // one row of sixteen 16 bit pixels
|
|
|
|
++ uaddw v16.4s, v0.4s, v1.4h // constant + zero-extended pixels
|
|
|
|
++ uaddw2 v17.4s, v0.4s, v1.8h
|
|
|
|
++ ld1 {v3.8h,v4.8h}, [x0], x1
|
|
|
|
++ uaddw v18.4s, v0.4s, v2.4h
|
|
|
|
++ uaddw2 v19.4s, v0.4s, v2.8h
|
|
|
|
++ uaddw v20.4s, v0.4s, v3.4h
|
|
|
|
++ uaddw2 v21.4s, v0.4s, v3.8h
|
|
|
|
++ uaddw v22.4s, v0.4s, v4.4h
|
|
|
|
++ uaddw2 v23.4s, v0.4s, v4.8h
|
|
|
|
++ sqxtun v1.4h, v16.4s // saturating narrow to unsigned 16 bit
|
|
|
|
++ sqxtun2 v1.8h, v17.4s
|
|
|
|
++ sqxtun v2.4h, v18.4s
|
|
|
|
++ sqxtun2 v2.8h, v19.4s
|
|
|
|
++ sqxtun v3.4h, v20.4s
|
|
|
|
++ sqxtun2 v3.8h, v21.4s
|
|
|
|
++ sqxtun v4.4h, v22.4s
|
|
|
|
++ sqxtun2 v4.8h, v23.4s
|
|
|
|
++ umin v1.8h, v1.8h, v31.8h // clamp to the pixel maximum
|
|
|
|
++ umin v2.8h, v2.8h, v31.8h
|
|
|
|
++ st1 {v1.8h,v2.8h}, [x3], x1
|
|
|
|
++ umin v3.8h, v3.8h, v31.8h
|
|
|
|
++ umin v4.8h, v4.8h, v31.8h
|
|
|
|
++ st1 {v3.8h,v4.8h}, [x3], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct16_end // Shared tail of idct16, idct16_half and idct16_quarter: final butterflies leaving out[0-15] in v16-v31, then ret
|
|
|
|
++ butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a
|
|
|
|
++ butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6
|
|
|
|
++ butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5
|
|
|
|
++ butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4
|
|
|
|
++ butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a
|
|
|
|
++ butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10
|
|
|
|
++ butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13
|
|
|
|
++ butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a
|
|
|
|
++ dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
|
|
|
|
++
|
|
|
|
++ butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
|
|
|
|
++ butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
|
|
|
|
++ butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
|
|
|
|
++ butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
|
|
|
|
++ butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13]
|
|
|
|
++ butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
|
|
|
|
++ butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
|
|
|
|
++ butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10]
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function idct16 // 16-point idct on v16-v31 in place; coefficients read from .4s lanes of v0-v3; v4-v9 are scratch; returns through the ret in idct16_end
|
|
|
|
++ dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
|
|
|
|
++ dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
|
|
|
|
++ dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
|
|
|
|
++ dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a
|
|
|
|
++ dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a
|
|
|
|
++ dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a
|
|
|
|
++ dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
|
|
|
|
++ dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
|
|
|
|
++
|
|
|
|
++ butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3
|
|
|
|
++ butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2
|
|
|
|
++ butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5
|
|
|
|
++ butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6
|
|
|
|
++ butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9
|
|
|
|
++ butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10
|
|
|
|
++ butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13
|
|
|
|
++ butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
|
|
|
|
++ dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
|
|
|
|
++ dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
|
|
|
|
++ idct16_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_half // Variant of idct16 whose first stage uses the _h/_h1/_h2 helpers; NOTE(review): those helpers are defined outside this chunk, presumably exploiting zero inputs in half of the rows - confirm
|
|
|
|
++ dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
|
|
|
|
++ dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
|
|
|
|
++ dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
|
|
|
|
++ dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a
|
|
|
|
++ dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a
|
|
|
|
++ dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a
|
|
|
|
++ dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
|
|
|
|
++ dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
|
|
|
|
++
|
|
|
|
++ butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3
|
|
|
|
++ butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2
|
|
|
|
++ butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5
|
|
|
|
++ butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6
|
|
|
|
++ butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9
|
|
|
|
++ butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10
|
|
|
|
++ butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13
|
|
|
|
++ butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
|
|
|
|
++ dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
|
|
|
|
++ dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
|
|
|
|
++ idct16_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_quarter // Reduced idct16; NOTE(review): assuming dsmull_h (defined outside this chunk) takes (out, out, in, coef), only v16-v19 are read as inputs - confirm
|
|
|
|
++ dsmull_h v24, v25, v19, v3.s[3]
|
|
|
|
++ dsmull_h v4, v5, v17, v2.s[0]
|
|
|
|
++ dsmull_h v7, v6, v18, v1.s[1]
|
|
|
|
++ dsmull_h v30, v31, v18, v1.s[0]
|
|
|
|
++ neg v24.2d, v24.2d // negate the 64 bit products in v24/v25
|
|
|
|
++ neg v25.2d, v25.2d
|
|
|
|
++ dsmull_h v29, v28, v17, v2.s[1]
|
|
|
|
++ dsmull_h v26, v27, v19, v3.s[2]
|
|
|
|
++ dsmull_h v22, v23, v16, v0.s[0]
|
|
|
|
++ drshrn_h v24, v24, v25, #14
|
|
|
|
++ drshrn_h v16, v4, v5, #14
|
|
|
|
++ drshrn_h v7, v7, v6, #14
|
|
|
|
++ drshrn_h v6, v30, v31, #14
|
|
|
|
++ drshrn_h v29, v29, v28, #14
|
|
|
|
++ drshrn_h v17, v26, v27, #14
|
|
|
|
++ drshrn_h v28, v22, v23, #14
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
|
|
|
|
++ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
|
|
|
|
++ neg v22.2d, v22.2d // negate the 64 bit values in v22/v23 before narrowing
|
|
|
|
++ neg v23.2d, v23.2d
|
|
|
|
++ drshrn_h v27, v20, v21, #14
|
|
|
|
++ drshrn_h v21, v22, v23, #14
|
|
|
|
++ drshrn_h v23, v18, v19, #14
|
|
|
|
++ drshrn_h v25, v30, v31, #14
|
|
|
|
++ mov v4.16b, v28.16b // v4, v5 and v20 all take the value in v28 before the shared tail
|
|
|
|
++ mov v5.16b, v28.16b
|
|
|
|
++ dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
|
|
|
|
++ mov v20.16b, v28.16b
|
|
|
|
++ idct16_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function iadst16 // 16-point iadst on v16-v31 in place; loads its own coefficients from x11 and later x10; v0-v15 are clobbered
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x11] // sixteen 16 bit coefficients from x11 (the caller points x11 at iadst16_coeffs)
|
|
|
|
++ sxtl v2.4s, v1.4h // widen into 32 bit lanes of v0-v3
|
|
|
|
++ sxtl2 v3.4s, v1.8h
|
|
|
|
++ sxtl2 v1.4s, v0.8h
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8
|
|
|
|
++ dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
|
|
|
|
++ dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2
|
|
|
|
++ dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10
|
|
|
|
++ dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4
|
|
|
|
++ dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12
|
|
|
|
++ dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
|
|
|
|
++ dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6
|
|
|
|
++ dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14
|
|
|
|
++ ld1 {v0.8h}, [x10] // switch v0/v1 to eight coefficients from x10 (the caller points x10 at idct_coeffs)
|
|
|
|
++ dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
|
|
|
|
++ sxtl2 v1.4s, v0.8h
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++ dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8
|
|
|
|
++ dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13
|
|
|
|
++ dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10
|
|
|
|
++ butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0
|
|
|
|
++ dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15
|
|
|
|
++ butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1
|
|
|
|
++ dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
|
|
|
|
++ dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
|
|
|
|
++
|
|
|
|
++ butterfly_4s_r v6, v25, v18, v25 // v6 = t6, v25 = t2
|
|
|
|
++ butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12
|
|
|
|
++ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15
|
|
|
|
++
|
|
|
|
++ dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
|
|
|
|
++ dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
|
|
|
|
++ neg v29.4s, v29.4s // v29 = out[13]
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a
|
|
|
|
++ dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a
|
|
|
|
++
|
|
|
|
++ butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a
|
|
|
|
++ butterfly_4s v3, v7, v23, v21 // v3 =-out[1], v7 = t10
|
|
|
|
++
|
|
|
|
++ dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
|
|
|
|
++ neg v19.4s, v19.4s // v19 = out[3]
|
|
|
|
++ dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
|
|
|
|
++
|
|
|
|
++ butterfly_4s v5, v8, v20, v22 // v5 =-out[15],v8 = t3a
|
|
|
|
++ butterfly_4s v4, v9, v24, v26 // v4 = out[14],v9 = t11
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
|
|
|
|
++ dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
|
|
|
|
++ dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
|
|
|
|
++ dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
|
|
|
|
++
|
|
|
|
++ neg v31.4s, v5.4s // v31 = out[15]
|
|
|
|
++ neg v17.4s, v3.4s // v17 = out[1]
|
|
|
|
++
|
|
|
|
++ mov v16.16b, v2.16b // v16 = out[0]
|
|
|
|
++ mov v30.16b, v4.16b // v30 = out[14]
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++// Helper macros; we can't use these expressions directly within
|
|
|
|
++// e.g. .irp due to the extra concatenation \(). Therefore wrap
|
|
|
|
++// them in macros to allow using .irp below.
|
|
|
|
++.macro load i, src, inc // Load 16 bytes into register v(i) from [src], then advance src by inc
|
|
|
|
++ ld1 {v\i\().4s}, [\src], \inc
|
|
|
|
++.endm
|
|
|
|
++.macro store i, dst, inc // Store the 16 bytes of register v(i) to [dst], then advance dst by inc
|
|
|
|
++ st1 {v\i\().4s}, [\dst], \inc
|
|
|
|
++.endm
|
|
|
|
++.macro movi_v i, size, imm // movi into register v(i) with the given arrangement suffix (size) and immediate
|
|
|
|
++ movi v\i\()\size, \imm
|
|
|
|
++.endm
|
|
|
|
++.macro load_clear i, src, inc // Load 16 bytes into v(i), then overwrite the same memory with v4 and advance src by inc
|
|
|
|
++ ld1 {v\i\().4s}, [\src]
|
|
|
|
++ st1 {v4.4s}, [\src], \inc // v4 must hold the clear value; the pass1 caller zeroes it before using this macro
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 // Round eight .4s residual rows, add them to 4-pixel rows read alternately from x0 and x3, clamp to v8 and store back; clobbers v4-v7
|
|
|
|
++ srshr \coef0, \coef0, #6 // final rounding shift by 6
|
|
|
|
++ ld1 {v4.4h}, [x0], x1 // row from x0 into the low half of v4
|
|
|
|
++ srshr \coef1, \coef1, #6
|
|
|
|
++ ld1 {v4.d}[1], [x3], x1 // row from x3 into the high half of v4
|
|
|
|
++ srshr \coef2, \coef2, #6
|
|
|
|
++ ld1 {v5.4h}, [x0], x1
|
|
|
|
++ srshr \coef3, \coef3, #6
|
|
|
|
++ uaddw \coef0, \coef0, v4.4h // add zero-extended pixels to the residual
|
|
|
|
++ ld1 {v5.d}[1], [x3], x1
|
|
|
|
++ srshr \coef4, \coef4, #6
|
|
|
|
++ uaddw2 \coef1, \coef1, v4.8h
|
|
|
|
++ ld1 {v6.4h}, [x0], x1
|
|
|
|
++ srshr \coef5, \coef5, #6
|
|
|
|
++ uaddw \coef2, \coef2, v5.4h
|
|
|
|
++ ld1 {v6.d}[1], [x3], x1
|
|
|
|
++ sqxtun v4.4h, \coef0 // saturating narrow to unsigned 16 bit
|
|
|
|
++ srshr \coef6, \coef6, #6
|
|
|
|
++ uaddw2 \coef3, \coef3, v5.8h
|
|
|
|
++ ld1 {v7.4h}, [x0], x1
|
|
|
|
++ sqxtun2 v4.8h, \coef1
|
|
|
|
++ srshr \coef7, \coef7, #6
|
|
|
|
++ uaddw \coef4, \coef4, v6.4h
|
|
|
|
++ ld1 {v7.d}[1], [x3], x1
|
|
|
|
++ umin v4.8h, v4.8h, v8.8h // clamp to the pixel maximum held in v8
|
|
|
|
++ sub x0, x0, x1, lsl #2 // rewind both pointers by the four loads just done
|
|
|
|
++ sub x3, x3, x1, lsl #2
|
|
|
|
++ sqxtun v5.4h, \coef2
|
|
|
|
++ uaddw2 \coef5, \coef5, v6.8h
|
|
|
|
++ st1 {v4.4h}, [x0], x1
|
|
|
|
++ sqxtun2 v5.8h, \coef3
|
|
|
|
++ uaddw \coef6, \coef6, v7.4h
|
|
|
|
++ st1 {v4.d}[1], [x3], x1
|
|
|
|
++ umin v5.8h, v5.8h, v8.8h
|
|
|
|
++ sqxtun v6.4h, \coef4
|
|
|
|
++ uaddw2 \coef7, \coef7, v7.8h
|
|
|
|
++ st1 {v5.4h}, [x0], x1
|
|
|
|
++ sqxtun2 v6.8h, \coef5
|
|
|
|
++ st1 {v5.d}[1], [x3], x1
|
|
|
|
++ umin v6.8h, v6.8h, v8.8h
|
|
|
|
++ sqxtun v7.4h, \coef6
|
|
|
|
++ st1 {v6.4h}, [x0], x1
|
|
|
|
++ sqxtun2 v7.8h, \coef7
|
|
|
|
++ st1 {v6.d}[1], [x3], x1
|
|
|
|
++ umin v7.8h, v7.8h, v8.8h
|
|
|
|
++ st1 {v7.4h}, [x0], x1
|
|
|
|
++ st1 {v7.d}[1], [x3], x1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
|
|
|
|
++// transpose into a horizontal 16x4 slice and store.
|
|
|
|
++// x0 = dst (temp buffer)
|
|
|
|
++// x1 = slice offset
|
|
|
|
++// x2 = src
|
|
|
|
++// x9 = input stride
|
|
|
|
++.macro itxfm16_1d_funcs txfm // Emits the pass1 and pass2 4x16 slice functions for one 16-point transform (idct or iadst)
|
|
|
|
++function \txfm\()16_1d_4x16_pass1_neon
|
|
|
|
++ mov x14, x30 // keep the return address; the bl below overwrites x30
|
|
|
|
++
|
|
|
|
++ movi v4.4s, #0 // clear value written back by load_clear
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ bl \txfm\()16
|
|
|
|
++
|
|
|
|
++ // Do four 4x4 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
|
|
|
|
++ // contain the four transposed 4x4 blocks.
|
|
|
|
++ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++ // Store the transposed 4x4 blocks horizontally.
|
|
|
|
++ cmp x1, #12 // x1 = slice offset; 12 is the last slice
|
|
|
|
++ b.eq 1f
|
|
|
|
++.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
|
|
|
|
++ store \i, x0, #16
|
|
|
|
++.endr
|
|
|
|
++ br x14 // return through the saved address
|
|
|
|
++1:
|
|
|
|
++ // Special case: For the last input column (x1 == 12),
|
|
|
|
++ // which would be stored as the last row in the temp buffer,
|
|
|
|
++ // don't store the first 4x4 block, but keep it in registers
|
|
|
|
++ // for the first slice of the second pass (where it is the
|
|
|
|
++ // last 4x4 block).
|
|
|
|
++ add x0, x0, #16 // skip the slot of the block that stays in registers
|
|
|
|
++ st1 {v20.4s}, [x0], #16
|
|
|
|
++ st1 {v24.4s}, [x0], #16
|
|
|
|
++ st1 {v28.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v21.4s}, [x0], #16
|
|
|
|
++ st1 {v25.4s}, [x0], #16
|
|
|
|
++ st1 {v29.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v22.4s}, [x0], #16
|
|
|
|
++ st1 {v26.4s}, [x0], #16
|
|
|
|
++ st1 {v30.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v23.4s}, [x0], #16
|
|
|
|
++ st1 {v27.4s}, [x0], #16
|
|
|
|
++ st1 {v31.4s}, [x0], #16
|
|
|
|
++
|
|
|
|
++ mov v28.16b, v16.16b // move the kept block into v28-v31, where pass2 expects its last four rows
|
|
|
|
++ mov v29.16b, v17.16b
|
|
|
|
++ mov v30.16b, v18.16b
|
|
|
|
++ mov v31.16b, v19.16b
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
|
|
|
|
++// load the destination pixels (from a similar 4x16 slice), add and store back.
|
|
|
|
++// x0 = dst
|
|
|
|
++// x1 = dst stride
|
|
|
|
++// x2 = src (temp buffer)
|
|
|
|
++// x3 = slice offset
|
|
|
|
++// x9 = temp buffer stride
|
|
|
|
++function \txfm\()16_1d_4x16_pass2_neon
|
|
|
|
++ mov x14, x30 // keep the return address; the bl below overwrites x30
|
|
|
|
++
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ cbz x3, 1f // slice offset 0: v28-v31 are not reloaded (pass1 left them in registers for its last slice)
|
|
|
|
++.irp i, 28, 29, 30, 31
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ add x3, x0, x1 // x3 = second destination row
|
|
|
|
++ lsl x1, x1, #1 // double the stride so x0 and x3 each step over alternate rows
|
|
|
|
++ bl \txfm\()16
|
|
|
|
++
|
|
|
|
++ dup v8.8h, w13 // pixel maximum used by load_add_store; NOTE(review): x13 is set outside this chunk - confirm
|
|
|
|
++ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
|
|
|
|
++ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
|
|
|
|
++
|
|
|
|
++ br x14 // return through the saved address
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm16_1d_funcs idct
|
|
|
|
++itxfm16_1d_funcs iadst
|
|
|
|
++
|
|
|
|
++// This is the minimum eob value for each subpartition, in increments of 4
|
|
|
|
++const min_eob_idct_idct_16, align=4
|
|
|
|
++ .short 0, 10, 38, 89
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++.macro itxfm_func16x16 txfm1, txfm2
|
|
|
|
++function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #1
|
|
|
|
++ b.eq idct16x16_dc_add_neon
|
|
|
|
++.endif
|
|
|
|
++ mov x15, x30
|
|
|
|
++ // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ stp d14, d15, [sp, #-0x10]!
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++.endif
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++
|
|
|
|
++ sub sp, sp, #1024
|
|
|
|
++
|
|
|
|
++ mov x4, x0
|
|
|
|
++ mov x5, x1
|
|
|
|
++ mov x6, x2
|
|
|
|
++
|
|
|
|
++ movrel x10, idct_coeffs
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ movrel x11, iadst16_coeffs
|
|
|
|
++.endif
|
|
|
|
++.ifc \txfm1,idct
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x10]
|
|
|
|
++ sxtl v2.4s, v1.4h
|
|
|
|
++ sxtl2 v3.4s, v1.8h
|
|
|
|
++ sxtl2 v1.4s, v0.8h
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++.endif
|
|
|
|
++ mov x9, #64
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #10
|
|
|
|
++ b.le idct16x16_quarter_add_16_neon
|
|
|
|
++ cmp w3, #38
|
|
|
|
++ b.le idct16x16_half_add_16_neon
|
|
|
|
++
|
|
|
|
++ movrel x12, min_eob_idct_idct_16, 2
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.irp i, 0, 4, 8, 12
|
|
|
|
++ add x0, sp, #(\i*64)
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++.if \i > 0
|
|
|
|
++ ldrh w1, [x12], #2
|
|
|
|
++ cmp w3, w1
|
|
|
|
++ mov x1, #(16 - \i)/4
|
|
|
|
++ b.le 1f
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ mov x1, #\i
|
|
|
|
++ add x2, x6, #(\i*4)
|
|
|
|
++ bl \txfm1\()16_1d_4x16_pass1_neon
|
|
|
|
++.endr
|
|
|
|
++.ifc \txfm1\()_\txfm2,iadst_idct
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x10]
|
|
|
|
++ sxtl v2.4s, v1.4h
|
|
|
|
++ sxtl2 v3.4s, v1.8h
|
|
|
|
++ sxtl2 v1.4s, v0.8h
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ b 3f
|
|
|
|
++1:
|
|
|
|
++ // Set v28-v31 to zero, for the in-register passthrough of
|
|
|
|
++ // coefficients to pass 2.
|
|
|
|
++ movi v28.4s, #0
|
|
|
|
++ movi v29.4s, #0
|
|
|
|
++ movi v30.4s, #0
|
|
|
|
++ movi v31.4s, #0
|
|
|
|
++2:
|
|
|
|
++ subs x1, x1, #1
|
|
|
|
++.rept 4
|
|
|
|
++ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
|
|
|
|
++.endr
|
|
|
|
++ b.ne 2b
|
|
|
|
++3:
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.irp i, 0, 4, 8, 12
|
|
|
|
++ add x0, x4, #(\i*2)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*4)
|
|
|
|
++ mov x3, #\i
|
|
|
|
++ bl \txfm2\()16_1d_4x16_pass2_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #1024
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++.endif
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
|
|
|
|
++ mov x13, #0x03ff
|
|
|
|
++ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
|
|
|
|
++ mov x13, #0x0fff
|
|
|
|
++ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm_func16x16 idct, idct
|
|
|
|
++itxfm_func16x16 iadst, idct
|
|
|
|
++itxfm_func16x16 idct, iadst
|
|
|
|
++itxfm_func16x16 iadst, iadst
|
|
|
|
++
|
|
|
|
++function idct16_1d_4x16_pass1_quarter_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++ movi v4.4s, #0
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ bl idct16_quarter
|
|
|
|
++
|
|
|
|
++ // Do four 4x4 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
|
|
|
|
++ // contain the four transposed 4x4 blocks.
|
|
|
|
++ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++ // Store the transposed 4x4 blocks horizontally.
|
|
|
|
++ // The first 4x4 block is kept in registers for the second pass,
|
|
|
|
++ // store the rest in the temp buffer.
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v20.4s}, [x0], #16
|
|
|
|
++ st1 {v24.4s}, [x0], #16
|
|
|
|
++ st1 {v28.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v21.4s}, [x0], #16
|
|
|
|
++ st1 {v25.4s}, [x0], #16
|
|
|
|
++ st1 {v29.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v22.4s}, [x0], #16
|
|
|
|
++ st1 {v26.4s}, [x0], #16
|
|
|
|
++ st1 {v30.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v23.4s}, [x0], #16
|
|
|
|
++ st1 {v27.4s}, [x0], #16
|
|
|
|
++ st1 {v31.4s}, [x0], #16
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_1d_4x16_pass2_quarter_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++ // Only load the top 4 lines, and only do it for the later slices.
|
|
|
|
++ // For the first slice, v16-v19 are kept in registers from the first pass.
|
|
|
|
++ cbz x3, 1f
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ add x3, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ bl idct16_quarter
|
|
|
|
++
|
|
|
|
++ dup v8.8h, w13
|
|
|
|
++ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
|
|
|
|
++ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
|
|
|
|
++
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_1d_4x16_pass1_half_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++ movi v4.4s, #0
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ bl idct16_half
|
|
|
|
++
|
|
|
|
++ // Do four 4x4 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
|
|
|
|
++ // contain the four transposed 4x4 blocks.
|
|
|
|
++ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++ // Store the transposed 4x4 blocks horizontally.
|
|
|
|
++ cmp x1, #4
|
|
|
|
++ b.eq 1f
|
|
|
|
++.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
|
|
|
|
++ store \i, x0, #16
|
|
|
|
++.endr
|
|
|
|
++ br x14
|
|
|
|
++1:
|
|
|
|
++ // Special case: For the second input column (x1 == 4),
|
|
|
|
++ // which would be stored as the second row in the temp buffer,
|
|
|
|
++ // don't store the first 4x4 block, but keep it in registers
|
|
|
|
++ // for the first slice of the second pass (where it is the
|
|
|
|
++ // second 4x4 block).
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v20.4s}, [x0], #16
|
|
|
|
++ st1 {v24.4s}, [x0], #16
|
|
|
|
++ st1 {v28.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v21.4s}, [x0], #16
|
|
|
|
++ st1 {v25.4s}, [x0], #16
|
|
|
|
++ st1 {v29.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v22.4s}, [x0], #16
|
|
|
|
++ st1 {v26.4s}, [x0], #16
|
|
|
|
++ st1 {v30.4s}, [x0], #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ st1 {v23.4s}, [x0], #16
|
|
|
|
++ st1 {v27.4s}, [x0], #16
|
|
|
|
++ st1 {v31.4s}, [x0], #16
|
|
|
|
++
|
|
|
|
++ mov v20.16b, v16.16b
|
|
|
|
++ mov v21.16b, v17.16b
|
|
|
|
++ mov v22.16b, v18.16b
|
|
|
|
++ mov v23.16b, v19.16b
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_1d_4x16_pass2_half_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ cbz x3, 1f
|
|
|
|
++.irp i, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ add x3, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ bl idct16_half
|
|
|
|
++
|
|
|
|
++ dup v8.8h, w13
|
|
|
|
++ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
|
|
|
|
++ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
|
|
|
|
++
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct16_partial size
|
|
|
|
++function idct16x16_\size\()_add_16_neon
|
|
|
|
++ add x0, sp, #(0*64)
|
|
|
|
++ mov x1, #0
|
|
|
|
++ add x2, x6, #(0*4)
|
|
|
|
++ bl idct16_1d_4x16_pass1_\size\()_neon
|
|
|
|
++.ifc \size,half
|
|
|
|
++ add x0, sp, #(4*64)
|
|
|
|
++ mov x1, #4
|
|
|
|
++ add x2, x6, #(4*4)
|
|
|
|
++ bl idct16_1d_4x16_pass1_\size\()_neon
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.irp i, 0, 4, 8, 12
|
|
|
|
++ add x0, x4, #(\i*2)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*4)
|
|
|
|
++ mov x3, #\i
|
|
|
|
++ bl idct16_1d_4x16_pass2_\size\()_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #1024
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++idct16_partial quarter
|
|
|
|
++idct16_partial half
|
|
|
|
++
|
|
|
|
++function idct32x32_dc_add_neon
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4]
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++
|
|
|
|
++ movi v1.4h, #0
|
|
|
|
++
|
|
|
|
++ ld1 {v2.s}[0], [x2]
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0]
|
|
|
|
++ rshrn v2.2s, v2.2d, #14
|
|
|
|
++ smull v2.2d, v2.2s, v0.s[0]
|
|
|
|
++ rshrn v2.2s, v2.2d, #14
|
|
|
|
++ st1 {v1.s}[0], [x2]
|
|
|
|
++ dup v2.4s, v2.s[0]
|
|
|
|
++
|
|
|
|
++ srshr v0.4s, v2.4s, #6
|
|
|
|
++
|
|
|
|
++ mov x3, x0
|
|
|
|
++ mov x4, #32
|
|
|
|
++ sub x1, x1, #32
|
|
|
|
++ dup v31.8h, w13
|
|
|
|
++1:
|
|
|
|
++ // Loop to add the constant v0 into all 32x32 outputs
|
|
|
|
++ subs x4, x4, #1
|
|
|
|
++ ld1 {v1.8h,v2.8h}, [x0], #32
|
|
|
|
++ uaddw v16.4s, v0.4s, v1.4h
|
|
|
|
++ uaddw2 v17.4s, v0.4s, v1.8h
|
|
|
|
++ ld1 {v3.8h,v4.8h}, [x0], x1
|
|
|
|
++ uaddw v18.4s, v0.4s, v2.4h
|
|
|
|
++ uaddw2 v19.4s, v0.4s, v2.8h
|
|
|
|
++ uaddw v20.4s, v0.4s, v3.4h
|
|
|
|
++ uaddw2 v21.4s, v0.4s, v3.8h
|
|
|
|
++ uaddw v22.4s, v0.4s, v4.4h
|
|
|
|
++ uaddw2 v23.4s, v0.4s, v4.8h
|
|
|
|
++ sqxtun v1.4h, v16.4s
|
|
|
|
++ sqxtun2 v1.8h, v17.4s
|
|
|
|
++ sqxtun v2.4h, v18.4s
|
|
|
|
++ sqxtun2 v2.8h, v19.4s
|
|
|
|
++ sqxtun v3.4h, v20.4s
|
|
|
|
++ sqxtun2 v3.8h, v21.4s
|
|
|
|
++ sqxtun v4.4h, v22.4s
|
|
|
|
++ sqxtun2 v4.8h, v23.4s
|
|
|
|
++ umin v1.8h, v1.8h, v31.8h
|
|
|
|
++ umin v2.8h, v2.8h, v31.8h
|
|
|
|
++ st1 {v1.8h,v2.8h}, [x3], #32
|
|
|
|
++ umin v3.8h, v3.8h, v31.8h
|
|
|
|
++ umin v4.8h, v4.8h, v31.8h
|
|
|
|
++ st1 {v3.8h,v4.8h}, [x3], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct32_end
|
|
|
|
++ butterfly_4s v16, v5, v4, v5 // v16 = t16a, v5 = t19a
|
|
|
|
++ butterfly_4s v17, v20, v23, v20 // v17 = t17, v20 = t18
|
|
|
|
++ butterfly_4s v18, v6, v7, v6 // v18 = t23a, v6 = t20a
|
|
|
|
++ butterfly_4s v19, v21, v22, v21 // v19 = t22, v21 = t21
|
|
|
|
++ butterfly_4s v4, v28, v28, v30 // v4 = t24a, v28 = t27a
|
|
|
|
++ butterfly_4s v23, v26, v25, v26 // v23 = t25, v26 = t26
|
|
|
|
++ butterfly_4s v7, v8, v29, v31 // v7 = t31a, v8 = t28a
|
|
|
|
++ butterfly_4s v22, v27, v24, v27 // v22 = t30, v27 = t29
|
|
|
|
++
|
|
|
|
++ dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
|
|
|
|
++ dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31 // v8 = t19, v5 = t28
|
|
|
|
++ dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
|
|
|
|
++ dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
|
|
|
|
++
|
|
|
|
++ butterfly_4s v31, v24, v7, v4 // v31 = t31, v24 = t24
|
|
|
|
++ butterfly_4s v30, v25, v22, v23 // v30 = t30a, v25 = t25a
|
|
|
|
++ butterfly_4s_r v23, v16, v16, v18 // v23 = t23, v16 = t16
|
|
|
|
++ butterfly_4s_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
|
|
|
|
++ butterfly_4s v18, v21, v27, v21 // v18 = t18, v21 = t21
|
|
|
|
++ butterfly_4s_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
|
|
|
|
++ butterfly_4s v29, v26, v20, v26 // v29 = t29, v26 = t26
|
|
|
|
++ butterfly_4s v19, v20, v8, v6 // v19 = t19a, v20 = t20
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27, v20 = t20
|
|
|
|
++ dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
|
|
|
|
++ dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22
|
|
|
|
++ dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function idct32_odd
|
|
|
|
++ dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
|
|
|
|
++ dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
|
|
|
|
++ dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
|
|
|
|
++ dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
|
|
|
|
++ dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
|
|
|
|
++ dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
|
|
|
|
++ dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
|
|
|
|
++ dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
|
|
|
|
++
|
|
|
|
++ butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17
|
|
|
|
++ butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18
|
|
|
|
++ butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21
|
|
|
|
++ butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22
|
|
|
|
++ butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25
|
|
|
|
++ butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26
|
|
|
|
++ butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30
|
|
|
|
++ butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29
|
|
|
|
++
|
|
|
|
++ dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
|
|
|
|
++ dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
|
|
|
|
++ dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
|
|
|
|
++ dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
|
|
|
|
++ idct32_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct32_odd_half
|
|
|
|
++ dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
|
|
|
|
++ dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
|
|
|
|
++ dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
|
|
|
|
++ dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
|
|
|
|
++ dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
|
|
|
|
++ dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
|
|
|
|
++ dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
|
|
|
|
++ dmbutterfly_h2 v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
|
|
|
|
++
|
|
|
|
++ butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17
|
|
|
|
++ butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18
|
|
|
|
++ butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21
|
|
|
|
++ butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22
|
|
|
|
++ butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25
|
|
|
|
++ butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26
|
|
|
|
++ butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30
|
|
|
|
++ butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29
|
|
|
|
++
|
|
|
|
++ dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
|
|
|
|
++ dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
|
|
|
|
++ dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
|
|
|
|
++ dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
|
|
|
|
++ idct32_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct32_odd_quarter
|
|
|
|
++ dsmull_h v4, v5, v16, v10.s[0]
|
|
|
|
++ dsmull_h v28, v29, v19, v11.s[3]
|
|
|
|
++ dsmull_h v30, v31, v16, v10.s[1]
|
|
|
|
++ dsmull_h v22, v23, v17, v13.s[2]
|
|
|
|
++ dsmull_h v7, v6, v17, v13.s[3]
|
|
|
|
++ dsmull_h v26, v27, v19, v11.s[2]
|
|
|
|
++ dsmull_h v20, v21, v18, v12.s[0]
|
|
|
|
++ dsmull_h v24, v25, v18, v12.s[1]
|
|
|
|
++
|
|
|
|
++ neg v28.2d, v28.2d
|
|
|
|
++ neg v29.2d, v29.2d
|
|
|
|
++ neg v7.2d, v7.2d
|
|
|
|
++ neg v6.2d, v6.2d
|
|
|
|
++
|
|
|
|
++ drshrn_h v4, v4, v5, #14
|
|
|
|
++ drshrn_h v5, v28, v29, #14
|
|
|
|
++ drshrn_h v29, v30, v31, #14
|
|
|
|
++ drshrn_h v28, v22, v23, #14
|
|
|
|
++ drshrn_h v7, v7, v6, #14
|
|
|
|
++ drshrn_h v31, v26, v27, #14
|
|
|
|
++ drshrn_h v6, v20, v21, #14
|
|
|
|
++ drshrn_h v30, v24, v25, #14
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1]
|
|
|
|
++ dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1]
|
|
|
|
++ drshrn_h v23, v16, v17, #14
|
|
|
|
++ drshrn_h v24, v18, v19, #14
|
|
|
|
++ neg v20.2d, v20.2d
|
|
|
|
++ neg v21.2d, v21.2d
|
|
|
|
++ drshrn_h v27, v27, v26, #14
|
|
|
|
++ drshrn_h v20, v20, v21, #14
|
|
|
|
++ dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3]
|
|
|
|
++ drshrn_h v21, v16, v17, #14
|
|
|
|
++ drshrn_h v26, v18, v19, #14
|
|
|
|
++ dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3]
|
|
|
|
++ drshrn_h v25, v16, v17, #14
|
|
|
|
++ neg v18.2d, v18.2d
|
|
|
|
++ neg v19.2d, v19.2d
|
|
|
|
++ drshrn_h v22, v18, v19, #14
|
|
|
|
++
|
|
|
|
++ idct32_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct32_funcs suffix
|
|
|
|
++// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
|
|
|
|
++// The 32-point IDCT can be decomposed into two 16-point IDCTs;
|
|
|
|
++// a normal IDCT16 with every other input component (the even ones, with
|
|
|
|
++// each output written twice), followed by a separate 16-point IDCT
|
|
|
|
++// of the odd inputs, added/subtracted onto the outputs of the first idct16.
|
|
|
|
++// x0 = dst (temp buffer)
|
|
|
|
++// x1 = unused
|
|
|
|
++// x2 = src
|
|
|
|
++// x9 = double input stride
|
|
|
|
++function idct32_1d_4x32_pass1\suffix\()_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++ movi v4.4s, #0
|
|
|
|
++
|
|
|
|
++ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ bl idct16\suffix
|
|
|
|
++
|
|
|
|
++ // Do four 4x4 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
|
|
|
|
++ // contain the four transposed 4x4 blocks.
|
|
|
|
++ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++ // Store the registers a, b, c, d horizontally, followed by the
|
|
|
|
++ // same registers d, c, b, a mirrored.
|
|
|
|
++.macro store_rev a, b, c, d
|
|
|
|
++ // There's no rev128 instruction, but we reverse each 64 bit
|
|
|
|
++ // half, and then flip them using an ext with 8 bytes offset.
|
|
|
|
++ rev64 v7.4s, \d
|
|
|
|
++ st1 {\a}, [x0], #16
|
|
|
|
++ ext v7.16b, v7.16b, v7.16b, #8
|
|
|
|
++ st1 {\b}, [x0], #16
|
|
|
|
++ rev64 v6.4s, \c
|
|
|
|
++ st1 {\c}, [x0], #16
|
|
|
|
++ ext v6.16b, v6.16b, v6.16b, #8
|
|
|
|
++ st1 {\d}, [x0], #16
|
|
|
|
++ rev64 v5.4s, \b
|
|
|
|
++ st1 {v7.4s}, [x0], #16
|
|
|
|
++ ext v5.16b, v5.16b, v5.16b, #8
|
|
|
|
++ st1 {v6.4s}, [x0], #16
|
|
|
|
++ rev64 v4.4s, \a
|
|
|
|
++ st1 {v5.4s}, [x0], #16
|
|
|
|
++ ext v4.16b, v4.16b, v4.16b, #8
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++.endm
|
|
|
|
++ store_rev v16.4s, v20.4s, v24.4s, v28.4s
|
|
|
|
++ store_rev v17.4s, v21.4s, v25.4s, v29.4s
|
|
|
|
++ store_rev v18.4s, v22.4s, v26.4s, v30.4s
|
|
|
|
++ store_rev v19.4s, v23.4s, v27.4s, v31.4s
|
|
|
|
++ sub x0, x0, #512
|
|
|
|
++.purgem store_rev
|
|
|
|
++
|
|
|
|
++ // Move x2 back to the start of the input, and move
|
|
|
|
++ // to the first odd row
|
|
|
|
++.ifb \suffix
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++ sub x2, x2, x9, lsl #2
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++ sub x2, x2, x9, lsl #3
|
|
|
|
++.endif
|
|
|
|
++ add x2, x2, #128
|
|
|
|
++
|
|
|
|
++ movi v4.4s, #0
|
|
|
|
++ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ bl idct32_odd\suffix
|
|
|
|
++
|
|
|
|
++ transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7
|
|
|
|
++ transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++ // Store the registers a, b, c, d horizontally,
|
|
|
|
++ // adding into the output first, and then mirrored,
|
|
|
|
++ // subtracted from the output.
|
|
|
|
++.macro store_rev a, b, c, d, a16b, b16b
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ rev64 v9.4s, \d
|
|
|
|
++ add v4.4s, v4.4s, \a
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ rev64 v8.4s, \c
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ ext v9.16b, v9.16b, v9.16b, #8
|
|
|
|
++ add v4.4s, v4.4s, \b
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ ext v8.16b, v8.16b, v8.16b, #8
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ rev64 \b, \b
|
|
|
|
++ add v4.4s, v4.4s, \c
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ rev64 \a, \a
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ ext \b16b, \b16b, \b16b, #8
|
|
|
|
++ add v4.4s, v4.4s, \d
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ ext \a16b, \a16b, \a16b, #8
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ sub v4.4s, v4.4s, v9.4s
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ sub v4.4s, v4.4s, v8.4s
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ sub v4.4s, v4.4s, \b
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++ ld1 {v4.4s}, [x0]
|
|
|
|
++ sub v4.4s, v4.4s, \a
|
|
|
|
++ st1 {v4.4s}, [x0], #16
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++ store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
|
|
|
|
++ store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
|
|
|
|
++ store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
|
|
|
|
++ store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
|
|
|
|
++.purgem store_rev
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++// This is mostly the same as 4x32_pass1, but without the transpose,
|
|
|
|
++// and uses the source as temp buffer between the two idct passes, and
|
|
|
|
++// adds into the destination.
|
|
|
|
++// x0 = dst
|
|
|
|
++// x1 = dst stride
|
|
|
|
++// x2 = src (temp buffer)
|
|
|
|
++// x7 = negative double temp buffer stride
|
|
|
|
++// x9 = double temp buffer stride
|
|
|
|
++function idct32_1d_4x32_pass2\suffix\()_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #2
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #3
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ bl idct16\suffix
|
|
|
|
++
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ store \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++ add x2, x2, #128
|
|
|
|
++
|
|
|
|
++ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #2
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #3
|
|
|
|
++.endif
|
|
|
|
++ sub x2, x2, #128
|
|
|
|
++
|
|
|
|
++ bl idct32_odd\suffix
|
|
|
|
++
|
|
|
|
++.macro load_acc_store a, b, c, d, neg=0
|
|
|
|
++.if \neg == 0
|
|
|
|
++ ld1 {v4.4s}, [x2], x9
|
|
|
|
++ ld1 {v5.4s}, [x2], x9
|
|
|
|
++ add v4.4s, v4.4s, \a
|
|
|
|
++ ld1 {v6.4s}, [x2], x9
|
|
|
|
++ add v5.4s, v5.4s, \b
|
|
|
|
++ ld1 {v7.4s}, [x2], x9
|
|
|
|
++ add v6.4s, v6.4s, \c
|
|
|
|
++ add v7.4s, v7.4s, \d
|
|
|
|
++.else
|
|
|
|
++ ld1 {v4.4s}, [x2], x7
|
|
|
|
++ ld1 {v5.4s}, [x2], x7
|
|
|
|
++ sub v4.4s, v4.4s, \a
|
|
|
|
++ ld1 {v6.4s}, [x2], x7
|
|
|
|
++ sub v5.4s, v5.4s, \b
|
|
|
|
++ ld1 {v7.4s}, [x2], x7
|
|
|
|
++ sub v6.4s, v6.4s, \c
|
|
|
|
++ sub v7.4s, v7.4s, \d
|
|
|
|
++.endif
|
|
|
|
++ ld1 {v8.4h}, [x0], x1
|
|
|
|
++ ld1 {v8.d}[1], [x0], x1
|
|
|
|
++ srshr v4.4s, v4.4s, #6
|
|
|
|
++ ld1 {v9.4h}, [x0], x1
|
|
|
|
++ srshr v5.4s, v5.4s, #6
|
|
|
|
++ uaddw v4.4s, v4.4s, v8.4h
|
|
|
|
++ ld1 {v9.d}[1], [x0], x1
|
|
|
|
++ srshr v6.4s, v6.4s, #6
|
|
|
|
++ uaddw2 v5.4s, v5.4s, v8.8h
|
|
|
|
++ srshr v7.4s, v7.4s, #6
|
|
|
|
++ sub x0, x0, x1, lsl #2
|
|
|
|
++ uaddw v6.4s, v6.4s, v9.4h
|
|
|
|
++ sqxtun v4.4h, v4.4s
|
|
|
|
++ uaddw2 v7.4s, v7.4s, v9.8h
|
|
|
|
++ sqxtun2 v4.8h, v5.4s
|
|
|
|
++ umin v4.8h, v4.8h, v15.8h
|
|
|
|
++ st1 {v4.4h}, [x0], x1
|
|
|
|
++ sqxtun v5.4h, v6.4s
|
|
|
|
++ st1 {v4.d}[1], [x0], x1
|
|
|
|
++ sqxtun2 v5.8h, v7.4s
|
|
|
|
++ umin v5.8h, v5.8h, v15.8h
|
|
|
|
++ st1 {v5.4h}, [x0], x1
|
|
|
|
++ st1 {v5.d}[1], [x0], x1
|
|
|
|
++.endm
|
|
|
|
++ load_acc_store v31.4s, v30.4s, v29.4s, v28.4s
|
|
|
|
++ load_acc_store v27.4s, v26.4s, v25.4s, v24.4s
|
|
|
|
++ load_acc_store v23.4s, v22.4s, v21.4s, v20.4s
|
|
|
|
++ load_acc_store v19.4s, v18.4s, v17.4s, v16.4s
|
|
|
|
++ sub x2, x2, x9
|
|
|
|
++ load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1
|
|
|
|
++ load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1
|
|
|
|
++ load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1
|
|
|
|
++ load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1
|
|
|
|
++.purgem load_acc_store
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++idct32_funcs
|
|
|
|
++idct32_funcs _quarter
|
|
|
|
++idct32_funcs _half
|
|
|
|
++
|
|
|
|
++const min_eob_idct_idct_32, align=4
|
|
|
|
++ .short 0, 9, 34, 70, 135, 240, 336, 448 // one threshold per 4-column pass-1 slice; the 32x32 pass-1 loop compares w3 against successive entries and zero-fills the remaining slices once w3 <= entry
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++function vp9_idct_idct_32x32_add_16_neon
|
|
|
|
++ cmp w3, #1
|
|
|
|
++ b.eq idct32x32_dc_add_neon
|
|
|
|
++
|
|
|
|
++ movrel x10, idct_coeffs
|
|
|
|
++
|
|
|
|
++ mov x15, x30
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d14, d15, [sp, #-0x10]!
|
|
|
|
++
|
|
|
|
++ sub sp, sp, #4096
|
|
|
|
++
|
|
|
|
++ mov x4, x0
|
|
|
|
++ mov x5, x1
|
|
|
|
++ mov x6, x2
|
|
|
|
++
|
|
|
|
++ // Double stride of the input, since we only read every other line
|
|
|
|
++ mov x9, #256
|
|
|
|
++ neg x7, x9
|
|
|
|
++
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x10], #32
|
|
|
|
++ sxtl v2.4s, v1.4h
|
|
|
|
++ sxtl2 v3.4s, v1.8h
|
|
|
|
++ sxtl2 v1.4s, v0.8h
|
|
|
|
++ sxtl v0.4s, v0.4h
|
|
|
|
++ ld1 {v10.8h,v11.8h}, [x10]
|
|
|
|
++ sxtl v12.4s, v11.4h
|
|
|
|
++ sxtl2 v13.4s, v11.8h
|
|
|
|
++ sxtl2 v11.4s, v10.8h
|
|
|
|
++ sxtl v10.4s, v10.4h
|
|
|
|
++
|
|
|
|
++ dup v15.8h, w13
|
|
|
|
++
|
|
|
|
++ cmp w3, #34
|
|
|
|
++ b.le idct32x32_quarter_add_16_neon
|
|
|
|
++ cmp w3, #135
|
|
|
|
++ b.le idct32x32_half_add_16_neon
|
|
|
|
++
|
|
|
|
++ movrel x12, min_eob_idct_idct_32, 2
|
|
|
|
++
|
|
|
|
++.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
|
|
|
++ add x0, sp, #(\i*128)
|
|
|
|
++.if \i > 0
|
|
|
|
++ ldrh w1, [x12], #2
|
|
|
|
++ cmp w3, w1
|
|
|
|
++ mov x1, #(32 - \i)/4
|
|
|
|
++ b.le 1f
|
|
|
|
++.endif
|
|
|
|
++ add x2, x6, #(\i*4)
|
|
|
|
++ bl idct32_1d_4x32_pass1_neon
|
|
|
|
++.endr
|
|
|
|
++ b 3f
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ // Write zeros to the temp buffer for pass 2
|
|
|
|
++ movi v16.4s, #0
|
|
|
|
++ movi v17.4s, #0
|
|
|
|
++ movi v18.4s, #0
|
|
|
|
++ movi v19.4s, #0
|
|
|
|
++2:
|
|
|
|
++ subs x1, x1, #1
|
|
|
|
++.rept 4
|
|
|
|
++ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
|
|
|
|
++ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
|
|
|
|
++.endr
|
|
|
|
++ b.ne 2b
|
|
|
|
++3:
|
|
|
|
++.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
|
|
|
++ add x0, x4, #(\i*2)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*4)
|
|
|
|
++ bl idct32_1d_4x32_pass2_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #4096
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_idct_idct_32x32_add_10_neon, export=1
|
|
|
|
++ mov x13, #0x03ff // 0x03ff = (1 << 10) - 1; the shared body broadcasts w13 and applies it with umin as the upper clamp on stored pixels
|
|
|
|
++ b vp9_idct_idct_32x32_add_16_neon // plain branch (not bl): x30 is left untouched for the shared body
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_idct_idct_32x32_add_12_neon, export=1
|
|
|
|
++ mov x13, #0x0fff // 0x0fff = (1 << 12) - 1; the shared body broadcasts w13 and applies it with umin as the upper clamp on stored pixels
|
|
|
|
++ b vp9_idct_idct_32x32_add_16_neon // plain branch (not bl): x30 is left untouched for the shared body
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct32_partial size
|
|
|
|
++function idct32x32_\size\()_add_16_neon
|
|
|
|
++.irp i, 0, 4
|
|
|
|
++ add x0, sp, #(\i*128)
|
|
|
|
++.ifc \size,quarter
|
|
|
|
++.if \i == 4
|
|
|
|
++ cmp w3, #9
|
|
|
|
++ b.le 1f
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ add x2, x6, #(\i*4)
|
|
|
|
++ bl idct32_1d_4x32_pass1_\size\()_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++.ifc \size,half
|
|
|
|
++.irp i, 8, 12
|
|
|
|
++ add x0, sp, #(\i*128)
|
|
|
|
++.if \i == 12
|
|
|
|
++ cmp w3, #70
|
|
|
|
++ b.le 1f
|
|
|
|
++.endif
|
|
|
|
++ add x2, x6, #(\i*4)
|
|
|
|
++ bl idct32_1d_4x32_pass1_\size\()_neon
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++ b 3f
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ // Write zeros to the temp buffer for pass 2
|
|
|
|
++ movi v16.4s, #0
|
|
|
|
++ movi v17.4s, #0
|
|
|
|
++ movi v18.4s, #0
|
|
|
|
++ movi v19.4s, #0
|
|
|
|
++
|
|
|
|
++.rept 4
|
|
|
|
++ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
|
|
|
|
++ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++3:
|
|
|
|
++.irp i, 0, 4, 8, 12, 16, 20, 24, 28
|
|
|
|
++ add x0, x4, #(\i*2)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*4)
|
|
|
|
++ bl idct32_1d_4x32_pass2_\size\()_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #4096
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++idct32_partial quarter
|
|
|
|
++idct32_partial half
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
|
|
|
|
+@@ -0,0 +1,1580 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2016 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++#include "neon.S"
|
|
|
|
++
|
|
|
|
++const itxfm4_coeffs, align=4
|
|
|
|
++ .short 11585, 0, 6270, 15137
|
|
|
|
++iadst4_coeffs:
|
|
|
|
++ .short 5283, 15212, 9929, 13377
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const iadst8_coeffs, align=4
|
|
|
|
++ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
|
|
|
|
++idct_coeffs:
|
|
|
|
++ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
|
|
|
|
++ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
|
|
|
|
++ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
|
|
|
|
++ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++const iadst16_coeffs, align=4
|
|
|
|
++ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
|
|
|
|
++ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
|
|
|
|
++// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
|
|
|
|
++// in/out are .8h registers; this can do with 4 temp registers, but is
|
|
|
|
++// more efficient if 6 temp registers are available.
|
|
|
|
++.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
|
|
|
|
++.if \neg > 0
|
|
|
|
++ neg \tmp4\().4h, v0.4h
|
|
|
|
++.endif
|
|
|
|
++ add \tmp1\().8h, \in1\().8h, \in2\().8h
|
|
|
|
++ sub \tmp2\().8h, \in1\().8h, \in2\().8h
|
|
|
|
++.if \neg > 0
|
|
|
|
++ smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
|
|
|
|
++ smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
|
|
|
|
++.else
|
|
|
|
++ smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
|
|
|
|
++ smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
|
|
|
|
++.endif
|
|
|
|
++.ifb \tmp5
|
|
|
|
++ rshrn \out1\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \out1\().8h, \tmp4\().4s, #14
|
|
|
|
++ smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
|
|
|
|
++ smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
|
|
|
|
++ rshrn \out2\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \out2\().8h, \tmp4\().4s, #14
|
|
|
|
++.else
|
|
|
|
++ smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
|
|
|
|
++ smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
|
|
|
|
++ rshrn \out1\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \out1\().8h, \tmp4\().4s, #14
|
|
|
|
++ rshrn \out2\().4h, \tmp5\().4s, #14
|
|
|
|
++ rshrn2 \out2\().8h, \tmp6\().4s, #14
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Same as dmbutterfly0 above, but treating the input in in2 as zero,
|
|
|
|
++// writing the same output into both out1 and out2.
|
|
|
|
++.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
|
|
|
|
++ smull \tmp1\().4s, \in1\().4h, v0.h[0]
|
|
|
|
++ smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
|
|
|
|
++ rshrn \out1\().4h, \tmp1\().4s, #14
|
|
|
|
++ rshrn2 \out1\().8h, \tmp2\().4s, #14
|
|
|
|
++ rshrn \out2\().4h, \tmp1\().4s, #14
|
|
|
|
++ rshrn2 \out2\().8h, \tmp2\().4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1,out2 = in1 * coef1 - in2 * coef2
|
|
|
|
++// out3,out4 = in1 * coef2 + in2 * coef1
|
|
|
|
++// out are 4 x .4s registers, in are 2 x .8h registers
|
|
|
|
++.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
|
|
|
|
++ smull \out1\().4s, \in1\().4h, \coef1
|
|
|
|
++ smull2 \out2\().4s, \in1\().8h, \coef1
|
|
|
|
++ smull \out3\().4s, \in1\().4h, \coef2
|
|
|
|
++ smull2 \out4\().4s, \in1\().8h, \coef2
|
|
|
|
++ smlsl \out1\().4s, \in2\().4h, \coef2
|
|
|
|
++ smlsl2 \out2\().4s, \in2\().8h, \coef2
|
|
|
|
++ smlal \out3\().4s, \in2\().4h, \coef1
|
|
|
|
++ smlal2 \out4\().4s, \in2\().8h, \coef1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
|
|
|
|
++// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
|
|
|
|
++// inout are 2 x .8h registers; if neg > 0, the inout2 sum is negated before rounding
|
|
|
|
++.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
|
|
|
|
++ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
|
|
|
|
++.if \neg > 0
|
|
|
|
++ neg \tmp3\().4s, \tmp3\().4s
|
|
|
|
++ neg \tmp4\().4s, \tmp4\().4s
|
|
|
|
++.endif
|
|
|
|
++ rshrn \inout1\().4h, \tmp1\().4s, #14
|
|
|
|
++ rshrn2 \inout1\().8h, \tmp2\().4s, #14
|
|
|
|
++ rshrn \inout2\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \inout2\().8h, \tmp4\().4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Same as dmbutterfly above, but treating the input in inout2 as zero
|
|
|
|
++.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
|
|
|
|
++ smull \tmp1\().4s, \inout1\().4h, \coef1
|
|
|
|
++ smull2 \tmp2\().4s, \inout1\().8h, \coef1
|
|
|
|
++ smull \tmp3\().4s, \inout1\().4h, \coef2
|
|
|
|
++ smull2 \tmp4\().4s, \inout1\().8h, \coef2
|
|
|
|
++ rshrn \inout1\().4h, \tmp1\().4s, #14
|
|
|
|
++ rshrn2 \inout1\().8h, \tmp2\().4s, #14
|
|
|
|
++ rshrn \inout2\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \inout2\().8h, \tmp4\().4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Same as dmbutterfly above, but treating the input in inout1 as zero
|
|
|
|
++.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
|
|
|
|
++ smull \tmp1\().4s, \inout2\().4h, \coef2
|
|
|
|
++ smull2 \tmp2\().4s, \inout2\().8h, \coef2
|
|
|
|
++ smull \tmp3\().4s, \inout2\().4h, \coef1
|
|
|
|
++ smull2 \tmp4\().4s, \inout2\().8h, \coef1
|
|
|
|
++ neg \tmp1\().4s, \tmp1\().4s
|
|
|
|
++ neg \tmp2\().4s, \tmp2\().4s
|
|
|
|
++ rshrn \inout2\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \inout2\().8h, \tmp4\().4s, #14
|
|
|
|
++ rshrn \inout1\().4h, \tmp1\().4s, #14
|
|
|
|
++ rshrn2 \inout1\().8h, \tmp2\().4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro dsmull_h out1, out2, in, coef
|
|
|
|
++ smull \out1\().4s, \in\().4h, \coef
|
|
|
|
++ smull2 \out2\().4s, \in\().8h, \coef
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro drshrn_h out, in1, in2, shift
|
|
|
|
++ rshrn \out\().4h, \in1\().4s, \shift
|
|
|
|
++ rshrn2 \out\().8h, \in2\().4s, \shift
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// out1 = in1 + in2
|
|
|
|
++// out2 = in1 - in2
|
|
|
|
++.macro butterfly_8h out1, out2, in1, in2
|
|
|
|
++ add \out1\().8h, \in1\().8h, \in2\().8h
|
|
|
|
++ sub \out2\().8h, \in1\().8h, \in2\().8h
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1 = in1 - in2
|
|
|
|
++// out2 = in1 + in2
|
|
|
|
++.macro butterfly_8h_r out1, out2, in1, in2
|
|
|
|
++ sub \out1\().8h, \in1\().8h, \in2\().8h
|
|
|
|
++ add \out2\().8h, \in1\().8h, \in2\().8h
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
|
|
|
|
++// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
|
|
|
|
++// out are 2 x .8h registers, in are 4 x .4s registers
|
|
|
|
++.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
|
|
|
|
++ add \tmp1\().4s, \in1\().4s, \in3\().4s
|
|
|
|
++ add \tmp2\().4s, \in2\().4s, \in4\().4s
|
|
|
|
++ sub \tmp3\().4s, \in1\().4s, \in3\().4s
|
|
|
|
++ sub \tmp4\().4s, \in2\().4s, \in4\().4s
|
|
|
|
++ rshrn \out1\().4h, \tmp1\().4s, #14
|
|
|
|
++ rshrn2 \out1\().8h, \tmp2\().4s, #14
|
|
|
|
++ rshrn \out2\().4h, \tmp3\().4s, #14
|
|
|
|
++ rshrn2 \out2\().8h, \tmp4\().4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iwht4 c0, c1, c2, c3
|
|
|
|
++ add \c0\().4h, \c0\().4h, \c1\().4h
|
|
|
|
++ sub v17.4h, \c2\().4h, \c3\().4h
|
|
|
|
++ sub v16.4h, \c0\().4h, v17.4h
|
|
|
|
++ sshr v16.4h, v16.4h, #1
|
|
|
|
++ sub \c2\().4h, v16.4h, \c1\().4h
|
|
|
|
++ sub \c1\().4h, v16.4h, \c3\().4h
|
|
|
|
++ add \c3\().4h, v17.4h, \c2\().4h
|
|
|
|
++ sub \c0\().4h, \c0\().4h, \c1\().4h
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro idct4 c0, c1, c2, c3
|
|
|
|
++ smull v22.4s, \c1\().4h, v0.h[3]
|
|
|
|
++ smull v20.4s, \c1\().4h, v0.h[2]
|
|
|
|
++ add v16.4h, \c0\().4h, \c2\().4h
|
|
|
|
++ sub v17.4h, \c0\().4h, \c2\().4h
|
|
|
|
++ smlal v22.4s, \c3\().4h, v0.h[2]
|
|
|
|
++ smull v18.4s, v16.4h, v0.h[0]
|
|
|
|
++ smull v19.4s, v17.4h, v0.h[0]
|
|
|
|
++ smlsl v20.4s, \c3\().4h, v0.h[3]
|
|
|
|
++ rshrn v22.4h, v22.4s, #14
|
|
|
|
++ rshrn v18.4h, v18.4s, #14
|
|
|
|
++ rshrn v19.4h, v19.4s, #14
|
|
|
|
++ rshrn v20.4h, v20.4s, #14
|
|
|
|
++ add \c0\().4h, v18.4h, v22.4h
|
|
|
|
++ sub \c3\().4h, v18.4h, v22.4h
|
|
|
|
++ add \c1\().4h, v19.4h, v20.4h
|
|
|
|
++ sub \c2\().4h, v19.4h, v20.4h
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iadst4 c0, c1, c2, c3
|
|
|
|
++ smull v16.4s, \c0\().4h, v0.h[4]
|
|
|
|
++ smlal v16.4s, \c2\().4h, v0.h[5]
|
|
|
|
++ smlal v16.4s, \c3\().4h, v0.h[6]
|
|
|
|
++ smull v17.4s, \c0\().4h, v0.h[6]
|
|
|
|
++ smlsl v17.4s, \c2\().4h, v0.h[4]
|
|
|
|
++ sub \c0\().4h, \c0\().4h, \c2\().4h
|
|
|
|
++ smlsl v17.4s, \c3\().4h, v0.h[5]
|
|
|
|
++ add \c0\().4h, \c0\().4h, \c3\().4h
|
|
|
|
++ smull v19.4s, \c1\().4h, v0.h[7]
|
|
|
|
++ smull v18.4s, \c0\().4h, v0.h[7]
|
|
|
|
++ add v20.4s, v16.4s, v19.4s
|
|
|
|
++ add v21.4s, v17.4s, v19.4s
|
|
|
|
++ rshrn \c0\().4h, v20.4s, #14
|
|
|
|
++ add v16.4s, v16.4s, v17.4s
|
|
|
|
++ rshrn \c1\().4h, v21.4s, #14
|
|
|
|
++ sub v16.4s, v16.4s, v19.4s
|
|
|
|
++ rshrn \c2\().4h, v18.4s, #14
|
|
|
|
++ rshrn \c3\().4h, v16.4s, #14
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// The public functions in this file have got the following signature:
|
|
|
|
++// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
|
|
|
++
|
|
|
|
++.macro itxfm_func4x4 txfm1, txfm2
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
|
|
|
|
++.ifc \txfm1,\txfm2
|
|
|
|
++.ifc \txfm1,idct
|
|
|
|
++ movrel x4, itxfm4_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4]
|
|
|
|
++.endif
|
|
|
|
++.ifc \txfm1,iadst
|
|
|
|
++ movrel x4, iadst4_coeffs
|
|
|
|
++ ld1 {v0.d}[1], [x4]
|
|
|
|
++.endif
|
|
|
|
++.else
|
|
|
|
++ movrel x4, itxfm4_coeffs
|
|
|
|
++ ld1 {v0.8h}, [x4]
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ movi v31.8h, #0
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #1
|
|
|
|
++ b.ne 1f
|
|
|
|
++ // DC-only for idct/idct
|
|
|
|
++ ld1 {v2.h}[0], [x2]
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ st1 {v31.h}[0], [x2]
|
|
|
|
++ dup v4.4h, v2.h[0]
|
|
|
|
++ mov v5.16b, v4.16b
|
|
|
|
++ mov v6.16b, v4.16b
|
|
|
|
++ mov v7.16b, v4.16b
|
|
|
|
++ b 2f
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
|
|
|
|
++ st1 {v31.8h}, [x2], #16
|
|
|
|
++
|
|
|
|
++.ifc \txfm1,iwht
|
|
|
|
++ sshr v4.4h, v4.4h, #2
|
|
|
|
++ sshr v5.4h, v5.4h, #2
|
|
|
|
++ sshr v6.4h, v6.4h, #2
|
|
|
|
++ sshr v7.4h, v7.4h, #2
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ \txfm1\()4 v4, v5, v6, v7
|
|
|
|
++
|
|
|
|
++ st1 {v31.8h}, [x2], #16
|
|
|
|
++ // Transpose 4x4 with 16 bit elements
|
|
|
|
++ transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19
|
|
|
|
++
|
|
|
|
++ \txfm2\()4 v4, v5, v6, v7
|
|
|
|
++2:
|
|
|
|
++ ld1 {v0.s}[0], [x0], x1
|
|
|
|
++ ld1 {v1.s}[0], [x0], x1
|
|
|
|
++.ifnc \txfm1,iwht
|
|
|
|
++ srshr v4.4h, v4.4h, #4
|
|
|
|
++ srshr v5.4h, v5.4h, #4
|
|
|
|
++ srshr v6.4h, v6.4h, #4
|
|
|
|
++ srshr v7.4h, v7.4h, #4
|
|
|
|
++.endif
|
|
|
|
++ uaddw v4.8h, v4.8h, v0.8b
|
|
|
|
++ uaddw v5.8h, v5.8h, v1.8b
|
|
|
|
++ ld1 {v2.s}[0], [x0], x1
|
|
|
|
++ ld1 {v3.s}[0], [x0], x1
|
|
|
|
++ sqxtun v0.8b, v4.8h
|
|
|
|
++ sqxtun v1.8b, v5.8h
|
|
|
|
++ sub x0, x0, x1, lsl #2
|
|
|
|
++
|
|
|
|
++ uaddw v6.8h, v6.8h, v2.8b
|
|
|
|
++ uaddw v7.8h, v7.8h, v3.8b
|
|
|
|
++ st1 {v0.s}[0], [x0], x1
|
|
|
|
++ sqxtun v2.8b, v6.8h
|
|
|
|
++ sqxtun v3.8b, v7.8h
|
|
|
|
++
|
|
|
|
++ st1 {v1.s}[0], [x0], x1
|
|
|
|
++ st1 {v2.s}[0], [x0], x1
|
|
|
|
++ st1 {v3.s}[0], [x0], x1
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm_func4x4 idct, idct
|
|
|
|
++itxfm_func4x4 iadst, idct
|
|
|
|
++itxfm_func4x4 idct, iadst
|
|
|
|
++itxfm_func4x4 iadst, iadst
|
|
|
|
++itxfm_func4x4 iwht, iwht
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro idct8
|
|
|
|
++ dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
|
|
|
|
++ dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
|
|
|
|
++ dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
|
|
|
|
++ dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
|
|
|
|
++ butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
|
|
|
|
++ butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a
|
|
|
|
++ butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
|
|
|
|
++
|
|
|
|
++ butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
|
|
|
|
++ butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
|
|
|
|
++ butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
|
|
|
|
++ butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro iadst8
|
|
|
|
++ dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
|
|
|
|
++ dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
|
|
|
|
++ dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
|
|
|
|
++ dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a
|
|
|
|
++
|
|
|
|
++ dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
|
|
|
|
++ dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
|
|
|
|
++ dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
|
|
|
|
++ dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7
|
|
|
|
++
|
|
|
|
++ butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2
|
|
|
|
++ butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3
|
|
|
|
++ neg v23.8h, v23.8h // v23 = out[7]
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
|
|
|
|
++ neg v19.8h, v19.8h // v19 = out[3]
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
|
|
|
|
++ dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a
|
|
|
|
++
|
|
|
|
++ dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
|
|
|
|
++ dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
|
|
|
|
++ neg v17.8h, v17.8h // v17 = out[1]
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
|
|
|
|
++ neg v21.8h, v21.8h // v21 = out[5]
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro itxfm_func8x8 txfm1, txfm2
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
|
|
|
|
++ // The iadst also uses a few coefficients from
|
|
|
|
++ // idct, so those always need to be loaded.
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++.else
|
|
|
|
++ movrel x4, iadst8_coeffs
|
|
|
|
++ ld1 {v1.8h}, [x4], #16
|
|
|
|
++.endif
|
|
|
|
++ ld1 {v0.8h}, [x4]
|
|
|
|
++
|
|
|
|
++ movi v2.8h, #0
|
|
|
|
++ movi v3.8h, #0
|
|
|
|
++ movi v4.8h, #0
|
|
|
|
++ movi v5.8h, #0
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #1
|
|
|
|
++ b.ne 1f
|
|
|
|
++ // DC-only for idct/idct
|
|
|
|
++ ld1 {v2.h}[0], [x2]
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ st1 {v3.h}[0], [x2]
|
|
|
|
++ dup v16.8h, v2.h[0]
|
|
|
|
++ mov v17.16b, v16.16b
|
|
|
|
++ mov v18.16b, v16.16b
|
|
|
|
++ mov v19.16b, v16.16b
|
|
|
|
++ mov v20.16b, v16.16b
|
|
|
|
++ mov v21.16b, v16.16b
|
|
|
|
++ mov v22.16b, v16.16b
|
|
|
|
++ mov v23.16b, v16.16b
|
|
|
|
++ b 2f
|
|
|
|
++.endif
|
|
|
|
++1:
|
|
|
|
++ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
|
|
|
|
++ ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
|
|
|
|
++ sub x2, x2, #128
|
|
|
|
++ st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
|
|
|
|
++ st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
|
|
|
|
++
|
|
|
|
++ \txfm1\()8
|
|
|
|
++
|
|
|
|
++ // Transpose 8x8 with 16 bit elements
|
|
|
|
++ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
|
|
|
|
++
|
|
|
|
++ \txfm2\()8
|
|
|
|
++2:
|
|
|
|
++ mov x3, x0
|
|
|
|
++ // Add into the destination
|
|
|
|
++ ld1 {v0.8b}, [x0], x1
|
|
|
|
++ srshr v16.8h, v16.8h, #5
|
|
|
|
++ ld1 {v1.8b}, [x0], x1
|
|
|
|
++ srshr v17.8h, v17.8h, #5
|
|
|
|
++ ld1 {v2.8b}, [x0], x1
|
|
|
|
++ srshr v18.8h, v18.8h, #5
|
|
|
|
++ uaddw v16.8h, v16.8h, v0.8b
|
|
|
|
++ ld1 {v3.8b}, [x0], x1
|
|
|
|
++ srshr v19.8h, v19.8h, #5
|
|
|
|
++ uaddw v17.8h, v17.8h, v1.8b
|
|
|
|
++ ld1 {v4.8b}, [x0], x1
|
|
|
|
++ srshr v20.8h, v20.8h, #5
|
|
|
|
++ uaddw v18.8h, v18.8h, v2.8b
|
|
|
|
++ sqxtun v0.8b, v16.8h
|
|
|
|
++ ld1 {v5.8b}, [x0], x1
|
|
|
|
++ srshr v21.8h, v21.8h, #5
|
|
|
|
++ uaddw v19.8h, v19.8h, v3.8b
|
|
|
|
++ sqxtun v1.8b, v17.8h
|
|
|
|
++ ld1 {v6.8b}, [x0], x1
|
|
|
|
++ srshr v22.8h, v22.8h, #5
|
|
|
|
++ uaddw v20.8h, v20.8h, v4.8b
|
|
|
|
++ sqxtun v2.8b, v18.8h
|
|
|
|
++ ld1 {v7.8b}, [x0], x1
|
|
|
|
++ srshr v23.8h, v23.8h, #5
|
|
|
|
++ uaddw v21.8h, v21.8h, v5.8b
|
|
|
|
++ sqxtun v3.8b, v19.8h
|
|
|
|
++
|
|
|
|
++ st1 {v0.8b}, [x3], x1
|
|
|
|
++ uaddw v22.8h, v22.8h, v6.8b
|
|
|
|
++ st1 {v1.8b}, [x3], x1
|
|
|
|
++ sqxtun v4.8b, v20.8h
|
|
|
|
++ st1 {v2.8b}, [x3], x1
|
|
|
|
++ uaddw v23.8h, v23.8h, v7.8b
|
|
|
|
++ st1 {v3.8b}, [x3], x1
|
|
|
|
++ sqxtun v5.8b, v21.8h
|
|
|
|
++ st1 {v4.8b}, [x3], x1
|
|
|
|
++ sqxtun v6.8b, v22.8h
|
|
|
|
++ st1 {v5.8b}, [x3], x1
|
|
|
|
++ sqxtun v7.8b, v23.8h
|
|
|
|
++
|
|
|
|
++ st1 {v6.8b}, [x3], x1
|
|
|
|
++ st1 {v7.8b}, [x3], x1
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm_func8x8 idct, idct
|
|
|
|
++itxfm_func8x8 iadst, idct
|
|
|
|
++itxfm_func8x8 idct, iadst
|
|
|
|
++itxfm_func8x8 iadst, iadst
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++function idct16x16_dc_add_neon
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4]
|
|
|
|
++
|
|
|
|
++ movi v1.4h, #0
|
|
|
|
++
|
|
|
|
++ ld1 {v2.h}[0], [x2]
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ dup v2.8h, v2.h[0]
|
|
|
|
++ st1 {v1.h}[0], [x2]
|
|
|
|
++
|
|
|
|
++ srshr v2.8h, v2.8h, #6
|
|
|
|
++
|
|
|
|
++ mov x3, x0
|
|
|
|
++ mov x4, #16
|
|
|
|
++1:
|
|
|
|
++ // Loop to add the constant from v2 into all 16x16 outputs
|
|
|
|
++ subs x4, x4, #2
|
|
|
|
++ ld1 {v3.16b}, [x0], x1
|
|
|
|
++ ld1 {v4.16b}, [x0], x1
|
|
|
|
++ uaddw v16.8h, v2.8h, v3.8b
|
|
|
|
++ uaddw2 v17.8h, v2.8h, v3.16b
|
|
|
|
++ uaddw v18.8h, v2.8h, v4.8b
|
|
|
|
++ uaddw2 v19.8h, v2.8h, v4.16b
|
|
|
|
++ sqxtun v3.8b, v16.8h
|
|
|
|
++ sqxtun2 v3.16b, v17.8h
|
|
|
|
++ sqxtun v4.8b, v18.8h
|
|
|
|
++ sqxtun2 v4.16b, v19.8h
|
|
|
|
++ st1 {v3.16b}, [x3], x1
|
|
|
|
++ st1 {v4.16b}, [x3], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct16_end
|
|
|
|
++ butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
|
|
|
|
++ butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
|
|
|
|
++ butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
|
|
|
|
++ butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4
|
|
|
|
++ butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a
|
|
|
|
++ butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10
|
|
|
|
++ butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13
|
|
|
|
++ butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
|
|
|
|
++ dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
|
|
|
|
++
|
|
|
|
++ butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
|
|
|
|
++ butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
|
|
|
|
++ butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
|
|
|
|
++ butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
|
|
|
|
++ butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
|
|
|
|
++ butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
|
|
|
|
++ butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
|
|
|
|
++ butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function idct16
|
|
|
|
++ dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
|
|
|
|
++ dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
|
|
|
|
++ dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
|
|
|
|
++ dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
|
|
|
|
++ dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
|
|
|
|
++ dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
|
|
|
|
++ dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
|
|
|
|
++ dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
|
|
|
|
++ butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
|
|
|
|
++ butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
|
|
|
|
++ butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
|
|
|
|
++ butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
|
|
|
|
++ butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
|
|
|
|
++ butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
|
|
|
|
++ butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
|
|
|
|
++ dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
|
|
|
|
++ dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
|
|
|
|
++ idct16_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_half
|
|
|
|
++ dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
|
|
|
|
++ dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
|
|
|
|
++ dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
|
|
|
|
++ dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
|
|
|
|
++ dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
|
|
|
|
++ dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
|
|
|
|
++ dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
|
|
|
|
++ dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
|
|
|
|
++ butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
|
|
|
|
++ butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
|
|
|
|
++ butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
|
|
|
|
++ butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
|
|
|
|
++ butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
|
|
|
|
++ butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
|
|
|
|
++ butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
|
|
|
|
++ dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
|
|
|
|
++ dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
|
|
|
|
++ idct16_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_quarter
|
|
|
|
++ dsmull_h v24, v25, v19, v1.h[7]
|
|
|
|
++ dsmull_h v4, v5, v17, v1.h[0]
|
|
|
|
++ dsmull_h v7, v6, v18, v0.h[5]
|
|
|
|
++ dsmull_h v30, v31, v18, v0.h[4]
|
|
|
|
++ neg v24.4s, v24.4s
|
|
|
|
++ neg v25.4s, v25.4s
|
|
|
|
++ dsmull_h v29, v28, v17, v1.h[1]
|
|
|
|
++ dsmull_h v26, v27, v19, v1.h[6]
|
|
|
|
++ dsmull_h v22, v23, v16, v0.h[0]
|
|
|
|
++ drshrn_h v24, v24, v25, #14
|
|
|
|
++ drshrn_h v16, v4, v5, #14
|
|
|
|
++ drshrn_h v7, v7, v6, #14
|
|
|
|
++ drshrn_h v6, v30, v31, #14
|
|
|
|
++ drshrn_h v29, v29, v28, #14
|
|
|
|
++ drshrn_h v17, v26, v27, #14
|
|
|
|
++ drshrn_h v28, v22, v23, #14
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
|
|
|
|
++ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
|
|
|
|
++ neg v22.4s, v22.4s
|
|
|
|
++ neg v23.4s, v23.4s
|
|
|
|
++ drshrn_h v27, v20, v21, #14
|
|
|
|
++ drshrn_h v21, v22, v23, #14
|
|
|
|
++ drshrn_h v23, v18, v19, #14
|
|
|
|
++ drshrn_h v25, v30, v31, #14
|
|
|
|
++ mov v4.16b, v28.16b
|
|
|
|
++ mov v5.16b, v28.16b
|
|
|
|
++ dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
|
|
|
|
++ mov v20.16b, v28.16b
|
|
|
|
++ idct16_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function iadst16
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x11]
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8
|
|
|
|
++ dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
|
|
|
|
++ dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
|
|
|
|
++ dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10
|
|
|
|
++ dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4
|
|
|
|
++ dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
|
|
|
|
++ dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
|
|
|
|
++ dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6
|
|
|
|
++ dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
|
|
|
|
++ ld1 {v0.8h}, [x10]
|
|
|
|
++ dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
|
|
|
|
++ dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8
|
|
|
|
++ dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13
|
|
|
|
++ dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10
|
|
|
|
++ butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
|
|
|
|
++ dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
|
|
|
|
++ butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
|
|
|
|
++ dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
|
|
|
|
++ dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
|
|
|
|
++
|
|
|
|
++ butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
|
|
|
|
++ butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12
|
|
|
|
++ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15
|
|
|
|
++
|
|
|
|
++ dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
|
|
|
|
++ dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
|
|
|
|
++ neg v29.8h, v29.8h // v29 = out[13]
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a
|
|
|
|
++ dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
|
|
|
|
++ butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10
|
|
|
|
++
|
|
|
|
++ dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
|
|
|
|
++ neg v19.8h, v19.8h // v19 = out[3]
|
|
|
|
++ dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
|
|
|
|
++
|
|
|
|
++ butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a
|
|
|
|
++ butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
|
|
|
|
++ dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
|
|
|
|
++ dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
|
|
|
|
++ dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
|
|
|
|
++
|
|
|
|
++ neg v31.8h, v5.8h // v31 = out[15]
|
|
|
|
++ neg v17.8h, v3.8h // v17 = out[1]
|
|
|
|
++
|
|
|
|
++ mov v16.16b, v2.16b
|
|
|
|
++ mov v30.16b, v4.16b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++// Helper macros; we can't use these expressions directly within
|
|
|
|
++// e.g. .irp due to the extra concatenation \(). Therefore wrap
|
|
|
|
++// them in macros to allow using .irp below.
|
|
|
|
++.macro load i, src, inc
|
|
|
|
++ ld1 {v\i\().8h}, [\src], \inc
|
|
|
|
++.endm
|
|
|
|
++.macro store i, dst, inc
|
|
|
|
++ st1 {v\i\().8h}, [\dst], \inc
|
|
|
|
++.endm
|
|
|
|
++.macro movi_v i, size, imm
|
|
|
|
++ movi v\i\()\size, \imm
|
|
|
|
++.endm
|
|
|
|
++.macro load_clear i, src, inc
|
|
|
|
++ ld1 {v\i\().8h}, [\src]
|
|
|
|
++ st1 {v2.8h}, [\src], \inc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
|
|
|
|
++ srshr \coef0, \coef0, #6
|
|
|
|
++ ld1 {v2.8b}, [x0], x1
|
|
|
|
++ srshr \coef1, \coef1, #6
|
|
|
|
++ ld1 {v3.8b}, [x3], x1
|
|
|
|
++ srshr \coef2, \coef2, #6
|
|
|
|
++ ld1 {v4.8b}, [x0], x1
|
|
|
|
++ srshr \coef3, \coef3, #6
|
|
|
|
++ uaddw \coef0, \coef0, v2.8b
|
|
|
|
++ ld1 {v5.8b}, [x3], x1
|
|
|
|
++ uaddw \coef1, \coef1, v3.8b
|
|
|
|
++ srshr \coef4, \coef4, #6
|
|
|
|
++ ld1 {v6.8b}, [x0], x1
|
|
|
|
++ srshr \coef5, \coef5, #6
|
|
|
|
++ ld1 {v7.8b}, [x3], x1
|
|
|
|
++ sqxtun v2.8b, \coef0
|
|
|
|
++ srshr \coef6, \coef6, #6
|
|
|
|
++ sqxtun v3.8b, \coef1
|
|
|
|
++ srshr \coef7, \coef7, #6
|
|
|
|
++ uaddw \coef2, \coef2, v4.8b
|
|
|
|
++ ld1 {\tmp1}, [x0], x1
|
|
|
|
++ uaddw \coef3, \coef3, v5.8b
|
|
|
|
++ ld1 {\tmp2}, [x3], x1
|
|
|
|
++ sqxtun v4.8b, \coef2
|
|
|
|
++ sub x0, x0, x1, lsl #2
|
|
|
|
++ sub x3, x3, x1, lsl #2
|
|
|
|
++ sqxtun v5.8b, \coef3
|
|
|
|
++ uaddw \coef4, \coef4, v6.8b
|
|
|
|
++ st1 {v2.8b}, [x0], x1
|
|
|
|
++ uaddw \coef5, \coef5, v7.8b
|
|
|
|
++ st1 {v3.8b}, [x3], x1
|
|
|
|
++ sqxtun v6.8b, \coef4
|
|
|
|
++ st1 {v4.8b}, [x0], x1
|
|
|
|
++ sqxtun v7.8b, \coef5
|
|
|
|
++ st1 {v5.8b}, [x3], x1
|
|
|
|
++ uaddw \coef6, \coef6, \tmp1
|
|
|
|
++ st1 {v6.8b}, [x0], x1
|
|
|
|
++ uaddw \coef7, \coef7, \tmp2
|
|
|
|
++ st1 {v7.8b}, [x3], x1
|
|
|
|
++ sqxtun \tmp1, \coef6
|
|
|
|
++ sqxtun \tmp2, \coef7
|
|
|
|
++ st1 {\tmp1}, [x0], x1
|
|
|
|
++ st1 {\tmp2}, [x3], x1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
|
|
|
|
++// transpose into a horizontal 16x8 slice and store.
|
|
|
|
++// x0 = dst (temp buffer)
|
|
|
|
++// x1 = slice offset
|
|
|
|
++// x2 = src
|
|
|
|
++// x9 = input stride
|
|
|
|
++.macro itxfm16_1d_funcs txfm
|
|
|
|
++function \txfm\()16_1d_8x16_pass1_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++
|
|
|
|
++ movi v2.8h, #0
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ bl \txfm\()16
|
|
|
|
++
|
|
|
|
++ // Do two 8x8 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
|
|
|
|
++ // transposed 8x8 blocks.
|
|
|
|
++ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
|
|
|
++ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
|
|
|
++
|
|
|
|
++ // Store the transposed 8x8 blocks horizontally.
|
|
|
|
++ cmp x1, #8
|
|
|
|
++ b.eq 1f
|
|
|
|
++.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
|
|
|
|
++ store \i, x0, #16
|
|
|
|
++.endr
|
|
|
|
++ br x14
|
|
|
|
++1:
|
|
|
|
++ // Special case: For the last input column (x1 == 8),
|
|
|
|
++ // which would be stored as the last row in the temp buffer,
|
|
|
|
++ // don't store the first 8x8 block, but keep it in registers
|
|
|
|
++ // for the first slice of the second pass (where it is the
|
|
|
|
++ // last 8x8 block).
|
|
|
|
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ store \i, x0, #16
|
|
|
|
++.endr
|
|
|
|
++ mov v24.16b, v16.16b
|
|
|
|
++ mov v25.16b, v17.16b
|
|
|
|
++ mov v26.16b, v18.16b
|
|
|
|
++ mov v27.16b, v19.16b
|
|
|
|
++ mov v28.16b, v20.16b
|
|
|
|
++ mov v29.16b, v21.16b
|
|
|
|
++ mov v30.16b, v22.16b
|
|
|
|
++ mov v31.16b, v23.16b
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
|
|
|
|
++// load the destination pixels (from a similar 8x16 slice), add and store back.
|
|
|
|
++// x0 = dst
|
|
|
|
++// x1 = dst stride
|
|
|
|
++// x2 = src (temp buffer)
|
|
|
|
++// x3 = slice offset
|
|
|
|
++// x9 = temp buffer stride
|
|
|
|
++function \txfm\()16_1d_8x16_pass2_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ cbz x3, 1f
|
|
|
|
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ add x3, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ bl \txfm\()16
|
|
|
|
++
|
|
|
|
++ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
|
|
|
|
++ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
|
|
|
|
++
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm16_1d_funcs idct
|
|
|
|
++itxfm16_1d_funcs iadst
|
|
|
|
++
|
|
|
|
++.macro itxfm_func16x16 txfm1, txfm2
|
|
|
|
++function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #1
|
|
|
|
++ b.eq idct16x16_dc_add_neon
|
|
|
|
++.endif
|
|
|
|
++ mov x15, x30
|
|
|
|
++ // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ stp d14, d15, [sp, #-0x10]!
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ sub sp, sp, #512
|
|
|
|
++
|
|
|
|
++ mov x4, x0
|
|
|
|
++ mov x5, x1
|
|
|
|
++ mov x6, x2
|
|
|
|
++
|
|
|
|
++ movrel x10, idct_coeffs
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ movrel x11, iadst16_coeffs
|
|
|
|
++.endif
|
|
|
|
++.ifc \txfm1,idct
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x10]
|
|
|
|
++.endif
|
|
|
|
++ mov x9, #32
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ cmp w3, #10
|
|
|
|
++ b.le idct16x16_quarter_add_neon
|
|
|
|
++ cmp w3, #38
|
|
|
|
++ b.le idct16x16_half_add_neon
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.irp i, 0, 8
|
|
|
|
++ add x0, sp, #(\i*32)
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++.if \i == 8
|
|
|
|
++ cmp w3, #38
|
|
|
|
++ b.le 1f
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ mov x1, #\i
|
|
|
|
++ add x2, x6, #(\i*2)
|
|
|
|
++ bl \txfm1\()16_1d_8x16_pass1_neon
|
|
|
|
++.endr
|
|
|
|
++.ifc \txfm1\()_\txfm2,iadst_idct
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x10]
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.ifc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ b 3f
|
|
|
|
++1:
|
|
|
|
++ // Set v24-v31 to zero, for the in-register passthrough of
|
|
|
|
++ // coefficients to pass 2. Since we only do two slices, this can
|
|
|
|
++ // only ever happen for the second slice. So we only need to store
|
|
|
|
++ // zeros to the temp buffer for the second half of the buffer.
|
|
|
|
++ // Move x0 to the second half, and use x9 == 32 as increment.
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ movi_v \i, .16b, #0
|
|
|
|
++ st1 {v24.8h}, [x0], x9
|
|
|
|
++.endr
|
|
|
|
++3:
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.irp i, 0, 8
|
|
|
|
++ add x0, x4, #(\i)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*2)
|
|
|
|
++ mov x3, #\i
|
|
|
|
++ bl \txfm2\()16_1d_8x16_pass2_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #512
|
|
|
|
++.ifnc \txfm1\()_\txfm2,idct_idct
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++.endif
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++itxfm_func16x16 idct, idct
|
|
|
|
++itxfm_func16x16 iadst, idct
|
|
|
|
++itxfm_func16x16 idct, iadst
|
|
|
|
++itxfm_func16x16 iadst, iadst
|
|
|
|
++
|
|
|
|
++function idct16_1d_8x16_pass1_quarter_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++ movi v2.8h, #0
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ bl idct16_quarter
|
|
|
|
++
|
|
|
|
++ // Do two 8x8 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
|
|
|
|
++ // transposed 8x8 blocks.
|
|
|
|
++ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
|
|
|
++ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
|
|
|
++
|
|
|
|
++ // Store the transposed 8x8 blocks horizontally.
|
|
|
|
++ // The first 8x8 block is kept in registers for the second pass,
|
|
|
|
++ // store the rest in the temp buffer.
|
|
|
|
++ // Since only a 4x4 part of the input was nonzero, this means that
|
|
|
|
++ // only 4 rows are nonzero after transposing, and the second pass
|
|
|
|
++ // only reads the topmost 4 rows. Therefore only store the topmost
|
|
|
|
++ // 4 rows.
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++.irp i, 24, 25, 26, 27
|
|
|
|
++ store \i, x0, x9
|
|
|
|
++.endr
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_1d_8x16_pass2_quarter_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++ cbz x3, 1f
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ add x3, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ bl idct16_quarter
|
|
|
|
++
|
|
|
|
++ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
|
|
|
|
++ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
|
|
|
|
++
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_1d_8x16_pass1_half_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++ movi v2.8h, #0
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ bl idct16_half
|
|
|
|
++
|
|
|
|
++ // Do two 8x8 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
|
|
|
|
++ // transposed 8x8 blocks.
|
|
|
|
++ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
|
|
|
++ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
|
|
|
++
|
|
|
|
++ // Store the transposed 8x8 blocks horizontally.
|
|
|
|
++ // The first 8x8 block is kept in registers for the second pass,
|
|
|
|
++ // store the rest in the temp buffer.
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ store \i, x0, x9
|
|
|
|
++.endr
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct16_1d_8x16_pass2_half_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++ cbz x3, 1f
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ add x3, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ bl idct16_half
|
|
|
|
++
|
|
|
|
++ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
|
|
|
|
++ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
|
|
|
|
++
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct16_partial size
|
|
|
|
++function idct16x16_\size\()_add_neon
|
|
|
|
++ add x0, sp, #(0*32)
|
|
|
|
++ add x2, x6, #(0*2)
|
|
|
|
++ bl idct16_1d_8x16_pass1_\size\()_neon
|
|
|
|
++.irp i, 0, 8
|
|
|
|
++ add x0, x4, #(\i)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*2)
|
|
|
|
++ mov x3, #\i
|
|
|
|
++ bl idct16_1d_8x16_pass2_\size\()_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #512
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++idct16_partial quarter
|
|
|
|
++idct16_partial half
|
|
|
|
++
|
|
|
|
++function idct32x32_dc_add_neon
|
|
|
|
++ movrel x4, idct_coeffs
|
|
|
|
++ ld1 {v0.4h}, [x4]
|
|
|
|
++
|
|
|
|
++ movi v1.4h, #0
|
|
|
|
++
|
|
|
|
++ ld1 {v2.h}[0], [x2]
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ smull v2.4s, v2.4h, v0.h[0]
|
|
|
|
++ rshrn v2.4h, v2.4s, #14
|
|
|
|
++ dup v2.8h, v2.h[0]
|
|
|
|
++ st1 {v1.h}[0], [x2]
|
|
|
|
++
|
|
|
|
++ srshr v0.8h, v2.8h, #6
|
|
|
|
++
|
|
|
|
++ mov x3, x0
|
|
|
|
++ mov x4, #32
|
|
|
|
++1:
|
|
|
|
++ // Loop to add the constant v0 into all 32x32 outputs
|
|
|
|
++ subs x4, x4, #2
|
|
|
|
++ ld1 {v1.16b,v2.16b}, [x0], x1
|
|
|
|
++ uaddw v16.8h, v0.8h, v1.8b
|
|
|
|
++ uaddw2 v17.8h, v0.8h, v1.16b
|
|
|
|
++ ld1 {v3.16b,v4.16b}, [x0], x1
|
|
|
|
++ uaddw v18.8h, v0.8h, v2.8b
|
|
|
|
++ uaddw2 v19.8h, v0.8h, v2.16b
|
|
|
|
++ uaddw v20.8h, v0.8h, v3.8b
|
|
|
|
++ uaddw2 v21.8h, v0.8h, v3.16b
|
|
|
|
++ uaddw v22.8h, v0.8h, v4.8b
|
|
|
|
++ uaddw2 v23.8h, v0.8h, v4.16b
|
|
|
|
++ sqxtun v1.8b, v16.8h
|
|
|
|
++ sqxtun2 v1.16b, v17.8h
|
|
|
|
++ sqxtun v2.8b, v18.8h
|
|
|
|
++ sqxtun2 v2.16b, v19.8h
|
|
|
|
++ sqxtun v3.8b, v20.8h
|
|
|
|
++ sqxtun2 v3.16b, v21.8h
|
|
|
|
++ st1 {v1.16b,v2.16b}, [x3], x1
|
|
|
|
++ sqxtun v4.8b, v22.8h
|
|
|
|
++ sqxtun2 v4.16b, v23.8h
|
|
|
|
++ st1 {v3.16b,v4.16b}, [x3], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct32_end
|
|
|
|
++ butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
|
|
|
|
++ butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
|
|
|
|
++ butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
|
|
|
|
++ butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21
|
|
|
|
++ butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a
|
|
|
|
++ butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26
|
|
|
|
++ butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
|
|
|
|
++ butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29
|
|
|
|
++
|
|
|
|
++ dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
|
|
|
|
++ dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28
|
|
|
|
++ dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
|
|
|
|
++ dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
|
|
|
|
++ butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
|
|
|
|
++ butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16
|
|
|
|
++ butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
|
|
|
|
++ butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21
|
|
|
|
++ butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
|
|
|
|
++ butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26
|
|
|
|
++ butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20
|
|
|
|
++
|
|
|
|
++ dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
|
|
|
|
++ dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
|
|
|
|
++ dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
|
|
|
|
++ dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
|
|
|
|
++ ret
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function idct32_odd
|
|
|
|
++ dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
|
|
|
|
++ dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
|
|
|
|
++ dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
|
|
|
|
++ dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
|
|
|
|
++ dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
|
|
|
|
++ dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
|
|
|
|
++ dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
|
|
|
|
++ dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
|
|
|
|
++ butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
|
|
|
|
++ butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
|
|
|
|
++ butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
|
|
|
|
++ butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
|
|
|
|
++ butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
|
|
|
|
++ butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
|
|
|
|
++ butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
|
|
|
|
++
|
|
|
|
++ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
|
|
|
|
++ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
|
|
|
|
++ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
|
|
|
|
++ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
|
|
|
|
++ idct32_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct32_odd_half
|
|
|
|
++ dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
|
|
|
|
++ dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
|
|
|
|
++ dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
|
|
|
|
++ dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
|
|
|
|
++ dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
|
|
|
|
++ dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
|
|
|
|
++ dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
|
|
|
|
++ dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
|
|
|
|
++
|
|
|
|
++ butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
|
|
|
|
++ butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
|
|
|
|
++ butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
|
|
|
|
++ butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
|
|
|
|
++ butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
|
|
|
|
++ butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
|
|
|
|
++ butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
|
|
|
|
++ butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
|
|
|
|
++
|
|
|
|
++ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
|
|
|
|
++ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
|
|
|
|
++ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
|
|
|
|
++ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
|
|
|
|
++ idct32_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function idct32_odd_quarter
|
|
|
|
++ dsmull_h v4, v5, v16, v8.h[0]
|
|
|
|
++ dsmull_h v28, v29, v19, v8.h[7]
|
|
|
|
++ dsmull_h v30, v31, v16, v8.h[1]
|
|
|
|
++ dsmull_h v22, v23, v17, v9.h[6]
|
|
|
|
++ dsmull_h v7, v6, v17, v9.h[7]
|
|
|
|
++ dsmull_h v26, v27, v19, v8.h[6]
|
|
|
|
++ dsmull_h v20, v21, v18, v9.h[0]
|
|
|
|
++ dsmull_h v24, v25, v18, v9.h[1]
|
|
|
|
++
|
|
|
|
++ neg v28.4s, v28.4s
|
|
|
|
++ neg v29.4s, v29.4s
|
|
|
|
++ neg v7.4s, v7.4s
|
|
|
|
++ neg v6.4s, v6.4s
|
|
|
|
++
|
|
|
|
++ drshrn_h v4, v4, v5, #14
|
|
|
|
++ drshrn_h v5, v28, v29, #14
|
|
|
|
++ drshrn_h v29, v30, v31, #14
|
|
|
|
++ drshrn_h v28, v22, v23, #14
|
|
|
|
++ drshrn_h v7, v7, v6, #14
|
|
|
|
++ drshrn_h v31, v26, v27, #14
|
|
|
|
++ drshrn_h v6, v20, v21, #14
|
|
|
|
++ drshrn_h v30, v24, v25, #14
|
|
|
|
++
|
|
|
|
++ dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
|
|
|
|
++ dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
|
|
|
|
++ drshrn_h v23, v16, v17, #14
|
|
|
|
++ drshrn_h v24, v18, v19, #14
|
|
|
|
++ neg v20.4s, v20.4s
|
|
|
|
++ neg v21.4s, v21.4s
|
|
|
|
++ drshrn_h v27, v27, v26, #14
|
|
|
|
++ drshrn_h v20, v20, v21, #14
|
|
|
|
++ dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
|
|
|
|
++ drshrn_h v21, v16, v17, #14
|
|
|
|
++ drshrn_h v26, v18, v19, #14
|
|
|
|
++ dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
|
|
|
|
++ drshrn_h v25, v16, v17, #14
|
|
|
|
++ neg v18.4s, v18.4s
|
|
|
|
++ neg v19.4s, v19.4s
|
|
|
|
++ drshrn_h v22, v18, v19, #14
|
|
|
|
++
|
|
|
|
++ idct32_end
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct32_funcs suffix
|
|
|
|
++// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
|
|
|
|
++// The 32-point IDCT can be decomposed into two 16-point IDCTs;
|
|
|
|
++// a normal IDCT16 with every other input component (the even ones, with
|
|
|
|
++// each output written twice), followed by a separate 16-point IDCT
|
|
|
|
++// of the odd inputs, added/subtracted onto the outputs of the first idct16.
|
|
|
|
++// x0 = dst (temp buffer)
|
|
|
|
++// x1 = unused
|
|
|
|
++// x2 = src
|
|
|
|
++// x9 = double input stride
|
|
|
|
++function idct32_1d_8x32_pass1\suffix\()_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++ movi v2.8h, #0
|
|
|
|
++
|
|
|
|
++ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ bl idct16\suffix
|
|
|
|
++
|
|
|
|
++ // Do two 8x8 transposes. Originally, v16-v31 contain the
|
|
|
|
++ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
|
|
|
|
++ // two transposed 8x8 blocks.
|
|
|
|
++ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
|
|
|
++ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
|
|
|
++
|
|
|
|
++ // Store the registers a, b horizontally, followed by the
|
|
|
|
++ // same registers b, a mirrored.
|
|
|
|
++.macro store_rev a, b
|
|
|
|
++ // There's no rev128 instruction, so we reverse each 64 bit
|
|
|
|
++ // half, and then flip them using an ext with 8 bytes offset.
|
|
|
|
++ rev64 v3.8h, \b
|
|
|
|
++ st1 {\a}, [x0], #16
|
|
|
|
++ rev64 v2.8h, \a
|
|
|
|
++ ext v3.16b, v3.16b, v3.16b, #8
|
|
|
|
++ st1 {\b}, [x0], #16
|
|
|
|
++ ext v2.16b, v2.16b, v2.16b, #8
|
|
|
|
++ st1 {v3.8h}, [x0], #16
|
|
|
|
++ st1 {v2.8h}, [x0], #16
|
|
|
|
++.endm
|
|
|
|
++ store_rev v16.8h, v24.8h
|
|
|
|
++ store_rev v17.8h, v25.8h
|
|
|
|
++ store_rev v18.8h, v26.8h
|
|
|
|
++ store_rev v19.8h, v27.8h
|
|
|
|
++ store_rev v20.8h, v28.8h
|
|
|
|
++ store_rev v21.8h, v29.8h
|
|
|
|
++ store_rev v22.8h, v30.8h
|
|
|
|
++ store_rev v23.8h, v31.8h
|
|
|
|
++ sub x0, x0, #512
|
|
|
|
++.purgem store_rev
|
|
|
|
++
|
|
|
|
++ // Move x2 back to the start of the input, and move
|
|
|
|
++ // to the first odd row
|
|
|
|
++.ifb \suffix
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++ sub x2, x2, x9, lsl #2
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++ sub x2, x2, x9, lsl #3
|
|
|
|
++.endif
|
|
|
|
++ add x2, x2, #64
|
|
|
|
++
|
|
|
|
++ movi v2.8h, #0
|
|
|
|
++ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load_clear \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ bl idct32_odd\suffix
|
|
|
|
++
|
|
|
|
++ transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
|
|
|
|
++ transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
|
|
|
|
++
|
|
|
|
++ // Store the registers a, b horizontally,
|
|
|
|
++ // adding into the output first, and then mirrored,
|
|
|
|
++ // subtracted from the output.
|
|
|
|
++.macro store_rev a, b
|
|
|
|
++ ld1 {v4.8h}, [x0]
|
|
|
|
++ rev64 v3.8h, \b
|
|
|
|
++ add v4.8h, v4.8h, \a
|
|
|
|
++ rev64 v2.8h, \a
|
|
|
|
++ st1 {v4.8h}, [x0], #16
|
|
|
|
++ ext v3.16b, v3.16b, v3.16b, #8
|
|
|
|
++ ld1 {v5.8h}, [x0]
|
|
|
|
++ ext v2.16b, v2.16b, v2.16b, #8
|
|
|
|
++ add v5.8h, v5.8h, \b
|
|
|
|
++ st1 {v5.8h}, [x0], #16
|
|
|
|
++ ld1 {v6.8h}, [x0]
|
|
|
|
++ sub v6.8h, v6.8h, v3.8h
|
|
|
|
++ st1 {v6.8h}, [x0], #16
|
|
|
|
++ ld1 {v7.8h}, [x0]
|
|
|
|
++ sub v7.8h, v7.8h, v2.8h
|
|
|
|
++ st1 {v7.8h}, [x0], #16
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++ store_rev v31.8h, v23.8h
|
|
|
|
++ store_rev v30.8h, v22.8h
|
|
|
|
++ store_rev v29.8h, v21.8h
|
|
|
|
++ store_rev v28.8h, v20.8h
|
|
|
|
++ store_rev v27.8h, v19.8h
|
|
|
|
++ store_rev v26.8h, v18.8h
|
|
|
|
++ store_rev v25.8h, v17.8h
|
|
|
|
++ store_rev v24.8h, v16.8h
|
|
|
|
++.purgem store_rev
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++// This is mostly the same as 8x32_pass1, but without the transpose,
|
|
|
|
++// and uses the source as temp buffer between the two idct passes, and
|
|
|
|
++// adds into the destination.
|
|
|
|
++// x0 = dst
|
|
|
|
++// x1 = dst stride
|
|
|
|
++// x2 = src (temp buffer)
|
|
|
|
++// x7 = negative double temp buffer stride
|
|
|
|
++// x9 = double temp buffer stride
|
|
|
|
++function idct32_1d_8x32_pass2\suffix\()_neon
|
|
|
|
++ mov x14, x30
|
|
|
|
++ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #2
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #3
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ bl idct16\suffix
|
|
|
|
++
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ store \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++ add x2, x2, #64
|
|
|
|
++
|
|
|
|
++ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
|
|
|
|
++.ifb \suffix
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #4
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_quarter
|
|
|
|
++.irp i, 16, 17, 18, 19
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #2
|
|
|
|
++.endif
|
|
|
|
++.ifc \suffix,_half
|
|
|
|
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
|
|
|
++ load \i, x2, x9
|
|
|
|
++.endr
|
|
|
|
++ sub x2, x2, x9, lsl #3
|
|
|
|
++.endif
|
|
|
|
++ sub x2, x2, #64
|
|
|
|
++
|
|
|
|
++ bl idct32_odd\suffix
|
|
|
|
++
|
|
|
|
++.macro load_acc_store a, b, c, d, neg=0
|
|
|
|
++.if \neg == 0
|
|
|
|
++ ld1 {v4.8h}, [x2], x9
|
|
|
|
++ ld1 {v5.8h}, [x2], x9
|
|
|
|
++ add v4.8h, v4.8h, \a
|
|
|
|
++ ld1 {v6.8h}, [x2], x9
|
|
|
|
++ add v5.8h, v5.8h, \b
|
|
|
|
++ ld1 {v7.8h}, [x2], x9
|
|
|
|
++ add v6.8h, v6.8h, \c
|
|
|
|
++ add v7.8h, v7.8h, \d
|
|
|
|
++.else
|
|
|
|
++ ld1 {v4.8h}, [x2], x7
|
|
|
|
++ ld1 {v5.8h}, [x2], x7
|
|
|
|
++ sub v4.8h, v4.8h, \a
|
|
|
|
++ ld1 {v6.8h}, [x2], x7
|
|
|
|
++ sub v5.8h, v5.8h, \b
|
|
|
|
++ ld1 {v7.8h}, [x2], x7
|
|
|
|
++ sub v6.8h, v6.8h, \c
|
|
|
|
++ sub v7.8h, v7.8h, \d
|
|
|
|
++.endif
|
|
|
|
++ ld1 {v10.8b}, [x0], x1
|
|
|
|
++ ld1 {v11.8b}, [x0], x1
|
|
|
|
++ srshr v4.8h, v4.8h, #6
|
|
|
|
++ ld1 {v2.8b}, [x0], x1
|
|
|
|
++ srshr v5.8h, v5.8h, #6
|
|
|
|
++ uaddw v4.8h, v4.8h, v10.8b
|
|
|
|
++ ld1 {v3.8b}, [x0], x1
|
|
|
|
++ srshr v6.8h, v6.8h, #6
|
|
|
|
++ uaddw v5.8h, v5.8h, v11.8b
|
|
|
|
++ srshr v7.8h, v7.8h, #6
|
|
|
|
++ sub x0, x0, x1, lsl #2
|
|
|
|
++ uaddw v6.8h, v6.8h, v2.8b
|
|
|
|
++ sqxtun v4.8b, v4.8h
|
|
|
|
++ uaddw v7.8h, v7.8h, v3.8b
|
|
|
|
++ sqxtun v5.8b, v5.8h
|
|
|
|
++ st1 {v4.8b}, [x0], x1
|
|
|
|
++ sqxtun v6.8b, v6.8h
|
|
|
|
++ st1 {v5.8b}, [x0], x1
|
|
|
|
++ sqxtun v7.8b, v7.8h
|
|
|
|
++ st1 {v6.8b}, [x0], x1
|
|
|
|
++ st1 {v7.8b}, [x0], x1
|
|
|
|
++.endm
|
|
|
|
++ load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
|
|
|
|
++ load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
|
|
|
|
++ load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
|
|
|
|
++ load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
|
|
|
|
++ sub x2, x2, x9
|
|
|
|
++ load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
|
|
|
|
++ load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
|
|
|
|
++ load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
|
|
|
|
++ load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
|
|
|
|
++.purgem load_acc_store
|
|
|
|
++ br x14
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++idct32_funcs
|
|
|
|
++idct32_funcs _quarter
|
|
|
|
++idct32_funcs _half
|
|
|
|
++
|
|
|
|
++const min_eob_idct_idct_32, align=4
|
|
|
|
++ .short 0, 34, 135, 336
|
|
|
|
++endconst
|
|
|
|
++
|
|
|
|
++function ff_vp9_idct_idct_32x32_add_neon, export=1
|
|
|
|
++ cmp w3, #1
|
|
|
|
++ b.eq idct32x32_dc_add_neon
|
|
|
|
++
|
|
|
|
++ movrel x10, idct_coeffs
|
|
|
|
++
|
|
|
|
++ mov x15, x30
|
|
|
|
++
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++
|
|
|
|
++ sub sp, sp, #2048
|
|
|
|
++
|
|
|
|
++ mov x4, x0
|
|
|
|
++ mov x5, x1
|
|
|
|
++ mov x6, x2
|
|
|
|
++
|
|
|
|
++ // Double stride of the input, since we only read every other line
|
|
|
|
++ mov x9, #128
|
|
|
|
++ neg x7, x9
|
|
|
|
++
|
|
|
|
++ ld1 {v0.8h,v1.8h}, [x10], #32
|
|
|
|
++ ld1 {v8.8h,v9.8h}, [x10]
|
|
|
|
++
|
|
|
|
++ cmp w3, #34
|
|
|
|
++ b.le idct32x32_quarter_add_neon
|
|
|
|
++ cmp w3, #135
|
|
|
|
++ b.le idct32x32_half_add_neon
|
|
|
|
++
|
|
|
|
++ movrel x12, min_eob_idct_idct_32, 2
|
|
|
|
++
|
|
|
|
++.irp i, 0, 8, 16, 24
|
|
|
|
++ add x0, sp, #(\i*64)
|
|
|
|
++.if \i > 0
|
|
|
|
++ ldrh w1, [x12], #2
|
|
|
|
++ cmp w3, w1
|
|
|
|
++ mov x1, #(32 - \i)/4
|
|
|
|
++ b.le 1f
|
|
|
|
++.endif
|
|
|
|
++ add x2, x6, #(\i*2)
|
|
|
|
++ bl idct32_1d_8x32_pass1_neon
|
|
|
|
++.endr
|
|
|
|
++ b 3f
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ // Write zeros to the temp buffer for pass 2
|
|
|
|
++ movi v16.8h, #0
|
|
|
|
++ movi v17.8h, #0
|
|
|
|
++ movi v18.8h, #0
|
|
|
|
++ movi v19.8h, #0
|
|
|
|
++2:
|
|
|
|
++ subs x1, x1, #1
|
|
|
|
++.rept 4
|
|
|
|
++ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
|
|
|
|
++.endr
|
|
|
|
++ b.ne 2b
|
|
|
|
++3:
|
|
|
|
++.irp i, 0, 8, 16, 24
|
|
|
|
++ add x0, x4, #(\i)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*2)
|
|
|
|
++ bl idct32_1d_8x32_pass2_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #2048
|
|
|
|
++
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro idct32_partial size
|
|
|
|
++function idct32x32_\size\()_add_neon
|
|
|
|
++ add x0, sp, #(0*64)
|
|
|
|
++ add x2, x6, #(0*2)
|
|
|
|
++ bl idct32_1d_8x32_pass1_\size\()_neon
|
|
|
|
++.ifc \size,half
|
|
|
|
++ add x0, sp, #(8*64)
|
|
|
|
++ add x2, x6, #(8*2)
|
|
|
|
++ bl idct32_1d_8x32_pass1_\size\()_neon
|
|
|
|
++.endif
|
|
|
|
++.irp i, 0, 8, 16, 24
|
|
|
|
++ add x0, x4, #(\i)
|
|
|
|
++ mov x1, x5
|
|
|
|
++ add x2, sp, #(\i*2)
|
|
|
|
++ bl idct32_1d_8x32_pass2_\size\()_neon
|
|
|
|
++.endr
|
|
|
|
++
|
|
|
|
++ add sp, sp, #2048
|
|
|
|
++
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++
|
|
|
|
++ br x15
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++idct32_partial quarter
|
|
|
|
++idct32_partial half
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
|
|
|
|
+@@ -0,0 +1,873 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++#include "neon.S"
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
|
|
|
|
++ trn1 \t4\().8h, \r0\().8h, \r1\().8h // t4 = even 16-bit lanes of r0 and r1, interleaved
|
|
|
|
++ trn2 \t5\().8h, \r0\().8h, \r1\().8h // t5 = odd 16-bit lanes of r0 and r1, interleaved
|
|
|
|
++ trn1 \t6\().8h, \r2\().8h, \r3\().8h // t6 = even 16-bit lanes of r2 and r3, interleaved
|
|
|
|
++ trn2 \t7\().8h, \r2\().8h, \r3\().8h // t7 = odd 16-bit lanes of r2 and r3, interleaved
|
|
|
|
++
|
|
|
|
++ trn1 \r0\().4s, \t4\().4s, \t6\().4s // r0 = input columns 0 (low half) and 4 (high half)
|
|
|
|
++ trn2 \r2\().4s, \t4\().4s, \t6\().4s // r2 = input columns 2 and 6
|
|
|
|
++ trn1 \r1\().4s, \t5\().4s, \t7\().4s // r1 = input columns 1 and 5
|
|
|
|
++ trn2 \r3\().4s, \t5\().4s, \t7\().4s // r3 = input columns 3 and 7
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// The input to and output from this macro is in the registers v16-v31,
|
|
|
|
++// and v0-v7 are used as scratch registers.
|
|
|
|
++// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
|
|
|
|
++// Depending on the width of the loop filter, we either use v16-v19
|
|
|
|
++// and v28-v31 as temp registers, or v8-v15.
|
|
|
|
++.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
|
|
|
|
++ dup v0.8h, w2 // E
|
|
|
|
++ dup v2.8h, w3 // I
|
|
|
|
++ dup v3.8h, w4 // H
|
|
|
|
++
|
|
|
|
++ uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2)
|
|
|
|
++ uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1)
|
|
|
|
++ uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0)
|
|
|
|
++ uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1)
|
|
|
|
++ uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
|
|
|
|
++ uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
|
|
|
|
++ umax v4.8h, v4.8h, v5.8h // max(abs(p3 - p2), abs(p2 - p1))
|
|
|
|
++ umax v5.8h, v6.8h, v7.8h // max(abs(p1 - p0), abs(q0 - q1))
|
|
|
|
++ umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h // max(abs(q1 - q2), abs(q2 - q3))
|
|
|
|
++ uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0)
|
|
|
|
++ umax v4.8h, v4.8h, v5.8h
|
|
|
|
++ add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
|
|
|
|
++ uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1)
|
|
|
|
++ umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
|
|
|
|
++ ushr v5.8h, v5.8h, #1 // abs(p1 - q1) >> 1
|
|
|
|
++ cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I
|
|
|
|
++ add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
|
|
|
++ cmhs v6.8h, v0.8h, v6.8h // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
|
|
|
|
++ and v4.16b, v4.16b, v6.16b // fm
|
|
|
|
++
|
|
|
|
++ // If no pixels need filtering, just exit as soon as possible
|
|
|
|
++ mov x11, v4.d[0] // low 64 bits of the fm mask
|
|
|
|
++ mov x12, v4.d[1] // high 64 bits of the fm mask
|
|
|
|
++ adds x11, x11, x12 // lanes are 0 or 0xffff, so the sum is zero only if no lane is set
|
|
|
|
++ b.ne 1f
|
|
|
|
++ br x10 // nothing to do: return through the address the invoking function saved in x10
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++.if \wd >= 8
|
|
|
|
++ dup v0.8h, w5 // flatness threshold (the frontends in this file set x5 to 1 << (bpp - 8))
|
|
|
|
++
|
|
|
|
++ uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
|
|
|
|
++ uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
|
|
|
|
++ uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0)
|
|
|
|
++ uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
|
|
|
|
++ uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
|
|
|
|
++ uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
|
|
|
|
++ umax v6.8h, v6.8h, v2.8h // max(abs(p3 - p0), abs(p2 - p0))
|
|
|
|
++ umax v1.8h, v1.8h, \tmp1\().8h // max(abs(p1 - p0), abs(q1 - q0))
|
|
|
|
++ umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h // max(abs(q2 - q0), abs(q3 - q0))
|
|
|
|
++.if \wd == 16
|
|
|
|
++ uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0)
|
|
|
|
++ umax v6.8h, v6.8h, v1.8h
|
|
|
|
++ uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0)
|
|
|
|
++ umax v6.8h, v6.8h, \tmp2\().8h // max of all six flat8in differences
|
|
|
|
++ uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0)
|
|
|
|
++ cmhs v6.8h, v0.8h, v6.8h // flat8in
|
|
|
|
++ uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0)
|
|
|
|
++ and v6.16b, v6.16b, v4.16b // flat8in && fm
|
|
|
|
++ uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0)
|
|
|
|
++ bic v4.16b, v4.16b, v6.16b // fm && !flat8in
|
|
|
|
++ uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0)
|
|
|
|
++ uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0)
|
|
|
|
++ uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0)
|
|
|
|
++
|
|
|
|
++ umax v7.8h, v7.8h, v2.8h // max(abs(p7 - p0), abs(p6 - p0))
|
|
|
|
++ umax v1.8h, v1.8h, v8.8h // max(abs(p5 - p0), abs(p4 - p0))
|
|
|
|
++ umax v9.8h, v9.8h, v10.8h // max(abs(q4 - q0), abs(q5 - q0))
|
|
|
|
++ umax v11.8h, v11.8h, v12.8h // max(abs(q6 - q0), abs(q7 - q0))
|
|
|
|
++ // The rest of the calculation of flat8out is interleaved below
|
|
|
|
++.else
|
|
|
|
++ // The rest of the calculation of flat8in is interleaved below
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ // Calculate the normal inner loop filter for 2 or 4 pixels
|
|
|
|
++ uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0)
|
|
|
|
++.if \wd == 16
|
|
|
|
++ umax v7.8h, v7.8h, v1.8h
|
|
|
|
++ umax v9.8h, v9.8h, v11.8h
|
|
|
|
++.elseif \wd == 8
|
|
|
|
++ umax v6.8h, v6.8h, v1.8h
|
|
|
|
++.endif
|
|
|
|
++ uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
|
|
|
|
++.if \wd == 16
|
|
|
|
++ umax v7.8h, v7.8h, v9.8h // max of all eight flat8out differences
|
|
|
|
++.elseif \wd == 8
|
|
|
|
++ umax v6.8h, v6.8h, \tmp2\().8h // max of all six flat8in differences
|
|
|
|
++.endif
|
|
|
|
++ dup \tmp2\().8h, w6 // left shift for saturation
|
|
|
|
++ sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1
|
|
|
|
++ neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
|
|
|
|
++ umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
|
|
|
|
++ sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0
|
|
|
|
++ movi \tmp5\().8h, #3 // constant 3 for the multiply below
|
|
|
|
++.if \wd == 8
|
|
|
|
++ cmhs v6.8h, v0.8h, v6.8h // flat8in
|
|
|
|
++.endif
|
|
|
|
++ cmhs v5.8h, v3.8h, v5.8h // !hev
|
|
|
|
++.if \wd == 8
|
|
|
|
++ and v6.16b, v6.16b, v4.16b // flat8in && fm
|
|
|
|
++.endif
|
|
|
|
++ sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h // saturating shift up; the sshl back down below completes the clip
|
|
|
|
++.if \wd == 16
|
|
|
|
++ cmhs v7.8h, v0.8h, v7.8h // flat8out
|
|
|
|
++.elseif \wd == 8
|
|
|
|
++ bic v4.16b, v4.16b, v6.16b // fm && !flat8in
|
|
|
|
++.endif
|
|
|
|
++ and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
|
|
|
|
++.if \wd == 16
|
|
|
|
++ and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
|
|
|
|
++.endif
|
|
|
|
++ sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
|
|
|
|
++
|
|
|
|
++ mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
|
|
|
|
++ bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int2p = 0
|
|
|
|
++ movi v2.8h, #4
|
|
|
|
++ add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
|
|
|
|
++ movi v3.8h, #3
|
|
|
|
++ sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
|
|
|
|
++ movi \tmp5\().8h, #0 // zero: lower clamp for the output pixels
|
|
|
|
++ sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
|
|
|
|
++ dup \tmp6\().8h, w7 // max pixel value
|
|
|
|
++.if \wd == 16
|
|
|
|
++ bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1
|
|
|
|
++
|
|
|
|
++ add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
|
|
|
|
++ add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
|
|
|
|
++ smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
|
|
|
|
++ smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
|
|
|
|
++ sshr \tmp3\().8h, \tmp3\().8h, #3 // f1
|
|
|
|
++ sshr \tmp4\().8h, \tmp4\().8h, #3 // f2
|
|
|
|
++
|
|
|
|
++ add v0.8h, v23.8h, \tmp4\().8h // p0 + f2
|
|
|
|
++ sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1
|
|
|
|
++ smin v0.8h, v0.8h, \tmp6\().8h // clamp to the max pixel value
|
|
|
|
++ smin v2.8h, v2.8h, \tmp6\().8h
|
|
|
|
++ srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
|
|
|
|
++ smax v0.8h, v0.8h, \tmp5\().8h // out p0
|
|
|
|
++ smax v2.8h, v2.8h, \tmp5\().8h // out q0
|
|
|
|
++ bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
|
|
|
|
++ bit v24.16b, v2.16b, v4.16b
|
|
|
|
++
|
|
|
|
++ add v0.8h, v22.8h, \tmp3\().8h // p1 + f
|
|
|
|
++ sub v2.8h, v25.8h, \tmp3\().8h // q1 - f
|
|
|
|
++.if \wd >= 8
|
|
|
|
++ mov x11, v6.d[0] // low 64 bits of the flat8in mask (test is interleaved with the filter)
|
|
|
|
++.endif
|
|
|
|
++ smin v0.8h, v0.8h, \tmp6\().8h
|
|
|
|
++ smin v2.8h, v2.8h, \tmp6\().8h
|
|
|
|
++.if \wd >= 8
|
|
|
|
++ mov x12, v6.d[1] // high 64 bits of the flat8in mask
|
|
|
|
++.endif
|
|
|
|
++ smax v0.8h, v0.8h, \tmp5\().8h // out p1
|
|
|
|
++ smax v2.8h, v2.8h, \tmp5\().8h // out q1
|
|
|
|
++.if \wd >= 8
|
|
|
|
++ adds x11, x11, x12 // sets Z if no lane needs flat8in; consumed by the branches below
|
|
|
|
++.endif
|
|
|
|
++ bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
|
|
|
|
++ bit v25.16b, v2.16b, v5.16b
|
|
|
|
++
|
|
|
|
++ // If no pixels need flat8in, jump to flat8out
|
|
|
|
++ // (or to a writeout of the inner 4 pixels, for wd=8)
|
|
|
|
++.if \wd >= 8
|
|
|
|
++.if \wd == 16
|
|
|
|
++ b.eq 6f
|
|
|
|
++.else
|
|
|
|
++ b.ne 1f
|
|
|
|
++ br x13 // x13 is set by the loop_filter_8 macro to label 6 of the invoking function
|
|
|
|
++1:
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ // flat8in
|
|
|
|
++ add \tmp1\().8h, v20.8h, v21.8h // p3 + p2
|
|
|
|
++ add \tmp3\().8h, v22.8h, v25.8h // p1 + q1
|
|
|
|
++ add \tmp5\().8h, v20.8h, v22.8h // p3 + p1
|
|
|
|
++ add \tmp7\().8h, v23.8h, v26.8h // p0 + q2
|
|
|
|
++ add v0.8h, \tmp1\().8h, \tmp1\().8h // 2 * (p3 + p2)
|
|
|
|
++ add v0.8h, v0.8h, v23.8h
|
|
|
|
++ add v0.8h, v0.8h, v24.8h
|
|
|
|
++ add v0.8h, v0.8h, \tmp5\().8h // 3 * p3 + 2 * p2 + p1 + p0 + q0
|
|
|
|
++ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // delta turning the p2 sum into the p1 sum
|
|
|
|
++ sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h // delta turning the p1 sum into the p0 sum
|
|
|
|
++ urshr v2.8h, v0.8h, #3 // out p2
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, \tmp3\().8h
|
|
|
|
++ add \tmp1\().8h, v20.8h, v23.8h
|
|
|
|
++ add \tmp3\().8h, v24.8h, v27.8h
|
|
|
|
++ urshr v3.8h, v0.8h, #3 // out p1
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, \tmp7\().8h
|
|
|
|
++ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
|
|
|
|
++ add \tmp5\().8h, v21.8h, v24.8h
|
|
|
|
++ add \tmp7\().8h, v25.8h, v27.8h
|
|
|
|
++ urshr v4.8h, v0.8h, #3 // out p0
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, \tmp3\().8h
|
|
|
|
++ sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
|
|
|
|
++ add \tmp1\().8h, v22.8h, v25.8h
|
|
|
|
++ add \tmp3\().8h, v26.8h, v27.8h
|
|
|
|
++ urshr v5.8h, v0.8h, #3 // out q0
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, \tmp7\().8h
|
|
|
|
++ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
|
|
|
|
++ urshr \tmp5\().8h, v0.8h, #3 // out q1
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, \tmp3\().8h
|
|
|
|
++ // The output here is written back into the input registers. This doesn't
|
|
|
|
++ // matter for the flat8out part below, since we only update those pixels
|
|
|
|
++ // which won't be touched below.
|
|
|
|
++ bit v21.16b, v2.16b, v6.16b
|
|
|
|
++ bit v22.16b, v3.16b, v6.16b
|
|
|
|
++ bit v23.16b, v4.16b, v6.16b
|
|
|
|
++ urshr \tmp6\().8h, v0.8h, #3 // out q2
|
|
|
|
++ bit v24.16b, v5.16b, v6.16b
|
|
|
|
++ bit v25.16b, \tmp5\().16b, v6.16b
|
|
|
|
++ bit v26.16b, \tmp6\().16b, v6.16b
|
|
|
|
++.endif
|
|
|
|
++.if \wd == 16
|
|
|
|
++6:
|
|
|
|
++ orr v2.16b, v6.16b, v7.16b // lanes that need flat8in or flat8out
|
|
|
|
++ mov x11, v2.d[0]
|
|
|
|
++ mov x12, v2.d[1]
|
|
|
|
++ adds x11, x11, x12
|
|
|
|
++ b.ne 1f
|
|
|
|
++ // If no pixels needed flat8in nor flat8out, jump to a
|
|
|
|
++ // writeout of the inner 4 pixels
|
|
|
|
++ br x14
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ mov x11, v7.d[0]
|
|
|
|
++ mov x12, v7.d[1]
|
|
|
|
++ adds x11, x11, x12
|
|
|
|
++ b.ne 1f
|
|
|
|
++ // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
|
|
|
|
++ br x15
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ // flat8out
|
|
|
|
++ // This writes all outputs into v2-v17 (skipping v7 and v16).
|
|
|
|
++ // If this part is skipped, the output is read from v21-v26 (which is the input
|
|
|
|
++ // to this section).
|
|
|
|
++ shl v0.8h, v16.8h, #3 // 8 * v16
|
|
|
|
++ sub v0.8h, v0.8h, v16.8h // 7 * v16
|
|
|
|
++ add v0.8h, v0.8h, v17.8h // 7 * p7 + p6
|
|
|
|
++ add v8.8h, v17.8h, v18.8h
|
|
|
|
++ add v10.8h, v19.8h, v20.8h
|
|
|
|
++ add v0.8h, v0.8h, v8.8h
|
|
|
|
++ add v8.8h, v16.8h, v17.8h
|
|
|
|
++ add v12.8h, v21.8h, v22.8h
|
|
|
|
++ add v0.8h, v0.8h, v10.8h
|
|
|
|
++ add v10.8h, v18.8h, v25.8h
|
|
|
|
++ add v14.8h, v23.8h, v24.8h
|
|
|
|
++ sub v10.8h, v10.8h, v8.8h
|
|
|
|
++ add v0.8h, v0.8h, v12.8h
|
|
|
|
++ add v0.8h, v0.8h, v14.8h // 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0
|
|
|
|
++ add v12.8h, v16.8h, v18.8h
|
|
|
|
++ add v14.8h, v19.8h, v26.8h
|
|
|
|
++ urshr v2.8h, v0.8h, #4 // out p6
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v10.8h
|
|
|
|
++ add v8.8h, v16.8h, v19.8h
|
|
|
|
++ add v10.8h, v20.8h, v27.8h
|
|
|
|
++ sub v14.8h, v14.8h, v12.8h
|
|
|
|
++ bif v2.16b, v17.16b, v7.16b // keep the input p6 where flat8out is not set
|
|
|
|
++ urshr v3.8h , v0.8h, #4 // out p5
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v14.8h
|
|
|
|
++ add v12.8h, v16.8h, v20.8h
|
|
|
|
++ add v14.8h, v21.8h, v28.8h
|
|
|
|
++ sub v10.8h, v10.8h, v8.8h
|
|
|
|
++ bif v3.16b, v18.16b, v7.16b
|
|
|
|
++ urshr v4.8h, v0.8h, #4 // out p4
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v10.8h
|
|
|
|
++ add v8.8h, v16.8h, v21.8h
|
|
|
|
++ add v10.8h, v22.8h, v29.8h
|
|
|
|
++ sub v14.8h, v14.8h, v12.8h
|
|
|
|
++ bif v4.16b, v19.16b, v7.16b
|
|
|
|
++ urshr v5.8h, v0.8h, #4 // out p3
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v14.8h
|
|
|
|
++ add v12.8h, v16.8h, v22.8h
|
|
|
|
++ add v14.8h, v23.8h, v30.8h
|
|
|
|
++ sub v10.8h, v10.8h, v8.8h
|
|
|
|
++ bif v5.16b, v20.16b, v7.16b
|
|
|
|
++ urshr v6.8h, v0.8h, #4 // out p2
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v10.8h
|
|
|
|
++ add v10.8h, v16.8h, v23.8h
|
|
|
|
++ sub v14.8h, v14.8h, v12.8h
|
|
|
|
++ add v12.8h, v24.8h, v31.8h
|
|
|
|
++ bif v6.16b, v21.16b, v7.16b
|
|
|
|
++ urshr v8.8h, v0.8h, #4 // out p1
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v14.8h
|
|
|
|
++ sub v10.8h, v12.8h, v10.8h
|
|
|
|
++ add v12.8h, v17.8h, v24.8h
|
|
|
|
++ add v14.8h, v25.8h, v31.8h
|
|
|
|
++ bif v8.16b, v22.16b, v7.16b
|
|
|
|
++ urshr v9.8h, v0.8h, #4 // out p0
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v10.8h
|
|
|
|
++ sub v14.8h, v14.8h, v12.8h
|
|
|
|
++ add v12.8h, v26.8h, v31.8h
|
|
|
|
++ bif v9.16b, v23.16b, v7.16b
|
|
|
|
++ urshr v10.8h, v0.8h, #4 // out q0
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v14.8h
|
|
|
|
++ add v14.8h, v18.8h, v25.8h
|
|
|
|
++ add v18.8h, v19.8h, v26.8h // v18 (input p5) is reused as scratch from here on
|
|
|
|
++ sub v12.8h, v12.8h, v14.8h
|
|
|
|
++ add v14.8h, v27.8h, v31.8h
|
|
|
|
++ bif v10.16b, v24.16b, v7.16b
|
|
|
|
++ urshr v11.8h, v0.8h, #4 // out q1
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v12.8h
|
|
|
|
++ add v12.8h, v20.8h, v27.8h
|
|
|
|
++ sub v14.8h, v14.8h, v18.8h
|
|
|
|
++ add v18.8h, v28.8h, v31.8h
|
|
|
|
++ bif v11.16b, v25.16b, v7.16b
|
|
|
|
++ sub v18.8h, v18.8h, v12.8h
|
|
|
|
++ urshr v12.8h, v0.8h, #4 // out q2
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v14.8h
|
|
|
|
++ add v14.8h, v21.8h, v28.8h
|
|
|
|
++ add v20.8h, v29.8h, v31.8h // v20 (input p3) is reused as scratch from here on
|
|
|
|
++ bif v12.16b, v26.16b, v7.16b
|
|
|
|
++ urshr v13.8h, v0.8h, #4 // out q3
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v18.8h
|
|
|
|
++ sub v20.8h, v20.8h, v14.8h
|
|
|
|
++ add v18.8h, v22.8h, v29.8h
|
|
|
|
++ add v22.8h, v30.8h, v31.8h // v22 (input p1) is reused as scratch from here on
|
|
|
|
++ bif v13.16b, v27.16b, v7.16b
|
|
|
|
++ urshr v14.8h, v0.8h, #4 // out q4
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v20.8h
|
|
|
|
++ sub v22.8h, v22.8h, v18.8h
|
|
|
|
++ bif v14.16b, v28.16b, v7.16b
|
|
|
|
++ urshr v15.8h, v0.8h, #4 // out q5
|
|
|
|
++
|
|
|
|
++ add v0.8h, v0.8h, v22.8h
|
|
|
|
++ bif v15.16b, v29.16b, v7.16b
|
|
|
|
++ urshr v17.8h, v0.8h, #4 // out q6
|
|
|
|
++ bif v17.16b, v30.16b, v7.16b
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
|
|
|
|
++// while we need those for inputs/outputs in wd=16 and use v8-v15
|
|
|
|
++// for temp registers there instead.
|
|
|
|
++function vp9_loop_filter_4
|
|
|
|
++ loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
|
|
|
|
++ ret // reached only when the macro did not take its early exit through br x10
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_8
|
|
|
|
++ loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
|
|
|
|
++ ret // flat8in was applied; the early exits go through br x10 or br x13 instead
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_16
|
|
|
|
++ loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
|
|
|
|
++ ret // flat8out was applied; the early exits go through br x10, x14 or x15 instead
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro loop_filter_4
|
|
|
|
++ bl vp9_loop_filter_4 // returns here after filtering, or leaves through br x10 if nothing needs filtering
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_8
|
|
|
|
++ // calculate alternative 'return' targets
|
|
|
|
++ adr x13, 6f // label 6 of the invoking function: writeout of only the inner 4 pixels
|
|
|
|
++ bl vp9_loop_filter_8 // returns here only if the flat8in part was run
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_16
|
|
|
|
++ // calculate alternative 'return' targets
|
|
|
|
++ adr x14, 7f // label 7 of the invoking function: writeout of the inner 4 pixels
|
|
|
|
++ adr x15, 8f // label 8 of the invoking function: writeout of the inner 6 pixels
|
|
|
|
++ bl vp9_loop_filter_16 // returns here only if the flat8out part was run
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// The public functions in this file have got the following signature:
|
|
|
|
++// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
|
|
|
|
++
|
|
|
|
++.macro bpp_frontend func, bpp, push
|
|
|
|
++function ff_\func\()_\bpp\()_neon, export=1
|
|
|
|
++.if \push
|
|
|
|
++ mov x16, x30 // keep the return address in x16; the bl below overwrites x30
|
|
|
|
++ stp d14, d15, [sp, #-0x10]! // push d8-d15; the wd=16 filter uses v8-v15 as temporaries
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++.endif
|
|
|
|
++ lsl w2, w2, #\bpp - 8 // scale E (third argument) from 8 bit range to the target bit depth
|
|
|
|
++ lsl w3, w3, #\bpp - 8 // scale I (fourth argument) likewise
|
|
|
|
++ lsl w4, w4, #\bpp - 8 // scale H (fifth argument) likewise
|
|
|
|
++ mov x5, #1 << (\bpp - 8) // flatness threshold used for the flat8in and flat8out tests
|
|
|
|
++ mov x6, #16 - \bpp // left shift used for saturating to the bit depth
|
|
|
|
++ mov x7, #((1 << \bpp) - 1) // max pixel value
|
|
|
|
++.if \push
|
|
|
|
++ bl \func\()_16_neon
|
|
|
|
++ ldp d8, d9, [sp], 0x10 // pop d8-d15 in reverse push order
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x16 // return to the caller
|
|
|
|
++.else
|
|
|
|
++ b \func\()_16_neon
|
|
|
|
++.endif
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro bpp_frontends func, push=0
|
|
|
|
++ bpp_frontend \func, 10, \push
|
|
|
|
++ bpp_frontend \func, 12, \push
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
|
|
|
|
++function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
|
|
|
|
++ mov x16, x30 // keep the return address in x16; the bl calls below overwrite x30
|
|
|
|
++.if \push
|
|
|
|
++ stp d14, d15, [sp, #-0x10]! // push d8-d15; the wd=16 filter uses v8-v15 as temporaries
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++.endif
|
|
|
|
++ lsl w2, w2, #\bpp - 8 // scale E from 8 bit range to the target bit depth
|
|
|
|
++ lsl w3, w3, #\bpp - 8 // scale I likewise
|
|
|
|
++ lsl w4, w4, #\bpp - 8 // scale H likewise
|
|
|
|
++ mov x5, #1 << (\bpp - 8) // flatness threshold
|
|
|
|
++ mov x6, #16 - \bpp // left shift used for saturating to the bit depth
|
|
|
|
++ mov x7, #((1 << \bpp) - 1) // max pixel value
|
|
|
|
++ bl \func\()_\int_suffix\()_16_neon
|
|
|
|
++.ifc \dir,h
|
|
|
|
++ add x0, x0, x1, lsl #3 // h direction: second call works 8 rows further down
|
|
|
|
++.else
|
|
|
|
++ add x0, x0, #16 // v direction: second call works 16 bytes (8 pixels) to the right
|
|
|
|
++.endif
|
|
|
|
++ bl \func\()_\int_suffix\()_16_neon
|
|
|
|
++.if \push
|
|
|
|
++ ldp d8, d9, [sp], 0x10 // pop d8-d15 in reverse push order
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++.endif
|
|
|
|
++ br x16 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
|
|
|
|
++ bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
|
|
|
|
++ bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
|
|
|
|
++function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
|
|
|
|
++ mov x16, x30 // keep the return address in x16; the bl calls below overwrite x30
|
|
|
|
++ lsr w8, w2, #8 // E for the second filter: bits 8 and up of the packed argument
|
|
|
|
++ lsr w14, w3, #8 // I for the second filter
|
|
|
|
++ lsr w15, w4, #8 // H for the second filter
|
|
|
|
++ and w2, w2, #0xff // E for the first filter: low byte
|
|
|
|
++ and w3, w3, #0xff // I for the first filter
|
|
|
|
++ and w4, w4, #0xff // H for the first filter
|
|
|
|
++ lsl w2, w2, #\bpp - 8 // scale the first set of thresholds to the target bit depth
|
|
|
|
++ lsl w3, w3, #\bpp - 8
|
|
|
|
++ lsl w4, w4, #\bpp - 8
|
|
|
|
++ mov x5, #1 << (\bpp - 8) // flatness threshold
|
|
|
|
++ mov x6, #16 - \bpp // left shift used for saturating to the bit depth
|
|
|
|
++ mov x7, #((1 << \bpp) - 1) // max pixel value
|
|
|
|
++ bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
|
|
|
|
++.ifc \dir,h
|
|
|
|
++ add x0, x0, x1, lsl #3 // h direction: second filter works 8 rows further down
|
|
|
|
++.else
|
|
|
|
++ add x0, x0, #16 // v direction: second filter works 16 bytes (8 pixels) to the right
|
|
|
|
++.endif
|
|
|
|
++ lsl w2, w8, #\bpp - 8 // load and scale the second set of thresholds
|
|
|
|
++ lsl w3, w14, #\bpp - 8
|
|
|
|
++ lsl w4, w15, #\bpp - 8
|
|
|
|
++ bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
|
|
|
|
++ br x16 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro bpp_frontends_mix2 wd1, wd2
|
|
|
|
++ bpp_frontend_mix2 \wd1, \wd2, v, 10
|
|
|
|
++ bpp_frontend_mix2 \wd1, \wd2, v, 12
|
|
|
|
++ bpp_frontend_mix2 \wd1, \wd2, h, 10
|
|
|
|
++ bpp_frontend_mix2 \wd1, \wd2, h, 12
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_v_4_8_16_neon
|
|
|
|
++ mov x10, x30 // save the return address; the filter also exits early through br x10
|
|
|
|
++ sub x9, x0, x1, lsl #2 // x9 = x0 - 4 * stride (row p3)
|
|
|
|
++ ld1 {v20.8h}, [x9], x1 // p3
|
|
|
|
++ ld1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ ld1 {v21.8h}, [x9], x1 // p2
|
|
|
|
++ ld1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ ld1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ ld1 {v26.8h}, [x0], x1 // q2
|
|
|
|
++ ld1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ ld1 {v27.8h}, [x0], x1 // q3
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back to row q0
|
|
|
|
++ sub x9, x9, x1, lsl #1 // x9 back to row p1
|
|
|
|
++
|
|
|
|
++ loop_filter_4
|
|
|
|
++
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ sub x0, x0, x1, lsl #1 // restore x0 to its entry value
|
|
|
|
++
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++bpp_frontends vp9_loop_filter_v_4_8
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_h_4_8_16_neon
|
|
|
|
++ mov x10, x30 // save the return address; the filter also exits early through br x10
|
|
|
|
++ sub x9, x0, #8 // x9 = 8 bytes (4 pixels) left of x0, row 0
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, row 4
|
|
|
|
++ ld1 {v20.8h}, [x9], x1 // row 0
|
|
|
|
++ ld1 {v24.8h}, [x0], x1 // row 4
|
|
|
|
++ ld1 {v21.8h}, [x9], x1 // row 1
|
|
|
|
++ ld1 {v25.8h}, [x0], x1 // row 5
|
|
|
|
++ ld1 {v22.8h}, [x9], x1 // row 2
|
|
|
|
++ ld1 {v26.8h}, [x0], x1 // row 6
|
|
|
|
++ ld1 {v23.8h}, [x9], x1 // row 3
|
|
|
|
++ ld1 {v27.8h}, [x0], x1 // row 7
|
|
|
|
++
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back to row 0
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #8 // ... and back to its entry value
|
|
|
|
++
|
|
|
|
++ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ loop_filter_4
|
|
|
|
++
|
|
|
|
++ // Move x9 forward by 2 pixels; we don't need to rewrite the
|
|
|
|
++ // outermost 2 pixels since they aren't changed.
|
|
|
|
++ add x9, x9, #4
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, row 4
|
|
|
|
++
|
|
|
|
++ // We only will write the mid 4 pixels back; after the loop filter,
|
|
|
|
++ // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
|
|
|
|
++ // We need to transpose them to columns, done with a 4x8 transpose
|
|
|
|
++ // (which in practice is two 4x4 transposes of the two 4x4 halves
|
|
|
|
++ // of the 8x4 pixels; into 4x8 pixels).
|
|
|
|
++ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.d}[0], [x9], x1 // row 0 (4 pixels)
|
|
|
|
++ st1 {v22.d}[1], [x0], x1 // row 4
|
|
|
|
++ st1 {v23.d}[0], [x9], x1 // row 1
|
|
|
|
++ st1 {v23.d}[1], [x0], x1 // row 5
|
|
|
|
++ st1 {v24.d}[0], [x9], x1 // row 2
|
|
|
|
++ st1 {v24.d}[1], [x0], x1 // row 6
|
|
|
|
++ st1 {v25.d}[0], [x9], x1 // row 3
|
|
|
|
++ st1 {v25.d}[1], [x0], x1 // row 7
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #4 // ... and back to its entry value
|
|
|
|
++
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++bpp_frontends vp9_loop_filter_h_4_8
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_v_8_8_16_neon
|
|
|
|
++ mov x10, x30 // save the return address; the filter also exits early through br x10
|
|
|
|
++ sub x9, x0, x1, lsl #2 // x9 = x0 - 4 * stride (row p3)
|
|
|
|
++ ld1 {v20.8h}, [x9], x1 // p3
|
|
|
|
++ ld1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ ld1 {v21.8h}, [x9], x1 // p2
|
|
|
|
++ ld1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ ld1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ ld1 {v26.8h}, [x0], x1 // q2
|
|
|
|
++ ld1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ ld1 {v27.8h}, [x0], x1 // q3
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back to row p3
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back to row q0
|
|
|
|
++ add x9, x9, x1 // x9 = row p2, the first row written below
|
|
|
|
++
|
|
|
|
++ loop_filter_8
|
|
|
|
++
|
|
|
|
++ st1 {v21.8h}, [x9], x1 // p2
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ st1 {v26.8h}, [x0], x1 // q2
|
|
|
|
++ sub x0, x0, x1, lsl #1 // rewind x0 by 3 rows in total ...
|
|
|
|
++ sub x0, x0, x1 // ... back to its entry value
|
|
|
|
++
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++6:
|
|
|
|
++ sub x9, x0, x1, lsl #1 // flat8in was not needed: write only p1..q1, starting at row p1
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ sub x0, x0, x1, lsl #1 // restore x0 to its entry value
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++bpp_frontends vp9_loop_filter_v_8_8
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_h_8_8_16_neon
|
|
|
|
++ mov x10, x30 // save the return address; the filter also exits early through br x10
|
|
|
|
++ sub x9, x0, #8 // x9 = 8 bytes (4 pixels) left of x0, row 0
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, row 4
|
|
|
|
++ ld1 {v20.8h}, [x9], x1 // row 0
|
|
|
|
++ ld1 {v24.8h}, [x0], x1 // row 4
|
|
|
|
++ ld1 {v21.8h}, [x9], x1 // row 1
|
|
|
|
++ ld1 {v25.8h}, [x0], x1 // row 5
|
|
|
|
++ ld1 {v22.8h}, [x9], x1 // row 2
|
|
|
|
++ ld1 {v26.8h}, [x0], x1 // row 6
|
|
|
|
++ ld1 {v23.8h}, [x9], x1 // row 3
|
|
|
|
++ ld1 {v27.8h}, [x0], x1 // row 7
|
|
|
|
++
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back to row 0
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #8 // ... and back to its entry value
|
|
|
|
++
|
|
|
|
++ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ loop_filter_8
|
|
|
|
++
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column as x9, row 4
|
|
|
|
++
|
|
|
|
++ // Even though only 6 pixels per row have been changed, we write the
|
|
|
|
++ // full 8 pixel registers.
|
|
|
|
++ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v20.8h}, [x9], x1 // row 0
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // row 4
|
|
|
|
++ st1 {v21.8h}, [x9], x1 // row 1
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // row 5
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // row 2
|
|
|
|
++ st1 {v26.8h}, [x0], x1 // row 6
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // row 3
|
|
|
|
++ st1 {v27.8h}, [x0], x1 // row 7
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #8 // ... and back to its entry value
|
|
|
|
++
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++6:
|
|
|
|
++ // If we didn't need to do the flat8in part, we use the same writeback
|
|
|
|
++ // as in loop_filter_h_4_8.
|
|
|
|
++ add x9, x9, #4 // skip the 2 leftmost pixels, which are unchanged
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, row 4
|
|
|
|
++ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.d}[0], [x9], x1 // row 0 (4 pixels)
|
|
|
|
++ st1 {v22.d}[1], [x0], x1 // row 4
|
|
|
|
++ st1 {v23.d}[0], [x9], x1 // row 1
|
|
|
|
++ st1 {v23.d}[1], [x0], x1 // row 5
|
|
|
|
++ st1 {v24.d}[0], [x9], x1 // row 2
|
|
|
|
++ st1 {v24.d}[1], [x0], x1 // row 6
|
|
|
|
++ st1 {v25.d}[0], [x9], x1 // row 3
|
|
|
|
++ st1 {v25.d}[1], [x0], x1 // row 7
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #4 // ... and back to its entry value
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++bpp_frontends vp9_loop_filter_h_8_8
|
|
|
|
++
|
|
|
|
++bpp_frontends_mix2 4, 4
|
|
|
|
++bpp_frontends_mix2 4, 8
|
|
|
|
++bpp_frontends_mix2 8, 4
|
|
|
|
++bpp_frontends_mix2 8, 8
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_v_16_8_16_neon
|
|
|
|
++ mov x10, x30 // save the return address; the filter also exits early through br x10
|
|
|
|
++ sub x9, x0, x1, lsl #3 // x9 = x0 - 8 * stride (row p7)
|
|
|
|
++ ld1 {v16.8h}, [x9], x1 // p7
|
|
|
|
++ ld1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ ld1 {v17.8h}, [x9], x1 // p6
|
|
|
|
++ ld1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ ld1 {v18.8h}, [x9], x1 // p5
|
|
|
|
++ ld1 {v26.8h}, [x0], x1 // q2
|
|
|
|
++ ld1 {v19.8h}, [x9], x1 // p4
|
|
|
|
++ ld1 {v27.8h}, [x0], x1 // q3
|
|
|
|
++ ld1 {v20.8h}, [x9], x1 // p3
|
|
|
|
++ ld1 {v28.8h}, [x0], x1 // q4
|
|
|
|
++ ld1 {v21.8h}, [x9], x1 // p2
|
|
|
|
++ ld1 {v29.8h}, [x0], x1 // q5
|
|
|
|
++ ld1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ ld1 {v30.8h}, [x0], x1 // q6
|
|
|
|
++ ld1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ ld1 {v31.8h}, [x0], x1 // q7
|
|
|
|
++ sub x9, x9, x1, lsl #3 // x9 back to row p7
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row q0
|
|
|
|
++ add x9, x9, x1 // x9 = row p6, the first row written below
|
|
|
|
++
|
|
|
|
++ loop_filter_16
|
|
|
|
++
|
|
|
|
++ // If we did the flat8out part, we get the output in
|
|
|
|
++ // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
|
|
|
|
++ // store v2-v9 there, and v10-v17 into x0.
|
|
|
|
++ st1 {v2.8h}, [x9], x1 // p6
|
|
|
|
++ st1 {v10.8h}, [x0], x1 // q0
|
|
|
|
++ st1 {v3.8h}, [x9], x1 // p5
|
|
|
|
++ st1 {v11.8h}, [x0], x1 // q1
|
|
|
|
++ st1 {v4.8h}, [x9], x1 // p4
|
|
|
|
++ st1 {v12.8h}, [x0], x1 // q2
|
|
|
|
++ st1 {v5.8h}, [x9], x1 // p3
|
|
|
|
++ st1 {v13.8h}, [x0], x1 // q3
|
|
|
|
++ st1 {v6.8h}, [x9], x1 // p2
|
|
|
|
++ st1 {v14.8h}, [x0], x1 // q4
|
|
|
|
++ st1 {v8.8h}, [x9], x1 // p1
|
|
|
|
++ st1 {v15.8h}, [x0], x1 // q5
|
|
|
|
++ st1 {v9.8h}, [x9], x1 // p0
|
|
|
|
++ st1 {v17.8h}, [x0], x1 // q6
|
|
|
|
++ sub x0, x0, x1, lsl #3 // rewind x0 by 7 rows in total ...
|
|
|
|
++ add x0, x0, x1 // ... back to its entry value
|
|
|
|
++
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++8:
|
|
|
|
++ add x9, x9, x1, lsl #2 // x9 moves from row p6 to row p2
|
|
|
|
++ // If we didn't do the flat8out part, the output is left in the
|
|
|
|
++ // input registers.
|
|
|
|
++ st1 {v21.8h}, [x9], x1 // p2
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ st1 {v26.8h}, [x0], x1 // q2
|
|
|
|
++ sub x0, x0, x1, lsl #1 // rewind x0 by 3 rows in total ...
|
|
|
|
++ sub x0, x0, x1 // ... back to its entry value
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++7:
|
|
|
|
++ sub x9, x0, x1, lsl #1 // neither flat8in nor flat8out: write only p1..q1, starting at row p1
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // q1
|
|
|
|
++ sub x0, x0, x1, lsl #1 // restore x0 to its entry value
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++bpp_frontends vp9_loop_filter_v_16_8, push=1
|
|
|
|
++bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_h_16_8_16_neon
|
|
|
|
++ mov x10, x30 // save the return address; the filter also exits early through br x10
|
|
|
|
++ sub x9, x0, #16 // x9 = 16 bytes (8 pixels) left of x0
|
|
|
|
++ ld1 {v16.8h}, [x9], x1 // row 0, left half
|
|
|
|
++ ld1 {v24.8h}, [x0], x1 // row 0, right half
|
|
|
|
++ ld1 {v17.8h}, [x9], x1 // row 1
|
|
|
|
++ ld1 {v25.8h}, [x0], x1
|
|
|
|
++ ld1 {v18.8h}, [x9], x1 // row 2
|
|
|
|
++ ld1 {v26.8h}, [x0], x1
|
|
|
|
++ ld1 {v19.8h}, [x9], x1 // row 3
|
|
|
|
++ ld1 {v27.8h}, [x0], x1
|
|
|
|
++ ld1 {v20.8h}, [x9], x1 // row 4
|
|
|
|
++ ld1 {v28.8h}, [x0], x1
|
|
|
|
++ ld1 {v21.8h}, [x9], x1 // row 5
|
|
|
|
++ ld1 {v29.8h}, [x0], x1
|
|
|
|
++ ld1 {v22.8h}, [x9], x1 // row 6
|
|
|
|
++ ld1 {v30.8h}, [x0], x1
|
|
|
|
++ ld1 {v23.8h}, [x9], x1 // row 7
|
|
|
|
++ ld1 {v31.8h}, [x0], x1
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to its entry value
|
|
|
|
++ sub x9, x9, x1, lsl #3 // x9 back to row 0
|
|
|
|
++
|
|
|
|
++ // The 16x8 pixels read above are in two 8x8 blocks; the left
|
|
|
|
++ // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
|
|
|
|
++ // of this, to get one column per register.
|
|
|
|
++ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
|
|
|
++ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
|
|
|
|
++
|
|
|
|
++ loop_filter_16
|
|
|
|
++
|
|
|
|
++ transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
|
|
|
|
++ transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
|
|
|
|
++
|
|
|
|
++ st1 {v16.8h}, [x9], x1 // row 0, left half
|
|
|
|
++ st1 {v10.8h}, [x0], x1 // row 0, right half
|
|
|
|
++ st1 {v2.8h}, [x9], x1 // row 1
|
|
|
|
++ st1 {v11.8h}, [x0], x1
|
|
|
|
++ st1 {v3.8h}, [x9], x1 // row 2
|
|
|
|
++ st1 {v12.8h}, [x0], x1
|
|
|
|
++ st1 {v4.8h}, [x9], x1 // row 3
|
|
|
|
++ st1 {v13.8h}, [x0], x1
|
|
|
|
++ st1 {v5.8h}, [x9], x1 // row 4
|
|
|
|
++ st1 {v14.8h}, [x0], x1
|
|
|
|
++ st1 {v6.8h}, [x9], x1 // row 5
|
|
|
|
++ st1 {v15.8h}, [x0], x1
|
|
|
|
++ st1 {v8.8h}, [x9], x1 // row 6
|
|
|
|
++ st1 {v17.8h}, [x0], x1
|
|
|
|
++ st1 {v9.8h}, [x9], x1 // row 7
|
|
|
|
++ st1 {v31.8h}, [x0], x1
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to its entry value
|
|
|
|
++
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++8:
|
|
|
|
++ // The same writeback as in loop_filter_h_8_8
|
|
|
|
++ sub x9, x0, #8 // x9 = 8 bytes (4 pixels) left of x0, row 0
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, row 4
|
|
|
|
++ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v20.8h}, [x9], x1 // row 0
|
|
|
|
++ st1 {v24.8h}, [x0], x1 // row 4
|
|
|
|
++ st1 {v21.8h}, [x9], x1 // row 1
|
|
|
|
++ st1 {v25.8h}, [x0], x1 // row 5
|
|
|
|
++ st1 {v22.8h}, [x9], x1 // row 2
|
|
|
|
++ st1 {v26.8h}, [x0], x1 // row 6
|
|
|
|
++ st1 {v23.8h}, [x9], x1 // row 3
|
|
|
|
++ st1 {v27.8h}, [x0], x1 // row 7
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #8 // ... and back to its entry value
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++7:
|
|
|
|
++ // The same writeback as in loop_filter_h_4_8
|
|
|
|
++ sub x9, x0, #4 // x9 = 4 bytes (2 pixels) left of x0, row 0
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, row 4
|
|
|
|
++ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.d}[0], [x9], x1 // row 0 (4 pixels)
|
|
|
|
++ st1 {v22.d}[1], [x0], x1 // row 4
|
|
|
|
++ st1 {v23.d}[0], [x9], x1 // row 1
|
|
|
|
++ st1 {v23.d}[1], [x0], x1 // row 5
|
|
|
|
++ st1 {v24.d}[0], [x9], x1 // row 2
|
|
|
|
++ st1 {v24.d}[1], [x0], x1 // row 6
|
|
|
|
++ st1 {v25.d}[0], [x9], x1 // row 3
|
|
|
|
++ st1 {v25.d}[1], [x0], x1 // row 7
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back to row 0 ...
|
|
|
|
++ add x0, x0, #4 // ... and back to its entry value
|
|
|
|
++ br x10 // return to the frontend
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++bpp_frontends vp9_loop_filter_h_16_8, push=1
|
|
|
|
++bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
|
|
|
|
+@@ -0,0 +1,1334 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2016 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++#include "neon.S"
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// The main loop filter macro is templated and can produce filters for
|
|
|
|
++// vectors of 8 or 16 bytes. The register mapping throughout the filter
|
|
|
|
++// is close to identical to the arm version (please try to maintain this,
|
|
|
|
++// if either is changed!). When the arm version uses e.g. d20 for the
|
|
|
|
++// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
|
|
|
|
++// on vector length.
|
|
|
|
++//
|
|
|
|
++// The number of elements in the vector is passed in via the macro parameter
|
|
|
|
++// \sz, which is either .8b or .16b. For simple instructions that don't
|
|
|
|
++// lengthen or narrow things, this can easily be templated like this:
|
|
|
|
++// uabd v4\sz, v20\sz, v21\sz
|
|
|
|
++//
|
|
|
|
++// For instructions that lengthen or narrow content, the arm version would
|
|
|
|
++// have used q registers. For these instructions, we have macros that expand
|
|
|
|
++// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
|
|
|
|
++// pair, depending on the \sz parameter. Wherever the arm version would have
|
|
|
|
++// used a q register, these macros instead take two v registers, i.e. q3
|
|
|
|
++// is mapped to v6+v7. For the case with 8 byte input vectors, such a
|
|
|
|
++// lengthening operation is only stored in v6.8h (what was in q3 in the arm
|
|
|
|
++// case), while the 16 byte input vectors will use v6.8h + v7.8h.
|
|
|
|
++// Such a macro invocation would look like this:
|
|
|
|
++// uaddl_sz v8.8h, v9.8h, v17, v18, \sz
|
|
|
|
++//
|
|
|
|
++// That is, in the 8 byte input vector case, the second register in these
|
|
|
|
++// register pairs will be unused.
|
|
|
|
++// Unfortunately, this makes the code quite hard to read. For readability,
|
|
|
|
++// see the arm version instead.
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
|
|
|
|
++ add \dst1, \in1, \in3 // first register of each pair: always emitted
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ add \dst2, \in2, \in4 // second register of each pair: only for 16 byte input vectors
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
|
|
|
|
++ sub \dst1, \in1, \in3 // first register of each pair: always emitted
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ sub \dst2, \in2, \in4 // second register of each pair: only for 16 byte input vectors
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
|
|
|
|
++ uaddw \dst1, \in1, \in3\().8b // dst1 = in1 + zero-extended low 8 bytes of in3
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ uaddw2 \dst2, \in2, \in3\().16b // dst2 = in2 + zero-extended high 8 bytes of in3
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro usubw_sz dst1, dst2, in1, in2, in3, sz
|
|
|
|
++ usubw \dst1, \in1, \in3\().8b // dst1 = in1 - zero-extended low 8 bytes of in3
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ usubw2 \dst2, \in2, \in3\().16b // dst2 = in2 - zero-extended high 8 bytes of in3
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro usubl_sz dst1, dst2, in1, in2, sz
|
|
|
|
++ usubl \dst1, \in1\().8b, \in2\().8b // dst1 = widened in1 - in2, low 8 bytes
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ usubl2 \dst2, \in1\().16b, \in2\().16b // dst2 = widened in1 - in2, high 8 bytes
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro sqxtn_sz dst, in1, in2, sz
|
|
|
|
++ sqxtn \dst\().8b, \in1 // signed saturating narrow of in1 into the low 8 bytes of dst
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ sqxtn2 \dst\().16b, \in2 // signed saturating narrow of in2 into the high 8 bytes of dst
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro sqxtun_sz dst, in1, in2, sz
|
|
|
|
++ sqxtun \dst\().8b, \in1 // signed to unsigned saturating narrow of in1 into the low 8 bytes of dst
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ sqxtun2 \dst\().16b, \in2 // signed to unsigned saturating narrow of in2 into the high 8 bytes of dst
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
|
|
|
|
++ mul \dst1, \in1, \in3 // first register of each pair: always emitted
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ mul \dst2, \in2, \in4 // second register of each pair: only for 16 byte input vectors
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro saddw_sz dst1, dst2, in1, in2, in3, sz
|
|
|
|
++ saddw \dst1, \in1, \in3\().8b // dst1 = in1 + sign-extended low 8 bytes of in3
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ saddw2 \dst2, \in2, \in3\().16b // dst2 = in2 + sign-extended high 8 bytes of in3
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
|
|
|
|
++ ssubw \dst1, \in1, \in3\().8b // dst1 = in1 - sign-extended low 8 bytes of in3
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ ssubw2 \dst2, \in2, \in3\().16b // dst2 = in2 - sign-extended high 8 bytes of in3
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro uxtl_sz dst1, dst2, in, sz
|
|
|
|
++ uxtl \dst1, \in\().8b // dst1 = zero-extended low 8 bytes of in
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ uxtl2 \dst2, \in\().16b // dst2 = zero-extended high 8 bytes of in
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro uaddl_sz dst1, dst2, in1, in2, sz
|
|
|
|
++ uaddl \dst1, \in1\().8b, \in2\().8b // dst1 = widened in1 + in2, low 8 bytes
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ uaddl2 \dst2, \in1\().16b, \in2\().16b // dst2 = widened in1 + in2, high 8 bytes
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro rshrn_sz dst, in1, in2, shift, sz
|
|
|
|
++ rshrn \dst\().8b, \in1, \shift // rounding shift right of in1, narrowed into the low 8 bytes of dst
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ rshrn2 \dst\().16b, \in2, \shift // rounding shift right of in2, narrowed into the high 8 bytes of dst
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro ushll_sz dst1, dst2, in, shift, sz
|
|
|
|
++ ushll \dst1, \in\().8b, \shift // dst1 = zero-extended low 8 bytes of in, shifted left
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ ushll2 \dst2, \in\().16b, \shift // dst2 = zero-extended high 8 bytes of in, shifted left
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// The input to and output from this macro is in the registers v16-v31,
|
|
|
|
++// and v0-v7 are used as scratch registers.
|
|
|
|
++// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
|
|
|
|
++// Depending on the width of the loop filter, we either use v16-v19
|
|
|
|
++// and v28-v31 as temp registers, or v8-v15.
|
|
|
|
++// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
|
|
|
|
++// tmpq2 == tmp3 + tmp4, etc.
|
|
|
|
++.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
|
|
|
|
++.if \mix == 0
|
|
|
|
++ dup v0\sz, w2 // E
|
|
|
|
++ dup v2\sz, w3 // I
|
|
|
|
++ dup v3\sz, w4 // H
|
|
|
|
++.else
|
|
|
|
++ dup v0.8h, w2 // E
|
|
|
|
++ dup v2.8h, w3 // I
|
|
|
|
++ dup v3.8h, w4 // H
|
|
|
|
++ rev16 v1.16b, v0.16b // E
|
|
|
|
++ rev16 v4.16b, v2.16b // I
|
|
|
|
++ rev16 v5.16b, v3.16b // H
|
|
|
|
++ uzp1 v0.16b, v0.16b, v1.16b // bytes 0-7 = low byte of w2, bytes 8-15 = second byte of w2
|
|
|
|
++ uzp1 v2.16b, v2.16b, v4.16b // same split for w3
|
|
|
|
++ uzp1 v3.16b, v3.16b, v5.16b // same split for w4
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ uabd v4\sz, v20\sz, v21\sz // abs(p3 - p2)
|
|
|
|
++ uabd v5\sz, v21\sz, v22\sz // abs(p2 - p1)
|
|
|
|
++ uabd v6\sz, v22\sz, v23\sz // abs(p1 - p0)
|
|
|
|
++ uabd v7\sz, v24\sz, v25\sz // abs(q0 - q1)
|
|
|
|
++ uabd \tmp1\sz, v25\sz, v26\sz // abs(q1 - q2)
|
|
|
|
++ uabd \tmp2\sz, v26\sz, v27\sz // abs(q2 - q3)
|
|
|
|
++ umax v4\sz, v4\sz, v5\sz
|
|
|
|
++ umax v5\sz, v6\sz, v7\sz
|
|
|
|
++ umax \tmp1\sz, \tmp1\sz, \tmp2\sz
|
|
|
|
++ uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0)
|
|
|
|
++ umax v4\sz, v4\sz, v5\sz
|
|
|
|
++ uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2
|
|
|
|
++ uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1)
|
|
|
|
++ umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3))
|
|
|
|
++ ushr v5\sz, v5\sz, #1
|
|
|
|
++ cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I
|
|
|
|
++ uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
|
|
|
++ cmhs v5\sz, v0\sz, v6\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
|
|
|
|
++ and v4\sz, v4\sz, v5\sz // fm
|
|
|
|
++
|
|
|
|
++ // If no pixels need filtering, just exit as soon as possible
|
|
|
|
++ mov x5, v4.d[0] // low 8 bytes of fm
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ mov x6, v4.d[1] // high 8 bytes of fm
|
|
|
|
++ adds x5, x5, x6
|
|
|
|
++ b.eq 9f
|
|
|
|
++.else
|
|
|
|
++ cbz x5, 9f
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++.if \wd >= 8
|
|
|
|
++ movi v0\sz, #1 // E is no longer needed; v0 = 1, compared against in the flat tests below
|
|
|
|
++
|
|
|
|
++ uabd v6\sz, v20\sz, v23\sz // abs(p3 - p0)
|
|
|
|
++ uabd v2\sz, v21\sz, v23\sz // abs(p2 - p0)
|
|
|
|
++ uabd v1\sz, v22\sz, v23\sz // abs(p1 - p0)
|
|
|
|
++ uabd \tmp1\sz, v25\sz, v24\sz // abs(q1 - q0)
|
|
|
|
++ uabd \tmp2\sz, v26\sz, v24\sz // abs(q2 - q0)
|
|
|
|
++ uabd \tmp3\sz, v27\sz, v24\sz // abs(q3 - q0)
|
|
|
|
++ umax v6\sz, v6\sz, v2\sz
|
|
|
|
++ umax v1\sz, v1\sz, \tmp1\sz
|
|
|
|
++ umax \tmp2\sz, \tmp2\sz, \tmp3\sz
|
|
|
|
++.if \wd == 16
|
|
|
|
++ uabd v7\sz, v16\sz, v23\sz // abs(p7 - p0)
|
|
|
|
++ umax v6\sz, v6\sz, v1\sz
|
|
|
|
++ uabd v2\sz, v17\sz, v23\sz // abs(p6 - p0)
|
|
|
|
++ umax v6\sz, v6\sz, \tmp2\sz
|
|
|
|
++ uabd v1\sz, v18\sz, v23\sz // abs(p5 - p0)
|
|
|
|
++ cmhs v6\sz, v0\sz, v6\sz // flat8in
|
|
|
|
++ uabd v8\sz, v19\sz, v23\sz // abs(p4 - p0)
|
|
|
|
++ and v6\sz, v6\sz, v4\sz // flat8in && fm
|
|
|
|
++ uabd v9\sz, v28\sz, v24\sz // abs(q4 - q0)
|
|
|
|
++ bic v4\sz, v4\sz, v6\sz // fm && !flat8in
|
|
|
|
++ uabd v10\sz, v29\sz, v24\sz // abs(q5 - q0)
|
|
|
|
++ uabd v11\sz, v30\sz, v24\sz // abs(q6 - q0)
|
|
|
|
++ uabd v12\sz, v31\sz, v24\sz // abs(q7 - q0)
|
|
|
|
++
|
|
|
|
++ umax v7\sz, v7\sz, v2\sz
|
|
|
|
++ umax v1\sz, v1\sz, v8\sz
|
|
|
|
++ umax v9\sz, v9\sz, v10\sz
|
|
|
|
++ umax v11\sz, v11\sz, v12\sz
|
|
|
|
++ // The rest of the calculation of flat8out is interleaved below
|
|
|
|
++.else
|
|
|
|
++ // The rest of the calculation of flat8in is interleaved below
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ // Calculate the normal inner loop filter for 2 or 4 pixels
|
|
|
|
++ uabd v5\sz, v22\sz, v23\sz // abs(p1 - p0)
|
|
|
|
++.if \wd == 16
|
|
|
|
++ umax v7\sz, v7\sz, v1\sz
|
|
|
|
++ umax v9\sz, v9\sz, v11\sz
|
|
|
|
++.elseif \wd == 8
|
|
|
|
++ umax v6\sz, v6\sz, v1\sz
|
|
|
|
++.endif
|
|
|
|
++ uabd v1\sz, v25\sz, v24\sz // abs(q1 - q0)
|
|
|
|
++.if \wd == 16
|
|
|
|
++ umax v7\sz, v7\sz, v9\sz
|
|
|
|
++.elseif \wd == 8
|
|
|
|
++ umax v6\sz, v6\sz, \tmp2\sz
|
|
|
|
++.endif
|
|
|
|
++ usubl_sz \tmp1\().8h, \tmp2\().8h, v22, v25, \sz // p1 - q1
|
|
|
|
++ umax v5\sz, v5\sz, v1\sz // max(abs(p1 - p0), abs(q1 - q0))
|
|
|
|
++.if \mix != 0
|
|
|
|
++ mov v1.d[0], x11 // x11: per-half flat8in enable mask, set up by loop_filter_8_16b_mix
|
|
|
|
++.endif
|
|
|
|
++ usubl_sz \tmp3\().8h, \tmp4\().8h, v24, v23, \sz // q0 - p0
|
|
|
|
++ movi \tmp5\().8h, #3
|
|
|
|
++.if \wd == 8
|
|
|
|
++ cmhs v6\sz, v0\sz, v6\sz // flat8in
|
|
|
|
++.endif
|
|
|
|
++.if \mix != 0
|
|
|
|
++ sxtl v1.8h, v1.8b // sign-extend each mask byte to 16 bits, so 8 mask bytes cover all 16 lanes
|
|
|
|
++.endif
|
|
|
|
++ cmhs v5\sz, v3\sz, v5\sz // !hev
|
|
|
|
++.if \wd == 8
|
|
|
|
++ // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
|
|
|
|
++.if \mix != 0
|
|
|
|
++ and v6\sz, v6\sz, v1.16b
|
|
|
|
++.endif
|
|
|
|
++ and v6\sz, v6\sz, v4\sz // flat8in && fm
|
|
|
|
++.endif
|
|
|
|
++ sqxtn_sz \tmp1, \tmp1\().8h, \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
|
|
|
|
++.if \wd == 16
|
|
|
|
++ cmhs v7\sz, v0\sz, v7\sz // flat8out
|
|
|
|
++.elseif \wd == 8
|
|
|
|
++ bic v4\sz, v4\sz, v6\sz // fm && !flat8in
|
|
|
|
++.endif
|
|
|
|
++ and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in
|
|
|
|
++.if \wd == 16
|
|
|
|
++ and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0)
|
|
|
|
++ bic \tmp1\sz, \tmp1\sz, v5\sz // if (!hev) av_clip_int8 = 0
|
|
|
|
++ movi v2\sz, #4
|
|
|
|
++ saddw_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
|
|
|
|
++ movi v3\sz, #3
|
|
|
|
++ sqxtn_sz \tmp1, \tmp3\().8h, \tmp4\().8h, \sz // f
|
|
|
|
++.if \wd == 16
|
|
|
|
++ bic v6\sz, v6\sz, v7\sz // fm && flat8in && !flat8out
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ sqadd \tmp3\sz, \tmp1\sz, v2\sz // FFMIN(f + 4, 127)
|
|
|
|
++ sqadd \tmp4\sz, \tmp1\sz, v3\sz // FFMIN(f + 3, 127)
|
|
|
|
++ uxtl_sz v0.8h, v1.8h, v23, \sz // p0
|
|
|
|
++ sshr \tmp3\sz, \tmp3\sz, #3 // f1
|
|
|
|
++ sshr \tmp4\sz, \tmp4\sz, #3 // f2
|
|
|
|
++
|
|
|
|
++ uxtl_sz v2.8h, v3.8h, v24, \sz // q0
|
|
|
|
++ saddw_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp4, \sz // p0 + f2
|
|
|
|
++ ssubw_sz v2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q0 - f1
|
|
|
|
++ sqxtun_sz v0, v0.8h, v1.8h, \sz // out p0
|
|
|
|
++ sqxtun_sz v1, v2.8h, v3.8h, \sz // out q0
|
|
|
|
++ srshr \tmp3\sz, \tmp3\sz, #1 // f = (f1 + 1) >> 1
|
|
|
|
++ bit v23\sz, v0\sz, v4\sz // if (fm && !flat8in)
|
|
|
|
++ bit v24\sz, v1\sz, v4\sz
|
|
|
|
++
|
|
|
|
++ uxtl_sz v0.8h, v1.8h, v22, \sz // p1
|
|
|
|
++ uxtl_sz v2.8h, v3.8h, v25, \sz // q1
|
|
|
|
++.if \wd >= 8
|
|
|
|
++ mov x5, v6.d[0] // low 8 bytes of the flat8in mask; tested once the inner filter is done
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ mov x6, v6.d[1] // high 8 bytes of the flat8in mask
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ saddw_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3, \sz // p1 + f
|
|
|
|
++ ssubw_sz v2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q1 - f
|
|
|
|
++ sqxtun_sz v0, v0.8h, v1.8h, \sz // out p1
|
|
|
|
++ sqxtun_sz v2, v2.8h, v3.8h, \sz // out q1
|
|
|
|
++.if \wd >= 8
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ adds x5, x5, x6
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ bit v22\sz, v0\sz, v5\sz // if (!hev && fm && !flat8in)
|
|
|
|
++ bit v25\sz, v2\sz, v5\sz
|
|
|
|
++
|
|
|
|
++ // If no pixels need flat8in, jump to flat8out
|
|
|
|
++ // (or to a writeout of the inner 4 pixels, for wd=8)
|
|
|
|
++.if \wd >= 8
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ b.eq 6f
|
|
|
|
++.else
|
|
|
|
++ cbz x5, 6f
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ // flat8in
|
|
|
|
++ uaddl_sz \tmp1\().8h, \tmp2\().8h, v20, v21, \sz
|
|
|
|
++ uaddl_sz \tmp3\().8h, \tmp4\().8h, v22, v25, \sz
|
|
|
|
++ uaddl_sz \tmp5\().8h, \tmp6\().8h, v20, v22, \sz
|
|
|
|
++ uaddl_sz \tmp7\().8h, \tmp8\().8h, v23, v26, \sz
|
|
|
|
++ add_sz v0.8h, v1.8h, \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
|
|
|
|
++ uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v23, \sz
|
|
|
|
++ uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v24, \sz
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp5\().8h, \tmp6\().8h, \sz
|
|
|
|
++ sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
|
|
|
|
++ sub_sz \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
|
|
|
|
++ rshrn_sz v2, v0.8h, v1.8h, #3, \sz // out p2
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz
|
|
|
|
++ uaddl_sz \tmp1\().8h, \tmp2\().8h, v20, v23, \sz
|
|
|
|
++ uaddl_sz \tmp3\().8h, \tmp4\().8h, v24, v27, \sz
|
|
|
|
++ rshrn_sz v3, v0.8h, v1.8h, #3, \sz // out p1
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp7\().8h, \tmp8\().8h, \sz
|
|
|
|
++ sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
|
|
|
|
++ uaddl_sz \tmp5\().8h, \tmp6\().8h, v21, v24, \sz
|
|
|
|
++ uaddl_sz \tmp7\().8h, \tmp8\().8h, v25, v27, \sz
|
|
|
|
++ rshrn_sz v4, v0.8h, v1.8h, #3, \sz // out p0
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz
|
|
|
|
++ sub_sz \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
|
|
|
|
++ uaddl_sz \tmp1\().8h, \tmp2\().8h, v22, v25, \sz
|
|
|
|
++ uaddl_sz \tmp3\().8h, \tmp4\().8h, v26, v27, \sz
|
|
|
|
++ rshrn_sz v5, v0.8h, v1.8h, #3, \sz // out q0
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp7\().8h, \tmp8\().8h, \sz
|
|
|
|
++ sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
|
|
|
|
++ rshrn_sz \tmp5, v0.8h, v1.8h, #3, \sz // out q1
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz
|
|
|
|
++ // The output here is written back into the input registers. This doesn't
|
|
|
|
++ // matter for the flat8out part below, since we only update those pixels
|
|
|
|
++ // which won't be touched below.
|
|
|
|
++ bit v21\sz, v2\sz, v6\sz
|
|
|
|
++ bit v22\sz, v3\sz, v6\sz
|
|
|
|
++ bit v23\sz, v4\sz, v6\sz
|
|
|
|
++ rshrn_sz \tmp6, v0.8h, v1.8h, #3, \sz // out q2
|
|
|
|
++ bit v24\sz, v5\sz, v6\sz
|
|
|
|
++ bit v25\sz, \tmp5\sz, v6\sz
|
|
|
|
++ bit v26\sz, \tmp6\sz, v6\sz
|
|
|
|
++.endif
|
|
|
|
++.if \wd == 16
|
|
|
|
++6:
|
|
|
|
++ orr v2\sz, v6\sz, v7\sz // (fm && flat8in && !flat8out) || (flat8out && flat8in && fm)
|
|
|
|
++ mov x5, v2.d[0]
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ mov x6, v2.d[1]
|
|
|
|
++ adds x5, x5, x6
|
|
|
|
++ b.ne 1f
|
|
|
|
++.else
|
|
|
|
++ cbnz x5, 1f
|
|
|
|
++.endif
|
|
|
|
++ // If no pixels needed flat8in nor flat8out, jump to a
|
|
|
|
++ // writeout of the inner 4 pixels
|
|
|
|
++ br x14 // x14: label 7 of the caller, set by the loop_filter_16/loop_filter_16_16b macros
|
|
|
|
++1:
|
|
|
|
++
|
|
|
|
++ mov x5, v7.d[0] // low 8 bytes of the flat8out mask
|
|
|
|
++.ifc \sz, .16b
|
|
|
|
++ mov x6, v7.d[1] // high 8 bytes of the flat8out mask
|
|
|
|
++ adds x5, x5, x6
|
|
|
|
++ b.ne 1f
|
|
|
|
++.else
|
|
|
|
++ cbnz x5, 1f
|
|
|
|
++.endif
|
|
|
|
++ // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
|
|
|
|
++ br x15 // x15: label 8 of the caller, set by the loop_filter_16/loop_filter_16_16b macros
|
|
|
|
++
|
|
|
|
++1:
|
|
|
|
++ // flat8out
|
|
|
|
++ // This writes all outputs into v2-v17 (skipping v7 and v16).
|
|
|
|
++ // If this part is skipped, the output is read from v21-v26 (which is the input
|
|
|
|
++ // to this section).
|
|
|
|
++ ushll_sz v0.8h, v1.8h, v16, #3, \sz // 8 * v16
|
|
|
|
++ usubw_sz v0.8h, v1.8h, v0.8h, v1.8h, v16, \sz // 7 * v16
|
|
|
|
++ uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v17, \sz
|
|
|
|
++ uaddl_sz v8.8h, v9.8h, v17, v18, \sz
|
|
|
|
++ uaddl_sz v10.8h, v11.8h, v19, v20, \sz
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v8.8h, v9.8h, \sz
|
|
|
|
++ uaddl_sz v8.8h, v9.8h, v16, v17, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v21, v22, \sz
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
|
|
|
|
++ uaddl_sz v10.8h, v11.8h, v18, v25, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v23, v24, \sz
|
|
|
|
++ sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v16, v18, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v19, v26, \sz
|
|
|
|
++ rshrn_sz v2, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
|
|
|
|
++ uaddl_sz v8.8h, v9.8h, v16, v19, \sz
|
|
|
|
++ uaddl_sz v10.8h, v11.8h, v20, v27, \sz
|
|
|
|
++ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ bif v2\sz, v17\sz, v7\sz
|
|
|
|
++ rshrn_sz v3, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v16, v20, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v21, v28, \sz
|
|
|
|
++ sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz
|
|
|
|
++ bif v3\sz, v18\sz, v7\sz
|
|
|
|
++ rshrn_sz v4, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
|
|
|
|
++ uaddl_sz v8.8h, v9.8h, v16, v21, \sz
|
|
|
|
++ uaddl_sz v10.8h, v11.8h, v22, v29, \sz
|
|
|
|
++ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ bif v4\sz, v19\sz, v7\sz
|
|
|
|
++ rshrn_sz v5, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v16, v22, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v23, v30, \sz
|
|
|
|
++ sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz
|
|
|
|
++ bif v5\sz, v20\sz, v7\sz
|
|
|
|
++ rshrn_sz v6, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
|
|
|
|
++ uaddl_sz v10.8h, v11.8h, v16, v23, \sz
|
|
|
|
++ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v24, v31, \sz
|
|
|
|
++ bif v6\sz, v21\sz, v7\sz
|
|
|
|
++ rshrn_sz v8, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ sub_sz v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v17, v24, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v25, v31, \sz
|
|
|
|
++ bif v8\sz, v22\sz, v7\sz
|
|
|
|
++ rshrn_sz v9, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
|
|
|
|
++ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v26, v31, \sz
|
|
|
|
++ bif v9\sz, v23\sz, v7\sz
|
|
|
|
++ rshrn_sz v10, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v18, v25, \sz
|
|
|
|
++ uaddl_sz v18.8h, v19.8h, v19, v26, \sz
|
|
|
|
++ sub_sz v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v27, v31, \sz
|
|
|
|
++ bif v10\sz, v24\sz, v7\sz
|
|
|
|
++ rshrn_sz v11, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ uaddl_sz v12.8h, v13.8h, v20, v27, \sz
|
|
|
|
++ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
|
|
|
|
++ uaddl_sz v18.8h, v19.8h, v28, v31, \sz
|
|
|
|
++ bif v11\sz, v25\sz, v7\sz
|
|
|
|
++ sub_sz v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
|
|
|
|
++ rshrn_sz v12, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v14.8h, v15.8h, v21, v28, \sz
|
|
|
|
++ uaddl_sz v20.8h, v21.8h, v29, v31, \sz
|
|
|
|
++ bif v12\sz, v26\sz, v7\sz
|
|
|
|
++ rshrn_sz v13, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v18.8h, v19.8h, \sz
|
|
|
|
++ sub_sz v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
|
|
|
|
++ uaddl_sz v18.8h, v19.8h, v22, v29, \sz
|
|
|
|
++ uaddl_sz v22.8h, v23.8h, v30, v31, \sz
|
|
|
|
++ bif v13\sz, v27\sz, v7\sz
|
|
|
|
++ rshrn_sz v14, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v20.8h, v21.8h, \sz
|
|
|
|
++ sub_sz v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
|
|
|
|
++ bif v14\sz, v28\sz, v7\sz
|
|
|
|
++ rshrn_sz v15, v0.8h, v1.8h, #4, \sz
|
|
|
|
++
|
|
|
|
++ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v22.8h, v23.8h, \sz
|
|
|
|
++ bif v15\sz, v29\sz, v7\sz
|
|
|
|
++ rshrn_sz v17, v0.8h, v1.8h, #4, \sz
|
|
|
|
++ bif v17\sz, v30\sz, v7\sz
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
|
|
|
|
++// while we need those for inputs/outputs in wd=16 and use v8-v15
|
|
|
|
++// for temp registers there instead.
|
|
|
|
++function vp9_loop_filter_4
|
|
|
|
++ loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 // wd=4, 8 byte vectors, no mix
|
|
|
|
++ ret // filtering done: back to the bl site for the writeback
|
|
|
|
++9:
|
|
|
|
++ br x10 // nothing to filter: leave via x10, where the public functions saved their x30
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_4_16b_mix_44
|
|
|
|
++ loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31 // wd=4, 16 byte vectors; nonzero mix selects the per-half E/I/H setup
|
|
|
|
++ ret // filtering done: back to the bl site for the writeback
|
|
|
|
++9:
|
|
|
|
++ br x10 // nothing to filter: leave via x10, where the public functions saved their x30
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_8
|
|
|
|
++ loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 // wd=8, 8 byte vectors, no mix
|
|
|
|
++ ret // flat8in was applied: back to the bl site for the 6 pixel writeback
|
|
|
|
++6:
|
|
|
|
++ br x13 // no pixel needed flat8in: x13 is set by the loop_filter_8 macro (label 6 of the caller)
|
|
|
|
++9:
|
|
|
|
++ br x10 // nothing to filter: leave via x10, where the public functions saved their x30
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_8_16b_mix
|
|
|
|
++ loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31 // shared by mixes 48/84/88: the macro only tests mix != 0, the halves are selected via x11
|
|
|
|
++ ret // flat8in was applied: back to the bl site for the 6 pixel writeback
|
|
|
|
++6:
|
|
|
|
++ br x13 // no pixel needed flat8in: x13 is set by the loop_filter_8_16b_mix macro (label 6 of the caller)
|
|
|
|
++9:
|
|
|
|
++ br x10 // nothing to filter: leave via x10, where the public functions saved their x30
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_16
|
|
|
|
++ loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15 // wd=16, 8 byte vectors; v8-v15 serve as temporaries
|
|
|
|
++ ret // flat8out was applied: back to the bl site (the macro leaves via x14/x15 otherwise)
|
|
|
|
++9:
|
|
|
|
++ ldp d8, d9, [sp], 0x10 // nothing to filter: pop d8-d15 (presumably pushed by the callers, not visible here)
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x10 // leave via x10 without any writeback
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function vp9_loop_filter_16_16b
|
|
|
|
++ loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15 // wd=16, 16 byte vectors; v8-v15 serve as temporaries
|
|
|
|
++ ret // flat8out was applied: back to the bl site (the macro leaves via x14/x15 otherwise)
|
|
|
|
++9:
|
|
|
|
++ ldp d8, d9, [sp], 0x10 // nothing to filter: pop d8-d15 (presumably pushed by the callers, not visible here)
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x10 // leave via x10 without any writeback
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro loop_filter_4
|
|
|
|
++ bl vp9_loop_filter_4 // overwrites x30; the function may also leave via x10 instead of returning here
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_4_16b_mix mix
|
|
|
|
++ bl vp9_loop_filter_4_16b_mix_\mix // overwrites x30; the function may also leave via x10 instead of returning here
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_8
|
|
|
|
++ // calculate alternative 'return' targets
|
|
|
|
++ adr x13, 6f // label 6 of the expanding function: writeout of the inner 4 pixels only
|
|
|
|
++ bl vp9_loop_filter_8 // returns here, or leaves via x13 or x10
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_8_16b_mix mix
|
|
|
|
++ // calculate alternative 'return' targets
|
|
|
|
++ adr x13, 6f // label 6 of the expanding function: writeout of the inner 4 pixels only
|
|
|
|
++.if \mix == 48
|
|
|
|
++ mov x11, #0xffffffff00000000 // flat8in allowed for the second 8 pixels only
|
|
|
|
++.elseif \mix == 84
|
|
|
|
++ mov x11, #0x00000000ffffffff // flat8in allowed for the first 8 pixels only
|
|
|
|
++.else
|
|
|
|
++ mov x11, #0xffffffffffffffff // flat8in allowed for all 16 pixels
|
|
|
|
++.endif
|
|
|
|
++ bl vp9_loop_filter_8_16b_mix // returns here, or leaves via x13 or x10
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_16
|
|
|
|
++ // calculate alternative 'return' targets
|
|
|
|
++ adr x14, 7f // label 7 of the expanding function: writeout of the inner 4 pixels
|
|
|
|
++ adr x15, 8f // label 8 of the expanding function: writeout of the inner 6 pixels
|
|
|
|
++ bl vp9_loop_filter_16 // returns here, or leaves via x14, x15 or x10
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro loop_filter_16_16b
|
|
|
|
++ // calculate alternative 'return' targets
|
|
|
|
++ adr x14, 7f // label 7 of the expanding function: writeout of the inner 4 pixels
|
|
|
|
++ adr x15, 8f // label 8 of the expanding function: writeout of the inner 6 pixels
|
|
|
|
++ bl vp9_loop_filter_16_16b // returns here, or leaves via x14, x15 or x10
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// The public functions in this file have the following signature:
|
|
|
|
++// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_v_4_8_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_4 overwrites x30
|
|
|
|
++ sub x9, x0, x1, lsl #2 // x9 = dst - 4 * stride
|
|
|
|
++ ld1 {v20.8b}, [x9], x1 // p3
|
|
|
|
++ ld1 {v24.8b}, [x0], x1 // q0
|
|
|
|
++ ld1 {v21.8b}, [x9], x1 // p2
|
|
|
|
++ ld1 {v25.8b}, [x0], x1 // q1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1 // p1
|
|
|
|
++ ld1 {v26.8b}, [x0], x1 // q2
|
|
|
|
++ ld1 {v23.8b}, [x9], x1 // p0
|
|
|
|
++ ld1 {v27.8b}, [x0], x1 // q3
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back at the q0 row
|
|
|
|
++ sub x9, x9, x1, lsl #1 // x9 back at the p1 row
|
|
|
|
++
|
|
|
|
++ loop_filter_4
|
|
|
|
++
|
|
|
|
++ st1 {v22.8b}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.8b}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.8b}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.8b}, [x0], x1 // q1
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_v_44_16_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_4_16b_mix overwrites x30
|
|
|
|
++ sub x9, x0, x1, lsl #2 // x9 = dst - 4 * stride
|
|
|
|
++ ld1 {v20.16b}, [x9], x1 // p3
|
|
|
|
++ ld1 {v24.16b}, [x0], x1 // q0
|
|
|
|
++ ld1 {v21.16b}, [x9], x1 // p2
|
|
|
|
++ ld1 {v25.16b}, [x0], x1 // q1
|
|
|
|
++ ld1 {v22.16b}, [x9], x1 // p1
|
|
|
|
++ ld1 {v26.16b}, [x0], x1 // q2
|
|
|
|
++ ld1 {v23.16b}, [x9], x1 // p0
|
|
|
|
++ ld1 {v27.16b}, [x0], x1 // q3
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back at the q0 row
|
|
|
|
++ sub x9, x9, x1, lsl #1 // x9 back at the p1 row
|
|
|
|
++
|
|
|
|
++ loop_filter_4_16b_mix 44
|
|
|
|
++
|
|
|
|
++ st1 {v22.16b}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.16b}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.16b}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.16b}, [x0], x1 // q1
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_h_4_8_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_4 overwrites x30
|
|
|
|
++ sub x9, x0, #4 // x9 = 4 pixels left of dst, rows 0-3
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, rows 4-7
|
|
|
|
++ ld1 {v20.8b}, [x9], x1
|
|
|
|
++ ld1 {v24.8b}, [x0], x1
|
|
|
|
++ ld1 {v21.8b}, [x9], x1
|
|
|
|
++ ld1 {v25.8b}, [x0], x1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1
|
|
|
|
++ ld1 {v26.8b}, [x0], x1
|
|
|
|
++ ld1 {v23.8b}, [x9], x1
|
|
|
|
++ ld1 {v27.8b}, [x0], x1
|
|
|
|
++
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back at row 0
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back at row 4
|
|
|
|
++ // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
|
|
|
|
++ // outermost 2 pixels since they aren't changed.
|
|
|
|
++ add x9, x9, #2
|
|
|
|
++ add x0, x0, #2
|
|
|
|
++
|
|
|
|
++ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ loop_filter_4
|
|
|
|
++
|
|
|
|
++ // We will only write the middle 4 pixels back; after the loop filter,
|
|
|
|
++ // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
|
|
|
|
++ // We need to transpose them to columns, done with a 4x8 transpose
|
|
|
|
++ // (which in practice is two 4x4 transposes of the two 4x4 halves
|
|
|
|
++ // of the 8x4 pixels; into 4x8 pixels).
|
|
|
|
++ transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.s}[0], [x9], x1
|
|
|
|
++ st1 {v22.s}[1], [x0], x1
|
|
|
|
++ st1 {v23.s}[0], [x9], x1
|
|
|
|
++ st1 {v23.s}[1], [x0], x1
|
|
|
|
++ st1 {v24.s}[0], [x9], x1
|
|
|
|
++ st1 {v24.s}[1], [x0], x1
|
|
|
|
++ st1 {v25.s}[0], [x9], x1
|
|
|
|
++ st1 {v25.s}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_h_44_16_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_4_16b_mix overwrites x30
|
|
|
|
++ sub x9, x0, #4 // x9 = 4 pixels left of dst, rows 0-7
|
|
|
|
++ add x0, x9, x1, lsl #3 // x0 = same column, rows 8-15
|
|
|
|
++ ld1 {v20.8b}, [x9], x1 // rows 0-7 go into the low halves
|
|
|
|
++ ld1 {v20.d}[1], [x0], x1 // rows 8-15 go into the high halves
|
|
|
|
++ ld1 {v21.8b}, [x9], x1
|
|
|
|
++ ld1 {v21.d}[1], [x0], x1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1
|
|
|
|
++ ld1 {v22.d}[1], [x0], x1
|
|
|
|
++ ld1 {v23.8b}, [x9], x1
|
|
|
|
++ ld1 {v23.d}[1], [x0], x1
|
|
|
|
++ ld1 {v24.8b}, [x9], x1
|
|
|
|
++ ld1 {v24.d}[1], [x0], x1
|
|
|
|
++ ld1 {v25.8b}, [x9], x1
|
|
|
|
++ ld1 {v25.d}[1], [x0], x1
|
|
|
|
++ ld1 {v26.8b}, [x9], x1
|
|
|
|
++ ld1 {v26.d}[1], [x0], x1
|
|
|
|
++ ld1 {v27.8b}, [x9], x1
|
|
|
|
++ ld1 {v27.d}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ sub x9, x9, x1, lsl #3 // x9 back at row 0
|
|
|
|
++ sub x0, x0, x1, lsl #3 // x0 back at row 8
|
|
|
|
++ add x9, x9, #2 // skip the 2 leftmost pixels; only the middle 4 are written back
|
|
|
|
++ add x0, x0, #2
|
|
|
|
++
|
|
|
|
++ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ loop_filter_4_16b_mix 44
|
|
|
|
++
|
|
|
|
++ transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v22.s}[0], [x9], x1 // row 0
|
|
|
|
++ st1 {v22.s}[2], [x0], x1 // row 8
|
|
|
|
++ st1 {v23.s}[0], [x9], x1
|
|
|
|
++ st1 {v23.s}[2], [x0], x1
|
|
|
|
++ st1 {v24.s}[0], [x9], x1
|
|
|
|
++ st1 {v24.s}[2], [x0], x1
|
|
|
|
++ st1 {v25.s}[0], [x9], x1
|
|
|
|
++ st1 {v25.s}[2], [x0], x1
|
|
|
|
++ st1 {v22.s}[1], [x9], x1 // row 4
|
|
|
|
++ st1 {v22.s}[3], [x0], x1 // row 12
|
|
|
|
++ st1 {v23.s}[1], [x9], x1
|
|
|
|
++ st1 {v23.s}[3], [x0], x1
|
|
|
|
++ st1 {v24.s}[1], [x9], x1
|
|
|
|
++ st1 {v24.s}[3], [x0], x1
|
|
|
|
++ st1 {v25.s}[1], [x9], x1
|
|
|
|
++ st1 {v25.s}[3], [x0], x1
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_v_8_8_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_8 overwrites x30
|
|
|
|
++ sub x9, x0, x1, lsl #2 // x9 = dst - 4 * stride
|
|
|
|
++ ld1 {v20.8b}, [x9], x1 // p3
|
|
|
|
++ ld1 {v24.8b}, [x0], x1 // q0
|
|
|
|
++ ld1 {v21.8b}, [x9], x1 // p2
|
|
|
|
++ ld1 {v25.8b}, [x0], x1 // q1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1 // p1
|
|
|
|
++ ld1 {v26.8b}, [x0], x1 // q2
|
|
|
|
++ ld1 {v23.8b}, [x9], x1 // p0
|
|
|
|
++ ld1 {v27.8b}, [x0], x1 // q3
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back at the p3 row
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back at the q0 row
|
|
|
|
++ add x9, x9, x1 // x9 = p2 row, the first row that is written back
|
|
|
|
++
|
|
|
|
++ loop_filter_8
|
|
|
|
++
|
|
|
|
++ st1 {v21.8b}, [x9], x1 // p2
|
|
|
|
++ st1 {v24.8b}, [x0], x1 // q0
|
|
|
|
++ st1 {v22.8b}, [x9], x1 // p1
|
|
|
|
++ st1 {v25.8b}, [x0], x1 // q1
|
|
|
|
++ st1 {v23.8b}, [x9], x1 // p0
|
|
|
|
++ st1 {v26.8b}, [x0], x1 // q2
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++6:
|
|
|
|
++ sub x9, x0, x1, lsl #1 // reached via x13 when no pixel needed flat8in: x9 = p1 row
|
|
|
|
++ st1 {v22.8b}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.8b}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.8b}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.8b}, [x0], x1 // q1
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro mix_v_16 mix
|
|
|
|
++function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_8_16b_mix overwrites x30
|
|
|
|
++ sub x9, x0, x1, lsl #2 // x9 = dst - 4 * stride
|
|
|
|
++ ld1 {v20.16b}, [x9], x1 // p3
|
|
|
|
++ ld1 {v24.16b}, [x0], x1 // q0
|
|
|
|
++ ld1 {v21.16b}, [x9], x1 // p2
|
|
|
|
++ ld1 {v25.16b}, [x0], x1 // q1
|
|
|
|
++ ld1 {v22.16b}, [x9], x1 // p1
|
|
|
|
++ ld1 {v26.16b}, [x0], x1 // q2
|
|
|
|
++ ld1 {v23.16b}, [x9], x1 // p0
|
|
|
|
++ ld1 {v27.16b}, [x0], x1 // q3
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back at the p3 row
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back at the q0 row
|
|
|
|
++ add x9, x9, x1 // x9 = p2 row, the first row that is written back
|
|
|
|
++
|
|
|
|
++ loop_filter_8_16b_mix \mix
|
|
|
|
++
|
|
|
|
++ st1 {v21.16b}, [x9], x1 // p2
|
|
|
|
++ st1 {v24.16b}, [x0], x1 // q0
|
|
|
|
++ st1 {v22.16b}, [x9], x1 // p1
|
|
|
|
++ st1 {v25.16b}, [x0], x1 // q1
|
|
|
|
++ st1 {v23.16b}, [x9], x1 // p0
|
|
|
|
++ st1 {v26.16b}, [x0], x1 // q2
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++6:
|
|
|
|
++ sub x9, x0, x1, lsl #1 // reached via x13 when no pixel needed flat8in: x9 = p1 row
|
|
|
|
++ st1 {v22.16b}, [x9], x1 // p1
|
|
|
|
++ st1 {v24.16b}, [x0], x1 // q0
|
|
|
|
++ st1 {v23.16b}, [x9], x1 // p0
|
|
|
|
++ st1 {v25.16b}, [x0], x1 // q1
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++mix_v_16 48 // defines ff_vp9_loop_filter_v_48_16_neon
|
|
|
|
++mix_v_16 84 // defines ff_vp9_loop_filter_v_84_16_neon
|
|
|
|
++mix_v_16 88 // defines ff_vp9_loop_filter_v_88_16_neon
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_h_8_8_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address; the bl in loop_filter_8 overwrites x30
|
|
|
|
++ sub x9, x0, #4 // x9 = 4 pixels left of dst, rows 0-3
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = same column, rows 4-7
|
|
|
|
++ ld1 {v20.8b}, [x9], x1
|
|
|
|
++ ld1 {v24.8b}, [x0], x1
|
|
|
|
++ ld1 {v21.8b}, [x9], x1
|
|
|
|
++ ld1 {v25.8b}, [x0], x1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1
|
|
|
|
++ ld1 {v26.8b}, [x0], x1
|
|
|
|
++ ld1 {v23.8b}, [x9], x1
|
|
|
|
++ ld1 {v27.8b}, [x0], x1
|
|
|
|
++
|
|
|
|
++ sub x9, x9, x1, lsl #2 // x9 back at row 0
|
|
|
|
++ sub x0, x0, x1, lsl #2 // x0 back at row 4
|
|
|
|
++
|
|
|
|
++ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ loop_filter_8
|
|
|
|
++
|
|
|
|
++ // Even though only 6 pixels per row have been changed, we write the
|
|
|
|
++ // full 8 pixel registers.
|
|
|
|
++ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v20.8b}, [x9], x1
|
|
|
|
++ st1 {v24.8b}, [x0], x1
|
|
|
|
++ st1 {v21.8b}, [x9], x1
|
|
|
|
++ st1 {v25.8b}, [x0], x1
|
|
|
|
++ st1 {v22.8b}, [x9], x1
|
|
|
|
++ st1 {v26.8b}, [x0], x1
|
|
|
|
++ st1 {v23.8b}, [x9], x1
|
|
|
|
++ st1 {v27.8b}, [x0], x1
|
|
|
|
++
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++6:
|
|
|
|
++ // If we didn't need to do the flat8in part, we use the same writeback
|
|
|
|
++ // as in loop_filter_h_4_8.
|
|
|
|
++ add x9, x9, #2 // skip the 2 leftmost pixels, which are unchanged
|
|
|
|
++ add x0, x0, #2
|
|
|
|
++ transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.s}[0], [x9], x1
|
|
|
|
++ st1 {v22.s}[1], [x0], x1
|
|
|
|
++ st1 {v23.s}[0], [x9], x1
|
|
|
|
++ st1 {v23.s}[1], [x0], x1
|
|
|
|
++ st1 {v24.s}[0], [x9], x1
|
|
|
|
++ st1 {v24.s}[1], [x0], x1
|
|
|
|
++ st1 {v25.s}[0], [x9], x1
|
|
|
|
++ st1 {v25.s}[1], [x0], x1
|
|
|
|
++ br x10 // return to the caller
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++.macro mix_h_16 mix // emits ff_vp9_loop_filter_h_<mix>_16_neon: 16-row variant that runs loop_filter_8_16b_mix on an 8x16 byte block
|
|
|
|
++function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
|
|
|
|
++ mov x10, x30 // keep the return address in x10; both exits below use "br x10"
|
|
|
|
++ sub x9, x0, #4 // x9 = rows 0-7, starting 4 bytes before x0
|
|
|
|
++ add x0, x9, x1, lsl #3 // x0 = x9 + 8 * x1: rows 8-15
|
|
|
|
++ ld1 {v20.8b}, [x9], x1 // rows 0-7 fill the low 8 bytes of v20-v27
|
|
|
|
++ ld1 {v20.d}[1], [x0], x1 // rows 8-15 fill the high 8 bytes
|
|
|
|
++ ld1 {v21.8b}, [x9], x1
|
|
|
|
++ ld1 {v21.d}[1], [x0], x1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1
|
|
|
|
++ ld1 {v22.d}[1], [x0], x1
|
|
|
|
++ ld1 {v23.8b}, [x9], x1
|
|
|
|
++ ld1 {v23.d}[1], [x0], x1
|
|
|
|
++ ld1 {v24.8b}, [x9], x1
|
|
|
|
++ ld1 {v24.d}[1], [x0], x1
|
|
|
|
++ ld1 {v25.8b}, [x9], x1
|
|
|
|
++ ld1 {v25.d}[1], [x0], x1
|
|
|
|
++ ld1 {v26.8b}, [x9], x1
|
|
|
|
++ ld1 {v26.d}[1], [x0], x1
|
|
|
|
++ ld1 {v27.8b}, [x9], x1
|
|
|
|
++ ld1 {v27.d}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ sub x9, x9, x1, lsl #3 // rewind both pointers by the 8 rows just loaded
|
|
|
|
++ sub x0, x0, x1, lsl #3
|
|
|
|
++
|
|
|
|
++ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ loop_filter_8_16b_mix \mix
|
|
|
|
++
|
|
|
|
++ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v20.8b}, [x9], x1
|
|
|
|
++ st1 {v20.d}[1], [x0], x1
|
|
|
|
++ st1 {v21.8b}, [x9], x1
|
|
|
|
++ st1 {v21.d}[1], [x0], x1
|
|
|
|
++ st1 {v22.8b}, [x9], x1
|
|
|
|
++ st1 {v22.d}[1], [x0], x1
|
|
|
|
++ st1 {v23.8b}, [x9], x1
|
|
|
|
++ st1 {v23.d}[1], [x0], x1
|
|
|
|
++ st1 {v24.8b}, [x9], x1
|
|
|
|
++ st1 {v24.d}[1], [x0], x1
|
|
|
|
++ st1 {v25.8b}, [x9], x1
|
|
|
|
++ st1 {v25.d}[1], [x0], x1
|
|
|
|
++ st1 {v26.8b}, [x9], x1
|
|
|
|
++ st1 {v26.d}[1], [x0], x1
|
|
|
|
++ st1 {v27.8b}, [x9], x1
|
|
|
|
++ st1 {v27.d}[1], [x0], x1
|
|
|
|
++
|
|
|
|
++ br x10
|
|
|
|
++6: // NOTE(review): no branch to this label is visible here; presumably reached from inside loop_filter_8_16b_mix - confirm
|
|
|
|
++ add x9, x9, #2 // skip 2 more bytes: only 4 bytes per row are stored below
|
|
|
|
++ add x0, x0, #2
|
|
|
|
++ transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.s}[0], [x9], x1 // lanes 0 and 2 feed the first four rows of each half
|
|
|
|
++ st1 {v22.s}[2], [x0], x1
|
|
|
|
++ st1 {v23.s}[0], [x9], x1
|
|
|
|
++ st1 {v23.s}[2], [x0], x1
|
|
|
|
++ st1 {v24.s}[0], [x9], x1
|
|
|
|
++ st1 {v24.s}[2], [x0], x1
|
|
|
|
++ st1 {v25.s}[0], [x9], x1
|
|
|
|
++ st1 {v25.s}[2], [x0], x1
|
|
|
|
++ st1 {v22.s}[1], [x9], x1 // lanes 1 and 3 feed the next four rows of each half
|
|
|
|
++ st1 {v22.s}[3], [x0], x1
|
|
|
|
++ st1 {v23.s}[1], [x9], x1
|
|
|
|
++ st1 {v23.s}[3], [x0], x1
|
|
|
|
++ st1 {v24.s}[1], [x9], x1
|
|
|
|
++ st1 {v24.s}[3], [x0], x1
|
|
|
|
++ st1 {v25.s}[1], [x9], x1
|
|
|
|
++ st1 {v25.s}[3], [x0], x1
|
|
|
|
++ br x10
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++mix_h_16 48 // ff_vp9_loop_filter_h_48_16_neon
|
|
|
|
++mix_h_16 84 // ff_vp9_loop_filter_h_84_16_neon
|
|
|
|
++mix_h_16 88 // ff_vp9_loop_filter_h_88_16_neon
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_v_16_8_neon, export=1 // loads 16 rows of 8 bytes (8 rows before x0, 8 rows from x0 on), runs loop_filter_16 and stores the modified rows
|
|
|
|
++ mov x10, x30 // keep the return address in x10; the exit at label 9 uses "br x10"
|
|
|
|
++ stp d14, d15, [sp, #-0x10]! // spill d8-d15 to the stack; reloaded at label 9
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++ sub x9, x0, x1, lsl #3 // x9 = x0 - 8 * x1: first row before the edge
|
|
|
|
++ ld1 {v16.8b}, [x9], x1 // p7
|
|
|
|
++ ld1 {v24.8b}, [x0], x1 // q0
|
|
|
|
++ ld1 {v17.8b}, [x9], x1 // p6
|
|
|
|
++ ld1 {v25.8b}, [x0], x1 // q1
|
|
|
|
++ ld1 {v18.8b}, [x9], x1 // p5
|
|
|
|
++ ld1 {v26.8b}, [x0], x1 // q2
|
|
|
|
++ ld1 {v19.8b}, [x9], x1 // p4
|
|
|
|
++ ld1 {v27.8b}, [x0], x1 // q3
|
|
|
|
++ ld1 {v20.8b}, [x9], x1 // p3
|
|
|
|
++ ld1 {v28.8b}, [x0], x1 // q4
|
|
|
|
++ ld1 {v21.8b}, [x9], x1 // p2
|
|
|
|
++ ld1 {v29.8b}, [x0], x1 // q5
|
|
|
|
++ ld1 {v22.8b}, [x9], x1 // p1
|
|
|
|
++ ld1 {v30.8b}, [x0], x1 // q6
|
|
|
|
++ ld1 {v23.8b}, [x9], x1 // p0
|
|
|
|
++ ld1 {v31.8b}, [x0], x1 // q7
|
|
|
|
++ sub x9, x9, x1, lsl #3 // rewind both pointers by the 8 rows just loaded
|
|
|
|
++ sub x0, x0, x1, lsl #3
|
|
|
|
++ add x9, x9, x1 // x9 = x0 - 7 * x1 (row p6), the first row the full writeback stores
|
|
|
|
++
|
|
|
|
++ loop_filter_16
|
|
|
|
++
|
|
|
|
++ // If we did the flat8out part, we get the output in
|
|
|
|
++ // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
|
|
|
|
++ // store v2-v9 there, and v10-v17 into x0.
|
|
|
|
++ st1 {v2.8b}, [x9], x1
|
|
|
|
++ st1 {v10.8b}, [x0], x1
|
|
|
|
++ st1 {v3.8b}, [x9], x1
|
|
|
|
++ st1 {v11.8b}, [x0], x1
|
|
|
|
++ st1 {v4.8b}, [x9], x1
|
|
|
|
++ st1 {v12.8b}, [x0], x1
|
|
|
|
++ st1 {v5.8b}, [x9], x1
|
|
|
|
++ st1 {v13.8b}, [x0], x1
|
|
|
|
++ st1 {v6.8b}, [x9], x1
|
|
|
|
++ st1 {v14.8b}, [x0], x1
|
|
|
|
++ st1 {v8.8b}, [x9], x1
|
|
|
|
++ st1 {v15.8b}, [x0], x1
|
|
|
|
++ st1 {v9.8b}, [x9], x1
|
|
|
|
++ st1 {v17.8b}, [x0], x1
|
|
|
|
++9: // common exit: reload d8-d15 in reverse push order and return
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x10
|
|
|
|
++8: // NOTE(review): no branch to labels 8/7 is visible here; presumably reached from inside loop_filter_16 - confirm
|
|
|
|
++ add x9, x9, x1, lsl #2 // x9 += 4 * x1: from row p6 to row p2 (x0 - 3 * x1)
|
|
|
|
++ // If we didn't do the flat8out part, the output is left in the
|
|
|
|
++ // input registers.
|
|
|
|
++ st1 {v21.8b}, [x9], x1
|
|
|
|
++ st1 {v24.8b}, [x0], x1
|
|
|
|
++ st1 {v22.8b}, [x9], x1
|
|
|
|
++ st1 {v25.8b}, [x0], x1
|
|
|
|
++ st1 {v23.8b}, [x9], x1
|
|
|
|
++ st1 {v26.8b}, [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++7:
|
|
|
|
++ sub x9, x0, x1, lsl #1 // x9 = x0 - 2 * x1: only p1, p0, q0, q1 are stored
|
|
|
|
++ st1 {v22.8b}, [x9], x1
|
|
|
|
++ st1 {v24.8b}, [x0], x1
|
|
|
|
++ st1 {v23.8b}, [x9], x1
|
|
|
|
++ st1 {v25.8b}, [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_v_16_16_neon, export=1 // loads 16 rows of 16 bytes (8 rows before x0, 8 rows from x0 on), runs loop_filter_16_16b and stores the modified rows
|
|
|
|
++ mov x10, x30 // keep the return address in x10; the exit at label 9 uses "br x10"
|
|
|
|
++ stp d14, d15, [sp, #-0x10]! // spill d8-d15 to the stack; reloaded at label 9
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++ sub x9, x0, x1, lsl #3 // x9 = x0 - 8 * x1: first row before the edge
|
|
|
|
++ ld1 {v16.16b}, [x9], x1 // p7
|
|
|
|
++ ld1 {v24.16b}, [x0], x1 // q0
|
|
|
|
++ ld1 {v17.16b}, [x9], x1 // p6
|
|
|
|
++ ld1 {v25.16b}, [x0], x1 // q1
|
|
|
|
++ ld1 {v18.16b}, [x9], x1 // p5
|
|
|
|
++ ld1 {v26.16b}, [x0], x1 // q2
|
|
|
|
++ ld1 {v19.16b}, [x9], x1 // p4
|
|
|
|
++ ld1 {v27.16b}, [x0], x1 // q3
|
|
|
|
++ ld1 {v20.16b}, [x9], x1 // p3
|
|
|
|
++ ld1 {v28.16b}, [x0], x1 // q4
|
|
|
|
++ ld1 {v21.16b}, [x9], x1 // p2
|
|
|
|
++ ld1 {v29.16b}, [x0], x1 // q5
|
|
|
|
++ ld1 {v22.16b}, [x9], x1 // p1
|
|
|
|
++ ld1 {v30.16b}, [x0], x1 // q6
|
|
|
|
++ ld1 {v23.16b}, [x9], x1 // p0
|
|
|
|
++ ld1 {v31.16b}, [x0], x1 // q7
|
|
|
|
++ sub x9, x9, x1, lsl #3 // rewind both pointers by the 8 rows just loaded
|
|
|
|
++ sub x0, x0, x1, lsl #3
|
|
|
|
++ add x9, x9, x1 // x9 = x0 - 7 * x1 (row p6), the first row the full writeback stores
|
|
|
|
++
|
|
|
|
++ loop_filter_16_16b
|
|
|
|
++
|
|
|
|
++ st1 {v2.16b}, [x9], x1 // full writeback: 7 rows from x9 and 7 rows from x0
|
|
|
|
++ st1 {v10.16b}, [x0], x1
|
|
|
|
++ st1 {v3.16b}, [x9], x1
|
|
|
|
++ st1 {v11.16b}, [x0], x1
|
|
|
|
++ st1 {v4.16b}, [x9], x1
|
|
|
|
++ st1 {v12.16b}, [x0], x1
|
|
|
|
++ st1 {v5.16b}, [x9], x1
|
|
|
|
++ st1 {v13.16b}, [x0], x1
|
|
|
|
++ st1 {v6.16b}, [x9], x1
|
|
|
|
++ st1 {v14.16b}, [x0], x1
|
|
|
|
++ st1 {v8.16b}, [x9], x1
|
|
|
|
++ st1 {v15.16b}, [x0], x1
|
|
|
|
++ st1 {v9.16b}, [x9], x1
|
|
|
|
++ st1 {v17.16b}, [x0], x1
|
|
|
|
++9: // common exit: reload d8-d15 in reverse push order and return
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x10
|
|
|
|
++8: // NOTE(review): no branch to labels 8/7 is visible here; presumably reached from inside loop_filter_16_16b - confirm
|
|
|
|
++ add x9, x9, x1, lsl #2 // x9 += 4 * x1: from row p6 to row p2 (x0 - 3 * x1)
|
|
|
|
++ st1 {v21.16b}, [x9], x1 // output taken from the input registers p2..q2
|
|
|
|
++ st1 {v24.16b}, [x0], x1
|
|
|
|
++ st1 {v22.16b}, [x9], x1
|
|
|
|
++ st1 {v25.16b}, [x0], x1
|
|
|
|
++ st1 {v23.16b}, [x9], x1
|
|
|
|
++ st1 {v26.16b}, [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++7:
|
|
|
|
++ sub x9, x0, x1, lsl #1 // x9 = x0 - 2 * x1: only p1, p0, q0, q1 are stored
|
|
|
|
++ st1 {v22.16b}, [x9], x1
|
|
|
|
++ st1 {v24.16b}, [x0], x1
|
|
|
|
++ st1 {v23.16b}, [x9], x1
|
|
|
|
++ st1 {v25.16b}, [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_h_16_8_neon, export=1 // loads 8 rows of 16 bytes centred on x0, transposes them, runs loop_filter_16 and writes the result back
|
|
|
|
++ mov x10, x30 // keep the return address in x10; the exit at label 9 uses "br x10"
|
|
|
|
++ stp d14, d15, [sp, #-0x10]! // spill d8-d15 to the stack; reloaded at label 9
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++ sub x9, x0, #8 // x9 = left 8 bytes of each row, x0 = right 8 bytes
|
|
|
|
++ ld1 {v16.8b}, [x9], x1
|
|
|
|
++ ld1 {v24.8b}, [x0], x1
|
|
|
|
++ ld1 {v17.8b}, [x9], x1
|
|
|
|
++ ld1 {v25.8b}, [x0], x1
|
|
|
|
++ ld1 {v18.8b}, [x9], x1
|
|
|
|
++ ld1 {v26.8b}, [x0], x1
|
|
|
|
++ ld1 {v19.8b}, [x9], x1
|
|
|
|
++ ld1 {v27.8b}, [x0], x1
|
|
|
|
++ ld1 {v20.8b}, [x9], x1
|
|
|
|
++ ld1 {v28.8b}, [x0], x1
|
|
|
|
++ ld1 {v21.8b}, [x9], x1
|
|
|
|
++ ld1 {v29.8b}, [x0], x1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1
|
|
|
|
++ ld1 {v30.8b}, [x0], x1
|
|
|
|
++ ld1 {v23.8b}, [x9], x1
|
|
|
|
++ ld1 {v31.8b}, [x0], x1
|
|
|
|
++ sub x0, x0, x1, lsl #3 // rewind both pointers by the 8 rows just loaded
|
|
|
|
++ sub x9, x9, x1, lsl #3
|
|
|
|
++
|
|
|
|
++ // The 16x8 pixels read above are in two 8x8 blocks; the left
|
|
|
|
++ // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
|
|
|
|
++ // of this, to get one column per register.
|
|
|
|
++ transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
|
|
|
++ transpose_8x8B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
|
|
|
|
++
|
|
|
|
++ loop_filter_16
|
|
|
|
++
|
|
|
|
++ transpose_8x8B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
|
|
|
|
++ transpose_8x8B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
|
|
|
|
++
|
|
|
|
++ st1 {v16.8b}, [x9], x1 // full writeback: left half via x9, right half via x0
|
|
|
|
++ st1 {v10.8b}, [x0], x1
|
|
|
|
++ st1 {v2.8b}, [x9], x1
|
|
|
|
++ st1 {v11.8b}, [x0], x1
|
|
|
|
++ st1 {v3.8b}, [x9], x1
|
|
|
|
++ st1 {v12.8b}, [x0], x1
|
|
|
|
++ st1 {v4.8b}, [x9], x1
|
|
|
|
++ st1 {v13.8b}, [x0], x1
|
|
|
|
++ st1 {v5.8b}, [x9], x1
|
|
|
|
++ st1 {v14.8b}, [x0], x1
|
|
|
|
++ st1 {v6.8b}, [x9], x1
|
|
|
|
++ st1 {v15.8b}, [x0], x1
|
|
|
|
++ st1 {v8.8b}, [x9], x1
|
|
|
|
++ st1 {v17.8b}, [x0], x1
|
|
|
|
++ st1 {v9.8b}, [x9], x1
|
|
|
|
++ st1 {v31.8b}, [x0], x1
|
|
|
|
++9: // common exit: reload d8-d15 in reverse push order and return
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x10
|
|
|
|
++8: // NOTE(review): no branch to labels 8/7 is visible here; presumably reached from inside loop_filter_16 - confirm
|
|
|
|
++ // The same writeback as in loop_filter_h_8_8
|
|
|
|
++ sub x9, x0, #4 // x9 = rows 0-3, 4 bytes before x0
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = rows 4-7 at the same column
|
|
|
|
++ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v20.8b}, [x9], x1
|
|
|
|
++ st1 {v24.8b}, [x0], x1
|
|
|
|
++ st1 {v21.8b}, [x9], x1
|
|
|
|
++ st1 {v25.8b}, [x0], x1
|
|
|
|
++ st1 {v22.8b}, [x9], x1
|
|
|
|
++ st1 {v26.8b}, [x0], x1
|
|
|
|
++ st1 {v23.8b}, [x9], x1
|
|
|
|
++ st1 {v27.8b}, [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++7:
|
|
|
|
++ // The same writeback as in loop_filter_h_4_8
|
|
|
|
++ sub x9, x0, #2 // x9 = rows 0-3, 2 bytes before x0
|
|
|
|
++ add x0, x9, x1, lsl #2 // x0 = rows 4-7 at the same column
|
|
|
|
++ transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.s}[0], [x9], x1
|
|
|
|
++ st1 {v22.s}[1], [x0], x1
|
|
|
|
++ st1 {v23.s}[0], [x9], x1
|
|
|
|
++ st1 {v23.s}[1], [x0], x1
|
|
|
|
++ st1 {v24.s}[0], [x9], x1
|
|
|
|
++ st1 {v24.s}[1], [x0], x1
|
|
|
|
++ st1 {v25.s}[0], [x9], x1
|
|
|
|
++ st1 {v25.s}[1], [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_loop_filter_h_16_16_neon, export=1 // loads 16 rows of 16 bytes centred on x0, transposes them, runs loop_filter_16_16b and writes the result back
|
|
|
|
++ mov x10, x30 // keep the return address in x10; the exit at label 9 uses "br x10"
|
|
|
|
++ stp d14, d15, [sp, #-0x10]! // spill d8-d15 to the stack; reloaded at label 9
|
|
|
|
++ stp d12, d13, [sp, #-0x10]!
|
|
|
|
++ stp d10, d11, [sp, #-0x10]!
|
|
|
|
++ stp d8, d9, [sp, #-0x10]!
|
|
|
|
++ sub x9, x0, #8 // x9 = left 8 bytes of each row, x0 = right 8 bytes
|
|
|
|
++ ld1 {v16.8b}, [x9], x1 // rows 0-7 fill the low 8 bytes of each register
|
|
|
|
++ ld1 {v24.8b}, [x0], x1
|
|
|
|
++ ld1 {v17.8b}, [x9], x1
|
|
|
|
++ ld1 {v25.8b}, [x0], x1
|
|
|
|
++ ld1 {v18.8b}, [x9], x1
|
|
|
|
++ ld1 {v26.8b}, [x0], x1
|
|
|
|
++ ld1 {v19.8b}, [x9], x1
|
|
|
|
++ ld1 {v27.8b}, [x0], x1
|
|
|
|
++ ld1 {v20.8b}, [x9], x1
|
|
|
|
++ ld1 {v28.8b}, [x0], x1
|
|
|
|
++ ld1 {v21.8b}, [x9], x1
|
|
|
|
++ ld1 {v29.8b}, [x0], x1
|
|
|
|
++ ld1 {v22.8b}, [x9], x1
|
|
|
|
++ ld1 {v30.8b}, [x0], x1
|
|
|
|
++ ld1 {v23.8b}, [x9], x1
|
|
|
|
++ ld1 {v31.8b}, [x0], x1
|
|
|
|
++ ld1 {v16.d}[1], [x9], x1 // rows 8-15 fill the high 8 bytes
|
|
|
|
++ ld1 {v24.d}[1], [x0], x1
|
|
|
|
++ ld1 {v17.d}[1], [x9], x1
|
|
|
|
++ ld1 {v25.d}[1], [x0], x1
|
|
|
|
++ ld1 {v18.d}[1], [x9], x1
|
|
|
|
++ ld1 {v26.d}[1], [x0], x1
|
|
|
|
++ ld1 {v19.d}[1], [x9], x1
|
|
|
|
++ ld1 {v27.d}[1], [x0], x1
|
|
|
|
++ ld1 {v20.d}[1], [x9], x1
|
|
|
|
++ ld1 {v28.d}[1], [x0], x1
|
|
|
|
++ ld1 {v21.d}[1], [x9], x1
|
|
|
|
++ ld1 {v29.d}[1], [x0], x1
|
|
|
|
++ ld1 {v22.d}[1], [x9], x1
|
|
|
|
++ ld1 {v30.d}[1], [x0], x1
|
|
|
|
++ ld1 {v23.d}[1], [x9], x1
|
|
|
|
++ ld1 {v31.d}[1], [x0], x1
|
|
|
|
++ sub x0, x0, x1, lsl #4 // rewind both pointers by the 16 rows just loaded
|
|
|
|
++ sub x9, x9, x1, lsl #4
|
|
|
|
++
|
|
|
|
++ transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
|
|
|
++ transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
|
|
|
|
++
|
|
|
|
++ loop_filter_16_16b
|
|
|
|
++
|
|
|
|
++ transpose_8x16B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
|
|
|
|
++ transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
|
|
|
|
++
|
|
|
|
++ st1 {v16.8b}, [x9], x1 // full writeback, rows 0-7 from the low halves
|
|
|
|
++ st1 {v10.8b}, [x0], x1
|
|
|
|
++ st1 {v2.8b}, [x9], x1
|
|
|
|
++ st1 {v11.8b}, [x0], x1
|
|
|
|
++ st1 {v3.8b}, [x9], x1
|
|
|
|
++ st1 {v12.8b}, [x0], x1
|
|
|
|
++ st1 {v4.8b}, [x9], x1
|
|
|
|
++ st1 {v13.8b}, [x0], x1
|
|
|
|
++ st1 {v5.8b}, [x9], x1
|
|
|
|
++ st1 {v14.8b}, [x0], x1
|
|
|
|
++ st1 {v6.8b}, [x9], x1
|
|
|
|
++ st1 {v15.8b}, [x0], x1
|
|
|
|
++ st1 {v8.8b}, [x9], x1
|
|
|
|
++ st1 {v17.8b}, [x0], x1
|
|
|
|
++ st1 {v9.8b}, [x9], x1
|
|
|
|
++ st1 {v31.8b}, [x0], x1
|
|
|
|
++ st1 {v16.d}[1], [x9], x1 // rows 8-15 from the high halves
|
|
|
|
++ st1 {v10.d}[1], [x0], x1
|
|
|
|
++ st1 {v2.d}[1], [x9], x1
|
|
|
|
++ st1 {v11.d}[1], [x0], x1
|
|
|
|
++ st1 {v3.d}[1], [x9], x1
|
|
|
|
++ st1 {v12.d}[1], [x0], x1
|
|
|
|
++ st1 {v4.d}[1], [x9], x1
|
|
|
|
++ st1 {v13.d}[1], [x0], x1
|
|
|
|
++ st1 {v5.d}[1], [x9], x1
|
|
|
|
++ st1 {v14.d}[1], [x0], x1
|
|
|
|
++ st1 {v6.d}[1], [x9], x1
|
|
|
|
++ st1 {v15.d}[1], [x0], x1
|
|
|
|
++ st1 {v8.d}[1], [x9], x1
|
|
|
|
++ st1 {v17.d}[1], [x0], x1
|
|
|
|
++ st1 {v9.d}[1], [x9], x1
|
|
|
|
++ st1 {v31.d}[1], [x0], x1
|
|
|
|
++9: // common exit: reload d8-d15 in reverse push order and return
|
|
|
|
++ ldp d8, d9, [sp], 0x10
|
|
|
|
++ ldp d10, d11, [sp], 0x10
|
|
|
|
++ ldp d12, d13, [sp], 0x10
|
|
|
|
++ ldp d14, d15, [sp], 0x10
|
|
|
|
++ br x10
|
|
|
|
++8: // NOTE(review): no branch to labels 8/7 is visible here; presumably reached from inside loop_filter_16_16b - confirm
|
|
|
|
++ sub x9, x0, #4 // x9 = rows 0-7, 4 bytes before x0
|
|
|
|
++ add x0, x9, x1, lsl #3 // x0 = rows 8-15 at the same column
|
|
|
|
++ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++
|
|
|
|
++ st1 {v20.8b}, [x9], x1
|
|
|
|
++ st1 {v20.d}[1], [x0], x1
|
|
|
|
++ st1 {v21.8b}, [x9], x1
|
|
|
|
++ st1 {v21.d}[1], [x0], x1
|
|
|
|
++ st1 {v22.8b}, [x9], x1
|
|
|
|
++ st1 {v22.d}[1], [x0], x1
|
|
|
|
++ st1 {v23.8b}, [x9], x1
|
|
|
|
++ st1 {v23.d}[1], [x0], x1
|
|
|
|
++ st1 {v24.8b}, [x9], x1
|
|
|
|
++ st1 {v24.d}[1], [x0], x1
|
|
|
|
++ st1 {v25.8b}, [x9], x1
|
|
|
|
++ st1 {v25.d}[1], [x0], x1
|
|
|
|
++ st1 {v26.8b}, [x9], x1
|
|
|
|
++ st1 {v26.d}[1], [x0], x1
|
|
|
|
++ st1 {v27.8b}, [x9], x1
|
|
|
|
++ st1 {v27.d}[1], [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++7:
|
|
|
|
++ sub x9, x0, #2 // x9 = rows 0-7, 2 bytes before x0
|
|
|
|
++ add x0, x9, x1, lsl #3 // x0 = rows 8-15 at the same column
|
|
|
|
++ transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
|
|
|
|
++ st1 {v22.s}[0], [x9], x1
|
|
|
|
++ st1 {v22.s}[2], [x0], x1
|
|
|
|
++ st1 {v23.s}[0], [x9], x1
|
|
|
|
++ st1 {v23.s}[2], [x0], x1
|
|
|
|
++ st1 {v24.s}[0], [x9], x1
|
|
|
|
++ st1 {v24.s}[2], [x0], x1
|
|
|
|
++ st1 {v25.s}[0], [x9], x1
|
|
|
|
++ st1 {v25.s}[2], [x0], x1
|
|
|
|
++ st1 {v22.s}[1], [x9], x1
|
|
|
|
++ st1 {v22.s}[3], [x0], x1
|
|
|
|
++ st1 {v23.s}[1], [x9], x1
|
|
|
|
++ st1 {v23.s}[3], [x0], x1
|
|
|
|
++ st1 {v24.s}[1], [x9], x1
|
|
|
|
++ st1 {v24.s}[3], [x0], x1
|
|
|
|
++ st1 {v25.s}[1], [x9], x1
|
|
|
|
++ st1 {v25.s}[3], [x0], x1
|
|
|
|
++ b 9b
|
|
|
|
++endfunc
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
|
|
|
|
+@@ -0,0 +1,631 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2017 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++// All public functions in this file have the following signature:
|
|
|
|
++// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
|
|
|
|
++// const uint8_t *ref, ptrdiff_t ref_stride,
|
|
|
|
++// int h, int mx, int my);
|
|
|
|
++
|
|
|
|
++function ff_vp9_copy128_aarch64, export=1 // copies w4 rows of 128 bytes from x2 (row step x3) to x0 (row step x1) through general-purpose register pairs
|
|
|
|
++1: // one 128-byte row per iteration
|
|
|
|
++ ldp x5, x6, [x2]
|
|
|
|
++ ldp x7, x8, [x2, #16]
|
|
|
|
++ stp x5, x6, [x0]
|
|
|
|
++ ldp x9, x10, [x2, #32]
|
|
|
|
++ stp x7, x8, [x0, #16]
|
|
|
|
++ subs w4, w4, #1 // row counter; the flags are consumed by the b.ne at the end of the loop
|
|
|
|
++ ldp x11, x12, [x2, #48]
|
|
|
|
++ stp x9, x10, [x0, #32]
|
|
|
|
++ stp x11, x12, [x0, #48]
|
|
|
|
++ ldp x5, x6, [x2, #64] // second 64 bytes of the row reuse x5-x12
|
|
|
|
++ ldp x7, x8, [x2, #80]
|
|
|
|
++ stp x5, x6, [x0, #64]
|
|
|
|
++ ldp x9, x10, [x2, #96]
|
|
|
|
++ stp x7, x8, [x0, #80]
|
|
|
|
++ ldp x11, x12, [x2, #112]
|
|
|
|
++ stp x9, x10, [x0, #96]
|
|
|
|
++ stp x11, x12, [x0, #112]
|
|
|
|
++ add x2, x2, x3 // advance source and destination to the next row
|
|
|
|
++ add x0, x0, x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg64_16_neon, export=1 // rounding average of 128-byte rows (64 halfwords) from x2 into the rows at x0, w4 rows
|
|
|
|
++ mov x5, x0 // x5 = store pointer; x0 is used to read the destination rows
|
|
|
|
++ sub x1, x1, #64 // each row is accessed as 64 + 64 bytes; the first access
|
|
|
|
++ sub x3, x3, #64 // post-increments by 64, so the row steps are reduced by 64
|
|
|
|
++1: // one row per iteration
|
|
|
|
++ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
|
|
|
|
++ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
|
|
|
|
++ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
|
|
|
|
++ urhadd v0.8h, v0.8h, v4.8h // unsigned rounding halving add of dst and src halfwords
|
|
|
|
++ urhadd v1.8h, v1.8h, v5.8h
|
|
|
|
++ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
|
|
|
|
++ urhadd v2.8h, v2.8h, v6.8h
|
|
|
|
++ urhadd v3.8h, v3.8h, v7.8h
|
|
|
|
++ subs w4, w4, #1 // row counter; the flags are consumed by the b.ne below
|
|
|
|
++ urhadd v16.8h, v16.8h, v20.8h
|
|
|
|
++ urhadd v17.8h, v17.8h, v21.8h
|
|
|
|
++ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
|
|
|
|
++ urhadd v18.8h, v18.8h, v22.8h
|
|
|
|
++ urhadd v19.8h, v19.8h, v23.8h
|
|
|
|
++ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg32_16_neon, export=1 // rounding average of 64-byte rows (32 halfwords) from x2 into the rows at x0, w4 rows
|
|
|
|
++ mov x5, x0 // x5 = store pointer; x0 is used to read the destination rows
|
|
|
|
++1: // two rows per iteration
|
|
|
|
++ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
|
|
|
|
++ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
|
|
|
|
++ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
|
|
|
|
++ urhadd v0.8h, v0.8h, v4.8h // unsigned rounding halving add of dst and src halfwords
|
|
|
|
++ urhadd v1.8h, v1.8h, v5.8h
|
|
|
|
++ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
|
|
|
|
++ urhadd v2.8h, v2.8h, v6.8h
|
|
|
|
++ urhadd v3.8h, v3.8h, v7.8h
|
|
|
|
++ subs w4, w4, #2 // two rows done; the flags are consumed by the b.ne below
|
|
|
|
++ urhadd v16.8h, v16.8h, v20.8h
|
|
|
|
++ urhadd v17.8h, v17.8h, v21.8h
|
|
|
|
++ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
|
|
|
|
++ urhadd v18.8h, v18.8h, v22.8h
|
|
|
|
++ urhadd v19.8h, v19.8h, v23.8h
|
|
|
|
++ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg16_16_neon, export=1 // rounding average of 32-byte rows (16 halfwords) from x2 into the rows at x0, w4 rows
|
|
|
|
++1: // one row per iteration
|
|
|
|
++ ld1 {v2.8h, v3.8h}, [x2], x3
|
|
|
|
++ ld1 {v0.8h, v1.8h}, [x0] // no post-increment: the same x0 is used for the store below
|
|
|
|
++ urhadd v0.8h, v0.8h, v2.8h // unsigned rounding halving add of dst and src halfwords
|
|
|
|
++ urhadd v1.8h, v1.8h, v3.8h
|
|
|
|
++ subs w4, w4, #1 // row counter; the flags are consumed by the b.ne below
|
|
|
|
++ st1 {v0.8h, v1.8h}, [x0], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg8_16_neon, export=1 // rounding average of 16-byte rows (8 halfwords) from x2 into the rows at x0, w4 rows
|
|
|
|
++ mov x5, x0 // x5 = store pointer; x0 is used to read the destination rows
|
|
|
|
++1: // two rows per iteration
|
|
|
|
++ ld1 {v2.8h}, [x2], x3
|
|
|
|
++ ld1 {v0.8h}, [x0], x1
|
|
|
|
++ ld1 {v3.8h}, [x2], x3
|
|
|
|
++ urhadd v0.8h, v0.8h, v2.8h // unsigned rounding halving add of dst and src halfwords
|
|
|
|
++ ld1 {v1.8h}, [x0], x1
|
|
|
|
++ urhadd v1.8h, v1.8h, v3.8h
|
|
|
|
++ subs w4, w4, #2 // two rows done; the flags are consumed by the b.ne below
|
|
|
|
++ st1 {v0.8h}, [x5], x1
|
|
|
|
++ st1 {v1.8h}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg4_16_neon, export=1 // rounding average of 8-byte rows (4 halfwords) from x2 into the rows at x0, w4 rows
|
|
|
|
++ mov x5, x0 // x5 = store pointer; x0 is used to read the destination rows
|
|
|
|
++1: // two rows per iteration
|
|
|
|
++ ld1 {v2.4h}, [x2], x3
|
|
|
|
++ ld1 {v0.4h}, [x0], x1
|
|
|
|
++ ld1 {v3.4h}, [x2], x3
|
|
|
|
++ urhadd v0.4h, v0.4h, v2.4h // unsigned rounding halving add of dst and src halfwords
|
|
|
|
++ ld1 {v1.4h}, [x0], x1
|
|
|
|
++ urhadd v1.4h, v1.4h, v3.4h
|
|
|
|
++ subs w4, w4, #2 // two rows done; the flags are consumed by the b.ne below
|
|
|
|
++ st1 {v0.4h}, [x5], x1
|
|
|
|
++ st1 {v1.4h}, [x5], x1 // .4h so the store uses the same element size as the loads and the first store
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
|
|
|
|
++// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
|
|
|
|
++// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
|
|
|
|
++// for size >= 16)
|
|
|
|
++.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size // uses v20-v23 as scratch and v0 as the coefficient vector
|
|
|
|
++ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) // v20 = src1:src2 shifted by "offset" halfwords (first row)
|
|
|
|
++ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) // v22 = same window for the second row
|
|
|
|
++ smlal \dst1\().4s, v20.4h, v0.h[\offset] // accumulate the low 4 halfwords times coefficient number "offset"
|
|
|
|
++ smlal \dst5\().4s, v22.4h, v0.h[\offset]
|
|
|
|
++.if \size >= 16
|
|
|
|
++ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) // next 8 halfwords of each row
|
|
|
|
++ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
|
|
|
|
++.endif
|
|
|
|
++.if \size >= 8
|
|
|
|
++ smlal2 \dst2\().4s, v20.8h, v0.h[\offset] // high 4 halfwords of v20/v22
|
|
|
|
++ smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
|
|
|
|
++.endif
|
|
|
|
++.if \size >= 16
|
|
|
|
++ smlal \dst3\().4s, v21.4h, v0.h[\offset]
|
|
|
|
++ smlal \dst7\().4s, v23.4h, v0.h[\offset]
|
|
|
|
++ smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
|
|
|
|
++ smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Instantiate a horizontal filter function for the given size.
|
|
|
|
++// This can work on 4, 8 or 16 pixels in parallel; for larger
|
|
|
|
++// widths it will do 16 pixels at a time and loop horizontally.
|
|
|
|
++// The actual width (in bytes) is passed in x5, the height in w4 and
|
|
|
|
++// the filter coefficients in x9.
|
|
|
|
++.macro do_8tap_h type, size // emits <type>_8tap_<size>h; type is put or avg, two rows are filtered per vertical iteration
|
|
|
|
++function \type\()_8tap_\size\()h
|
|
|
|
++ sub x2, x2, #6 // start reading 6 bytes (3 halfwords) before the source position
|
|
|
|
++ add x6, x0, x1 // x6 = second destination row
|
|
|
|
++ add x7, x2, x3 // x7 = second source row
|
|
|
|
++ add x1, x1, x1 // double both row steps since two rows are handled per iteration
|
|
|
|
++ add x3, x3, x3
|
|
|
|
++ // Only size >= 16 loops horizontally and needs
|
|
|
|
++ // reduced dst stride
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sub x1, x1, x5
|
|
|
|
++.endif
|
|
|
|
++ // size >= 16 loads three qwords up front (16 bytes more than it
|
|
|
|
++ // outputs) and postincrements x2; for size 4/8 two qwords and no
|
|
|
|
++ // postincrement are enough
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sub x3, x3, x5
|
|
|
|
++ sub x3, x3, #16
|
|
|
|
++.endif
|
|
|
|
++ // Load the filter vector
|
|
|
|
++ ld1 {v0.8h}, [x9]
|
|
|
|
++1:
|
|
|
|
++.if \size >= 16
|
|
|
|
++ mov x9, x5 // x9 = bytes of output still to produce in this row pair
|
|
|
|
++.endif
|
|
|
|
++ // Load src
|
|
|
|
++.if \size >= 16
|
|
|
|
++ ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
|
|
|
|
++ ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
|
|
|
|
++.else
|
|
|
|
++ ld1 {v5.8h, v6.8h}, [x2]
|
|
|
|
++ ld1 {v16.8h, v17.8h}, [x7]
|
|
|
|
++.endif
|
|
|
|
++2:
|
|
|
|
++
|
|
|
|
++ smull v1.4s, v5.4h, v0.h[0] // coefficient 0 starts the accumulators; extmlal adds coefficients 1-7
|
|
|
|
++ smull v24.4s, v16.4h, v0.h[0]
|
|
|
|
++.if \size >= 8
|
|
|
|
++ smull2 v2.4s, v5.8h, v0.h[0]
|
|
|
|
++ smull2 v25.4s, v16.8h, v0.h[0]
|
|
|
|
++.endif
|
|
|
|
++.if \size >= 16
|
|
|
|
++ smull v3.4s, v6.4h, v0.h[0]
|
|
|
|
++ smull v26.4s, v17.4h, v0.h[0]
|
|
|
|
++ smull2 v4.4s, v6.8h, v0.h[0]
|
|
|
|
++ smull2 v27.4s, v17.8h, v0.h[0]
|
|
|
|
++.endif
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
|
|
|
|
++ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size
|
|
|
|
++
|
|
|
|
++ // Round, shift and saturate
|
|
|
|
++ // The sqrshrun takes care of clamping negative values to zero, but
|
|
|
|
++ // we manually need to do umin with the max pixel value.
|
|
|
|
++ sqrshrun v1.4h, v1.4s, #7
|
|
|
|
++ sqrshrun v24.4h, v24.4s, #7
|
|
|
|
++.if \size >= 8
|
|
|
|
++ sqrshrun2 v1.8h, v2.4s, #7
|
|
|
|
++ sqrshrun2 v24.8h, v25.4s, #7
|
|
|
|
++ umin v1.8h, v1.8h, v31.8h // v31 is not set in this macro; the exported wrappers load it before branching here
|
|
|
|
++ umin v24.8h, v24.8h, v31.8h
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sqrshrun v2.4h, v3.4s, #7
|
|
|
|
++ sqrshrun v25.4h, v26.4s, #7
|
|
|
|
++ sqrshrun2 v2.8h, v4.4s, #7
|
|
|
|
++ sqrshrun2 v25.8h, v27.4s, #7
|
|
|
|
++ umin v2.8h, v2.8h, v31.8h
|
|
|
|
++ umin v25.8h, v25.8h, v31.8h
|
|
|
|
++.endif
|
|
|
|
++.else
|
|
|
|
++ umin v1.4h, v1.4h, v31.4h
|
|
|
|
++ umin v24.4h, v24.4h, v31.4h
|
|
|
|
++.endif
|
|
|
|
++ // Average
|
|
|
|
++.ifc \type,avg
|
|
|
|
++.if \size >= 16
|
|
|
|
++ ld1 {v3.8h, v4.8h}, [x0]
|
|
|
|
++ ld1 {v29.8h, v30.8h}, [x6]
|
|
|
|
++ urhadd v1.8h, v1.8h, v3.8h
|
|
|
|
++ urhadd v2.8h, v2.8h, v4.8h
|
|
|
|
++ urhadd v24.8h, v24.8h, v29.8h
|
|
|
|
++ urhadd v25.8h, v25.8h, v30.8h
|
|
|
|
++.elseif \size >= 8
|
|
|
|
++ ld1 {v3.8h}, [x0]
|
|
|
|
++ ld1 {v4.8h}, [x6]
|
|
|
|
++ urhadd v1.8h, v1.8h, v3.8h
|
|
|
|
++ urhadd v24.8h, v24.8h, v4.8h
|
|
|
|
++.else
|
|
|
|
++ ld1 {v3.4h}, [x0]
|
|
|
|
++ ld1 {v4.4h}, [x6]
|
|
|
|
++ urhadd v1.4h, v1.4h, v3.4h
|
|
|
|
++ urhadd v24.4h, v24.4h, v4.4h
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ // Store and loop horizontally (for size >= 16)
|
|
|
|
++.if \size >= 16
|
|
|
|
++ subs x9, x9, #32 // 32 bytes of output written per horizontal step
|
|
|
|
++ st1 {v1.8h, v2.8h}, [x0], #32
|
|
|
|
++ st1 {v24.8h, v25.8h}, [x6], #32
|
|
|
|
++ b.eq 3f
|
|
|
|
++ mov v5.16b, v7.16b // the last loaded qword becomes the first one of the next step
|
|
|
|
++ mov v16.16b, v18.16b
|
|
|
|
++ ld1 {v6.8h, v7.8h}, [x2], #32
|
|
|
|
++ ld1 {v17.8h, v18.8h}, [x7], #32
|
|
|
|
++ b 2b
|
|
|
|
++.elseif \size == 8
|
|
|
|
++ st1 {v1.8h}, [x0]
|
|
|
|
++ st1 {v24.8h}, [x6]
|
|
|
|
++.else // \size == 4
|
|
|
|
++ st1 {v1.4h}, [x0]
|
|
|
|
++ st1 {v24.4h}, [x6]
|
|
|
|
++.endif
|
|
|
|
++3:
|
|
|
|
++ // Loop vertically
|
|
|
|
++ add x0, x0, x1
|
|
|
|
++ add x6, x6, x1
|
|
|
|
++ add x2, x2, x3
|
|
|
|
++ add x7, x7, x3
|
|
|
|
++ subs w4, w4, #2 // two rows done per iteration
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_size size // instantiates both the put and the avg variant of do_8tap_h for one size
|
|
|
|
++do_8tap_h put, \size
|
|
|
|
++do_8tap_h avg, \size
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_h_size 4 // put_8tap_4h and avg_8tap_4h
|
|
|
|
++do_8tap_h_size 8 // put_8tap_8h and avg_8tap_8h
|
|
|
|
++do_8tap_h_size 16 // put_8tap_16h and avg_8tap_16h
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_func type, filter, offset, size, bpp // emits the exported ff_vp9_<type>_<filter><size>_h_<bpp>_neon wrapper
|
|
|
|
++function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
|
|
|
|
++ mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 // v31 = (1 << bpp) - 1 in every halfword, e.g. 0x03ff for bpp 10 and 0x0fff for bpp 12
|
|
|
|
++ movrel x6, X(ff_vp9_subpel_filters), 256*\offset // x6 = ff_vp9_subpel_filters + 256 * offset bytes
|
|
|
|
++ cmp w5, #8 // NOTE(review): no visible code reads these flags before they are overwritten; looks unused - confirm
|
|
|
|
++ add x9, x6, w5, uxtw #4 // x9 = x6 + 16 * w5: the 8 halfword coefficients loaded by the 8tap body
|
|
|
|
++ mov x5, #2*\size // x5 = row width in bytes
|
|
|
|
++.if \size >= 16
|
|
|
|
++ b \type\()_8tap_16h // sizes 16 and above share the 16h body, which loops horizontally over x5 bytes
|
|
|
|
++.else
|
|
|
|
++ b \type\()_8tap_\size\()h
|
|
|
|
++.endif
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_filters size, bpp // instantiates put and avg wrappers for the regular, sharp and smooth filters at one size and bit depth
|
|
|
|
++do_8tap_h_func put, regular, 1, \size, \bpp
|
|
|
|
++do_8tap_h_func avg, regular, 1, \size, \bpp
|
|
|
|
++do_8tap_h_func put, sharp, 2, \size, \bpp
|
|
|
|
++do_8tap_h_func avg, sharp, 2, \size, \bpp
|
|
|
|
++do_8tap_h_func put, smooth, 0, \size, \bpp
|
|
|
|
++do_8tap_h_func avg, smooth, 0, \size, \bpp
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_filters_bpp bpp // instantiates do_8tap_h_filters for sizes 64, 32, 16, 8 and 4 at one bit depth
|
|
|
|
++do_8tap_h_filters 64, \bpp
|
|
|
|
++do_8tap_h_filters 32, \bpp
|
|
|
|
++do_8tap_h_filters 16, \bpp
|
|
|
|
++do_8tap_h_filters 8, \bpp
|
|
|
|
++do_8tap_h_filters 4, \bpp
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_h_filters_bpp 10 // all horizontal wrappers with the _h_10_neon suffix
|
|
|
|
++do_8tap_h_filters_bpp 12 // all horizontal wrappers with the _h_12_neon suffix
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Vertical filters
|
|
|
|
++
|
|
|
|
++// Round, shift and saturate and store reg1-reg4
|
|
|
|
++.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type // reg1-reg4 hold one 4-lane row each; tmp1-tmp4 are only used when type is avg
|
|
|
|
++ sqrshrun \reg1\().4h, \reg1\().4s, #7 // narrow 32-bit sums to unsigned 16 bit with a rounding shift by 7
|
|
|
|
++ sqrshrun \reg2\().4h, \reg2\().4s, #7
|
|
|
|
++ sqrshrun \reg3\().4h, \reg3\().4s, #7
|
|
|
|
++ sqrshrun \reg4\().4h, \reg4\().4s, #7
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ ld1 {\tmp1\().4h}, [x7], x1 // rows to average with are read through x7; presumably set up by the caller - confirm
|
|
|
|
++ ld1 {\tmp2\().4h}, [x7], x1
|
|
|
|
++ ld1 {\tmp3\().4h}, [x7], x1
|
|
|
|
++ ld1 {\tmp4\().4h}, [x7], x1
|
|
|
|
++.endif
|
|
|
|
++ umin \reg1\().4h, \reg1\().4h, \minreg\().4h // clamp to the upper bound held in minreg
|
|
|
|
++ umin \reg2\().4h, \reg2\().4h, \minreg\().4h
|
|
|
|
++ umin \reg3\().4h, \reg3\().4h, \minreg\().4h
|
|
|
|
++ umin \reg4\().4h, \reg4\().4h, \minreg\().4h
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h // unsigned rounding halving add with the rows read above
|
|
|
|
++ urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
|
|
|
|
++ urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
|
|
|
|
++ urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
|
|
|
|
++.endif
|
|
|
|
++ st1 {\reg1\().4h}, [x0], x1 // four rows written through x0 with row step x1
|
|
|
|
++ st1 {\reg2\().4h}, [x0], x1
|
|
|
|
++ st1 {\reg3\().4h}, [x0], x1
|
|
|
|
++ st1 {\reg4\().4h}, [x0], x1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Round, shift and saturate and store reg1-8, where
|
|
|
|
++// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
|
|
|
|
++.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type // packs the 8 accumulators into reg1-reg4 (one 8-lane row each) before storing
|
|
|
|
++ sqrshrun \reg1\().4h, \reg1\().4s, #7 // row 1: reg1/reg2 narrowed into the low/high half of reg1
|
|
|
|
++ sqrshrun2 \reg1\().8h, \reg2\().4s, #7
|
|
|
|
++ sqrshrun \reg2\().4h, \reg3\().4s, #7 // row 2: reg3/reg4 narrowed into reg2
|
|
|
|
++ sqrshrun2 \reg2\().8h, \reg4\().4s, #7
|
|
|
|
++ sqrshrun \reg3\().4h, \reg5\().4s, #7 // row 3: reg5/reg6 narrowed into reg3
|
|
|
|
++ sqrshrun2 \reg3\().8h, \reg6\().4s, #7
|
|
|
|
++ sqrshrun \reg4\().4h, \reg7\().4s, #7 // row 4: reg7/reg8 narrowed into reg4
|
|
|
|
++ sqrshrun2 \reg4\().8h, \reg8\().4s, #7
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ ld1 {\reg5\().8h}, [x7], x1 // reg5-reg8 are free after narrowing and are reused for the rows read through x7
|
|
|
|
++ ld1 {\reg6\().8h}, [x7], x1
|
|
|
|
++ ld1 {\reg7\().8h}, [x7], x1
|
|
|
|
++ ld1 {\reg8\().8h}, [x7], x1
|
|
|
|
++.endif
|
|
|
|
++ umin \reg1\().8h, \reg1\().8h, \minreg\().8h // clamp to the upper bound held in minreg
|
|
|
|
++ umin \reg2\().8h, \reg2\().8h, \minreg\().8h
|
|
|
|
++ umin \reg3\().8h, \reg3\().8h, \minreg\().8h
|
|
|
|
++ umin \reg4\().8h, \reg4\().8h, \minreg\().8h
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h // unsigned rounding halving add with the rows read above
|
|
|
|
++ urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
|
|
|
|
++ urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
|
|
|
|
++ urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
|
|
|
|
++.endif
|
|
|
|
++ st1 {\reg1\().8h}, [x0], x1 // four rows written through x0 with row step x1
|
|
|
|
++ st1 {\reg2\().8h}, [x0], x1
|
|
|
|
++ st1 {\reg3\().8h}, [x0], x1
|
|
|
|
++ st1 {\reg4\().8h}, [x0], x1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
|
|
|
|
++// (src1-src8 into dst1, src2-src9 into dst2).
|
|
|
|
++.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2 // coefficients come from v0.h[0..7]; tmp1/tmp2 are clobbered
|
|
|
|
++ smull \dst1\().4s, \src1\().4h, v0.h[0] // even-numbered coefficients accumulate in dst1/dst2
|
|
|
|
++ smull \dst2\().4s, \src2\().4h, v0.h[0]
|
|
|
|
++ smull \tmp1\().4s, \src2\().4h, v0.h[1] // odd-numbered coefficients accumulate in tmp1/tmp2
|
|
|
|
++ smull \tmp2\().4s, \src3\().4h, v0.h[1]
|
|
|
|
++ smlal \dst1\().4s, \src3\().4h, v0.h[2]
|
|
|
|
++ smlal \dst2\().4s, \src4\().4h, v0.h[2]
|
|
|
|
++ smlal \tmp1\().4s, \src4\().4h, v0.h[3]
|
|
|
|
++ smlal \tmp2\().4s, \src5\().4h, v0.h[3]
|
|
|
|
++ smlal \dst1\().4s, \src5\().4h, v0.h[4]
|
|
|
|
++ smlal \dst2\().4s, \src6\().4h, v0.h[4]
|
|
|
|
++ smlal \tmp1\().4s, \src6\().4h, v0.h[5]
|
|
|
|
++ smlal \tmp2\().4s, \src7\().4h, v0.h[5]
|
|
|
|
++ smlal \dst1\().4s, \src7\().4h, v0.h[6]
|
|
|
|
++ smlal \dst2\().4s, \src8\().4h, v0.h[6]
|
|
|
|
++ smlal \tmp1\().4s, \src8\().4h, v0.h[7]
|
|
|
|
++ smlal \tmp2\().4s, \src9\().4h, v0.h[7]
|
|
|
|
++ add \dst1\().4s, \dst1\().4s, \tmp1\().4s // combine the even and odd partial sums
|
|
|
|
++ add \dst2\().4s, \dst2\().4s, \tmp2\().4s
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
|
|
|
|
++// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
|
|
|
|
++.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
|
|
|
|
++ smull \dst1\().4s, \src1\().4h, v0.h[0]
|
|
|
|
++ smull2 \dst2\().4s, \src1\().8h, v0.h[0]
|
|
|
|
++ smull \dst3\().4s, \src2\().4h, v0.h[0]
|
|
|
|
++ smull2 \dst4\().4s, \src2\().8h, v0.h[0]
|
|
|
|
++ smlal \dst1\().4s, \src2\().4h, v0.h[1]
|
|
|
|
++ smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
|
|
|
|
++ smlal \dst3\().4s, \src3\().4h, v0.h[1]
|
|
|
|
++ smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
|
|
|
|
++ smlal \dst1\().4s, \src3\().4h, v0.h[2]
|
|
|
|
++ smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
|
|
|
|
++ smlal \dst3\().4s, \src4\().4h, v0.h[2]
|
|
|
|
++ smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
|
|
|
|
++ smlal \dst1\().4s, \src4\().4h, v0.h[3]
|
|
|
|
++ smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
|
|
|
|
++ smlal \dst3\().4s, \src5\().4h, v0.h[3]
|
|
|
|
++ smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
|
|
|
|
++ smlal \dst1\().4s, \src5\().4h, v0.h[4]
|
|
|
|
++ smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
|
|
|
|
++ smlal \dst3\().4s, \src6\().4h, v0.h[4]
|
|
|
|
++ smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
|
|
|
|
++ smlal \dst1\().4s, \src6\().4h, v0.h[5]
|
|
|
|
++ smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
|
|
|
|
++ smlal \dst3\().4s, \src7\().4h, v0.h[5]
|
|
|
|
++ smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
|
|
|
|
++ smlal \dst1\().4s, \src7\().4h, v0.h[6]
|
|
|
|
++ smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
|
|
|
|
++ smlal \dst3\().4s, \src8\().4h, v0.h[6]
|
|
|
|
++ smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
|
|
|
|
++ smlal \dst1\().4s, \src8\().4h, v0.h[7]
|
|
|
|
++ smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
|
|
|
|
++ smlal \dst3\().4s, \src9\().4h, v0.h[7]
|
|
|
|
++ smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Instantiate a vertical filter function for filtering 8 pixels at a time.
|
|
|
|
++// The height is passed in x4, the width in x5 and the filter coefficients
|
|
|
|
++// in x6.
|
|
|
|
++.macro do_8tap_8v type
|
|
|
|
++function \type\()_8tap_8v
|
|
|
|
++ sub x2, x2, x3, lsl #1
|
|
|
|
++ sub x2, x2, x3
|
|
|
|
++ ld1 {v0.8h}, [x6]
|
|
|
|
++1:
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ mov x7, x0
|
|
|
|
++.endif
|
|
|
|
++ mov x6, x4
|
|
|
|
++
|
|
|
|
++ ld1 {v17.8h}, [x2], x3
|
|
|
|
++ ld1 {v18.8h}, [x2], x3
|
|
|
|
++ ld1 {v19.8h}, [x2], x3
|
|
|
|
++ ld1 {v20.8h}, [x2], x3
|
|
|
|
++ ld1 {v21.8h}, [x2], x3
|
|
|
|
++ ld1 {v22.8h}, [x2], x3
|
|
|
|
++ ld1 {v23.8h}, [x2], x3
|
|
|
|
++2:
|
|
|
|
++ ld1 {v24.8h}, [x2], x3
|
|
|
|
++ ld1 {v25.8h}, [x2], x3
|
|
|
|
++ ld1 {v26.8h}, [x2], x3
|
|
|
|
++ ld1 {v27.8h}, [x2], x3
|
|
|
|
++
|
|
|
|
++ convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
|
|
|
|
++ convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
|
|
|
|
++ do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #4
|
|
|
|
++ b.eq 8f
|
|
|
|
++
|
|
|
|
++ ld1 {v16.8h}, [x2], x3
|
|
|
|
++ ld1 {v17.8h}, [x2], x3
|
|
|
|
++ ld1 {v18.8h}, [x2], x3
|
|
|
|
++ ld1 {v19.8h}, [x2], x3
|
|
|
|
++ convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
|
|
|
|
++ convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
|
|
|
|
++ do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #4
|
|
|
|
++ b.eq 8f
|
|
|
|
++
|
|
|
|
++ ld1 {v20.8h}, [x2], x3
|
|
|
|
++ ld1 {v21.8h}, [x2], x3
|
|
|
|
++ ld1 {v22.8h}, [x2], x3
|
|
|
|
++ ld1 {v23.8h}, [x2], x3
|
|
|
|
++ convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
|
|
|
|
++ convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
|
|
|
|
++ do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #4
|
|
|
|
++ b.ne 2b
|
|
|
|
++
|
|
|
|
++8:
|
|
|
|
++ subs x5, x5, #8
|
|
|
|
++ b.eq 9f
|
|
|
|
++ // x0 -= h * dst_stride
|
|
|
|
++ msub x0, x1, x4, x0
|
|
|
|
++ // x2 -= h * src_stride
|
|
|
|
++ msub x2, x3, x4, x2
|
|
|
|
++ // x2 -= 8 * src_stride
|
|
|
|
++ sub x2, x2, x3, lsl #3
|
|
|
|
++ // x2 += 1 * src_stride
|
|
|
|
++ add x2, x2, x3
|
|
|
|
++ add x2, x2, #16
|
|
|
|
++ add x0, x0, #16
|
|
|
|
++ b 1b
|
|
|
|
++9:
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_8v put
|
|
|
|
++do_8tap_8v avg
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Instantiate a vertical filter function for filtering a 4 pixels wide
|
|
|
|
++// slice. This only is designed to work for 4 or 8 output lines.
|
|
|
|
++.macro do_8tap_4v type
|
|
|
|
++function \type\()_8tap_4v
|
|
|
|
++ sub x2, x2, x3, lsl #1
|
|
|
|
++ sub x2, x2, x3
|
|
|
|
++ ld1 {v0.8h}, [x6]
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ mov x7, x0
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ ld1 {v16.4h}, [x2], x3
|
|
|
|
++ ld1 {v17.4h}, [x2], x3
|
|
|
|
++ ld1 {v18.4h}, [x2], x3
|
|
|
|
++ ld1 {v19.4h}, [x2], x3
|
|
|
|
++ ld1 {v20.4h}, [x2], x3
|
|
|
|
++ ld1 {v21.4h}, [x2], x3
|
|
|
|
++ ld1 {v22.4h}, [x2], x3
|
|
|
|
++ ld1 {v23.4h}, [x2], x3
|
|
|
|
++ ld1 {v24.4h}, [x2], x3
|
|
|
|
++ ld1 {v25.4h}, [x2], x3
|
|
|
|
++ ld1 {v26.4h}, [x2], x3
|
|
|
|
++
|
|
|
|
++ convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
|
|
|
|
++ convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
|
|
|
|
++ do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type
|
|
|
|
++
|
|
|
|
++ subs x4, x4, #4
|
|
|
|
++ b.eq 9f
|
|
|
|
++
|
|
|
|
++ ld1 {v27.4h}, [x2], x3
|
|
|
|
++ ld1 {v28.4h}, [x2], x3
|
|
|
|
++ ld1 {v29.4h}, [x2], x3
|
|
|
|
++ ld1 {v30.4h}, [x2], x3
|
|
|
|
++
|
|
|
|
++ convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
|
|
|
|
++ convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
|
|
|
|
++ do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type
|
|
|
|
++
|
|
|
|
++9:
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_4v put
|
|
|
|
++do_8tap_4v avg
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro do_8tap_v_func type, filter, offset, size, bpp
|
|
|
|
++function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
|
|
|
|
++ uxtw x4, w4
|
|
|
|
++ mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
|
|
|
|
++ movrel x5, X(ff_vp9_subpel_filters), 256*\offset
|
|
|
|
++ add x6, x5, w6, uxtw #4
|
|
|
|
++ mov x5, #\size
|
|
|
|
++.if \size >= 8
|
|
|
|
++ b \type\()_8tap_8v
|
|
|
|
++.else
|
|
|
|
++ b \type\()_8tap_4v
|
|
|
|
++.endif
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_v_filters size, bpp
|
|
|
|
++do_8tap_v_func put, regular, 1, \size, \bpp
|
|
|
|
++do_8tap_v_func avg, regular, 1, \size, \bpp
|
|
|
|
++do_8tap_v_func put, sharp, 2, \size, \bpp
|
|
|
|
++do_8tap_v_func avg, sharp, 2, \size, \bpp
|
|
|
|
++do_8tap_v_func put, smooth, 0, \size, \bpp
|
|
|
|
++do_8tap_v_func avg, smooth, 0, \size, \bpp
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_v_filters_bpp bpp
|
|
|
|
++do_8tap_v_filters 64, \bpp
|
|
|
|
++do_8tap_v_filters 32, \bpp
|
|
|
|
++do_8tap_v_filters 16, \bpp
|
|
|
|
++do_8tap_v_filters 8, \bpp
|
|
|
|
++do_8tap_v_filters 4, \bpp
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_v_filters_bpp 10
|
|
|
|
++do_8tap_v_filters_bpp 12
|
|
|
|
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
|
|
|
|
+@@ -0,0 +1,687 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2016 Google Inc.
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/aarch64/asm.S"
|
|
|
|
++
|
|
|
|
++// All public functions in this file have the following signature:
|
|
|
|
++// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
|
|
|
|
++// const uint8_t *ref, ptrdiff_t ref_stride,
|
|
|
|
++// int h, int mx, int my);
|
|
|
|
++
|
|
|
|
++function ff_vp9_copy64_aarch64, export=1
|
|
|
|
++1:
|
|
|
|
++ ldp x5, x6, [x2]
|
|
|
|
++ ldp x7, x8, [x2, #16]
|
|
|
|
++ stp x5, x6, [x0]
|
|
|
|
++ ldp x9, x10, [x2, #32]
|
|
|
|
++ stp x7, x8, [x0, #16]
|
|
|
|
++ subs w4, w4, #1
|
|
|
|
++ ldp x11, x12, [x2, #48]
|
|
|
|
++ stp x9, x10, [x0, #32]
|
|
|
|
++ stp x11, x12, [x0, #48]
|
|
|
|
++ add x2, x2, x3
|
|
|
|
++ add x0, x0, x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg64_neon, export=1
|
|
|
|
++ mov x5, x0
|
|
|
|
++1:
|
|
|
|
++ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
|
|
|
|
++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
|
|
|
|
++ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
|
|
|
|
++ urhadd v0.16b, v0.16b, v4.16b
|
|
|
|
++ urhadd v1.16b, v1.16b, v5.16b
|
|
|
|
++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
|
|
|
|
++ urhadd v2.16b, v2.16b, v6.16b
|
|
|
|
++ urhadd v3.16b, v3.16b, v7.16b
|
|
|
|
++ subs w4, w4, #2
|
|
|
|
++ urhadd v16.16b, v16.16b, v20.16b
|
|
|
|
++ urhadd v17.16b, v17.16b, v21.16b
|
|
|
|
++ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
|
|
|
|
++ urhadd v18.16b, v18.16b, v22.16b
|
|
|
|
++ urhadd v19.16b, v19.16b, v23.16b
|
|
|
|
++ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_copy32_aarch64, export=1
|
|
|
|
++1:
|
|
|
|
++ ldp x5, x6, [x2]
|
|
|
|
++ ldp x7, x8, [x2, #16]
|
|
|
|
++ stp x5, x6, [x0]
|
|
|
|
++ subs w4, w4, #1
|
|
|
|
++ stp x7, x8, [x0, #16]
|
|
|
|
++ add x2, x2, x3
|
|
|
|
++ add x0, x0, x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg32_neon, export=1
|
|
|
|
++1:
|
|
|
|
++ ld1 {v2.16b, v3.16b}, [x2], x3
|
|
|
|
++ ld1 {v0.16b, v1.16b}, [x0]
|
|
|
|
++ urhadd v0.16b, v0.16b, v2.16b
|
|
|
|
++ urhadd v1.16b, v1.16b, v3.16b
|
|
|
|
++ subs w4, w4, #1
|
|
|
|
++ st1 {v0.16b, v1.16b}, [x0], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_copy16_neon, export=1
|
|
|
|
++ add x5, x0, x1
|
|
|
|
++ lsl x1, x1, #1
|
|
|
|
++ add x6, x2, x3
|
|
|
|
++ lsl x3, x3, #1
|
|
|
|
++1:
|
|
|
|
++ ld1 {v0.16b}, [x2], x3
|
|
|
|
++ ld1 {v1.16b}, [x6], x3
|
|
|
|
++ ld1 {v2.16b}, [x2], x3
|
|
|
|
++ ld1 {v3.16b}, [x6], x3
|
|
|
|
++ subs w4, w4, #4
|
|
|
|
++ st1 {v0.16b}, [x0], x1
|
|
|
|
++ st1 {v1.16b}, [x5], x1
|
|
|
|
++ st1 {v2.16b}, [x0], x1
|
|
|
|
++ st1 {v3.16b}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg16_neon, export=1
|
|
|
|
++ mov x5, x0
|
|
|
|
++1:
|
|
|
|
++ ld1 {v2.16b}, [x2], x3
|
|
|
|
++ ld1 {v0.16b}, [x0], x1
|
|
|
|
++ ld1 {v3.16b}, [x2], x3
|
|
|
|
++ urhadd v0.16b, v0.16b, v2.16b
|
|
|
|
++ ld1 {v1.16b}, [x0], x1
|
|
|
|
++ urhadd v1.16b, v1.16b, v3.16b
|
|
|
|
++ subs w4, w4, #2
|
|
|
|
++ st1 {v0.16b}, [x5], x1
|
|
|
|
++ st1 {v1.16b}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_copy8_neon, export=1
|
|
|
|
++1:
|
|
|
|
++ ld1 {v0.8b}, [x2], x3
|
|
|
|
++ ld1 {v1.8b}, [x2], x3
|
|
|
|
++ subs w4, w4, #2
|
|
|
|
++ st1 {v0.8b}, [x0], x1
|
|
|
|
++ st1 {v1.8b}, [x0], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg8_neon, export=1
|
|
|
|
++ mov x5, x0
|
|
|
|
++1:
|
|
|
|
++ ld1 {v2.8b}, [x2], x3
|
|
|
|
++ ld1 {v0.8b}, [x0], x1
|
|
|
|
++ ld1 {v3.8b}, [x2], x3
|
|
|
|
++ urhadd v0.8b, v0.8b, v2.8b
|
|
|
|
++ ld1 {v1.8b}, [x0], x1
|
|
|
|
++ urhadd v1.8b, v1.8b, v3.8b
|
|
|
|
++ subs w4, w4, #2
|
|
|
|
++ st1 {v0.8b}, [x5], x1
|
|
|
|
++ st1 {v1.8b}, [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_copy4_neon, export=1
|
|
|
|
++1:
|
|
|
|
++ ld1 {v0.s}[0], [x2], x3
|
|
|
|
++ ld1 {v1.s}[0], [x2], x3
|
|
|
|
++ st1 {v0.s}[0], [x0], x1
|
|
|
|
++ ld1 {v2.s}[0], [x2], x3
|
|
|
|
++ st1 {v1.s}[0], [x0], x1
|
|
|
|
++ ld1 {v3.s}[0], [x2], x3
|
|
|
|
++ subs w4, w4, #4
|
|
|
|
++ st1 {v2.s}[0], [x0], x1
|
|
|
|
++ st1 {v3.s}[0], [x0], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vp9_avg4_neon, export=1
|
|
|
|
++ mov x5, x0
|
|
|
|
++1:
|
|
|
|
++ ld1 {v2.s}[0], [x2], x3
|
|
|
|
++ ld1 {v0.s}[0], [x0], x1
|
|
|
|
++ ld1 {v2.s}[1], [x2], x3
|
|
|
|
++ ld1 {v0.s}[1], [x0], x1
|
|
|
|
++ ld1 {v3.s}[0], [x2], x3
|
|
|
|
++ ld1 {v1.s}[0], [x0], x1
|
|
|
|
++ ld1 {v3.s}[1], [x2], x3
|
|
|
|
++ ld1 {v1.s}[1], [x0], x1
|
|
|
|
++ subs w4, w4, #4
|
|
|
|
++ urhadd v0.8b, v0.8b, v2.8b
|
|
|
|
++ urhadd v1.8b, v1.8b, v3.8b
|
|
|
|
++ st1 {v0.s}[0], [x5], x1
|
|
|
|
++ st1 {v0.s}[1], [x5], x1
|
|
|
|
++ st1 {v1.s}[0], [x5], x1
|
|
|
|
++ st1 {v1.s}[1], [x5], x1
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
|
|
|
|
++// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
|
|
|
|
++// dst1-dst2 and dst3-dst4 for size >= 16)
|
|
|
|
++.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
|
|
|
|
++ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
|
|
|
|
++ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
|
|
|
|
++.if \size >= 16
|
|
|
|
++ mla \dst1\().8h, v20.8h, v0.h[\offset]
|
|
|
|
++ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
|
|
|
|
++ mla \dst3\().8h, v22.8h, v0.h[\offset]
|
|
|
|
++ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
|
|
|
|
++ mla \dst2\().8h, v21.8h, v0.h[\offset]
|
|
|
|
++ mla \dst4\().8h, v23.8h, v0.h[\offset]
|
|
|
|
++.elseif \size == 8
|
|
|
|
++ mla \dst1\().8h, v20.8h, v0.h[\offset]
|
|
|
|
++ mla \dst3\().8h, v22.8h, v0.h[\offset]
|
|
|
|
++.else
|
|
|
|
++ mla \dst1\().4h, v20.4h, v0.h[\offset]
|
|
|
|
++ mla \dst3\().4h, v22.4h, v0.h[\offset]
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++// The same as above, but don't accumulate straight into the
|
|
|
|
++// destination, but use a temp register and accumulate with saturation.
|
|
|
|
++.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
|
|
|
|
++ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
|
|
|
|
++ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
|
|
|
|
++.if \size >= 16
|
|
|
|
++ mul v20.8h, v20.8h, v0.h[\offset]
|
|
|
|
++ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
|
|
|
|
++ mul v22.8h, v22.8h, v0.h[\offset]
|
|
|
|
++ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
|
|
|
|
++ mul v21.8h, v21.8h, v0.h[\offset]
|
|
|
|
++ mul v23.8h, v23.8h, v0.h[\offset]
|
|
|
|
++.elseif \size == 8
|
|
|
|
++ mul v20.8h, v20.8h, v0.h[\offset]
|
|
|
|
++ mul v22.8h, v22.8h, v0.h[\offset]
|
|
|
|
++.else
|
|
|
|
++ mul v20.4h, v20.4h, v0.h[\offset]
|
|
|
|
++ mul v22.4h, v22.4h, v0.h[\offset]
|
|
|
|
++.endif
|
|
|
|
++.if \size == 4
|
|
|
|
++ sqadd \dst1\().4h, \dst1\().4h, v20.4h
|
|
|
|
++ sqadd \dst3\().4h, \dst3\().4h, v22.4h
|
|
|
|
++.else
|
|
|
|
++ sqadd \dst1\().8h, \dst1\().8h, v20.8h
|
|
|
|
++ sqadd \dst3\().8h, \dst3\().8h, v22.8h
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sqadd \dst2\().8h, \dst2\().8h, v21.8h
|
|
|
|
++ sqadd \dst4\().8h, \dst4\().8h, v23.8h
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Instantiate a horizontal filter function for the given size.
|
|
|
|
++// This can work on 4, 8 or 16 pixels in parallel; for larger
|
|
|
|
++// widths it will do 16 pixels at a time and loop horizontally.
|
|
|
|
++// The actual width is passed in x5, the height in w4 and the
|
|
|
|
++// filter coefficients in x9. idx2 is the index of the largest
|
|
|
|
++// filter coefficient (3 or 4) and idx1 is the other one of them.
|
|
|
|
++.macro do_8tap_h type, size, idx1, idx2
|
|
|
|
++function \type\()_8tap_\size\()h_\idx1\idx2
|
|
|
|
++ sub x2, x2, #3
|
|
|
|
++ add x6, x0, x1
|
|
|
|
++ add x7, x2, x3
|
|
|
|
++ add x1, x1, x1
|
|
|
|
++ add x3, x3, x3
|
|
|
|
++ // Only size >= 16 loops horizontally and needs
|
|
|
|
++ // reduced dst stride
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sub x1, x1, x5
|
|
|
|
++.endif
|
|
|
|
++ // size >= 16 loads two qwords and increments x2,
|
|
|
|
++ // for size 4/8 it's enough with one qword and no
|
|
|
|
++ // postincrement
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sub x3, x3, x5
|
|
|
|
++ sub x3, x3, #8
|
|
|
|
++.endif
|
|
|
|
++ // Load the filter vector
|
|
|
|
++ ld1 {v0.8h}, [x9]
|
|
|
|
++1:
|
|
|
|
++.if \size >= 16
|
|
|
|
++ mov x9, x5
|
|
|
|
++.endif
|
|
|
|
++ // Load src
|
|
|
|
++.if \size >= 16
|
|
|
|
++ ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
|
|
|
|
++ ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
|
|
|
|
++.else
|
|
|
|
++ ld1 {v4.8b, v5.8b}, [x2]
|
|
|
|
++ ld1 {v16.8b, v17.8b}, [x7]
|
|
|
|
++.endif
|
|
|
|
++ uxtl v4.8h, v4.8b
|
|
|
|
++ uxtl v5.8h, v5.8b
|
|
|
|
++ uxtl v16.8h, v16.8b
|
|
|
|
++ uxtl v17.8h, v17.8b
|
|
|
|
++.if \size >= 16
|
|
|
|
++ uxtl v6.8h, v6.8b
|
|
|
|
++ uxtl v18.8h, v18.8b
|
|
|
|
++.endif
|
|
|
|
++2:
|
|
|
|
++
|
|
|
|
++ // Accumulate, adding idx2 last with a separate
|
|
|
|
++ // saturating add. The positive filter coefficients
|
|
|
|
++ // for all indices except idx2 must add up to less
|
|
|
|
++ // than 127 for this not to overflow.
|
|
|
|
++ mul v1.8h, v4.8h, v0.h[0]
|
|
|
|
++ mul v24.8h, v16.8h, v0.h[0]
|
|
|
|
++.if \size >= 16
|
|
|
|
++ mul v2.8h, v5.8h, v0.h[0]
|
|
|
|
++ mul v25.8h, v17.8h, v0.h[0]
|
|
|
|
++.endif
|
|
|
|
++ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
|
|
|
|
++ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
|
|
|
|
++ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
|
|
|
|
++ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
|
|
|
|
++ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
|
|
|
|
++ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
|
|
|
|
++ extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size
|
|
|
|
++
|
|
|
|
++ // Round, shift and saturate
|
|
|
|
++ sqrshrun v1.8b, v1.8h, #7
|
|
|
|
++ sqrshrun v24.8b, v24.8h, #7
|
|
|
|
++.if \size >= 16
|
|
|
|
++ sqrshrun2 v1.16b, v2.8h, #7
|
|
|
|
++ sqrshrun2 v24.16b, v25.8h, #7
|
|
|
|
++.endif
|
|
|
|
++ // Average
|
|
|
|
++.ifc \type,avg
|
|
|
|
++.if \size >= 16
|
|
|
|
++ ld1 {v2.16b}, [x0]
|
|
|
|
++ ld1 {v3.16b}, [x6]
|
|
|
|
++ urhadd v1.16b, v1.16b, v2.16b
|
|
|
|
++ urhadd v24.16b, v24.16b, v3.16b
|
|
|
|
++.elseif \size == 8
|
|
|
|
++ ld1 {v2.8b}, [x0]
|
|
|
|
++ ld1 {v3.8b}, [x6]
|
|
|
|
++ urhadd v1.8b, v1.8b, v2.8b
|
|
|
|
++ urhadd v24.8b, v24.8b, v3.8b
|
|
|
|
++.else
|
|
|
|
++ ld1 {v2.s}[0], [x0]
|
|
|
|
++ ld1 {v3.s}[0], [x6]
|
|
|
|
++ urhadd v1.8b, v1.8b, v2.8b
|
|
|
|
++ urhadd v24.8b, v24.8b, v3.8b
|
|
|
|
++.endif
|
|
|
|
++.endif
|
|
|
|
++ // Store and loop horizontally (for size >= 16)
|
|
|
|
++.if \size >= 16
|
|
|
|
++ subs x9, x9, #16
|
|
|
|
++ st1 {v1.16b}, [x0], #16
|
|
|
|
++ st1 {v24.16b}, [x6], #16
|
|
|
|
++ b.eq 3f
|
|
|
|
++ mov v4.16b, v6.16b
|
|
|
|
++ mov v16.16b, v18.16b
|
|
|
|
++ ld1 {v6.16b}, [x2], #16
|
|
|
|
++ ld1 {v18.16b}, [x7], #16
|
|
|
|
++ uxtl v5.8h, v6.8b
|
|
|
|
++ uxtl2 v6.8h, v6.16b
|
|
|
|
++ uxtl v17.8h, v18.8b
|
|
|
|
++ uxtl2 v18.8h, v18.16b
|
|
|
|
++ b 2b
|
|
|
|
++.elseif \size == 8
|
|
|
|
++ st1 {v1.8b}, [x0]
|
|
|
|
++ st1 {v24.8b}, [x6]
|
|
|
|
++.else // \size == 4
|
|
|
|
++ st1 {v1.s}[0], [x0]
|
|
|
|
++ st1 {v24.s}[0], [x6]
|
|
|
|
++.endif
|
|
|
|
++3:
|
|
|
|
++ // Loop vertically
|
|
|
|
++ add x0, x0, x1
|
|
|
|
++ add x6, x6, x1
|
|
|
|
++ add x2, x2, x3
|
|
|
|
++ add x7, x7, x3
|
|
|
|
++ subs w4, w4, #2
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_size size
|
|
|
|
++do_8tap_h put, \size, 3, 4
|
|
|
|
++do_8tap_h avg, \size, 3, 4
|
|
|
|
++do_8tap_h put, \size, 4, 3
|
|
|
|
++do_8tap_h avg, \size, 4, 3
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_h_size 4
|
|
|
|
++do_8tap_h_size 8
|
|
|
|
++do_8tap_h_size 16
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_func type, filter, offset, size
|
|
|
|
++function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
|
|
|
|
++ movrel x6, X(ff_vp9_subpel_filters), 256*\offset
|
|
|
|
++ cmp w5, #8
|
|
|
|
++ add x9, x6, w5, uxtw #4
|
|
|
|
++ mov x5, #\size
|
|
|
|
++.if \size >= 16
|
|
|
|
++ b.ge \type\()_8tap_16h_34
|
|
|
|
++ b \type\()_8tap_16h_43
|
|
|
|
++.else
|
|
|
|
++ b.ge \type\()_8tap_\size\()h_34
|
|
|
|
++ b \type\()_8tap_\size\()h_43
|
|
|
|
++.endif
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_h_filters size
|
|
|
|
++do_8tap_h_func put, regular, 1, \size
|
|
|
|
++do_8tap_h_func avg, regular, 1, \size
|
|
|
|
++do_8tap_h_func put, sharp, 2, \size
|
|
|
|
++do_8tap_h_func avg, sharp, 2, \size
|
|
|
|
++do_8tap_h_func put, smooth, 0, \size
|
|
|
|
++do_8tap_h_func avg, smooth, 0, \size
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_h_filters 64
|
|
|
|
++do_8tap_h_filters 32
|
|
|
|
++do_8tap_h_filters 16
|
|
|
|
++do_8tap_h_filters 8
|
|
|
|
++do_8tap_h_filters 4
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Vertical filters
|
|
|
|
++
|
|
|
|
++// Round, shift and saturate and store reg1-reg2 over 4 lines
|
|
|
|
++.macro do_store4 reg1, reg2, tmp1, tmp2, type
|
|
|
|
++ sqrshrun \reg1\().8b, \reg1\().8h, #7
|
|
|
|
++ sqrshrun \reg2\().8b, \reg2\().8h, #7
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ ld1 {\tmp1\().s}[0], [x7], x1
|
|
|
|
++ ld1 {\tmp2\().s}[0], [x7], x1
|
|
|
|
++ ld1 {\tmp1\().s}[1], [x7], x1
|
|
|
|
++ ld1 {\tmp2\().s}[1], [x7], x1
|
|
|
|
++ urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
|
|
|
|
++ urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
|
|
|
|
++.endif
|
|
|
|
++ st1 {\reg1\().s}[0], [x0], x1
|
|
|
|
++ st1 {\reg2\().s}[0], [x0], x1
|
|
|
|
++ st1 {\reg1\().s}[1], [x0], x1
|
|
|
|
++ st1 {\reg2\().s}[1], [x0], x1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Round, shift and saturate and store reg1-4
|
|
|
|
++.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
|
|
|
|
++ sqrshrun \reg1\().8b, \reg1\().8h, #7
|
|
|
|
++ sqrshrun \reg2\().8b, \reg2\().8h, #7
|
|
|
|
++ sqrshrun \reg3\().8b, \reg3\().8h, #7
|
|
|
|
++ sqrshrun \reg4\().8b, \reg4\().8h, #7
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ ld1 {\tmp1\().8b}, [x7], x1
|
|
|
|
++ ld1 {\tmp2\().8b}, [x7], x1
|
|
|
|
++ ld1 {\tmp3\().8b}, [x7], x1
|
|
|
|
++ ld1 {\tmp4\().8b}, [x7], x1
|
|
|
|
++ urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
|
|
|
|
++ urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
|
|
|
|
++ urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
|
|
|
|
++ urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
|
|
|
|
++.endif
|
|
|
|
++ st1 {\reg1\().8b}, [x0], x1
|
|
|
|
++ st1 {\reg2\().8b}, [x0], x1
|
|
|
|
++ st1 {\reg3\().8b}, [x0], x1
|
|
|
|
++ st1 {\reg4\().8b}, [x0], x1
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
|
|
|
|
++// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
|
|
|
|
++// at the end with saturation. Indices 0 and 7 always have negative or zero
|
|
|
|
++// coefficients, so they can be accumulated into tmp1-tmp2 together with the
|
|
|
|
++// largest coefficient.
|
|
|
|
++.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
|
|
|
|
++ mul \dst1\().8h, \src2\().8h, v0.h[1]
|
|
|
|
++ mul \dst2\().8h, \src3\().8h, v0.h[1]
|
|
|
|
++ mul \tmp1\().8h, \src1\().8h, v0.h[0]
|
|
|
|
++ mul \tmp2\().8h, \src2\().8h, v0.h[0]
|
|
|
|
++ mla \dst1\().8h, \src3\().8h, v0.h[2]
|
|
|
|
++ mla \dst2\().8h, \src4\().8h, v0.h[2]
|
|
|
|
++.if \idx1 == 3
|
|
|
|
++ mla \dst1\().8h, \src4\().8h, v0.h[3]
|
|
|
|
++ mla \dst2\().8h, \src5\().8h, v0.h[3]
|
|
|
|
++.else
|
|
|
|
++ mla \dst1\().8h, \src5\().8h, v0.h[4]
|
|
|
|
++ mla \dst2\().8h, \src6\().8h, v0.h[4]
|
|
|
|
++.endif
|
|
|
|
++ mla \dst1\().8h, \src6\().8h, v0.h[5]
|
|
|
|
++ mla \dst2\().8h, \src7\().8h, v0.h[5]
|
|
|
|
++ mla \tmp1\().8h, \src8\().8h, v0.h[7]
|
|
|
|
++ mla \tmp2\().8h, \src9\().8h, v0.h[7]
|
|
|
|
++ mla \dst1\().8h, \src7\().8h, v0.h[6]
|
|
|
|
++ mla \dst2\().8h, \src8\().8h, v0.h[6]
|
|
|
|
++.if \idx2 == 3
|
|
|
|
++ mla \tmp1\().8h, \src4\().8h, v0.h[3]
|
|
|
|
++ mla \tmp2\().8h, \src5\().8h, v0.h[3]
|
|
|
|
++.else
|
|
|
|
++ mla \tmp1\().8h, \src5\().8h, v0.h[4]
|
|
|
|
++ mla \tmp2\().8h, \src6\().8h, v0.h[4]
|
|
|
|
++.endif
|
|
|
|
++ sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
|
|
|
|
++ sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Load pixels and extend them to 16 bit
|
|
|
|
++.macro loadl dst1, dst2, dst3, dst4
|
|
|
|
++ ld1 {v1.8b}, [x2], x3
|
|
|
|
++ ld1 {v2.8b}, [x2], x3
|
|
|
|
++ ld1 {v3.8b}, [x2], x3
|
|
|
|
++.ifnb \dst4
|
|
|
|
++ ld1 {v4.8b}, [x2], x3
|
|
|
|
++.endif
|
|
|
|
++ uxtl \dst1\().8h, v1.8b
|
|
|
|
++ uxtl \dst2\().8h, v2.8b
|
|
|
|
++ uxtl \dst3\().8h, v3.8b
|
|
|
|
++.ifnb \dst4
|
|
|
|
++ uxtl \dst4\().8h, v4.8b
|
|
|
|
++.endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++// Instantiate a vertical filter function for filtering 8 pixels at a time.
|
|
|
|
++// The height is passed in x4, the width in x5 and the filter coefficients
|
|
|
|
++// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
|
|
|
|
++// and idx1 is the other one of them.
|
|
|
|
++.macro do_8tap_8v type, idx1, idx2
|
|
|
|
++function \type\()_8tap_8v_\idx1\idx2
|
|
|
|
++ sub x2, x2, x3, lsl #1
|
|
|
|
++ sub x2, x2, x3
|
|
|
|
++ ld1 {v0.8h}, [x6]
|
|
|
|
++1:
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ mov x7, x0
|
|
|
|
++.endif
|
|
|
|
++ mov x6, x4
|
|
|
|
++
|
|
|
|
++ loadl v17, v18, v19
|
|
|
|
++
|
|
|
|
++ loadl v20, v21, v22, v23
|
|
|
|
++2:
|
|
|
|
++ loadl v24, v25, v26, v27
|
|
|
|
++ convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
|
|
|
|
++ convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
|
|
|
|
++ do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #4
|
|
|
|
++ b.eq 8f
|
|
|
|
++
|
|
|
|
++ loadl v16, v17, v18, v19
|
|
|
|
++ convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
|
|
|
|
++ convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
|
|
|
|
++ do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #4
|
|
|
|
++ b.eq 8f
|
|
|
|
++
|
|
|
|
++ loadl v20, v21, v22, v23
|
|
|
|
++ convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
|
|
|
|
++ convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
|
|
|
|
++ do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
|
|
|
|
++
|
|
|
|
++ subs x6, x6, #4
|
|
|
|
++ b.ne 2b
|
|
|
|
++
|
|
|
|
++8:
|
|
|
|
++ subs x5, x5, #8
|
|
|
|
++ b.eq 9f
|
|
|
|
++ // x0 -= h * dst_stride
|
|
|
|
++ msub x0, x1, x4, x0
|
|
|
|
++ // x2 -= h * src_stride
|
|
|
|
++ msub x2, x3, x4, x2
|
|
|
|
++ // x2 -= 8 * src_stride
|
|
|
|
++ sub x2, x2, x3, lsl #3
|
|
|
|
++ // x2 += 1 * src_stride
|
|
|
|
++ add x2, x2, x3
|
|
|
|
++ add x2, x2, #8
|
|
|
|
++ add x0, x0, #8
|
|
|
|
++ b 1b
|
|
|
|
++9:
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_8v put, 3, 4
|
|
|
|
++do_8tap_8v put, 4, 3
|
|
|
|
++do_8tap_8v avg, 3, 4
|
|
|
|
++do_8tap_8v avg, 4, 3
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++// Instantiate a vertical filter function for filtering a 4 pixels wide
|
|
|
|
++// slice. The first half of the registers contain one row, while the second
|
|
|
|
++// half of a register contains the second-next row (also stored in the first
|
|
|
|
++// half of the register two steps ahead). The convolution does two outputs
|
|
|
|
++// at a time; the output of v17-v24 into one, and v18-v25 into another one.
|
|
|
|
++// The first half of first output is the first output row, the first half
|
|
|
|
++// of the other output is the second output row. The second halves of the
|
|
|
|
++// registers are rows 3 and 4.
|
|
|
|
++// This only is designed to work for 4 or 8 output lines.
|
|
|
|
++.macro do_8tap_4v type, idx1, idx2
|
|
|
|
++function \type\()_8tap_4v_\idx1\idx2
|
|
|
|
++ sub x2, x2, x3, lsl #1
|
|
|
|
++ sub x2, x2, x3
|
|
|
|
++ ld1 {v0.8h}, [x6]
|
|
|
|
++.ifc \type,avg
|
|
|
|
++ mov x7, x0
|
|
|
|
++.endif
|
|
|
|
++
|
|
|
|
++ ld1 {v1.s}[0], [x2], x3
|
|
|
|
++ ld1 {v2.s}[0], [x2], x3
|
|
|
|
++ ld1 {v3.s}[0], [x2], x3
|
|
|
|
++ ld1 {v4.s}[0], [x2], x3
|
|
|
|
++ ld1 {v5.s}[0], [x2], x3
|
|
|
|
++ ld1 {v6.s}[0], [x2], x3
|
|
|
|
++ trn1 v1.2s, v1.2s, v3.2s
|
|
|
|
++ ld1 {v7.s}[0], [x2], x3
|
|
|
|
++ trn1 v2.2s, v2.2s, v4.2s
|
|
|
|
++ ld1 {v26.s}[0], [x2], x3
|
|
|
|
++ uxtl v17.8h, v1.8b
|
|
|
|
++ trn1 v3.2s, v3.2s, v5.2s
|
|
|
|
++ ld1 {v27.s}[0], [x2], x3
|
|
|
|
++ uxtl v18.8h, v2.8b
|
|
|
|
++ trn1 v4.2s, v4.2s, v6.2s
|
|
|
|
++ ld1 {v28.s}[0], [x2], x3
|
|
|
|
++ uxtl v19.8h, v3.8b
|
|
|
|
++ trn1 v5.2s, v5.2s, v7.2s
|
|
|
|
++ ld1 {v29.s}[0], [x2], x3
|
|
|
|
++ uxtl v20.8h, v4.8b
|
|
|
|
++ trn1 v6.2s, v6.2s, v26.2s
|
|
|
|
++ uxtl v21.8h, v5.8b
|
|
|
|
++ trn1 v7.2s, v7.2s, v27.2s
|
|
|
|
++ uxtl v22.8h, v6.8b
|
|
|
|
++ trn1 v26.2s, v26.2s, v28.2s
|
|
|
|
++ uxtl v23.8h, v7.8b
|
|
|
|
++ trn1 v27.2s, v27.2s, v29.2s
|
|
|
|
++ uxtl v24.8h, v26.8b
|
|
|
|
++ uxtl v25.8h, v27.8b
|
|
|
|
++
|
|
|
|
++ convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
|
|
|
|
++ do_store4 v1, v2, v5, v6, \type
|
|
|
|
++
|
|
|
|
++ subs x4, x4, #4
|
|
|
|
++ b.eq 9f
|
|
|
|
++
|
|
|
|
++ ld1 {v1.s}[0], [x2], x3
|
|
|
|
++ ld1 {v2.s}[0], [x2], x3
|
|
|
|
++ trn1 v28.2s, v28.2s, v1.2s
|
|
|
|
++ trn1 v29.2s, v29.2s, v2.2s
|
|
|
|
++ ld1 {v1.s}[1], [x2], x3
|
|
|
|
++ uxtl v26.8h, v28.8b
|
|
|
|
++ ld1 {v2.s}[1], [x2], x3
|
|
|
|
++ uxtl v27.8h, v29.8b
|
|
|
|
++ uxtl v28.8h, v1.8b
|
|
|
|
++ uxtl v29.8h, v2.8b
|
|
|
|
++
|
|
|
|
++ convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
|
|
|
|
++ do_store4 v1, v2, v5, v6, \type
|
|
|
|
++
|
|
|
|
++9:
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_4v put, 3, 4
|
|
|
|
++do_8tap_4v put, 4, 3
|
|
|
|
++do_8tap_4v avg, 3, 4
|
|
|
|
++do_8tap_4v avg, 4, 3
|
|
|
|
++
|
|
|
|
++
|
|
|
|
++.macro do_8tap_v_func type, filter, offset, size
|
|
|
|
++function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
|
|
|
|
++ uxtw x4, w4
|
|
|
|
++ movrel x5, X(ff_vp9_subpel_filters), 256*\offset
|
|
|
|
++ cmp w6, #8
|
|
|
|
++ add x6, x5, w6, uxtw #4
|
|
|
|
++ mov x5, #\size
|
|
|
|
++.if \size >= 8
|
|
|
|
++ b.ge \type\()_8tap_8v_34
|
|
|
|
++ b \type\()_8tap_8v_43
|
|
|
|
++.else
|
|
|
|
++ b.ge \type\()_8tap_4v_34
|
|
|
|
++ b \type\()_8tap_4v_43
|
|
|
|
++.endif
|
|
|
|
++endfunc
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro do_8tap_v_filters size
|
|
|
|
++do_8tap_v_func put, regular, 1, \size
|
|
|
|
++do_8tap_v_func avg, regular, 1, \size
|
|
|
|
++do_8tap_v_func put, sharp, 2, \size
|
|
|
|
++do_8tap_v_func avg, sharp, 2, \size
|
|
|
|
++do_8tap_v_func put, smooth, 0, \size
|
|
|
|
++do_8tap_v_func avg, smooth, 0, \size
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++do_8tap_v_filters 64
|
|
|
|
++do_8tap_v_filters 32
|
|
|
|
++do_8tap_v_filters 16
|
|
|
|
++do_8tap_v_filters 8
|
|
|
|
++do_8tap_v_filters 4
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/asm.S b/media/ffvpx/libavutil/aarch64/asm.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/asm.S
|
|
|
|
+@@ -0,0 +1,104 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++#ifdef __ELF__
|
|
|
|
++# define ELF
|
|
|
|
++#else
|
|
|
|
++# define ELF #
|
|
|
|
++#endif
|
|
|
|
++
|
|
|
|
++#if HAVE_AS_FUNC
|
|
|
|
++# define FUNC
|
|
|
|
++#else
|
|
|
|
++# define FUNC #
|
|
|
|
++#endif
|
|
|
|
++
|
|
|
|
++.macro function name, export=0, align=2
|
|
|
|
++ .macro endfunc
|
|
|
|
++ELF .size \name, . - \name
|
|
|
|
++FUNC .endfunc
|
|
|
|
++ .purgem endfunc
|
|
|
|
++ .endm
|
|
|
|
++ .text
|
|
|
|
++ .align \align
|
|
|
|
++ .if \export
|
|
|
|
++ .global EXTERN_ASM\name
|
|
|
|
++ELF .type EXTERN_ASM\name, %function
|
|
|
|
++FUNC .func EXTERN_ASM\name
|
|
|
|
++EXTERN_ASM\name:
|
|
|
|
++ .else
|
|
|
|
++ELF .type \name, %function
|
|
|
|
++FUNC .func \name
|
|
|
|
++\name:
|
|
|
|
++ .endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro const name, align=2, relocate=0
|
|
|
|
++ .macro endconst
|
|
|
|
++ELF .size \name, . - \name
|
|
|
|
++ .purgem endconst
|
|
|
|
++ .endm
|
|
|
|
++#if HAVE_SECTION_DATA_REL_RO
|
|
|
|
++.if \relocate
|
|
|
|
++ .section .data.rel.ro
|
|
|
|
++.else
|
|
|
|
++ .section .rodata
|
|
|
|
++.endif
|
|
|
|
++#elif !defined(__MACH__)
|
|
|
|
++ .section .rodata
|
|
|
|
++#else
|
|
|
|
++ .const_data
|
|
|
|
++#endif
|
|
|
|
++ .align \align
|
|
|
|
++\name:
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++.macro movrel rd, val, offset=0
|
|
|
|
++#if CONFIG_PIC && defined(__APPLE__)
|
|
|
|
++ .if \offset < 0
|
|
|
|
++ adrp \rd, \val@PAGE
|
|
|
|
++ add \rd, \rd, \val@PAGEOFF
|
|
|
|
++ sub \rd, \rd, -(\offset)
|
|
|
|
++ .else
|
|
|
|
++ adrp \rd, \val+(\offset)@PAGE
|
|
|
|
++ add \rd, \rd, \val+(\offset)@PAGEOFF
|
|
|
|
++ .endif
|
|
|
|
++#elif CONFIG_PIC && defined(_WIN32)
|
|
|
|
++ .if \offset < 0
|
|
|
|
++ adrp \rd, \val
|
|
|
|
++ add \rd, \rd, :lo12:\val
|
|
|
|
++ sub \rd, \rd, -(\offset)
|
|
|
|
++ .else
|
|
|
|
++ adrp \rd, \val+(\offset)
|
|
|
|
++ add \rd, \rd, :lo12:\val+(\offset)
|
|
|
|
++ .endif
|
|
|
|
++#elif CONFIG_PIC
|
|
|
|
++ adrp \rd, \val+(\offset)
|
|
|
|
++ add \rd, \rd, :lo12:\val+(\offset)
|
|
|
|
++#else
|
|
|
|
++ ldr \rd, =\val+\offset
|
|
|
|
++#endif
|
|
|
|
++.endm
|
|
|
|
++
|
|
|
|
++#define GLUE(a, b) a ## b
|
|
|
|
++#define JOIN(a, b) GLUE(a, b)
|
|
|
|
++#define X(s) JOIN(EXTERN_ASM, s)
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/bswap.h b/media/ffvpx/libavutil/aarch64/bswap.h
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/bswap.h
|
|
|
|
+@@ -0,0 +1,51 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#ifndef AVUTIL_AARCH64_BSWAP_H
|
|
|
|
++#define AVUTIL_AARCH64_BSWAP_H
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++#include "config.h"
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++
|
|
|
|
++#if HAVE_INLINE_ASM
|
|
|
|
++
|
|
|
|
++#define av_bswap16 av_bswap16
|
|
|
|
++static av_always_inline av_const unsigned av_bswap16(unsigned x)
|
|
|
|
++{
|
|
|
|
++ __asm__("rev16 %w0, %w0" : "+r"(x));
|
|
|
|
++ return x;
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define av_bswap32 av_bswap32
|
|
|
|
++static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
|
|
|
|
++{
|
|
|
|
++ __asm__("rev %w0, %w0" : "+r"(x));
|
|
|
|
++ return x;
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#define av_bswap64 av_bswap64
|
|
|
|
++static av_always_inline av_const uint64_t av_bswap64(uint64_t x)
|
|
|
|
++{
|
|
|
|
++ __asm__("rev %0, %0" : "+r"(x));
|
|
|
|
++ return x;
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#endif /* HAVE_INLINE_ASM */
|
|
|
|
++
|
|
|
|
++#endif /* AVUTIL_AARCH64_BSWAP_H */
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/cpu.c b/media/ffvpx/libavutil/aarch64/cpu.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/cpu.c
|
|
|
|
+@@ -0,0 +1,38 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/cpu_internal.h"
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++int ff_get_cpu_flags_aarch64(void)
|
|
|
|
++{
|
|
|
|
++ return AV_CPU_FLAG_ARMV8 * HAVE_ARMV8 |
|
|
|
|
++ AV_CPU_FLAG_NEON * HAVE_NEON |
|
|
|
|
++ AV_CPU_FLAG_VFP * HAVE_VFP;
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++size_t ff_get_cpu_max_align_aarch64(void)
|
|
|
|
++{
|
|
|
|
++ int flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (flags & AV_CPU_FLAG_NEON)
|
|
|
|
++ return 16;
|
|
|
|
++
|
|
|
|
++ return 8;
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/cpu.h b/media/ffvpx/libavutil/aarch64/cpu.h
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/cpu.h
|
|
|
|
+@@ -0,0 +1,29 @@
|
|
|
|
++/*
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#ifndef AVUTIL_AARCH64_CPU_H
|
|
|
|
++#define AVUTIL_AARCH64_CPU_H
|
|
|
|
++
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/cpu_internal.h"
|
|
|
|
++
|
|
|
|
++#define have_armv8(flags) CPUEXT(flags, ARMV8)
|
|
|
|
++#define have_neon(flags) CPUEXT(flags, NEON)
|
|
|
|
++#define have_vfp(flags) CPUEXT(flags, VFP)
|
|
|
|
++
|
|
|
|
++#endif /* AVUTIL_AARCH64_CPU_H */
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_init.c b/media/ffvpx/libavutil/aarch64/float_dsp_init.c
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/float_dsp_init.c
|
|
|
|
+@@ -0,0 +1,69 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON optimised Float DSP functions
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++
|
|
|
|
++#include "libavutil/attributes.h"
|
|
|
|
++#include "libavutil/cpu.h"
|
|
|
|
++#include "libavutil/float_dsp.h"
|
|
|
|
++#include "cpu.h"
|
|
|
|
++
|
|
|
|
++void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1,
|
|
|
|
++ int len);
|
|
|
|
++
|
|
|
|
++void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
|
|
|
|
++ int len);
|
|
|
|
++
|
|
|
|
++void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
|
|
|
|
++ int len);
|
|
|
|
++
|
|
|
|
++void ff_vector_dmul_scalar_neon(double *dst, const double *src, double mul,
|
|
|
|
++ int len);
|
|
|
|
++
|
|
|
|
++void ff_vector_fmul_window_neon(float *dst, const float *src0,
|
|
|
|
++ const float *src1, const float *win, int len);
|
|
|
|
++
|
|
|
|
++void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
|
|
|
|
++ const float *src2, int len);
|
|
|
|
++
|
|
|
|
++void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
|
|
|
|
++ const float *src1, int len);
|
|
|
|
++
|
|
|
|
++void ff_butterflies_float_neon(float *v1, float *v2, int len);
|
|
|
|
++
|
|
|
|
++float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
|
|
|
|
++
|
|
|
|
++av_cold void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp)
|
|
|
|
++{
|
|
|
|
++ int cpu_flags = av_get_cpu_flags();
|
|
|
|
++
|
|
|
|
++ if (have_neon(cpu_flags)) {
|
|
|
|
++ fdsp->butterflies_float = ff_butterflies_float_neon;
|
|
|
|
++ fdsp->scalarproduct_float = ff_scalarproduct_float_neon;
|
|
|
|
++ fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_neon;
|
|
|
|
++ fdsp->vector_fmul = ff_vector_fmul_neon;
|
|
|
|
++ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon;
|
|
|
|
++ fdsp->vector_fmul_add = ff_vector_fmul_add_neon;
|
|
|
|
++ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
|
|
|
|
++ fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
|
|
|
|
++ fdsp->vector_fmul_window = ff_vector_fmul_window_neon;
|
|
|
|
++ }
|
|
|
|
++}
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_neon.S b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S
|
|
|
|
+@@ -0,0 +1,202 @@
|
|
|
|
++/*
|
|
|
|
++ * ARM NEON optimised Float DSP functions
|
|
|
|
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
++ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#include "config.h"
|
|
|
|
++#include "asm.S"
|
|
|
|
++
|
|
|
|
++function ff_vector_fmul_neon, export=1
|
|
|
|
++1: subs w3, w3, #16
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++ ld1 {v2.4S, v3.4S}, [x1], #32
|
|
|
|
++ ld1 {v4.4S, v5.4S}, [x2], #32
|
|
|
|
++ ld1 {v6.4S, v7.4S}, [x2], #32
|
|
|
|
++ fmul v16.4S, v0.4S, v4.4S
|
|
|
|
++ fmul v17.4S, v1.4S, v5.4S
|
|
|
|
++ fmul v18.4S, v2.4S, v6.4S
|
|
|
|
++ fmul v19.4S, v3.4S, v7.4S
|
|
|
|
++ st1 {v16.4S, v17.4S}, [x0], #32
|
|
|
|
++ st1 {v18.4S, v19.4S}, [x0], #32
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vector_fmac_scalar_neon, export=1
|
|
|
|
++ mov x3, #-32
|
|
|
|
++1: subs w2, w2, #16
|
|
|
|
++ ld1 {v16.4S, v17.4S}, [x0], #32
|
|
|
|
++ ld1 {v18.4S, v19.4S}, [x0], x3
|
|
|
|
++ ld1 {v4.4S, v5.4S}, [x1], #32
|
|
|
|
++ ld1 {v6.4S, v7.4S}, [x1], #32
|
|
|
|
++ fmla v16.4S, v4.4S, v0.S[0]
|
|
|
|
++ fmla v17.4S, v5.4S, v0.S[0]
|
|
|
|
++ fmla v18.4S, v6.4S, v0.S[0]
|
|
|
|
++ fmla v19.4S, v7.4S, v0.S[0]
|
|
|
|
++ st1 {v16.4S, v17.4S}, [x0], #32
|
|
|
|
++ st1 {v18.4S, v19.4S}, [x0], #32
|
|
|
|
++ b.ne 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vector_fmul_scalar_neon, export=1
|
|
|
|
++ mov w4, #15
|
|
|
|
++ bics w3, w2, w4
|
|
|
|
++ dup v16.4S, v0.S[0]
|
|
|
|
++ b.eq 3f
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++1: subs w3, w3, #16
|
|
|
|
++ fmul v0.4S, v0.4S, v16.4S
|
|
|
|
++ ld1 {v2.4S, v3.4S}, [x1], #32
|
|
|
|
++ fmul v1.4S, v1.4S, v16.4S
|
|
|
|
++ fmul v2.4S, v2.4S, v16.4S
|
|
|
|
++ st1 {v0.4S, v1.4S}, [x0], #32
|
|
|
|
++ fmul v3.4S, v3.4S, v16.4S
|
|
|
|
++ b.eq 2f
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++ st1 {v2.4S, v3.4S}, [x0], #32
|
|
|
|
++ b 1b
|
|
|
|
++2: ands w2, w2, #15
|
|
|
|
++ st1 {v2.4S, v3.4S}, [x0], #32
|
|
|
|
++ b.eq 4f
|
|
|
|
++3: ld1 {v0.4S}, [x1], #16
|
|
|
|
++ fmul v0.4S, v0.4S, v16.4S
|
|
|
|
++ st1 {v0.4S}, [x0], #16
|
|
|
|
++ subs w2, w2, #4
|
|
|
|
++ b.gt 3b
|
|
|
|
++4: ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vector_dmul_scalar_neon, export=1
|
|
|
|
++ dup v16.2D, v0.D[0]
|
|
|
|
++ ld1 {v0.2D, v1.2D}, [x1], #32
|
|
|
|
++1: subs w2, w2, #8
|
|
|
|
++ fmul v0.2D, v0.2D, v16.2D
|
|
|
|
++ ld1 {v2.2D, v3.2D}, [x1], #32
|
|
|
|
++ fmul v1.2D, v1.2D, v16.2D
|
|
|
|
++ fmul v2.2D, v2.2D, v16.2D
|
|
|
|
++ st1 {v0.2D, v1.2D}, [x0], #32
|
|
|
|
++ fmul v3.2D, v3.2D, v16.2D
|
|
|
|
++ ld1 {v0.2D, v1.2D}, [x1], #32
|
|
|
|
++ st1 {v2.2D, v3.2D}, [x0], #32
|
|
|
|
++ b.gt 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vector_fmul_window_neon, export=1
|
|
|
|
++ sxtw x4, w4 // len
|
|
|
|
++ sub x2, x2, #8
|
|
|
|
++ sub x5, x4, #2
|
|
|
|
++ add x2, x2, x5, lsl #2 // src1 + 4 * (len - 4)
|
|
|
|
++ add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
|
|
|
|
++ add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
|
|
|
|
++ mov x7, #-16
|
|
|
|
++ ld1 {v0.4S}, [x1], #16 // s0
|
|
|
|
++ ld1 {v2.4S}, [x3], #16 // wi
|
|
|
|
++ ld1 {v1.4S}, [x2], x7 // s1
|
|
|
|
++1: ld1 {v3.4S}, [x6], x7 // wj
|
|
|
|
++ subs x4, x4, #4
|
|
|
|
++ fmul v17.4S, v0.4S, v2.4S // s0 * wi
|
|
|
|
++ rev64 v4.4S, v1.4S
|
|
|
|
++ rev64 v5.4S, v3.4S
|
|
|
|
++ rev64 v17.4S, v17.4S
|
|
|
|
++ ext v4.16B, v4.16B, v4.16B, #8 // s1_r
|
|
|
|
++ ext v5.16B, v5.16B, v5.16B, #8 // wj_r
|
|
|
|
++ ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
|
|
|
|
++ fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
|
|
|
|
++ fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
|
|
|
|
++ b.eq 2f
|
|
|
|
++ ld1 {v0.4S}, [x1], #16
|
|
|
|
++ fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
|
|
|
|
++ st1 {v17.4S}, [x5], x7
|
|
|
|
++ ld1 {v2.4S}, [x3], #16
|
|
|
|
++ ld1 {v1.4S}, [x2], x7
|
|
|
|
++ st1 {v16.4S}, [x0], #16
|
|
|
|
++ b 1b
|
|
|
|
++2:
|
|
|
|
++ fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
|
|
|
|
++ st1 {v17.4S}, [x5], x7
|
|
|
|
++ st1 {v16.4S}, [x0], #16
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vector_fmul_add_neon, export=1
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++ ld1 {v2.4S, v3.4S}, [x2], #32
|
|
|
|
++ ld1 {v4.4S, v5.4S}, [x3], #32
|
|
|
|
++1: subs w4, w4, #8
|
|
|
|
++ fmla v4.4S, v0.4S, v2.4S
|
|
|
|
++ fmla v5.4S, v1.4S, v3.4S
|
|
|
|
++ b.eq 2f
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++ ld1 {v2.4S, v3.4S}, [x2], #32
|
|
|
|
++ st1 {v4.4S, v5.4S}, [x0], #32
|
|
|
|
++ ld1 {v4.4S, v5.4S}, [x3], #32
|
|
|
|
++ b 1b
|
|
|
|
++2: st1 {v4.4S, v5.4S}, [x0], #32
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_vector_fmul_reverse_neon, export=1
|
|
|
|
++ sxtw x3, w3
|
|
|
|
++ add x2, x2, x3, lsl #2
|
|
|
|
++ sub x2, x2, #32
|
|
|
|
++ mov x4, #-32
|
|
|
|
++ ld1 {v2.4S, v3.4S}, [x2], x4
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++1: subs x3, x3, #8
|
|
|
|
++ rev64 v3.4S, v3.4S
|
|
|
|
++ rev64 v2.4S, v2.4S
|
|
|
|
++ ext v3.16B, v3.16B, v3.16B, #8
|
|
|
|
++ ext v2.16B, v2.16B, v2.16B, #8
|
|
|
|
++ fmul v16.4S, v0.4S, v3.4S
|
|
|
|
++ fmul v17.4S, v1.4S, v2.4S
|
|
|
|
++ b.eq 2f
|
|
|
|
++ ld1 {v2.4S, v3.4S}, [x2], x4
|
|
|
|
++ ld1 {v0.4S, v1.4S}, [x1], #32
|
|
|
|
++ st1 {v16.4S, v17.4S}, [x0], #32
|
|
|
|
++ b 1b
|
|
|
|
++2: st1 {v16.4S, v17.4S}, [x0], #32
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_butterflies_float_neon, export=1
|
|
|
|
++1: ld1 {v0.4S}, [x0]
|
|
|
|
++ ld1 {v1.4S}, [x1]
|
|
|
|
++ subs w2, w2, #4
|
|
|
|
++ fsub v2.4S, v0.4S, v1.4S
|
|
|
|
++ fadd v3.4S, v0.4S, v1.4S
|
|
|
|
++ st1 {v2.4S}, [x1], #16
|
|
|
|
++ st1 {v3.4S}, [x0], #16
|
|
|
|
++ b.gt 1b
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
++
|
|
|
|
++function ff_scalarproduct_float_neon, export=1
|
|
|
|
++ movi v2.4S, #0
|
|
|
|
++1: ld1 {v0.4S}, [x0], #16
|
|
|
|
++ ld1 {v1.4S}, [x1], #16
|
|
|
|
++ subs w2, w2, #4
|
|
|
|
++ fmla v2.4S, v0.4S, v1.4S
|
|
|
|
++ b.gt 1b
|
|
|
|
++ faddp v0.4S, v2.4S, v2.4S
|
|
|
|
++ faddp s0, v0.2S
|
|
|
|
++ ret
|
|
|
|
++endfunc
|
|
|
|
+diff --git a/media/ffvpx/libavutil/aarch64/timer.h b/media/ffvpx/libavutil/aarch64/timer.h
|
|
|
|
+new file mode 100644
|
|
|
|
+--- /dev/null
|
|
|
|
++++ b/media/ffvpx/libavutil/aarch64/timer.h
|
|
|
|
+@@ -0,0 +1,44 @@
|
|
|
|
++/*
|
|
|
|
++ * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
++ *
|
|
|
|
++ * This file is part of FFmpeg.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is free software; you can redistribute it and/or
|
|
|
|
++ * modify it under the terms of the GNU Lesser General Public
|
|
|
|
++ * License as published by the Free Software Foundation; either
|
|
|
|
++ * version 2.1 of the License, or (at your option) any later version.
|
|
|
|
++ *
|
|
|
|
++ * FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
++ * Lesser General Public License for more details.
|
|
|
|
++ *
|
|
|
|
++ * You should have received a copy of the GNU Lesser General Public
|
|
|
|
++ * License along with FFmpeg; if not, write to the Free Software
|
|
|
|
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
++ */
|
|
|
|
++
|
|
|
|
++#ifndef AVUTIL_AARCH64_TIMER_H
|
|
|
|
++#define AVUTIL_AARCH64_TIMER_H
|
|
|
|
++
|
|
|
|
++#include <stdint.h>
|
|
|
|
++#include "config.h"
|
|
|
|
++
|
|
|
|
++#if HAVE_INLINE_ASM
|
|
|
|
++
|
|
|
|
++#define AV_READ_TIME read_time
|
|
|
|
++
|
|
|
|
++static inline uint64_t read_time(void)
|
|
|
|
++{
|
|
|
|
++ uint64_t cycle_counter;
|
|
|
|
++ __asm__ volatile(
|
|
|
|
++ "isb \t\n"
|
|
|
|
++ "mrs %0, pmccntr_el0 "
|
|
|
|
++ : "=r"(cycle_counter) :: "memory" );
|
|
|
|
++
|
|
|
|
++ return cycle_counter;
|
|
|
|
++}
|
|
|
|
++
|
|
|
|
++#endif /* HAVE_INLINE_ASM */
|
|
|
|
++
|
|
|
|
++#endif /* AVUTIL_AARCH64_TIMER_H */
|
|
|
|
+
|