
get rid of yasm

Frank-Rainer Grahl 1 month ago
commit c70fcdef8e
35 changed files with 23394 additions and 121 deletions
  1. mozilla-release/patches/1525393-1-75a1.patch (+384 -0)
  2. mozilla-release/patches/1540760-1-68a1.patch (+105 -0)
  3. mozilla-release/patches/1540760-2-68a1.patch (+74 -0)
  4. mozilla-release/patches/1540760-3-68a1.patch (+57 -0)
  5. mozilla-release/patches/1540760-4-68a1.patch (+115 -0)
  6. mozilla-release/patches/1540760-5-68a1.patch (+12454 -0)
  7. mozilla-release/patches/1540760-6-68a1.patch (+1016 -0)
  8. mozilla-release/patches/1585358-71a1.patch (+2 -2)
  9. mozilla-release/patches/1585359-71a1.patch (+6250 -0)
  10. mozilla-release/patches/1650299-80a1.patch (+204 -0)
  11. mozilla-release/patches/1656063-81a1.patch (+33 -0)
  12. mozilla-release/patches/1669888-83a1.patch (+57 -0)
  13. mozilla-release/patches/1692940-01-88a1.patch (+44 -0)
  14. mozilla-release/patches/1692940-02-88a1.patch (+208 -0)
  15. mozilla-release/patches/1692940-03-88a1.patch (+120 -0)
  16. mozilla-release/patches/1692940-04-88a1.patch (+166 -0)
  17. mozilla-release/patches/1692940-05-88a1.patch (+73 -38)
  18. mozilla-release/patches/1692940-06-88a1.patch (+218 -0)
  19. mozilla-release/patches/1692940-07-88a1.patch (+80 -0)
  20. mozilla-release/patches/1692940-08-88a1.patch (+89 -0)
  21. mozilla-release/patches/1692940-09-88a1.patch (+197 -0)
  22. mozilla-release/patches/1692940-10no11-88a1.patch (+397 -0)
  23. mozilla-release/patches/1692940-12-88a1.patch (+32 -0)
  24. mozilla-release/patches/1692945-1-87a1.patch (+33 -0)
  25. mozilla-release/patches/1692945-2-87a1.patch (+80 -0)
  26. mozilla-release/patches/1693215-1-88a1.patch (+122 -0)
  27. mozilla-release/patches/1693215-2-88a1.patch (+168 -0)
  28. mozilla-release/patches/1693215-3-88a1.patch (+152 -0)
  29. mozilla-release/patches/1693498-1-88a1.patch (+89 -0)
  30. mozilla-release/patches/1693498-2-88a1.patch (+258 -0)
  31. mozilla-release/patches/1709303-1-94a1.patch (+16 -16)
  32. mozilla-release/patches/NOBUG-nasm-icu-25320.patch (+0 -29)
  33. mozilla-release/patches/TOP-1445683-14-PLASTER-aom-fix-win32-bustage-2535.patch (+0 -28)
  34. mozilla-release/patches/TOP-NOBUG-fixnasmcheck-25320.patch (+66 -0)
  35. mozilla-release/patches/series (+35 -8)

+ 384 - 0
mozilla-release/patches/1525393-1-75a1.patch

@@ -0,0 +1,384 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1582826660 0
+# Node ID 6d2821c6e36e659b2d007f1782d1e3346b7c3af6
+# Parent  3f2c958afd9e5e713b8f3a186956196a91878b99
+Bug 1525393 - Changes to update scripts for libvpx 1.8.2; r=bryce
+
+This makes the following changes:
+* Change update.py to use Python 3.
+* Have update.py remove some unused portions of the upstream library.
+* Update local patches to apply against libvpx 1.8.2.
+* Remove local patches that are no longer necessary.
+* Update vs build configurations in generate_sources_mozbuild.sh.
+* Remove the #define for stdint from VPXDecoder.h.
+* Disable AVX512 support
+* Make sure float_control_word.asm is included in win64 builds
+
+Differential Revision: https://phabricator.services.mozilla.com/D63919
+
+diff --git a/dom/media/platforms/agnostic/VPXDecoder.h b/dom/media/platforms/agnostic/VPXDecoder.h
+--- a/dom/media/platforms/agnostic/VPXDecoder.h
++++ b/dom/media/platforms/agnostic/VPXDecoder.h
+@@ -5,16 +5,17 @@
+  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+ #if !defined(VPXDecoder_h_)
+ #define VPXDecoder_h_
+ 
+ #include "PlatformDecoderModule.h"
+ #include "mozilla/Span.h"
+ 
+ #include <stdint.h>
++// Remove when Bug 1525393 part 2 goes in.
+ #define VPX_DONT_DEFINE_STDINT_TYPES
+ #include "vpx/vp8dx.h"
+ #include "vpx/vpx_codec.h"
+ #include "vpx/vpx_decoder.h"
+ 
+ namespace mozilla {
+ 
+ DDLoggedTypeDeclNameAndBase(VPXDecoder, MediaDataDecoder);
+diff --git a/dom/media/platforms/agnostic/VPXDecoder.h.1525393-1.later b/dom/media/platforms/agnostic/VPXDecoder.h.1525393-1.later
+new file mode 100644
+--- /dev/null
++++ b/dom/media/platforms/agnostic/VPXDecoder.h.1525393-1.later
+@@ -0,0 +1,22 @@
++--- VPXDecoder.h
+++++ VPXDecoder.h
++// Fix when Bug 1525393 part 2 goes in.
++
++@@ -5,17 +5,16 @@
++  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++ #if !defined(VPXDecoder_h_)
++ #  define VPXDecoder_h_
++ 
++ #  include "PlatformDecoderModule.h"
++ #  include "mozilla/Span.h"
++ 
++ #  include <stdint.h>
++-#  define VPX_DONT_DEFINE_STDINT_TYPES
++ #  include "mozilla/gfx/Types.h"
++ #  include "vpx/vp8dx.h"
++ #  include "vpx/vpx_codec.h"
++ #  include "vpx/vpx_decoder.h"
++ 
++ namespace mozilla {
++ 
++ DDLoggedTypeDeclNameAndBase(VPXDecoder, MediaDataDecoder);
+diff --git a/media/libvpx/aarch64-windows.patch b/media/libvpx/aarch64-windows.patch
+deleted file mode 100644
+--- a/media/libvpx/aarch64-windows.patch
++++ /dev/null
+@@ -1,12 +0,0 @@
+-diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure
+-index e5a74c6..12bab6c 100755
+---- a/media/libvpx/libvpx/configure
+-+++ b/media/libvpx/libvpx/configure
+-@@ -159,6 +159,7 @@ all_platforms="${all_platforms} x86_64-win64-vs11"
+- all_platforms="${all_platforms} x86_64-win64-vs12"
+- all_platforms="${all_platforms} x86_64-win64-vs14"
+- all_platforms="${all_platforms} x86_64-win64-vs15"
+-+all_platforms="${all_platforms} aarch64-win64-vs12"
+- all_platforms="${all_platforms} generic-gnu"
+- 
+- # all_targets is a list of all targets that can be configured
+diff --git a/media/libvpx/bug1480092.patch b/media/libvpx/bug1480092.patch
+deleted file mode 100644
+--- a/media/libvpx/bug1480092.patch
++++ /dev/null
+@@ -1,22 +0,0 @@
+-diff --git a/media/libvpx/libvpx/vp8/common/postproc.c b/media/libvpx/libvpx/vp8/common/postproc.c
+---- a/media/libvpx/libvpx/vp8/common/postproc.c
+-+++ b/media/libvpx/libvpx/vp8/common/postproc.c
+-@@ -60,17 +60,17 @@ static void vp8_de_mblock(YV12_BUFFER_CO
+- }
+- 
+- void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
+-                  YV12_BUFFER_CONFIG *post, int q, int low_var_thresh,
+-                  int flag) {
+-   double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+-   int ppl = (int)(level + .5);
+- 
+--  const MODE_INFO *mode_info_context = cm->show_frame_mi;
+-+  const MODE_INFO *mode_info_context = cm->mi;
+-   int mbr, mbc;
+- 
+-   /* The pixel thresholds are adjusted according to if or not the macroblock
+-    * is a skipped block.  */
+-   unsigned char *ylimits = cm->pp_limits_buffer;
+-   unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols;
+-   (void)low_var_thresh;
+-   (void)flag;
+diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh
+--- a/media/libvpx/generate_sources_mozbuild.sh
++++ b/media/libvpx/generate_sources_mozbuild.sh
+@@ -12,16 +12,17 @@
+ #
+ # Usage:
+ # $ ./generate_sources_mozbuild.sh
+ 
+ export LC_ALL=C
+ BASE_DIR=$(pwd)
+ LIBVPX_SRC_DIR="libvpx"
+ LIBVPX_CONFIG_DIR="config"
++DISABLE_AVX="--disable-avx512"
+ 
+ # Print license header.
+ # $1 - Output base name
+ function write_license {
+   echo "# This file is generated. Do not edit." >> $1
+   echo "" >> $1
+ }
+ 
+@@ -201,21 +202,22 @@ all_platforms="${all_platforms} --disabl
+ x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm"
+ arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only"
+ arm64_platforms="--enable-realtime-only"
+ 
+ gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files mac/x64 "--target=x86_64-darwin9-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files mac/ia32 "--target=x86-darwin9-gcc ${all_platforms} ${x86_platforms}"
+-gen_config_files win/x64 "--target=x86_64-win64-vs12 ${all_platforms} ${x86_platforms}"
++gen_config_files win/x64 "--target=x86_64-win64-vs15 ${all_platforms} ${x86_platforms}"
+ gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}"
+ 
+ gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}"
+ gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}"
++gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms}"
+ 
+ gen_config_files generic "--target=generic-gnu ${all_platforms}"
+ 
+ echo "Remove temporary directory."
+ cd $BASE_DIR
+ rm -rf $TEMP_DIR
+ 
+ echo "Create temporary directory."
+@@ -225,21 +227,20 @@ cp -R $LIBVPX_SRC_DIR $TEMP_DIR
+ cd $TEMP_DIR
+ 
+ gen_rtcd_header linux/x64 x86_64
+ gen_rtcd_header linux/ia32 x86
+ gen_rtcd_header mac/x64 x86_64
+ gen_rtcd_header mac/ia32 x86
+ gen_rtcd_header win/x64 x86_64
+ gen_rtcd_header win/ia32 x86
+-gen_rtcd_header win/aarch64 aarch64
+-
+ 
+ gen_rtcd_header linux/arm armv7
+ gen_rtcd_header linux/arm64 arm64
++gen_rtcd_header win/aarch64 arm64
+ 
+ gen_rtcd_header generic generic
+ 
+ echo "Prepare Makefile."
+ ./configure --target=generic-gnu > /dev/null
+ make_clean
+ 
+ # Remove existing source files.
+diff --git a/media/libvpx/input_frame_validation.patch b/media/libvpx/input_frame_validation.patch
+--- a/media/libvpx/input_frame_validation.patch
++++ b/media/libvpx/input_frame_validation.patch
+@@ -5,19 +5,19 @@ Bug 1263384: validate input frames again
+ 
+ MozReview-Commit-ID: BxDCnJe0mzs
+ 
+ diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+ --- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+ +++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+ @@ -855,20 +855,29 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+      dst_time_stamp =
+-         pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+-     dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
+-                          ctx->cfg.g_timebase.den;
++         pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
++     dst_end_time_stamp = (pts_val + (int64_t)duration) *
++                          ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+ 
+      if (img != NULL) {
+        res = image2yuvconfig(img, &sd);
+  
+ -      if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
+ -                                dst_time_stamp, dst_end_time_stamp)) {
+ -        VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+ -        res = update_error_state(ctx, &cpi->common.error);
+diff --git a/media/libvpx/rename_duplicate_files.patch b/media/libvpx/rename_duplicate_files.patch
+--- a/media/libvpx/rename_duplicate_files.patch
++++ b/media/libvpx/rename_duplicate_files.patch
+@@ -1,23 +1,22 @@
+-diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
+-index 84b529136ba9..7f3111320dc9 100644
+---- a/libvpx/vpx_dsp/vpx_dsp.mk
+-+++ b/libvpx/vpx_dsp/vpx_dsp.mk
+-@@ -133,17 +133,17 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_dspr2.c
+- DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_horiz_dspr2.c
+- DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
++diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
++--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
++@@ -160,17 +160,17 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve
+  DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
+  DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
+-
++ 
++ DSP_SRCS-$(HAVE_VSX)  += ppc/vpx_convolve_vsx.c
++ 
+  # loop filters
+  DSP_SRCS-yes += loopfilter.c
+-
+--DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+-+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_intrin_sse2.c
+- DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+-
++ 
++-DSP_SRCS-$(HAVE_SSE2)  += x86/loopfilter_sse2.c
+++DSP_SRCS-$(HAVE_SSE2)  += x86/loopfilter_intrin_sse2.c
++ DSP_SRCS-$(HAVE_AVX2)  += x86/loopfilter_avx2.c
++ 
+  ifeq ($(HAVE_NEON_ASM),yes)
+  DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+  DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+  DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+  else
+  DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+diff --git a/media/libvpx/stdint.patch b/media/libvpx/stdint.patch
+deleted file mode 100644
+--- a/media/libvpx/stdint.patch
++++ /dev/null
+@@ -1,41 +0,0 @@
+-diff --git a/media/libvpx/libvpx/vpx/vpx_integer.h b/media/libvpx/libvpx/vpx/vpx_integer.h
+---- a/media/libvpx/libvpx/vpx/vpx_integer.h
+-+++ b/media/libvpx/libvpx/vpx/vpx_integer.h
+-@@ -18,16 +18,18 @@
+- #define VPX_FORCE_INLINE __forceinline
+- #define VPX_INLINE __inline
+- #else
+- #define VPX_FORCE_INLINE __inline__ __attribute__(always_inline)
+- // TODO(jbb): Allow a way to force inline off for older compilers.
+- #define VPX_INLINE inline
+- #endif
+- 
+-+#if !defined(VPX_DONT_DEFINE_STDINT_TYPES)
+-+
+- #if defined(VPX_EMULATE_INTTYPES)
+- typedef signed char int8_t;
+- typedef signed short int16_t;
+- typedef signed int int32_t;
+-
+- typedef unsigned char uint8_t;
+- typedef unsigned short uint16_t;
+- typedef unsigned int uint32_t;
+-@@ -48,16 +50,18 @@ typedef size_t uintptr_t;
+- #define __STDC_LIMIT_MACROS
+- #endif
+- #endif  // __cplusplus
+-
+- #include <stdint.h>
+-
+- #endif
+-
+-+#endif // VPX_DONT_DEFINE_STDINT_TYPES
+-+
+- /* VS2010 defines stdint.h, but not inttypes.h */
+- #if defined(_MSC_VER) && _MSC_VER < 1800
+- #define PRId64 "I64d"
+- #else
+- #include <inttypes.h>
+- #endif
+-
+- #endif  // VPX_VPX_INTEGER_H_
+diff --git a/media/libvpx/update.py b/media/libvpx/update.py
+--- a/media/libvpx/update.py
++++ b/media/libvpx/update.py
+@@ -4,53 +4,48 @@
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ import argparse
+ import os
+ import re
+ import shutil
+ import sys
+ import subprocess
+ import tarfile
+-import urllib
++import urllib.request
+ from pprint import pprint
+-from StringIO import StringIO
++from io import StringIO
+ 
+ def prepare_upstream(prefix, commit=None):
+     upstream_url = 'https://chromium.googlesource.com/webm/libvpx'
+     shutil.rmtree(os.path.join(base, 'libvpx/'))
+     print(upstream_url + '/+archive/' + commit + '.tar.gz')
+-    urllib.urlretrieve(upstream_url + '/+archive/' + commit + '.tar.gz', 'libvpx.tar.gz')
++    urllib.request.urlretrieve(upstream_url + '/+archive/' + commit + '.tar.gz', 'libvpx.tar.gz')
+     tarfile.open('libvpx.tar.gz').extractall(path='libvpx')
+     os.remove(os.path.join(base, 'libvpx.tar.gz'))
+     os.chdir(base)
+     return commit
+ 
+ def cleanup_upstream():
+-    os.remove(os.path.join(base, 'libvpx/.gitattributes'))
+-    os.remove(os.path.join(base, 'libvpx/.gitignore'))
+-    os.remove(os.path.join(base, 'libvpx/build/.gitattributes'))
+-    os.remove(os.path.join(base, 'libvpx/build/.gitignore'))
++    os.remove(os.path.join(base, 'libvpx', '.gitattributes'))
++    os.remove(os.path.join(base, 'libvpx', '.gitignore'))
++    shutil.rmtree(os.path.join(base, 'libvpx', 'third_party', 'libwebm'))
++    shutil.rmtree(os.path.join(base, 'libvpx', 'tools'))
+ 
+ def apply_patches():
+-    # Patch to permit vpx users to specify their own <stdint.h> types.
+-    os.system("patch -p3 < stdint.patch")
+     # Patch to fix a crash caused by MSVC 2013
+     os.system("patch -p3 < bug1137614.patch")
+     # Bug 1263384 - Check input frame resolution
+     os.system("patch -p3 < input_frame_validation.patch")
+     # Bug 1315288 - Check input frame resolution for vp9
+     os.system("patch -p3 < input_frame_validation_vp9.patch")
+     # Avoid c/asm name collision for loopfilter_sse2
+-    os.system("patch -p1 < rename_duplicate_files.patch")
++    os.system("patch -p3 < rename_duplicate_files.patch")
+     os.system("mv libvpx/vpx_dsp/x86/loopfilter_sse2.c libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c")
+-    # Cherrypick fix from upstream
+-    os.system("patch -p3 < bug1480092.patch")
+-    # AArch64 Windows support
+-    os.system("patch -p3 < aarch64-windows.patch")
+-
++    # Ensure float_control_word.asm is included
++    os.system("patch -p3 < win64_build_fix.patch")
+ 
+ def update_readme(commit):
+     with open('README_MOZILLA') as f:
+         readme = f.read()
+ 
+     if 'The git commit ID used was' in readme:
+         new_readme = re.sub('The git commit ID used was [v\.a-f0-9]+',
+             'The git commit ID used was %s' % commit, readme)
+diff --git a/media/libvpx/win64_build_fix.patch b/media/libvpx/win64_build_fix.patch
+new file mode 100644
+--- /dev/null
++++ b/media/libvpx/win64_build_fix.patch
+@@ -0,0 +1,22 @@
++diff --git a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk
++--- a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk
+++++ b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk
++@@ -21,17 +21,17 @@ ifeq ($(VPX_ARCH_X86),yes)
++ PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c
++ endif
++ ifeq ($(VPX_ARCH_X86_64),yes)
++ # Visual Studio x64 does not support the _mm_empty() intrinsic.
++ PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm
++ endif
++ 
++ ifeq ($(VPX_ARCH_X86_64),yes)
++-PORTS_SRCS-$(CONFIG_MSVS) += float_control_word.asm
+++PORTS_SRCS-yes += float_control_word.asm
++ endif
++ 
++ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
++ PORTS_SRCS-yes += x86.h
++ PORTS_SRCS-yes += x86_abi_support.asm
++ endif
++ 
++ PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c
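
Note: the update.py portion of the patch above is a routine Python 2 to Python 3 port: urllib.urlretrieve becomes urllib.request.urlretrieve, StringIO moves to the io module, and path handling switches to os.path.join with separate components. A minimal standalone sketch of that download-and-extract pattern, using a hypothetical fetch_tarball helper rather than the real update.py code:

    import os
    import tarfile
    import urllib.request

    def fetch_tarball(url, dest_dir):
        # Python 3: urllib.request replaces the old Python 2 urllib call.
        archive = os.path.join(dest_dir, 'libvpx.tar.gz')
        urllib.request.urlretrieve(url, archive)
        # Unpack next to the archive, then drop the tarball,
        # much like prepare_upstream() does in the patched update.py.
        tarfile.open(archive).extractall(path=os.path.join(dest_dir, 'libvpx'))
        os.remove(archive)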

+ 105 - 0
mozilla-release/patches/1540760-1-68a1.patch

@@ -0,0 +1,105 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1556723088 0
+# Node ID f1bc4fcd152e66e858c3a1d0b0afd30a78e9474b
+# Parent  3ca39e2837e77b27b1058be69bcd6129a3dfcff9
+Bug 1540760 - Make it possible to use clang-cl as an assembler; r=firefox-build-system-reviewers,mshal
+
+Some media libraries use gas syntax in their assembly files. Rather than
+converting these arm assembly syntax files for aarch64, we can use clang-cl
+to build them directly.
+
+Differential Revision: https://phabricator.services.mozilla.com/D27785
+
+diff --git a/build/moz.configure/toolchain.configure b/build/moz.configure/toolchain.configure
+--- a/build/moz.configure/toolchain.configure
++++ b/build/moz.configure/toolchain.configure
+@@ -2217,16 +2217,30 @@ def have_yasm(yasm_asflags):
+         return True
+ 
+ set_config('HAVE_NASM', have_nasm)
+ 
+ set_config('HAVE_YASM', have_yasm)
+ # Until the YASM variable is not necessary in old-configure.
+ add_old_configure_assignment('YASM', have_yasm)
+ 
++
++# clang-cl integrated assembler support
++# ==============================================================
++@depends(target)
++def clangcl_asflags(target):
++    asflags = None
++    if target.os == 'WINNT' and target.cpu == 'aarch64':
++        asflags = ['--target=aarch64-windows-msvc']
++    return asflags
++
++
++set_config('CLANGCL_ASFLAGS', clangcl_asflags)
++
++
+ # Code Coverage
+ # ==============================================================
+ 
+ js_option('--enable-coverage', env='MOZ_CODE_COVERAGE',
+           help='Enable code coverage')
+ 
+ @depends('--enable-coverage')
+ def code_coverage(value):
+diff --git a/python/mozbuild/mozbuild/frontend/context.py b/python/mozbuild/mozbuild/frontend/context.py
+--- a/python/mozbuild/mozbuild/frontend/context.py
++++ b/python/mozbuild/mozbuild/frontend/context.py
+@@ -2274,16 +2274,24 @@ VARIABLES = {
+ 
+         By default, the build will use the toolchain assembler, $(AS), to
+         assemble source files in assembly language (.s or .asm files). Setting
+         this value to ``True`` will cause it to use yasm instead.
+ 
+         If yasm is not available on this system, or does not support the
+         current target architecture, an error will be raised.
+         """),
++
++    'USE_INTEGRATED_CLANGCL_AS': (bool, bool,
++        """Use the integrated clang-cl assembler to assemble assembly files from SOURCES.
++
++        This allows using clang-cl to assemble assembly files which is useful
++        on platforms like aarch64 where the alternative is to have to run a
++        pre-processor to generate files with suitable syntax.
++        """),
+ }
+ 
+ # Sanity check: we don't want any variable above to have a list as storage type.
+ for name, (storage_type, input_types, docs) in VARIABLES.items():
+     if storage_type == list:
+         raise RuntimeError('%s has a "list" storage type. Use "List" instead.'
+                            % name)
+ 
+diff --git a/python/mozbuild/mozbuild/frontend/emitter.py b/python/mozbuild/mozbuild/frontend/emitter.py
+--- a/python/mozbuild/mozbuild/frontend/emitter.py
++++ b/python/mozbuild/mozbuild/frontend/emitter.py
+@@ -1327,16 +1327,26 @@ class TreeMetadataEmitter(LoggingMixin):
+             if not nasm:
+                 raise SandboxValidationError('nasm is not available', context)
+             passthru.variables['AS'] = nasm
+             passthru.variables['AS_DASH_C_FLAG'] = ''
+             passthru.variables['ASOUTOPTION'] = '-o '
+             computed_as_flags.resolve_flags('OS',
+                                             context.config.substs.get('NASM_ASFLAGS', []))
+ 
++        if context.get('USE_INTEGRATED_CLANGCL_AS') is True:
++            clangcl = context.config.substs.get('CLANG_CL')
++            if not clangcl:
++                raise SandboxValidationError('clang-cl is not available', context)
++            passthru.variables['AS'] = 'clang-cl'
++            passthru.variables['AS_DASH_C_FLAG'] = '-c'
++            passthru.variables['ASOUTOPTION'] = '-o '
++            computed_as_flags.resolve_flags('OS',
++                                            context.config.substs.get('CLANGCL_ASFLAGS', []))
++
+         if passthru.variables:
+             yield passthru
+ 
+         if context.objdir in self._compile_dirs:
+             self._compile_flags[context.objdir] = computed_flags
+             yield computed_link_flags
+ 
+         if context.objdir in self._asm_compile_dirs:
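
Note: USE_INTEGRATED_CLANGCL_AS as defined above is a per-directory moz.build flag; when it is set, the emitter swaps the assembler for clang-cl and applies CLANGCL_ASFLAGS. A hedged sketch of what a consuming moz.build might look like (the file name and condition here are illustrative, not taken from this patch):

    # moz.build (illustrative): assemble gas-syntax aarch64 sources with the
    # clang-cl integrated assembler instead of pre-converting them for armasm64.
    SOURCES += [
        'fft_neon.S',
    ]

    if CONFIG['CPU_ARCH'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT':
        USE_INTEGRATED_CLANGCL_AS = True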

+ 74 - 0
mozilla-release/patches/1540760-2-68a1.patch

@@ -0,0 +1,74 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1556723089 0
+# Node ID cd666f0befca97073d92bf1c46f39ead6eff3646
+# Parent  f1bc4fcd152e66e858c3a1d0b0afd30a78e9474b
+Bug 1540760 - Enable neon for libyuv for aarch64; r=jya
+
+Differential Revision: https://phabricator.services.mozilla.com/D27786
+
+diff --git a/media/libyuv/aarch64-windows-noneon.patch b/media/libyuv/aarch64-windows-noneon.patch
+deleted file mode 100644
+--- a/media/libyuv/aarch64-windows-noneon.patch
++++ /dev/null
+@@ -1,14 +0,0 @@
+-diff --git a/media/libyuv/libyuv/libyuv.gyp b/media/libyuv/libyuv/libyuv.gyp
+-index 776510b..51ab531 100644
+---- a/media/libyuv/libyuv/libyuv.gyp
+-+++ b/media/libyuv/libyuv/libyuv.gyp
+-@@ -33,7 +33,8 @@
+-     'build_msa': 0,
+-     'conditions': [
+-        ['(target_arch == "armv7" or target_arch == "armv7s" or \
+--       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+-+       (target_arch == "arm" and arm_version >= 7) or \
+-+       (OS != "win" and target_arch == "arm64")) \
+-        and (arm_neon == 1 or arm_neon_optional == 1)', {
+-          'build_neon': 1,
+-        }],
+diff --git a/media/libyuv/libyuv/libyuv.gyp b/media/libyuv/libyuv/libyuv.gyp
+--- a/media/libyuv/libyuv/libyuv.gyp
++++ b/media/libyuv/libyuv/libyuv.gyp
+@@ -28,18 +28,17 @@
+     'use_lto%': 0,
+     'yuv_disable_asm%': 0,
+     'yuv_disable_avx2%': 0,
+     'mips_msa%': 0,  # Default to msa off.
+     'build_neon': 0,
+     'build_msa': 0,
+     'conditions': [
+        ['(target_arch == "armv7" or target_arch == "armv7s" or \
+-       (target_arch == "arm" and arm_version >= 7) or \
+-       (OS != "win" and target_arch == "arm64")) \
++       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+        and (arm_neon == 1 or arm_neon_optional == 1)', {
+          'build_neon': 1,
+        }],
+        ['(target_arch == "mipsel" or target_arch == "mips64el")\
+        and (mips_msa == 1)',
+        {
+          'build_msa': 1,
+        }],
+diff --git a/media/libyuv/update.py b/media/libyuv/update.py
+--- a/media/libyuv/update.py
++++ b/media/libyuv/update.py
+@@ -47,18 +47,16 @@ def apply_patches(base):
+         # fix build errors
+         'fix_build_errors.patch',
+         # make mjpeg printfs optional at build time
+         'make_mjpeg_printfs_optional.patch',
+         # allow disabling of inline ASM and AVX2 code
+         'allow_disabling_asm_avx2.patch',
+         # add H444ToARGB() variant
+         'add_H444ToARGB.patch',
+-        # avoid selecting neon codepaths on AArch64 Windows
+-        'aarch64-windows-noneon.patch',
+         # fix the x86 mingw-clang build
+         'bug_1491848.patch',
+     ]
+ 
+     for patch in patches:
+         print('\nApplying patch %s' % patch)
+         with open(os.path.join(base, patch)) as f:
+             Popen(["patch", "-p3"], stdin=f, cwd=base).wait()
+

+ 57 - 0
mozilla-release/patches/1540760-3-68a1.patch

@@ -0,0 +1,57 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1556723089 0
+# Node ID a338bdcb894fbfbc4412e9f8efefc54667cd32a6
+# Parent  cd666f0befca97073d92bf1c46f39ead6eff3646
+Bug 1540760 - Use arm sources for libvpx; r=jya
+
+Differential Revision: https://phabricator.services.mozilla.com/D27787
+
+diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh
+--- a/media/libvpx/generate_sources_mozbuild.sh
++++ b/media/libvpx/generate_sources_mozbuild.sh
+@@ -204,17 +204,17 @@ arm64_platforms="--enable-realtime-only"
+ 
+ gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files mac/x64 "--target=x86_64-darwin9-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files mac/ia32 "--target=x86-darwin9-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files win/x64 "--target=x86_64-win64-vs12 ${all_platforms} ${x86_platforms}"
+ gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}"
+ gen_config_files win/mingw32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}"
+-gen_config_files win/aarch64 "--target=aarch64-win64-vs12 ${all_platforms}"
++gen_config_files win/aarch64 "--target=aarch64-win64-vs12 ${all_platforms} ${arm64_platforms}"
+ 
+ gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}"
+ gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}"
+ 
+ gen_config_files generic "--target=generic-gnu ${all_platforms}"
+ 
+ # vpx doesn't know if mingw32 has winpthreads or not, and doesn't try to detect it.
+ sed -i 's/HAVE_PTHREAD_H 0/HAVE_PTHREAD_H 1/' $BASE_DIR/$LIBVPX_CONFIG_DIR/win/mingw32/vpx_config.asm
+diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
+--- a/media/libvpx/moz.build
++++ b/media/libvpx/moz.build
+@@ -71,19 +71,18 @@ elif CONFIG['CPU_ARCH'] == 'arm':
+         LOCAL_INCLUDES += [
+             '%%%s/sources/android/cpufeatures' % CONFIG['ANDROID_NDK'],
+         ]
+     if CONFIG['CC_TYPE'] == 'clang':
+         ASFLAGS += [
+             '-no-integrated-as',
+         ]
+ elif CONFIG['CPU_ARCH'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT':
+-    # Generic C-only configuration
+-    EXPORTS.vpx += files['GENERIC_EXPORTS']
+-    SOURCES += files['GENERIC_SOURCES']
++    EXPORTS.vpx += files['ARM64_EXPORTS']
++    SOURCES += files['ARM64_SOURCES']
+     ASFLAGS += [ '-I%s/media/libvpx/config/win/aarch64/' % TOPSRCDIR ]
+     LOCAL_INCLUDES += [ '/media/libvpx/config/win/aarch64/' ]
+ elif CONFIG['CPU_ARCH'] == 'aarch64':
+     EXPORTS.vpx += files['ARM64_EXPORTS']
+     SOURCES += files['ARM64_SOURCES']
+     ASFLAGS += [ '-I%s/media/libvpx/config/linux/arm64/' % TOPSRCDIR ]
+     LOCAL_INCLUDES += [ '/media/libvpx/config/linux/arm64/' ]
+ else:
+

+ 115 - 0
mozilla-release/patches/1540760-4-68a1.patch

@@ -0,0 +1,115 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1556723089 0
+# Node ID f40ae51578ac27c6ea38af1e2818a12ac0b93dbd
+# Parent  a338bdcb894fbfbc4412e9f8efefc54667cd32a6
+Bug 1540760 - Rerun generate_sources_mozbuild.sh for arm64 windows; r=jya
+
+Differential Revision: https://phabricator.services.mozilla.com/D27788
+
+diff --git a/media/libvpx/config/win/aarch64/vp8_rtcd.h b/media/libvpx/config/win/aarch64/vp8_rtcd.h
+--- a/media/libvpx/config/win/aarch64/vp8_rtcd.h
++++ b/media/libvpx/config/win/aarch64/vp8_rtcd.h
+@@ -142,19 +142,16 @@ void vp8_sixtap_predict4x4_c(unsigned ch
+ #define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+ 
+ void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+ #define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+ 
+ void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+ #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+ 
+-void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+-#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
+-
+ void vp8_rtcd(void);
+ 
+ #include "vpx_config.h"
+ 
+ #ifdef RTCD_C
+ static void setup_rtcd_internal(void)
+ {
+ }
+diff --git a/media/libvpx/config/win/aarch64/vp9_rtcd.h b/media/libvpx/config/win/aarch64/vp9_rtcd.h
+--- a/media/libvpx/config/win/aarch64/vp9_rtcd.h
++++ b/media/libvpx/config/win/aarch64/vp9_rtcd.h
+@@ -68,19 +68,16 @@ void vp9_quantize_fp_c(const tran_low_t 
+ #define vp9_quantize_fp vp9_quantize_fp_c
+ 
+ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+ #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+ 
+ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+ #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+ 
+-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+-
+ void vp9_rtcd(void);
+ 
+ #include "vpx_config.h"
+ 
+ #ifdef RTCD_C
+ static void setup_rtcd_internal(void)
+ {
+ }
+diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm
+--- a/media/libvpx/config/win/aarch64/vpx_config.asm
++++ b/media/libvpx/config/win/aarch64/vpx_config.asm
+@@ -53,17 +53,17 @@
+ .equ CONFIG_VP9_ENCODER ,  1
+ .equ CONFIG_VP9_DECODER ,  1
+ .equ CONFIG_VP8 ,  1
+ .equ CONFIG_VP9 ,  1
+ .equ CONFIG_ENCODERS ,  1
+ .equ CONFIG_DECODERS ,  1
+ .equ CONFIG_STATIC_MSVCRT ,  0
+ .equ CONFIG_SPATIAL_RESAMPLING ,  1
+-.equ CONFIG_REALTIME_ONLY ,  0
++.equ CONFIG_REALTIME_ONLY ,  1
+ .equ CONFIG_ONTHEFLY_BITPACKING ,  0
+ .equ CONFIG_ERROR_CONCEALMENT ,  0
+ .equ CONFIG_SHARED ,  0
+ .equ CONFIG_STATIC ,  1
+ .equ CONFIG_SMALL ,  0
+ .equ CONFIG_POSTPROC_VISUALIZER ,  0
+ .equ CONFIG_OS_SUPPORT ,  1
+ .equ CONFIG_UNIT_TESTS ,  0
+diff --git a/media/libvpx/config/win/aarch64/vpx_config.c b/media/libvpx/config/win/aarch64/vpx_config.c
+--- a/media/libvpx/config/win/aarch64/vpx_config.c
++++ b/media/libvpx/config/win/aarch64/vpx_config.c
+@@ -1,10 +1,10 @@
+ /* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+ /*  */
+ /* Use of this source code is governed by a BSD-style license */
+ /* that can be found in the LICENSE file in the root of the source */
+ /* tree. An additional intellectual property rights grant can be found */
+ /* in the file PATENTS.  All contributing project authors may */
+ /* be found in the AUTHORS file in the root of the source tree. */
+ #include "vpx/vpx_codec.h"
+-static const char* const cfg = "--target=aarch64-win64-vs12 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512";
++static const char* const cfg = "--target=aarch64-win64-vs12 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only";
+ const char *vpx_codec_build_config(void) {return cfg;}
+diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h
+--- a/media/libvpx/config/win/aarch64/vpx_config.h
++++ b/media/libvpx/config/win/aarch64/vpx_config.h
+@@ -62,17 +62,17 @@
+ #define CONFIG_VP9_ENCODER 1
+ #define CONFIG_VP9_DECODER 1
+ #define CONFIG_VP8 1
+ #define CONFIG_VP9 1
+ #define CONFIG_ENCODERS 1
+ #define CONFIG_DECODERS 1
+ #define CONFIG_STATIC_MSVCRT 0
+ #define CONFIG_SPATIAL_RESAMPLING 1
+-#define CONFIG_REALTIME_ONLY 0
++#define CONFIG_REALTIME_ONLY 1
+ #define CONFIG_ONTHEFLY_BITPACKING 0
+ #define CONFIG_ERROR_CONCEALMENT 0
+ #define CONFIG_SHARED 0
+ #define CONFIG_STATIC 1
+ #define CONFIG_SMALL 0
+ #define CONFIG_POSTPROC_VISUALIZER 0
+ #define CONFIG_OS_SUPPORT 1
+ #define CONFIG_UNIT_TESTS 0
+

+ 12454 - 0
mozilla-release/patches/1540760-5-68a1.patch

@@ -0,0 +1,12454 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1556751985 0
+# Node ID 742b7c0a4bdbbe5f4004b038b4b5b4467ef4484b
+# Parent  f40ae51578ac27c6ea38af1e2818a12ac0b93dbd
+Bug 1540760 - Add missing aarch64 files for ffvpx; r=jya
+
+Differential Revision: https://phabricator.services.mozilla.com/D27789
+
+diff --git a/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
+@@ -0,0 +1,50 @@
++/*
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/aarch64/cpu.h"
++
++#include "libavcodec/fft.h"
++
++void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
++void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
++
++void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
++void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
++void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
++
++av_cold void ff_fft_init_aarch64(FFTContext *s)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++        s->fft_permute  = ff_fft_permute_neon;
++        s->fft_calc     = ff_fft_calc_neon;
++#if CONFIG_MDCT
++        s->imdct_calc   = ff_imdct_calc_neon;
++        s->imdct_half   = ff_imdct_half_neon;
++        s->mdct_calc    = ff_mdct_calc_neon;
++        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
++#endif
++    }
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/fft_neon.S b/media/ffvpx/libavcodec/aarch64/fft_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/fft_neon.S
+@@ -0,0 +1,442 @@
++/*
++ * ARM NEON optimised FFT
++ *
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2009 Naotoshi Nojiri
++ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This algorithm (though not any of the implementation details) is
++ * based on libdjbfft by D. J. Bernstein.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++#define M_SQRT1_2 0.70710678118654752440
++
++.macro transpose d0, d1, s0, s1
++        trn1            \d0, \s0, \s1
++        trn2            \d1, \s0, \s1
++.endm
++
++
++function fft4_neon
++        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
++
++        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
++        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1
++
++        ext             v16.8b, v2.8b,  v3.8b,  #4
++        ext             v17.8b, v3.8b,  v2.8b,  #4
++
++        fadd            v5.2s,  v2.2s,  v3.2s   // i2+i3,r2+r3
++        fsub            v7.2s,  v16.2s, v17.2s  // r3-r2,i2-i3
++
++        fadd            v0.2s,  v4.2s,  v5.2s
++        fsub            v2.2s,  v4.2s,  v5.2s
++        fadd            v1.2s,  v6.2s,  v7.2s
++        fsub            v3.2s,  v6.2s,  v7.2s
++
++        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
++
++        ret
++endfunc
++
++function fft8_neon
++        mov             x1,  x0
++        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
++        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
++        ext             v22.8b, v2.8b,  v3.8b,  #4
++        ext             v23.8b, v3.8b,  v2.8b,  #4
++        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
++        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
++        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
++        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
++        rev64           v27.2s, v28.2s  // ???
++        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
++        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
++        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
++        ext             v6.8b,  v4.8b,  v5.8b,  #4
++        ext             v7.8b,  v5.8b,  v4.8b,  #4
++        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
++        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
++        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
++        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
++        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
++        fadd            v0.2s,  v20.2s, v21.2s
++        fsub            v2.2s,  v20.2s, v21.2s
++        fadd            v1.2s,  v22.2s, v23.2s
++        rev64           v26.2s, v26.2s
++        rev64           v27.2s, v27.2s
++        fsub            v3.2s,  v22.2s, v23.2s
++        fsub            v6.2s,  v6.2s,  v7.2s
++        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
++        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
++        fadd            v7.2s,  v4.2s,  v5.2s
++        fsub            v18.2s, v2.2s,  v6.2s
++        ext             v26.8b, v24.8b, v25.8b, #4
++        ext             v27.8b, v25.8b, v24.8b, #4
++        fadd            v2.2s,  v2.2s,  v6.2s
++        fsub            v16.2s, v0.2s,  v7.2s
++        fadd            v5.2s,  v25.2s, v24.2s
++        fsub            v4.2s,  v26.2s, v27.2s
++        fadd            v0.2s,  v0.2s,  v7.2s
++        fsub            v17.2s, v1.2s,  v5.2s
++        fsub            v19.2s, v3.2s,  v4.2s
++        fadd            v3.2s,  v3.2s,  v4.2s
++        fadd            v1.2s,  v1.2s,  v5.2s
++
++        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
++        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]
++
++        ret
++endfunc
++
++function fft16_neon
++        mov             x1,  x0
++        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
++        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
++        ext             v22.8b, v2.8b,  v3.8b,  #4
++        ext             v23.8b, v3.8b,  v2.8b,  #4
++        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
++        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
++        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
++        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
++        rev64           v27.2s, v28.2s  // ???
++        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
++        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
++        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
++        ext             v6.8b,  v4.8b,  v5.8b,  #4
++        ext             v7.8b,  v5.8b,  v4.8b,  #4
++        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
++        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
++        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
++        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
++        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
++        fadd            v0.2s,  v20.2s, v21.2s
++        fsub            v2.2s,  v20.2s, v21.2s
++        fadd            v1.2s,  v22.2s, v23.2s
++        rev64           v26.2s, v26.2s
++        rev64           v27.2s, v27.2s
++        fsub            v3.2s,  v22.2s, v23.2s
++        fsub            v6.2s,  v6.2s,  v7.2s
++        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
++        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
++        fadd            v7.2s,  v4.2s,  v5.2s
++        fsub            v18.2s, v2.2s,  v6.2s
++        ld1             {v20.4s,v21.4s}, [x0], #32
++        ld1             {v22.4s,v23.4s}, [x0], #32
++        ext             v26.8b, v24.8b, v25.8b, #4
++        ext             v27.8b, v25.8b, v24.8b, #4
++        fadd            v2.2s,  v2.2s,  v6.2s
++        fsub            v16.2s, v0.2s,  v7.2s
++        fadd            v5.2s,  v25.2s, v24.2s
++        fsub            v4.2s,  v26.2s, v27.2s
++        transpose       v24.2d, v25.2d, v20.2d, v22.2d
++        transpose       v26.2d, v27.2d, v21.2d, v23.2d
++        fadd            v0.2s,  v0.2s,  v7.2s
++        fsub            v17.2s, v1.2s,  v5.2s
++        fsub            v19.2s, v3.2s,  v4.2s
++        fadd            v3.2s,  v3.2s,  v4.2s
++        fadd            v1.2s,  v1.2s,  v5.2s
++        ext             v20.16b, v21.16b, v21.16b,  #4
++        ext             v21.16b, v23.16b, v23.16b,  #4
++
++        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
++        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
++        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
++        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}
++
++        // 2 x fft4
++        transpose       v22.2d, v23.2d, v20.2d, v21.2d
++
++        fadd            v4.4s,  v24.4s, v25.4s
++        fadd            v5.4s,  v26.4s, v27.4s
++        fsub            v6.4s,  v24.4s, v25.4s
++        fsub            v7.4s,  v22.4s, v23.4s
++
++        ld1             {v23.4s},  [x14]
++
++        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
++        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
++        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
++        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}
++
++        //fft_pass_neon_16
++        rev64           v7.4s,  v25.4s
++        fmul            v25.4s, v25.4s, v23.s[1]
++        fmul            v7.4s,  v7.4s,  v29.4s
++        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}
++
++        zip1            v20.4s, v24.4s, v25.4s
++        zip2            v21.4s, v24.4s, v25.4s
++        fneg            v22.4s, v20.4s
++        fadd            v4.4s,  v21.4s, v20.4s
++        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
++        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
++
++        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
++        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
++
++        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
++        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
++        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
++        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}
++
++//second half
++        rev64           v6.4s,  v26.4s
++        fmul            v26.4s, v26.4s, v23.s[2]
++        rev64           v7.4s,  v27.4s
++        fmul            v27.4s, v27.4s, v23.s[3]
++        fmul            v6.4s,  v6.4s,  v29.4s
++        fmul            v7.4s,  v7.4s,  v29.4s
++        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
++        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}
++
++        zip1            v24.4s, v26.4s, v27.4s
++        zip2            v25.4s, v26.4s, v27.4s
++        fneg            v26.4s, v24.4s
++        fadd            v4.4s,  v25.4s, v24.4s
++        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
++        fadd            v5.4s,  v25.4s, v26.4s  // just the first half
++
++        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
++        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
++
++        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
++        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
++        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
++        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}
++
++        st1             {v16.4s,v17.4s}, [x1], #32
++        st1             {v18.4s,v19.4s}, [x1], #32
++        st1             {v20.4s,v21.4s}, [x1], #32
++        st1             {v22.4s,v23.4s}, [x1], #32
++
++        ret
++endfunc
++
++
++const  trans4_float, align=4
++        .byte    0,  1,  2,  3
++        .byte    8,  9, 10, 11
++        .byte    4,  5,  6,  7
++        .byte   12, 13, 14, 15
++endconst
++
++const  trans8_float, align=4
++        .byte   24, 25, 26, 27
++        .byte    0,  1,  2,  3
++        .byte   28, 29, 30, 31
++        .byte    4,  5,  6,  7
++endconst
++
++function fft_pass_neon
++        sub             x6,  x2,  #1            // n - 1, loop counter
++        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
++        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
++        add             x5,  x4,  x5            // wim
++        add             x3,  x1,  x2,  lsl #5   // 4 * n * sizeof FFTComplex
++        add             x2,  x0,  x2,  lsl #5   // &z[o2]
++        add             x3,  x0,  x3            // &z[o3]
++        add             x1,  x0,  x1            // &z[o1]
++        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
++        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
++        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
++        trn2            v25.2d, v20.2d, v22.2d
++        sub             x5,  x5,  #4            // wim--
++        trn1            v24.2d, v20.2d, v22.2d
++        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
++        rev64           v7.4s,  v25.4s
++        fmul            v25.4s, v25.4s, v4.s[1]
++        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
++        fmul            v7.4s,  v7.4s,  v29.4s
++        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
++        prfm            pldl1keep, [x2, #16]
++        prfm            pldl1keep, [x3, #16]
++        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
++        prfm            pldl1keep, [x0, #16]
++        prfm            pldl1keep, [x1, #16]
++
++        zip1            v20.4s, v24.4s, v25.4s
++        zip2            v21.4s, v24.4s, v25.4s
++        fneg            v22.4s, v20.4s
++        fadd            v4.4s,  v21.4s, v20.4s
++        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
++        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
++
++        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
++        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
++
++        fadd            v20.4s, v16.4s, v4.4s
++        fsub            v22.4s, v16.4s, v4.4s
++        fadd            v21.4s, v17.4s, v5.4s
++        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
++        fsub            v23.4s, v17.4s, v5.4s
++
++        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
++        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
++        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
++1:
++        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
++        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
++        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
++        transpose       v26.2d, v27.2d, v20.2d, v22.2d
++        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
++        rev64           v6.4s,  v26.4s
++        fmul            v26.4s, v26.4s, v4.s[0]
++        rev64           v7.4s,  v27.4s
++        fmul            v27.4s, v27.4s, v4.s[1]
++        fmul            v6.4s,  v6.4s,  v29.4s
++        fmul            v7.4s,  v7.4s,  v29.4s
++        ld1             {v16.4s},[x0]           // {z[0],z[1]}
++        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
++        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
++        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}
++
++        subs            x6,  x6,  #1            // n--
++
++        zip1            v20.4s, v26.4s, v27.4s
++        zip2            v21.4s, v26.4s, v27.4s
++        fneg            v22.4s, v20.4s
++        fadd            v4.4s,  v21.4s, v20.4s
++        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
++        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
++
++        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
++        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
++
++        fadd            v20.4s, v16.4s, v4.4s
++        fsub            v22.4s, v16.4s, v4.4s
++        fadd            v21.4s, v17.4s, v5.4s
++        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
++        fsub            v23.4s, v17.4s, v5.4s
++
++        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
++        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
++        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
++        b.ne            1b
++
++        ret
++endfunc
++
++.macro  def_fft n, n2, n4
++function fft\n\()_neon, align=6
++        sub             sp,  sp,  #16
++        stp             x28, x30, [sp]
++        add             x28, x0,  #\n4*2*8
++        bl              fft\n2\()_neon
++        mov             x0,  x28
++        bl              fft\n4\()_neon
++        add             x0,  x28, #\n4*1*8
++        bl              fft\n4\()_neon
++        sub             x0,  x28, #\n4*2*8
++        ldp             x28, x30, [sp], #16
++        movrel          x4,  X(ff_cos_\n)
++        mov             x2,  #\n4>>1
++        b               fft_pass_neon
++endfunc
++.endm
++
++        def_fft    32,    16,     8
++        def_fft    64,    32,    16
++        def_fft   128,    64,    32
++        def_fft   256,   128,    64
++        def_fft   512,   256,   128
++        def_fft  1024,   512,   256
++        def_fft  2048,  1024,   512
++        def_fft  4096,  2048,  1024
++        def_fft  8192,  4096,  2048
++        def_fft 16384,  8192,  4096
++        def_fft 32768, 16384,  8192
++        def_fft 65536, 32768, 16384
++
++function ff_fft_calc_neon, export=1
++        prfm            pldl1keep, [x1]
++        movrel          x10, trans4_float
++        ldr             w2,  [x0]
++        movrel          x11, trans8_float
++        sub             w2,  w2,  #2
++        movrel          x3,  fft_tab_neon
++        ld1             {v30.16b}, [x10]
++        mov             x7,  #-8
++        movrel          x12, pmmp
++        ldr             x3,  [x3, x2, lsl #3]
++        movrel          x13, mppm
++        movrel          x14, X(ff_cos_16)
++        ld1             {v31.16b}, [x11]
++        mov             x0,  x1
++        ld1             {v29.4s},  [x12]         // pmmp
++        ld1             {v28.4s},  [x13]
++        br              x3
++endfunc
++
++function ff_fft_permute_neon, export=1
++        mov             x6,  #1
++        ldr             w2,  [x0]       // nbits
++        ldr             x3,  [x0, #16]  // tmp_buf
++        ldr             x0,  [x0, #8]   // revtab
++        lsl             x6,  x6, x2
++        mov             x2,  x6
++1:
++        ld1             {v0.2s,v1.2s}, [x1], #16
++        ldr             w4,  [x0], #4
++        uxth            w5,  w4
++        lsr             w4,  w4,  #16
++        add             x5,  x3,  x5,  lsl #3
++        add             x4,  x3,  x4,  lsl #3
++        st1             {v0.2s}, [x5]
++        st1             {v1.2s}, [x4]
++        subs            x6,  x6, #2
++        b.gt            1b
++
++        sub             x1,  x1,  x2,  lsl #3
++1:
++        ld1             {v0.4s,v1.4s}, [x3], #32
++        st1             {v0.4s,v1.4s}, [x1], #32
++        subs            x2,  x2,  #4
++        b.gt            1b
++
++        ret
++endfunc
++
++const   fft_tab_neon, relocate=1
++        .quad fft4_neon
++        .quad fft8_neon
++        .quad fft16_neon
++        .quad fft32_neon
++        .quad fft64_neon
++        .quad fft128_neon
++        .quad fft256_neon
++        .quad fft512_neon
++        .quad fft1024_neon
++        .quad fft2048_neon
++        .quad fft4096_neon
++        .quad fft8192_neon
++        .quad fft16384_neon
++        .quad fft32768_neon
++        .quad fft65536_neon
++endconst
++
++const   pmmp, align=4
++        .float          +1.0, -1.0, -1.0, +1.0
++endconst
++
++const   mppm, align=4
++        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
++endconst
+diff --git a/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
+@@ -0,0 +1,59 @@
++/*
++ * ARM NEON optimised H.264 chroma functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/h264chroma.h"
++
++#include "config.h"
++
++void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                 int h, int x, int y);
++void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                 int h, int x, int y);
++void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                 int h, int x, int y);
++
++void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                 int h, int x, int y);
++void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                 int h, int x, int y);
++void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                 int h, int x, int y);
++
++av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
++{
++    const int high_bit_depth = bit_depth > 8;
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags) && !high_bit_depth) {
++        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
++        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
++        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
++
++        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
++        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
++        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
++    }
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
+@@ -0,0 +1,450 @@
++/*
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
++.macro  h264_chroma_mc8 type, codec=h264
++function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
++  .ifc \type,avg
++        mov             x8,  x0
++  .endif
++        prfm            pldl1strm, [x1]
++        prfm            pldl1strm, [x1, x2]
++  .ifc \codec,rv40
++        movrel          x6,  rv40bias
++        lsr             w9,  w5,  #1
++        lsr             w10, w4,  #1
++        lsl             w9,  w9,  #3
++        lsl             w10, w10, #1
++        add             w9,  w9,  w10
++        add             x6,  x6,  w9, UXTW
++        ld1r            {v22.8H}, [x6]
++  .endif
++  .ifc \codec,vc1
++        movi            v22.8H,   #28
++  .endif
++        mul             w7,  w4,  w5
++        lsl             w14, w5,  #3
++        lsl             w13, w4,  #3
++        cmp             w7,  #0
++        sub             w6,  w14, w7
++        sub             w12, w13, w7
++        sub             w4,  w7,  w13
++        sub             w4,  w4,  w14
++        add             w4,  w4,  #64
++        b.eq            2f
++
++        dup             v0.8B,  w4
++        dup             v1.8B,  w12
++        ld1             {v4.8B, v5.8B}, [x1], x2
++        dup             v2.8B,  w6
++        dup             v3.8B,  w7
++        ext             v5.8B,  v4.8B,  v5.8B,  #1
++1:      ld1             {v6.8B, v7.8B}, [x1], x2
++        umull           v16.8H, v4.8B,  v0.8B
++        umlal           v16.8H, v5.8B,  v1.8B
++        ext             v7.8B,  v6.8B,  v7.8B,  #1
++        ld1             {v4.8B, v5.8B}, [x1], x2
++        umlal           v16.8H, v6.8B,  v2.8B
++        prfm            pldl1strm, [x1]
++        ext             v5.8B,  v4.8B,  v5.8B,  #1
++        umlal           v16.8H, v7.8B,  v3.8B
++        umull           v17.8H, v6.8B,  v0.8B
++        subs            w3,  w3,  #2
++        umlal           v17.8H, v7.8B, v1.8B
++        umlal           v17.8H, v4.8B, v2.8B
++        umlal           v17.8H, v5.8B, v3.8B
++        prfm            pldl1strm, [x1, x2]
++  .ifc \codec,h264
++        rshrn           v16.8B, v16.8H, #6
++        rshrn           v17.8B, v17.8H, #6
++  .else
++        add             v16.8H, v16.8H, v22.8H
++        add             v17.8H, v17.8H, v22.8H
++        shrn            v16.8B, v16.8H, #6
++        shrn            v17.8B, v17.8H, #6
++  .endif
++  .ifc \type,avg
++        ld1             {v20.8B}, [x8], x2
++        ld1             {v21.8B}, [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++        urhadd          v17.8B, v17.8B, v21.8B
++  .endif
++        st1             {v16.8B}, [x0], x2
++        st1             {v17.8B}, [x0], x2
++        b.gt            1b
++        ret
++
++2:      adds            w12, w12, w6
++        dup             v0.8B, w4
++        b.eq            5f
++        tst             w6,  w6
++        dup             v1.8B, w12
++        b.eq            4f
++
++        ld1             {v4.8B}, [x1], x2
++3:      ld1             {v6.8B}, [x1], x2
++        umull           v16.8H, v4.8B,  v0.8B
++        umlal           v16.8H, v6.8B,  v1.8B
++        ld1             {v4.8B}, [x1], x2
++        umull           v17.8H, v6.8B,  v0.8B
++        umlal           v17.8H, v4.8B,  v1.8B
++        prfm            pldl1strm, [x1]
++  .ifc \codec,h264
++        rshrn           v16.8B, v16.8H, #6
++        rshrn           v17.8B, v17.8H, #6
++  .else
++        add             v16.8H, v16.8H, v22.8H
++        add             v17.8H, v17.8H, v22.8H
++        shrn            v16.8B, v16.8H, #6
++        shrn            v17.8B, v17.8H, #6
++  .endif
++        prfm            pldl1strm, [x1, x2]
++  .ifc \type,avg
++        ld1             {v20.8B}, [x8], x2
++        ld1             {v21.8B}, [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++        urhadd          v17.8B, v17.8B, v21.8B
++  .endif
++        subs            w3,  w3,  #2
++        st1             {v16.8B}, [x0], x2
++        st1             {v17.8B}, [x0], x2
++        b.gt            3b
++        ret
++
++4:      ld1             {v4.8B, v5.8B}, [x1], x2
++        ld1             {v6.8B, v7.8B}, [x1], x2
++        ext             v5.8B,  v4.8B,  v5.8B,  #1
++        ext             v7.8B,  v6.8B,  v7.8B,  #1
++        prfm            pldl1strm, [x1]
++        subs            w3,  w3,  #2
++        umull           v16.8H, v4.8B, v0.8B
++        umlal           v16.8H, v5.8B, v1.8B
++        umull           v17.8H, v6.8B, v0.8B
++        umlal           v17.8H, v7.8B, v1.8B
++        prfm            pldl1strm, [x1, x2]
++  .ifc \codec,h264
++        rshrn           v16.8B, v16.8H, #6
++        rshrn           v17.8B, v17.8H, #6
++  .else
++        add             v16.8H, v16.8H, v22.8H
++        add             v17.8H, v17.8H, v22.8H
++        shrn            v16.8B, v16.8H, #6
++        shrn            v17.8B, v17.8H, #6
++  .endif
++  .ifc \type,avg
++        ld1             {v20.8B}, [x8], x2
++        ld1             {v21.8B}, [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++        urhadd          v17.8B, v17.8B, v21.8B
++  .endif
++        st1             {v16.8B}, [x0], x2
++        st1             {v17.8B}, [x0], x2
++        b.gt            4b
++        ret
++
++5:      ld1             {v4.8B}, [x1], x2
++        ld1             {v5.8B}, [x1], x2
++        prfm            pldl1strm, [x1]
++        subs            w3,  w3,  #2
++        umull           v16.8H, v4.8B, v0.8B
++        umull           v17.8H, v5.8B, v0.8B
++        prfm            pldl1strm, [x1, x2]
++  .ifc \codec,h264
++        rshrn           v16.8B, v16.8H, #6
++        rshrn           v17.8B, v17.8H, #6
++  .else
++        add             v16.8H, v16.8H, v22.8H
++        add             v17.8H, v17.8H, v22.8H
++        shrn            v16.8B, v16.8H, #6
++        shrn            v17.8B, v17.8H, #6
++  .endif
++  .ifc \type,avg
++        ld1             {v20.8B}, [x8], x2
++        ld1             {v21.8B}, [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++        urhadd          v17.8B, v17.8B, v21.8B
++  .endif
++        st1             {v16.8B}, [x0], x2
++        st1             {v17.8B}, [x0], x2
++        b.gt            5b
++        ret
++endfunc
++.endm
++
++/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
++.macro  h264_chroma_mc4 type, codec=h264
++function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
++  .ifc \type,avg
++        mov             x8,  x0
++  .endif
++        prfm            pldl1strm, [x1]
++        prfm            pldl1strm, [x1, x2]
++  .ifc \codec,rv40
++        movrel          x6,  rv40bias
++        lsr             w9,  w5,  #1
++        lsr             w10, w4,  #1
++        lsl             w9,  w9,  #3
++        lsl             w10, w10, #1
++        add             w9,  w9,  w10
++        add             x6,  x6,  w9, UXTW
++        ld1r            {v22.8H}, [x6]
++  .endif
++  .ifc \codec,vc1
++        movi            v22.8H,   #28
++  .endif
++        mul             w7,  w4,  w5
++        lsl             w14, w5,  #3
++        lsl             w13, w4,  #3
++        cmp             w7,  #0
++        sub             w6,  w14, w7
++        sub             w12, w13, w7
++        sub             w4,  w7,  w13
++        sub             w4,  w4,  w14
++        add             w4,  w4,  #64
++        b.eq            2f
++
++        dup             v24.8B,  w4
++        dup             v25.8B,  w12
++        ld1             {v4.8B}, [x1], x2
++        dup             v26.8B,  w6
++        dup             v27.8B,  w7
++        ext             v5.8B,  v4.8B,  v5.8B, #1
++        trn1            v0.2S,  v24.2S, v25.2S
++        trn1            v2.2S,  v26.2S, v27.2S
++        trn1            v4.2S,  v4.2S,  v5.2S
++1:      ld1             {v6.8B}, [x1], x2
++        ext             v7.8B,  v6.8B,  v7.8B, #1
++        trn1            v6.2S,  v6.2S,  v7.2S
++        umull           v18.8H, v4.8B,  v0.8B
++        umlal           v18.8H, v6.8B,  v2.8B
++        ld1             {v4.8B}, [x1], x2
++        ext             v5.8B,  v4.8B,  v5.8B, #1
++        trn1            v4.2S,  v4.2S,  v5.2S
++        prfm            pldl1strm, [x1]
++        umull           v19.8H, v6.8B,  v0.8B
++        umlal           v19.8H, v4.8B,  v2.8B
++        trn1            v30.2D, v18.2D, v19.2D
++        trn2            v31.2D, v18.2D, v19.2D
++        add             v18.8H, v30.8H, v31.8H
++  .ifc \codec,h264
++        rshrn           v16.8B, v18.8H, #6
++  .else
++        add             v18.8H, v18.8H, v22.8H
++        shrn            v16.8B, v18.8H, #6
++  .endif
++        subs            w3,  w3,  #2
++        prfm            pldl1strm, [x1, x2]
++  .ifc \type,avg
++        ld1             {v20.S}[0], [x8], x2
++        ld1             {v20.S}[1], [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++  .endif
++        st1             {v16.S}[0], [x0], x2
++        st1             {v16.S}[1], [x0], x2
++        b.gt            1b
++        ret
++
++2:      adds            w12, w12, w6
++        dup             v30.8B, w4
++        b.eq            5f
++        tst             w6,  w6
++        dup             v31.8B, w12
++        trn1            v0.2S,  v30.2S, v31.2S
++        trn2            v1.2S,  v30.2S, v31.2S
++        b.eq            4f
++
++        ext             v1.8B,  v0.8B,  v1.8B, #4
++        ld1             {v4.S}[0], [x1], x2
++3:      ld1             {v4.S}[1], [x1], x2
++        umull           v18.8H, v4.8B,  v0.8B
++        ld1             {v4.S}[0], [x1], x2
++        umull           v19.8H, v4.8B,  v1.8B
++        trn1            v30.2D, v18.2D, v19.2D
++        trn2            v31.2D, v18.2D, v19.2D
++        add             v18.8H, v30.8H, v31.8H
++        prfm            pldl1strm, [x1]
++  .ifc \codec,h264
++        rshrn           v16.8B, v18.8H, #6
++  .else
++        add             v18.8H, v18.8H, v22.8H
++        shrn            v16.8B, v18.8H, #6
++  .endif
++  .ifc \type,avg
++        ld1             {v20.S}[0], [x8], x2
++        ld1             {v20.S}[1], [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++  .endif
++        subs            w3,  w3,  #2
++        prfm            pldl1strm, [x1, x2]
++        st1             {v16.S}[0], [x0], x2
++        st1             {v16.S}[1], [x0], x2
++        b.gt            3b
++        ret
++
++4:      ld1             {v4.8B}, [x1], x2
++        ld1             {v6.8B}, [x1], x2
++        ext             v5.8B,  v4.8B,  v5.8B, #1
++        ext             v7.8B,  v6.8B,  v7.8B, #1
++        trn1            v4.2S,  v4.2S,  v5.2S
++        trn1            v6.2S,  v6.2S,  v7.2S
++        umull           v18.8H, v4.8B,  v0.8B
++        umull           v19.8H, v6.8B,  v0.8B
++        subs            w3,  w3,  #2
++        trn1            v30.2D, v18.2D, v19.2D
++        trn2            v31.2D, v18.2D, v19.2D
++        add             v18.8H, v30.8H, v31.8H
++        prfm            pldl1strm, [x1]
++  .ifc \codec,h264
++        rshrn           v16.8B, v18.8H, #6
++  .else
++        add             v18.8H, v18.8H, v22.8H
++        shrn            v16.8B, v18.8H, #6
++  .endif
++  .ifc \type,avg
++        ld1             {v20.S}[0], [x8], x2
++        ld1             {v20.S}[1], [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++  .endif
++        prfm            pldl1strm, [x1]
++        st1             {v16.S}[0], [x0], x2
++        st1             {v16.S}[1], [x0], x2
++        b.gt            4b
++        ret
++
++5:      ld1             {v4.S}[0], [x1], x2
++        ld1             {v4.S}[1], [x1], x2
++        umull           v18.8H, v4.8B,  v30.8B
++        subs            w3,  w3,  #2
++        prfm            pldl1strm, [x1]
++  .ifc \codec,h264
++        rshrn           v16.8B, v18.8H, #6
++  .else
++        add             v18.8H, v18.8H, v22.8H
++        shrn            v16.8B, v18.8H, #6
++  .endif
++  .ifc \type,avg
++        ld1             {v20.S}[0], [x8], x2
++        ld1             {v20.S}[1], [x8], x2
++        urhadd          v16.8B, v16.8B, v20.8B
++  .endif
++        prfm            pldl1strm, [x1]
++        st1             {v16.S}[0], [x0], x2
++        st1             {v16.S}[1], [x0], x2
++        b.gt            5b
++        ret
++endfunc
++.endm
++
++.macro  h264_chroma_mc2 type
++function ff_\type\()_h264_chroma_mc2_neon, export=1
++        prfm            pldl1strm, [x1]
++        prfm            pldl1strm, [x1, x2]
++        orr             w7,  w4,  w5
++        cbz             w7,  2f
++
++        mul             w7,  w4,  w5
++        lsl             w14, w5,  #3
++        lsl             w13, w4,  #3
++        sub             w6,  w14, w7
++        sub             w12, w13, w7
++        sub             w4,  w7,  w13
++        sub             w4,  w4,  w14
++        add             w4,  w4,  #64
++        dup             v0.8B,  w4
++        dup             v2.8B,  w12
++        dup             v1.8B,  w6
++        dup             v3.8B,  w7
++        trn1            v0.4H,  v0.4H,  v2.4H
++        trn1            v1.4H,  v1.4H,  v3.4H
++1:
++        ld1             {v4.S}[0],  [x1], x2
++        ld1             {v4.S}[1],  [x1], x2
++        rev64           v5.2S,  v4.2S
++        ld1             {v5.S}[1],  [x1]
++        ext             v6.8B,  v4.8B,  v5.8B,  #1
++        ext             v7.8B,  v5.8B,  v4.8B,  #1
++        trn1            v4.4H,  v4.4H,  v6.4H
++        trn1            v5.4H,  v5.4H,  v7.4H
++        umull           v16.8H, v4.8B,  v0.8B
++        umlal           v16.8H, v5.8B,  v1.8B
++  .ifc \type,avg
++        ld1             {v18.H}[0], [x0], x2
++        ld1             {v18.H}[2], [x0]
++        sub             x0,  x0,  x2
++  .endif
++        rev64           v17.4S, v16.4S
++        add             v16.8H, v16.8H, v17.8H
++        rshrn           v16.8B, v16.8H, #6
++  .ifc \type,avg
++        urhadd          v16.8B, v16.8B, v18.8B
++  .endif
++        st1             {v16.H}[0], [x0], x2
++        st1             {v16.H}[2], [x0], x2
++        subs            w3,  w3,  #2
++        b.gt            1b
++        ret
++
++2:
++        ld1             {v16.H}[0], [x1], x2
++        ld1             {v16.H}[1], [x1], x2
++  .ifc \type,avg
++        ld1             {v18.H}[0], [x0], x2
++        ld1             {v18.H}[1], [x0]
++        sub             x0,  x0,  x2
++        urhadd          v16.8B, v16.8B, v18.8B
++  .endif
++        st1             {v16.H}[0], [x0], x2
++        st1             {v16.H}[1], [x0], x2
++        subs            w3,  w3,  #2
++        b.gt            2b
++        ret
++endfunc
++.endm
++
++        h264_chroma_mc8 put
++        h264_chroma_mc8 avg
++        h264_chroma_mc4 put
++        h264_chroma_mc4 avg
++        h264_chroma_mc2 put
++        h264_chroma_mc2 avg
++
++#if CONFIG_RV40_DECODER
++const   rv40bias
++        .short           0, 16, 32, 16
++        .short          32, 28, 32, 28
++        .short           0, 32, 16, 32
++        .short          32, 28, 32, 28
++endconst
++
++        h264_chroma_mc8 put, rv40
++        h264_chroma_mc8 avg, rv40
++        h264_chroma_mc4 put, rv40
++        h264_chroma_mc4 avg, rv40
++#endif
++
++#if CONFIG_VC1DSP
++        h264_chroma_mc8 put, vc1
++        h264_chroma_mc8 avg, vc1
++        h264_chroma_mc4 put, vc1
++        h264_chroma_mc4 avg, vc1
++#endif
+diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
+@@ -0,0 +1,102 @@
++/*
++ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/h264dsp.h"
++
++void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
++                                     int beta, int8_t *tc0);
++void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
++                                     int beta, int8_t *tc0);
++void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
++                                       int beta, int8_t *tc0);
++void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
++                                       int beta, int8_t *tc0);
++
++void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
++                                   int log2_den, int weight, int offset);
++void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
++                                  int log2_den, int weight, int offset);
++void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
++                                  int log2_den, int weight, int offset);
++
++void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
++                                     int height, int log2_den, int weightd,
++                                     int weights, int offset);
++void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
++                                    int height, int log2_den, int weightd,
++                                    int weights, int offset);
++void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
++                                    int height, int log2_den, int weightd,
++                                    int weights, int offset);
++
++void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
++void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
++void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
++                             int16_t *block, int stride,
++                             const uint8_t nnzc[6*8]);
++void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
++                                  int16_t *block, int stride,
++                                  const uint8_t nnzc[6*8]);
++void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
++                            int16_t *block, int stride,
++                            const uint8_t nnzc[6*8]);
++
++void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
++void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
++void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
++                             int16_t *block, int stride,
++                             const uint8_t nnzc[6*8]);
++
++av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
++                                     const int chroma_format_idc)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags) && bit_depth == 8) {
++        c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
++        c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
++        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
++        if (chroma_format_idc <= 1)
++        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
++
++        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
++        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
++        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
++
++        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
++        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
++        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
++
++        c->h264_idct_add        = ff_h264_idct_add_neon;
++        c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
++        c->h264_idct_add16      = ff_h264_idct_add16_neon;
++        c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
++        if (chroma_format_idc <= 1)
++            c->h264_idct_add8   = ff_h264_idct_add8_neon;
++        c->h264_idct8_add       = ff_h264_idct8_add_neon;
++        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
++        c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
++    }
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
+@@ -0,0 +1,498 @@
++/*
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++#include "neon.S"
++
++.macro  h264_loop_filter_start
++        cmp             w2,  #0
++        ldr             w6,  [x4]
++        ccmp            w3,  #0, #0, ne
++        mov             v24.S[0], w6
++        and             w6,  w6,  w6,  lsl #16
++        b.eq            1f
++        ands            w6,  w6,  w6,  lsl #8
++        b.ge            2f
++1:
++        ret
++2:
++.endm
++
++.macro  h264_loop_filter_luma
++        dup             v22.16B, w2                     // alpha
++        uxtl            v24.8H,  v24.8B
++        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
++        uxtl            v24.4S,  v24.4H
++        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
++        sli             v24.8H,  v24.8H,  #8
++        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
++        sli             v24.4S,  v24.4S,  #16
++        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
++        dup             v22.16B, w3                     // beta
++        cmlt            v23.16B, v24.16B, #0
++        cmhi            v28.16B, v22.16B, v28.16B       // < beta
++        cmhi            v30.16B, v22.16B, v30.16B       // < beta
++        bic             v21.16B, v21.16B, v23.16B
++        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
++        and             v21.16B, v21.16B, v28.16B
++        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
++        cmhi            v17.16B, v22.16B, v17.16B       // < beta
++        and             v21.16B, v21.16B, v30.16B
++        cmhi            v19.16B, v22.16B, v19.16B       // < beta
++        and             v17.16B, v17.16B, v21.16B
++        and             v19.16B, v19.16B, v21.16B
++        and             v24.16B, v24.16B, v21.16B
++        urhadd          v28.16B, v16.16B,  v0.16B
++        sub             v21.16B, v24.16B, v17.16B
++        uqadd           v23.16B, v18.16B, v24.16B
++        uhadd           v20.16B, v20.16B, v28.16B
++        sub             v21.16B, v21.16B, v19.16B
++        uhadd           v28.16B,  v4.16B, v28.16B
++        umin            v23.16B, v23.16B, v20.16B
++        uqsub           v22.16B, v18.16B, v24.16B
++        uqadd           v4.16B,   v2.16B, v24.16B
++        umax            v23.16B, v23.16B, v22.16B
++        uqsub           v22.16B,  v2.16B, v24.16B
++        umin            v28.16B,  v4.16B, v28.16B
++        uxtl            v4.8H,    v0.8B
++        umax            v28.16B, v28.16B, v22.16B
++        uxtl2           v20.8H,   v0.16B
++        usubw           v4.8H,    v4.8H,  v16.8B
++        usubw2          v20.8H,  v20.8H,  v16.16B
++        shl             v4.8H,    v4.8H,  #2
++        shl             v20.8H,  v20.8H,  #2
++        uaddw           v4.8H,    v4.8H,  v18.8B
++        uaddw2          v20.8H,  v20.8H,  v18.16B
++        usubw           v4.8H,    v4.8H,   v2.8B
++        usubw2          v20.8H,  v20.8H,   v2.16B
++        rshrn           v4.8B,    v4.8H,  #3
++        rshrn2          v4.16B,  v20.8H,  #3
++        bsl             v17.16B, v23.16B, v18.16B
++        bsl             v19.16B, v28.16B,  v2.16B
++        neg             v23.16B, v21.16B
++        uxtl            v28.8H,  v16.8B
++        smin            v4.16B,   v4.16B, v21.16B
++        uxtl2           v21.8H,  v16.16B
++        smax            v4.16B,   v4.16B, v23.16B
++        uxtl            v22.8H,   v0.8B
++        uxtl2           v24.8H,   v0.16B
++        saddw           v28.8H,  v28.8H,  v4.8B
++        saddw2          v21.8H,  v21.8H,  v4.16B
++        ssubw           v22.8H,  v22.8H,  v4.8B
++        ssubw2          v24.8H,  v24.8H,  v4.16B
++        sqxtun          v16.8B,  v28.8H
++        sqxtun2         v16.16B, v21.8H
++        sqxtun          v0.8B,   v22.8H
++        sqxtun2         v0.16B,  v24.8H
++.endm
++
++function ff_h264_v_loop_filter_luma_neon, export=1
++        h264_loop_filter_start
++        sxtw            x1,  w1
++
++        ld1             {v0.16B},  [x0], x1
++        ld1             {v2.16B},  [x0], x1
++        ld1             {v4.16B},  [x0], x1
++        sub             x0,  x0,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #1
++        ld1             {v20.16B},  [x0], x1
++        ld1             {v18.16B},  [x0], x1
++        ld1             {v16.16B},  [x0], x1
++
++        h264_loop_filter_luma
++
++        sub             x0,  x0,  x1, lsl #1
++        st1             {v17.16B},  [x0], x1
++        st1             {v16.16B}, [x0], x1
++        st1             {v0.16B},  [x0], x1
++        st1             {v19.16B}, [x0]
++
++        ret
++endfunc
++
++function ff_h264_h_loop_filter_luma_neon, export=1
++        h264_loop_filter_start
++
++        sub             x0,  x0,  #4
++        ld1             {v6.8B},  [x0], x1
++        ld1             {v20.8B}, [x0], x1
++        ld1             {v18.8B}, [x0], x1
++        ld1             {v16.8B}, [x0], x1
++        ld1             {v0.8B},  [x0], x1
++        ld1             {v2.8B},  [x0], x1
++        ld1             {v4.8B},  [x0], x1
++        ld1             {v26.8B}, [x0], x1
++        ld1             {v6.D}[1],  [x0], x1
++        ld1             {v20.D}[1], [x0], x1
++        ld1             {v18.D}[1], [x0], x1
++        ld1             {v16.D}[1], [x0], x1
++        ld1             {v0.D}[1],  [x0], x1
++        ld1             {v2.D}[1],  [x0], x1
++        ld1             {v4.D}[1],  [x0], x1
++        ld1             {v26.D}[1], [x0], x1
++
++        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
++
++        h264_loop_filter_luma
++
++        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
++
++        sub             x0,  x0,  x1, lsl #4
++        add             x0,  x0,  #2
++        st1             {v17.S}[0],  [x0], x1
++        st1             {v16.S}[0], [x0], x1
++        st1             {v0.S}[0],  [x0], x1
++        st1             {v19.S}[0], [x0], x1
++        st1             {v17.S}[1],  [x0], x1
++        st1             {v16.S}[1], [x0], x1
++        st1             {v0.S}[1],  [x0], x1
++        st1             {v19.S}[1], [x0], x1
++        st1             {v17.S}[2],  [x0], x1
++        st1             {v16.S}[2], [x0], x1
++        st1             {v0.S}[2],  [x0], x1
++        st1             {v19.S}[2], [x0], x1
++        st1             {v17.S}[3],  [x0], x1
++        st1             {v16.S}[3], [x0], x1
++        st1             {v0.S}[3],  [x0], x1
++        st1             {v19.S}[3], [x0], x1
++
++        ret
++endfunc
++
++.macro  h264_loop_filter_chroma
++        dup             v22.8B, w2              // alpha
++        uxtl            v24.8H, v24.8B
++        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
++        uxtl            v4.8H,  v0.8B
++        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
++        usubw           v4.8H,  v4.8H,  v16.8B
++        sli             v24.8H, v24.8H, #8
++        shl             v4.8H,  v4.8H,  #2
++        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
++        uaddw           v4.8H,  v4.8H,  v18.8B
++        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
++        usubw           v4.8H,  v4.8H,  v2.8B
++        dup             v22.8B, w3              // beta
++        rshrn           v4.8B,  v4.8H,  #3
++        cmhi            v28.8B, v22.8B, v28.8B  // < beta
++        cmhi            v30.8B, v22.8B, v30.8B  // < beta
++        smin            v4.8B,  v4.8B,  v24.8B
++        neg             v25.8B, v24.8B
++        and             v26.8B, v26.8B, v28.8B
++        smax            v4.8B,  v4.8B,  v25.8B
++        and             v26.8B, v26.8B, v30.8B
++        uxtl            v22.8H, v0.8B
++        and             v4.8B,  v4.8B,  v26.8B
++        uxtl            v28.8H, v16.8B
++        saddw           v28.8H, v28.8H, v4.8B
++        ssubw           v22.8H, v22.8H, v4.8B
++        sqxtun          v16.8B, v28.8H
++        sqxtun          v0.8B,  v22.8H
++.endm
++
++function ff_h264_v_loop_filter_chroma_neon, export=1
++        h264_loop_filter_start
++
++        sub             x0,  x0,  x1, lsl #1
++        ld1             {v18.8B}, [x0], x1
++        ld1             {v16.8B}, [x0], x1
++        ld1             {v0.8B},  [x0], x1
++        ld1             {v2.8B},  [x0]
++
++        h264_loop_filter_chroma
++
++        sub             x0,  x0,  x1, lsl #1
++        st1             {v16.8B}, [x0], x1
++        st1             {v0.8B},  [x0], x1
++
++        ret
++endfunc
++
++function ff_h264_h_loop_filter_chroma_neon, export=1
++        h264_loop_filter_start
++
++        sub             x0,  x0,  #2
++        ld1             {v18.S}[0], [x0], x1
++        ld1             {v16.S}[0], [x0], x1
++        ld1             {v0.S}[0],  [x0], x1
++        ld1             {v2.S}[0],  [x0], x1
++        ld1             {v18.S}[1], [x0], x1
++        ld1             {v16.S}[1], [x0], x1
++        ld1             {v0.S}[1],  [x0], x1
++        ld1             {v2.S}[1],  [x0], x1
++
++        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
++
++        h264_loop_filter_chroma
++
++        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
++
++        sub             x0,  x0,  x1, lsl #3
++        st1             {v18.S}[0], [x0], x1
++        st1             {v16.S}[0], [x0], x1
++        st1             {v0.S}[0],  [x0], x1
++        st1             {v2.S}[0],  [x0], x1
++        st1             {v18.S}[1], [x0], x1
++        st1             {v16.S}[1], [x0], x1
++        st1             {v0.S}[1],  [x0], x1
++        st1             {v2.S}[1],  [x0], x1
++
++        ret
++endfunc
++
++.macro  biweight_16     macs, macd
++        dup             v0.16B,  w5
++        dup             v1.16B,  w6
++        mov             v4.16B,  v16.16B
++        mov             v6.16B,  v16.16B
++1:      subs            w3,  w3,  #2
++        ld1             {v20.16B}, [x0], x2
++        \macd           v4.8H,   v0.8B,  v20.8B
++        \macd\()2       v6.8H,   v0.16B, v20.16B
++        ld1             {v22.16B}, [x1], x2
++        \macs           v4.8H,   v1.8B,  v22.8B
++        \macs\()2       v6.8H,   v1.16B, v22.16B
++        mov             v24.16B, v16.16B
++        ld1             {v28.16B}, [x0], x2
++        mov             v26.16B, v16.16B
++        \macd           v24.8H,  v0.8B,  v28.8B
++        \macd\()2       v26.8H,  v0.16B, v28.16B
++        ld1             {v30.16B}, [x1], x2
++        \macs           v24.8H,  v1.8B,  v30.8B
++        \macs\()2       v26.8H,  v1.16B, v30.16B
++        sshl            v4.8H,   v4.8H,  v18.8H
++        sshl            v6.8H,   v6.8H,  v18.8H
++        sqxtun          v4.8B,   v4.8H
++        sqxtun2         v4.16B,  v6.8H
++        sshl            v24.8H,  v24.8H, v18.8H
++        sshl            v26.8H,  v26.8H, v18.8H
++        sqxtun          v24.8B,  v24.8H
++        sqxtun2         v24.16B, v26.8H
++        mov             v6.16B,  v16.16B
++        st1             {v4.16B},  [x7], x2
++        mov             v4.16B,  v16.16B
++        st1             {v24.16B}, [x7], x2
++        b.ne            1b
++        ret
++.endm
++
++.macro  biweight_8      macs, macd
++        dup             v0.8B,  w5
++        dup             v1.8B,  w6
++        mov             v2.16B,  v16.16B
++        mov             v20.16B, v16.16B
++1:      subs            w3,  w3,  #2
++        ld1             {v4.8B}, [x0], x2
++        \macd           v2.8H,  v0.8B,  v4.8B
++        ld1             {v5.8B}, [x1], x2
++        \macs           v2.8H,  v1.8B,  v5.8B
++        ld1             {v6.8B}, [x0], x2
++        \macd           v20.8H, v0.8B,  v6.8B
++        ld1             {v7.8B}, [x1], x2
++        \macs           v20.8H, v1.8B,  v7.8B
++        sshl            v2.8H,  v2.8H,  v18.8H
++        sqxtun          v2.8B,  v2.8H
++        sshl            v20.8H, v20.8H, v18.8H
++        sqxtun          v4.8B,  v20.8H
++        mov             v20.16B, v16.16B
++        st1             {v2.8B}, [x7], x2
++        mov             v2.16B,  v16.16B
++        st1             {v4.8B}, [x7], x2
++        b.ne            1b
++        ret
++.endm
++
++.macro  biweight_4      macs, macd
++        dup             v0.8B,  w5
++        dup             v1.8B,  w6
++        mov             v2.16B, v16.16B
++        mov             v20.16B,v16.16B
++1:      subs            w3,  w3,  #4
++        ld1             {v4.S}[0], [x0], x2
++        ld1             {v4.S}[1], [x0], x2
++        \macd           v2.8H,  v0.8B,  v4.8B
++        ld1             {v5.S}[0], [x1], x2
++        ld1             {v5.S}[1], [x1], x2
++        \macs           v2.8H,  v1.8B,  v5.8B
++        b.lt            2f
++        ld1             {v6.S}[0], [x0], x2
++        ld1             {v6.S}[1], [x0], x2
++        \macd           v20.8H, v0.8B,  v6.8B
++        ld1             {v7.S}[0], [x1], x2
++        ld1             {v7.S}[1], [x1], x2
++        \macs           v20.8H, v1.8B,  v7.8B
++        sshl            v2.8H,  v2.8H,  v18.8H
++        sqxtun          v2.8B,  v2.8H
++        sshl            v20.8H, v20.8H, v18.8H
++        sqxtun          v4.8B,  v20.8H
++        mov             v20.16B, v16.16B
++        st1             {v2.S}[0], [x7], x2
++        st1             {v2.S}[1], [x7], x2
++        mov             v2.16B,  v16.16B
++        st1             {v4.S}[0], [x7], x2
++        st1             {v4.S}[1], [x7], x2
++        b.ne            1b
++        ret
++2:      sshl            v2.8H,  v2.8H,  v18.8H
++        sqxtun          v2.8B,  v2.8H
++        st1             {v2.S}[0], [x7], x2
++        st1             {v2.S}[1], [x7], x2
++        ret
++.endm
++
++.macro  biweight_func   w
++function ff_biweight_h264_pixels_\w\()_neon, export=1
++        sxtw            x2,  w2
++        lsr             w8,  w5,  #31
++        add             w7,  w7,  #1
++        eor             w8,  w8,  w6,  lsr #30
++        orr             w7,  w7,  #1
++        dup             v18.8H,   w4
++        lsl             w7,  w7,  w4
++        not             v18.16B,  v18.16B
++        dup             v16.8H,   w7
++        mov             x7,  x0
++        cbz             w8,  10f
++        subs            w8,  w8,  #1
++        b.eq            20f
++        subs            w8,  w8,  #1
++        b.eq            30f
++        b               40f
++10:     biweight_\w     umlal, umlal
++20:     neg             w5, w5
++        biweight_\w     umlal, umlsl
++30:     neg             w5, w5
++        neg             w6, w6
++        biweight_\w     umlsl, umlsl
++40:     neg             w6, w6
++        biweight_\w     umlsl, umlal
++endfunc
++.endm
++
++        biweight_func   16
++        biweight_func   8
++        biweight_func   4
++
++.macro  weight_16       add
++        dup             v0.16B,  w4
++1:      subs            w2,  w2,  #2
++        ld1             {v20.16B}, [x0], x1
++        umull           v4.8H,   v0.8B,  v20.8B
++        umull2          v6.8H,   v0.16B, v20.16B
++        ld1             {v28.16B}, [x0], x1
++        umull           v24.8H,  v0.8B,  v28.8B
++        umull2          v26.8H,  v0.16B, v28.16B
++        \add            v4.8H,   v16.8H, v4.8H
++        srshl           v4.8H,   v4.8H,  v18.8H
++        \add            v6.8H,   v16.8H, v6.8H
++        srshl           v6.8H,   v6.8H,  v18.8H
++        sqxtun          v4.8B,   v4.8H
++        sqxtun2         v4.16B,  v6.8H
++        \add            v24.8H,  v16.8H, v24.8H
++        srshl           v24.8H,  v24.8H, v18.8H
++        \add            v26.8H,  v16.8H, v26.8H
++        srshl           v26.8H,  v26.8H, v18.8H
++        sqxtun          v24.8B,  v24.8H
++        sqxtun2         v24.16B, v26.8H
++        st1             {v4.16B},  [x5], x1
++        st1             {v24.16B}, [x5], x1
++        b.ne            1b
++        ret
++.endm
++
++.macro  weight_8        add
++        dup             v0.8B,  w4
++1:      subs            w2,  w2,  #2
++        ld1             {v4.8B}, [x0], x1
++        umull           v2.8H,  v0.8B,  v4.8B
++        ld1             {v6.8B}, [x0], x1
++        umull           v20.8H, v0.8B,  v6.8B
++        \add            v2.8H,  v16.8H,  v2.8H
++        srshl           v2.8H,  v2.8H,  v18.8H
++        sqxtun          v2.8B,  v2.8H
++        \add            v20.8H, v16.8H,  v20.8H
++        srshl           v20.8H, v20.8H, v18.8H
++        sqxtun          v4.8B,  v20.8H
++        st1             {v2.8B}, [x5], x1
++        st1             {v4.8B}, [x5], x1
++        b.ne            1b
++        ret
++.endm
++
++.macro  weight_4        add
++        dup             v0.8B,  w4
++1:      subs            w2,  w2,  #4
++        ld1             {v4.S}[0], [x0], x1
++        ld1             {v4.S}[1], [x0], x1
++        umull           v2.8H,  v0.8B,  v4.8B
++        b.lt            2f
++        ld1             {v6.S}[0], [x0], x1
++        ld1             {v6.S}[1], [x0], x1
++        umull           v20.8H, v0.8B,  v6.8B
++        \add            v2.8H,  v16.8H,  v2.8H
++        srshl           v2.8H,  v2.8H,  v18.8H
++        sqxtun          v2.8B,  v2.8H
++        \add            v20.8H, v16.8H,  v20.8H
++        srshl           v20.8H, v20.8h, v18.8H
++        sqxtun          v4.8B,  v20.8H
++        st1             {v2.S}[0], [x5], x1
++        st1             {v2.S}[1], [x5], x1
++        st1             {v4.S}[0], [x5], x1
++        st1             {v4.S}[1], [x5], x1
++        b.ne            1b
++        ret
++2:      \add            v2.8H,  v16.8H,  v2.8H
++        srshl           v2.8H,  v2.8H,  v18.8H
++        sqxtun          v2.8B,  v2.8H
++        st1             {v2.S}[0], [x5], x1
++        st1             {v2.S}[1], [x5], x1
++        ret
++.endm
++
++.macro  weight_func     w
++function ff_weight_h264_pixels_\w\()_neon, export=1
++        sxtw            x1,  w1
++        cmp             w3,  #1
++        mov             w6,  #1
++        lsl             w5,  w5,  w3
++        dup             v16.8H,  w5
++        mov             x5,  x0
++        b.le            20f
++        sub             w6,  w6,  w3
++        dup             v18.8H,  w6
++        cmp             w4, #0
++        b.lt            10f
++        weight_\w       shadd
++10:     neg             w4,  w4
++        weight_\w       shsub
++20:     neg             w6,  w3
++        dup             v18.8H,  w6
++        cmp             w4,  #0
++        b.lt            10f
++        weight_\w       add
++10:     neg             w4,  w4
++        weight_\w       sub
++endfunc
++.endm
++
++        weight_func     16
++        weight_func     8
++        weight_func     4
+diff --git a/media/ffvpx/libavcodec/aarch64/h264idct_neon.S b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
+@@ -0,0 +1,409 @@
++/*
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++#include "neon.S"
++
++function ff_h264_idct_add_neon, export=1
++        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
++        sxtw            x2,     w2
++        movi            v30.8H, #0
++
++        add             v4.4H,  v0.4H,  v2.4H
++        sshr            v16.4H, v1.4H,  #1
++        st1             {v30.8H},    [x1], #16
++        sshr            v17.4H, v3.4H,  #1
++        st1             {v30.8H},    [x1], #16
++        sub             v5.4H,  v0.4H,  v2.4H
++        sub             v6.4H,  v16.4H, v3.4H
++        add             v7.4H,  v1.4H,  v17.4H
++        add             v0.4H,  v4.4H,  v7.4H
++        add             v1.4H,  v5.4H,  v6.4H
++        sub             v2.4H,  v5.4H,  v6.4H
++        sub             v3.4H,  v4.4H,  v7.4H
++
++        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
++
++        add             v4.4H,  v0.4H,  v2.4H
++        ld1             {v18.S}[0], [x0], x2
++        sshr            v16.4H,  v3.4H,  #1
++        sshr            v17.4H,  v1.4H,  #1
++        ld1             {v18.S}[1], [x0], x2
++        sub             v5.4H,  v0.4H,  v2.4H
++        ld1             {v19.S}[1], [x0], x2
++        add             v6.4H,  v16.4H, v1.4H
++        ins             v4.D[1],  v5.D[0]
++        sub             v7.4H,  v17.4H, v3.4H
++        ld1             {v19.S}[0], [x0], x2
++        ins             v6.D[1],  v7.D[0]
++        sub             x0,  x0,  x2, lsl #2
++        add             v0.8H,  v4.8H,  v6.8H
++        sub             v1.8H,  v4.8H,  v6.8H
++
++        srshr           v0.8H,  v0.8H,  #6
++        srshr           v1.8H,  v1.8H,  #6
++
++        uaddw           v0.8H,  v0.8H,  v18.8B
++        uaddw           v1.8H,  v1.8H,  v19.8B
++
++        sqxtun          v0.8B, v0.8H
++        sqxtun          v1.8B, v1.8H
++
++        st1             {v0.S}[0],  [x0], x2
++        st1             {v0.S}[1],  [x0], x2
++        st1             {v1.S}[1],  [x0], x2
++        st1             {v1.S}[0],  [x0], x2
++
++        sub             x1,  x1,  #32
++        ret
++endfunc
++
++function ff_h264_idct_dc_add_neon, export=1
++        sxtw            x2,  w2
++        mov             w3,       #0
++        ld1r            {v2.8H},  [x1]
++        strh            w3,       [x1]
++        srshr           v2.8H,  v2.8H,  #6
++        ld1             {v0.S}[0],  [x0], x2
++        ld1             {v0.S}[1],  [x0], x2
++        uaddw           v3.8H,  v2.8H,  v0.8B
++        ld1             {v1.S}[0],  [x0], x2
++        ld1             {v1.S}[1],  [x0], x2
++        uaddw           v4.8H,  v2.8H,  v1.8B
++        sqxtun          v0.8B,  v3.8H
++        sqxtun          v1.8B,  v4.8H
++        sub             x0,  x0,  x2, lsl #2
++        st1             {v0.S}[0],  [x0], x2
++        st1             {v0.S}[1],  [x0], x2
++        st1             {v1.S}[0],  [x0], x2
++        st1             {v1.S}[1],  [x0], x2
++        ret
++endfunc
++
++function ff_h264_idct_add16_neon, export=1
++        mov             x12, x30
++        mov             x6,  x0         // dest
++        mov             x5,  x1         // block_offset
++        mov             x1,  x2         // block
++        mov             w9,  w3         // stride
++        movrel          x7,  scan8
++        mov             x10, #16
++        movrel          x13, X(ff_h264_idct_dc_add_neon)
++        movrel          x14, X(ff_h264_idct_add_neon)
++1:      mov             w2,  w9
++        ldrb            w3,  [x7], #1
++        ldrsw           x0,  [x5], #4
++        ldrb            w3,  [x4,  w3,  uxtw]
++        subs            w3,  w3,  #1
++        b.lt            2f
++        ldrsh           w3,  [x1]
++        add             x0,  x0,  x6
++        ccmp            w3,  #0,  #4,  eq
++        csel            x15, x13, x14, ne
++        blr             x15
++2:      subs            x10, x10, #1
++        add             x1,  x1,  #32
++        b.ne            1b
++        ret             x12
++endfunc
++
++function ff_h264_idct_add16intra_neon, export=1
++        mov             x12, x30
++        mov             x6,  x0         // dest
++        mov             x5,  x1         // block_offset
++        mov             x1,  x2         // block
++        mov             w9,  w3         // stride
++        movrel          x7,  scan8
++        mov             x10, #16
++        movrel          x13, X(ff_h264_idct_dc_add_neon)
++        movrel          x14, X(ff_h264_idct_add_neon)
++1:      mov             w2,  w9
++        ldrb            w3,  [x7], #1
++        ldrsw           x0,  [x5], #4
++        ldrb            w3,  [x4,  w3,  uxtw]
++        add             x0,  x0,  x6
++        cmp             w3,  #0
++        ldrsh           w3,  [x1]
++        csel            x15, x13, x14, eq
++        ccmp            w3,  #0,  #0,  eq
++        b.eq            2f
++        blr             x15
++2:      subs            x10, x10, #1
++        add             x1,  x1,  #32
++        b.ne            1b
++        ret             x12
++endfunc
++
++function ff_h264_idct_add8_neon, export=1
++        sub             sp,  sp, #0x40
++        stp             x19, x20, [sp]
++        mov             x12, x30
++        ldp             x6,  x15, [x0]          // dest[0], dest[1]
++        add             x5,  x1,  #16*4         // block_offset
++        add             x9,  x2,  #16*32        // block
++        mov             w19, w3                 // stride
++        movrel          x13, X(ff_h264_idct_dc_add_neon)
++        movrel          x14, X(ff_h264_idct_add_neon)
++        movrel          x7,  scan8, 16
++        mov             x10, #0
++        mov             x11, #16
++1:      mov             w2,  w19
++        ldrb            w3,  [x7, x10]          // scan8[i]
++        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
++        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
++        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
++        add             x1,  x9,  x10, lsl #5   // block + i * 16
++        cmp             w3,  #0
++        ldrsh           w3,  [x1]               // block[i*16]
++        csel            x20, x13, x14, eq
++        ccmp            w3,  #0,  #0,  eq
++        b.eq            2f
++        blr             x20
++2:      add             x10, x10, #1
++        cmp             x10, #4
++        csel            x10, x11, x10, eq     // mov x10, #16
++        csel            x6,  x15, x6,  eq
++        cmp             x10, #20
++        b.lt            1b
++        ldp             x19, x20, [sp]
++        add             sp,  sp,  #0x40
++        ret             x12
++endfunc
++
++.macro  idct8x8_cols    pass
++  .if \pass == 0
++        va      .req    v18
++        vb      .req    v30
++        sshr            v18.8H, v26.8H, #1
++        add             v16.8H, v24.8H, v28.8H
++        ld1             {v30.8H, v31.8H}, [x1]
++        st1             {v19.8H}, [x1],  #16
++        st1             {v19.8H}, [x1],  #16
++        sub             v17.8H,  v24.8H, v28.8H
++        sshr            v19.8H,  v30.8H, #1
++        sub             v18.8H,  v18.8H,  v30.8H
++        add             v19.8H,  v19.8H,  v26.8H
++  .else
++        va      .req    v30
++        vb      .req    v18
++        sshr            v30.8H, v26.8H, #1
++        sshr            v19.8H, v18.8H, #1
++        add             v16.8H, v24.8H, v28.8H
++        sub             v17.8H, v24.8H, v28.8H
++        sub             v30.8H, v30.8H, v18.8H
++        add             v19.8H, v19.8H, v26.8H
++  .endif
++        add             v26.8H, v17.8H, va.8H
++        sub             v28.8H, v17.8H, va.8H
++        add             v24.8H, v16.8H, v19.8H
++        sub             vb.8H,  v16.8H, v19.8H
++        sub             v16.8H, v29.8H, v27.8H
++        add             v17.8H, v31.8H, v25.8H
++        sub             va.8H,  v31.8H, v25.8H
++        add             v19.8H, v29.8H, v27.8H
++        sub             v16.8H, v16.8H, v31.8H
++        sub             v17.8H, v17.8H, v27.8H
++        add             va.8H,  va.8H,  v29.8H
++        add             v19.8H, v19.8H, v25.8H
++        sshr            v25.8H, v25.8H, #1
++        sshr            v27.8H, v27.8H, #1
++        sshr            v29.8H, v29.8H, #1
++        sshr            v31.8H, v31.8H, #1
++        sub             v16.8H, v16.8H, v31.8H
++        sub             v17.8H, v17.8H, v27.8H
++        add             va.8H,  va.8H,  v29.8H
++        add             v19.8H, v19.8H, v25.8H
++        sshr            v25.8H, v16.8H, #2
++        sshr            v27.8H, v17.8H, #2
++        sshr            v29.8H, va.8H,  #2
++        sshr            v31.8H, v19.8H, #2
++        sub             v19.8H, v19.8H, v25.8H
++        sub             va.8H,  v27.8H, va.8H
++        add             v17.8H, v17.8H, v29.8H
++        add             v16.8H, v16.8H, v31.8H
++  .if \pass == 0
++        sub             v31.8H, v24.8H, v19.8H
++        add             v24.8H, v24.8H, v19.8H
++        add             v25.8H, v26.8H, v18.8H
++        sub             v18.8H, v26.8H, v18.8H
++        add             v26.8H, v28.8H, v17.8H
++        add             v27.8H, v30.8H, v16.8H
++        sub             v29.8H, v28.8H, v17.8H
++        sub             v28.8H, v30.8H, v16.8H
++  .else
++        sub             v31.8H, v24.8H, v19.8H
++        add             v24.8H, v24.8H, v19.8H
++        add             v25.8H, v26.8H, v30.8H
++        sub             v30.8H, v26.8H, v30.8H
++        add             v26.8H, v28.8H, v17.8H
++        sub             v29.8H, v28.8H, v17.8H
++        add             v27.8H, v18.8H, v16.8H
++        sub             v28.8H, v18.8H, v16.8H
++  .endif
++        .unreq          va
++        .unreq          vb
++.endm
++
++function ff_h264_idct8_add_neon, export=1
++        movi            v19.8H,   #0
++        sxtw            x2,       w2
++        ld1             {v24.8H, v25.8H}, [x1]
++        st1             {v19.8H},  [x1],   #16
++        st1             {v19.8H},  [x1],   #16
++        ld1             {v26.8H, v27.8H}, [x1]
++        st1             {v19.8H},  [x1],   #16
++        st1             {v19.8H},  [x1],   #16
++        ld1             {v28.8H, v29.8H}, [x1]
++        st1             {v19.8H},  [x1],   #16
++        st1             {v19.8H},  [x1],   #16
++
++        idct8x8_cols    0
++        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
++        idct8x8_cols    1
++
++        mov             x3,  x0
++        srshr           v24.8H, v24.8H, #6
++        ld1             {v0.8B},     [x0], x2
++        srshr           v25.8H, v25.8H, #6
++        ld1             {v1.8B},     [x0], x2
++        srshr           v26.8H, v26.8H, #6
++        ld1             {v2.8B},     [x0], x2
++        srshr           v27.8H, v27.8H, #6
++        ld1             {v3.8B},     [x0], x2
++        srshr           v28.8H, v28.8H, #6
++        ld1             {v4.8B},     [x0], x2
++        srshr           v29.8H, v29.8H, #6
++        ld1             {v5.8B},     [x0], x2
++        srshr           v30.8H, v30.8H, #6
++        ld1             {v6.8B},     [x0], x2
++        srshr           v31.8H, v31.8H, #6
++        ld1             {v7.8B},     [x0], x2
++        uaddw           v24.8H, v24.8H, v0.8B
++        uaddw           v25.8H, v25.8H, v1.8B
++        uaddw           v26.8H, v26.8H, v2.8B
++        sqxtun          v0.8B,  v24.8H
++        uaddw           v27.8H, v27.8H, v3.8B
++        sqxtun          v1.8B,  v25.8H
++        uaddw           v28.8H, v28.8H, v4.8B
++        sqxtun          v2.8B,  v26.8H
++        st1             {v0.8B},     [x3], x2
++        uaddw           v29.8H, v29.8H, v5.8B
++        sqxtun          v3.8B,  v27.8H
++        st1             {v1.8B},     [x3], x2
++        uaddw           v30.8H, v30.8H, v6.8B
++        sqxtun          v4.8B,  v28.8H
++        st1             {v2.8B},     [x3], x2
++        uaddw           v31.8H, v31.8H, v7.8B
++        sqxtun          v5.8B,  v29.8H
++        st1             {v3.8B},     [x3], x2
++        sqxtun          v6.8B,  v30.8H
++        sqxtun          v7.8B,  v31.8H
++        st1             {v4.8B},     [x3], x2
++        st1             {v5.8B},     [x3], x2
++        st1             {v6.8B},     [x3], x2
++        st1             {v7.8B},     [x3], x2
++
++        sub             x1,  x1,  #128
++        ret
++endfunc
++
++function ff_h264_idct8_dc_add_neon, export=1
++        mov             w3,       #0
++        sxtw            x2,       w2
++        ld1r            {v31.8H}, [x1]
++        strh            w3,       [x1]
++        ld1             {v0.8B},  [x0], x2
++        srshr           v31.8H, v31.8H, #6
++        ld1             {v1.8B},     [x0], x2
++        ld1             {v2.8B},     [x0], x2
++        uaddw           v24.8H, v31.8H, v0.8B
++        ld1             {v3.8B},     [x0], x2
++        uaddw           v25.8H, v31.8H, v1.8B
++        ld1             {v4.8B},     [x0], x2
++        uaddw           v26.8H, v31.8H, v2.8B
++        ld1             {v5.8B},     [x0], x2
++        uaddw           v27.8H, v31.8H, v3.8B
++        ld1             {v6.8B},     [x0], x2
++        uaddw           v28.8H, v31.8H, v4.8B
++        ld1             {v7.8B},     [x0], x2
++        uaddw           v29.8H, v31.8H, v5.8B
++        uaddw           v30.8H, v31.8H, v6.8B
++        uaddw           v31.8H, v31.8H, v7.8B
++        sqxtun          v0.8B,  v24.8H
++        sqxtun          v1.8B,  v25.8H
++        sqxtun          v2.8B,  v26.8H
++        sqxtun          v3.8B,  v27.8H
++        sub             x0,  x0,  x2, lsl #3
++        st1             {v0.8B},     [x0], x2
++        sqxtun          v4.8B,  v28.8H
++        st1             {v1.8B},     [x0], x2
++        sqxtun          v5.8B,  v29.8H
++        st1             {v2.8B},     [x0], x2
++        sqxtun          v6.8B,  v30.8H
++        st1             {v3.8B},     [x0], x2
++        sqxtun          v7.8B,  v31.8H
++        st1             {v4.8B},     [x0], x2
++        st1             {v5.8B},     [x0], x2
++        st1             {v6.8B},     [x0], x2
++        st1             {v7.8B},     [x0], x2
++        ret
++endfunc
++
++function ff_h264_idct8_add4_neon, export=1
++        mov             x12, x30
++        mov             x6,  x0
++        mov             x5,  x1
++        mov             x1,  x2
++        mov             w2,  w3
++        movrel          x7,  scan8
++        mov             w10, #16
++        movrel          x13, X(ff_h264_idct8_dc_add_neon)
++        movrel          x14, X(ff_h264_idct8_add_neon)
++1:      ldrb            w9,  [x7], #4
++        ldrsw           x0,  [x5], #16
++        ldrb            w9,  [x4, w9, UXTW]
++        subs            w9,  w9,  #1
++        b.lt            2f
++        ldrsh           w11,  [x1]
++        add             x0,  x6,  x0
++        ccmp            w11, #0,  #4,  eq
++        csel            x15, x13, x14, ne
++        blr             x15
++2:      subs            w10, w10, #4
++        add             x1,  x1,  #128
++        b.ne            1b
++        ret             x12
++endfunc
++
++const   scan8
++        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
++        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
++        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
++        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
++        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
++        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
++        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
++        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
++        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
++        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
++        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
++        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
++endconst
+diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_init.c b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
+@@ -0,0 +1,93 @@
++/*
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/avcodec.h"
++#include "libavcodec/h264pred.h"
++
++void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
++
++void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
++void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
++
++static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
++                                        const int bit_depth,
++                                        const int chroma_format_idc)
++{
++    const int high_depth = bit_depth > 8;
++
++    if (high_depth)
++        return;
++
++    if (chroma_format_idc <= 1) {
++        h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
++        h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
++        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
++            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
++        h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
++        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
++            codec_id != AV_CODEC_ID_VP8) {
++            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
++            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
++            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
++            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
++            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
++            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
++            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
++        }
++    }
++
++    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
++    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
++    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
++    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
++    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
++    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
++    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
++        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
++        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
++}
++
++av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
++                                       int bit_depth, const int chroma_format_idc)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags))
++        h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_neon.S b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
+@@ -0,0 +1,361 @@
++/*
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
++.if \n >= 8 || \hi == 0
++        ld1             {\rd\().b}[0],  [\rs], \rt
++        ld1             {\rd\().b}[1],  [\rs], \rt
++        ld1             {\rd\().b}[2],  [\rs], \rt
++        ld1             {\rd\().b}[3],  [\rs], \rt
++.endif
++.if \n >= 8 || \hi == 1
++        ld1             {\rd\().b}[4],  [\rs], \rt
++        ld1             {\rd\().b}[5],  [\rs], \rt
++        ld1             {\rd\().b}[6],  [\rs], \rt
++        ld1             {\rd\().b}[7],  [\rs], \rt
++.endif
++.if \n == 16
++        ld1             {\rd\().b}[8],  [\rs], \rt
++        ld1             {\rd\().b}[9],  [\rs], \rt
++        ld1             {\rd\().b}[10], [\rs], \rt
++        ld1             {\rd\().b}[11], [\rs], \rt
++        ld1             {\rd\().b}[12], [\rs], \rt
++        ld1             {\rd\().b}[13], [\rs], \rt
++        ld1             {\rd\().b}[14], [\rs], \rt
++        ld1             {\rd\().b}[15], [\rs], \rt
++.endif
++.endm
++
++function ff_pred16x16_128_dc_neon, export=1
++        movi            v0.16b,  #128
++        b               .L_pred16x16_dc_end
++endfunc
++
++function ff_pred16x16_top_dc_neon, export=1
++        sub             x2,  x0,  x1
++        ld1             {v0.16b},  [x2]
++        uaddlv          h0,  v0.16b
++        rshrn           v0.8b,  v0.8h,  #4
++        dup             v0.16b, v0.b[0]
++        b               .L_pred16x16_dc_end
++endfunc
++
++function ff_pred16x16_left_dc_neon, export=1
++        sub             x2,  x0,  #1
++        ldcol.8         v0,  x2,  x1, 16
++        uaddlv          h0,  v0.16b
++        rshrn           v0.8b,  v0.8h,  #4
++        dup             v0.16b, v0.b[0]
++        b               .L_pred16x16_dc_end
++endfunc
++
++function ff_pred16x16_dc_neon, export=1
++        sub             x2,  x0,  x1
++        sub             x3,  x0,  #1
++        ld1             {v0.16b}, [x2]
++        ldcol.8         v1,  x3,  x1, 16
++        uaddlv          h0,  v0.16b
++        uaddlv          h1,  v1.16b
++        add             v0.4h,  v0.4h,  v1.4h
++        rshrn           v0.8b,  v0.8h,  #5
++        dup             v0.16b, v0.b[0]
++.L_pred16x16_dc_end:
++        mov             w3,  #8
++6:      st1             {v0.16b}, [x0], x1
++        st1             {v0.16b}, [x0], x1
++        subs            w3,  w3,  #1
++        b.ne            6b
++        ret
++endfunc
++
++function ff_pred16x16_hor_neon, export=1
++        sub             x2,  x0,  #1
++        mov             w3,  #16
++1:      ld1r            {v0.16b}, [x2], x1
++        st1             {v0.16b}, [x0], x1
++        subs            w3,  w3,  #1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_pred16x16_vert_neon, export=1
++        sub             x2,  x0,  x1
++        add             x1,  x1,  x1
++        ld1             {v0.16b}, [x2], x1
++        mov             w3,  #8
++1:      st1             {v0.16b}, [x0], x1
++        st1             {v0.16b}, [x2], x1
++        subs            w3,  w3,  #1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_pred16x16_plane_neon, export=1
++        sub             x3,  x0,  x1
++        movrel          x4,  p16weight
++        add             x2,  x3,  #8
++        sub             x3,  x3,  #1
++        ld1             {v0.8b},  [x3]
++        ld1             {v2.8b},  [x2], x1
++        ldcol.8         v1,  x3,  x1
++        add             x3,  x3,  x1
++        ldcol.8         v3,  x3,  x1
++        rev64           v0.8b,  v0.8b
++        rev64           v1.8b,  v1.8b
++        uaddl           v7.8h,  v2.8b,  v3.8b
++        usubl           v2.8h,  v2.8b,  v0.8b
++        usubl           v3.8h,  v3.8b,  v1.8b
++        ld1             {v0.8h},     [x4]
++        mul             v2.8h,  v2.8h,  v0.8h
++        mul             v3.8h,  v3.8h,  v0.8h
++        addp            v2.8h,  v2.8h,  v3.8h
++        addp            v2.8h,  v2.8h,  v2.8h
++        addp            v2.4h,  v2.4h,  v2.4h
++        sshll           v3.4s,  v2.4h,  #2
++        saddw           v2.4s,  v3.4s,  v2.4h
++        rshrn           v4.4h,  v2.4s,  #6
++        trn2            v5.4h,  v4.4h,  v4.4h
++        add             v2.4h,  v4.4h,  v5.4h
++        shl             v3.4h,  v2.4h,  #3
++        ext             v7.16b, v7.16b, v7.16b, #14
++        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
++        add             v7.4h,  v7.4h,  v0.4h
++        shl             v2.4h,  v7.4h,  #4
++        sub             v2.4h,  v2.4h,  v3.4h
++        shl             v3.4h,  v4.4h,  #4
++        ext             v0.16b, v0.16b, v0.16b, #14
++        sub             v6.4h,  v5.4h,  v3.4h
++        mov             v0.h[0],  wzr
++        mul             v0.8h,  v0.8h,  v4.h[0]
++        dup             v1.8h,  v2.h[0]
++        dup             v2.8h,  v4.h[0]
++        dup             v3.8h,  v6.h[0]
++        shl             v2.8h,  v2.8h,  #3
++        add             v1.8h,  v1.8h,  v0.8h
++        add             v3.8h,  v3.8h,  v2.8h
++        mov             w3,  #16
++1:
++        sqshrun         v0.8b,  v1.8h,  #5
++        add             v1.8h,  v1.8h,  v2.8h
++        sqshrun2        v0.16b, v1.8h,  #5
++        add             v1.8h,  v1.8h,  v3.8h
++        st1             {v0.16b}, [x0], x1
++        subs            w3,  w3,  #1
++        b.ne            1b
++        ret
++endfunc
++
++const   p16weight, align=4
++        .short          1,2,3,4,5,6,7,8
++endconst
++const   p8weight, align=4
++        .short          1,2,3,4,1,2,3,4
++endconst
++
++function ff_pred8x8_hor_neon, export=1
++        sub             x2,  x0,  #1
++        mov             w3,  #8
++1:      ld1r            {v0.8b},  [x2], x1
++        st1             {v0.8b},  [x0], x1
++        subs            w3,  w3,  #1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_pred8x8_vert_neon, export=1
++        sub             x2,  x0,  x1
++        lsl             x1,  x1,  #1
++        ld1             {v0.8b},  [x2], x1
++        mov             w3,  #4
++1:      st1             {v0.8b},  [x0], x1
++        st1             {v0.8b},  [x2], x1
++        subs            w3,  w3,  #1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_pred8x8_plane_neon, export=1
++        sub             x3,  x0,  x1
++        movrel          x4,  p8weight
++        movrel          x5,  p16weight
++        add             x2,  x3,  #4
++        sub             x3,  x3,  #1
++        ld1             {v0.s}[0],  [x3]
++        ld1             {v2.s}[0],  [x2], x1
++        ldcol.8         v0,  x3,  x1,  4,  hi=1
++        add             x3,  x3,  x1
++        ldcol.8         v3,  x3,  x1,  4
++        uaddl           v7.8h,  v2.8b,  v3.8b
++        rev32           v0.8b,  v0.8b
++        trn1            v2.2s,  v2.2s,  v3.2s
++        usubl           v2.8h,  v2.8b,  v0.8b
++        ld1             {v6.8h},  [x4]
++        mul             v2.8h,  v2.8h,  v6.8h
++        ld1             {v0.8h},  [x5]
++        saddlp          v2.4s,  v2.8h
++        addp            v2.4s,  v2.4s,  v2.4s
++        shl             v3.4s,  v2.4s,  #4
++        add             v2.4s,  v3.4s,  v2.4s
++        rshrn           v5.4h,  v2.4s,  #5
++        addp            v2.4h,  v5.4h,  v5.4h
++        shl             v3.4h,  v2.4h,  #1
++        add             v3.4h,  v3.4h,  v2.4h
++        rev64           v7.4h,  v7.4h
++        add             v7.4h,  v7.4h,  v0.4h
++        shl             v2.4h,  v7.4h,  #4
++        sub             v2.4h,  v2.4h,  v3.4h
++        ext             v0.16b, v0.16b, v0.16b, #14
++        mov             v0.h[0],  wzr
++        mul             v0.8h,  v0.8h,  v5.h[0]
++        dup             v1.8h,  v2.h[0]
++        dup             v2.8h,  v5.h[1]
++        add             v1.8h,  v1.8h,  v0.8h
++        mov             w3,  #8
++1:
++        sqshrun         v0.8b,  v1.8h,  #5
++        add             v1.8h,  v1.8h,  v2.8h
++        st1             {v0.8b},  [x0], x1
++        subs            w3,  w3,  #1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_pred8x8_128_dc_neon, export=1
++        movi            v0.8b,  #128
++        movi            v1.8b,  #128
++        b               .L_pred8x8_dc_end
++endfunc
++
++function ff_pred8x8_top_dc_neon, export=1
++        sub             x2,  x0,  x1
++        ld1             {v0.8b},  [x2]
++        uaddlp          v0.4h,  v0.8b
++        addp            v0.4h,  v0.4h,  v0.4h
++        zip1            v0.8h,  v0.8h,  v0.8h
++        rshrn           v2.8b,  v0.8h,  #2
++        zip1            v0.8b,  v2.8b,  v2.8b
++        zip1            v1.8b,  v2.8b,  v2.8b
++        b               .L_pred8x8_dc_end
++endfunc
++
++function ff_pred8x8_left_dc_neon, export=1
++        sub             x2,  x0,  #1
++        ldcol.8         v0,  x2,  x1
++        uaddlp          v0.4h,  v0.8b
++        addp            v0.4h,  v0.4h,  v0.4h
++        rshrn           v2.8b,  v0.8h,  #2
++        dup             v1.8b,  v2.b[1]
++        dup             v0.8b,  v2.b[0]
++        b               .L_pred8x8_dc_end
++endfunc
++
++function ff_pred8x8_dc_neon, export=1
++        sub             x2,  x0,  x1
++        sub             x3,  x0,  #1
++        ld1             {v0.8b}, [x2]
++        ldcol.8         v1,  x3,  x1
++        uaddlp          v0.4h,  v0.8b
++        uaddlp          v1.4h,  v1.8b
++        trn1            v2.2s,  v0.2s,  v1.2s
++        trn2            v3.2s,  v0.2s,  v1.2s
++        addp            v4.4h,  v2.4h,  v3.4h
++        addp            v5.4h,  v4.4h,  v4.4h
++        rshrn           v6.8b,  v5.8h,  #3
++        rshrn           v7.8b,  v4.8h,  #2
++        dup             v0.8b,  v6.b[0]
++        dup             v2.8b,  v7.b[2]
++        dup             v1.8b,  v7.b[3]
++        dup             v3.8b,  v6.b[1]
++        zip1            v0.2s,  v0.2s,  v2.2s
++        zip1            v1.2s,  v1.2s,  v3.2s
++.L_pred8x8_dc_end:
++        mov             w3,  #4
++        add             x2,  x0,  x1,  lsl #2
++6:      st1             {v0.8b},  [x0], x1
++        st1             {v1.8b},  [x2], x1
++        subs            w3,  w3,  #1
++        b.ne            6b
++        ret
++endfunc
++
++function ff_pred8x8_l0t_dc_neon, export=1
++        sub             x2,  x0,  x1
++        sub             x3,  x0,  #1
++        ld1             {v0.8b},  [x2]
++        ldcol.8         v1,  x3,  x1,  4
++        zip1            v0.4s,  v0.4s,  v1.4s
++        uaddlp          v0.8h,  v0.16b
++        addp            v0.8h,  v0.8h,  v0.8h
++        addp            v1.4h,  v0.4h,  v0.4h
++        rshrn           v2.8b,  v0.8h,  #2
++        rshrn           v3.8b,  v1.8h,  #3
++        dup             v4.8b,  v3.b[0]
++        dup             v6.8b,  v2.b[2]
++        dup             v5.8b,  v2.b[0]
++        zip1            v0.2s,  v4.2s,  v6.2s
++        zip1            v1.2s,  v5.2s,  v6.2s
++        b               .L_pred8x8_dc_end
++endfunc
++
++function ff_pred8x8_l00_dc_neon, export=1
++        sub             x2,  x0,  #1
++        ldcol.8         v0,  x2,  x1,  4
++        uaddlp          v0.4h,  v0.8b
++        addp            v0.4h,  v0.4h,  v0.4h
++        rshrn           v0.8b,  v0.8h,  #2
++        movi            v1.8b,  #128
++        dup             v0.8b,  v0.b[0]
++        b               .L_pred8x8_dc_end
++endfunc
++
++function ff_pred8x8_0lt_dc_neon, export=1
++        add             x3,  x0,  x1,  lsl #2
++        sub             x2,  x0,  x1
++        sub             x3,  x3,  #1
++        ld1             {v0.8b},  [x2]
++        ldcol.8         v1,  x3,  x1,  4,  hi=1
++        zip1            v0.4s,  v0.4s,  v1.4s
++        uaddlp          v0.8h,  v0.16b
++        addp            v0.8h,  v0.8h,  v0.8h
++        addp            v1.4h,  v0.4h,  v0.4h
++        rshrn           v2.8b,  v0.8h,  #2
++        rshrn           v3.8b,  v1.8h,  #3
++        dup             v4.8b,  v2.b[0]
++        dup             v5.8b,  v2.b[3]
++        dup             v6.8b,  v2.b[2]
++        dup             v7.8b,  v3.b[1]
++        zip1            v0.2s,  v4.2s,  v6.2s
++        zip1            v1.2s,  v5.2s,  v7.2s
++        b               .L_pred8x8_dc_end
++endfunc
++
++function ff_pred8x8_0l0_dc_neon, export=1
++        add             x2,  x0,  x1,  lsl #2
++        sub             x2,  x2,  #1
++        ldcol.8         v1,  x2,  x1,  4
++        uaddlp          v2.4h,  v1.8b
++        addp            v2.4h,  v2.4h,  v2.4h
++        rshrn           v1.8b,  v2.8h,  #2
++        movi            v0.8b,  #128
++        dup             v1.8b,  v1.b[0]
++        b               .L_pred8x8_dc_end
++endfunc
+diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
+@@ -0,0 +1,123 @@
++/*
++ * ARM NEON optimised DSP functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stddef.h>
++#include <stdint.h>
++
++#include "config.h"
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/hpeldsp.h"
++
++void     ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void  ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void  ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void      ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void   ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void   ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void  ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++
++void  ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void  ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void   ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void   ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void  ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++
++void     ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void  ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void  ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void      ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void   ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void   ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++void  ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
++                              ptrdiff_t line_size, int h);
++
++void  ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void  ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
++                                     ptrdiff_t line_size, int h);
++
++av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
++        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
++        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
++        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
++        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
++        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
++        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
++        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
++
++        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
++        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
++        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
++        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
++        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
++        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
++        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
++        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
++
++        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
++        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
++        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
++        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
++        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
++        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
++        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
++        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
++
++        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
++        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
++        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
++        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
++    }
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
+@@ -0,0 +1,397 @@
++/*
++ * ARM NEON optimised DSP functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++.macro  pixels16        rnd=1, avg=0
++  .if \avg
++        mov             x12, x0
++  .endif
++1:      ld1             {v0.16B},  [x1], x2
++        ld1             {v1.16B},  [x1], x2
++        ld1             {v2.16B},  [x1], x2
++        ld1             {v3.16B},  [x1], x2
++  .if \avg
++        ld1             {v4.16B},  [x12], x2
++        urhadd          v0.16B,  v0.16B,  v4.16B
++        ld1             {v5.16B},  [x12], x2
++        urhadd          v1.16B,  v1.16B,  v5.16B
++        ld1             {v6.16B},  [x12], x2
++        urhadd          v2.16B,  v2.16B,  v6.16B
++        ld1             {v7.16B},  [x12], x2
++        urhadd          v3.16B,  v3.16B,  v7.16B
++  .endif
++        subs            w3,  w3,  #4
++        st1             {v0.16B},  [x0], x2
++        st1             {v1.16B},  [x0], x2
++        st1             {v2.16B},  [x0], x2
++        st1             {v3.16B},  [x0], x2
++        b.ne            1b
++        ret
++.endm
++
++.macro  pixels16_x2     rnd=1, avg=0
++1:      ld1             {v0.16B, v1.16B}, [x1], x2
++        ld1             {v2.16B, v3.16B}, [x1], x2
++        subs            w3,  w3,  #2
++        ext             v1.16B,  v0.16B,  v1.16B,  #1
++        avg             v0.16B,  v0.16B,  v1.16B
++        ext             v3.16B,  v2.16B,  v3.16B,  #1
++        avg             v2.16B,  v2.16B,  v3.16B
++  .if \avg
++        ld1             {v1.16B}, [x0], x2
++        ld1             {v3.16B}, [x0]
++        urhadd          v0.16B,  v0.16B,  v1.16B
++        urhadd          v2.16B,  v2.16B,  v3.16B
++        sub             x0,  x0,  x2
++  .endif
++        st1             {v0.16B}, [x0], x2
++        st1             {v2.16B}, [x0], x2
++        b.ne            1b
++        ret
++.endm
++
++.macro  pixels16_y2     rnd=1, avg=0
++        sub             w3,  w3,  #2
++        ld1             {v0.16B}, [x1], x2
++        ld1             {v1.16B}, [x1], x2
++1:      subs            w3,  w3,  #2
++        avg             v2.16B,  v0.16B,  v1.16B
++        ld1             {v0.16B}, [x1], x2
++        avg             v3.16B,  v0.16B,  v1.16B
++        ld1             {v1.16B}, [x1], x2
++  .if \avg
++        ld1             {v4.16B}, [x0], x2
++        ld1             {v5.16B}, [x0]
++        urhadd          v2.16B,  v2.16B,  v4.16B
++        urhadd          v3.16B,  v3.16B,  v5.16B
++        sub             x0,  x0,  x2
++  .endif
++        st1             {v2.16B}, [x0], x2
++        st1             {v3.16B}, [x0], x2
++        b.ne            1b
++
++        avg             v2.16B,  v0.16B,  v1.16B
++        ld1             {v0.16B}, [x1], x2
++        avg             v3.16B,  v0.16B,  v1.16B
++  .if \avg
++        ld1             {v4.16B}, [x0], x2
++        ld1             {v5.16B}, [x0]
++        urhadd          v2.16B,  v2.16B,  v4.16B
++        urhadd          v3.16B,  v3.16B,  v5.16B
++        sub             x0,  x0,  x2
++  .endif
++        st1             {v2.16B},     [x0], x2
++        st1             {v3.16B},     [x0], x2
++
++        ret
++.endm
++
++.macro  pixels16_xy2    rnd=1, avg=0
++        sub             w3,  w3,  #2
++        ld1             {v0.16B, v1.16B}, [x1], x2
++        ld1             {v4.16B, v5.16B}, [x1], x2
++NRND    movi            v26.8H, #1
++        ext             v1.16B,  v0.16B,  v1.16B,  #1
++        ext             v5.16B,  v4.16B,  v5.16B,  #1
++        uaddl           v16.8H,  v0.8B,   v1.8B
++        uaddl2          v20.8H,  v0.16B,  v1.16B
++        uaddl           v18.8H,  v4.8B,   v5.8B
++        uaddl2          v22.8H,  v4.16B,  v5.16B
++1:      subs            w3,  w3,  #2
++        ld1             {v0.16B, v1.16B}, [x1], x2
++        add             v24.8H,  v16.8H,  v18.8H
++NRND    add             v24.8H,  v24.8H,  v26.8H
++        ext             v30.16B, v0.16B,  v1.16B,  #1
++        add             v1.8H,   v20.8H,  v22.8H
++        mshrn           v28.8B,  v24.8H,  #2
++NRND    add             v1.8H,   v1.8H,   v26.8H
++        mshrn2          v28.16B, v1.8H,   #2
++  .if \avg
++        ld1             {v16.16B},        [x0]
++        urhadd          v28.16B, v28.16B, v16.16B
++  .endif
++        uaddl           v16.8H,  v0.8B,   v30.8B
++        ld1             {v2.16B, v3.16B}, [x1], x2
++        uaddl2          v20.8H,  v0.16B,  v30.16B
++        st1             {v28.16B},        [x0], x2
++        add             v24.8H,  v16.8H,  v18.8H
++NRND    add             v24.8H,  v24.8H,  v26.8H
++        ext             v3.16B,  v2.16B,  v3.16B,  #1
++        add             v0.8H,   v20.8H,  v22.8H
++        mshrn           v30.8B,  v24.8H,  #2
++NRND    add             v0.8H,   v0.8H,   v26.8H
++        mshrn2          v30.16B, v0.8H,   #2
++  .if \avg
++        ld1             {v18.16B},        [x0]
++        urhadd          v30.16B, v30.16B, v18.16B
++  .endif
++        uaddl           v18.8H,   v2.8B,  v3.8B
++        uaddl2          v22.8H,   v2.16B, v3.16B
++        st1             {v30.16B},        [x0], x2
++        b.gt            1b
++
++        ld1             {v0.16B, v1.16B}, [x1], x2
++        add             v24.8H,  v16.8H,  v18.8H
++NRND    add             v24.8H,  v24.8H,  v26.8H
++        ext             v30.16B, v0.16B,  v1.16B,  #1
++        add             v1.8H,   v20.8H,  v22.8H
++        mshrn           v28.8B,  v24.8H,  #2
++NRND    add             v1.8H,   v1.8H,   v26.8H
++        mshrn2          v28.16B, v1.8H,   #2
++  .if \avg
++        ld1             {v16.16B},        [x0]
++        urhadd          v28.16B, v28.16B, v16.16B
++  .endif
++        uaddl           v16.8H,  v0.8B,   v30.8B
++        uaddl2          v20.8H,  v0.16B,  v30.16B
++        st1             {v28.16B},        [x0], x2
++        add             v24.8H,  v16.8H,  v18.8H
++NRND    add             v24.8H,  v24.8H,  v26.8H
++        add             v0.8H,   v20.8H,  v22.8H
++        mshrn           v30.8B,  v24.8H,  #2
++NRND    add             v0.8H,   v0.8H,   v26.8H
++        mshrn2          v30.16B, v0.8H,   #2
++  .if \avg
++        ld1             {v18.16B},        [x0]
++        urhadd          v30.16B, v30.16B, v18.16B
++  .endif
++        st1             {v30.16B},        [x0], x2
++
++        ret
++.endm
++
++.macro  pixels8         rnd=1, avg=0
++1:      ld1             {v0.8B}, [x1], x2
++        ld1             {v1.8B}, [x1], x2
++        ld1             {v2.8B}, [x1], x2
++        ld1             {v3.8B}, [x1], x2
++  .if \avg
++        ld1             {v4.8B}, [x0], x2
++        urhadd          v0.8B,  v0.8B,  v4.8B
++        ld1             {v5.8B}, [x0], x2
++        urhadd          v1.8B,  v1.8B,  v5.8B
++        ld1             {v6.8B}, [x0], x2
++        urhadd          v2.8B,  v2.8B,  v6.8B
++        ld1             {v7.8B}, [x0], x2
++        urhadd          v3.8B,  v3.8B,  v7.8B
++        sub             x0,  x0,  x2,  lsl #2
++  .endif
++        subs            w3,  w3,  #4
++        st1             {v0.8B}, [x0], x2
++        st1             {v1.8B}, [x0], x2
++        st1             {v2.8B}, [x0], x2
++        st1             {v3.8B}, [x0], x2
++        b.ne            1b
++        ret
++.endm
++
++.macro  pixels8_x2      rnd=1, avg=0
++1:      ld1             {v0.8B, v1.8B}, [x1], x2
++        ext             v1.8B,  v0.8B,  v1.8B,  #1
++        ld1             {v2.8B, v3.8B}, [x1], x2
++        ext             v3.8B,  v2.8B,  v3.8B,  #1
++        subs            w3,  w3,  #2
++        avg             v0.8B,   v0.8B,   v1.8B
++        avg             v2.8B,   v2.8B,   v3.8B
++  .if \avg
++        ld1             {v4.8B},     [x0], x2
++        ld1             {v5.8B},     [x0]
++        urhadd          v0.8B,   v0.8B,   v4.8B
++        urhadd          v2.8B,   v2.8B,   v5.8B
++        sub             x0,  x0,  x2
++  .endif
++        st1             {v0.8B}, [x0], x2
++        st1             {v2.8B}, [x0], x2
++        b.ne            1b
++        ret
++.endm
++
++.macro  pixels8_y2      rnd=1, avg=0
++        sub             w3,  w3,  #2
++        ld1             {v0.8B},  [x1], x2
++        ld1             {v1.8B},  [x1], x2
++1:      subs            w3,  w3,  #2
++        avg             v4.8B,  v0.8B,  v1.8B
++        ld1             {v0.8B},  [x1], x2
++        avg             v5.8B,  v0.8B,  v1.8B
++        ld1             {v1.8B},  [x1], x2
++  .if \avg
++        ld1             {v2.8B},     [x0], x2
++        ld1             {v3.8B},     [x0]
++        urhadd          v4.8B,  v4.8B,  v2.8B
++        urhadd          v5.8B,  v5.8B,  v3.8B
++        sub             x0,  x0,  x2
++  .endif
++        st1             {v4.8B},     [x0], x2
++        st1             {v5.8B},     [x0], x2
++        b.ne            1b
++
++        avg             v4.8B,  v0.8B,  v1.8B
++        ld1             {v0.8B},  [x1], x2
++        avg             v5.8B,  v0.8B,  v1.8B
++  .if \avg
++        ld1             {v2.8B},     [x0], x2
++        ld1             {v3.8B},     [x0]
++        urhadd          v4.8B,  v4.8B,  v2.8B
++        urhadd          v5.8B,  v5.8B,  v3.8B
++        sub             x0,  x0,  x2
++  .endif
++        st1             {v4.8B},     [x0], x2
++        st1             {v5.8B},     [x0], x2
++
++        ret
++.endm
++
++.macro  pixels8_xy2     rnd=1, avg=0
++        sub             w3,  w3,  #2
++        ld1             {v0.16B},     [x1], x2
++        ld1             {v1.16B},     [x1], x2
++NRND    movi            v19.8H, #1
++        ext             v4.16B,  v0.16B,  v4.16B,  #1
++        ext             v6.16B,  v1.16B,  v6.16B,  #1
++        uaddl           v16.8H,  v0.8B,  v4.8B
++        uaddl           v17.8H,  v1.8B,  v6.8B
++1:      subs            w3,  w3,  #2
++        ld1             {v0.16B},     [x1], x2
++        add             v18.8H, v16.8H,  v17.8H
++        ext             v4.16B,  v0.16B,  v4.16B,  #1
++NRND    add             v18.8H, v18.8H, v19.8H
++        uaddl           v16.8H,  v0.8B,  v4.8B
++        mshrn           v5.8B,  v18.8H, #2
++        ld1             {v1.16B},     [x1], x2
++        add             v18.8H, v16.8H,  v17.8H
++  .if \avg
++        ld1             {v7.8B},     [x0]
++        urhadd          v5.8B,  v5.8B,  v7.8B
++  .endif
++NRND    add             v18.8H, v18.8H, v19.8H
++        st1             {v5.8B},     [x0], x2
++        mshrn           v7.8B,  v18.8H, #2
++  .if \avg
++        ld1             {v5.8B},     [x0]
++        urhadd          v7.8B,  v7.8B,  v5.8B
++  .endif
++        ext             v6.16B,  v1.16B,  v6.16B,  #1
++        uaddl           v17.8H,  v1.8B,   v6.8B
++        st1             {v7.8B},     [x0], x2
++        b.gt            1b
++
++        ld1             {v0.16B},     [x1], x2
++        add             v18.8H, v16.8H, v17.8H
++        ext             v4.16B, v0.16B, v4.16B,  #1
++NRND    add             v18.8H, v18.8H, v19.8H
++        uaddl           v16.8H,  v0.8B, v4.8B
++        mshrn           v5.8B,  v18.8H, #2
++        add             v18.8H, v16.8H, v17.8H
++  .if \avg
++        ld1             {v7.8B},     [x0]
++        urhadd          v5.8B,  v5.8B,  v7.8B
++  .endif
++NRND    add             v18.8H, v18.8H, v19.8H
++        st1             {v5.8B},     [x0], x2
++        mshrn           v7.8B,  v18.8H, #2
++  .if \avg
++        ld1             {v5.8B},     [x0]
++        urhadd          v7.8B,  v7.8B,  v5.8B
++  .endif
++        st1             {v7.8B},     [x0], x2
++
++        ret
++.endm
++
++.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
++  .if \rnd
++    .macro avg  rd, rn, rm
++        urhadd          \rd, \rn, \rm
++    .endm
++    .macro mshrn rd, rn, rm
++        rshrn           \rd, \rn, \rm
++    .endm
++    .macro mshrn2 rd, rn, rm
++        rshrn2          \rd, \rn, \rm
++    .endm
++    .macro NRND insn:vararg
++    .endm
++  .else
++    .macro avg  rd, rn, rm
++        uhadd           \rd, \rn, \rm
++    .endm
++    .macro mshrn rd, rn, rm
++        shrn            \rd, \rn, \rm
++    .endm
++    .macro mshrn2 rd, rn, rm
++        shrn2           \rd, \rn, \rm
++    .endm
++    .macro NRND insn:vararg
++        \insn
++    .endm
++  .endif
++function ff_\pfx\name\suf\()_neon, export=1
++        \name           \rnd, \avg
++endfunc
++        .purgem         avg
++        .purgem         mshrn
++        .purgem         mshrn2
++        .purgem         NRND
++.endm
++
++.macro  pixfunc2        pfx, name, avg=0
++        pixfunc         \pfx, \name,          rnd=1, avg=\avg
++        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
++.endm
++
++function ff_put_h264_qpel16_mc00_neon, export=1
++        mov             w3,  #16
++endfunc
++
++        pixfunc         put_, pixels16,     avg=0
++        pixfunc2        put_, pixels16_x2,  avg=0
++        pixfunc2        put_, pixels16_y2,  avg=0
++        pixfunc2        put_, pixels16_xy2, avg=0
++
++function ff_avg_h264_qpel16_mc00_neon, export=1
++        mov             w3,  #16
++endfunc
++
++        pixfunc         avg_, pixels16,     avg=1
++        pixfunc2        avg_, pixels16_x2,  avg=1
++        pixfunc2        avg_, pixels16_y2,  avg=1
++        pixfunc2        avg_, pixels16_xy2, avg=1
++
++function ff_put_h264_qpel8_mc00_neon, export=1
++        mov             w3,  #8
++endfunc
++
++        pixfunc         put_, pixels8,     avg=0
++        pixfunc2        put_, pixels8_x2,  avg=0
++        pixfunc2        put_, pixels8_y2,  avg=0
++        pixfunc2        put_, pixels8_xy2, avg=0
++
++function ff_avg_h264_qpel8_mc00_neon, export=1
++        mov             w3,  #8
++endfunc
++
++        pixfunc         avg_, pixels8,     avg=1
++        pixfunc         avg_, pixels8_x2,  avg=1
++        pixfunc         avg_, pixels8_y2,  avg=1
++        pixfunc         avg_, pixels8_xy2, avg=1
+diff --git a/media/ffvpx/libavcodec/aarch64/idct.h b/media/ffvpx/libavcodec/aarch64/idct.h
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/idct.h
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_AARCH64_IDCT_H
++#define AVCODEC_AARCH64_IDCT_H
++
++#include <stdint.h>
++
++void ff_simple_idct_neon(int16_t *data);
++void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
++void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
++
++#endif /* AVCODEC_AARCH64_IDCT_H */
+diff --git a/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
+@@ -0,0 +1,41 @@
++/*
++ * ARM-NEON-optimized IDCT functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavcodec/avcodec.h"
++#include "libavcodec/idctdsp.h"
++#include "idct.h"
++
++av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
++                                     unsigned high_bit_depth)
++{
++    if (!avctx->lowres && !high_bit_depth) {
++        if (avctx->idct_algo == FF_IDCT_AUTO ||
++            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
++            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
++            c->idct_put  = ff_simple_idct_put_neon;
++            c->idct_add  = ff_simple_idct_add_neon;
++            c->idct      = ff_simple_idct_neon;
++            c->perm_type = FF_IDCT_PERM_PARTTRANS;
++        }
++    }
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/mdct_neon.S b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
+@@ -0,0 +1,323 @@
++/*
++ * AArch64 NEON optimised MDCT
++ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++function ff_imdct_half_neon, export=1
++        sub             sp,  sp,  #32
++        stp             x19, x20, [sp]
++        str             x30, [sp, #16]
++        mov             x12, #1
++        ldr             w14, [x0, #28]          // mdct_bits
++        ldr             x4,  [x0, #32]          // tcos
++        ldr             x3,  [x0, #8]           // revtab
++        lsl             x12, x12, x14           // n  = 1 << nbits
++        lsr             x14, x12, #2            // n4 = n >> 2
++        add             x7,  x2,  x12,  lsl #1
++        mov             x12, #-16
++        sub             x7,  x7,  #16
++
++        ld2             {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
++        ld2             {v0.2s,v1.2s},   [x2], #16 // d0 =m0,x d1 =m1,x
++        rev64           v17.2s, v17.2s
++        ld2             {v2.2s,v3.2s},   [x4], #16 // d2=c0,c1 d3=s0,s2
++        fmul            v6.2s,  v17.2s, v2.2s
++        fmul            v7.2s,  v0.2s,  v2.2s
++1:
++        subs            x14, x14, #2
++        ldr             w6,  [x3], #4
++        fmul            v4.2s,  v0.2s,  v3.2s
++        fmul            v5.2s,  v17.2s, v3.2s
++        fsub            v4.2s,  v6.2s,  v4.2s
++        fadd            v5.2s,  v5.2s,  v7.2s
++        ubfm            x8,  x6,  #16, #31
++        ubfm            x6,  x6,  #0,  #15
++        add             x8,  x1,  x8,  lsl #3
++        add             x6,  x1,  x6,  lsl #3
++        b.eq            2f
++        ld2             {v16.2s,v17.2s}, [x7], x12
++        ld2             {v0.2s,v1.2s},   [x2], #16
++        rev64           v17.2s, v17.2s
++        ld2             {v2.2s,v3.2s},   [x4], #16    // d2=c0,c1 d3=s0,s2
++        fmul            v6.2s,  v17.2s, v2.2s
++        fmul            v7.2s,  v0.2s,  v2.2s
++        st2             {v4.s,v5.s}[0], [x6]
++        st2             {v4.s,v5.s}[1], [x8]
++        b               1b
++2:
++        st2             {v4.s,v5.s}[0], [x6]
++        st2             {v4.s,v5.s}[1], [x8]
++
++        mov             x19, x0
++        mov             x20, x1
++        bl              X(ff_fft_calc_neon)
++
++        mov             x12, #1
++        ldr             w14, [x19, #28]          // mdct_bits
++        ldr             x4,  [x19, #32]          // tcos
++        lsl             x12, x12, x14            // n  = 1 << nbits
++        lsr             x14, x12, #3             // n8 = n >> 3
++
++        add             x4,  x4,  x14, lsl #3
++        add             x6,  x20, x14, lsl #3
++        sub             x1,  x4,  #16
++        sub             x3,  x6,  #16
++
++        mov             x7,  #-16
++        mov             x8,  x6
++        mov             x0,  x3
++
++        ld2             {v0.2s,v1.2s},  [x3], x7 // d0 =i1,r1 d1 =i0,r0
++        ld2             {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
++        ld2             {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
++3:
++        subs            x14, x14, #2
++        fmul            v7.2s,  v0.2s,  v17.2s
++        ld2             {v18.2s,v19.2s},[x4], #16    // d17=c2,c3 d19=s2,s3
++        fmul            v4.2s,  v1.2s,  v17.2s
++        fmul            v6.2s,  v21.2s, v19.2s
++        fmul            v5.2s,  v20.2s, v19.2s
++        fmul            v22.2s, v1.2s,  v16.2s
++        fmul            v23.2s, v21.2s, v18.2s
++        fmul            v24.2s, v0.2s,  v16.2s
++        fmul            v25.2s, v20.2s, v18.2s
++        fadd            v7.2s,  v7.2s,  v22.2s
++        fadd            v5.2s,  v5.2s,  v23.2s
++        fsub            v4.2s,  v4.2s,  v24.2s
++        fsub            v6.2s,  v6.2s,  v25.2s
++        b.eq            4f
++        ld2             {v0.2s,v1.2s},  [x3], x7
++        ld2             {v20.2s,v21.2s},[x6], #16
++        ld2             {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
++        rev64           v5.2s,  v5.2s
++        rev64           v7.2s,  v7.2s
++        st2             {v4.2s,v5.2s},  [x0], x7
++        st2             {v6.2s,v7.2s},  [x8], #16
++        b               3b
++4:
++        rev64           v5.2s,  v5.2s
++        rev64           v7.2s,  v7.2s
++        st2             {v4.2s,v5.2s},  [x0]
++        st2             {v6.2s,v7.2s},  [x8]
++
++        ldp             x19, x20, [sp]
++        ldr             x30, [sp, #16]
++        add             sp,  sp,  #32
++
++        ret
++endfunc
++
++function ff_imdct_calc_neon, export=1
++        sub             sp,  sp,  #32
++        stp             x19, x20, [sp]
++        str             x30, [sp, #16]
++        ldr             w3,  [x0, #28]          // mdct_bits
++        mov             x19, #1
++        mov             x20, x1
++        lsl             x19, x19, x3
++        add             x1,  x1,  x19
++
++        bl              X(ff_imdct_half_neon)
++
++        add             x0,  x20, x19,  lsl #2
++        add             x1,  x20, x19,  lsl #1
++        sub             x0,  x0,  #8
++        sub             x2,  x1,  #16
++        mov             x3,  #-16
++        mov             x6,  #-8
++1:
++        ld1             {v0.4s}, [x2], x3
++        prfum           pldl1keep, [x0, #-16]
++        rev64           v0.4s, v0.4s
++        ld1             {v2.2s,v3.2s}, [x1], #16
++        fneg            v4.4s,  v0.4s
++        prfum           pldl1keep, [x2, #-16]
++        rev64           v2.2s, v2.2s
++        rev64           v3.2s, v3.2s
++        ext             v4.16b, v4.16b, v4.16b, #8
++        st1             {v2.2s}, [x0], x6
++        st1             {v3.2s}, [x0], x6
++        st1             {v4.4s}, [x20], #16
++        subs            x19, x19,  #16
++        b.gt            1b
++
++        ldp             x19, x20, [sp], #16
++        ldr             x30, [sp], #16
++
++        ret
++endfunc
++
++
++function ff_mdct_calc_neon, export=1
++        sub             sp,  sp,  #32
++        stp             x19, x20, [sp]
++        str             x30, [sp, #16]
++
++        mov             x12, #1
++        ldr             w14, [x0, #28]          // mdct_bits
++        ldr             x4,  [x0, #32]          // tcos
++        ldr             x3,  [x0, #8]           // revtab
++        lsl             x14, x12, x14           // n  = 1 << nbits
++        add             x7,  x2,  x14           // in4u
++        sub             x9,  x7,  #16           // in4d
++        add             x2,  x7,  x14, lsl #1   // in3u
++        add             x8,  x9,  x14, lsl #1   // in3d
++        add             x5,  x4,  x14, lsl #1
++        sub             x5,  x5,  #16
++        sub             x3,  x3,  #4
++        mov             x12, #-16
++        lsr             x13, x14, #1
++
++        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
++        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
++        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
++        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
++        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
++        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
++        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
++        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
++        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
++        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
++        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
++        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
++        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
++        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
++1:
++        fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
++        ldr             w10, [x3, x13]
++        fmul            v6.2s,  v2.2s,  v20.2s      // -R*c
++        ldr             w6,  [x3, #4]!
++        fmul            v4.2s,  v2.2s,  v21.2s      // -R*s
++        fmul            v5.2s,  v0.2s,  v20.2s      //  I*c
++        fmul            v24.2s, v16.2s, v30.2s      //  R*c
++        fmul            v25.2s, v18.2s, v31.2s      // -I*s
++        fmul            v22.2s, v16.2s, v31.2s      //  R*s
++        fmul            v23.2s, v18.2s, v30.2s      //  I*c
++        subs            x14, x14, #16
++        subs            x13, x13, #8
++        fsub            v6.2s,  v6.2s,  v7.2s       // -R*c-I*s
++        fadd            v7.2s,  v4.2s,  v5.2s       // -R*s+I*c
++        fsub            v24.2s, v25.2s, v24.2s      // I*s-R*c
++        fadd            v25.2s, v22.2s, v23.2s      // R*s-I*c
++        b.eq            1f
++        mov             x12, #-16
++        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
++        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
++        fneg            v7.2s,  v7.2s               //  R*s-I*c
++        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
++        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
++        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
++        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
++        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
++        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
++        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
++        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
++        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
++        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
++        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
++        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
++        ubfm            x12, x6,  #16, #31
++        ubfm            x6,  x6,  #0,  #15
++        add             x12, x1,  x12, lsl #3
++        add             x6,  x1,  x6,  lsl #3
++        st2             {v6.s,v7.s}[0],   [x6]
++        st2             {v6.s,v7.s}[1],   [x12]
++        ubfm            x6,  x10, #16, #31
++        ubfm            x10, x10, #0,  #15
++        add             x6 , x1,  x6,  lsl #3
++        add             x10, x1,  x10, lsl #3
++        st2             {v24.s,v25.s}[0], [x10]
++        st2             {v24.s,v25.s}[1], [x6]
++        b               1b
++1:
++        fneg            v7.2s,  v7.2s           //  R*s-I*c
++        ubfm            x12, x6,  #16, #31
++        ubfm            x6,  x6,  #0,  #15
++        add             x12, x1,  x12, lsl #3
++        add             x6,  x1,  x6,  lsl #3
++        st2             {v6.s,v7.s}[0],   [x6]
++        st2             {v6.s,v7.s}[1],   [x12]
++        ubfm            x6,  x10, #16, #31
++        ubfm            x10, x10, #0,  #15
++        add             x6 , x1,  x6,  lsl #3
++        add             x10, x1,  x10, lsl #3
++        st2             {v24.s,v25.s}[0], [x10]
++        st2             {v24.s,v25.s}[1], [x6]
++
++        mov             x19, x0
++        mov             x20, x1
++        bl              X(ff_fft_calc_neon)
++
++        mov             x12, #1
++        ldr             w14, [x19, #28]         // mdct_bits
++        ldr             x4,  [x19, #32]         // tcos
++        lsl             x12, x12, x14           // n  = 1 << nbits
++        lsr             x14, x12, #3            // n8 = n >> 3
++
++        add             x4,  x4,  x14, lsl #3
++        add             x6,  x20, x14, lsl #3
++        sub             x1,  x4,  #16
++        sub             x3,  x6,  #16
++
++        mov             x7,  #-16
++        mov             x8,  x6
++        mov             x0,  x3
++
++        ld2             {v0.2s,v1.2s},   [x3], x7   // d0 =r1,i1 d1 =r0,i0
++        ld2             {v20.2s,v21.2s}, [x6], #16  // d20=r2,i2 d21=r3,i3
++        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
++1:
++        subs            x14, x14, #2
++        fmul            v7.2s,  v0.2s,  v17.2s      // r1*s1,r0*s0
++        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
++        fmul            v4.2s,  v1.2s,  v17.2s      // i1*s1,i0*s0
++        fmul            v6.2s,  v21.2s, v19.2s      // i2*s2,i3*s3
++        fmul            v5.2s,  v20.2s, v19.2s      // r2*s2,r3*s3
++        fmul            v24.2s, v0.2s,  v16.2s      // r1*c1,r0*c0
++        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
++        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
++        fmul            v23.2s, v1.2s,  v16.2s      // i1*c1,i0*c0
++        fadd            v4.2s,  v4.2s,  v24.2s      // i1*s1+r1*c1,i0*s0+r0*c0
++        fadd            v6.2s,  v6.2s,  v25.2s      // i2*s2+r2*c2,i3*s3+r3*c3
++        fsub            v5.2s,  v22.2s, v5.2s       // i2*c2-r2*s2,i3*c3-r3*s3
++        fsub            v7.2s,  v23.2s, v7.2s       // i1*c1-r1*s1,i0*c0-r0*s0
++        fneg            v4.2s,  v4.2s
++        fneg            v6.2s,  v6.2s
++        b.eq            1f
++        ld2             {v0.2s, v1.2s},  [x3], x7
++        ld2             {v20.2s,v21.2s}, [x6], #16
++        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
++        rev64           v5.2s,  v5.2s
++        rev64           v7.2s,  v7.2s
++        st2             {v4.2s,v5.2s},  [x0], x7
++        st2             {v6.2s,v7.2s},  [x8], #16
++        b               1b
++1:
++        rev64           v5.2s,  v5.2s
++        rev64           v7.2s,  v7.2s
++        st2             {v4.2s,v5.2s},  [x0]
++        st2             {v6.2s,v7.2s},  [x8]
++
++        ldp             x19, x20, [sp], #16
++        ldr             x30, [sp], #16
++        ret
++endfunc
+diff --git a/media/ffvpx/libavcodec/aarch64/neon.S b/media/ffvpx/libavcodec/aarch64/neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/neon.S
+@@ -0,0 +1,149 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++.macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
++        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
++        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
++        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
++        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
++        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
++        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
++        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
++        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B
++
++        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
++        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
++        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
++        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
++        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
++        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
++        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
++        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H
++
++        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
++        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S
++
++        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
++        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S
++
++        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
++        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S
++
++        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
++        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
++.endm
++
++.macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
++        trn1            \t0\().16B, \r0\().16B, \r1\().16B
++        trn2            \t1\().16B, \r0\().16B, \r1\().16B
++        trn1            \r1\().16B, \r2\().16B, \r3\().16B
++        trn2            \r3\().16B, \r2\().16B, \r3\().16B
++        trn1            \r0\().16B, \r4\().16B, \r5\().16B
++        trn2            \r5\().16B, \r4\().16B, \r5\().16B
++        trn1            \r2\().16B, \r6\().16B, \r7\().16B
++        trn2            \r7\().16B, \r6\().16B, \r7\().16B
++
++        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
++        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
++        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
++        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
++        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
++        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
++        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
++        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H
++
++        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
++        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S
++
++        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
++        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S
++
++        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
++        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S
++
++        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
++        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
++.endm
++
++.macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
++        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
++        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
++        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
++        trn2            \t7\().16B, \r2\().16B,  \r3\().16B
++
++        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
++        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
++        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
++        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
++.endm
++
++.macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
++        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
++        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
++        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
++        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B
++
++        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
++        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
++        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
++        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
++.endm
++
++.macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
++        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
++        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
++        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
++        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
++        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
++        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
++        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
++        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
++.endm
++
++.macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
++        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
++        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
++        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
++        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
++        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
++        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
++        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
++        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H
++
++        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
++        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
++        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
++        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
++        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
++        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
++        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
++        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S
++
++        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
++        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D
++
++        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
++        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D
++
++        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
++        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D
++
++        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
++        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
++
++.endm
+diff --git a/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
+@@ -0,0 +1,362 @@
++/*
++ * ARM NEON IDCT
++ *
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
++ *
++ * Based on Simple IDCT
++ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++#define Z1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
++#define Z4c ((1<<(COL_SHIFT-1))/Z4)
++#define ROW_SHIFT 11
++#define COL_SHIFT 20
++
++#define z1 v0.H[0]
++#define z2 v0.H[1]
++#define z3 v0.H[2]
++#define z4 v0.H[3]
++#define z5 v0.H[4]
++#define z6 v0.H[5]
++#define z7 v0.H[6]
++#define z4c v0.H[7]
++
++const   idct_coeff_neon, align=4
++        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
++endconst
++
++.macro idct_start data
++        prfm            pldl1keep, [\data]
++        mov             x10, x30
++        movrel          x3, idct_coeff_neon
++        ld1             {v0.2D}, [x3]
++.endm
++
++.macro idct_end
++        br              x10
++.endm
++
++.macro smull1 a, b, c
++        smull           \a, \b, \c
++.endm
++
++.macro smlal1 a, b, c
++        smlal           \a, \b, \c
++.endm
++
++.macro smlsl1 a, b, c
++        smlsl           \a, \b, \c
++.endm
++
++.macro idct_col4_top y1, y2, y3, y4, i, l
++        smull\i         v7.4S,  \y3\l, z2
++        smull\i         v16.4S, \y3\l, z6
++        smull\i         v17.4S, \y2\l, z1
++        add             v19.4S, v23.4S, v7.4S
++        smull\i         v18.4S, \y2\l, z3
++        add             v20.4S, v23.4S, v16.4S
++        smull\i         v5.4S,  \y2\l, z5
++        sub             v21.4S, v23.4S, v16.4S
++        smull\i         v6.4S,  \y2\l, z7
++        sub             v22.4S, v23.4S, v7.4S
++
++        smlal\i         v17.4S, \y4\l, z3
++        smlsl\i         v18.4S, \y4\l, z7
++        smlsl\i         v5.4S,  \y4\l, z1
++        smlsl\i         v6.4S,  \y4\l, z5
++.endm
++
++.macro idct_row4_neon y1, y2, y3, y4, pass
++        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
++        movi            v23.4S, #1<<2, lsl #8
++        orr             v5.16B, \y1\().16B, \y2\().16B
++        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
++        orr             v6.16B, \y3\().16B, \y4\().16B
++        orr             v5.16B, v5.16B, v6.16B
++        mov             x3, v5.D[1]
++        smlal           v23.4S, \y1\().4H, z4
++
++        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H
++
++        cmp             x3, #0
++        b.eq            \pass\()f
++
++        smull2          v7.4S, \y1\().8H, z4
++        smlal2          v17.4S, \y2\().8H, z5
++        smlsl2          v18.4S, \y2\().8H, z1
++        smull2          v16.4S, \y3\().8H, z2
++        smlal2          v5.4S, \y2\().8H, z7
++        add             v19.4S, v19.4S, v7.4S
++        sub             v20.4S, v20.4S, v7.4S
++        sub             v21.4S, v21.4S, v7.4S
++        add             v22.4S, v22.4S, v7.4S
++        smlal2          v6.4S, \y2\().8H, z3
++        smull2          v7.4S, \y3\().8H, z6
++        smlal2          v17.4S, \y4\().8H, z7
++        smlsl2          v18.4S, \y4\().8H, z5
++        smlal2          v5.4S, \y4\().8H, z3
++        smlsl2          v6.4S, \y4\().8H, z1
++        add             v19.4S, v19.4S, v7.4S
++        sub             v20.4S, v20.4S, v16.4S
++        add             v21.4S, v21.4S, v16.4S
++        sub             v22.4S, v22.4S, v7.4S
++
++\pass:  add             \y3\().4S, v19.4S, v17.4S
++        add             \y4\().4S, v20.4S, v18.4S
++        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
++        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
++        add             v7.4S, v21.4S, v5.4S
++        add             v16.4S, v22.4S, v6.4S
++        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
++        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
++        sub             v22.4S, v22.4S, v6.4S
++        sub             v19.4S, v19.4S, v17.4S
++        sub             v21.4S, v21.4S, v5.4S
++        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
++        sub             v20.4S, v20.4S, v18.4S
++        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
++        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
++        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT
++
++        trn1            v16.8H, \y1\().8H, \y2\().8H
++        trn2            v17.8H, \y1\().8H, \y2\().8H
++        trn1            v18.8H, \y3\().8H, \y4\().8H
++        trn2            v19.8H, \y3\().8H, \y4\().8H
++        trn1            \y1\().4S, v16.4S, v18.4S
++        trn1            \y2\().4S, v17.4S, v19.4S
++        trn2            \y3\().4S, v16.4S, v18.4S
++        trn2            \y4\().4S, v17.4S, v19.4S
++.endm
++
++.macro declare_idct_col4_neon i, l
++function idct_col4_neon\i
++        dup             v23.4H, z4c
++.if \i == 1
++        add             v23.4H, v23.4H, v24.4H
++.else
++        mov             v5.D[0], v24.D[1]
++        add             v23.4H, v23.4H, v5.4H
++.endif
++        smull           v23.4S, v23.4H, z4
++
++        idct_col4_top   v24, v25, v26, v27, \i, \l
++
++        mov             x4, v28.D[\i - 1]
++        mov             x5, v29.D[\i - 1]
++        cmp             x4, #0
++        b.eq            1f
++
++        smull\i         v7.4S,  v28\l,  z4
++        add             v19.4S, v19.4S, v7.4S
++        sub             v20.4S, v20.4S, v7.4S
++        sub             v21.4S, v21.4S, v7.4S
++        add             v22.4S, v22.4S, v7.4S
++
++1:      mov             x4, v30.D[\i - 1]
++        cmp             x5, #0
++        b.eq            2f
++
++        smlal\i         v17.4S, v29\l, z5
++        smlsl\i         v18.4S, v29\l, z1
++        smlal\i         v5.4S,  v29\l, z7
++        smlal\i         v6.4S,  v29\l, z3
++
++2:      mov             x5, v31.D[\i - 1]
++        cmp             x4, #0
++        b.eq            3f
++
++        smull\i         v7.4S,  v30\l, z6
++        smull\i         v16.4S, v30\l, z2
++        add             v19.4S, v19.4S, v7.4S
++        sub             v22.4S, v22.4S, v7.4S
++        sub             v20.4S, v20.4S, v16.4S
++        add             v21.4S, v21.4S, v16.4S
++
++3:      cmp             x5, #0
++        b.eq            4f
++
++        smlal\i         v17.4S, v31\l, z7
++        smlsl\i         v18.4S, v31\l, z5
++        smlal\i         v5.4S,  v31\l, z3
++        smlsl\i         v6.4S,  v31\l, z1
++
++4:      addhn           v7.4H, v19.4S, v17.4S
++        addhn2          v7.8H, v20.4S, v18.4S
++        subhn           v18.4H, v20.4S, v18.4S
++        subhn2          v18.8H, v19.4S, v17.4S
++
++        addhn           v16.4H, v21.4S, v5.4S
++        addhn2          v16.8H, v22.4S, v6.4S
++        subhn           v17.4H, v22.4S, v6.4S
++        subhn2          v17.8H, v21.4S, v5.4S
++
++        ret
++endfunc
++.endm
++
++declare_idct_col4_neon 1, .4H
++declare_idct_col4_neon 2, .8H
++
++function ff_simple_idct_put_neon, export=1
++        idct_start      x2
++
++        idct_row4_neon  v24, v25, v26, v27, 1
++        idct_row4_neon  v28, v29, v30, v31, 2
++        bl              idct_col4_neon1
++
++        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
++        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
++        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
++        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16
++
++        bl              idct_col4_neon2
++
++        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
++        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
++        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
++        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16
++
++        zip1            v16.4S, v1.4S, v2.4S
++        zip2            v17.4S, v1.4S, v2.4S
++
++        st1             {v16.D}[0], [x0], x1
++        st1             {v16.D}[1], [x0], x1
++
++        zip1            v18.4S, v3.4S, v4.4S
++        zip2            v19.4S, v3.4S, v4.4S
++
++        st1             {v17.D}[0], [x0], x1
++        st1             {v17.D}[1], [x0], x1
++        st1             {v18.D}[0], [x0], x1
++        st1             {v18.D}[1], [x0], x1
++        st1             {v19.D}[0], [x0], x1
++        st1             {v19.D}[1], [x0], x1
++
++        idct_end
++endfunc
++
++function ff_simple_idct_add_neon, export=1
++        idct_start      x2
++
++        idct_row4_neon  v24, v25, v26, v27, 1
++        idct_row4_neon  v28, v29, v30, v31, 2
++        bl              idct_col4_neon1
++
++        sshr            v1.8H, v7.8H, #COL_SHIFT-16
++        sshr            v2.8H, v16.8H, #COL_SHIFT-16
++        sshr            v3.8H, v17.8H, #COL_SHIFT-16
++        sshr            v4.8H, v18.8H, #COL_SHIFT-16
++
++        bl              idct_col4_neon2
++
++        sshr            v7.8H, v7.8H, #COL_SHIFT-16
++        sshr            v16.8H, v16.8H, #COL_SHIFT-16
++        sshr            v17.8H, v17.8H, #COL_SHIFT-16
++        sshr            v18.8H, v18.8H, #COL_SHIFT-16
++
++        mov             x9,  x0
++        ld1             {v19.D}[0], [x0], x1
++        zip1            v23.2D, v1.2D, v7.2D
++        zip2            v24.2D, v1.2D, v7.2D
++        ld1             {v19.D}[1], [x0], x1
++        zip1            v25.2D, v2.2D, v16.2D
++        zip2            v26.2D, v2.2D, v16.2D
++        ld1             {v20.D}[0], [x0], x1
++        zip1            v27.2D, v3.2D, v17.2D
++        zip2            v28.2D, v3.2D, v17.2D
++        ld1             {v20.D}[1], [x0], x1
++        zip1            v29.2D, v4.2D, v18.2D
++        zip2            v30.2D, v4.2D, v18.2D
++        ld1             {v21.D}[0], [x0], x1
++        uaddw           v23.8H, v23.8H, v19.8B
++        uaddw2          v24.8H, v24.8H, v19.16B
++        ld1             {v21.D}[1], [x0], x1
++        sqxtun          v23.8B, v23.8H
++        sqxtun2         v23.16B, v24.8H
++        ld1             {v22.D}[0], [x0], x1
++        uaddw           v24.8H, v25.8H, v20.8B
++        uaddw2          v25.8H, v26.8H, v20.16B
++        ld1             {v22.D}[1], [x0], x1
++        sqxtun          v24.8B, v24.8H
++        sqxtun2         v24.16B, v25.8H
++        st1             {v23.D}[0], [x9], x1
++        uaddw           v25.8H, v27.8H, v21.8B
++        uaddw2          v26.8H, v28.8H, v21.16B
++        st1             {v23.D}[1], [x9], x1
++        sqxtun          v25.8B, v25.8H
++        sqxtun2         v25.16B, v26.8H
++        st1             {v24.D}[0], [x9], x1
++        uaddw           v26.8H, v29.8H, v22.8B
++        uaddw2          v27.8H, v30.8H, v22.16B
++        st1             {v24.D}[1], [x9], x1
++        sqxtun          v26.8B, v26.8H
++        sqxtun2         v26.16B, v27.8H
++        st1             {v25.D}[0], [x9], x1
++        st1             {v25.D}[1], [x9], x1
++        st1             {v26.D}[0], [x9], x1
++        st1             {v26.D}[1], [x9], x1
++
++        idct_end
++endfunc
++
++function ff_simple_idct_neon, export=1
++        idct_start      x0
++
++        mov             x2,  x0
++        idct_row4_neon  v24, v25, v26, v27, 1
++        idct_row4_neon  v28, v29, v30, v31, 2
++        sub             x2, x2, #128
++        bl              idct_col4_neon1
++
++        sshr            v1.8H, v7.8H, #COL_SHIFT-16
++        sshr            v2.8H, v16.8H, #COL_SHIFT-16
++        sshr            v3.8H, v17.8H, #COL_SHIFT-16
++        sshr            v4.8H, v18.8H, #COL_SHIFT-16
++
++        bl              idct_col4_neon2
++
++        sshr            v7.8H, v7.8H, #COL_SHIFT-16
++        sshr            v16.8H, v16.8H, #COL_SHIFT-16
++        sshr            v17.8H, v17.8H, #COL_SHIFT-16
++        sshr            v18.8H, v18.8H, #COL_SHIFT-16
++
++        zip1            v23.2D, v1.2D, v7.2D
++        zip2            v24.2D, v1.2D, v7.2D
++        st1             {v23.2D,v24.2D}, [x2], #32
++        zip1            v25.2D, v2.2D, v16.2D
++        zip2            v26.2D, v2.2D, v16.2D
++        st1             {v25.2D,v26.2D}, [x2], #32
++        zip1            v27.2D, v3.2D, v17.2D
++        zip2            v28.2D, v3.2D, v17.2D
++        st1             {v27.2D,v28.2D}, [x2], #32
++        zip1            v29.2D, v4.2D, v18.2D
++        zip2            v30.2D, v4.2D, v18.2D
++        st1             {v29.2D,v30.2D}, [x2], #32
++
++        idct_end
++endfunc
+diff --git a/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
+@@ -0,0 +1,47 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/vc1dsp.h"
++
++#include "config.h"
++
++void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                int h, int x, int y);
++void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                int h, int x, int y);
++void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                int h, int x, int y);
++void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
++                                int h, int x, int y);
++
++av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
++        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
++        dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
++        dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++    }
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/videodsp.S b/media/ffvpx/libavcodec/aarch64/videodsp.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/videodsp.S
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++function ff_prefetch_aarch64, export=1
++        subs            w2,  w2,  #2
++        prfm            pldl1strm, [x0]
++        prfm            pldl1strm, [x0,  x1]
++        add             x0,  x0,  x1,  lsl #1
++        b.gt            X(ff_prefetch_aarch64)
++        ret
++endfunc
+diff --git a/media/ffvpx/libavcodec/aarch64/videodsp_init.c b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
+@@ -0,0 +1,32 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/videodsp.h"
++
++void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);
++
++av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_armv8(cpu_flags))
++        ctx->prefetch = ff_prefetch_aarch64;
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
+@@ -0,0 +1,29 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
++#define AVCODEC_AARCH64_VP9DSP_INIT_H
++
++#include "libavcodec/vp9dsp.h"
++
++void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
++void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
++
++#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
+@@ -0,0 +1,23 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#define BPP 10
++#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
++#include "vp9dsp_init_16bpp_aarch64_template.c"
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
+@@ -0,0 +1,23 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#define BPP 12
++#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
++#include "vp9dsp_init_16bpp_aarch64_template.c"
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
+@@ -0,0 +1,273 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/internal.h"
++#include "libavutil/aarch64/cpu.h"
++#include "vp9dsp_init.h"
++
++#define declare_fpel(type, sz, suffix)                                          \
++void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
++                                      const uint8_t *src, ptrdiff_t src_stride, \
++                                      int h, int mx, int my)
++
++#define decl_mc_func(op, filter, dir, sz, bpp)                                                   \
++void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
++                                                       const uint8_t *src, ptrdiff_t src_stride, \
++                                                       int h, int mx, int my)
++
++#define define_8tap_2d_fn(op, filter, sz, bpp)                                      \
++static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
++                                                const uint8_t *src,                 \
++                                                ptrdiff_t src_stride,               \
++                                                int h, int mx, int my)              \
++{                                                                                   \
++    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]);         \
++    /* We only need h + 7 lines, but the horizontal filter assumes an               \
++     * even number of rows, so filter h + 8 lines here. */                          \
++    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz,                          \
++                                             src - 3 * src_stride, src_stride,      \
++                                             h + 8, mx, 0);                         \
++    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride,                    \
++                                                temp + 3 * 2 * sz, 2 * sz,          \
++                                                h, 0, my);                          \
++}
++
++#define decl_filter_funcs(op, dir, sz, bpp)  \
++    decl_mc_func(op, regular, dir, sz, bpp); \
++    decl_mc_func(op, sharp,   dir, sz, bpp); \
++    decl_mc_func(op, smooth,  dir, sz, bpp)
++
++#define decl_mc_funcs(sz, bpp)           \
++    decl_filter_funcs(put, h,  sz, bpp); \
++    decl_filter_funcs(avg, h,  sz, bpp); \
++    decl_filter_funcs(put, v,  sz, bpp); \
++    decl_filter_funcs(avg, v,  sz, bpp); \
++    decl_filter_funcs(put, hv, sz, bpp); \
++    decl_filter_funcs(avg, hv, sz, bpp)
++
++#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64
++#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
++#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
++
++declare_fpel(copy, 128, );
++declare_fpel(copy, 64,  );
++declare_fpel(copy, 32,  );
++declare_fpel(copy, 16,  );
++declare_fpel(copy, 8,   );
++declare_fpel(avg, 64, _16);
++declare_fpel(avg, 32, _16);
++declare_fpel(avg, 16, _16);
++declare_fpel(avg, 8,  _16);
++declare_fpel(avg, 4,  _16);
++
++decl_mc_funcs(64, BPP);
++decl_mc_funcs(32, BPP);
++decl_mc_funcs(16, BPP);
++decl_mc_funcs(8, BPP);
++decl_mc_funcs(4, BPP);
++
++#define define_8tap_2d_funcs(sz, bpp)        \
++    define_8tap_2d_fn(put, regular, sz, bpp) \
++    define_8tap_2d_fn(put, sharp,   sz, bpp) \
++    define_8tap_2d_fn(put, smooth,  sz, bpp) \
++    define_8tap_2d_fn(avg, regular, sz, bpp) \
++    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
++    define_8tap_2d_fn(avg, smooth,  sz, bpp)
++
++define_8tap_2d_funcs(64, BPP)
++define_8tap_2d_funcs(32, BPP)
++define_8tap_2d_funcs(16, BPP)
++define_8tap_2d_funcs(8,  BPP)
++define_8tap_2d_funcs(4,  BPP)
++
++static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++#define init_fpel(idx1, idx2, sz, type, suffix)      \
++    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
++    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
++    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
++    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix
++
++#define init_copy(idx, sz, suffix) \
++    init_fpel(idx, 0, sz, copy, suffix)
++
++#define init_avg(idx, sz, suffix) \
++    init_fpel(idx, 1, sz, avg,  suffix)
++
++#define init_copy_avg(idx, sz1, sz2) \
++    init_copy(idx, sz2, _neon);      \
++    init_avg (idx, sz1, _16_neon)
++
++    if (have_armv8(cpu_flags)) {
++        init_copy(0, 128, _aarch64);
++        init_copy(1, 64,  _aarch64);
++        init_copy(2, 32,  _aarch64);
++    }
++
++    if (have_neon(cpu_flags)) {
++#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
++    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
++
++#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp)                                   \
++    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
++    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
++    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
++    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
++    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
++    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)
++
++#define init_mc_funcs_dirs(idx, sz, bpp)            \
++    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
++    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
++    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)
++
++
++        init_avg(0, 64, _16_neon);
++        init_avg(1, 32, _16_neon);
++        init_avg(2, 16, _16_neon);
++        init_copy_avg(3, 8, 16);
++        init_copy_avg(4, 4, 8);
++
++        init_mc_funcs_dirs(0, 64, BPP);
++        init_mc_funcs_dirs(1, 32, BPP);
++        init_mc_funcs_dirs(2, 16, BPP);
++        init_mc_funcs_dirs(3, 8,  BPP);
++        init_mc_funcs_dirs(4, 4,  BPP);
++    }
++}
++
++#define define_itxfm2(type_a, type_b, sz, bpp)                                     \
++void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst,    \
++                                                                 ptrdiff_t stride, \
++                                                                 int16_t *_block, int eob)
++#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
++
++#define define_itxfm_funcs(sz, bpp)      \
++    define_itxfm(idct,  idct,  sz, bpp); \
++    define_itxfm(iadst, idct,  sz, bpp); \
++    define_itxfm(idct,  iadst, sz, bpp); \
++    define_itxfm(iadst, iadst, sz, bpp)
++
++define_itxfm_funcs(4,  BPP);
++define_itxfm_funcs(8,  BPP);
++define_itxfm_funcs(16, BPP);
++define_itxfm(idct, idct, 32, BPP);
++define_itxfm(iwht, iwht, 4,  BPP);
++
++
++static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++#define init_itxfm2(tx, sz, bpp)                                               \
++    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
++    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
++    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
++    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
++#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
++
++#define init_idct2(tx, nm, bpp)     \
++    dsp->itxfm_add[tx][DCT_DCT]   = \
++    dsp->itxfm_add[tx][ADST_DCT]  = \
++    dsp->itxfm_add[tx][DCT_ADST]  = \
++    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
++#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
++
++        init_itxfm(TX_4X4,   4x4,   BPP);
++        init_itxfm(TX_8X8,   8x8,   BPP);
++        init_itxfm(TX_16X16, 16x16, BPP);
++        init_idct(TX_32X32, idct_idct_32x32, BPP);
++        init_idct(4,        iwht_iwht_4x4,   BPP);
++    }
++}
++
++#define define_loop_filter(dir, wd, size, bpp) \
++void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
++
++#define define_loop_filters(wd, size, bpp) \
++    define_loop_filter(h, wd, size, bpp);  \
++    define_loop_filter(v, wd, size, bpp)
++
++define_loop_filters(4,  8,  BPP);
++define_loop_filters(8,  8,  BPP);
++define_loop_filters(16, 8,  BPP);
++
++define_loop_filters(16, 16, BPP);
++
++define_loop_filters(44, 16, BPP);
++define_loop_filters(48, 16, BPP);
++define_loop_filters(84, 16, BPP);
++define_loop_filters(88, 16, BPP);
++
++static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
++    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
++
++#define init_lpf_func_16(idx, dir, bpp) \
++    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
++
++#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
++    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
++
++#define init_lpf_funcs_8_wd(idx, wd, bpp) \
++    init_lpf_func_8(idx, 0, h, wd, bpp);  \
++    init_lpf_func_8(idx, 1, v, wd, bpp)
++
++#define init_lpf_funcs_16(bpp)   \
++    init_lpf_func_16(0, h, bpp); \
++    init_lpf_func_16(1, v, bpp)
++
++#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
++    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);  \
++    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
++
++#define init_lpf_funcs_8(bpp)        \
++    init_lpf_funcs_8_wd(0, 4,  bpp); \
++    init_lpf_funcs_8_wd(1, 8,  bpp); \
++    init_lpf_funcs_8_wd(2, 16, bpp)
++
++#define init_lpf_funcs_mix2(bpp)           \
++    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
++    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
++    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
++    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
++
++        init_lpf_funcs_8(BPP);
++        init_lpf_funcs_16(BPP);
++        init_lpf_funcs_mix2(BPP);
++    }
++}
++
++av_cold void INIT_FUNC(VP9DSPContext *dsp)
++{
++    vp9dsp_mc_init_aarch64(dsp);
++    vp9dsp_loopfilter_init_aarch64(dsp);
++    vp9dsp_itxfm_init_aarch64(dsp);
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
+@@ -0,0 +1,258 @@
++/*
++ * Copyright (c) 2016 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/internal.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavcodec/vp9dsp.h"
++#include "vp9dsp_init.h"
++
++#define declare_fpel(type, sz)                                          \
++void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
++                              const uint8_t *src, ptrdiff_t src_stride, \
++                              int h, int mx, int my)
++
++#define declare_copy_avg(sz) \
++    declare_fpel(copy, sz);  \
++    declare_fpel(avg , sz)
++
++#define decl_mc_func(op, filter, dir, sz)                                                \
++void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
++                                               const uint8_t *src, ptrdiff_t src_stride, \
++                                               int h, int mx, int my)
++
++#define define_8tap_2d_fn(op, filter, sz)                                         \
++static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
++                                        const uint8_t *src, ptrdiff_t src_stride, \
++                                        int h, int mx, int my)                    \
++{                                                                                 \
++    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]);           \
++    /* We only need h + 7 lines, but the horizontal filter assumes an             \
++     * even number of rows, so filter h + 8 lines here. */                        \
++    ff_vp9_put_##filter##sz##_h_neon(temp, sz,                                    \
++                                     src - 3 * src_stride, src_stride,            \
++                                     h + 8, mx, 0);                               \
++    ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride,                          \
++                                        temp + 3 * sz, sz,                        \
++                                        h, 0, my);                                \
++}
++
++#define decl_filter_funcs(op, dir, sz)  \
++    decl_mc_func(op, regular, dir, sz); \
++    decl_mc_func(op, sharp,   dir, sz); \
++    decl_mc_func(op, smooth,  dir, sz)
++
++#define decl_mc_funcs(sz)           \
++    decl_filter_funcs(put, h,  sz); \
++    decl_filter_funcs(avg, h,  sz); \
++    decl_filter_funcs(put, v,  sz); \
++    decl_filter_funcs(avg, v,  sz); \
++    decl_filter_funcs(put, hv, sz); \
++    decl_filter_funcs(avg, hv, sz)
++
++#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
++#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
++
++declare_copy_avg(64);
++declare_copy_avg(32);
++declare_copy_avg(16);
++declare_copy_avg(8);
++declare_copy_avg(4);
++
++decl_mc_funcs(64);
++decl_mc_funcs(32);
++decl_mc_funcs(16);
++decl_mc_funcs(8);
++decl_mc_funcs(4);
++
++#define define_8tap_2d_funcs(sz)        \
++    define_8tap_2d_fn(put, regular, sz) \
++    define_8tap_2d_fn(put, sharp,   sz) \
++    define_8tap_2d_fn(put, smooth,  sz) \
++    define_8tap_2d_fn(avg, regular, sz) \
++    define_8tap_2d_fn(avg, sharp,   sz) \
++    define_8tap_2d_fn(avg, smooth,  sz)
++
++define_8tap_2d_funcs(64)
++define_8tap_2d_funcs(32)
++define_8tap_2d_funcs(16)
++define_8tap_2d_funcs(8)
++define_8tap_2d_funcs(4)
++
++static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++#define init_fpel(idx1, idx2, sz, type, suffix)      \
++    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
++    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
++    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
++    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix
++
++#define init_copy(idx, sz, suffix) \
++    init_fpel(idx, 0, sz, copy, suffix)
++
++#define init_avg(idx, sz, suffix) \
++    init_fpel(idx, 1, sz, avg,  suffix)
++
++#define init_copy_avg(idx, sz) \
++    init_copy(idx, sz, _neon); \
++    init_avg (idx, sz, _neon)
++
++    if (have_armv8(cpu_flags)) {
++        init_copy(0, 64, _aarch64);
++        init_copy(1, 32, _aarch64);
++    }
++
++    if (have_neon(cpu_flags)) {
++#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
++    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
++
++#define init_mc_funcs(idx, dir, mx, my, sz, pfx)                                   \
++    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
++    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
++    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx); \
++    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
++    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
++    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx)
++
++#define init_mc_funcs_dirs(idx, sz)            \
++    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_); \
++    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_); \
++    init_mc_funcs(idx, hv, 1, 1, sz,)
++
++        init_avg(0, 64, _neon);
++        init_avg(1, 32, _neon);
++        init_copy_avg(2, 16);
++        init_copy_avg(3, 8);
++        init_copy_avg(4, 4);
++
++        init_mc_funcs_dirs(0, 64);
++        init_mc_funcs_dirs(1, 32);
++        init_mc_funcs_dirs(2, 16);
++        init_mc_funcs_dirs(3, 8);
++        init_mc_funcs_dirs(4, 4);
++    }
++}
++
++#define define_itxfm(type_a, type_b, sz)                                   \
++void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst,    \
++                                                         ptrdiff_t stride, \
++                                                         int16_t *_block, int eob)
++
++#define define_itxfm_funcs(sz)      \
++    define_itxfm(idct,  idct,  sz); \
++    define_itxfm(iadst, idct,  sz); \
++    define_itxfm(idct,  iadst, sz); \
++    define_itxfm(iadst, iadst, sz)
++
++define_itxfm_funcs(4);
++define_itxfm_funcs(8);
++define_itxfm_funcs(16);
++define_itxfm(idct, idct, 32);
++define_itxfm(iwht, iwht, 4);
++
++
++static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++#define init_itxfm(tx, sz)                                             \
++    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
++    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
++    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
++    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
++
++#define init_idct(tx, nm)           \
++    dsp->itxfm_add[tx][DCT_DCT]   = \
++    dsp->itxfm_add[tx][ADST_DCT]  = \
++    dsp->itxfm_add[tx][DCT_ADST]  = \
++    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
++
++        init_itxfm(TX_4X4, 4x4);
++        init_itxfm(TX_8X8, 8x8);
++        init_itxfm(TX_16X16, 16x16);
++        init_idct(TX_32X32, idct_idct_32x32);
++        init_idct(4, iwht_iwht_4x4);
++    }
++}
++
++#define define_loop_filter(dir, wd, len) \
++void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
++
++#define define_loop_filters(wd, len) \
++    define_loop_filter(h, wd, len);  \
++    define_loop_filter(v, wd, len)
++
++define_loop_filters(4, 8);
++define_loop_filters(8, 8);
++define_loop_filters(16, 8);
++
++define_loop_filters(16, 16);
++
++define_loop_filters(44, 16);
++define_loop_filters(48, 16);
++define_loop_filters(84, 16);
++define_loop_filters(88, 16);
++
++static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
++        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
++        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
++        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
++        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
++        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
++
++        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
++        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
++
++        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
++        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
++        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
++        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
++        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
++        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
++        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
++        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
++    }
++}
++
++av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
++{
++    if (bpp == 10) {
++        ff_vp9dsp_init_10bpp_aarch64(dsp);
++        return;
++    } else if (bpp == 12) {
++        ff_vp9dsp_init_12bpp_aarch64(dsp);
++        return;
++    } else if (bpp != 8)
++        return;
++
++    vp9dsp_mc_init_aarch64(dsp);
++    vp9dsp_loopfilter_init_aarch64(dsp);
++    vp9dsp_itxfm_init_aarch64(dsp);
++}
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+@@ -0,0 +1,2017 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++#include "neon.S"
++
++const itxfm4_coeffs, align=4
++        .short  11585, 0, 6270, 15137
++iadst4_coeffs:
++        .short  5283, 15212, 9929, 13377
++endconst
++
++const iadst8_coeffs, align=4
++        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
++idct_coeffs:
++        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
++        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
++        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
++        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
++endconst
++
++const iadst16_coeffs, align=4
++        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
++        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
++endconst
++
++.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
++        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
++        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
++        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
++        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
++        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
++        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
++        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
++        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
++.endm
++
++// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
++// over two registers.
++.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
++        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
++        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
++
++        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
++        // while swapping the two 4x4 matrices between each other
++
++        // First step of the 4x4 transpose of r1-r7, into t0-t3
++        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
++        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
++        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
++        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
++
++        // First step of the 4x4 transpose of r8-r12, into r1-r7
++        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
++        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
++        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
++        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
++
++        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r12
++        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
++        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
++        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
++        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
++
++        // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible
++        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
++        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
++        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
++        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
++
++        // Move the outputs of trn1 back in place
++        mov             \r1\().16b,  \t0\().16b
++        mov             \r3\().16b,  \t1\().16b
++.endm
++
++// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
++// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
++// in/out are .4s registers; this can do with 4 temp registers, but is
++// more efficient if 6 temp registers are available.
++.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
++.if \neg > 0
++        neg             \tmp4\().4s, v0.4s
++.endif
++        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
++        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
++.if \neg > 0
++        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
++        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
++.else
++        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
++        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
++.endif
++.ifb \tmp5
++        rshrn           \out1\().2s, \tmp3\().2d, #14
++        rshrn2          \out1\().4s, \tmp4\().2d, #14
++        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
++        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
++        rshrn           \out2\().2s, \tmp3\().2d, #14
++        rshrn2          \out2\().4s, \tmp4\().2d, #14
++.else
++        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
++        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
++        rshrn           \out1\().2s, \tmp3\().2d, #14
++        rshrn2          \out1\().4s, \tmp4\().2d, #14
++        rshrn           \out2\().2s, \tmp5\().2d, #14
++        rshrn2          \out2\().4s, \tmp6\().2d, #14
++.endif
++.endm
++
++// Same as dmbutterfly0 above, but treating the input in in2 as zero,
++// writing the same output into both out1 and out2.
++.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
++        smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
++        smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
++        rshrn           \out1\().2s, \tmp1\().2d, #14
++        rshrn2          \out1\().4s, \tmp2\().2d, #14
++        rshrn           \out2\().2s, \tmp1\().2d, #14
++        rshrn2          \out2\().4s, \tmp2\().2d, #14
++.endm
++
++// out1,out2 = in1 * coef1 - in2 * coef2
++// out3,out4 = in1 * coef2 + in2 * coef1
++// out are 4 x .2d registers, in are 2 x .4s registers
++.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
++        smull           \out1\().2d, \in1\().2s, \coef1
++        smull2          \out2\().2d, \in1\().4s, \coef1
++        smull           \out3\().2d, \in1\().2s, \coef2
++        smull2          \out4\().2d, \in1\().4s, \coef2
++        smlsl           \out1\().2d, \in2\().2s, \coef2
++        smlsl2          \out2\().2d, \in2\().4s, \coef2
++        smlal           \out3\().2d, \in2\().2s, \coef1
++        smlal2          \out4\().2d, \in2\().4s, \coef1
++.endm
++
++// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
++// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
++// inout are 2 x .4s registers
++.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
++        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
++.if \neg > 0
++        neg             \tmp3\().2d, \tmp3\().2d
++        neg             \tmp4\().2d, \tmp4\().2d
++.endif
++        rshrn           \inout1\().2s, \tmp1\().2d,  #14
++        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
++        rshrn           \inout2\().2s, \tmp3\().2d,  #14
++        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
++.endm
++
++// Same as dmbutterfly above, but treating the input in inout2 as zero
++.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
++        smull           \tmp1\().2d, \inout1\().2s, \coef1
++        smull2          \tmp2\().2d, \inout1\().4s, \coef1
++        smull           \tmp3\().2d, \inout1\().2s, \coef2
++        smull2          \tmp4\().2d, \inout1\().4s, \coef2
++        rshrn           \inout1\().2s, \tmp1\().2d, #14
++        rshrn2          \inout1\().4s, \tmp2\().2d, #14
++        rshrn           \inout2\().2s, \tmp3\().2d, #14
++        rshrn2          \inout2\().4s, \tmp4\().2d, #14
++.endm
++
++// Same as dmbutterfly above, but treating the input in inout1 as zero
++.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
++        smull           \tmp1\().2d, \inout2\().2s, \coef2
++        smull2          \tmp2\().2d, \inout2\().4s, \coef2
++        smull           \tmp3\().2d, \inout2\().2s, \coef1
++        smull2          \tmp4\().2d, \inout2\().4s, \coef1
++        neg             \tmp1\().2d, \tmp1\().2d
++        neg             \tmp2\().2d, \tmp2\().2d
++        rshrn           \inout2\().2s, \tmp3\().2d, #14
++        rshrn2          \inout2\().4s, \tmp4\().2d, #14
++        rshrn           \inout1\().2s, \tmp1\().2d, #14
++        rshrn2          \inout1\().4s, \tmp2\().2d, #14
++.endm
++
++.macro dsmull_h out1, out2, in, coef
++        smull           \out1\().2d, \in\().2s, \coef
++        smull2          \out2\().2d, \in\().4s, \coef
++.endm
++
++.macro drshrn_h out, in1, in2, shift
++        rshrn           \out\().2s, \in1\().2d, \shift
++        rshrn2          \out\().4s, \in2\().2d, \shift
++.endm
++
++
++// out1 = in1 + in2
++// out2 = in1 - in2
++.macro butterfly_4s out1, out2, in1, in2
++        add             \out1\().4s, \in1\().4s, \in2\().4s
++        sub             \out2\().4s, \in1\().4s, \in2\().4s
++.endm
++
++// out1 = in1 - in2
++// out2 = in1 + in2
++.macro butterfly_4s_r out1, out2, in1, in2
++        sub             \out1\().4s, \in1\().4s, \in2\().4s
++        add             \out2\().4s, \in1\().4s, \in2\().4s
++.endm
++
++// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
++// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
++// out are 2 x .4s registers, in are 4 x .2d registers
++.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
++        add             \tmp1\().2d, \in1\().2d, \in3\().2d
++        add             \tmp2\().2d, \in2\().2d, \in4\().2d
++        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
++        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
++        rshrn           \out1\().2s, \tmp1\().2d,  #14
++        rshrn2          \out1\().4s, \tmp2\().2d,  #14
++        rshrn           \out2\().2s, \tmp3\().2d,  #14
++        rshrn2          \out2\().4s, \tmp4\().2d,  #14
++.endm
++
++.macro iwht4_10 c0, c1, c2, c3
++        add             \c0\().4s, \c0\().4s, \c1\().4s
++        sub             v17.4s,    \c2\().4s, \c3\().4s
++        sub             v16.4s,    \c0\().4s, v17.4s
++        sshr            v16.4s,    v16.4s,    #1
++        sub             \c2\().4s, v16.4s,    \c1\().4s
++        sub             \c1\().4s, v16.4s,    \c3\().4s
++        add             \c3\().4s, v17.4s,    \c2\().4s
++        sub             \c0\().4s, \c0\().4s, \c1\().4s
++.endm
++
++.macro iwht4_12 c0, c1, c2, c3
++        iwht4_10        \c0, \c1, \c2, \c3
++.endm
++
++.macro idct4_10 c0, c1, c2, c3
++        mul             v22.4s,    \c1\().4s, v0.s[3]
++        mul             v20.4s,    \c1\().4s, v0.s[2]
++        add             v16.4s,    \c0\().4s, \c2\().4s
++        sub             v17.4s,    \c0\().4s, \c2\().4s
++        mla             v22.4s,    \c3\().4s, v0.s[2]
++        mul             v18.4s,    v16.4s,    v0.s[0]
++        mul             v24.4s,    v17.4s,    v0.s[0]
++        mls             v20.4s,    \c3\().4s, v0.s[3]
++        srshr           v22.4s,    v22.4s,    #14
++        srshr           v18.4s,    v18.4s,    #14
++        srshr           v24.4s,    v24.4s,    #14
++        srshr           v20.4s,    v20.4s,    #14
++        add             \c0\().4s, v18.4s,    v22.4s
++        sub             \c3\().4s, v18.4s,    v22.4s
++        add             \c1\().4s, v24.4s,    v20.4s
++        sub             \c2\().4s, v24.4s,    v20.4s
++.endm
++
++.macro idct4_12 c0, c1, c2, c3
++        smull           v22.2d,    \c1\().2s, v0.s[3]
++        smull2          v23.2d,    \c1\().4s, v0.s[3]
++        smull           v20.2d,    \c1\().2s, v0.s[2]
++        smull2          v21.2d,    \c1\().4s, v0.s[2]
++        add             v16.4s,    \c0\().4s, \c2\().4s
++        sub             v17.4s,    \c0\().4s, \c2\().4s
++        smlal           v22.2d,    \c3\().2s, v0.s[2]
++        smlal2          v23.2d,    \c3\().4s, v0.s[2]
++        smull           v18.2d,    v16.2s,    v0.s[0]
++        smull2          v19.2d,    v16.4s,    v0.s[0]
++        smull           v24.2d,    v17.2s,    v0.s[0]
++        smull2          v25.2d,    v17.4s,    v0.s[0]
++        smlsl           v20.2d,    \c3\().2s, v0.s[3]
++        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
++        rshrn           v22.2s,    v22.2d,    #14
++        rshrn2          v22.4s,    v23.2d,    #14
++        rshrn           v18.2s,    v18.2d,    #14
++        rshrn2          v18.4s,    v19.2d,    #14
++        rshrn           v24.2s,    v24.2d,    #14
++        rshrn2          v24.4s,    v25.2d,    #14
++        rshrn           v20.2s,    v20.2d,    #14
++        rshrn2          v20.4s,    v21.2d,    #14
++        add             \c0\().4s, v18.4s,    v22.4s
++        sub             \c3\().4s, v18.4s,    v22.4s
++        add             \c1\().4s, v24.4s,    v20.4s
++        sub             \c2\().4s, v24.4s,    v20.4s
++.endm
++
++.macro iadst4_10 c0, c1, c2, c3
++        mul             v16.4s,    \c0\().4s, v1.s[0]
++        mla             v16.4s,    \c2\().4s, v1.s[1]
++        mla             v16.4s,    \c3\().4s, v1.s[2]
++        mul             v18.4s,    \c0\().4s, v1.s[2]
++        mls             v18.4s,    \c2\().4s, v1.s[0]
++        sub             \c0\().4s, \c0\().4s, \c2\().4s
++        mls             v18.4s,    \c3\().4s, v1.s[1]
++        add             \c0\().4s, \c0\().4s, \c3\().4s
++        mul             v22.4s,    \c1\().4s, v1.s[3]
++        mul             v20.4s,    \c0\().4s, v1.s[3]
++        add             v24.4s,    v16.4s,    v22.4s
++        add             v26.4s,    v18.4s,    v22.4s
++        srshr           \c0\().4s, v24.4s,    #14
++        add             v16.4s,    v16.4s,    v18.4s
++        srshr           \c1\().4s, v26.4s,    #14
++        sub             v16.4s,    v16.4s,    v22.4s
++        srshr           \c2\().4s, v20.4s,    #14
++        srshr           \c3\().4s, v16.4s,    #14
++.endm
++
++.macro iadst4_12 c0, c1, c2, c3
++        smull           v16.2d,    \c0\().2s, v1.s[0]
++        smull2          v17.2d,    \c0\().4s, v1.s[0]
++        smlal           v16.2d,    \c2\().2s, v1.s[1]
++        smlal2          v17.2d,    \c2\().4s, v1.s[1]
++        smlal           v16.2d,    \c3\().2s, v1.s[2]
++        smlal2          v17.2d,    \c3\().4s, v1.s[2]
++        smull           v18.2d,    \c0\().2s, v1.s[2]
++        smull2          v19.2d,    \c0\().4s, v1.s[2]
++        smlsl           v18.2d,    \c2\().2s, v1.s[0]
++        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
++        sub             \c0\().4s, \c0\().4s, \c2\().4s
++        smlsl           v18.2d,    \c3\().2s, v1.s[1]
++        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
++        add             \c0\().4s, \c0\().4s, \c3\().4s
++        smull           v22.2d,    \c1\().2s, v1.s[3]
++        smull2          v23.2d,    \c1\().4s, v1.s[3]
++        smull           v20.2d,    \c0\().2s, v1.s[3]
++        smull2          v21.2d,    \c0\().4s, v1.s[3]
++        add             v24.2d,    v16.2d,    v22.2d
++        add             v25.2d,    v17.2d,    v23.2d
++        add             v26.2d,    v18.2d,    v22.2d
++        add             v27.2d,    v19.2d,    v23.2d
++        rshrn           \c0\().2s, v24.2d,    #14
++        rshrn2          \c0\().4s, v25.2d,    #14
++        add             v16.2d,    v16.2d,    v18.2d
++        add             v17.2d,    v17.2d,    v19.2d
++        rshrn           \c1\().2s, v26.2d,    #14
++        rshrn2          \c1\().4s, v27.2d,    #14
++        sub             v16.2d,    v16.2d,    v22.2d
++        sub             v17.2d,    v17.2d,    v23.2d
++        rshrn           \c2\().2s, v20.2d,    #14
++        rshrn2          \c2\().4s, v21.2d,    #14
++        rshrn           \c3\().2s, v16.2d,    #14
++        rshrn2          \c3\().4s, v17.2d,    #14
++.endm
++
++// The public functions in this file have got the following signature:
++// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
++
++.macro itxfm_func4x4 txfm1, txfm2, bpp
++function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
++.ifc \txfm1,\txfm2
++.ifc \txfm1,idct
++        movrel          x4,  itxfm4_coeffs
++        ld1             {v0.4h}, [x4]
++        sxtl            v0.4s,  v0.4h
++.endif
++.ifc \txfm1,iadst
++        movrel          x4,  iadst4_coeffs
++        ld1             {v0.d}[1], [x4]
++        sxtl2           v1.4s,  v0.8h
++.endif
++.else
++        movrel          x4,  itxfm4_coeffs
++        ld1             {v0.8h}, [x4]
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++.endif
++
++        movi            v30.4s, #0
++        movi            v31.4s, #0
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #1
++        b.ne            1f
++        // DC-only for idct/idct
++        ld1             {v2.s}[0],  [x2]
++        smull           v2.2d,  v2.2s, v0.s[0]
++        rshrn           v2.2s,  v2.2d, #14
++        smull           v2.2d,  v2.2s, v0.s[0]
++        rshrn           v2.2s,  v2.2d, #14
++        st1             {v31.s}[0], [x2]
++        dup             v4.4s,  v2.s[0]
++        mov             v5.16b, v4.16b
++        mov             v6.16b, v4.16b
++        mov             v7.16b, v4.16b
++        b               2f
++.endif
++
++1:
++        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
++        st1             {v30.4s,v31.4s}, [x2], #32
++
++.ifc \txfm1,iwht
++        sshr            v4.4s,  v4.4s,  #2
++        sshr            v5.4s,  v5.4s,  #2
++        sshr            v6.4s,  v6.4s,  #2
++        sshr            v7.4s,  v7.4s,  #2
++.endif
++
++        \txfm1\()4_\bpp v4,  v5,  v6,  v7
++
++        st1             {v30.4s,v31.4s}, [x2], #32
++        // Transpose 4x4 with 32 bit elements
++        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
++
++        \txfm2\()4_\bpp v4,  v5,  v6,  v7
++2:
++        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
++        ld1             {v0.4h},   [x0], x1
++        ld1             {v1.4h},   [x0], x1
++.ifnc \txfm1,iwht
++        srshr           v4.4s,  v4.4s,  #4
++        srshr           v5.4s,  v5.4s,  #4
++        srshr           v6.4s,  v6.4s,  #4
++        srshr           v7.4s,  v7.4s,  #4
++.endif
++        uaddw           v4.4s,  v4.4s,  v0.4h
++        uaddw           v5.4s,  v5.4s,  v1.4h
++        ld1             {v2.4h},   [x0], x1
++        ld1             {v3.4h},   [x0], x1
++        sqxtun          v0.4h,  v4.4s
++        sqxtun2         v0.8h,  v5.4s
++        sub             x0,  x0,  x1, lsl #2
++
++        uaddw           v6.4s,  v6.4s,  v2.4h
++        umin            v0.8h,  v0.8h,  v31.8h
++        uaddw           v7.4s,  v7.4s,  v3.4h
++        st1             {v0.4h},   [x0], x1
++        sqxtun          v2.4h,  v6.4s
++        sqxtun2         v2.8h,  v7.4s
++        umin            v2.8h,  v2.8h,  v31.8h
++
++        st1             {v0.d}[1], [x0], x1
++        st1             {v2.4h},   [x0], x1
++        st1             {v2.d}[1], [x0], x1
++
++        ret
++endfunc
++.endm
++
++.macro itxfm_funcs4x4 bpp
++itxfm_func4x4 idct,  idct,  \bpp
++itxfm_func4x4 iadst, idct,  \bpp
++itxfm_func4x4 idct,  iadst, \bpp
++itxfm_func4x4 iadst, iadst, \bpp
++itxfm_func4x4 iwht,  iwht,  \bpp
++.endm
++
++itxfm_funcs4x4 10
++itxfm_funcs4x4 12
++
++function idct8x8_dc_add_neon
++        movrel          x4,  idct_coeffs
++        ld1             {v0.4h}, [x4]
++
++        movi            v1.4h,  #0
++        sxtl            v0.4s,  v0.4h
++
++        ld1             {v2.s}[0],  [x2]
++        smull           v2.2d,  v2.2s,  v0.s[0]
++        rshrn           v2.2s,  v2.2d,  #14
++        smull           v2.2d,  v2.2s,  v0.s[0]
++        rshrn           v2.2s,  v2.2d,  #14
++        st1             {v1.s}[0],  [x2]
++        dup             v2.4s,  v2.s[0]
++
++        srshr           v2.4s,  v2.4s,  #5
++
++        mov             x4,  #8
++        mov             x3,  x0
++        dup             v31.8h, w5
++1:
++        // Loop to add the constant from v2 into all 8x8 outputs
++        subs            x4,  x4,  #2
++        ld1             {v3.8h},  [x0], x1
++        ld1             {v4.8h},  [x0], x1
++        uaddw           v16.4s, v2.4s,  v3.4h
++        uaddw2          v17.4s, v2.4s,  v3.8h
++        uaddw           v18.4s, v2.4s,  v4.4h
++        uaddw2          v19.4s, v2.4s,  v4.8h
++        sqxtun          v3.4h,  v16.4s
++        sqxtun2         v3.8h,  v17.4s
++        sqxtun          v4.4h,  v18.4s
++        sqxtun2         v4.8h,  v19.4s
++        umin            v3.8h,  v3.8h,  v31.8h
++        umin            v4.8h,  v4.8h,  v31.8h
++        st1             {v3.8h},  [x3], x1
++        st1             {v4.8h},  [x3], x1
++        b.ne            1b
++
++        ret
++endfunc
++
++.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
++        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
++        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
++        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
++        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
++
++        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
++        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
++        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
++        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
++
++        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
++
++        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
++        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
++        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
++        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
++.endm
++
++.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
++        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
++        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
++
++        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
++        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
++
++        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
++        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
++
++        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
++        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
++
++        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
++        neg             \r7\().4s, \r7\().4s // r7 = out[7]
++        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
++
++        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
++        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
++
++        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
++
++        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
++        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
++
++        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
++        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
++
++        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
++        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
++.endm
++
++
++.macro itxfm_func8x8 txfm1, txfm2
++function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #1
++        b.eq            idct8x8_dc_add_neon
++.endif
++        // The iadst also uses a few coefficients from
++        // idct, so those always need to be loaded.
++.ifc \txfm1\()_\txfm2,idct_idct
++        movrel          x4,  idct_coeffs
++.else
++        movrel          x4,  iadst8_coeffs
++        ld1             {v1.8h}, [x4], #16
++        stp             d8,  d9,  [sp, #-0x10]!
++        sxtl2           v3.4s,  v1.8h
++        sxtl            v2.4s,  v1.4h
++.endif
++        ld1             {v0.8h}, [x4]
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++
++        movi            v4.4s, #0
++        movi            v5.4s, #0
++        movi            v6.4s, #0
++        movi            v7.4s, #0
++
++1:
++        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
++        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
++        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
++        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
++        sub             x2,  x2,  #256
++        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
++        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
++        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
++        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
++        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
++.else
++        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
++        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
++.endif
++
++        // Transpose 8x8 with 32 bit elements
++        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
++        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
++.else
++        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
++        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
++.endif
++2:
++        mov             x3,  x0
++        // Add into the destination
++        ld1             {v0.8h},  [x0], x1
++        srshr           v16.4s, v16.4s, #5
++        srshr           v17.4s, v17.4s, #5
++        ld1             {v1.8h},  [x0], x1
++        srshr           v18.4s, v18.4s, #5
++        srshr           v19.4s, v19.4s, #5
++        ld1             {v2.8h},  [x0], x1
++        srshr           v20.4s, v20.4s, #5
++        srshr           v21.4s, v21.4s, #5
++        uaddw           v16.4s, v16.4s, v0.4h
++        uaddw2          v17.4s, v17.4s, v0.8h
++        ld1             {v3.8h},  [x0], x1
++        srshr           v22.4s, v22.4s, #5
++        srshr           v23.4s, v23.4s, #5
++        uaddw           v18.4s, v18.4s, v1.4h
++        uaddw2          v19.4s, v19.4s, v1.8h
++        ld1             {v4.8h},  [x0], x1
++        srshr           v24.4s, v24.4s, #5
++        srshr           v25.4s, v25.4s, #5
++        uaddw           v20.4s, v20.4s, v2.4h
++        uaddw2          v21.4s, v21.4s, v2.8h
++        sqxtun          v0.4h,  v16.4s
++        sqxtun2         v0.8h,  v17.4s
++        dup             v16.8h, w5
++        ld1             {v5.8h},  [x0], x1
++        srshr           v26.4s, v26.4s, #5
++        srshr           v27.4s, v27.4s, #5
++        uaddw           v22.4s, v22.4s, v3.4h
++        uaddw2          v23.4s, v23.4s, v3.8h
++        sqxtun          v1.4h,  v18.4s
++        sqxtun2         v1.8h,  v19.4s
++        umin            v0.8h,  v0.8h,  v16.8h
++        ld1             {v6.8h},  [x0], x1
++        srshr           v28.4s, v28.4s, #5
++        srshr           v29.4s, v29.4s, #5
++        uaddw           v24.4s, v24.4s, v4.4h
++        uaddw2          v25.4s, v25.4s, v4.8h
++        sqxtun          v2.4h,  v20.4s
++        sqxtun2         v2.8h,  v21.4s
++        umin            v1.8h,  v1.8h,  v16.8h
++        ld1             {v7.8h},  [x0], x1
++        srshr           v30.4s, v30.4s, #5
++        srshr           v31.4s, v31.4s, #5
++        uaddw           v26.4s, v26.4s, v5.4h
++        uaddw2          v27.4s, v27.4s, v5.8h
++        sqxtun          v3.4h,  v22.4s
++        sqxtun2         v3.8h,  v23.4s
++        umin            v2.8h,  v2.8h,  v16.8h
++
++        st1             {v0.8h},  [x3], x1
++        uaddw           v28.4s, v28.4s, v6.4h
++        uaddw2          v29.4s, v29.4s, v6.8h
++        st1             {v1.8h},  [x3], x1
++        sqxtun          v4.4h,  v24.4s
++        sqxtun2         v4.8h,  v25.4s
++        umin            v3.8h,  v3.8h,  v16.8h
++        st1             {v2.8h},  [x3], x1
++        uaddw           v30.4s, v30.4s, v7.4h
++        uaddw2          v31.4s, v31.4s, v7.8h
++        st1             {v3.8h},  [x3], x1
++        sqxtun          v5.4h,  v26.4s
++        sqxtun2         v5.8h,  v27.4s
++        umin            v4.8h,  v4.8h,  v16.8h
++        st1             {v4.8h},  [x3], x1
++        sqxtun          v6.4h,  v28.4s
++        sqxtun2         v6.8h,  v29.4s
++        umin            v5.8h,  v5.8h,  v16.8h
++        st1             {v5.8h},  [x3], x1
++        sqxtun          v7.4h,  v30.4s
++        sqxtun2         v7.8h,  v31.4s
++        umin            v6.8h,  v6.8h,  v16.8h
++
++        st1             {v6.8h},  [x3], x1
++        umin            v7.8h,  v7.8h,  v16.8h
++        st1             {v7.8h},  [x3], x1
++
++.ifnc \txfm1\()_\txfm2,idct_idct
++        ldp             d8,  d9,  [sp], 0x10
++.endif
++        ret
++endfunc
++
++function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
++        mov             x5,  #0x03ff
++        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
++endfunc
++
++function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
++        mov             x5,  #0x0fff
++        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
++endfunc
++.endm
++
++itxfm_func8x8 idct,  idct
++itxfm_func8x8 iadst, idct
++itxfm_func8x8 idct,  iadst
++itxfm_func8x8 iadst, iadst
++
++
++function idct16x16_dc_add_neon
++        movrel          x4,  idct_coeffs
++        ld1             {v0.4h}, [x4]
++        sxtl            v0.4s,  v0.4h
++
++        movi            v1.4h,  #0
++
++        ld1             {v2.s}[0],  [x2]
++        smull           v2.2d,  v2.2s,  v0.s[0]
++        rshrn           v2.2s,  v2.2d,  #14
++        smull           v2.2d,  v2.2s,  v0.s[0]
++        rshrn           v2.2s,  v2.2d,  #14
++        st1             {v1.s}[0],  [x2]
++        dup             v2.4s,  v2.s[0]
++
++        srshr           v0.4s,  v2.4s,  #6
++
++        mov             x3, x0
++        mov             x4, #16
++        dup             v31.8h, w13
++1:
++        // Loop to add the constant from v2 into all 16x16 outputs
++        subs            x4,  x4,  #2
++        ld1             {v1.8h,v2.8h},  [x0], x1
++        uaddw           v16.4s, v0.4s,  v1.4h
++        uaddw2          v17.4s, v0.4s,  v1.8h
++        ld1             {v3.8h,v4.8h},  [x0], x1
++        uaddw           v18.4s, v0.4s,  v2.4h
++        uaddw2          v19.4s, v0.4s,  v2.8h
++        uaddw           v20.4s, v0.4s,  v3.4h
++        uaddw2          v21.4s, v0.4s,  v3.8h
++        uaddw           v22.4s, v0.4s,  v4.4h
++        uaddw2          v23.4s, v0.4s,  v4.8h
++        sqxtun          v1.4h,  v16.4s
++        sqxtun2         v1.8h,  v17.4s
++        sqxtun          v2.4h,  v18.4s
++        sqxtun2         v2.8h,  v19.4s
++        sqxtun          v3.4h,  v20.4s
++        sqxtun2         v3.8h,  v21.4s
++        sqxtun          v4.4h,  v22.4s
++        sqxtun2         v4.8h,  v23.4s
++        umin            v1.8h,  v1.8h,  v31.8h
++        umin            v2.8h,  v2.8h,  v31.8h
++        st1             {v1.8h,v2.8h},  [x3], x1
++        umin            v3.8h,  v3.8h,  v31.8h
++        umin            v4.8h,  v4.8h,  v31.8h
++        st1             {v3.8h,v4.8h},  [x3], x1
++        b.ne            1b
++
++        ret
++endfunc
++
++.macro idct16_end
++        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
++        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
++        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
++        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
++        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
++        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
++        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
++        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
++
++        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
++        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
++
++        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
++        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
++        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
++        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
++        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
++        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
++        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
++        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
++        ret
++.endm
++
++function idct16
++        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
++        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
++        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
++        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
++        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
++        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
++        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
++        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
++
++        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
++        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
++        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
++        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
++        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
++        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
++        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
++        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
++
++        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
++        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
++        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
++        idct16_end
++endfunc
++
++function idct16_half
++        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
++        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
++        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
++        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
++        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
++        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
++        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
++        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
++
++        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
++        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
++        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
++        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
++        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
++        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
++        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
++        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
++
++        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
++        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
++        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
++        idct16_end
++endfunc
++
++function idct16_quarter
++        dsmull_h        v24, v25, v19, v3.s[3]
++        dsmull_h        v4,  v5,  v17, v2.s[0]
++        dsmull_h        v7,  v6,  v18, v1.s[1]
++        dsmull_h        v30, v31, v18, v1.s[0]
++        neg             v24.2d,  v24.2d
++        neg             v25.2d,  v25.2d
++        dsmull_h        v29, v28, v17, v2.s[1]
++        dsmull_h        v26, v27, v19, v3.s[2]
++        dsmull_h        v22, v23, v16, v0.s[0]
++        drshrn_h        v24, v24, v25, #14
++        drshrn_h        v16, v4,  v5,  #14
++        drshrn_h        v7,  v7,  v6,  #14
++        drshrn_h        v6,  v30, v31, #14
++        drshrn_h        v29, v29, v28, #14
++        drshrn_h        v17, v26, v27, #14
++        drshrn_h        v28, v22, v23, #14
++
++        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
++        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
++        neg             v22.2d,  v22.2d
++        neg             v23.2d,  v23.2d
++        drshrn_h        v27, v20, v21, #14
++        drshrn_h        v21, v22, v23, #14
++        drshrn_h        v23, v18, v19, #14
++        drshrn_h        v25, v30, v31, #14
++        mov             v4.16b,  v28.16b
++        mov             v5.16b,  v28.16b
++        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
++        mov             v20.16b, v28.16b
++        idct16_end
++endfunc
++
++function iadst16
++        ld1             {v0.8h,v1.8h}, [x11]
++        sxtl            v2.4s,  v1.4h
++        sxtl2           v3.4s,  v1.8h
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++
++        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
++        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
++        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
++        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
++        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
++
++        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
++        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
++        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
++        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
++
++        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
++        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
++        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
++        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
++
++        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
++        ld1             {v0.8h}, [x10]
++        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
++        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
++
++        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
++        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
++        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
++        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
++        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
++
++        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
++        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
++        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
++        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
++
++        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
++        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
++
++        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
++        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
++
++        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
++        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
++        neg             v29.4s, v29.4s                   // v29 = out[13]
++
++        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
++        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
++
++        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
++        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
++
++        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
++        neg             v19.4s, v19.4s                   // v19 = out[3]
++        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
++
++        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
++        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
++
++        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
++        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
++        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
++        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
++
++        neg             v31.4s,  v5.4s                    // v31 = out[15]
++        neg             v17.4s,  v3.4s                    // v17 = out[1]
++
++        mov             v16.16b, v2.16b
++        mov             v30.16b, v4.16b
++        ret
++endfunc
++
++// Helper macros; we can't use these expressions directly within
++// e.g. .irp due to the extra concatenation \(). Therefore wrap
++// them in macros to allow using .irp below.
++.macro load i, src, inc
++        ld1             {v\i\().4s},  [\src], \inc
++.endm
++.macro store i, dst, inc
++        st1             {v\i\().4s},  [\dst], \inc
++.endm
++.macro movi_v i, size, imm
++        movi            v\i\()\size,  \imm
++.endm
++.macro load_clear i, src, inc
++        ld1             {v\i\().4s}, [\src]
++        st1             {v4.4s},  [\src], \inc
++.endm
++
++.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
++        srshr           \coef0, \coef0, #6
++        ld1             {v4.4h},   [x0], x1
++        srshr           \coef1, \coef1, #6
++        ld1             {v4.d}[1], [x3], x1
++        srshr           \coef2, \coef2, #6
++        ld1             {v5.4h},   [x0], x1
++        srshr           \coef3, \coef3, #6
++        uaddw           \coef0, \coef0, v4.4h
++        ld1             {v5.d}[1], [x3], x1
++        srshr           \coef4, \coef4, #6
++        uaddw2          \coef1, \coef1, v4.8h
++        ld1             {v6.4h},   [x0], x1
++        srshr           \coef5, \coef5, #6
++        uaddw           \coef2, \coef2, v5.4h
++        ld1             {v6.d}[1], [x3], x1
++        sqxtun          v4.4h,  \coef0
++        srshr           \coef6, \coef6, #6
++        uaddw2          \coef3, \coef3, v5.8h
++        ld1             {v7.4h},   [x0], x1
++        sqxtun2         v4.8h,  \coef1
++        srshr           \coef7, \coef7, #6
++        uaddw           \coef4, \coef4, v6.4h
++        ld1             {v7.d}[1], [x3], x1
++        umin            v4.8h,  v4.8h,  v8.8h
++        sub             x0,  x0,  x1, lsl #2
++        sub             x3,  x3,  x1, lsl #2
++        sqxtun          v5.4h,  \coef2
++        uaddw2          \coef5, \coef5, v6.8h
++        st1             {v4.4h},   [x0], x1
++        sqxtun2         v5.8h,  \coef3
++        uaddw           \coef6, \coef6, v7.4h
++        st1             {v4.d}[1], [x3], x1
++        umin            v5.8h,  v5.8h,  v8.8h
++        sqxtun          v6.4h,  \coef4
++        uaddw2          \coef7, \coef7, v7.8h
++        st1             {v5.4h},   [x0], x1
++        sqxtun2         v6.8h,  \coef5
++        st1             {v5.d}[1], [x3], x1
++        umin            v6.8h,  v6.8h,  v8.8h
++        sqxtun          v7.4h,  \coef6
++        st1             {v6.4h},   [x0], x1
++        sqxtun2         v7.8h,  \coef7
++        st1             {v6.d}[1], [x3], x1
++        umin            v7.8h,  v7.8h,  v8.8h
++        st1             {v7.4h},   [x0], x1
++        st1             {v7.d}[1], [x3], x1
++.endm
++
++// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
++// transpose into a horizontal 16x4 slice and store.
++// x0 = dst (temp buffer)
++// x1 = slice offset
++// x2 = src
++// x9 = input stride
++.macro itxfm16_1d_funcs txfm
++function \txfm\()16_1d_4x16_pass1_neon
++        mov             x14, x30
++
++        movi            v4.4s, #0
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load_clear      \i,  x2,  x9
++.endr
++
++        bl              \txfm\()16
++
++        // Do four 4x4 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
++        // contain the four transposed 4x4 blocks.
++        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
++        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
++        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
++        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
++
++        // Store the transposed 4x4 blocks horizontally.
++        cmp             x1,  #12
++        b.eq            1f
++.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
++        store           \i,  x0,  #16
++.endr
++        br              x14
++1:
++        // Special case: For the last input column (x1 == 12),
++        // which would be stored as the last row in the temp buffer,
++        // don't store the first 4x4 block, but keep it in registers
++        // for the first slice of the second pass (where it is the
++        // last 4x4 block).
++        add             x0,  x0,  #16
++        st1             {v20.4s},  [x0], #16
++        st1             {v24.4s},  [x0], #16
++        st1             {v28.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v21.4s},  [x0], #16
++        st1             {v25.4s},  [x0], #16
++        st1             {v29.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v22.4s},  [x0], #16
++        st1             {v26.4s},  [x0], #16
++        st1             {v30.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v23.4s},  [x0], #16
++        st1             {v27.4s},  [x0], #16
++        st1             {v31.4s},  [x0], #16
++
++        mov             v28.16b, v16.16b
++        mov             v29.16b, v17.16b
++        mov             v30.16b, v18.16b
++        mov             v31.16b, v19.16b
++        br              x14
++endfunc
++
++// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
++// load the destination pixels (from a similar 4x16 slice), add and store back.
++// x0 = dst
++// x1 = dst stride
++// x2 = src (temp buffer)
++// x3 = slice offset
++// x9 = temp buffer stride
++function \txfm\()16_1d_4x16_pass2_neon
++        mov             x14, x30
++
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
++        load            \i,  x2,  x9
++.endr
++        cbz             x3,  1f
++.irp i, 28, 29, 30, 31
++        load            \i,  x2,  x9
++.endr
++1:
++
++        add             x3,  x0,  x1
++        lsl             x1,  x1,  #1
++        bl              \txfm\()16
++
++        dup             v8.8h, w13
++        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
++        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
++
++        br              x14
++endfunc
++.endm
++
++itxfm16_1d_funcs idct
++itxfm16_1d_funcs iadst
++
++// This is the minimum eob value for each subpartition, in increments of 4
++const min_eob_idct_idct_16, align=4
++        .short  0, 10, 38, 89
++endconst
++
++.macro itxfm_func16x16 txfm1, txfm2
++function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #1
++        b.eq            idct16x16_dc_add_neon
++.endif
++        mov             x15, x30
++        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
++.ifnc \txfm1\()_\txfm2,idct_idct
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++.endif
++        stp             d8,  d9,  [sp, #-0x10]!
++
++        sub             sp,  sp,  #1024
++
++        mov             x4,  x0
++        mov             x5,  x1
++        mov             x6,  x2
++
++        movrel          x10, idct_coeffs
++.ifnc \txfm1\()_\txfm2,idct_idct
++        movrel          x11, iadst16_coeffs
++.endif
++.ifc \txfm1,idct
++        ld1             {v0.8h,v1.8h}, [x10]
++        sxtl            v2.4s,  v1.4h
++        sxtl2           v3.4s,  v1.8h
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++.endif
++        mov             x9,  #64
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #10
++        b.le            idct16x16_quarter_add_16_neon
++        cmp             w3,  #38
++        b.le            idct16x16_half_add_16_neon
++
++        movrel          x12, min_eob_idct_idct_16, 2
++.endif
++
++.irp i, 0, 4, 8, 12
++        add             x0,  sp,  #(\i*64)
++.ifc \txfm1\()_\txfm2,idct_idct
++.if \i > 0
++        ldrh            w1,  [x12], #2
++        cmp             w3,  w1
++        mov             x1,  #(16 - \i)/4
++        b.le            1f
++.endif
++.endif
++        mov             x1,  #\i
++        add             x2,  x6,  #(\i*4)
++        bl              \txfm1\()16_1d_4x16_pass1_neon
++.endr
++.ifc \txfm1\()_\txfm2,iadst_idct
++        ld1             {v0.8h,v1.8h}, [x10]
++        sxtl            v2.4s,  v1.4h
++        sxtl2           v3.4s,  v1.8h
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++.endif
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        b               3f
++1:
++        // Set v28-v31 to zero, for the in-register passthrough of
++        // coefficients to pass 2.
++        movi            v28.4s,  #0
++        movi            v29.4s,  #0
++        movi            v30.4s,  #0
++        movi            v31.4s,  #0
++2:
++        subs            x1,  x1,  #1
++.rept 4
++        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
++.endr
++        b.ne            2b
++3:
++.endif
++
++.irp i, 0, 4, 8, 12
++        add             x0,  x4,  #(\i*2)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*4)
++        mov             x3,  #\i
++        bl              \txfm2\()16_1d_4x16_pass2_neon
++.endr
++
++        add             sp,  sp,  #1024
++        ldp             d8,  d9,  [sp], 0x10
++.ifnc \txfm1\()_\txfm2,idct_idct
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++.endif
++        br              x15
++endfunc
++
++function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
++        mov             x13, #0x03ff
++        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
++endfunc
++
++function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
++        mov             x13, #0x0fff
++        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
++endfunc
++.endm
++
++itxfm_func16x16 idct,  idct
++itxfm_func16x16 iadst, idct
++itxfm_func16x16 idct,  iadst
++itxfm_func16x16 iadst, iadst
++
++function idct16_1d_4x16_pass1_quarter_neon
++        mov             x14, x30
++
++        movi            v4.4s, #0
++.irp i, 16, 17, 18, 19
++        load_clear      \i,  x2,  x9
++.endr
++
++        bl              idct16_quarter
++
++        // Do four 4x4 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
++        // contain the four transposed 4x4 blocks.
++        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
++        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
++        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
++        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
++
++        // Store the transposed 4x4 blocks horizontally.
++        // The first 4x4 block is kept in registers for the second pass,
++        // store the rest in the temp buffer.
++        add             x0,  x0,  #16
++        st1             {v20.4s},  [x0], #16
++        st1             {v24.4s},  [x0], #16
++        st1             {v28.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v21.4s},  [x0], #16
++        st1             {v25.4s},  [x0], #16
++        st1             {v29.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v22.4s},  [x0], #16
++        st1             {v26.4s},  [x0], #16
++        st1             {v30.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v23.4s},  [x0], #16
++        st1             {v27.4s},  [x0], #16
++        st1             {v31.4s},  [x0], #16
++        br              x14
++endfunc
++
++function idct16_1d_4x16_pass2_quarter_neon
++        mov             x14, x30
++
++        // Only load the top 4 lines, and only do it for the later slices.
++        // For the first slice, v16-v19 are kept in registers from the first pass.
++        cbz             x3,  1f
++.irp i, 16, 17, 18, 19
++        load            \i,  x2,  x9
++.endr
++1:
++
++        add             x3,  x0,  x1
++        lsl             x1,  x1,  #1
++        bl              idct16_quarter
++
++        dup             v8.8h, w13
++        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
++        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
++
++        br              x14
++endfunc
++
++function idct16_1d_4x16_pass1_half_neon
++        mov             x14, x30
++
++        movi            v4.4s, #0
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load_clear      \i,  x2,  x9
++.endr
++
++        bl              idct16_half
++
++        // Do four 4x4 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
++        // contain the four transposed 4x4 blocks.
++        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
++        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
++        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
++        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
++
++        // Store the transposed 4x4 blocks horizontally.
++        cmp             x1,  #4
++        b.eq            1f
++.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
++        store           \i,  x0,  #16
++.endr
++        br              x14
++1:
++        // Special case: For the second input column (x1 == 4),
++        // which would be stored as the second row in the temp buffer,
++        // don't store the first 4x4 block, but keep it in registers
++        // for the first slice of the second pass (where it is the
++        // second 4x4 block).
++        add             x0,  x0,  #16
++        st1             {v20.4s},  [x0], #16
++        st1             {v24.4s},  [x0], #16
++        st1             {v28.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v21.4s},  [x0], #16
++        st1             {v25.4s},  [x0], #16
++        st1             {v29.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v22.4s},  [x0], #16
++        st1             {v26.4s},  [x0], #16
++        st1             {v30.4s},  [x0], #16
++        add             x0,  x0,  #16
++        st1             {v23.4s},  [x0], #16
++        st1             {v27.4s},  [x0], #16
++        st1             {v31.4s},  [x0], #16
++
++        mov             v20.16b, v16.16b
++        mov             v21.16b, v17.16b
++        mov             v22.16b, v18.16b
++        mov             v23.16b, v19.16b
++        br              x14
++endfunc
++
++function idct16_1d_4x16_pass2_half_neon
++        mov             x14, x30
++
++.irp i, 16, 17, 18, 19
++        load            \i,  x2,  x9
++.endr
++        cbz             x3,  1f
++.irp i, 20, 21, 22, 23
++        load            \i,  x2,  x9
++.endr
++1:
++
++        add             x3,  x0,  x1
++        lsl             x1,  x1,  #1
++        bl              idct16_half
++
++        dup             v8.8h, w13
++        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
++        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
++
++        br              x14
++endfunc
++
++.macro idct16_partial size
++function idct16x16_\size\()_add_16_neon
++        add             x0,  sp,  #(0*64)
++        mov             x1,  #0
++        add             x2,  x6,  #(0*4)
++        bl              idct16_1d_4x16_pass1_\size\()_neon
++.ifc \size,half
++        add             x0,  sp,  #(4*64)
++        mov             x1,  #4
++        add             x2,  x6,  #(4*4)
++        bl              idct16_1d_4x16_pass1_\size\()_neon
++.endif
++
++.irp i, 0, 4, 8, 12
++        add             x0,  x4,  #(\i*2)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*4)
++        mov             x3,  #\i
++        bl              idct16_1d_4x16_pass2_\size\()_neon
++.endr
++
++        add             sp,  sp,  #1024
++        ldp             d8,  d9,  [sp], 0x10
++        br              x15
++endfunc
++.endm
++
++idct16_partial quarter
++idct16_partial half
++
++function idct32x32_dc_add_neon
++        movrel          x4,  idct_coeffs
++        ld1             {v0.4h}, [x4]
++        sxtl            v0.4s,  v0.4h
++
++        movi            v1.4h,  #0
++
++        ld1             {v2.s}[0],  [x2]
++        smull           v2.2d,  v2.2s,  v0.s[0]
++        rshrn           v2.2s,  v2.2d,  #14
++        smull           v2.2d,  v2.2s,  v0.s[0]
++        rshrn           v2.2s,  v2.2d,  #14
++        st1             {v1.s}[0],  [x2]
++        dup             v2.4s,  v2.s[0]
++
++        srshr           v0.4s,  v2.4s,  #6
++
++        mov             x3,  x0
++        mov             x4,  #32
++        sub             x1,  x1,  #32
++        dup             v31.8h, w13
++1:
++        // Loop to add the constant v0 into all 32x32 outputs
++        subs            x4,  x4,  #1
++        ld1             {v1.8h,v2.8h},  [x0], #32
++        uaddw           v16.4s, v0.4s,  v1.4h
++        uaddw2          v17.4s, v0.4s,  v1.8h
++        ld1             {v3.8h,v4.8h},  [x0], x1
++        uaddw           v18.4s, v0.4s,  v2.4h
++        uaddw2          v19.4s, v0.4s,  v2.8h
++        uaddw           v20.4s, v0.4s,  v3.4h
++        uaddw2          v21.4s, v0.4s,  v3.8h
++        uaddw           v22.4s, v0.4s,  v4.4h
++        uaddw2          v23.4s, v0.4s,  v4.8h
++        sqxtun          v1.4h,  v16.4s
++        sqxtun2         v1.8h,  v17.4s
++        sqxtun          v2.4h,  v18.4s
++        sqxtun2         v2.8h,  v19.4s
++        sqxtun          v3.4h,  v20.4s
++        sqxtun2         v3.8h,  v21.4s
++        sqxtun          v4.4h,  v22.4s
++        sqxtun2         v4.8h,  v23.4s
++        umin            v1.8h,  v1.8h,  v31.8h
++        umin            v2.8h,  v2.8h,  v31.8h
++        st1             {v1.8h,v2.8h},  [x3], #32
++        umin            v3.8h,  v3.8h,  v31.8h
++        umin            v4.8h,  v4.8h,  v31.8h
++        st1             {v3.8h,v4.8h},  [x3], x1
++        b.ne            1b
++
++        ret
++endfunc
++
++.macro idct32_end
++        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
++        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
++        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
++        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
++        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
++        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
++        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
++        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
++
++        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
++        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
++        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
++        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
++
++        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
++        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
++        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
++        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
++        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
++        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
++        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
++        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
++
++        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
++        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
++        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
++        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
++        ret
++.endm
++
++function idct32_odd
++        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
++        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
++        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
++        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
++        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
++        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
++        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
++        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
++
++        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
++        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
++        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
++        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
++        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
++        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
++        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
++        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
++
++        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
++        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
++        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
++        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
++        idct32_end
++endfunc
++
++function idct32_odd_half
++        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
++        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
++        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
++        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
++        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
++        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
++        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
++        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
++
++        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
++        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
++        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
++        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
++        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
++        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
++        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
++        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
++
++        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
++        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
++        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
++        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
++        idct32_end
++endfunc
++
++function idct32_odd_quarter
++        dsmull_h        v4,  v5,  v16, v10.s[0]
++        dsmull_h        v28, v29, v19, v11.s[3]
++        dsmull_h        v30, v31, v16, v10.s[1]
++        dsmull_h        v22, v23, v17, v13.s[2]
++        dsmull_h        v7,  v6,  v17, v13.s[3]
++        dsmull_h        v26, v27, v19, v11.s[2]
++        dsmull_h        v20, v21, v18, v12.s[0]
++        dsmull_h        v24, v25, v18, v12.s[1]
++
++        neg             v28.2d, v28.2d
++        neg             v29.2d, v29.2d
++        neg             v7.2d,  v7.2d
++        neg             v6.2d,  v6.2d
++
++        drshrn_h        v4,  v4,  v5,  #14
++        drshrn_h        v5,  v28, v29, #14
++        drshrn_h        v29, v30, v31, #14
++        drshrn_h        v28, v22, v23, #14
++        drshrn_h        v7,  v7,  v6,  #14
++        drshrn_h        v31, v26, v27, #14
++        drshrn_h        v6,  v20, v21, #14
++        drshrn_h        v30, v24, v25, #14
++
++        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
++        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
++        drshrn_h        v23, v16, v17, #14
++        drshrn_h        v24, v18, v19, #14
++        neg             v20.2d, v20.2d
++        neg             v21.2d, v21.2d
++        drshrn_h        v27, v27, v26, #14
++        drshrn_h        v20, v20, v21, #14
++        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
++        drshrn_h        v21, v16, v17, #14
++        drshrn_h        v26, v18, v19, #14
++        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
++        drshrn_h        v25, v16, v17, #14
++        neg             v18.2d, v18.2d
++        neg             v19.2d, v19.2d
++        drshrn_h        v22, v18, v19, #14
++
++        idct32_end
++endfunc
++
++.macro idct32_funcs suffix
++// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
++// The 32-point IDCT can be decomposed into two 16-point IDCTs;
++// a normal IDCT16 with every other input component (the even ones, with
++// each output written twice), followed by a separate 16-point IDCT
++// of the odd inputs, added/subtracted onto the outputs of the first idct16.
++// x0 = dst (temp buffer)
++// x1 = unused
++// x2 = src
++// x9 = double input stride
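++// (Roughly, each pass-1 call handles one such 4x32 slice: it runs the 1-D
++// transform down the columns, transposes, and writes the result to the
++// temp buffer; pass 2 below then does the second 1-D transform row-wise.)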
++function idct32_1d_4x32_pass1\suffix\()_neon
++        mov             x14, x30
++
++        movi            v4.4s,  #0
++
++        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load_clear      \i, x2, x9
++.endr
++.endif
++
++        bl              idct16\suffix
++
++        // Do four 4x4 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
++        // contain the four transposed 4x4 blocks.
++        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
++        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
++        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
++        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
++
++        // Store the registers a, b, c, d horizontally, followed by the
++        // same registers d, c, b, a mirrored.
++.macro store_rev a, b, c, d
++        // There's no rev128 instruction, but we reverse each 64 bit
++        // half, and then flip them using an ext with 8 bytes offset.
++        rev64           v7.4s, \d
++        st1             {\a},  [x0], #16
++        ext             v7.16b, v7.16b, v7.16b, #8
++        st1             {\b},  [x0], #16
++        rev64           v6.4s, \c
++        st1             {\c},  [x0], #16
++        ext             v6.16b, v6.16b, v6.16b, #8
++        st1             {\d},  [x0], #16
++        rev64           v5.4s, \b
++        st1             {v7.4s},  [x0], #16
++        ext             v5.16b, v5.16b, v5.16b, #8
++        st1             {v6.4s},  [x0], #16
++        rev64           v4.4s, \a
++        st1             {v5.4s},  [x0], #16
++        ext             v4.16b, v4.16b, v4.16b, #8
++        st1             {v4.4s},  [x0], #16
++.endm
++        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
++        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
++        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
++        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
++        sub             x0,  x0,  #512
++.purgem store_rev
++
++        // Move x2 back to the start of the input, and move
++        // to the first odd row
++.ifb \suffix
++        sub             x2,  x2,  x9, lsl #4
++.endif
++.ifc \suffix,_quarter
++        sub             x2,  x2,  x9, lsl #2
++.endif
++.ifc \suffix,_half
++        sub             x2,  x2,  x9, lsl #3
++.endif
++        add             x2,  x2,  #128
++
++        movi            v4.4s,  #0
++        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load_clear      \i, x2, x9
++.endr
++.endif
++
++        bl              idct32_odd\suffix
++
++        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
++        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
++        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
++        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
++
++        // Store the registers a, b, c, d horizontally,
++        // first adding them into the output, and then the mirrored
++        // d, c, b, a, subtracted from the output.
++.macro store_rev a, b, c, d, a16b, b16b
++        ld1             {v4.4s},  [x0]
++        rev64           v9.4s, \d
++        add             v4.4s, v4.4s, \a
++        st1             {v4.4s},  [x0], #16
++        rev64           v8.4s, \c
++        ld1             {v4.4s},  [x0]
++        ext             v9.16b, v9.16b, v9.16b, #8
++        add             v4.4s, v4.4s, \b
++        st1             {v4.4s},  [x0], #16
++        ext             v8.16b, v8.16b, v8.16b, #8
++        ld1             {v4.4s},  [x0]
++        rev64           \b, \b
++        add             v4.4s, v4.4s, \c
++        st1             {v4.4s},  [x0], #16
++        rev64           \a, \a
++        ld1             {v4.4s},  [x0]
++        ext             \b16b, \b16b, \b16b, #8
++        add             v4.4s, v4.4s, \d
++        st1             {v4.4s},  [x0], #16
++        ext             \a16b, \a16b, \a16b, #8
++        ld1             {v4.4s},  [x0]
++        sub             v4.4s, v4.4s, v9.4s
++        st1             {v4.4s},  [x0], #16
++        ld1             {v4.4s},  [x0]
++        sub             v4.4s, v4.4s, v8.4s
++        st1             {v4.4s},  [x0], #16
++        ld1             {v4.4s},  [x0]
++        sub             v4.4s, v4.4s, \b
++        st1             {v4.4s},  [x0], #16
++        ld1             {v4.4s},  [x0]
++        sub             v4.4s, v4.4s, \a
++        st1             {v4.4s},  [x0], #16
++.endm
++
++        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
++        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
++        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
++        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
++.purgem store_rev
++        br              x14
++endfunc
++
++// This is mostly the same as 4x32_pass1, but without the transpose; it
++// uses the source as a temp buffer between the two idct passes, and
++// adds into the destination.
++// x0 = dst
++// x1 = dst stride
++// x2 = src (temp buffer)
++// x7 = negative double temp buffer stride
++// x9 = double temp buffer stride
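++// (x7, the negated stride, is used by the neg=1 variant of load_acc_store
++// below to walk the temp buffer backwards while the mirrored second half
++// of the outputs is subtracted rather than added.)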
++function idct32_1d_4x32_pass2\suffix\()_neon
++        mov             x14, x30
++
++        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #4
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #2
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #3
++.endif
++
++        bl              idct16\suffix
++
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        store           \i, x2, x9
++.endr
++
++        sub             x2,  x2,  x9, lsl #4
++        add             x2,  x2,  #128
++
++        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #4
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #2
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #3
++.endif
++        sub             x2,  x2,  #128
++
++        bl              idct32_odd\suffix
++
++.macro load_acc_store a, b, c, d, neg=0
++.if \neg == 0
++        ld1             {v4.4s},  [x2], x9
++        ld1             {v5.4s},  [x2], x9
++        add             v4.4s, v4.4s, \a
++        ld1             {v6.4s},  [x2], x9
++        add             v5.4s, v5.4s, \b
++        ld1             {v7.4s},  [x2], x9
++        add             v6.4s, v6.4s, \c
++        add             v7.4s, v7.4s, \d
++.else
++        ld1             {v4.4s},  [x2], x7
++        ld1             {v5.4s},  [x2], x7
++        sub             v4.4s, v4.4s, \a
++        ld1             {v6.4s},  [x2], x7
++        sub             v5.4s, v5.4s, \b
++        ld1             {v7.4s},  [x2], x7
++        sub             v6.4s, v6.4s, \c
++        sub             v7.4s, v7.4s, \d
++.endif
++        ld1             {v8.4h},   [x0], x1
++        ld1             {v8.d}[1], [x0], x1
++        srshr           v4.4s, v4.4s, #6
++        ld1             {v9.4h},   [x0], x1
++        srshr           v5.4s, v5.4s, #6
++        uaddw           v4.4s, v4.4s, v8.4h
++        ld1             {v9.d}[1], [x0], x1
++        srshr           v6.4s, v6.4s, #6
++        uaddw2          v5.4s, v5.4s, v8.8h
++        srshr           v7.4s, v7.4s, #6
++        sub             x0,  x0,  x1, lsl #2
++        uaddw           v6.4s, v6.4s, v9.4h
++        sqxtun          v4.4h, v4.4s
++        uaddw2          v7.4s, v7.4s, v9.8h
++        sqxtun2         v4.8h, v5.4s
++        umin            v4.8h, v4.8h, v15.8h
++        st1             {v4.4h},   [x0], x1
++        sqxtun          v5.4h, v6.4s
++        st1             {v4.d}[1], [x0], x1
++        sqxtun2         v5.8h, v7.4s
++        umin            v5.8h, v5.8h, v15.8h
++        st1             {v5.4h},   [x0], x1
++        st1             {v5.d}[1], [x0], x1
++.endm
++        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
++        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
++        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
++        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
++        sub             x2,  x2,  x9
++        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
++        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
++        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
++        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
++.purgem load_acc_store
++        br              x14
++endfunc
++.endm
++
++idct32_funcs
++idct32_funcs _quarter
++idct32_funcs _half
++
++const min_eob_idct_idct_32, align=4
++        .short  0, 9, 34, 70, 135, 240, 336, 448
++endconst
++
++function vp9_idct_idct_32x32_add_16_neon
++        cmp             w3,  #1
++        b.eq            idct32x32_dc_add_neon
++
++        movrel          x10, idct_coeffs
++
++        mov             x15, x30
++        stp             d8,  d9,  [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d14, d15, [sp, #-0x10]!
++
++        sub             sp,  sp,  #4096
++
++        mov             x4,  x0
++        mov             x5,  x1
++        mov             x6,  x2
++
++        // Double stride of the input, since we only read every other line
++        mov             x9,  #256
++        neg             x7,  x9
++
++        ld1             {v0.8h,v1.8h},   [x10], #32
++        sxtl            v2.4s,  v1.4h
++        sxtl2           v3.4s,  v1.8h
++        sxtl2           v1.4s,  v0.8h
++        sxtl            v0.4s,  v0.4h
++        ld1             {v10.8h,v11.8h}, [x10]
++        sxtl            v12.4s, v11.4h
++        sxtl2           v13.4s, v11.8h
++        sxtl2           v11.4s, v10.8h
++        sxtl            v10.4s, v10.4h
++
++        dup             v15.8h, w13
++
++        cmp             w3,  #34
++        b.le            idct32x32_quarter_add_16_neon
++        cmp             w3,  #135
++        b.le            idct32x32_half_add_16_neon
++
++        movrel          x12, min_eob_idct_idct_32, 2
++
++.irp i, 0, 4, 8, 12, 16, 20, 24, 28
++        add             x0,  sp,  #(\i*128)
++.if \i > 0
++        ldrh            w1,  [x12], #2
++        cmp             w3,  w1
++        mov             x1,  #(32 - \i)/4
++        b.le            1f
++.endif
++        add             x2,  x6,  #(\i*4)
++        bl              idct32_1d_4x32_pass1_neon
++.endr
++        b               3f
++
++1:
++        // Write zeros to the temp buffer for pass 2
++        movi            v16.4s,  #0
++        movi            v17.4s,  #0
++        movi            v18.4s,  #0
++        movi            v19.4s,  #0
++2:
++        subs            x1,  x1,  #1
++.rept 4
++        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
++        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
++.endr
++        b.ne            2b
++3:
++.irp i, 0, 4, 8, 12, 16, 20, 24, 28
++        add             x0,  x4,  #(\i*2)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*4)
++        bl              idct32_1d_4x32_pass2_neon
++.endr
++
++        add             sp,  sp,  #4096
++        ldp             d14, d15, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d8,  d9,  [sp], 0x10
++
++        br              x15
++endfunc
++
++function ff_vp9_idct_idct_32x32_add_10_neon, export=1
++        mov             x13, #0x03ff
++        b               vp9_idct_idct_32x32_add_16_neon
++endfunc
++
++function ff_vp9_idct_idct_32x32_add_12_neon, export=1
++        mov             x13, #0x0fff
++        b               vp9_idct_idct_32x32_add_16_neon
++endfunc
++
++.macro idct32_partial size
++function idct32x32_\size\()_add_16_neon
++.irp i, 0, 4
++        add             x0,  sp,  #(\i*128)
++.ifc \size,quarter
++.if \i == 4
++        cmp             w3,  #9
++        b.le            1f
++.endif
++.endif
++        add             x2,  x6,  #(\i*4)
++        bl              idct32_1d_4x32_pass1_\size\()_neon
++.endr
++
++.ifc \size,half
++.irp i, 8, 12
++        add             x0,  sp,  #(\i*128)
++.if \i == 12
++        cmp             w3,  #70
++        b.le            1f
++.endif
++        add             x2,  x6,  #(\i*4)
++        bl              idct32_1d_4x32_pass1_\size\()_neon
++.endr
++.endif
++        b               3f
++
++1:
++        // Write zeros to the temp buffer for pass 2
++        movi            v16.4s,  #0
++        movi            v17.4s,  #0
++        movi            v18.4s,  #0
++        movi            v19.4s,  #0
++
++.rept 4
++        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
++        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
++.endr
++
++3:
++.irp i, 0, 4, 8, 12, 16, 20, 24, 28
++        add             x0,  x4,  #(\i*2)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*4)
++        bl              idct32_1d_4x32_pass2_\size\()_neon
++.endr
++
++        add             sp,  sp,  #4096
++        ldp             d14, d15, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d8,  d9,  [sp], 0x10
++
++        br              x15
++endfunc
++.endm
++
++idct32_partial quarter
++idct32_partial half
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
+@@ -0,0 +1,1580 @@
++/*
++ * Copyright (c) 2016 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++#include "neon.S"
++
++const itxfm4_coeffs, align=4
++        .short  11585, 0, 6270, 15137
++iadst4_coeffs:
++        .short  5283, 15212, 9929, 13377
++endconst
++
++const iadst8_coeffs, align=4
++        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
++idct_coeffs:
++        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
++        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
++        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
++        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
++endconst
++
++const iadst16_coeffs, align=4
++        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
++        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
++endconst
++
++// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
++// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
++// in/out are .8h registers; this can do with 4 temp registers, but is
++// more efficient if 6 temp registers are available.
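++// (v0.h[0] holds 11585, roughly cos(pi/4) * 2^14, from the coefficient
++// tables above; adding (1 << 13) before the >> 14 makes the shift a
++// round-to-nearest rescale back to the input's scale.)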
++.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
++.if \neg > 0
++        neg             \tmp4\().4h, v0.4h
++.endif
++        add             \tmp1\().8h, \in1\().8h,  \in2\().8h
++        sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
++.if \neg > 0
++        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
++        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
++.else
++        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
++        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
++.endif
++.ifb \tmp5
++        rshrn           \out1\().4h, \tmp3\().4s, #14
++        rshrn2          \out1\().8h, \tmp4\().4s, #14
++        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
++        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
++        rshrn           \out2\().4h, \tmp3\().4s, #14
++        rshrn2          \out2\().8h, \tmp4\().4s, #14
++.else
++        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
++        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
++        rshrn           \out1\().4h, \tmp3\().4s, #14
++        rshrn2          \out1\().8h, \tmp4\().4s, #14
++        rshrn           \out2\().4h, \tmp5\().4s, #14
++        rshrn2          \out2\().8h, \tmp6\().4s, #14
++.endif
++.endm
++
++// Same as dmbutterfly0 above, but treating the input in in2 as zero,
++// writing the same output into both out1 and out2.
++.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
++        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
++        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
++        rshrn           \out1\().4h,  \tmp1\().4s, #14
++        rshrn2          \out1\().8h,  \tmp2\().4s, #14
++        rshrn           \out2\().4h,  \tmp1\().4s, #14
++        rshrn2          \out2\().8h,  \tmp2\().4s, #14
++.endm
++
++// out1,out2 = in1 * coef1 - in2 * coef2
++// out3,out4 = in1 * coef2 + in2 * coef1
++// out are 4 x .4s registers, in are 2 x .8h registers
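++// (The _l suffix marks the "long" form: the products stay in 32-bit lanes
++// without the rounding shift, so callers can combine them first, e.g. via
++// dbutterfly_n below, before narrowing back to 16 bits.)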
++.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
++        smull           \out1\().4s, \in1\().4h, \coef1
++        smull2          \out2\().4s, \in1\().8h, \coef1
++        smull           \out3\().4s, \in1\().4h, \coef2
++        smull2          \out4\().4s, \in1\().8h, \coef2
++        smlsl           \out1\().4s, \in2\().4h, \coef2
++        smlsl2          \out2\().4s, \in2\().8h, \coef2
++        smlal           \out3\().4s, \in2\().4h, \coef1
++        smlal2          \out4\().4s, \in2\().8h, \coef1
++.endm
++
++// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
++// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
++// inout are 2 x .8h registers
++.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
++        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
++.if \neg > 0
++        neg             \tmp3\().4s, \tmp3\().4s
++        neg             \tmp4\().4s, \tmp4\().4s
++.endif
++        rshrn           \inout1\().4h, \tmp1\().4s,  #14
++        rshrn2          \inout1\().8h, \tmp2\().4s,  #14
++        rshrn           \inout2\().4h, \tmp3\().4s,  #14
++        rshrn2          \inout2\().8h, \tmp4\().4s,  #14
++.endm
++
++// Same as dmbutterfly above, but treating the input in inout2 as zero
++.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
++        smull           \tmp1\().4s, \inout1\().4h, \coef1
++        smull2          \tmp2\().4s, \inout1\().8h, \coef1
++        smull           \tmp3\().4s, \inout1\().4h, \coef2
++        smull2          \tmp4\().4s, \inout1\().8h, \coef2
++        rshrn           \inout1\().4h, \tmp1\().4s, #14
++        rshrn2          \inout1\().8h, \tmp2\().4s, #14
++        rshrn           \inout2\().4h, \tmp3\().4s, #14
++        rshrn2          \inout2\().8h, \tmp4\().4s, #14
++.endm
++
++// Same as dmbutterfly above, but treating the input in inout1 as zero
++.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
++        smull           \tmp1\().4s, \inout2\().4h, \coef2
++        smull2          \tmp2\().4s, \inout2\().8h, \coef2
++        smull           \tmp3\().4s, \inout2\().4h, \coef1
++        smull2          \tmp4\().4s, \inout2\().8h, \coef1
++        neg             \tmp1\().4s, \tmp1\().4s
++        neg             \tmp2\().4s, \tmp2\().4s
++        rshrn           \inout2\().4h, \tmp3\().4s, #14
++        rshrn2          \inout2\().8h, \tmp4\().4s, #14
++        rshrn           \inout1\().4h, \tmp1\().4s, #14
++        rshrn2          \inout1\().8h, \tmp2\().4s, #14
++.endm
++
++.macro dsmull_h out1, out2, in, coef
++        smull           \out1\().4s, \in\().4h, \coef
++        smull2          \out2\().4s, \in\().8h, \coef
++.endm
++
++.macro drshrn_h out, in1, in2, shift
++        rshrn           \out\().4h, \in1\().4s, \shift
++        rshrn2          \out\().8h, \in2\().4s, \shift
++.endm
++
++
++// out1 = in1 + in2
++// out2 = in1 - in2
++.macro butterfly_8h out1, out2, in1, in2
++        add             \out1\().8h, \in1\().8h, \in2\().8h
++        sub             \out2\().8h, \in1\().8h, \in2\().8h
++.endm
++
++// out1 = in1 - in2
++// out2 = in1 + in2
++.macro butterfly_8h_r out1, out2, in1, in2
++        sub             \out1\().8h, \in1\().8h, \in2\().8h
++        add             \out2\().8h, \in1\().8h, \in2\().8h
++.endm
++
++// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
++// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
++// out are 2 x .8h registers, in are 4 x .4s registers
++.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
++        add             \tmp1\().4s, \in1\().4s, \in3\().4s
++        add             \tmp2\().4s, \in2\().4s, \in4\().4s
++        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
++        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
++        rshrn           \out1\().4h, \tmp1\().4s,  #14
++        rshrn2          \out1\().8h, \tmp2\().4s,  #14
++        rshrn           \out2\().4h, \tmp3\().4s,  #14
++        rshrn2          \out2\().8h, \tmp4\().4s,  #14
++.endm
++
++.macro iwht4 c0, c1, c2, c3
++        add             \c0\().4h, \c0\().4h, \c1\().4h
++        sub             v17.4h,    \c2\().4h, \c3\().4h
++        sub             v16.4h,    \c0\().4h, v17.4h
++        sshr            v16.4h,    v16.4h,    #1
++        sub             \c2\().4h, v16.4h,    \c1\().4h
++        sub             \c1\().4h, v16.4h,    \c3\().4h
++        add             \c3\().4h, v17.4h,    \c2\().4h
++        sub             \c0\().4h, \c0\().4h, \c1\().4h
++.endm
++
++.macro idct4 c0, c1, c2, c3
++        smull           v22.4s,    \c1\().4h, v0.h[3]
++        smull           v20.4s,    \c1\().4h, v0.h[2]
++        add             v16.4h,    \c0\().4h, \c2\().4h
++        sub             v17.4h,    \c0\().4h, \c2\().4h
++        smlal           v22.4s,    \c3\().4h, v0.h[2]
++        smull           v18.4s,    v16.4h,    v0.h[0]
++        smull           v19.4s,    v17.4h,    v0.h[0]
++        smlsl           v20.4s,    \c3\().4h, v0.h[3]
++        rshrn           v22.4h,    v22.4s,    #14
++        rshrn           v18.4h,    v18.4s,    #14
++        rshrn           v19.4h,    v19.4s,    #14
++        rshrn           v20.4h,    v20.4s,    #14
++        add             \c0\().4h, v18.4h,    v22.4h
++        sub             \c3\().4h, v18.4h,    v22.4h
++        add             \c1\().4h, v19.4h,    v20.4h
++        sub             \c2\().4h, v19.4h,    v20.4h
++.endm
++
++.macro iadst4 c0, c1, c2, c3
++        smull           v16.4s,    \c0\().4h, v0.h[4]
++        smlal           v16.4s,    \c2\().4h, v0.h[5]
++        smlal           v16.4s,    \c3\().4h, v0.h[6]
++        smull           v17.4s,    \c0\().4h, v0.h[6]
++        smlsl           v17.4s,    \c2\().4h, v0.h[4]
++        sub             \c0\().4h, \c0\().4h, \c2\().4h
++        smlsl           v17.4s,    \c3\().4h, v0.h[5]
++        add             \c0\().4h, \c0\().4h, \c3\().4h
++        smull           v19.4s,    \c1\().4h, v0.h[7]
++        smull           v18.4s,    \c0\().4h, v0.h[7]
++        add             v20.4s,    v16.4s,    v19.4s
++        add             v21.4s,    v17.4s,    v19.4s
++        rshrn           \c0\().4h, v20.4s,    #14
++        add             v16.4s,    v16.4s,    v17.4s
++        rshrn           \c1\().4h, v21.4s,    #14
++        sub             v16.4s,    v16.4s,    v19.4s
++        rshrn           \c2\().4h, v18.4s,    #14
++        rshrn           \c3\().4h, v16.4s,    #14
++.endm
++
++// The public functions in this file have the following signature:
++// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
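++// dst/stride describe the destination, "block" holds the dequantized
++// coefficients (cleared to zero as they are consumed), and eob is the
++// end-of-block count; the idct_idct variants use eob == 1 as a DC-only
++// fast path.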
++
++.macro itxfm_func4x4 txfm1, txfm2
++function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
++.ifc \txfm1,\txfm2
++.ifc \txfm1,idct
++        movrel          x4,  itxfm4_coeffs
++        ld1             {v0.4h}, [x4]
++.endif
++.ifc \txfm1,iadst
++        movrel          x4,  iadst4_coeffs
++        ld1             {v0.d}[1], [x4]
++.endif
++.else
++        movrel          x4,  itxfm4_coeffs
++        ld1             {v0.8h}, [x4]
++.endif
++
++        movi            v31.8h, #0
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #1
++        b.ne            1f
++        // DC-only for idct/idct
++        ld1             {v2.h}[0], [x2]
++        smull           v2.4s,  v2.4h, v0.h[0]
++        rshrn           v2.4h,  v2.4s, #14
++        smull           v2.4s,  v2.4h, v0.h[0]
++        rshrn           v2.4h,  v2.4s, #14
++        st1             {v31.h}[0], [x2]
++        dup             v4.4h,  v2.h[0]
++        mov             v5.16b, v4.16b
++        mov             v6.16b, v4.16b
++        mov             v7.16b, v4.16b
++        b               2f
++.endif
++
++1:
++        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
++        st1             {v31.8h}, [x2], #16
++
++.ifc \txfm1,iwht
++        sshr            v4.4h,  v4.4h,  #2
++        sshr            v5.4h,  v5.4h,  #2
++        sshr            v6.4h,  v6.4h,  #2
++        sshr            v7.4h,  v7.4h,  #2
++.endif
++
++        \txfm1\()4      v4,  v5,  v6,  v7
++
++        st1             {v31.8h}, [x2], #16
++        // Transpose 4x4 with 16 bit elements
++        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
++
++        \txfm2\()4      v4,  v5,  v6,  v7
++2:
++        ld1             {v0.s}[0],   [x0], x1
++        ld1             {v1.s}[0],   [x0], x1
++.ifnc \txfm1,iwht
++        srshr           v4.4h,  v4.4h,  #4
++        srshr           v5.4h,  v5.4h,  #4
++        srshr           v6.4h,  v6.4h,  #4
++        srshr           v7.4h,  v7.4h,  #4
++.endif
++        uaddw           v4.8h,  v4.8h,  v0.8b
++        uaddw           v5.8h,  v5.8h,  v1.8b
++        ld1             {v2.s}[0],   [x0], x1
++        ld1             {v3.s}[0],   [x0], x1
++        sqxtun          v0.8b,  v4.8h
++        sqxtun          v1.8b,  v5.8h
++        sub             x0,  x0,  x1, lsl #2
++
++        uaddw           v6.8h,  v6.8h,  v2.8b
++        uaddw           v7.8h,  v7.8h,  v3.8b
++        st1             {v0.s}[0],  [x0], x1
++        sqxtun          v2.8b,  v6.8h
++        sqxtun          v3.8b,  v7.8h
++
++        st1             {v1.s}[0],  [x0], x1
++        st1             {v2.s}[0],  [x0], x1
++        st1             {v3.s}[0],  [x0], x1
++
++        ret
++endfunc
++.endm
++
++itxfm_func4x4 idct,  idct
++itxfm_func4x4 iadst, idct
++itxfm_func4x4 idct,  iadst
++itxfm_func4x4 iadst, iadst
++itxfm_func4x4 iwht,  iwht
++
++
++.macro idct8
++        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
++        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
++        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
++        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
++
++        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
++        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
++        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
++        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
++
++        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
++
++        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
++        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
++        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
++        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
++.endm
++
++.macro iadst8
++        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
++        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
++        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
++        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
++
++        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
++        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
++        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
++        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
++
++        butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
++        butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
++        neg             v23.8h,   v23.8h  // v23 = out[7]
++
++        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
++        neg             v19.8h,   v19.8h  // v19 = out[3]
++
++        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
++        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
++
++        dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
++        dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
++        neg             v17.8h,   v17.8h  // v17 = out[1]
++
++        dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
++        neg             v21.8h,   v21.8h  // v21 = out[5]
++.endm
++
++
++.macro itxfm_func8x8 txfm1, txfm2
++function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
++        // The iadst also uses a few coefficients from
++        // idct, so those always need to be loaded.
++.ifc \txfm1\()_\txfm2,idct_idct
++        movrel          x4,  idct_coeffs
++.else
++        movrel          x4,  iadst8_coeffs
++        ld1             {v1.8h}, [x4], #16
++.endif
++        ld1             {v0.8h}, [x4]
++
++        movi            v2.8h, #0
++        movi            v3.8h, #0
++        movi            v4.8h, #0
++        movi            v5.8h, #0
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #1
++        b.ne            1f
++        // DC-only for idct/idct
++        ld1             {v2.h}[0],  [x2]
++        smull           v2.4s,  v2.4h, v0.h[0]
++        rshrn           v2.4h,  v2.4s, #14
++        smull           v2.4s,  v2.4h, v0.h[0]
++        rshrn           v2.4h,  v2.4s, #14
++        st1             {v3.h}[0],  [x2]
++        dup             v16.8h,  v2.h[0]
++        mov             v17.16b, v16.16b
++        mov             v18.16b, v16.16b
++        mov             v19.16b, v16.16b
++        mov             v20.16b, v16.16b
++        mov             v21.16b, v16.16b
++        mov             v22.16b, v16.16b
++        mov             v23.16b, v16.16b
++        b               2f
++.endif
++1:
++        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
++        ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
++        sub             x2,  x2,  #128
++        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
++        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
++
++        \txfm1\()8
++
++        // Transpose 8x8 with 16 bit elements
++        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
++
++        \txfm2\()8
++2:
++        mov             x3,  x0
++        // Add into the destination
++        ld1             {v0.8b},  [x0], x1
++        srshr           v16.8h, v16.8h, #5
++        ld1             {v1.8b},  [x0], x1
++        srshr           v17.8h, v17.8h, #5
++        ld1             {v2.8b},  [x0], x1
++        srshr           v18.8h, v18.8h, #5
++        uaddw           v16.8h, v16.8h, v0.8b
++        ld1             {v3.8b},  [x0], x1
++        srshr           v19.8h, v19.8h, #5
++        uaddw           v17.8h, v17.8h, v1.8b
++        ld1             {v4.8b},  [x0], x1
++        srshr           v20.8h, v20.8h, #5
++        uaddw           v18.8h, v18.8h, v2.8b
++        sqxtun          v0.8b,  v16.8h
++        ld1             {v5.8b},  [x0], x1
++        srshr           v21.8h, v21.8h, #5
++        uaddw           v19.8h, v19.8h, v3.8b
++        sqxtun          v1.8b,  v17.8h
++        ld1             {v6.8b},  [x0], x1
++        srshr           v22.8h, v22.8h, #5
++        uaddw           v20.8h, v20.8h, v4.8b
++        sqxtun          v2.8b,  v18.8h
++        ld1             {v7.8b},  [x0], x1
++        srshr           v23.8h, v23.8h, #5
++        uaddw           v21.8h, v21.8h, v5.8b
++        sqxtun          v3.8b,  v19.8h
++
++        st1             {v0.8b},  [x3], x1
++        uaddw           v22.8h, v22.8h, v6.8b
++        st1             {v1.8b},  [x3], x1
++        sqxtun          v4.8b,  v20.8h
++        st1             {v2.8b},  [x3], x1
++        uaddw           v23.8h, v23.8h, v7.8b
++        st1             {v3.8b},  [x3], x1
++        sqxtun          v5.8b,  v21.8h
++        st1             {v4.8b},  [x3], x1
++        sqxtun          v6.8b,  v22.8h
++        st1             {v5.8b},  [x3], x1
++        sqxtun          v7.8b,  v23.8h
++
++        st1             {v6.8b},  [x3], x1
++        st1             {v7.8b},  [x3], x1
++
++        ret
++endfunc
++.endm
++
++itxfm_func8x8 idct,  idct
++itxfm_func8x8 iadst, idct
++itxfm_func8x8 idct,  iadst
++itxfm_func8x8 iadst, iadst
++
++
++function idct16x16_dc_add_neon
++        movrel          x4,  idct_coeffs
++        ld1             {v0.4h}, [x4]
++
++        movi            v1.4h,  #0
++
++        ld1             {v2.h}[0], [x2]
++        smull           v2.4s,  v2.4h,  v0.h[0]
++        rshrn           v2.4h,  v2.4s,  #14
++        smull           v2.4s,  v2.4h,  v0.h[0]
++        rshrn           v2.4h,  v2.4s,  #14
++        dup             v2.8h,  v2.h[0]
++        st1             {v1.h}[0], [x2]
++
++        srshr           v2.8h,  v2.8h,  #6
++
++        mov             x3,  x0
++        mov             x4,  #16
++1:
++        // Loop to add the constant from v2 into all 16x16 outputs
++        subs            x4,  x4,  #2
++        ld1             {v3.16b},  [x0], x1
++        ld1             {v4.16b},  [x0], x1
++        uaddw           v16.8h, v2.8h,  v3.8b
++        uaddw2          v17.8h, v2.8h,  v3.16b
++        uaddw           v18.8h, v2.8h,  v4.8b
++        uaddw2          v19.8h, v2.8h,  v4.16b
++        sqxtun          v3.8b,  v16.8h
++        sqxtun2         v3.16b, v17.8h
++        sqxtun          v4.8b,  v18.8h
++        sqxtun2         v4.16b, v19.8h
++        st1             {v3.16b},  [x3], x1
++        st1             {v4.16b},  [x3], x1
++        b.ne            1b
++
++        ret
++endfunc
++
++.macro idct16_end
++        butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
++        butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
++        butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
++        butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
++        butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
++        butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
++        butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
++        butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
++
++        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
++        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
++
++        butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
++        butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
++        butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
++        butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
++        butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
++        butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
++        butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
++        butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
++        ret
++.endm
++
++function idct16
++        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
++        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
++        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
++        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
++        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
++        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
++        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
++        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
++
++        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
++        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
++        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
++        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
++        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
++        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
++        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
++        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
++
++        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
++        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
++        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
++        idct16_end
++endfunc
++
++function idct16_half
++        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
++        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
++        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
++        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
++        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
++        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
++        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
++        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
++
++        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
++        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
++        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
++        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
++        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
++        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
++        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
++        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
++
++        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
++        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
++        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
++        idct16_end
++endfunc
++
++function idct16_quarter
++        dsmull_h        v24, v25, v19, v1.h[7]
++        dsmull_h        v4,  v5,  v17, v1.h[0]
++        dsmull_h        v7,  v6,  v18, v0.h[5]
++        dsmull_h        v30, v31, v18, v0.h[4]
++        neg             v24.4s,  v24.4s
++        neg             v25.4s,  v25.4s
++        dsmull_h        v29, v28, v17, v1.h[1]
++        dsmull_h        v26, v27, v19, v1.h[6]
++        dsmull_h        v22, v23, v16, v0.h[0]
++        drshrn_h        v24, v24, v25, #14
++        drshrn_h        v16, v4,  v5,  #14
++        drshrn_h        v7,  v7,  v6,  #14
++        drshrn_h        v6,  v30, v31, #14
++        drshrn_h        v29, v29, v28, #14
++        drshrn_h        v17, v26, v27, #14
++        drshrn_h        v28, v22, v23, #14
++
++        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
++        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
++        neg             v22.4s,  v22.4s
++        neg             v23.4s,  v23.4s
++        drshrn_h        v27, v20, v21, #14
++        drshrn_h        v21, v22, v23, #14
++        drshrn_h        v23, v18, v19, #14
++        drshrn_h        v25, v30, v31, #14
++        mov             v4.16b,  v28.16b
++        mov             v5.16b,  v28.16b
++        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
++        mov             v20.16b, v28.16b
++        idct16_end
++endfunc
++
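++// 16-point inverse ADST. Unlike idct16 it clobbers v8-v15, so the
++// callers save d8-d15 around it.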
++function iadst16
++        ld1             {v0.8h,v1.8h}, [x11]
++
++        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
++        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
++        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
++        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
++        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
++
++        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
++        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
++        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
++        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
++
++        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
++        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
++        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
++        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
++
++        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
++        ld1             {v0.8h}, [x10]
++        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
++        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
++        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
++
++        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
++        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
++        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
++        butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
++        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
++
++        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
++        butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
++        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
++        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
++
++        butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
++        butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
++
++        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
++        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15
++
++        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
++        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
++        neg             v29.8h, v29.8h                   // v29 = out[13]
++
++        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
++        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a
++
++        butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
++        butterfly_8h    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
++
++        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
++        neg             v19.8h, v19.8h                   // v19 = out[3]
++        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
++
++        butterfly_8h    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
++        butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
++
++        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
++        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
++        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
++        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
++
++        neg             v31.8h,  v5.8h                    // v31 = out[15]
++        neg             v17.8h,  v3.8h                    // v17 = out[1]
++
++        mov             v16.16b, v2.16b
++        mov             v30.16b, v4.16b
++        ret
++endfunc
++
++// Helper macros; we can't use these expressions directly within
++// e.g. .irp due to the extra concatenation \(). Therefore wrap
++// them in macros to allow using .irp below.
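++// load_clear also writes zeros (from v2, which the callers zero
++// beforehand) back to the row it just read, clearing the coefficient
++// buffer as it is consumed.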
++.macro load i, src, inc
++        ld1             {v\i\().8h},  [\src], \inc
++.endm
++.macro store i, dst, inc
++        st1             {v\i\().8h},  [\dst], \inc
++.endm
++.macro movi_v i, size, imm
++        movi            v\i\()\size,  \imm
++.endm
++.macro load_clear i, src, inc
++        ld1             {v\i\().8h}, [\src]
++        st1             {v2.8h},  [\src], \inc
++.endm
++
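++// Round-shift eight rows of coefficients down by 6, add them to the
++// destination rows loaded through x0 and x3, saturate back to 8 bit
++// and store the result, with loads, arithmetic and stores interleaved.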
++.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
++        srshr           \coef0, \coef0, #6
++        ld1             {v2.8b},  [x0], x1
++        srshr           \coef1, \coef1, #6
++        ld1             {v3.8b},  [x3], x1
++        srshr           \coef2, \coef2, #6
++        ld1             {v4.8b},  [x0], x1
++        srshr           \coef3, \coef3, #6
++        uaddw           \coef0, \coef0, v2.8b
++        ld1             {v5.8b},  [x3], x1
++        uaddw           \coef1, \coef1, v3.8b
++        srshr           \coef4, \coef4, #6
++        ld1             {v6.8b},  [x0], x1
++        srshr           \coef5, \coef5, #6
++        ld1             {v7.8b},  [x3], x1
++        sqxtun          v2.8b,  \coef0
++        srshr           \coef6, \coef6, #6
++        sqxtun          v3.8b,  \coef1
++        srshr           \coef7, \coef7, #6
++        uaddw           \coef2, \coef2, v4.8b
++        ld1             {\tmp1},  [x0], x1
++        uaddw           \coef3, \coef3, v5.8b
++        ld1             {\tmp2},  [x3], x1
++        sqxtun          v4.8b,  \coef2
++        sub             x0,  x0,  x1, lsl #2
++        sub             x3,  x3,  x1, lsl #2
++        sqxtun          v5.8b,  \coef3
++        uaddw           \coef4, \coef4, v6.8b
++        st1             {v2.8b},  [x0], x1
++        uaddw           \coef5, \coef5, v7.8b
++        st1             {v3.8b},  [x3], x1
++        sqxtun          v6.8b,  \coef4
++        st1             {v4.8b},  [x0], x1
++        sqxtun          v7.8b,  \coef5
++        st1             {v5.8b},  [x3], x1
++        uaddw           \coef6, \coef6, \tmp1
++        st1             {v6.8b},  [x0], x1
++        uaddw           \coef7, \coef7, \tmp2
++        st1             {v7.8b},  [x3], x1
++        sqxtun          \tmp1,  \coef6
++        sqxtun          \tmp2,  \coef7
++        st1             {\tmp1},  [x0], x1
++        st1             {\tmp2},  [x3], x1
++.endm
++
++// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
++// transpose into a horizontal 16x8 slice and store.
++// x0 = dst (temp buffer)
++// x1 = slice offset
++// x2 = src
++// x9 = input stride
++.macro itxfm16_1d_funcs txfm
++function \txfm\()16_1d_8x16_pass1_neon
++        mov             x14, x30
++
++        movi            v2.8h, #0
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load_clear      \i,  x2,  x9
++.endr
++
++        bl              \txfm\()16
++
++        // Do two 8x8 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
++        // transposed 8x8 blocks.
++        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
++        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
++
++        // Store the transposed 8x8 blocks horizontally.
++        cmp             x1,  #8
++        b.eq            1f
++.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
++        store           \i,  x0,  #16
++.endr
++        br              x14
++1:
++        // Special case: For the last input column (x1 == 8),
++        // which would be stored as the last row in the temp buffer,
++        // don't store the first 8x8 block, but keep it in registers
++        // for the first slice of the second pass (where it is the
++        // last 8x8 block).
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
++        add             x0,  x0,  #16
++        store           \i,  x0,  #16
++.endr
++        mov             v24.16b, v16.16b
++        mov             v25.16b, v17.16b
++        mov             v26.16b, v18.16b
++        mov             v27.16b, v19.16b
++        mov             v28.16b, v20.16b
++        mov             v29.16b, v21.16b
++        mov             v30.16b, v22.16b
++        mov             v31.16b, v23.16b
++        br              x14
++endfunc
++
++// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
++// load the destination pixels (from a similar 8x16 slice), add and store back.
++// x0 = dst
++// x1 = dst stride
++// x2 = src (temp buffer)
++// x3 = slice offset
++// x9 = temp buffer stride
++function \txfm\()16_1d_8x16_pass2_neon
++        mov             x14, x30
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load            \i,  x2,  x9
++.endr
++        cbz             x3,  1f
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
++        load            \i,  x2,  x9
++.endr
++1:
++
++        add             x3,  x0,  x1
++        lsl             x1,  x1,  #1
++        bl              \txfm\()16
++
++        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
++        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
++
++        br              x14
++endfunc
++.endm
++
++itxfm16_1d_funcs idct
++itxfm16_1d_funcs iadst
++
++.macro itxfm_func16x16 txfm1, txfm2
++function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #1
++        b.eq            idct16x16_dc_add_neon
++.endif
++        mov             x15, x30
++        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
++.ifnc \txfm1\()_\txfm2,idct_idct
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++.endif
++
++        sub             sp,  sp,  #512
++
++        mov             x4,  x0
++        mov             x5,  x1
++        mov             x6,  x2
++
++        movrel          x10, idct_coeffs
++.ifnc \txfm1\()_\txfm2,idct_idct
++        movrel          x11, iadst16_coeffs
++.endif
++.ifc \txfm1,idct
++        ld1             {v0.8h,v1.8h}, [x10]
++.endif
++        mov             x9,  #32
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        cmp             w3,  #10
++        b.le            idct16x16_quarter_add_neon
++        cmp             w3,  #38
++        b.le            idct16x16_half_add_neon
++.endif
++
++.irp i, 0, 8
++        add             x0,  sp,  #(\i*32)
++.ifc \txfm1\()_\txfm2,idct_idct
++.if \i == 8
++        cmp             w3,  #38
++        b.le            1f
++.endif
++.endif
++        mov             x1,  #\i
++        add             x2,  x6,  #(\i*2)
++        bl              \txfm1\()16_1d_8x16_pass1_neon
++.endr
++.ifc \txfm1\()_\txfm2,iadst_idct
++        ld1             {v0.8h,v1.8h}, [x10]
++.endif
++
++.ifc \txfm1\()_\txfm2,idct_idct
++        b               3f
++1:
++        // Set v24-v31 to zero, for the in-register passthrough of
++        // coefficients to pass 2. Since we only do two slices, this can
++        // only ever happen for the second slice. So we only need to store
++        // zeros to the temp buffer for the second half of the buffer.
++        // Move x0 to the second half, and use x9 == 32 as increment.
++        add             x0,  x0,  #16
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
++        movi_v          \i,  .16b, #0
++        st1             {v24.8h},  [x0], x9
++.endr
++3:
++.endif
++
++.irp i, 0, 8
++        add             x0,  x4,  #(\i)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*2)
++        mov             x3,  #\i
++        bl              \txfm2\()16_1d_8x16_pass2_neon
++.endr
++
++        add             sp,  sp,  #512
++.ifnc \txfm1\()_\txfm2,idct_idct
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++.endif
++        br              x15
++endfunc
++.endm
++
++itxfm_func16x16 idct,  idct
++itxfm_func16x16 iadst, idct
++itxfm_func16x16 idct,  iadst
++itxfm_func16x16 iadst, iadst
++
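++// Partial 16x16 transforms: for small eob values (checked in the full
++// function above) only the first 4 (quarter) or 8 (half) input rows can
++// hold nonzero coefficients, so only those rows are loaded and transformed.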
++function idct16_1d_8x16_pass1_quarter_neon
++        mov             x14, x30
++        movi            v2.8h, #0
++.irp i, 16, 17, 18, 19
++        load_clear      \i,  x2,  x9
++.endr
++
++        bl              idct16_quarter
++
++        // Do two 8x8 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
++        // transposed 8x8 blocks.
++        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
++        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
++
++        // Store the transposed 8x8 blocks horizontally.
++        // The first 8x8 block is kept in registers for the second pass,
++        // store the rest in the temp buffer.
++        // Since only a 4x4 part of the input was nonzero, this means that
++        // only 4 rows are nonzero after transposing, and the second pass
++        // only reads the topmost 4 rows. Therefore only store the topmost
++        // 4 rows.
++        add             x0,  x0,  #16
++.irp i, 24, 25, 26, 27
++        store           \i,  x0,  x9
++.endr
++        br              x14
++endfunc
++
++function idct16_1d_8x16_pass2_quarter_neon
++        mov             x14, x30
++        cbz             x3,  1f
++.irp i, 16, 17, 18, 19
++        load            \i,  x2,  x9
++.endr
++1:
++
++        add             x3,  x0,  x1
++        lsl             x1,  x1,  #1
++        bl              idct16_quarter
++
++        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
++        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
++
++        br              x14
++endfunc
++
++function idct16_1d_8x16_pass1_half_neon
++        mov             x14, x30
++        movi            v2.8h, #0
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load_clear      \i,  x2,  x9
++.endr
++
++        bl              idct16_half
++
++        // Do two 8x8 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
++        // transposed 8x8 blocks.
++        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
++        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
++
++        // Store the transposed 8x8 blocks horizontally.
++        // The first 8x8 block is kept in registers for the second pass,
++        // store the rest in the temp buffer.
++        add             x0,  x0,  #16
++.irp i, 24, 25, 26, 27, 28, 29, 30, 31
++        store           \i,  x0,  x9
++.endr
++        br              x14
++endfunc
++
++function idct16_1d_8x16_pass2_half_neon
++        mov             x14, x30
++        cbz             x3,  1f
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load            \i,  x2,  x9
++.endr
++1:
++
++        add             x3,  x0,  x1
++        lsl             x1,  x1,  #1
++        bl              idct16_half
++
++        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
++        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
++
++        br              x14
++endfunc
++
++.macro idct16_partial size
++function idct16x16_\size\()_add_neon
++        add             x0,  sp,  #(0*32)
++        add             x2,  x6,  #(0*2)
++        bl              idct16_1d_8x16_pass1_\size\()_neon
++.irp i, 0, 8
++        add             x0,  x4,  #(\i)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*2)
++        mov             x3,  #\i
++        bl              idct16_1d_8x16_pass2_\size\()_neon
++.endr
++
++        add             sp,  sp,  #512
++        br              x15
++endfunc
++.endm
++
++idct16_partial quarter
++idct16_partial half
++
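++// dc-only special case (eob == 1): the single DC coefficient is scaled
++// twice by the first idct constant (the row and column passes reduce to
++// a multiply), and the resulting constant is added to all 32x32 outputs.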
++function idct32x32_dc_add_neon
++        movrel          x4,  idct_coeffs
++        ld1             {v0.4h}, [x4]
++
++        movi            v1.4h,  #0
++
++        ld1             {v2.h}[0], [x2]
++        smull           v2.4s,  v2.4h,  v0.h[0]
++        rshrn           v2.4h,  v2.4s,  #14
++        smull           v2.4s,  v2.4h,  v0.h[0]
++        rshrn           v2.4h,  v2.4s,  #14
++        dup             v2.8h,  v2.h[0]
++        st1             {v1.h}[0], [x2]
++
++        srshr           v0.8h,  v2.8h,  #6
++
++        mov             x3,  x0
++        mov             x4,  #32
++1:
++        // Loop to add the constant v0 into all 32x32 outputs
++        subs            x4,  x4,  #2
++        ld1             {v1.16b,v2.16b},  [x0], x1
++        uaddw           v16.8h, v0.8h,  v1.8b
++        uaddw2          v17.8h, v0.8h,  v1.16b
++        ld1             {v3.16b,v4.16b},  [x0], x1
++        uaddw           v18.8h, v0.8h,  v2.8b
++        uaddw2          v19.8h, v0.8h,  v2.16b
++        uaddw           v20.8h, v0.8h,  v3.8b
++        uaddw2          v21.8h, v0.8h,  v3.16b
++        uaddw           v22.8h, v0.8h,  v4.8b
++        uaddw2          v23.8h, v0.8h,  v4.16b
++        sqxtun          v1.8b,  v16.8h
++        sqxtun2         v1.16b, v17.8h
++        sqxtun          v2.8b,  v18.8h
++        sqxtun2         v2.16b, v19.8h
++        sqxtun          v3.8b,  v20.8h
++        sqxtun2         v3.16b, v21.8h
++        st1             {v1.16b,v2.16b},  [x3], x1
++        sqxtun          v4.8b,  v22.8h
++        sqxtun2         v4.16b, v23.8h
++        st1             {v3.16b,v4.16b},  [x3], x1
++        b.ne            1b
++
++        ret
++endfunc
++
++.macro idct32_end
++        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
++        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
++        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
++        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
++        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
++        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
++        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
++        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29
++
++        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
++        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
++        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
++        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
++
++        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
++        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
++        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
++        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
++        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
++        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
++        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
++        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
++
++        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
++        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
++        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
++        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
++        ret
++.endm
++
++function idct32_odd
++        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
++        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
++        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
++        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
++        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
++        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
++        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
++        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
++
++        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
++        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
++        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
++        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
++        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
++        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
++        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
++        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
++
++        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
++        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
++        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
++        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
++        idct32_end
++endfunc
++
++function idct32_odd_half
++        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
++        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
++        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
++        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
++        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
++        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
++        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
++        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
++
++        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
++        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
++        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
++        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
++        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
++        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
++        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
++        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
++
++        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
++        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
++        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
++        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
++        idct32_end
++endfunc
++
++function idct32_odd_quarter
++        dsmull_h        v4,  v5,  v16, v8.h[0]
++        dsmull_h        v28, v29, v19, v8.h[7]
++        dsmull_h        v30, v31, v16, v8.h[1]
++        dsmull_h        v22, v23, v17, v9.h[6]
++        dsmull_h        v7,  v6,  v17, v9.h[7]
++        dsmull_h        v26, v27, v19, v8.h[6]
++        dsmull_h        v20, v21, v18, v9.h[0]
++        dsmull_h        v24, v25, v18, v9.h[1]
++
++        neg             v28.4s, v28.4s
++        neg             v29.4s, v29.4s
++        neg             v7.4s,  v7.4s
++        neg             v6.4s,  v6.4s
++
++        drshrn_h        v4,  v4,  v5,  #14
++        drshrn_h        v5,  v28, v29, #14
++        drshrn_h        v29, v30, v31, #14
++        drshrn_h        v28, v22, v23, #14
++        drshrn_h        v7,  v7,  v6,  #14
++        drshrn_h        v31, v26, v27, #14
++        drshrn_h        v6,  v20, v21, #14
++        drshrn_h        v30, v24, v25, #14
++
++        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
++        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
++        drshrn_h        v23, v16, v17, #14
++        drshrn_h        v24, v18, v19, #14
++        neg             v20.4s, v20.4s
++        neg             v21.4s, v21.4s
++        drshrn_h        v27, v27, v26, #14
++        drshrn_h        v20, v20, v21, #14
++        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
++        drshrn_h        v21, v16, v17, #14
++        drshrn_h        v26, v18, v19, #14
++        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
++        drshrn_h        v25, v16, v17, #14
++        neg             v18.4s, v18.4s
++        neg             v19.4s, v19.4s
++        drshrn_h        v22, v18, v19, #14
++
++        idct32_end
++endfunc
++
++.macro idct32_funcs suffix
++// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
++// The 32-point IDCT can be decomposed into two 16-point IDCTs;
++// a normal IDCT16 with every other input component (the even ones, with
++// each output written twice), followed by a separate 16-point IDCT
++// of the odd inputs, added/subtracted onto the outputs of the first idct16.
++// x0 = dst (temp buffer)
++// x1 = unused
++// x2 = src
++// x9 = double input stride
++function idct32_1d_8x32_pass1\suffix\()_neon
++        mov             x14, x30
++        movi            v2.8h,  #0
++
++        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load_clear      \i, x2, x9
++.endr
++.endif
++
++        bl              idct16\suffix
++
++        // Do two 8x8 transposes. Originally, v16-v31 contain the
++        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
++        // two transposed 8x8 blocks.
++        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
++        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
++
++        // Store the registers a, b horizontally, followed by the
++        // same registers b, a mirrored.
++.macro store_rev a, b
++        // There's no rev128 instruction, but we reverse each 64-bit
++        // half, and then flip them using an ext with an 8-byte offset.
++        rev64           v3.8h, \b
++        st1             {\a},  [x0], #16
++        rev64           v2.8h, \a
++        ext             v3.16b, v3.16b, v3.16b, #8
++        st1             {\b},  [x0], #16
++        ext             v2.16b, v2.16b, v2.16b, #8
++        st1             {v3.8h},  [x0], #16
++        st1             {v2.8h},  [x0], #16
++.endm
++        store_rev       v16.8h, v24.8h
++        store_rev       v17.8h, v25.8h
++        store_rev       v18.8h, v26.8h
++        store_rev       v19.8h, v27.8h
++        store_rev       v20.8h, v28.8h
++        store_rev       v21.8h, v29.8h
++        store_rev       v22.8h, v30.8h
++        store_rev       v23.8h, v31.8h
++        sub             x0,  x0,  #512
++.purgem store_rev
++
++        // Move x2 back to the start of the input, and move
++        // to the first odd row
++.ifb \suffix
++        sub             x2,  x2,  x9, lsl #4
++.endif
++.ifc \suffix,_quarter
++        sub             x2,  x2,  x9, lsl #2
++.endif
++.ifc \suffix,_half
++        sub             x2,  x2,  x9, lsl #3
++.endif
++        add             x2,  x2,  #64
++
++        movi            v2.8h,  #0
++        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load_clear      \i, x2, x9
++.endr
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load_clear      \i, x2, x9
++.endr
++.endif
++
++        bl              idct32_odd\suffix
++
++        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
++        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
++
++        // Store the registers a, b horizontally,
++        // adding into the output first, and the mirrored,
++        // subtracted from the output.
++.macro store_rev a, b
++        ld1             {v4.8h},  [x0]
++        rev64           v3.8h, \b
++        add             v4.8h, v4.8h, \a
++        rev64           v2.8h, \a
++        st1             {v4.8h},  [x0], #16
++        ext             v3.16b, v3.16b, v3.16b, #8
++        ld1             {v5.8h},  [x0]
++        ext             v2.16b, v2.16b, v2.16b, #8
++        add             v5.8h, v5.8h, \b
++        st1             {v5.8h},  [x0], #16
++        ld1             {v6.8h},  [x0]
++        sub             v6.8h, v6.8h, v3.8h
++        st1             {v6.8h},  [x0], #16
++        ld1             {v7.8h},  [x0]
++        sub             v7.8h, v7.8h, v2.8h
++        st1             {v7.8h},  [x0], #16
++.endm
++
++        store_rev       v31.8h, v23.8h
++        store_rev       v30.8h, v22.8h
++        store_rev       v29.8h, v21.8h
++        store_rev       v28.8h, v20.8h
++        store_rev       v27.8h, v19.8h
++        store_rev       v26.8h, v18.8h
++        store_rev       v25.8h, v17.8h
++        store_rev       v24.8h, v16.8h
++.purgem store_rev
++        br              x14
++endfunc
++
++// This is mostly the same as 8x32_pass1, but without the transpose; it
++// uses the source as a temp buffer between the two idct passes, and
++// adds into the destination.
++// x0 = dst
++// x1 = dst stride
++// x2 = src (temp buffer)
++// x7 = negative double temp buffer stride
++// x9 = double temp buffer stride
++function idct32_1d_8x32_pass2\suffix\()_neon
++        mov             x14, x30
++        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #4
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #2
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #3
++.endif
++
++        bl              idct16\suffix
++
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        store           \i, x2, x9
++.endr
++
++        sub             x2,  x2,  x9, lsl #4
++        add             x2,  x2,  #64
++
++        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
++.ifb \suffix
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #4
++.endif
++.ifc \suffix,_quarter
++.irp i, 16, 17, 18, 19
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #2
++.endif
++.ifc \suffix,_half
++.irp i, 16, 17, 18, 19, 20, 21, 22, 23
++        load            \i, x2, x9
++.endr
++        sub             x2,  x2,  x9, lsl #3
++.endif
++        sub             x2,  x2,  #64
++
++        bl              idct32_odd\suffix
++
++.macro load_acc_store a, b, c, d, neg=0
++.if \neg == 0
++        ld1             {v4.8h},  [x2], x9
++        ld1             {v5.8h},  [x2], x9
++        add             v4.8h, v4.8h, \a
++        ld1             {v6.8h},  [x2], x9
++        add             v5.8h, v5.8h, \b
++        ld1             {v7.8h},  [x2], x9
++        add             v6.8h, v6.8h, \c
++        add             v7.8h, v7.8h, \d
++.else
++        ld1             {v4.8h},  [x2], x7
++        ld1             {v5.8h},  [x2], x7
++        sub             v4.8h, v4.8h, \a
++        ld1             {v6.8h},  [x2], x7
++        sub             v5.8h, v5.8h, \b
++        ld1             {v7.8h},  [x2], x7
++        sub             v6.8h, v6.8h, \c
++        sub             v7.8h, v7.8h, \d
++.endif
++        ld1             {v10.8b}, [x0], x1
++        ld1             {v11.8b}, [x0], x1
++        srshr           v4.8h, v4.8h, #6
++        ld1             {v2.8b}, [x0], x1
++        srshr           v5.8h, v5.8h, #6
++        uaddw           v4.8h, v4.8h, v10.8b
++        ld1             {v3.8b}, [x0], x1
++        srshr           v6.8h, v6.8h, #6
++        uaddw           v5.8h, v5.8h, v11.8b
++        srshr           v7.8h, v7.8h, #6
++        sub             x0,  x0,  x1, lsl #2
++        uaddw           v6.8h, v6.8h, v2.8b
++        sqxtun          v4.8b, v4.8h
++        uaddw           v7.8h, v7.8h, v3.8b
++        sqxtun          v5.8b, v5.8h
++        st1             {v4.8b}, [x0], x1
++        sqxtun          v6.8b, v6.8h
++        st1             {v5.8b}, [x0], x1
++        sqxtun          v7.8b, v7.8h
++        st1             {v6.8b}, [x0], x1
++        st1             {v7.8b}, [x0], x1
++.endm
++        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
++        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
++        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
++        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
++        sub             x2,  x2,  x9
++        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
++        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
++        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
++        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
++.purgem load_acc_store
++        br              x14
++endfunc
++.endm
++
++idct32_funcs
++idct32_funcs _quarter
++idct32_funcs _half
++
++const min_eob_idct_idct_32, align=4
++        .short  0, 34, 135, 336
++endconst
++
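++// Full 32x32 add function. eob == 1 goes to the dc-only path above,
++// eob <= 34 and eob <= 135 take the quarter and half paths below, and the
++// min_eob table lets pass 1 skip 8-column slices whose coefficients are
++// already known to be zero.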
++function ff_vp9_idct_idct_32x32_add_neon, export=1
++        cmp             w3,  #1
++        b.eq            idct32x32_dc_add_neon
++
++        movrel          x10, idct_coeffs
++
++        mov             x15, x30
++
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++
++        sub             sp,  sp,  #2048
++
++        mov             x4,  x0
++        mov             x5,  x1
++        mov             x6,  x2
++
++        // Double stride of the input, since we only read every other line
++        mov             x9,  #128
++        neg             x7,  x9
++
++        ld1             {v0.8h,v1.8h}, [x10], #32
++        ld1             {v8.8h,v9.8h}, [x10]
++
++        cmp             w3,  #34
++        b.le            idct32x32_quarter_add_neon
++        cmp             w3,  #135
++        b.le            idct32x32_half_add_neon
++
++        movrel          x12, min_eob_idct_idct_32, 2
++
++.irp i, 0, 8, 16, 24
++        add             x0,  sp,  #(\i*64)
++.if \i > 0
++        ldrh            w1,  [x12], #2
++        cmp             w3,  w1
++        mov             x1,  #(32 - \i)/4
++        b.le            1f
++.endif
++        add             x2,  x6,  #(\i*2)
++        bl              idct32_1d_8x32_pass1_neon
++.endr
++        b               3f
++
++1:
++        // Write zeros to the temp buffer for pass 2
++        movi            v16.8h,  #0
++        movi            v17.8h,  #0
++        movi            v18.8h,  #0
++        movi            v19.8h,  #0
++2:
++        subs            x1,  x1,  #1
++.rept 4
++        st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
++.endr
++        b.ne            2b
++3:
++.irp i, 0, 8, 16, 24
++        add             x0,  x4,  #(\i)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*2)
++        bl              idct32_1d_8x32_pass2_neon
++.endr
++
++        add             sp,  sp,  #2048
++
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++
++        br              x15
++endfunc
++
++.macro idct32_partial size
++function idct32x32_\size\()_add_neon
++        add             x0,  sp,  #(0*64)
++        add             x2,  x6,  #(0*2)
++        bl              idct32_1d_8x32_pass1_\size\()_neon
++.ifc \size,half
++        add             x0,  sp,  #(8*64)
++        add             x2,  x6,  #(8*2)
++        bl              idct32_1d_8x32_pass1_\size\()_neon
++.endif
++.irp i, 0, 8, 16, 24
++        add             x0,  x4,  #(\i)
++        mov             x1,  x5
++        add             x2,  sp,  #(\i*2)
++        bl              idct32_1d_8x32_pass2_\size\()_neon
++.endr
++
++        add             sp,  sp,  #2048
++
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++
++        br              x15
++endfunc
++.endm
++
++idct32_partial quarter
++idct32_partial half
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+@@ -0,0 +1,873 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++#include "neon.S"
++
++
++.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
++        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
++        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
++        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
++        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
++
++        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
++        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
++        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
++        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
++.endm
++
++// The input to and output from this macro is in the registers v16-v31,
++// and v0-v7 are used as scratch registers.
++// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
++// Depending on the width of the loop filter, we either use v16-v19
++// and v28-v31 as temp registers, or v8-v15.
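++// w2, w3 and w4 hold the E, I and H thresholds (pre-scaled to the bit
++// depth by the frontends at the end of this file), w5 the flat threshold,
++// w6 the saturation shift amount and w7 the maximum pixel value.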
++.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
++        dup             v0.8h,  w2                   // E
++        dup             v2.8h,  w3                   // I
++        dup             v3.8h,  w4                   // H
++
++        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
++        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
++        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
++        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
++        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
++        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
++        umax            v4.8h,  v4.8h,  v5.8h
++        umax            v5.8h,  v6.8h,  v7.8h
++        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
++        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
++        umax            v4.8h,  v4.8h,  v5.8h
++        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
++        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
++        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
++        ushr            v5.8h,  v5.8h,  #1
++        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
++        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
++        cmhs            v6.8h,  v0.8h,  v6.8h
++        and             v4.16b, v4.16b, v6.16b       // fm
++
++        // If no pixels need filtering, just exit as soon as possible
++        mov             x11, v4.d[0]
++        mov             x12, v4.d[1]
++        adds            x11, x11, x12
++        b.ne            1f
++        br              x10
++1:
++
++.if \wd >= 8
++        dup             v0.8h,  w5
++
++        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
++        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
++        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
++        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
++        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
++        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
++        umax            v6.8h,  v6.8h,  v2.8h
++        umax            v1.8h,  v1.8h,  \tmp1\().8h
++        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
++.if \wd == 16
++        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
++        umax            v6.8h,  v6.8h,  v1.8h
++        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
++        umax            v6.8h,  v6.8h,  \tmp2\().8h
++        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
++        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
++        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
++        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
++        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
++        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
++        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
++        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
++        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
++
++        umax            v7.8h,  v7.8h,  v2.8h
++        umax            v1.8h,  v1.8h,  v8.8h
++        umax            v9.8h,  v9.8h,  v10.8h
++        umax            v11.8h, v11.8h, v12.8h
++        // The rest of the calculation of flat8out is interleaved below
++.else
++        // The rest of the calculation of flat8in is interleaved below
++.endif
++.endif
++
++        // Calculate the normal inner loop filter for 2 or 4 pixels
++        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
++.if \wd == 16
++        umax            v7.8h,  v7.8h,  v1.8h
++        umax            v9.8h,  v9.8h,  v11.8h
++.elseif \wd == 8
++        umax            v6.8h,  v6.8h,  v1.8h
++.endif
++        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
++.if \wd == 16
++        umax            v7.8h,  v7.8h,  v9.8h
++.elseif \wd == 8
++        umax            v6.8h,  v6.8h,  \tmp2\().8h
++.endif
++        dup             \tmp2\().8h,  w6                        // left shift for saturation
++        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
++        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
++        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
++        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
++        movi            \tmp5\().8h,  #3
++.if \wd == 8
++        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
++.endif
++        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
++.if \wd == 8
++        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
++.endif
++        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
++.if \wd == 16
++        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
++.elseif \wd == 8
++        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
++.endif
++        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
++.if \wd == 16
++        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
++.endif
++        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
++
++        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
++        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int8 = 0
++        movi            v2.8h,  #4
++        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
++        movi            v3.8h,  #3
++        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
++        movi            \tmp5\().8h,  #0
++        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
++        dup             \tmp6\().8h,  w7                        // max pixel value
++.if \wd == 16
++        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
++.endif
++
++        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
++
++        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
++        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
++        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
++        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
++        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
++        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
++
++        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
++        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
++        smin            v0.8h,   v0.8h,   \tmp6\().8h
++        smin            v2.8h,   v2.8h,   \tmp6\().8h
++        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
++        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
++        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
++        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
++        bit             v24.16b, v2.16b,  v4.16b
++
++        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
++        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
++.if \wd >= 8
++        mov             x11, v6.d[0]
++.endif
++        smin            v0.8h,  v0.8h,  \tmp6\().8h
++        smin            v2.8h,  v2.8h,  \tmp6\().8h
++.if \wd >= 8
++        mov             x12, v6.d[1]
++.endif
++        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
++        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
++.if \wd >= 8
++        adds            x11, x11, x12
++.endif
++        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
++        bit             v25.16b, v2.16b,  v5.16b
++
++        // If no pixels need flat8in, jump to flat8out
++        // (or to a writeout of the inner 4 pixels, for wd=8)
++.if \wd >= 8
++.if \wd == 16
++        b.eq            6f
++.else
++        b.ne            1f
++        br              x13
++1:
++.endif
++
++        // flat8in
++        add             \tmp1\().8h, v20.8h, v21.8h
++        add             \tmp3\().8h, v22.8h, v25.8h
++        add             \tmp5\().8h, v20.8h, v22.8h
++        add             \tmp7\().8h, v23.8h, v26.8h
++        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
++        add             v0.8h,  v0.8h,  v23.8h
++        add             v0.8h,  v0.8h,  v24.8h
++        add             v0.8h,  v0.8h,  \tmp5\().8h
++        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
++        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
++        urshr           v2.8h,  v0.8h,  #3                      // out p2
++
++        add             v0.8h,  v0.8h,  \tmp3\().8h
++        add             \tmp1\().8h, v20.8h,  v23.8h
++        add             \tmp3\().8h, v24.8h,  v27.8h
++        urshr           v3.8h,  v0.8h,  #3                      // out p1
++
++        add             v0.8h,  v0.8h,  \tmp7\().8h
++        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
++        add             \tmp5\().8h, v21.8h,  v24.8h
++        add             \tmp7\().8h, v25.8h,  v27.8h
++        urshr           v4.8h,  v0.8h,  #3                      // out p0
++
++        add             v0.8h,  v0.8h,  \tmp3\().8h
++        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
++        add             \tmp1\().8h, v22.8h,  v25.8h
++        add             \tmp3\().8h, v26.8h,  v27.8h
++        urshr           v5.8h,  v0.8h,  #3                      // out q0
++
++        add             v0.8h,  v0.8h,  \tmp7\().8h
++        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
++        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
++
++        add             v0.8h,  v0.8h,  \tmp3\().8h
++        // The output here is written back into the input registers. This doesn't
++        // matter for the flat8part below, since we only update those pixels
++        // which won't be touched below.
++        bit             v21.16b, v2.16b,  v6.16b
++        bit             v22.16b, v3.16b,  v6.16b
++        bit             v23.16b, v4.16b,  v6.16b
++        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
++        bit             v24.16b, v5.16b,  v6.16b
++        bit             v25.16b, \tmp5\().16b,  v6.16b
++        bit             v26.16b, \tmp6\().16b,  v6.16b
++.endif
++.if \wd == 16
++6:
++        orr             v2.16b,  v6.16b,  v7.16b
++        mov             x11, v2.d[0]
++        mov             x12, v2.d[1]
++        adds            x11, x11, x12
++        b.ne            1f
++        // If no pixels need either flat8in or flat8out, jump to a
++        // writeout of the inner 4 pixels
++        br              x14
++1:
++
++        mov             x11, v7.d[0]
++        mov             x12, v7.d[1]
++        adds            x11, x11, x12
++        b.ne            1f
++        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
++        br              x15
++
++1:
++        // flat8out
++        // This writes all outputs into v2-v17 (skipping v6 and v16).
++        // If this part is skipped, the output is read from v21-v26 (which is the input
++        // to this section).
++        shl             v0.8h,   v16.8h,  #3     // 8 * v16
++        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
++        add             v0.8h,   v0.8h,   v17.8h
++        add             v8.8h,   v17.8h,  v18.8h
++        add             v10.8h,  v19.8h,  v20.8h
++        add             v0.8h,   v0.8h,   v8.8h
++        add             v8.8h,   v16.8h,  v17.8h
++        add             v12.8h,  v21.8h,  v22.8h
++        add             v0.8h,   v0.8h,   v10.8h
++        add             v10.8h,  v18.8h,  v25.8h
++        add             v14.8h,  v23.8h,  v24.8h
++        sub             v10.8h,  v10.8h,  v8.8h
++        add             v0.8h,   v0.8h,   v12.8h
++        add             v0.8h,   v0.8h,   v14.8h
++        add             v12.8h,  v16.8h,  v18.8h
++        add             v14.8h,  v19.8h,  v26.8h
++        urshr           v2.8h,   v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v10.8h
++        add             v8.8h,   v16.8h,  v19.8h
++        add             v10.8h,  v20.8h,  v27.8h
++        sub             v14.8h,  v14.8h,  v12.8h
++        bif             v2.16b,  v17.16b, v7.16b
++        urshr           v3.8h ,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v14.8h
++        add             v12.8h,  v16.8h,  v20.8h
++        add             v14.8h,  v21.8h,  v28.8h
++        sub             v10.8h,  v10.8h,  v8.8h
++        bif             v3.16b,  v18.16b, v7.16b
++        urshr           v4.8h,   v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v10.8h
++        add             v8.8h,   v16.8h,  v21.8h
++        add             v10.8h,  v22.8h,  v29.8h
++        sub             v14.8h,  v14.8h,  v12.8h
++        bif             v4.16b,  v19.16b, v7.16b
++        urshr           v5.8h,   v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v14.8h
++        add             v12.8h,  v16.8h,  v22.8h
++        add             v14.8h,  v23.8h,  v30.8h
++        sub             v10.8h,  v10.8h,  v8.8h
++        bif             v5.16b,  v20.16b, v7.16b
++        urshr           v6.8h,   v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v10.8h
++        add             v10.8h,  v16.8h,  v23.8h
++        sub             v14.8h,  v14.8h,  v12.8h
++        add             v12.8h,  v24.8h,  v31.8h
++        bif             v6.16b,  v21.16b, v7.16b
++        urshr           v8.8h,   v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v14.8h
++        sub             v10.8h,  v12.8h,  v10.8h
++        add             v12.8h,  v17.8h,  v24.8h
++        add             v14.8h,  v25.8h,  v31.8h
++        bif             v8.16b,  v22.16b, v7.16b
++        urshr           v9.8h,   v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v10.8h
++        sub             v14.8h,  v14.8h,  v12.8h
++        add             v12.8h,  v26.8h,  v31.8h
++        bif             v9.16b,  v23.16b, v7.16b
++        urshr           v10.8h,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v14.8h
++        add             v14.8h,  v18.8h,  v25.8h
++        add             v18.8h,  v19.8h,  v26.8h
++        sub             v12.8h,  v12.8h,  v14.8h
++        add             v14.8h,  v27.8h,  v31.8h
++        bif             v10.16b, v24.16b, v7.16b
++        urshr           v11.8h,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v12.8h
++        add             v12.8h,  v20.8h,  v27.8h
++        sub             v14.8h,  v14.8h,  v18.8h
++        add             v18.8h,  v28.8h,  v31.8h
++        bif             v11.16b, v25.16b, v7.16b
++        sub             v18.8h,  v18.8h,  v12.8h
++        urshr           v12.8h,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v14.8h
++        add             v14.8h,  v21.8h,  v28.8h
++        add             v20.8h,  v29.8h,  v31.8h
++        bif             v12.16b, v26.16b, v7.16b
++        urshr           v13.8h,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v18.8h
++        sub             v20.8h,  v20.8h,  v14.8h
++        add             v18.8h,  v22.8h,  v29.8h
++        add             v22.8h,  v30.8h,  v31.8h
++        bif             v13.16b, v27.16b, v7.16b
++        urshr           v14.8h,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v20.8h
++        sub             v22.8h,  v22.8h,  v18.8h
++        bif             v14.16b, v28.16b, v7.16b
++        urshr           v15.8h,  v0.8h,   #4
++
++        add             v0.8h,   v0.8h,   v22.8h
++        bif             v15.16b, v29.16b, v7.16b
++        urshr           v17.8h,  v0.8h,   #4
++        bif             v17.16b, v30.16b, v7.16b
++.endif
++.endm
++
++// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
++// while we need those for inputs/outputs in wd=16 and use v8-v15
++// for temp registers there instead.
++function vp9_loop_filter_4
++        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
++        ret
++endfunc
++
++function vp9_loop_filter_8
++        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
++        ret
++endfunc
++
++function vp9_loop_filter_16
++        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
++        ret
++endfunc
++
++.macro loop_filter_4
++        bl              vp9_loop_filter_4
++.endm
++
++.macro loop_filter_8
++        // calculate alternative 'return' targets
++        adr             x13, 6f
++        bl              vp9_loop_filter_8
++.endm
++
++.macro loop_filter_16
++        // calculate alternative 'return' targets
++        adr             x14, 7f
++        adr             x15, 8f
++        bl              vp9_loop_filter_16
++.endm
++
++
++// The public functions in this file have got the following signature:
++// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
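++//
++// As an illustration, each "bpp_frontends" invocation further down (e.g.
++// "bpp_frontends vp9_loop_filter_v_4_8") emits one such entry point per bit
++// depth, ff_vp9_loop_filter_v_4_8_10_neon and ff_vp9_loop_filter_v_4_8_12_neon,
++// which scale the mb_lim/lim/hev_thr thresholds up by (bpp - 8) bits and then
++// call the shared \func\()_16_neon implementation.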
++
++.macro bpp_frontend func, bpp, push
++function ff_\func\()_\bpp\()_neon, export=1
++.if \push
++        mov             x16, x30
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++.endif
++        lsl             w2,  w2,  #\bpp - 8
++        lsl             w3,  w3,  #\bpp - 8
++        lsl             w4,  w4,  #\bpp - 8
++        mov             x5,  #1 << (\bpp - 8)
++        mov             x6,  #16 - \bpp
++        mov             x7,  #((1 << \bpp) - 1)
++.if \push
++        bl              \func\()_16_neon
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x16
++.else
++        b               \func\()_16_neon
++.endif
++endfunc
++.endm
++
++.macro bpp_frontends func, push=0
++        bpp_frontend    \func, 10, \push
++        bpp_frontend    \func, 12, \push
++.endm
++
++.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
++function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
++        mov             x16, x30
++.if \push
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++.endif
++        lsl             w2,  w2,  #\bpp - 8
++        lsl             w3,  w3,  #\bpp - 8
++        lsl             w4,  w4,  #\bpp - 8
++        mov             x5,  #1 << (\bpp - 8)
++        mov             x6,  #16 - \bpp
++        mov             x7,  #((1 << \bpp) - 1)
++        bl              \func\()_\int_suffix\()_16_neon
++.ifc \dir,h
++        add             x0,  x0,  x1, lsl #3
++.else
++        add             x0,  x0,  #16
++.endif
++        bl              \func\()_\int_suffix\()_16_neon
++.if \push
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++.endif
++        br              x16
++endfunc
++.endm
++
++.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
++        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
++        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
++.endm
++
++.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
++function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
++        mov             x16, x30
++        lsr             w8,  w2,  #8
++        lsr             w14, w3,  #8
++        lsr             w15, w4,  #8
++        and             w2,  w2,  #0xff
++        and             w3,  w3,  #0xff
++        and             w4,  w4,  #0xff
++        lsl             w2,  w2,  #\bpp - 8
++        lsl             w3,  w3,  #\bpp - 8
++        lsl             w4,  w4,  #\bpp - 8
++        mov             x5,  #1 << (\bpp - 8)
++        mov             x6,  #16 - \bpp
++        mov             x7,  #((1 << \bpp) - 1)
++        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
++.ifc \dir,h
++        add             x0,  x0,  x1, lsl #3
++.else
++        add             x0,  x0,  #16
++.endif
++        lsl             w2,  w8,  #\bpp - 8
++        lsl             w3,  w14, #\bpp - 8
++        lsl             w4,  w15, #\bpp - 8
++        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
++        br              x16
++endfunc
++.endm
++
++.macro bpp_frontends_mix2 wd1, wd2
++        bpp_frontend_mix2 \wd1, \wd2, v, 10
++        bpp_frontend_mix2 \wd1, \wd2, v, 12
++        bpp_frontend_mix2 \wd1, \wd2, h, 10
++        bpp_frontend_mix2 \wd1, \wd2, h, 12
++.endm
++
++function vp9_loop_filter_v_4_8_16_neon
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #2
++        ld1             {v20.8h}, [x9], x1 // p3
++        ld1             {v24.8h}, [x0], x1 // q0
++        ld1             {v21.8h}, [x9], x1 // p2
++        ld1             {v25.8h}, [x0], x1 // q1
++        ld1             {v22.8h}, [x9], x1 // p1
++        ld1             {v26.8h}, [x0], x1 // q2
++        ld1             {v23.8h}, [x9], x1 // p0
++        ld1             {v27.8h}, [x0], x1 // q3
++        sub             x0,  x0,  x1, lsl #2
++        sub             x9,  x9,  x1, lsl #1
++
++        loop_filter_4
++
++        st1             {v22.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #1
++
++        br              x10
++endfunc
++
++bpp_frontends vp9_loop_filter_v_4_8
++
++function vp9_loop_filter_h_4_8_16_neon
++        mov             x10, x30
++        sub             x9,  x0,  #8
++        add             x0,  x9,  x1, lsl #2
++        ld1             {v20.8h}, [x9], x1
++        ld1             {v24.8h}, [x0], x1
++        ld1             {v21.8h}, [x9], x1
++        ld1             {v25.8h}, [x0], x1
++        ld1             {v22.8h}, [x9], x1
++        ld1             {v26.8h}, [x0], x1
++        ld1             {v23.8h}, [x9], x1
++        ld1             {v27.8h}, [x0], x1
++
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #8
++
++        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        loop_filter_4
++
++        // Move x9 forward by 2 pixels; we don't need to rewrite the
++        // outermost 2 pixels since they aren't changed.
++        add             x9,  x9,  #4
++        add             x0,  x9,  x1, lsl #2
++
++        // We only will write the mid 4 pixels back; after the loop filter,
++        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
++        // We need to transpose them to columns, done with a 4x8 transpose
++        // (which in practice is two 4x4 transposes of the two 4x4 halves
++        // of the 8x4 pixels; into 4x8 pixels).
++        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.d}[0], [x9], x1
++        st1             {v22.d}[1], [x0], x1
++        st1             {v23.d}[0], [x9], x1
++        st1             {v23.d}[1], [x0], x1
++        st1             {v24.d}[0], [x9], x1
++        st1             {v24.d}[1], [x0], x1
++        st1             {v25.d}[0], [x9], x1
++        st1             {v25.d}[1], [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #4
++
++        br              x10
++endfunc
++
++bpp_frontends vp9_loop_filter_h_4_8
++
++function vp9_loop_filter_v_8_8_16_neon
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #2
++        ld1             {v20.8h}, [x9], x1 // p3
++        ld1             {v24.8h}, [x0], x1 // q0
++        ld1             {v21.8h}, [x9], x1 // p2
++        ld1             {v25.8h}, [x0], x1 // q1
++        ld1             {v22.8h}, [x9], x1 // p1
++        ld1             {v26.8h}, [x0], x1 // q2
++        ld1             {v23.8h}, [x9], x1 // p0
++        ld1             {v27.8h}, [x0], x1 // q3
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #2
++        add             x9,  x9,  x1
++
++        loop_filter_8
++
++        st1             {v21.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v22.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v26.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #1
++        sub             x0,  x0,  x1
++
++        br              x10
++6:
++        sub             x9,  x0,  x1, lsl #1
++        st1             {v22.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #1
++        br              x10
++endfunc
++
++bpp_frontends vp9_loop_filter_v_8_8
++
++function vp9_loop_filter_h_8_8_16_neon
++        mov             x10, x30
++        sub             x9,  x0,  #8
++        add             x0,  x9,  x1, lsl #2
++        ld1             {v20.8h}, [x9], x1
++        ld1             {v24.8h}, [x0], x1
++        ld1             {v21.8h}, [x9], x1
++        ld1             {v25.8h}, [x0], x1
++        ld1             {v22.8h}, [x9], x1
++        ld1             {v26.8h}, [x0], x1
++        ld1             {v23.8h}, [x9], x1
++        ld1             {v27.8h}, [x0], x1
++
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #8
++
++        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        loop_filter_8
++
++        add             x0,  x9,  x1, lsl #2
++
++        // Even though only 6 pixels per row have been changed, we write the
++        // full 8 pixel registers.
++        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v20.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v21.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        st1             {v22.8h}, [x9], x1
++        st1             {v26.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v27.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #8
++
++        br              x10
++6:
++        // If we didn't need to do the flat8in part, we use the same writeback
++        // as in loop_filter_h_4_8.
++        add             x9,  x9,  #4
++        add             x0,  x9,  x1, lsl #2
++        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.d}[0], [x9], x1
++        st1             {v22.d}[1], [x0], x1
++        st1             {v23.d}[0], [x9], x1
++        st1             {v23.d}[1], [x0], x1
++        st1             {v24.d}[0], [x9], x1
++        st1             {v24.d}[1], [x0], x1
++        st1             {v25.d}[0], [x9], x1
++        st1             {v25.d}[1], [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #4
++        br              x10
++endfunc
++
++bpp_frontends vp9_loop_filter_h_8_8
++
++bpp_frontends_mix2 4, 4
++bpp_frontends_mix2 4, 8
++bpp_frontends_mix2 8, 4
++bpp_frontends_mix2 8, 8
++
++function vp9_loop_filter_v_16_8_16_neon
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #3
++        ld1             {v16.8h}, [x9], x1 // p7
++        ld1             {v24.8h}, [x0], x1 // q0
++        ld1             {v17.8h}, [x9], x1 // p6
++        ld1             {v25.8h}, [x0], x1 // q1
++        ld1             {v18.8h}, [x9], x1 // p5
++        ld1             {v26.8h}, [x0], x1 // q2
++        ld1             {v19.8h}, [x9], x1 // p4
++        ld1             {v27.8h}, [x0], x1 // q3
++        ld1             {v20.8h}, [x9], x1 // p3
++        ld1             {v28.8h}, [x0], x1 // q4
++        ld1             {v21.8h}, [x9], x1 // p2
++        ld1             {v29.8h}, [x0], x1 // q5
++        ld1             {v22.8h}, [x9], x1 // p1
++        ld1             {v30.8h}, [x0], x1 // q6
++        ld1             {v23.8h}, [x9], x1 // p0
++        ld1             {v31.8h}, [x0], x1 // q7
++        sub             x9,  x9,  x1, lsl #3
++        sub             x0,  x0,  x1, lsl #3
++        add             x9,  x9,  x1
++
++        loop_filter_16
++
++        // If we did the flat8out part, we get the output in
++        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
++        // store v2-v9 there, and v10-v17 into x0.
++        st1             {v2.8h},  [x9], x1
++        st1             {v10.8h}, [x0], x1
++        st1             {v3.8h},  [x9], x1
++        st1             {v11.8h}, [x0], x1
++        st1             {v4.8h},  [x9], x1
++        st1             {v12.8h}, [x0], x1
++        st1             {v5.8h},  [x9], x1
++        st1             {v13.8h}, [x0], x1
++        st1             {v6.8h},  [x9], x1
++        st1             {v14.8h}, [x0], x1
++        st1             {v8.8h},  [x9], x1
++        st1             {v15.8h}, [x0], x1
++        st1             {v9.8h},  [x9], x1
++        st1             {v17.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  x1
++
++        br              x10
++8:
++        add             x9,  x9,  x1, lsl #2
++        // If we didn't do the flat8out part, the output is left in the
++        // input registers.
++        st1             {v21.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v22.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v26.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #1
++        sub             x0,  x0,  x1
++        br              x10
++7:
++        sub             x9,  x0,  x1, lsl #1
++        st1             {v22.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #1
++        br              x10
++endfunc
++
++bpp_frontends vp9_loop_filter_v_16_8, push=1
++bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
++
++function vp9_loop_filter_h_16_8_16_neon
++        mov             x10, x30
++        sub             x9,  x0,  #16
++        ld1             {v16.8h}, [x9], x1
++        ld1             {v24.8h}, [x0], x1
++        ld1             {v17.8h}, [x9], x1
++        ld1             {v25.8h}, [x0], x1
++        ld1             {v18.8h}, [x9], x1
++        ld1             {v26.8h}, [x0], x1
++        ld1             {v19.8h}, [x9], x1
++        ld1             {v27.8h}, [x0], x1
++        ld1             {v20.8h}, [x9], x1
++        ld1             {v28.8h}, [x0], x1
++        ld1             {v21.8h}, [x9], x1
++        ld1             {v29.8h}, [x0], x1
++        ld1             {v22.8h}, [x9], x1
++        ld1             {v30.8h}, [x0], x1
++        ld1             {v23.8h}, [x9], x1
++        ld1             {v31.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        sub             x9,  x9,  x1, lsl #3
++
++        // The 16x8 pixels read above are in two 8x8 blocks; the left
++        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
++        // of this, to get one column per register.
++        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
++        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
++
++        loop_filter_16
++
++        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
++        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
++
++        st1             {v16.8h}, [x9], x1
++        st1             {v10.8h}, [x0], x1
++        st1             {v2.8h},  [x9], x1
++        st1             {v11.8h}, [x0], x1
++        st1             {v3.8h},  [x9], x1
++        st1             {v12.8h}, [x0], x1
++        st1             {v4.8h},  [x9], x1
++        st1             {v13.8h}, [x0], x1
++        st1             {v5.8h},  [x9], x1
++        st1             {v14.8h}, [x0], x1
++        st1             {v6.8h},  [x9], x1
++        st1             {v15.8h}, [x0], x1
++        st1             {v8.8h},  [x9], x1
++        st1             {v17.8h}, [x0], x1
++        st1             {v9.8h},  [x9], x1
++        st1             {v31.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++
++        br              x10
++8:
++        // The same writeback as in loop_filter_h_8_8
++        sub             x9,  x0,  #8
++        add             x0,  x9,  x1, lsl #2
++        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v20.8h}, [x9], x1
++        st1             {v24.8h}, [x0], x1
++        st1             {v21.8h}, [x9], x1
++        st1             {v25.8h}, [x0], x1
++        st1             {v22.8h}, [x9], x1
++        st1             {v26.8h}, [x0], x1
++        st1             {v23.8h}, [x9], x1
++        st1             {v27.8h}, [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #8
++        br              x10
++7:
++        // The same writeback as in loop_filter_h_4_8
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #2
++        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.d}[0], [x9], x1
++        st1             {v22.d}[1], [x0], x1
++        st1             {v23.d}[0], [x9], x1
++        st1             {v23.d}[1], [x0], x1
++        st1             {v24.d}[0], [x9], x1
++        st1             {v24.d}[1], [x0], x1
++        st1             {v25.d}[0], [x9], x1
++        st1             {v25.d}[1], [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        add             x0,  x0,  #4
++        br              x10
++endfunc
++
++bpp_frontends vp9_loop_filter_h_16_8, push=1
++bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
+@@ -0,0 +1,1334 @@
++/*
++ * Copyright (c) 2016 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++#include "neon.S"
++
++
++// The main loop filter macro is templated and can produce filters for
++// vectors of 8 or 16 bytes. The register mapping throughout the filter
++// is close to identical to the arm version (please try to maintain this,
++// if either is changed!). When the arm version uses e.g. d20 for the
++// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
++// on vector length.
++//
++// The number of elements in the vector is passed in via the macro parameter
++// \sz, which is either .8b or .16b. For simple instructions that don't
++// lengthen or narrow things, this can easily be templated like this:
++//      uabd            v4\sz,  v20\sz, v21\sz
++//
++// For instructions that lengthen or narrow content, the arm version would
++// have used q registers. For these instructions, we have macros that expand
++// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
++// pair, depending on the \sz parameter. Wherever the arm version would have
++// used a q register, these macros instead take two v registers, i.e. q3
++// is mapped to v6+v7. For the case with 8 byte input vectors, such a
++// lengthening operation is only stored in v6.8h (what was in q3 in the arm
++// case), while the 16 byte input vectors will use v6.8h + v7.8h.
++// Such a macro invocation would look like this:
++//      uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
++//
++// That is, in the 8 byte input vector case, the second register in these
++// register pairs will be unused.
++// Unfortunately, this makes the code quite hard to read. For readability,
++// see the arm version instead.
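++//
++// Concretely, with \sz == .16b the example invocation above expands to
++//      uaddl           v8.8h,  v17.8b,  v18.8b
++//      uaddl2          v9.8h,  v17.16b, v18.16b
++// while with \sz == .8b only the first uaddl is emitted and v9 is left
++// untouched (cf. the uaddl_sz macro below).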
++
++
++.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
++        add             \dst1,  \in1,  \in3
++.ifc \sz, .16b
++        add             \dst2,  \in2,  \in4
++.endif
++.endm
++
++.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
++        sub             \dst1,  \in1,  \in3
++.ifc \sz, .16b
++        sub             \dst2,  \in2,  \in4
++.endif
++.endm
++
++.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
++        uaddw           \dst1,  \in1, \in3\().8b
++.ifc \sz, .16b
++        uaddw2          \dst2,  \in2, \in3\().16b
++.endif
++.endm
++
++.macro usubw_sz dst1, dst2, in1, in2, in3, sz
++        usubw           \dst1,  \in1, \in3\().8b
++.ifc \sz, .16b
++        usubw2          \dst2,  \in2, \in3\().16b
++.endif
++.endm
++
++.macro usubl_sz dst1, dst2, in1, in2, sz
++        usubl           \dst1,  \in1\().8b,  \in2\().8b
++.ifc \sz, .16b
++        usubl2          \dst2,  \in1\().16b, \in2\().16b
++.endif
++.endm
++
++.macro sqxtn_sz dst, in1, in2, sz
++        sqxtn           \dst\().8b,  \in1
++.ifc \sz, .16b
++        sqxtn2          \dst\().16b, \in2
++.endif
++.endm
++
++.macro sqxtun_sz dst, in1, in2, sz
++        sqxtun          \dst\().8b,  \in1
++.ifc \sz, .16b
++        sqxtun2         \dst\().16b, \in2
++.endif
++.endm
++
++.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
++        mul             \dst1,  \in1,  \in3
++.ifc \sz, .16b
++        mul             \dst2,  \in2,  \in4
++.endif
++.endm
++
++.macro saddw_sz dst1, dst2, in1, in2, in3, sz
++        saddw           \dst1,  \in1, \in3\().8b
++.ifc \sz, .16b
++        saddw2          \dst2,  \in2, \in3\().16b
++.endif
++.endm
++
++.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
++        ssubw           \dst1,  \in1, \in3\().8b
++.ifc \sz, .16b
++        ssubw2          \dst2,  \in2, \in3\().16b
++.endif
++.endm
++
++.macro uxtl_sz dst1, dst2, in, sz
++        uxtl            \dst1,  \in\().8b
++.ifc \sz, .16b
++        uxtl2           \dst2,  \in\().16b
++.endif
++.endm
++
++.macro uaddl_sz dst1, dst2, in1, in2, sz
++        uaddl           \dst1,  \in1\().8b,  \in2\().8b
++.ifc \sz, .16b
++        uaddl2          \dst2,  \in1\().16b, \in2\().16b
++.endif
++.endm
++
++.macro rshrn_sz dst, in1, in2, shift, sz
++        rshrn           \dst\().8b,  \in1, \shift
++.ifc \sz, .16b
++        rshrn2          \dst\().16b, \in2, \shift
++.endif
++.endm
++
++.macro ushll_sz dst1, dst2, in, shift, sz
++        ushll           \dst1,  \in\().8b,  \shift
++.ifc \sz, .16b
++        ushll2          \dst2,  \in\().16b, \shift
++.endif
++.endm
++
++// The input to and output from this macro is in the registers v16-v31,
++// and v0-v7 are used as scratch registers.
++// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
++// Depending on the width of the loop filter, we either use v16-v19
++// and v28-v31 as temp registers, or v8-v15.
++// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
++// tmpq2 == tmp3 + tmp4, etc.
++.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
++.if \mix == 0
++        dup             v0\sz,  w2        // E
++        dup             v2\sz,  w3        // I
++        dup             v3\sz,  w4        // H
++.else
++        dup             v0.8h,  w2        // E
++        dup             v2.8h,  w3        // I
++        dup             v3.8h,  w4        // H
++        rev16           v1.16b, v0.16b    // E
++        rev16           v4.16b, v2.16b    // I
++        rev16           v5.16b, v3.16b    // H
++        uzp1            v0.16b, v0.16b, v1.16b
++        uzp1            v2.16b, v2.16b, v4.16b
++        uzp1            v3.16b, v3.16b, v5.16b
++.endif
++
++        uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)
++        uabd            v5\sz,  v21\sz, v22\sz        // abs(p2 - p1)
++        uabd            v6\sz,  v22\sz, v23\sz        // abs(p1 - p0)
++        uabd            v7\sz,  v24\sz, v25\sz        // abs(q0 - q1)
++        uabd            \tmp1\sz,  v25\sz, v26\sz     // abs(q1 - q2)
++        uabd            \tmp2\sz,  v26\sz, v27\sz     // abs(q2 - q3)
++        umax            v4\sz,  v4\sz,  v5\sz
++        umax            v5\sz,  v6\sz,  v7\sz
++        umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
++        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
++        umax            v4\sz,  v4\sz,  v5\sz
++        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
++        uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
++        umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
++        ushr            v5\sz,  v5\sz,  #1
++        cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
++        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
++        cmhs            v5\sz,  v0\sz,  v6\sz
++        and             v4\sz,  v4\sz,  v5\sz         // fm
++
++        // If no pixels need filtering, just exit as soon as possible
++        mov             x5,  v4.d[0]
++.ifc \sz, .16b
++        mov             x6,  v4.d[1]
++        adds            x5,  x5,  x6
++        b.eq            9f
++.else
++        cbz             x5,  9f
++.endif
++
++.if \wd >= 8
++        movi            v0\sz,  #1
++
++        uabd            v6\sz,  v20\sz, v23\sz    // abs(p3 - p0)
++        uabd            v2\sz,  v21\sz, v23\sz    // abs(p2 - p0)
++        uabd            v1\sz,  v22\sz, v23\sz    // abs(p1 - p0)
++        uabd            \tmp1\sz,  v25\sz, v24\sz // abs(q1 - q0)
++        uabd            \tmp2\sz,  v26\sz, v24\sz // abs(q2 - q0)
++        uabd            \tmp3\sz,  v27\sz, v24\sz // abs(q3 - q0)
++        umax            v6\sz,  v6\sz,  v2\sz
++        umax            v1\sz,  v1\sz,  \tmp1\sz
++        umax            \tmp2\sz,  \tmp2\sz,  \tmp3\sz
++.if \wd == 16
++        uabd            v7\sz,  v16\sz, v23\sz    // abs(p7 - p0)
++        umax            v6\sz,  v6\sz,  v1\sz
++        uabd            v2\sz,  v17\sz, v23\sz    // abs(p6 - p0)
++        umax            v6\sz,  v6\sz,  \tmp2\sz
++        uabd            v1\sz,  v18\sz, v23\sz    // abs(p5 - p0)
++        cmhs            v6\sz,  v0\sz,  v6\sz     // flat8in
++        uabd            v8\sz,  v19\sz, v23\sz    // abs(p4 - p0)
++        and             v6\sz,  v6\sz,  v4\sz     // flat8in && fm
++        uabd            v9\sz,  v28\sz, v24\sz    // abs(q4 - q0)
++        bic             v4\sz,  v4\sz,  v6\sz     // fm && !flat8in
++        uabd            v10\sz, v29\sz, v24\sz    // abs(q5 - q0)
++        uabd            v11\sz, v30\sz, v24\sz    // abs(q6 - q0)
++        uabd            v12\sz, v31\sz, v24\sz    // abs(q7 - q0)
++
++        umax            v7\sz,  v7\sz,  v2\sz
++        umax            v1\sz,  v1\sz,  v8\sz
++        umax            v9\sz,  v9\sz,  v10\sz
++        umax            v11\sz, v11\sz, v12\sz
++        // The rest of the calculation of flat8out is interleaved below
++.else
++        // The rest of the calculation of flat8in is interleaved below
++.endif
++.endif
++
++        // Calculate the normal inner loop filter for 2 or 4 pixels
++        uabd            v5\sz,  v22\sz, v23\sz // abs(p1 - p0)
++.if \wd == 16
++        umax            v7\sz,  v7\sz,  v1\sz
++        umax            v9\sz,  v9\sz,  v11\sz
++.elseif \wd == 8
++        umax            v6\sz,  v6\sz,  v1\sz
++.endif
++        uabd            v1\sz,  v25\sz, v24\sz // abs(q1 - q0)
++.if \wd == 16
++        umax            v7\sz,  v7\sz,  v9\sz
++.elseif \wd == 8
++        umax            v6\sz,  v6\sz,  \tmp2\sz
++.endif
++        usubl_sz        \tmp1\().8h,  \tmp2\().8h,  v22,  v25, \sz // p1 - q1
++        umax            v5\sz,  v5\sz,  v1\sz  // max(abs(p1 - p0), abs(q1 - q0))
++.if \mix != 0
++        mov             v1.d[0], x11
++.endif
++        usubl_sz        \tmp3\().8h,  \tmp4\().8h,  v24,  v23, \sz // q0 - p0
++        movi            \tmp5\().8h,  #3
++.if \wd == 8
++        cmhs            v6\sz,  v0\sz,  v6\sz  // flat8in
++.endif
++.if \mix != 0
++        sxtl            v1.8h,  v1.8b
++.endif
++        cmhs            v5\sz,  v3\sz,  v5\sz  // !hev
++.if \wd == 8
++        // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
++.if \mix != 0
++        and             v6\sz,  v6\sz,  v1.16b
++.endif
++        and             v6\sz,  v6\sz,  v4\sz  // flat8in && fm
++.endif
++        sqxtn_sz        \tmp1,        \tmp1\().8h,  \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
++.if \wd == 16
++        cmhs            v7\sz,  v0\sz,  v7\sz  // flat8out
++.elseif \wd == 8
++        bic             v4\sz,  v4\sz,  v6\sz  // fm && !flat8in
++.endif
++        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
++.if \wd == 16
++        and             v7\sz,  v7\sz,  v6\sz  // flat8out && flat8in && fm
++.endif
++
++        mul_sz          \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
++        bic             \tmp1\sz,  \tmp1\sz,  v5\sz    // if (!hev) av_clip_int8 = 0
++        movi            v2\sz,  #4
++        saddw_sz        \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
++        movi            v3\sz,  #3
++        sqxtn_sz        \tmp1,        \tmp3\().8h,  \tmp4\().8h, \sz       // f
++.if \wd == 16
++        bic             v6\sz,  v6\sz,  v7\sz  // fm && flat8in && !flat8out
++.endif
++
++        sqadd           \tmp3\sz,  \tmp1\sz,  v2\sz // FFMIN(f + 4, 127)
++        sqadd           \tmp4\sz,  \tmp1\sz,  v3\sz // FFMIN(f + 3, 127)
++        uxtl_sz         v0.8h,  v1.8h,  v23, \sz    // p0
++        sshr            \tmp3\sz,  \tmp3\sz,  #3    // f1
++        sshr            \tmp4\sz,  \tmp4\sz,  #3    // f2
++
++        uxtl_sz         v2.8h,  v3.8h,  v24, \sz    // q0
++        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp4, \sz // p0 + f2
++        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q0 - f1
++        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz    // out p0
++        sqxtun_sz       v1,  v2.8h,  v3.8h,  \sz    // out q0
++        srshr           \tmp3\sz, \tmp3\sz, #1      // f = (f1 + 1) >> 1
++        bit             v23\sz, v0\sz,  v4\sz       // if (fm && !flat8in)
++        bit             v24\sz, v1\sz,  v4\sz
++
++        uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
++        uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
++.if \wd >= 8
++        mov             x5,  v6.d[0]
++.ifc \sz, .16b
++        mov             x6,  v6.d[1]
++.endif
++.endif
++        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
++        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
++        sqxtun_sz       v0,  v0.8h,  v1.8h, \sz     // out p1
++        sqxtun_sz       v2,  v2.8h,  v3.8h, \sz     // out q1
++.if \wd >= 8
++.ifc \sz, .16b
++        adds            x5,  x5,  x6
++.endif
++.endif
++        bit             v22\sz, v0\sz,  v5\sz       // if (!hev && fm && !flat8in)
++        bit             v25\sz, v2\sz,  v5\sz
++
++        // If no pixels need flat8in, jump to flat8out
++        // (or to a writeout of the inner 4 pixels, for wd=8)
++.if \wd >= 8
++.ifc \sz, .16b
++        b.eq            6f
++.else
++        cbz             x5,  6f
++.endif
++
++        // flat8in
++        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v20, v21, \sz
++        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v22, v25, \sz
++        uaddl_sz        \tmp5\().8h, \tmp6\().8h,  v20, v22, \sz
++        uaddl_sz        \tmp7\().8h, \tmp8\().8h,  v23, v26, \sz
++        add_sz          v0.8h,  v1.8h,  \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
++        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v23, \sz
++        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v24, \sz
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp5\().8h, \tmp6\().8h, \sz
++        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
++        sub_sz          \tmp7\().8h, \tmp8\().8h,  \tmp7\().8h, \tmp8\().8h,  \tmp5\().8h, \tmp6\().8h, \sz
++        rshrn_sz        v2,  v0.8h,  v1.8h,  #3,  \sz // out p2
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
++        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v20,  v23, \sz
++        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v24,  v27, \sz
++        rshrn_sz        v3,  v0.8h,  v1.8h,  #3,  \sz // out p1
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
++        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
++        uaddl_sz        \tmp5\().8h, \tmp6\().8h,  v21,  v24, \sz
++        uaddl_sz        \tmp7\().8h, \tmp8\().8h,  v25,  v27, \sz
++        rshrn_sz        v4,  v0.8h,  v1.8h,  #3,  \sz // out p0
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
++        sub_sz          \tmp7\().8h, \tmp8\().8h,  \tmp7\().8h, \tmp8\().8h,  \tmp5\().8h, \tmp6\().8h, \sz
++        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v22,  v25, \sz
++        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v26,  v27, \sz
++        rshrn_sz        v5,  v0.8h,  v1.8h,  #3,  \sz // out q0
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
++        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
++        rshrn_sz        \tmp5,  v0.8h,  v1.8h,  #3,  \sz // out q1
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
++        // The output here is written back into the input registers. This doesn't
++        // matter for the flat8part below, since we only update those pixels
++        // which won't be touched below.
++        bit             v21\sz, v2\sz,  v6\sz
++        bit             v22\sz, v3\sz,  v6\sz
++        bit             v23\sz, v4\sz,  v6\sz
++        rshrn_sz        \tmp6,  v0.8h,  v1.8h,  #3,  \sz // out q2
++        bit             v24\sz, v5\sz,  v6\sz
++        bit             v25\sz, \tmp5\sz,  v6\sz
++        bit             v26\sz, \tmp6\sz,  v6\sz
++.endif
++.if \wd == 16
++6:
++        orr             v2\sz,  v6\sz,  v7\sz
++        mov             x5,  v2.d[0]
++.ifc \sz, .16b
++        mov             x6,  v2.d[1]
++        adds            x5,  x5,  x6
++        b.ne            1f
++.else
++        cbnz            x5,  1f
++.endif
++        // If no pixels needed flat8in nor flat8out, jump to a
++        // writeout of the inner 4 pixels
++        br              x14
++1:
++
++        mov             x5,  v7.d[0]
++.ifc \sz, .16b
++        mov             x6,  v7.d[1]
++        adds            x5,  x5,  x6
++        b.ne            1f
++.else
++        cbnz            x5,  1f
++.endif
++        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
++        br              x15
++
++1:
++        // flat8out
++        // This writes all outputs into v2-v17 (skipping v6 and v16).
++        // If this part is skipped, the output is read from v21-v26 (which is the input
++        // to this section).
++        ushll_sz        v0.8h,  v1.8h,  v16,  #3,  \sz           // 8 * v16
++        usubw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v16, \sz // 7 * v16
++        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v17, \sz
++        uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
++        uaddl_sz        v10.8h, v11.8h, v19, v20, \sz
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v8.8h,  v9.8h,  \sz
++        uaddl_sz        v8.8h,  v9.8h,  v16, v17, \sz
++        uaddl_sz        v12.8h, v13.8h, v21, v22, \sz
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
++        uaddl_sz        v10.8h, v11.8h, v18, v25, \sz
++        uaddl_sz        v14.8h, v15.8h, v23, v24, \sz
++        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v16, v18, \sz
++        uaddl_sz        v14.8h, v15.8h, v19, v26, \sz
++        rshrn_sz        v2,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
++        uaddl_sz        v8.8h,  v9.8h,  v16, v19, \sz
++        uaddl_sz        v10.8h, v11.8h, v20, v27, \sz
++        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
++        bif             v2\sz,  v17\sz, v7\sz
++        rshrn_sz        v3,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v16, v20, \sz
++        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
++        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
++        bif             v3\sz,  v18\sz, v7\sz
++        rshrn_sz        v4,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
++        uaddl_sz        v8.8h,  v9.8h,  v16, v21, \sz
++        uaddl_sz        v10.8h, v11.8h, v22, v29, \sz
++        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
++        bif             v4\sz,  v19\sz, v7\sz
++        rshrn_sz        v5,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v16, v22, \sz
++        uaddl_sz        v14.8h, v15.8h, v23, v30, \sz
++        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
++        bif             v5\sz,  v20\sz, v7\sz
++        rshrn_sz        v6,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
++        uaddl_sz        v10.8h, v11.8h, v16, v23, \sz
++        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v24, v31, \sz
++        bif             v6\sz,  v21\sz, v7\sz
++        rshrn_sz        v8,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
++        sub_sz          v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v17, v24, \sz
++        uaddl_sz        v14.8h, v15.8h, v25, v31, \sz
++        bif             v8\sz,  v22\sz, v7\sz
++        rshrn_sz        v9,  v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
++        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v26, v31, \sz
++        bif             v9\sz,  v23\sz, v7\sz
++        rshrn_sz        v10, v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
++        uaddl_sz        v14.8h, v15.8h, v18, v25, \sz
++        uaddl_sz        v18.8h, v19.8h, v19, v26, \sz
++        sub_sz          v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
++        uaddl_sz        v14.8h, v15.8h, v27, v31, \sz
++        bif             v10\sz, v24\sz, v7\sz
++        rshrn_sz        v11, v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
++        uaddl_sz        v12.8h, v13.8h, v20, v27, \sz
++        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
++        uaddl_sz        v18.8h, v19.8h, v28, v31, \sz
++        bif             v11\sz, v25\sz, v7\sz
++        sub_sz          v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
++        rshrn_sz        v12, v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
++        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
++        uaddl_sz        v20.8h, v21.8h, v29, v31, \sz
++        bif             v12\sz, v26\sz, v7\sz
++        rshrn_sz        v13, v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v18.8h, v19.8h, \sz
++        sub_sz          v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
++        uaddl_sz        v18.8h, v19.8h, v22, v29, \sz
++        uaddl_sz        v22.8h, v23.8h, v30, v31, \sz
++        bif             v13\sz, v27\sz, v7\sz
++        rshrn_sz        v14, v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v20.8h, v21.8h, \sz
++        sub_sz          v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
++        bif             v14\sz, v28\sz, v7\sz
++        rshrn_sz        v15, v0.8h,  v1.8h,  #4,  \sz
++
++        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v22.8h, v23.8h, \sz
++        bif             v15\sz, v29\sz, v7\sz
++        rshrn_sz        v17, v0.8h,  v1.8h,  #4,  \sz
++        bif             v17\sz, v30\sz, v7\sz
++.endif
++.endm
++
++// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
++// while we need those for inputs/outputs in wd=16 and use v8-v15
++// for temp registers there instead.
++function vp9_loop_filter_4
++        loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
++        ret
++9:
++        br              x10
++endfunc
++
++function vp9_loop_filter_4_16b_mix_44
++        loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
++        ret
++9:
++        br              x10
++endfunc
++
++function vp9_loop_filter_8
++        loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
++        ret
++6:
++        br              x13
++9:
++        br              x10
++endfunc
++
++function vp9_loop_filter_8_16b_mix
++        loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
++        ret
++6:
++        br              x13
++9:
++        br              x10
++endfunc
++
++function vp9_loop_filter_16
++        loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
++        ret
++9:
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x10
++endfunc
++
++function vp9_loop_filter_16_16b
++        loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
++        ret
++9:
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x10
++endfunc
++
++.macro loop_filter_4
++        bl              vp9_loop_filter_4
++.endm
++
++.macro loop_filter_4_16b_mix mix
++        bl              vp9_loop_filter_4_16b_mix_\mix
++.endm
++
++.macro loop_filter_8
++        // calculate alternative 'return' targets
++        adr             x13, 6f
++        bl              vp9_loop_filter_8
++.endm
++
++.macro loop_filter_8_16b_mix mix
++        // calculate alternative 'return' targets
++        adr             x13, 6f
++.if \mix == 48
++        mov             x11, #0xffffffff00000000
++.elseif \mix == 84
++        mov             x11, #0x00000000ffffffff
++.else
++        mov             x11, #0xffffffffffffffff
++.endif
++        bl              vp9_loop_filter_8_16b_mix
++.endm
++
++.macro loop_filter_16
++        // calculate alternative 'return' targets
++        adr             x14, 7f
++        adr             x15, 8f
++        bl              vp9_loop_filter_16
++.endm
++
++.macro loop_filter_16_16b
++        // calculate alternative 'return' targets
++        adr             x14, 7f
++        adr             x15, 8f
++        bl              vp9_loop_filter_16_16b
++.endm
++
++
++// The public functions in this file have got the following signature:
++// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
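++//
++// The _8 variants below filter a single 8 pixel edge per call, while the _16
++// variants (e.g. ff_vp9_loop_filter_v_44_16_neon) handle two adjacent 8 pixel
++// edges in one pass; the two digits in their names give the loop filter width
++// applied to each 8 pixel half.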
++
++function ff_vp9_loop_filter_v_4_8_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #2
++        ld1             {v20.8b}, [x9], x1 // p3
++        ld1             {v24.8b}, [x0], x1 // q0
++        ld1             {v21.8b}, [x9], x1 // p2
++        ld1             {v25.8b}, [x0], x1 // q1
++        ld1             {v22.8b}, [x9], x1 // p1
++        ld1             {v26.8b}, [x0], x1 // q2
++        ld1             {v23.8b}, [x9], x1 // p0
++        ld1             {v27.8b}, [x0], x1 // q3
++        sub             x0,  x0,  x1, lsl #2
++        sub             x9,  x9,  x1, lsl #1
++
++        loop_filter_4
++
++        st1             {v22.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++
++        br              x10
++endfunc
++
++function ff_vp9_loop_filter_v_44_16_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #2
++        ld1             {v20.16b}, [x9], x1 // p3
++        ld1             {v24.16b}, [x0], x1 // q0
++        ld1             {v21.16b}, [x9], x1 // p2
++        ld1             {v25.16b}, [x0], x1 // q1
++        ld1             {v22.16b}, [x9], x1 // p1
++        ld1             {v26.16b}, [x0], x1 // q2
++        ld1             {v23.16b}, [x9], x1 // p0
++        ld1             {v27.16b}, [x0], x1 // q3
++        sub             x0,  x0,  x1, lsl #2
++        sub             x9,  x9,  x1, lsl #1
++
++        loop_filter_4_16b_mix 44
++
++        st1             {v22.16b}, [x9], x1
++        st1             {v24.16b}, [x0], x1
++        st1             {v23.16b}, [x9], x1
++        st1             {v25.16b}, [x0], x1
++
++        br              x10
++endfunc
++
++function ff_vp9_loop_filter_h_4_8_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #2
++        ld1             {v20.8b}, [x9], x1
++        ld1             {v24.8b}, [x0], x1
++        ld1             {v21.8b}, [x9], x1
++        ld1             {v25.8b}, [x0], x1
++        ld1             {v22.8b}, [x9], x1
++        ld1             {v26.8b}, [x0], x1
++        ld1             {v23.8b}, [x9], x1
++        ld1             {v27.8b}, [x0], x1
++
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #2
++        // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
++        // outermost 2 pixels since they aren't changed.
++        add             x9,  x9,  #2
++        add             x0,  x0,  #2
++
++        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        loop_filter_4
++
++        // We only will write the mid 4 pixels back; after the loop filter,
++        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
++        // We need to transpose them to columns, done with a 4x8 transpose
++        // (which in practice is two 4x4 transposes of the two 4x4 halves
++        // of the 8x4 pixels; into 4x8 pixels).
++        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.s}[0], [x9], x1
++        st1             {v22.s}[1], [x0], x1
++        st1             {v23.s}[0], [x9], x1
++        st1             {v23.s}[1], [x0], x1
++        st1             {v24.s}[0], [x9], x1
++        st1             {v24.s}[1], [x0], x1
++        st1             {v25.s}[0], [x9], x1
++        st1             {v25.s}[1], [x0], x1
++
++        br              x10
++endfunc
++
++function ff_vp9_loop_filter_h_44_16_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #3
++        ld1             {v20.8b},   [x9], x1
++        ld1             {v20.d}[1], [x0], x1
++        ld1             {v21.8b},   [x9], x1
++        ld1             {v21.d}[1], [x0], x1
++        ld1             {v22.8b},   [x9], x1
++        ld1             {v22.d}[1], [x0], x1
++        ld1             {v23.8b},   [x9], x1
++        ld1             {v23.d}[1], [x0], x1
++        ld1             {v24.8b},   [x9], x1
++        ld1             {v24.d}[1], [x0], x1
++        ld1             {v25.8b},   [x9], x1
++        ld1             {v25.d}[1], [x0], x1
++        ld1             {v26.8b},   [x9], x1
++        ld1             {v26.d}[1], [x0], x1
++        ld1             {v27.8b},   [x9], x1
++        ld1             {v27.d}[1], [x0], x1
++
++        sub             x9,  x9,  x1, lsl #3
++        sub             x0,  x0,  x1, lsl #3
++        add             x9,  x9,  #2
++        add             x0,  x0,  #2
++
++        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        loop_filter_4_16b_mix 44
++
++        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v22.s}[0], [x9], x1
++        st1             {v22.s}[2], [x0], x1
++        st1             {v23.s}[0], [x9], x1
++        st1             {v23.s}[2], [x0], x1
++        st1             {v24.s}[0], [x9], x1
++        st1             {v24.s}[2], [x0], x1
++        st1             {v25.s}[0], [x9], x1
++        st1             {v25.s}[2], [x0], x1
++        st1             {v22.s}[1], [x9], x1
++        st1             {v22.s}[3], [x0], x1
++        st1             {v23.s}[1], [x9], x1
++        st1             {v23.s}[3], [x0], x1
++        st1             {v24.s}[1], [x9], x1
++        st1             {v24.s}[3], [x0], x1
++        st1             {v25.s}[1], [x9], x1
++        st1             {v25.s}[3], [x0], x1
++
++        br              x10
++endfunc
++
++function ff_vp9_loop_filter_v_8_8_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #2
++        ld1             {v20.8b}, [x9], x1 // p3
++        ld1             {v24.8b}, [x0], x1 // q0
++        ld1             {v21.8b}, [x9], x1 // p2
++        ld1             {v25.8b}, [x0], x1 // q1
++        ld1             {v22.8b}, [x9], x1 // p1
++        ld1             {v26.8b}, [x0], x1 // q2
++        ld1             {v23.8b}, [x9], x1 // p0
++        ld1             {v27.8b}, [x0], x1 // q3
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #2
++        add             x9,  x9,  x1
++
++        loop_filter_8
++
++        st1             {v21.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v22.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v26.8b}, [x0], x1
++
++        br              x10
++6:
++        sub             x9,  x0,  x1, lsl #1
++        st1             {v22.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++        br              x10
++endfunc
++
++.macro mix_v_16 mix
++function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  x1, lsl #2
++        ld1             {v20.16b}, [x9], x1 // p3
++        ld1             {v24.16b}, [x0], x1 // q0
++        ld1             {v21.16b}, [x9], x1 // p2
++        ld1             {v25.16b}, [x0], x1 // q1
++        ld1             {v22.16b}, [x9], x1 // p1
++        ld1             {v26.16b}, [x0], x1 // q2
++        ld1             {v23.16b}, [x9], x1 // p0
++        ld1             {v27.16b}, [x0], x1 // q3
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #2
++        add             x9,  x9,  x1
++
++        loop_filter_8_16b_mix \mix
++
++        st1             {v21.16b}, [x9], x1
++        st1             {v24.16b}, [x0], x1
++        st1             {v22.16b}, [x9], x1
++        st1             {v25.16b}, [x0], x1
++        st1             {v23.16b}, [x9], x1
++        st1             {v26.16b}, [x0], x1
++
++        br              x10
++6:
++        sub             x9,  x0,  x1, lsl #1
++        st1             {v22.16b}, [x9], x1
++        st1             {v24.16b}, [x0], x1
++        st1             {v23.16b}, [x9], x1
++        st1             {v25.16b}, [x0], x1
++        br              x10
++endfunc
++.endm
++
++mix_v_16 48
++mix_v_16 84
++mix_v_16 88
++
++function ff_vp9_loop_filter_h_8_8_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #2
++        ld1             {v20.8b}, [x9], x1
++        ld1             {v24.8b}, [x0], x1
++        ld1             {v21.8b}, [x9], x1
++        ld1             {v25.8b}, [x0], x1
++        ld1             {v22.8b}, [x9], x1
++        ld1             {v26.8b}, [x0], x1
++        ld1             {v23.8b}, [x9], x1
++        ld1             {v27.8b}, [x0], x1
++
++        sub             x9,  x9,  x1, lsl #2
++        sub             x0,  x0,  x1, lsl #2
++
++        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        loop_filter_8
++
++        // Even though only 6 pixels per row have been changed, we write the
++        // full 8 pixel registers.
++        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v20.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v21.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++        st1             {v22.8b}, [x9], x1
++        st1             {v26.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v27.8b}, [x0], x1
++
++        br              x10
++6:
++        // If we didn't need to do the flat8in part, we use the same writeback
++        // as in loop_filter_h_4_8.
++        add             x9,  x9,  #2
++        add             x0,  x0,  #2
++        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.s}[0], [x9], x1
++        st1             {v22.s}[1], [x0], x1
++        st1             {v23.s}[0], [x9], x1
++        st1             {v23.s}[1], [x0], x1
++        st1             {v24.s}[0], [x9], x1
++        st1             {v24.s}[1], [x0], x1
++        st1             {v25.s}[0], [x9], x1
++        st1             {v25.s}[1], [x0], x1
++        br              x10
++endfunc
++
++.macro mix_h_16 mix
++function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
++        mov             x10, x30
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #3
++        ld1             {v20.8b},   [x9], x1
++        ld1             {v20.d}[1], [x0], x1
++        ld1             {v21.8b},   [x9], x1
++        ld1             {v21.d}[1], [x0], x1
++        ld1             {v22.8b},   [x9], x1
++        ld1             {v22.d}[1], [x0], x1
++        ld1             {v23.8b},   [x9], x1
++        ld1             {v23.d}[1], [x0], x1
++        ld1             {v24.8b},   [x9], x1
++        ld1             {v24.d}[1], [x0], x1
++        ld1             {v25.8b},   [x9], x1
++        ld1             {v25.d}[1], [x0], x1
++        ld1             {v26.8b},   [x9], x1
++        ld1             {v26.d}[1], [x0], x1
++        ld1             {v27.8b},   [x9], x1
++        ld1             {v27.d}[1], [x0], x1
++
++        sub             x9,  x9,  x1, lsl #3
++        sub             x0,  x0,  x1, lsl #3
++
++        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        loop_filter_8_16b_mix \mix
++
++        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v20.8b},   [x9], x1
++        st1             {v20.d}[1], [x0], x1
++        st1             {v21.8b},   [x9], x1
++        st1             {v21.d}[1], [x0], x1
++        st1             {v22.8b},   [x9], x1
++        st1             {v22.d}[1], [x0], x1
++        st1             {v23.8b},   [x9], x1
++        st1             {v23.d}[1], [x0], x1
++        st1             {v24.8b},   [x9], x1
++        st1             {v24.d}[1], [x0], x1
++        st1             {v25.8b},   [x9], x1
++        st1             {v25.d}[1], [x0], x1
++        st1             {v26.8b},   [x9], x1
++        st1             {v26.d}[1], [x0], x1
++        st1             {v27.8b},   [x9], x1
++        st1             {v27.d}[1], [x0], x1
++
++        br              x10
++6:
++        add             x9,  x9,  #2
++        add             x0,  x0,  #2
++        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.s}[0], [x9], x1
++        st1             {v22.s}[2], [x0], x1
++        st1             {v23.s}[0], [x9], x1
++        st1             {v23.s}[2], [x0], x1
++        st1             {v24.s}[0], [x9], x1
++        st1             {v24.s}[2], [x0], x1
++        st1             {v25.s}[0], [x9], x1
++        st1             {v25.s}[2], [x0], x1
++        st1             {v22.s}[1], [x9], x1
++        st1             {v22.s}[3], [x0], x1
++        st1             {v23.s}[1], [x9], x1
++        st1             {v23.s}[3], [x0], x1
++        st1             {v24.s}[1], [x9], x1
++        st1             {v24.s}[3], [x0], x1
++        st1             {v25.s}[1], [x9], x1
++        st1             {v25.s}[3], [x0], x1
++        br              x10
++endfunc
++.endm
++
++mix_h_16 48
++mix_h_16 84
++mix_h_16 88
++
++function ff_vp9_loop_filter_v_16_8_neon, export=1
++        mov             x10, x30
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++        sub             x9,  x0,  x1, lsl #3
++        ld1             {v16.8b}, [x9], x1 // p7
++        ld1             {v24.8b}, [x0], x1 // q0
++        ld1             {v17.8b}, [x9], x1 // p6
++        ld1             {v25.8b}, [x0], x1 // q1
++        ld1             {v18.8b}, [x9], x1 // p5
++        ld1             {v26.8b}, [x0], x1 // q2
++        ld1             {v19.8b}, [x9], x1 // p4
++        ld1             {v27.8b}, [x0], x1 // q3
++        ld1             {v20.8b}, [x9], x1 // p3
++        ld1             {v28.8b}, [x0], x1 // q4
++        ld1             {v21.8b}, [x9], x1 // p2
++        ld1             {v29.8b}, [x0], x1 // q5
++        ld1             {v22.8b}, [x9], x1 // p1
++        ld1             {v30.8b}, [x0], x1 // q6
++        ld1             {v23.8b}, [x9], x1 // p0
++        ld1             {v31.8b}, [x0], x1 // q7
++        sub             x9,  x9,  x1, lsl #3
++        sub             x0,  x0,  x1, lsl #3
++        add             x9,  x9,  x1
++
++        loop_filter_16
++
++        // If we did the flat8out part, we get the output in
++        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
++        // store v2-v9 there, and v10-v17 into x0.
++        st1             {v2.8b},  [x9], x1
++        st1             {v10.8b}, [x0], x1
++        st1             {v3.8b},  [x9], x1
++        st1             {v11.8b}, [x0], x1
++        st1             {v4.8b},  [x9], x1
++        st1             {v12.8b}, [x0], x1
++        st1             {v5.8b},  [x9], x1
++        st1             {v13.8b}, [x0], x1
++        st1             {v6.8b},  [x9], x1
++        st1             {v14.8b}, [x0], x1
++        st1             {v8.8b},  [x9], x1
++        st1             {v15.8b}, [x0], x1
++        st1             {v9.8b},  [x9], x1
++        st1             {v17.8b}, [x0], x1
++9:
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x10
++8:
++        add             x9,  x9,  x1, lsl #2
++        // If we didn't do the flat8out part, the output is left in the
++        // input registers.
++        st1             {v21.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v22.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v26.8b}, [x0], x1
++        b               9b
++7:
++        sub             x9,  x0,  x1, lsl #1
++        st1             {v22.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++        b               9b
++endfunc
++
++function ff_vp9_loop_filter_v_16_16_neon, export=1
++        mov             x10, x30
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++        sub             x9,  x0,  x1, lsl #3
++        ld1             {v16.16b}, [x9], x1 // p7
++        ld1             {v24.16b}, [x0], x1 // q0
++        ld1             {v17.16b}, [x9], x1 // p6
++        ld1             {v25.16b}, [x0], x1 // q1
++        ld1             {v18.16b}, [x9], x1 // p5
++        ld1             {v26.16b}, [x0], x1 // q2
++        ld1             {v19.16b}, [x9], x1 // p4
++        ld1             {v27.16b}, [x0], x1 // q3
++        ld1             {v20.16b}, [x9], x1 // p3
++        ld1             {v28.16b}, [x0], x1 // q4
++        ld1             {v21.16b}, [x9], x1 // p2
++        ld1             {v29.16b}, [x0], x1 // q5
++        ld1             {v22.16b}, [x9], x1 // p1
++        ld1             {v30.16b}, [x0], x1 // q6
++        ld1             {v23.16b}, [x9], x1 // p0
++        ld1             {v31.16b}, [x0], x1 // q7
++        sub             x9,  x9,  x1, lsl #3
++        sub             x0,  x0,  x1, lsl #3
++        add             x9,  x9,  x1
++
++        loop_filter_16_16b
++
++        st1             {v2.16b},  [x9], x1
++        st1             {v10.16b}, [x0], x1
++        st1             {v3.16b},  [x9], x1
++        st1             {v11.16b}, [x0], x1
++        st1             {v4.16b},  [x9], x1
++        st1             {v12.16b}, [x0], x1
++        st1             {v5.16b},  [x9], x1
++        st1             {v13.16b}, [x0], x1
++        st1             {v6.16b},  [x9], x1
++        st1             {v14.16b}, [x0], x1
++        st1             {v8.16b},  [x9], x1
++        st1             {v15.16b}, [x0], x1
++        st1             {v9.16b},  [x9], x1
++        st1             {v17.16b}, [x0], x1
++9:
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x10
++8:
++        add             x9,  x9,  x1, lsl #2
++        st1             {v21.16b}, [x9], x1
++        st1             {v24.16b}, [x0], x1
++        st1             {v22.16b}, [x9], x1
++        st1             {v25.16b}, [x0], x1
++        st1             {v23.16b}, [x9], x1
++        st1             {v26.16b}, [x0], x1
++        b               9b
++7:
++        sub             x9,  x0,  x1, lsl #1
++        st1             {v22.16b}, [x9], x1
++        st1             {v24.16b}, [x0], x1
++        st1             {v23.16b}, [x9], x1
++        st1             {v25.16b}, [x0], x1
++        b               9b
++endfunc
++
++function ff_vp9_loop_filter_h_16_8_neon, export=1
++        mov             x10, x30
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++        sub             x9,  x0,  #8
++        ld1             {v16.8b}, [x9], x1
++        ld1             {v24.8b}, [x0], x1
++        ld1             {v17.8b}, [x9], x1
++        ld1             {v25.8b}, [x0], x1
++        ld1             {v18.8b}, [x9], x1
++        ld1             {v26.8b}, [x0], x1
++        ld1             {v19.8b}, [x9], x1
++        ld1             {v27.8b}, [x0], x1
++        ld1             {v20.8b}, [x9], x1
++        ld1             {v28.8b}, [x0], x1
++        ld1             {v21.8b}, [x9], x1
++        ld1             {v29.8b}, [x0], x1
++        ld1             {v22.8b}, [x9], x1
++        ld1             {v30.8b}, [x0], x1
++        ld1             {v23.8b}, [x9], x1
++        ld1             {v31.8b}, [x0], x1
++        sub             x0,  x0,  x1, lsl #3
++        sub             x9,  x9,  x1, lsl #3
++
++        // The 16x8 pixels read above is in two 8x8 blocks; the left
++        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
++        // of this, to get one column per register.
++        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
++        transpose_8x8B  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
++
++        loop_filter_16
++
++        transpose_8x8B  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
++        transpose_8x8B  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
++
++        st1             {v16.8b}, [x9], x1
++        st1             {v10.8b}, [x0], x1
++        st1             {v2.8b},  [x9], x1
++        st1             {v11.8b}, [x0], x1
++        st1             {v3.8b},  [x9], x1
++        st1             {v12.8b}, [x0], x1
++        st1             {v4.8b},  [x9], x1
++        st1             {v13.8b}, [x0], x1
++        st1             {v5.8b},  [x9], x1
++        st1             {v14.8b}, [x0], x1
++        st1             {v6.8b},  [x9], x1
++        st1             {v15.8b}, [x0], x1
++        st1             {v8.8b},  [x9], x1
++        st1             {v17.8b}, [x0], x1
++        st1             {v9.8b},  [x9], x1
++        st1             {v31.8b}, [x0], x1
++9:
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x10
++8:
++        // The same writeback as in loop_filter_h_8_8
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #2
++        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v20.8b}, [x9], x1
++        st1             {v24.8b}, [x0], x1
++        st1             {v21.8b}, [x9], x1
++        st1             {v25.8b}, [x0], x1
++        st1             {v22.8b}, [x9], x1
++        st1             {v26.8b}, [x0], x1
++        st1             {v23.8b}, [x9], x1
++        st1             {v27.8b}, [x0], x1
++        b               9b
++7:
++        // The same writeback as in loop_filter_h_4_8
++        sub             x9,  x0,  #2
++        add             x0,  x9,  x1, lsl #2
++        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.s}[0], [x9], x1
++        st1             {v22.s}[1], [x0], x1
++        st1             {v23.s}[0], [x9], x1
++        st1             {v23.s}[1], [x0], x1
++        st1             {v24.s}[0], [x9], x1
++        st1             {v24.s}[1], [x0], x1
++        st1             {v25.s}[0], [x9], x1
++        st1             {v25.s}[1], [x0], x1
++        b               9b
++endfunc
++
++function ff_vp9_loop_filter_h_16_16_neon, export=1
++        mov             x10, x30
++        stp             d14, d15, [sp, #-0x10]!
++        stp             d12, d13, [sp, #-0x10]!
++        stp             d10, d11, [sp, #-0x10]!
++        stp             d8,  d9,  [sp, #-0x10]!
++        sub             x9,  x0,  #8
++        ld1             {v16.8b},   [x9], x1
++        ld1             {v24.8b},   [x0], x1
++        ld1             {v17.8b},   [x9], x1
++        ld1             {v25.8b},   [x0], x1
++        ld1             {v18.8b},   [x9], x1
++        ld1             {v26.8b},   [x0], x1
++        ld1             {v19.8b},   [x9], x1
++        ld1             {v27.8b},   [x0], x1
++        ld1             {v20.8b},   [x9], x1
++        ld1             {v28.8b},   [x0], x1
++        ld1             {v21.8b},   [x9], x1
++        ld1             {v29.8b},   [x0], x1
++        ld1             {v22.8b},   [x9], x1
++        ld1             {v30.8b},   [x0], x1
++        ld1             {v23.8b},   [x9], x1
++        ld1             {v31.8b},   [x0], x1
++        ld1             {v16.d}[1], [x9], x1
++        ld1             {v24.d}[1], [x0], x1
++        ld1             {v17.d}[1], [x9], x1
++        ld1             {v25.d}[1], [x0], x1
++        ld1             {v18.d}[1], [x9], x1
++        ld1             {v26.d}[1], [x0], x1
++        ld1             {v19.d}[1], [x9], x1
++        ld1             {v27.d}[1], [x0], x1
++        ld1             {v20.d}[1], [x9], x1
++        ld1             {v28.d}[1], [x0], x1
++        ld1             {v21.d}[1], [x9], x1
++        ld1             {v29.d}[1], [x0], x1
++        ld1             {v22.d}[1], [x9], x1
++        ld1             {v30.d}[1], [x0], x1
++        ld1             {v23.d}[1], [x9], x1
++        ld1             {v31.d}[1], [x0], x1
++        sub             x0,  x0,  x1, lsl #4
++        sub             x9,  x9,  x1, lsl #4
++
++        transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
++        transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
++
++        loop_filter_16_16b
++
++        transpose_8x16B v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
++        transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
++
++        st1             {v16.8b},   [x9], x1
++        st1             {v10.8b},   [x0], x1
++        st1             {v2.8b},    [x9], x1
++        st1             {v11.8b},   [x0], x1
++        st1             {v3.8b},    [x9], x1
++        st1             {v12.8b},   [x0], x1
++        st1             {v4.8b},    [x9], x1
++        st1             {v13.8b},   [x0], x1
++        st1             {v5.8b},    [x9], x1
++        st1             {v14.8b},   [x0], x1
++        st1             {v6.8b},    [x9], x1
++        st1             {v15.8b},   [x0], x1
++        st1             {v8.8b},    [x9], x1
++        st1             {v17.8b},   [x0], x1
++        st1             {v9.8b},    [x9], x1
++        st1             {v31.8b},   [x0], x1
++        st1             {v16.d}[1], [x9], x1
++        st1             {v10.d}[1], [x0], x1
++        st1             {v2.d}[1],  [x9], x1
++        st1             {v11.d}[1], [x0], x1
++        st1             {v3.d}[1],  [x9], x1
++        st1             {v12.d}[1], [x0], x1
++        st1             {v4.d}[1],  [x9], x1
++        st1             {v13.d}[1], [x0], x1
++        st1             {v5.d}[1],  [x9], x1
++        st1             {v14.d}[1], [x0], x1
++        st1             {v6.d}[1],  [x9], x1
++        st1             {v15.d}[1], [x0], x1
++        st1             {v8.d}[1],  [x9], x1
++        st1             {v17.d}[1], [x0], x1
++        st1             {v9.d}[1],  [x9], x1
++        st1             {v31.d}[1], [x0], x1
++9:
++        ldp             d8,  d9,  [sp], 0x10
++        ldp             d10, d11, [sp], 0x10
++        ldp             d12, d13, [sp], 0x10
++        ldp             d14, d15, [sp], 0x10
++        br              x10
++8:
++        sub             x9,  x0,  #4
++        add             x0,  x9,  x1, lsl #3
++        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
++
++        st1             {v20.8b},   [x9], x1
++        st1             {v20.d}[1], [x0], x1
++        st1             {v21.8b},   [x9], x1
++        st1             {v21.d}[1], [x0], x1
++        st1             {v22.8b},   [x9], x1
++        st1             {v22.d}[1], [x0], x1
++        st1             {v23.8b},   [x9], x1
++        st1             {v23.d}[1], [x0], x1
++        st1             {v24.8b},   [x9], x1
++        st1             {v24.d}[1], [x0], x1
++        st1             {v25.8b},   [x9], x1
++        st1             {v25.d}[1], [x0], x1
++        st1             {v26.8b},   [x9], x1
++        st1             {v26.d}[1], [x0], x1
++        st1             {v27.8b},   [x9], x1
++        st1             {v27.d}[1], [x0], x1
++        b               9b
++7:
++        sub             x9,  x0,  #2
++        add             x0,  x9,  x1, lsl #3
++        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
++        st1             {v22.s}[0], [x9], x1
++        st1             {v22.s}[2], [x0], x1
++        st1             {v23.s}[0], [x9], x1
++        st1             {v23.s}[2], [x0], x1
++        st1             {v24.s}[0], [x9], x1
++        st1             {v24.s}[2], [x0], x1
++        st1             {v25.s}[0], [x9], x1
++        st1             {v25.s}[2], [x0], x1
++        st1             {v22.s}[1], [x9], x1
++        st1             {v22.s}[3], [x0], x1
++        st1             {v23.s}[1], [x9], x1
++        st1             {v23.s}[3], [x0], x1
++        st1             {v24.s}[1], [x9], x1
++        st1             {v24.s}[3], [x0], x1
++        st1             {v25.s}[1], [x9], x1
++        st1             {v25.s}[3], [x0], x1
++        b               9b
++endfunc
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
+@@ -0,0 +1,631 @@
++/*
++ * Copyright (c) 2017 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// All public functions in this file have the following signature:
++// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
++//                            const uint8_t *ref, ptrdiff_t ref_stride,
++//                            int h, int mx, int my);
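For orientation, a minimal C sketch of how an entry point with this signature is declared and called; only the prototype comes from the comment above, while the wrapper, block size and strides below are illustrative assumptions.

    #include <stdint.h>
    #include <stddef.h>

    /* Shared prototype documented above; strides are in bytes. */
    void ff_vp9_avg16_16_neon(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *ref, ptrdiff_t ref_stride,
                              int h, int mx, int my);

    /* Hypothetical wrapper: average a packed 16x16 block of 16-bit pixels.
     * The copy/avg entry points ignore the subpel phases mx/my. */
    static void avg_16x16_block(uint16_t *dst, const uint16_t *ref)
    {
        ff_vp9_avg16_16_neon((uint8_t *)dst, 16 * sizeof(*dst),
                             (const uint8_t *)ref, 16 * sizeof(*ref),
                             16, 0, 0);
    }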
++
++function ff_vp9_copy128_aarch64, export=1
++1:
++        ldp             x5,  x6,  [x2]
++        ldp             x7,  x8,  [x2, #16]
++        stp             x5,  x6,  [x0]
++        ldp             x9,  x10, [x2, #32]
++        stp             x7,  x8,  [x0, #16]
++        subs            w4,  w4,  #1
++        ldp             x11, x12, [x2, #48]
++        stp             x9,  x10, [x0, #32]
++        stp             x11, x12, [x0, #48]
++        ldp             x5,  x6,  [x2, #64]
++        ldp             x7,  x8,  [x2, #80]
++        stp             x5,  x6,  [x0, #64]
++        ldp             x9,  x10, [x2, #96]
++        stp             x7,  x8,  [x0, #80]
++        ldp             x11, x12, [x2, #112]
++        stp             x9,  x10, [x0, #96]
++        stp             x11, x12, [x0, #112]
++        add             x2,  x2,  x3
++        add             x0,  x0,  x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg64_16_neon, export=1
++        mov             x5,  x0
++        sub             x1,  x1,  #64
++        sub             x3,  x3,  #64
++1:
++        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
++        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
++        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
++        urhadd          v0.8h,  v0.8h,  v4.8h
++        urhadd          v1.8h,  v1.8h,  v5.8h
++        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
++        urhadd          v2.8h,  v2.8h,  v6.8h
++        urhadd          v3.8h,  v3.8h,  v7.8h
++        subs            w4,  w4,  #1
++        urhadd          v16.8h, v16.8h, v20.8h
++        urhadd          v17.8h, v17.8h, v21.8h
++        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], #64
++        urhadd          v18.8h, v18.8h, v22.8h
++        urhadd          v19.8h, v19.8h, v23.8h
++        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg32_16_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
++        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
++        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
++        urhadd          v0.8h,  v0.8h,  v4.8h
++        urhadd          v1.8h,  v1.8h,  v5.8h
++        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
++        urhadd          v2.8h,  v2.8h,  v6.8h
++        urhadd          v3.8h,  v3.8h,  v7.8h
++        subs            w4,  w4,  #2
++        urhadd          v16.8h, v16.8h, v20.8h
++        urhadd          v17.8h, v17.8h, v21.8h
++        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], x1
++        urhadd          v18.8h, v18.8h, v22.8h
++        urhadd          v19.8h, v19.8h, v23.8h
++        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg16_16_neon, export=1
++1:
++        ld1             {v2.8h, v3.8h},  [x2], x3
++        ld1             {v0.8h, v1.8h},  [x0]
++        urhadd          v0.8h,  v0.8h,  v2.8h
++        urhadd          v1.8h,  v1.8h,  v3.8h
++        subs            w4,  w4,  #1
++        st1             {v0.8h, v1.8h},  [x0], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg8_16_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v2.8h},  [x2], x3
++        ld1             {v0.8h},  [x0], x1
++        ld1             {v3.8h},  [x2], x3
++        urhadd          v0.8h,  v0.8h,  v2.8h
++        ld1             {v1.8h},  [x0], x1
++        urhadd          v1.8h,  v1.8h,  v3.8h
++        subs            w4,  w4,  #2
++        st1             {v0.8h},  [x5], x1
++        st1             {v1.8h},  [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg4_16_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v2.4h},  [x2], x3
++        ld1             {v0.4h},  [x0], x1
++        ld1             {v3.4h},  [x2], x3
++        urhadd          v0.4h,  v0.4h,  v2.4h
++        ld1             {v1.4h},  [x0], x1
++        urhadd          v1.4h,  v1.4h,  v3.4h
++        subs            w4,  w4,  #2
++        st1             {v0.4h},  [x5], x1
++        st1             {v1.8b},  [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++
++// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
++// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
++// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
++// for size >= 16)
++.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
++        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
++        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
++        smlal           \dst1\().4s, v20.4h, v0.h[\offset]
++        smlal           \dst5\().4s, v22.4h, v0.h[\offset]
++.if \size >= 16
++        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
++        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
++.endif
++.if \size >= 8
++        smlal2          \dst2\().4s, v20.8h, v0.h[\offset]
++        smlal2          \dst6\().4s, v22.8h, v0.h[\offset]
++.endif
++.if \size >= 16
++        smlal           \dst3\().4s, v21.4h, v0.h[\offset]
++        smlal           \dst7\().4s, v23.4h, v0.h[\offset]
++        smlal2          \dst4\().4s, v21.8h, v0.h[\offset]
++        smlal2          \dst8\().4s, v23.8h, v0.h[\offset]
++.endif
++.endm
++
++
++// Instantiate a horizontal filter function for the given size.
++// This can work on 4, 8 or 16 pixels in parallel; for larger
++// widths it will do 16 pixels at a time and loop horizontally.
++// The actual width (in bytes) is passed in x5, the height in w4 and
++// the filter coefficients in x9.
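As a scalar reference for what each instantiated function computes per output sample, a sketch follows; it assumes filter[] holds the eight signed taps selected from ff_vp9_subpel_filters, and the helper name is illustrative.

    #include <stdint.h>

    /* One 8-tap output sample at high bit depth: accumulate in 32 bits,
     * round and shift right by 7, clamp negatives to zero (sqrshrun) and
     * clamp to the maximum pixel value (the umin against the bpp mask). */
    static uint16_t filter8_sample(const uint16_t *src, const int16_t *filter, int bpp)
    {
        int32_t sum = 0;
        for (int k = 0; k < 8; k++)
            sum += (int32_t)src[k] * filter[k];
        sum = (sum + 64) >> 7;
        if (sum < 0)
            sum = 0;
        if (sum > (1 << bpp) - 1)
            sum = (1 << bpp) - 1;
        return (uint16_t)sum;
    }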
++.macro do_8tap_h type, size
++function \type\()_8tap_\size\()h
++        sub             x2,  x2,  #6
++        add             x6,  x0,  x1
++        add             x7,  x2,  x3
++        add             x1,  x1,  x1
++        add             x3,  x3,  x3
++        // Only size >= 16 loops horizontally and needs
++        // reduced dst stride
++.if \size >= 16
++        sub             x1,  x1,  x5
++.endif
++        // size >= 16 loads two qwords and increments x2,
++        // for size 4/8 it's enough with one qword and no
++        // postincrement
++.if \size >= 16
++        sub             x3,  x3,  x5
++        sub             x3,  x3,  #16
++.endif
++        // Load the filter vector
++        ld1             {v0.8h},  [x9]
++1:
++.if \size >= 16
++        mov             x9,  x5
++.endif
++        // Load src
++.if \size >= 16
++        ld1             {v5.8h,  v6.8h,  v7.8h},  [x2], #48
++        ld1             {v16.8h, v17.8h, v18.8h}, [x7], #48
++.else
++        ld1             {v5.8h,  v6.8h},  [x2]
++        ld1             {v16.8h, v17.8h}, [x7]
++.endif
++2:
++
++        smull           v1.4s,  v5.4h,  v0.h[0]
++        smull           v24.4s, v16.4h, v0.h[0]
++.if \size >= 8
++        smull2          v2.4s,  v5.8h,  v0.h[0]
++        smull2          v25.4s, v16.8h, v0.h[0]
++.endif
++.if \size >= 16
++        smull           v3.4s,  v6.4h,  v0.h[0]
++        smull           v26.4s, v17.4h, v0.h[0]
++        smull2          v4.4s,  v6.8h,  v0.h[0]
++        smull2          v27.4s, v17.8h, v0.h[0]
++.endif
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 1, \size
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 2, \size
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 3, \size
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 4, \size
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 5, \size
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 6, \size
++        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 7, \size
++
++        // Round, shift and saturate
++        // The sqrshrun takes care of clamping negative values to zero, but
++        // we manually need to do umin with the max pixel value.
++        sqrshrun        v1.4h,  v1.4s,  #7
++        sqrshrun        v24.4h, v24.4s, #7
++.if \size >= 8
++        sqrshrun2       v1.8h,  v2.4s,  #7
++        sqrshrun2       v24.8h, v25.4s, #7
++        umin            v1.8h,  v1.8h,  v31.8h
++        umin            v24.8h, v24.8h, v31.8h
++.if \size >= 16
++        sqrshrun        v2.4h,  v3.4s,  #7
++        sqrshrun        v25.4h, v26.4s, #7
++        sqrshrun2       v2.8h,  v4.4s,  #7
++        sqrshrun2       v25.8h, v27.4s, #7
++        umin            v2.8h,  v2.8h,  v31.8h
++        umin            v25.8h, v25.8h, v31.8h
++.endif
++.else
++        umin            v1.4h,  v1.4h,  v31.4h
++        umin            v24.4h, v24.4h, v31.4h
++.endif
++        // Average
++.ifc \type,avg
++.if \size >= 16
++        ld1             {v3.8h,  v4.8h},  [x0]
++        ld1             {v29.8h, v30.8h}, [x6]
++        urhadd          v1.8h,  v1.8h,  v3.8h
++        urhadd          v2.8h,  v2.8h,  v4.8h
++        urhadd          v24.8h, v24.8h, v29.8h
++        urhadd          v25.8h, v25.8h, v30.8h
++.elseif \size >= 8
++        ld1             {v3.8h},  [x0]
++        ld1             {v4.8h},  [x6]
++        urhadd          v1.8h,  v1.8h,  v3.8h
++        urhadd          v24.8h, v24.8h, v4.8h
++.else
++        ld1             {v3.4h},  [x0]
++        ld1             {v4.4h},  [x6]
++        urhadd          v1.4h,  v1.4h,  v3.4h
++        urhadd          v24.4h, v24.4h, v4.4h
++.endif
++.endif
++        // Store and loop horizontally (for size >= 16)
++.if \size >= 16
++        subs            x9,  x9,  #32
++        st1             {v1.8h,  v2.8h},  [x0], #32
++        st1             {v24.8h, v25.8h}, [x6], #32
++        b.eq            3f
++        mov             v5.16b,  v7.16b
++        mov             v16.16b, v18.16b
++        ld1             {v6.8h,  v7.8h},  [x2], #32
++        ld1             {v17.8h, v18.8h}, [x7], #32
++        b               2b
++.elseif \size == 8
++        st1             {v1.8h},  [x0]
++        st1             {v24.8h}, [x6]
++.else // \size == 4
++        st1             {v1.4h},  [x0]
++        st1             {v24.4h}, [x6]
++.endif
++3:
++        // Loop vertically
++        add             x0,  x0,  x1
++        add             x6,  x6,  x1
++        add             x2,  x2,  x3
++        add             x7,  x7,  x3
++        subs            w4,  w4,  #2
++        b.ne            1b
++        ret
++endfunc
++.endm
++
++.macro do_8tap_h_size size
++do_8tap_h put, \size
++do_8tap_h avg, \size
++.endm
++
++do_8tap_h_size 4
++do_8tap_h_size 8
++do_8tap_h_size 16
++
++.macro do_8tap_h_func type, filter, offset, size, bpp
++function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
++        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
++        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
++        cmp             w5,  #8
++        add             x9,  x6,  w5, uxtw #4
++        mov             x5,  #2*\size
++.if \size >= 16
++        b               \type\()_8tap_16h
++.else
++        b               \type\()_8tap_\size\()h
++.endif
++endfunc
++.endm
++
++.macro do_8tap_h_filters size, bpp
++do_8tap_h_func put, regular, 1, \size, \bpp
++do_8tap_h_func avg, regular, 1, \size, \bpp
++do_8tap_h_func put, sharp,   2, \size, \bpp
++do_8tap_h_func avg, sharp,   2, \size, \bpp
++do_8tap_h_func put, smooth,  0, \size, \bpp
++do_8tap_h_func avg, smooth,  0, \size, \bpp
++.endm
++
++.macro do_8tap_h_filters_bpp bpp
++do_8tap_h_filters 64, \bpp
++do_8tap_h_filters 32, \bpp
++do_8tap_h_filters 16, \bpp
++do_8tap_h_filters 8,  \bpp
++do_8tap_h_filters 4,  \bpp
++.endm
++
++do_8tap_h_filters_bpp 10
++do_8tap_h_filters_bpp 12
++
++
++// Vertical filters
++
++// Round, shift and saturate and store reg1-reg4
++.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
++        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
++        sqrshrun        \reg2\().4h,  \reg2\().4s, #7
++        sqrshrun        \reg3\().4h,  \reg3\().4s, #7
++        sqrshrun        \reg4\().4h,  \reg4\().4s, #7
++.ifc \type,avg
++        ld1             {\tmp1\().4h},  [x7], x1
++        ld1             {\tmp2\().4h},  [x7], x1
++        ld1             {\tmp3\().4h},  [x7], x1
++        ld1             {\tmp4\().4h},  [x7], x1
++.endif
++        umin            \reg1\().4h,  \reg1\().4h,  \minreg\().4h
++        umin            \reg2\().4h,  \reg2\().4h,  \minreg\().4h
++        umin            \reg3\().4h,  \reg3\().4h,  \minreg\().4h
++        umin            \reg4\().4h,  \reg4\().4h,  \minreg\().4h
++.ifc \type,avg
++        urhadd          \reg1\().4h,  \reg1\().4h,  \tmp1\().4h
++        urhadd          \reg2\().4h,  \reg2\().4h,  \tmp2\().4h
++        urhadd          \reg3\().4h,  \reg3\().4h,  \tmp3\().4h
++        urhadd          \reg4\().4h,  \reg4\().4h,  \tmp4\().4h
++.endif
++        st1             {\reg1\().4h},  [x0], x1
++        st1             {\reg2\().4h},  [x0], x1
++        st1             {\reg3\().4h},  [x0], x1
++        st1             {\reg4\().4h},  [x0], x1
++.endm
++
++// Round, shift and saturate and store reg1-8, where
++// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
++.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
++        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
++        sqrshrun2       \reg1\().8h,  \reg2\().4s, #7
++        sqrshrun        \reg2\().4h,  \reg3\().4s, #7
++        sqrshrun2       \reg2\().8h,  \reg4\().4s, #7
++        sqrshrun        \reg3\().4h,  \reg5\().4s, #7
++        sqrshrun2       \reg3\().8h,  \reg6\().4s, #7
++        sqrshrun        \reg4\().4h,  \reg7\().4s, #7
++        sqrshrun2       \reg4\().8h,  \reg8\().4s, #7
++.ifc \type,avg
++        ld1             {\reg5\().8h},  [x7], x1
++        ld1             {\reg6\().8h},  [x7], x1
++        ld1             {\reg7\().8h},  [x7], x1
++        ld1             {\reg8\().8h},  [x7], x1
++.endif
++        umin            \reg1\().8h,  \reg1\().8h,  \minreg\().8h
++        umin            \reg2\().8h,  \reg2\().8h,  \minreg\().8h
++        umin            \reg3\().8h,  \reg3\().8h,  \minreg\().8h
++        umin            \reg4\().8h,  \reg4\().8h,  \minreg\().8h
++.ifc \type,avg
++        urhadd          \reg1\().8h,  \reg1\().8h,  \reg5\().8h
++        urhadd          \reg2\().8h,  \reg2\().8h,  \reg6\().8h
++        urhadd          \reg3\().8h,  \reg3\().8h,  \reg7\().8h
++        urhadd          \reg4\().8h,  \reg4\().8h,  \reg8\().8h
++.endif
++        st1             {\reg1\().8h},  [x0], x1
++        st1             {\reg2\().8h},  [x0], x1
++        st1             {\reg3\().8h},  [x0], x1
++        st1             {\reg4\().8h},  [x0], x1
++.endm
++
++// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
++// (src1-src8 into dst1, src2-src9 into dst2).
++.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
++        smull           \dst1\().4s, \src1\().4h, v0.h[0]
++        smull           \dst2\().4s, \src2\().4h, v0.h[0]
++        smull           \tmp1\().4s, \src2\().4h, v0.h[1]
++        smull           \tmp2\().4s, \src3\().4h, v0.h[1]
++        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
++        smlal           \dst2\().4s, \src4\().4h, v0.h[2]
++        smlal           \tmp1\().4s, \src4\().4h, v0.h[3]
++        smlal           \tmp2\().4s, \src5\().4h, v0.h[3]
++        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
++        smlal           \dst2\().4s, \src6\().4h, v0.h[4]
++        smlal           \tmp1\().4s, \src6\().4h, v0.h[5]
++        smlal           \tmp2\().4s, \src7\().4h, v0.h[5]
++        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
++        smlal           \dst2\().4s, \src8\().4h, v0.h[6]
++        smlal           \tmp1\().4s, \src8\().4h, v0.h[7]
++        smlal           \tmp2\().4s, \src9\().4h, v0.h[7]
++        add             \dst1\().4s, \dst1\().4s, \tmp1\().4s
++        add             \dst2\().4s, \dst2\().4s, \tmp2\().4s
++.endm
++
++// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
++// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
++.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
++        smull           \dst1\().4s, \src1\().4h, v0.h[0]
++        smull2          \dst2\().4s, \src1\().8h, v0.h[0]
++        smull           \dst3\().4s, \src2\().4h, v0.h[0]
++        smull2          \dst4\().4s, \src2\().8h, v0.h[0]
++        smlal           \dst1\().4s, \src2\().4h, v0.h[1]
++        smlal2          \dst2\().4s, \src2\().8h, v0.h[1]
++        smlal           \dst3\().4s, \src3\().4h, v0.h[1]
++        smlal2          \dst4\().4s, \src3\().8h, v0.h[1]
++        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
++        smlal2          \dst2\().4s, \src3\().8h, v0.h[2]
++        smlal           \dst3\().4s, \src4\().4h, v0.h[2]
++        smlal2          \dst4\().4s, \src4\().8h, v0.h[2]
++        smlal           \dst1\().4s, \src4\().4h, v0.h[3]
++        smlal2          \dst2\().4s, \src4\().8h, v0.h[3]
++        smlal           \dst3\().4s, \src5\().4h, v0.h[3]
++        smlal2          \dst4\().4s, \src5\().8h, v0.h[3]
++        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
++        smlal2          \dst2\().4s, \src5\().8h, v0.h[4]
++        smlal           \dst3\().4s, \src6\().4h, v0.h[4]
++        smlal2          \dst4\().4s, \src6\().8h, v0.h[4]
++        smlal           \dst1\().4s, \src6\().4h, v0.h[5]
++        smlal2          \dst2\().4s, \src6\().8h, v0.h[5]
++        smlal           \dst3\().4s, \src7\().4h, v0.h[5]
++        smlal2          \dst4\().4s, \src7\().8h, v0.h[5]
++        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
++        smlal2          \dst2\().4s, \src7\().8h, v0.h[6]
++        smlal           \dst3\().4s, \src8\().4h, v0.h[6]
++        smlal2          \dst4\().4s, \src8\().8h, v0.h[6]
++        smlal           \dst1\().4s, \src8\().4h, v0.h[7]
++        smlal2          \dst2\().4s, \src8\().8h, v0.h[7]
++        smlal           \dst3\().4s, \src9\().4h, v0.h[7]
++        smlal2          \dst4\().4s, \src9\().8h, v0.h[7]
++.endm
++
++// Instantiate a vertical filter function for filtering 8 pixels at a time.
++// The height is passed in x4, the width in x5 and the filter coefficients
++// in x6.
++.macro do_8tap_8v type
++function \type\()_8tap_8v
++        sub             x2,  x2,  x3, lsl #1
++        sub             x2,  x2,  x3
++        ld1             {v0.8h},  [x6]
++1:
++.ifc \type,avg
++        mov             x7,  x0
++.endif
++        mov             x6,  x4
++
++        ld1             {v17.8h}, [x2], x3
++        ld1             {v18.8h}, [x2], x3
++        ld1             {v19.8h}, [x2], x3
++        ld1             {v20.8h}, [x2], x3
++        ld1             {v21.8h}, [x2], x3
++        ld1             {v22.8h}, [x2], x3
++        ld1             {v23.8h}, [x2], x3
++2:
++        ld1             {v24.8h}, [x2], x3
++        ld1             {v25.8h}, [x2], x3
++        ld1             {v26.8h}, [x2], x3
++        ld1             {v27.8h}, [x2], x3
++
++        convolve8       v2,  v3,  v4,  v5,  v17, v18, v19, v20, v21, v22, v23, v24, v25
++        convolve8       v6,  v7,  v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
++        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v30, v31, v1,  \type
++
++        subs            x6,  x6,  #4
++        b.eq            8f
++
++        ld1             {v16.8h}, [x2], x3
++        ld1             {v17.8h}, [x2], x3
++        ld1             {v18.8h}, [x2], x3
++        ld1             {v19.8h}, [x2], x3
++        convolve8       v2,  v3,  v4,  v5,  v21, v22, v23, v24, v25, v26, v27, v16, v17
++        convolve8       v6,  v7,  v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
++        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v20, v21, v1,  \type
++
++        subs            x6,  x6,  #4
++        b.eq            8f
++
++        ld1             {v20.8h}, [x2], x3
++        ld1             {v21.8h}, [x2], x3
++        ld1             {v22.8h}, [x2], x3
++        ld1             {v23.8h}, [x2], x3
++        convolve8       v2,  v3,  v4,  v5,  v25, v26, v27, v16, v17, v18, v19, v20, v21
++        convolve8       v6,  v7,  v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
++        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v24, v25, v1,  \type
++
++        subs            x6,  x6,  #4
++        b.ne            2b
++
++8:
++        subs            x5,  x5,  #8
++        b.eq            9f
++        // x0 -= h * dst_stride
++        msub            x0,  x1,  x4, x0
++        // x2 -= h * src_stride
++        msub            x2,  x3,  x4, x2
++        // x2 -= 8 * src_stride
++        sub             x2,  x2,  x3, lsl #3
++        // x2 += 1 * src_stride
++        add             x2,  x2,  x3
++        add             x2,  x2,  #16
++        add             x0,  x0,  #16
++        b               1b
++9:
++        ret
++endfunc
++.endm
++
++do_8tap_8v put
++do_8tap_8v avg
++
++
++// Instantiate a vertical filter function for filtering a 4 pixels wide
++// slice. This only is designed to work for 4 or 8 output lines.
++.macro do_8tap_4v type
++function \type\()_8tap_4v
++        sub             x2,  x2,  x3, lsl #1
++        sub             x2,  x2,  x3
++        ld1             {v0.8h},  [x6]
++.ifc \type,avg
++        mov             x7,  x0
++.endif
++
++        ld1             {v16.4h}, [x2], x3
++        ld1             {v17.4h}, [x2], x3
++        ld1             {v18.4h}, [x2], x3
++        ld1             {v19.4h}, [x2], x3
++        ld1             {v20.4h}, [x2], x3
++        ld1             {v21.4h}, [x2], x3
++        ld1             {v22.4h}, [x2], x3
++        ld1             {v23.4h}, [x2], x3
++        ld1             {v24.4h}, [x2], x3
++        ld1             {v25.4h}, [x2], x3
++        ld1             {v26.4h}, [x2], x3
++
++        convolve4       v2,  v3,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
++        convolve4       v4,  v5,  v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
++        do_store4       v2,  v3,  v4,  v5,  v28, v29, v30, v31, v1,  \type
++
++        subs            x4,  x4,  #4
++        b.eq            9f
++
++        ld1             {v27.4h}, [x2], x3
++        ld1             {v28.4h}, [x2], x3
++        ld1             {v29.4h}, [x2], x3
++        ld1             {v30.4h}, [x2], x3
++
++        convolve4       v2,  v3,  v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
++        convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
++        do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type
++
++9:
++        ret
++endfunc
++.endm
++
++do_8tap_4v put
++do_8tap_4v avg
++
++
++.macro do_8tap_v_func type, filter, offset, size, bpp
++function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
++        uxtw            x4,  w4
++        mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
++        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
++        add             x6,  x5,  w6, uxtw #4
++        mov             x5,  #\size
++.if \size >= 8
++        b               \type\()_8tap_8v
++.else
++        b               \type\()_8tap_4v
++.endif
++endfunc
++.endm
++
++.macro do_8tap_v_filters size, bpp
++do_8tap_v_func put, regular, 1, \size, \bpp
++do_8tap_v_func avg, regular, 1, \size, \bpp
++do_8tap_v_func put, sharp,   2, \size, \bpp
++do_8tap_v_func avg, sharp,   2, \size, \bpp
++do_8tap_v_func put, smooth,  0, \size, \bpp
++do_8tap_v_func avg, smooth,  0, \size, \bpp
++.endm
++
++.macro do_8tap_v_filters_bpp bpp
++do_8tap_v_filters 64, \bpp
++do_8tap_v_filters 32, \bpp
++do_8tap_v_filters 16, \bpp
++do_8tap_v_filters 8,  \bpp
++do_8tap_v_filters 4,  \bpp
++.endm
++
++do_8tap_v_filters_bpp 10
++do_8tap_v_filters_bpp 12
+diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
+@@ -0,0 +1,687 @@
++/*
++ * Copyright (c) 2016 Google Inc.
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// All public functions in this file have the following signature:
++// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
++//                            const uint8_t *ref, ptrdiff_t ref_stride,
++//                            int h, int mx, int my);
++
++function ff_vp9_copy64_aarch64, export=1
++1:
++        ldp             x5,  x6,  [x2]
++        ldp             x7,  x8,  [x2, #16]
++        stp             x5,  x6,  [x0]
++        ldp             x9,  x10, [x2, #32]
++        stp             x7,  x8,  [x0, #16]
++        subs            w4,  w4,  #1
++        ldp             x11, x12, [x2, #48]
++        stp             x9,  x10, [x0, #32]
++        stp             x11, x12, [x0, #48]
++        add             x2,  x2,  x3
++        add             x0,  x0,  x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg64_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
++        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
++        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
++        urhadd          v0.16b,  v0.16b,  v4.16b
++        urhadd          v1.16b,  v1.16b,  v5.16b
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
++        urhadd          v2.16b,  v2.16b,  v6.16b
++        urhadd          v3.16b,  v3.16b,  v7.16b
++        subs            w4,  w4,  #2
++        urhadd          v16.16b, v16.16b, v20.16b
++        urhadd          v17.16b, v17.16b, v21.16b
++        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
++        urhadd          v18.16b, v18.16b, v22.16b
++        urhadd          v19.16b, v19.16b, v23.16b
++        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_copy32_aarch64, export=1
++1:
++        ldp             x5,  x6,  [x2]
++        ldp             x7,  x8,  [x2, #16]
++        stp             x5,  x6,  [x0]
++        subs            w4,  w4,  #1
++        stp             x7,  x8,  [x0, #16]
++        add             x2,  x2,  x3
++        add             x0,  x0,  x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg32_neon, export=1
++1:
++        ld1             {v2.16b, v3.16b},  [x2], x3
++        ld1             {v0.16b, v1.16b},  [x0]
++        urhadd          v0.16b,  v0.16b,  v2.16b
++        urhadd          v1.16b,  v1.16b,  v3.16b
++        subs            w4,  w4,  #1
++        st1             {v0.16b, v1.16b},  [x0], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_copy16_neon, export=1
++        add             x5,  x0,  x1
++        lsl             x1,  x1,  #1
++        add             x6,  x2,  x3
++        lsl             x3,  x3,  #1
++1:
++        ld1             {v0.16b},  [x2], x3
++        ld1             {v1.16b},  [x6], x3
++        ld1             {v2.16b},  [x2], x3
++        ld1             {v3.16b},  [x6], x3
++        subs            w4,  w4,  #4
++        st1             {v0.16b},  [x0], x1
++        st1             {v1.16b},  [x5], x1
++        st1             {v2.16b},  [x0], x1
++        st1             {v3.16b},  [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg16_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v2.16b},  [x2], x3
++        ld1             {v0.16b},  [x0], x1
++        ld1             {v3.16b},  [x2], x3
++        urhadd          v0.16b,  v0.16b,  v2.16b
++        ld1             {v1.16b},  [x0], x1
++        urhadd          v1.16b,  v1.16b,  v3.16b
++        subs            w4,  w4,  #2
++        st1             {v0.16b},  [x5], x1
++        st1             {v1.16b},  [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_copy8_neon, export=1
++1:
++        ld1             {v0.8b},  [x2], x3
++        ld1             {v1.8b},  [x2], x3
++        subs            w4,  w4,  #2
++        st1             {v0.8b},  [x0], x1
++        st1             {v1.8b},  [x0], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg8_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v2.8b},  [x2], x3
++        ld1             {v0.8b},  [x0], x1
++        ld1             {v3.8b},  [x2], x3
++        urhadd          v0.8b,  v0.8b,  v2.8b
++        ld1             {v1.8b},  [x0], x1
++        urhadd          v1.8b,  v1.8b,  v3.8b
++        subs            w4,  w4,  #2
++        st1             {v0.8b},  [x5], x1
++        st1             {v1.8b},  [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_copy4_neon, export=1
++1:
++        ld1             {v0.s}[0], [x2], x3
++        ld1             {v1.s}[0], [x2], x3
++        st1             {v0.s}[0], [x0], x1
++        ld1             {v2.s}[0], [x2], x3
++        st1             {v1.s}[0], [x0], x1
++        ld1             {v3.s}[0], [x2], x3
++        subs            w4,  w4,  #4
++        st1             {v2.s}[0], [x0], x1
++        st1             {v3.s}[0], [x0], x1
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vp9_avg4_neon, export=1
++        mov             x5,  x0
++1:
++        ld1             {v2.s}[0], [x2], x3
++        ld1             {v0.s}[0], [x0], x1
++        ld1             {v2.s}[1], [x2], x3
++        ld1             {v0.s}[1], [x0], x1
++        ld1             {v3.s}[0], [x2], x3
++        ld1             {v1.s}[0], [x0], x1
++        ld1             {v3.s}[1], [x2], x3
++        ld1             {v1.s}[1], [x0], x1
++        subs            w4,  w4,  #4
++        urhadd          v0.8b,  v0.8b,  v2.8b
++        urhadd          v1.8b,  v1.8b,  v3.8b
++        st1             {v0.s}[0], [x5], x1
++        st1             {v0.s}[1], [x5], x1
++        st1             {v1.s}[0], [x5], x1
++        st1             {v1.s}[1], [x5], x1
++        b.ne            1b
++        ret
++endfunc
++
++
++// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
++// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
++// dst1-dst2 and dst3-dst4 for size >= 16)
++.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
++        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
++        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
++.if \size >= 16
++        mla             \dst1\().8h, v20.8h, v0.h[\offset]
++        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
++        mla             \dst3\().8h, v22.8h, v0.h[\offset]
++        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
++        mla             \dst2\().8h, v21.8h, v0.h[\offset]
++        mla             \dst4\().8h, v23.8h, v0.h[\offset]
++.elseif \size == 8
++        mla             \dst1\().8h, v20.8h, v0.h[\offset]
++        mla             \dst3\().8h, v22.8h, v0.h[\offset]
++.else
++        mla             \dst1\().4h, v20.4h, v0.h[\offset]
++        mla             \dst3\().4h, v22.4h, v0.h[\offset]
++.endif
++.endm
++// The same as above, but don't accumulate straight into the
++// destination, but use a temp register and accumulate with saturation.
++.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
++        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
++        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
++.if \size >= 16
++        mul             v20.8h, v20.8h, v0.h[\offset]
++        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
++        mul             v22.8h, v22.8h, v0.h[\offset]
++        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
++        mul             v21.8h, v21.8h, v0.h[\offset]
++        mul             v23.8h, v23.8h, v0.h[\offset]
++.elseif \size == 8
++        mul             v20.8h, v20.8h, v0.h[\offset]
++        mul             v22.8h, v22.8h, v0.h[\offset]
++.else
++        mul             v20.4h, v20.4h, v0.h[\offset]
++        mul             v22.4h, v22.4h, v0.h[\offset]
++.endif
++.if \size == 4
++        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
++        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
++.else
++        sqadd           \dst1\().8h, \dst1\().8h, v20.8h
++        sqadd           \dst3\().8h, \dst3\().8h, v22.8h
++.if \size >= 16
++        sqadd           \dst2\().8h, \dst2\().8h, v21.8h
++        sqadd           \dst4\().8h, \dst4\().8h, v23.8h
++.endif
++.endif
++.endm
++
++
++// Instantiate a horizontal filter function for the given size.
++// This can work on 4, 8 or 16 pixels in parallel; for larger
++// widths it will do 16 pixels at a time and loop horizontally.
++// The actual width is passed in x5, the height in w4 and the
++// filter coefficients in x9. idx2 is the index of the largest
++// filter coefficient (3 or 4) and idx1 is the other one of them.
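A scalar sketch of the overflow handling this relies on, mirroring the extmla/extmulqadd macros above; the helper names are illustrative.

    #include <stdint.h>

    static int16_t sat_add_s16(int32_t a, int32_t b)      /* models sqadd */
    {
        int32_t s = a + b;
        return s > 32767 ? 32767 : (s < -32768 ? -32768 : (int16_t)s);
    }

    /* Seven of the taps are accumulated with plain 16-bit multiply-adds
     * (safe because their positive sum stays below 127, and 127 * 255
     * still fits in int16_t); the largest tap idx2 is added last with
     * saturation, then rounded, shifted by 7 and clamped to 8 bits. */
    static uint8_t filter8_sample_8bpp(const uint8_t *src, const int16_t *filter, int idx2)
    {
        int16_t acc = 0;                 /* 16-bit, like the .8h mla lanes */
        for (int k = 0; k < 8; k++)
            if (k != idx2)
                acc += src[k] * filter[k];
        int32_t sum = sat_add_s16(acc, src[idx2] * filter[idx2]);
        sum = (sum + 64) >> 7;
        return sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t)sum);
    }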
++.macro do_8tap_h type, size, idx1, idx2
++function \type\()_8tap_\size\()h_\idx1\idx2
++        sub             x2,  x2,  #3
++        add             x6,  x0,  x1
++        add             x7,  x2,  x3
++        add             x1,  x1,  x1
++        add             x3,  x3,  x3
++        // Only size >= 16 loops horizontally and needs
++        // reduced dst stride
++.if \size >= 16
++        sub             x1,  x1,  x5
++.endif
++        // size >= 16 loads two qwords and increments x2,
++        // for size 4/8 it's enough with one qword and no
++        // postincrement
++.if \size >= 16
++        sub             x3,  x3,  x5
++        sub             x3,  x3,  #8
++.endif
++        // Load the filter vector
++        ld1             {v0.8h},  [x9]
++1:
++.if \size >= 16
++        mov             x9,  x5
++.endif
++        // Load src
++.if \size >= 16
++        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
++        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
++.else
++        ld1             {v4.8b,  v5.8b},  [x2]
++        ld1             {v16.8b, v17.8b}, [x7]
++.endif
++        uxtl            v4.8h,  v4.8b
++        uxtl            v5.8h,  v5.8b
++        uxtl            v16.8h, v16.8b
++        uxtl            v17.8h, v17.8b
++.if \size >= 16
++        uxtl            v6.8h,  v6.8b
++        uxtl            v18.8h, v18.8b
++.endif
++2:
++
++        // Accumulate, adding idx2 last with a separate
++        // saturating add. The positive filter coefficients
++        // for all indices except idx2 must add up to less
++        // than 127 for this not to overflow.
++        mul             v1.8h,  v4.8h,  v0.h[0]
++        mul             v24.8h, v16.8h, v0.h[0]
++.if \size >= 16
++        mul             v2.8h,  v5.8h,  v0.h[0]
++        mul             v25.8h, v17.8h, v0.h[0]
++.endif
++        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 1,     \size
++        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 2,     \size
++        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx1, \size
++        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 5,     \size
++        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 6,     \size
++        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 7,     \size
++        extmulqadd      v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx2, \size
++
++        // Round, shift and saturate
++        sqrshrun        v1.8b,   v1.8h,  #7
++        sqrshrun        v24.8b,  v24.8h, #7
++.if \size >= 16
++        sqrshrun2       v1.16b,  v2.8h,  #7
++        sqrshrun2       v24.16b, v25.8h, #7
++.endif
++        // Average
++.ifc \type,avg
++.if \size >= 16
++        ld1             {v2.16b}, [x0]
++        ld1             {v3.16b}, [x6]
++        urhadd          v1.16b,  v1.16b,  v2.16b
++        urhadd          v24.16b, v24.16b, v3.16b
++.elseif \size == 8
++        ld1             {v2.8b},  [x0]
++        ld1             {v3.8b},  [x6]
++        urhadd          v1.8b,  v1.8b,  v2.8b
++        urhadd          v24.8b, v24.8b, v3.8b
++.else
++        ld1             {v2.s}[0], [x0]
++        ld1             {v3.s}[0], [x6]
++        urhadd          v1.8b,  v1.8b,  v2.8b
++        urhadd          v24.8b, v24.8b, v3.8b
++.endif
++.endif
++        // Store and loop horizontally (for size >= 16)
++.if \size >= 16
++        subs            x9,  x9,  #16
++        st1             {v1.16b},  [x0], #16
++        st1             {v24.16b}, [x6], #16
++        b.eq            3f
++        mov             v4.16b,  v6.16b
++        mov             v16.16b, v18.16b
++        ld1             {v6.16b},  [x2], #16
++        ld1             {v18.16b}, [x7], #16
++        uxtl            v5.8h,  v6.8b
++        uxtl2           v6.8h,  v6.16b
++        uxtl            v17.8h, v18.8b
++        uxtl2           v18.8h, v18.16b
++        b               2b
++.elseif \size == 8
++        st1             {v1.8b},    [x0]
++        st1             {v24.8b},   [x6]
++.else // \size == 4
++        st1             {v1.s}[0],  [x0]
++        st1             {v24.s}[0], [x6]
++.endif
++3:
++        // Loop vertically
++        add             x0,  x0,  x1
++        add             x6,  x6,  x1
++        add             x2,  x2,  x3
++        add             x7,  x7,  x3
++        subs            w4,  w4,  #2
++        b.ne            1b
++        ret
++endfunc
++.endm
++
++.macro do_8tap_h_size size
++do_8tap_h put, \size, 3, 4
++do_8tap_h avg, \size, 3, 4
++do_8tap_h put, \size, 4, 3
++do_8tap_h avg, \size, 4, 3
++.endm
++
++do_8tap_h_size 4
++do_8tap_h_size 8
++do_8tap_h_size 16
++
++.macro do_8tap_h_func type, filter, offset, size
++function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
++        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
++        cmp             w5,  #8
++        add             x9,  x6,  w5, uxtw #4
++        mov             x5,  #\size
++.if \size >= 16
++        b.ge            \type\()_8tap_16h_34
++        b               \type\()_8tap_16h_43
++.else
++        b.ge            \type\()_8tap_\size\()h_34
++        b               \type\()_8tap_\size\()h_43
++.endif
++endfunc
++.endm
++
++.macro do_8tap_h_filters size
++do_8tap_h_func put, regular, 1, \size
++do_8tap_h_func avg, regular, 1, \size
++do_8tap_h_func put, sharp,   2, \size
++do_8tap_h_func avg, sharp,   2, \size
++do_8tap_h_func put, smooth,  0, \size
++do_8tap_h_func avg, smooth,  0, \size
++.endm
++
++do_8tap_h_filters 64
++do_8tap_h_filters 32
++do_8tap_h_filters 16
++do_8tap_h_filters 8
++do_8tap_h_filters 4
++
++
++// Vertical filters
++
++// Round, shift and saturate and store reg1-reg2 over 4 lines
++.macro do_store4 reg1, reg2, tmp1, tmp2, type
++        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
++        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
++.ifc \type,avg
++        ld1             {\tmp1\().s}[0],  [x7], x1
++        ld1             {\tmp2\().s}[0],  [x7], x1
++        ld1             {\tmp1\().s}[1],  [x7], x1
++        ld1             {\tmp2\().s}[1],  [x7], x1
++        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
++        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
++.endif
++        st1             {\reg1\().s}[0],  [x0], x1
++        st1             {\reg2\().s}[0],  [x0], x1
++        st1             {\reg1\().s}[1],  [x0], x1
++        st1             {\reg2\().s}[1],  [x0], x1
++.endm
++
++// Round, shift and saturate and store reg1-4
++.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
++        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
++        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
++        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
++        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
++.ifc \type,avg
++        ld1             {\tmp1\().8b},  [x7], x1
++        ld1             {\tmp2\().8b},  [x7], x1
++        ld1             {\tmp3\().8b},  [x7], x1
++        ld1             {\tmp4\().8b},  [x7], x1
++        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
++        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
++        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
++        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
++.endif
++        st1             {\reg1\().8b},  [x0], x1
++        st1             {\reg2\().8b},  [x0], x1
++        st1             {\reg3\().8b},  [x0], x1
++        st1             {\reg4\().8b},  [x0], x1
++.endm
++
++// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
++// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
++// at the end with saturation. Indices 0 and 7 always have negative or zero
++// coefficients, so they can be accumulated into tmp1-tmp2 together with the
++// largest coefficient.
++.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
++        mul             \dst1\().8h, \src2\().8h, v0.h[1]
++        mul             \dst2\().8h, \src3\().8h, v0.h[1]
++        mul             \tmp1\().8h, \src1\().8h, v0.h[0]
++        mul             \tmp2\().8h, \src2\().8h, v0.h[0]
++        mla             \dst1\().8h, \src3\().8h, v0.h[2]
++        mla             \dst2\().8h, \src4\().8h, v0.h[2]
++.if \idx1 == 3
++        mla             \dst1\().8h, \src4\().8h, v0.h[3]
++        mla             \dst2\().8h, \src5\().8h, v0.h[3]
++.else
++        mla             \dst1\().8h, \src5\().8h, v0.h[4]
++        mla             \dst2\().8h, \src6\().8h, v0.h[4]
++.endif
++        mla             \dst1\().8h, \src6\().8h, v0.h[5]
++        mla             \dst2\().8h, \src7\().8h, v0.h[5]
++        mla             \tmp1\().8h, \src8\().8h, v0.h[7]
++        mla             \tmp2\().8h, \src9\().8h, v0.h[7]
++        mla             \dst1\().8h, \src7\().8h, v0.h[6]
++        mla             \dst2\().8h, \src8\().8h, v0.h[6]
++.if \idx2 == 3
++        mla             \tmp1\().8h, \src4\().8h, v0.h[3]
++        mla             \tmp2\().8h, \src5\().8h, v0.h[3]
++.else
++        mla             \tmp1\().8h, \src5\().8h, v0.h[4]
++        mla             \tmp2\().8h, \src6\().8h, v0.h[4]
++.endif
++        sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
++        sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
++.endm
++
++// Load pixels and extend them to 16 bit
++.macro loadl dst1, dst2, dst3, dst4
++        ld1             {v1.8b}, [x2], x3
++        ld1             {v2.8b}, [x2], x3
++        ld1             {v3.8b}, [x2], x3
++.ifnb \dst4
++        ld1             {v4.8b}, [x2], x3
++.endif
++        uxtl            \dst1\().8h, v1.8b
++        uxtl            \dst2\().8h, v2.8b
++        uxtl            \dst3\().8h, v3.8b
++.ifnb \dst4
++        uxtl            \dst4\().8h, v4.8b
++.endif
++.endm
++
++// Instantiate a vertical filter function for filtering 8 pixels at a time.
++// The height is passed in x4, the width in x5 and the filter coefficients
++// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
++// and idx1 is the other one of them.
++.macro do_8tap_8v type, idx1, idx2
++function \type\()_8tap_8v_\idx1\idx2
++        sub             x2,  x2,  x3, lsl #1
++        sub             x2,  x2,  x3
++        ld1             {v0.8h},  [x6]
++1:
++.ifc \type,avg
++        mov             x7,  x0
++.endif
++        mov             x6,  x4
++
++        loadl           v17, v18, v19
++
++        loadl           v20, v21, v22, v23
++2:
++        loadl           v24, v25, v26, v27
++        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
++        convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
++        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
++
++        subs            x6,  x6,  #4
++        b.eq            8f
++
++        loadl           v16, v17, v18, v19
++        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
++        convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
++        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
++
++        subs            x6,  x6,  #4
++        b.eq            8f
++
++        loadl           v20, v21, v22, v23
++        convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
++        convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
++        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
++
++        subs            x6,  x6,  #4
++        b.ne            2b
++
++8:
++        subs            x5,  x5,  #8
++        b.eq            9f
++        // x0 -= h * dst_stride
++        msub            x0,  x1,  x4, x0
++        // x2 -= h * src_stride
++        msub            x2,  x3,  x4, x2
++        // x2 -= 8 * src_stride
++        sub             x2,  x2,  x3, lsl #3
++        // x2 += 1 * src_stride
++        add             x2,  x2,  x3
++        add             x2,  x2,  #8
++        add             x0,  x0,  #8
++        b               1b
++9:
++        ret
++endfunc
++.endm
++
++do_8tap_8v put, 3, 4
++do_8tap_8v put, 4, 3
++do_8tap_8v avg, 3, 4
++do_8tap_8v avg, 4, 3
++
++
++// Instantiate a vertical filter function for filtering a 4 pixels wide
++// slice. The first half of the registers contain one row, while the second
++// half of a register contains the second-next row (also stored in the first
++// half of the register two steps ahead). The convolution does two outputs
++// at a time; the output of v17-v24 into one, and v18-v25 into another one.
++// The first half of first output is the first output row, the first half
++// of the other output is the second output row. The second halves of the
++// registers are rows 3 and 4.
++// This only is designed to work for 4 or 8 output lines.
++.macro do_8tap_4v type, idx1, idx2
++function \type\()_8tap_4v_\idx1\idx2
++        sub             x2,  x2,  x3, lsl #1
++        sub             x2,  x2,  x3
++        ld1             {v0.8h},  [x6]
++.ifc \type,avg
++        mov             x7,  x0
++.endif
++
++        ld1             {v1.s}[0],  [x2], x3
++        ld1             {v2.s}[0],  [x2], x3
++        ld1             {v3.s}[0],  [x2], x3
++        ld1             {v4.s}[0],  [x2], x3
++        ld1             {v5.s}[0],  [x2], x3
++        ld1             {v6.s}[0],  [x2], x3
++        trn1            v1.2s,  v1.2s,  v3.2s
++        ld1             {v7.s}[0],  [x2], x3
++        trn1            v2.2s,  v2.2s,  v4.2s
++        ld1             {v26.s}[0], [x2], x3
++        uxtl            v17.8h, v1.8b
++        trn1            v3.2s,  v3.2s,  v5.2s
++        ld1             {v27.s}[0], [x2], x3
++        uxtl            v18.8h, v2.8b
++        trn1            v4.2s,  v4.2s,  v6.2s
++        ld1             {v28.s}[0], [x2], x3
++        uxtl            v19.8h, v3.8b
++        trn1            v5.2s,  v5.2s,  v7.2s
++        ld1             {v29.s}[0], [x2], x3
++        uxtl            v20.8h, v4.8b
++        trn1            v6.2s,  v6.2s,  v26.2s
++        uxtl            v21.8h, v5.8b
++        trn1            v7.2s,  v7.2s,  v27.2s
++        uxtl            v22.8h, v6.8b
++        trn1            v26.2s, v26.2s, v28.2s
++        uxtl            v23.8h, v7.8b
++        trn1            v27.2s, v27.2s, v29.2s
++        uxtl            v24.8h, v26.8b
++        uxtl            v25.8h, v27.8b
++
++        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
++        do_store4       v1,  v2,  v5,  v6,  \type
++
++        subs            x4,  x4,  #4
++        b.eq            9f
++
++        ld1             {v1.s}[0],  [x2], x3
++        ld1             {v2.s}[0],  [x2], x3
++        trn1            v28.2s, v28.2s, v1.2s
++        trn1            v29.2s, v29.2s, v2.2s
++        ld1             {v1.s}[1],  [x2], x3
++        uxtl            v26.8h, v28.8b
++        ld1             {v2.s}[1],  [x2], x3
++        uxtl            v27.8h, v29.8b
++        uxtl            v28.8h, v1.8b
++        uxtl            v29.8h, v2.8b
++
++        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
++        do_store4       v1,  v2,  v5,  v6,  \type
++
++9:
++        ret
++endfunc
++.endm
++
++do_8tap_4v put, 3, 4
++do_8tap_4v put, 4, 3
++do_8tap_4v avg, 3, 4
++do_8tap_4v avg, 4, 3
++
++
++.macro do_8tap_v_func type, filter, offset, size
++function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
++        uxtw            x4,  w4
++        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
++        cmp             w6,  #8
++        add             x6,  x5,  w6, uxtw #4
++        mov             x5,  #\size
++.if \size >= 8
++        b.ge            \type\()_8tap_8v_34
++        b               \type\()_8tap_8v_43
++.else
++        b.ge            \type\()_8tap_4v_34
++        b               \type\()_8tap_4v_43
++.endif
++endfunc
++.endm
++
++.macro do_8tap_v_filters size
++do_8tap_v_func put, regular, 1, \size
++do_8tap_v_func avg, regular, 1, \size
++do_8tap_v_func put, sharp,   2, \size
++do_8tap_v_func avg, sharp,   2, \size
++do_8tap_v_func put, smooth,  0, \size
++do_8tap_v_func avg, smooth,  0, \size
++.endm
++
++do_8tap_v_filters 64
++do_8tap_v_filters 32
++do_8tap_v_filters 16
++do_8tap_v_filters 8
++do_8tap_v_filters 4
+diff --git a/media/ffvpx/libavutil/aarch64/asm.S b/media/ffvpx/libavutil/aarch64/asm.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/asm.S
+@@ -0,0 +1,104 @@
++/*
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++
++#ifdef __ELF__
++#   define ELF
++#else
++#   define ELF #
++#endif
++
++#if HAVE_AS_FUNC
++#   define FUNC
++#else
++#   define FUNC #
++#endif
++
++.macro  function name, export=0, align=2
++    .macro endfunc
++ELF     .size   \name, . - \name
++FUNC    .endfunc
++        .purgem endfunc
++    .endm
++        .text
++        .align          \align
++    .if \export
++        .global EXTERN_ASM\name
++ELF     .type   EXTERN_ASM\name, %function
++FUNC    .func   EXTERN_ASM\name
++EXTERN_ASM\name:
++    .else
++ELF     .type   \name, %function
++FUNC    .func   \name
++\name:
++    .endif
++.endm
++
++.macro  const   name, align=2, relocate=0
++    .macro endconst
++ELF     .size   \name, . - \name
++        .purgem endconst
++    .endm
++#if HAVE_SECTION_DATA_REL_RO
++.if \relocate
++        .section        .data.rel.ro
++.else
++        .section        .rodata
++.endif
++#elif !defined(__MACH__)
++        .section        .rodata
++#else
++        .const_data
++#endif
++        .align          \align
++\name:
++.endm
++
++.macro  movrel rd, val, offset=0
++#if CONFIG_PIC && defined(__APPLE__)
++    .if \offset < 0
++        adrp            \rd, \val@PAGE
++        add             \rd, \rd, \val@PAGEOFF
++        sub             \rd, \rd, -(\offset)
++    .else
++        adrp            \rd, \val+(\offset)@PAGE
++        add             \rd, \rd, \val+(\offset)@PAGEOFF
++    .endif
++#elif CONFIG_PIC && defined(_WIN32)
++    .if \offset < 0
++        adrp            \rd, \val
++        add             \rd, \rd, :lo12:\val
++        sub             \rd, \rd, -(\offset)
++    .else
++        adrp            \rd, \val+(\offset)
++        add             \rd, \rd, :lo12:\val+(\offset)
++    .endif
++#elif CONFIG_PIC
++        adrp            \rd, \val+(\offset)
++        add             \rd, \rd, :lo12:\val+(\offset)
++#else
++        ldr             \rd, =\val+\offset
++#endif
++.endm
++
++#define GLUE(a, b) a ## b
++#define JOIN(a, b) GLUE(a, b)
++#define X(s) JOIN(EXTERN_ASM, s)
+diff --git a/media/ffvpx/libavutil/aarch64/bswap.h b/media/ffvpx/libavutil/aarch64/bswap.h
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/bswap.h
+@@ -0,0 +1,51 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVUTIL_AARCH64_BSWAP_H
++#define AVUTIL_AARCH64_BSWAP_H
++
++#include <stdint.h>
++#include "config.h"
++#include "libavutil/attributes.h"
++
++#if HAVE_INLINE_ASM
++
++#define av_bswap16 av_bswap16
++static av_always_inline av_const unsigned av_bswap16(unsigned x)
++{
++    __asm__("rev16 %w0, %w0" : "+r"(x));
++    return x;
++}
++
++#define av_bswap32 av_bswap32
++static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
++{
++    __asm__("rev %w0, %w0" : "+r"(x));
++    return x;
++}
++
++#define av_bswap64 av_bswap64
++static av_always_inline av_const uint64_t av_bswap64(uint64_t x)
++{
++    __asm__("rev %0, %0" : "+r"(x));
++    return x;
++}
++
++#endif /* HAVE_INLINE_ASM */
++
++#endif /* AVUTIL_AARCH64_BSWAP_H */
+diff --git a/media/ffvpx/libavutil/aarch64/cpu.c b/media/ffvpx/libavutil/aarch64/cpu.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/cpu.c
+@@ -0,0 +1,38 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/cpu.h"
++#include "libavutil/cpu_internal.h"
++#include "config.h"
++
++int ff_get_cpu_flags_aarch64(void)
++{
++    return AV_CPU_FLAG_ARMV8 * HAVE_ARMV8 |
++           AV_CPU_FLAG_NEON  * HAVE_NEON  |
++           AV_CPU_FLAG_VFP   * HAVE_VFP;
++}
++
++size_t ff_get_cpu_max_align_aarch64(void)
++{
++    int flags = av_get_cpu_flags();
++
++    if (flags & AV_CPU_FLAG_NEON)
++        return 16;
++
++    return 8;
++}
+diff --git a/media/ffvpx/libavutil/aarch64/cpu.h b/media/ffvpx/libavutil/aarch64/cpu.h
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/cpu.h
+@@ -0,0 +1,29 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVUTIL_AARCH64_CPU_H
++#define AVUTIL_AARCH64_CPU_H
++
++#include "libavutil/cpu.h"
++#include "libavutil/cpu_internal.h"
++
++#define have_armv8(flags) CPUEXT(flags, ARMV8)
++#define have_neon(flags) CPUEXT(flags, NEON)
++#define have_vfp(flags)  CPUEXT(flags, VFP)
++
++#endif /* AVUTIL_AARCH64_CPU_H */
+diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_init.c b/media/ffvpx/libavutil/aarch64/float_dsp_init.c
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/float_dsp_init.c
+@@ -0,0 +1,69 @@
++/*
++ * ARM NEON optimised Float DSP functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/float_dsp.h"
++#include "cpu.h"
++
++void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1,
++                         int len);
++
++void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
++                                int len);
++
++void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
++                                int len);
++
++void ff_vector_dmul_scalar_neon(double *dst, const double *src, double mul,
++                                int len);
++
++void ff_vector_fmul_window_neon(float *dst, const float *src0,
++                                const float *src1, const float *win, int len);
++
++void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
++                             const float *src2, int len);
++
++void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
++                                 const float *src1, int len);
++
++void ff_butterflies_float_neon(float *v1, float *v2, int len);
++
++float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
++
++av_cold void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp)
++{
++    int cpu_flags = av_get_cpu_flags();
++
++    if (have_neon(cpu_flags)) {
++        fdsp->butterflies_float   = ff_butterflies_float_neon;
++        fdsp->scalarproduct_float = ff_scalarproduct_float_neon;
++        fdsp->vector_dmul_scalar  = ff_vector_dmul_scalar_neon;
++        fdsp->vector_fmul         = ff_vector_fmul_neon;
++        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_neon;
++        fdsp->vector_fmul_add     = ff_vector_fmul_add_neon;
++        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
++        fdsp->vector_fmul_scalar  = ff_vector_fmul_scalar_neon;
++        fdsp->vector_fmul_window  = ff_vector_fmul_window_neon;
++    }
++}
+diff --git a/media/ffvpx/libavutil/aarch64/float_dsp_neon.S b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/float_dsp_neon.S
+@@ -0,0 +1,202 @@
++/*
++ * ARM NEON optimised Float DSP functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "asm.S"
++
++function ff_vector_fmul_neon, export=1
++1:      subs            w3,  w3,  #16
++        ld1             {v0.4S, v1.4S}, [x1], #32
++        ld1             {v2.4S, v3.4S}, [x1], #32
++        ld1             {v4.4S, v5.4S}, [x2], #32
++        ld1             {v6.4S, v7.4S}, [x2], #32
++        fmul            v16.4S, v0.4S,  v4.4S
++        fmul            v17.4S, v1.4S,  v5.4S
++        fmul            v18.4S, v2.4S,  v6.4S
++        fmul            v19.4S, v3.4S,  v7.4S
++        st1             {v16.4S, v17.4S}, [x0], #32
++        st1             {v18.4S, v19.4S}, [x0], #32
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vector_fmac_scalar_neon, export=1
++        mov             x3,  #-32
++1:      subs            w2,  w2,  #16
++        ld1             {v16.4S, v17.4S}, [x0], #32
++        ld1             {v18.4S, v19.4S}, [x0], x3
++        ld1             {v4.4S,  v5.4S},  [x1], #32
++        ld1             {v6.4S,  v7.4S},  [x1], #32
++        fmla            v16.4S, v4.4S,  v0.S[0]
++        fmla            v17.4S, v5.4S,  v0.S[0]
++        fmla            v18.4S, v6.4S,  v0.S[0]
++        fmla            v19.4S, v7.4S,  v0.S[0]
++        st1             {v16.4S, v17.4S}, [x0], #32
++        st1             {v18.4S, v19.4S}, [x0], #32
++        b.ne            1b
++        ret
++endfunc
++
++function ff_vector_fmul_scalar_neon, export=1
++        mov             w4,  #15
++        bics            w3,  w2,  w4
++        dup             v16.4S, v0.S[0]
++        b.eq            3f
++        ld1             {v0.4S, v1.4S}, [x1], #32
++1:      subs            w3,  w3,  #16
++        fmul            v0.4S,  v0.4S,  v16.4S
++        ld1             {v2.4S, v3.4S}, [x1], #32
++        fmul            v1.4S,  v1.4S,  v16.4S
++        fmul            v2.4S,  v2.4S,  v16.4S
++        st1             {v0.4S, v1.4S}, [x0], #32
++        fmul            v3.4S,  v3.4S,  v16.4S
++        b.eq            2f
++        ld1             {v0.4S, v1.4S}, [x1], #32
++        st1             {v2.4S, v3.4S}, [x0], #32
++        b               1b
++2:      ands            w2,  w2,  #15
++        st1             {v2.4S, v3.4S}, [x0], #32
++        b.eq            4f
++3:      ld1             {v0.4S}, [x1], #16
++        fmul            v0.4S,  v0.4S,  v16.4S
++        st1             {v0.4S}, [x0], #16
++        subs            w2,  w2,  #4
++        b.gt            3b
++4:      ret
++endfunc
++
++function ff_vector_dmul_scalar_neon, export=1
++        dup             v16.2D, v0.D[0]
++        ld1             {v0.2D, v1.2D}, [x1], #32
++1:      subs            w2,  w2,  #8
++        fmul            v0.2D,  v0.2D,  v16.2D
++        ld1             {v2.2D, v3.2D}, [x1], #32
++        fmul            v1.2D,  v1.2D,  v16.2D
++        fmul            v2.2D,  v2.2D,  v16.2D
++        st1             {v0.2D, v1.2D}, [x0], #32
++        fmul            v3.2D,  v3.2D,  v16.2D
++        ld1             {v0.2D, v1.2D}, [x1], #32
++        st1             {v2.2D, v3.2D}, [x0], #32
++        b.gt            1b
++        ret
++endfunc
++
++function ff_vector_fmul_window_neon, export=1
++        sxtw            x4,  w4                 // len
++        sub             x2,  x2,  #8
++        sub             x5,  x4,  #2
++        add             x2,  x2,  x5, lsl #2    // src1 + 4 * (len - 4)
++        add             x6,  x3,  x5, lsl #3    // win  + 8 * (len - 2)
++        add             x5,  x0,  x5, lsl #3    // dst  + 8 * (len - 2)
++        mov             x7,  #-16
++        ld1             {v0.4S},  [x1], #16     // s0
++        ld1             {v2.4S},  [x3], #16     // wi
++        ld1             {v1.4S},  [x2], x7      // s1
++1:      ld1             {v3.4S},  [x6], x7      // wj
++        subs            x4,  x4,  #4
++        fmul            v17.4S, v0.4S,  v2.4S   // s0 * wi
++        rev64           v4.4S,  v1.4S
++        rev64           v5.4S,  v3.4S
++        rev64           v17.4S, v17.4S
++        ext             v4.16B,  v4.16B,  v4.16B,  #8 // s1_r
++        ext             v5.16B,  v5.16B,  v5.16B,  #8 // wj_r
++        ext             v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
++        fmul            v16.4S, v0.4S,  v5.4S  // s0 * wj_r
++        fmla            v17.4S, v1.4S,  v3.4S  // (s0 * wi)_rev + s1 * wj
++        b.eq            2f
++        ld1             {v0.4S},  [x1], #16
++        fmls            v16.4S, v4.4S,  v2.4S  // s0 * wj_r - s1_r * wi
++        st1             {v17.4S}, [x5], x7
++        ld1             {v2.4S},  [x3], #16
++        ld1             {v1.4S},  [x2], x7
++        st1             {v16.4S}, [x0], #16
++        b               1b
++2:
++        fmls            v16.4S, v4.4S,  v2.4S  // s0 * wj_r - s1_r * wi
++        st1             {v17.4S}, [x5], x7
++        st1             {v16.4S}, [x0], #16
++        ret
++endfunc
++
++function ff_vector_fmul_add_neon, export=1
++        ld1             {v0.4S, v1.4S},  [x1], #32
++        ld1             {v2.4S, v3.4S},  [x2], #32
++        ld1             {v4.4S, v5.4S},  [x3], #32
++1:      subs            w4,  w4,  #8
++        fmla            v4.4S,  v0.4S,  v2.4S
++        fmla            v5.4S,  v1.4S,  v3.4S
++        b.eq            2f
++        ld1             {v0.4S, v1.4S},  [x1], #32
++        ld1             {v2.4S, v3.4S},  [x2], #32
++        st1             {v4.4S, v5.4S},  [x0], #32
++        ld1             {v4.4S, v5.4S},  [x3], #32
++        b               1b
++2:      st1             {v4.4S, v5.4S},  [x0], #32
++        ret
++endfunc
++
++function ff_vector_fmul_reverse_neon, export=1
++        sxtw            x3,  w3
++        add             x2,  x2,  x3,  lsl #2
++        sub             x2,  x2,  #32
++        mov             x4, #-32
++        ld1             {v2.4S, v3.4S},  [x2], x4
++        ld1             {v0.4S, v1.4S},  [x1], #32
++1:      subs            x3,  x3,  #8
++        rev64           v3.4S,  v3.4S
++        rev64           v2.4S,  v2.4S
++        ext             v3.16B, v3.16B, v3.16B,  #8
++        ext             v2.16B, v2.16B, v2.16B,  #8
++        fmul            v16.4S, v0.4S,  v3.4S
++        fmul            v17.4S, v1.4S,  v2.4S
++        b.eq            2f
++        ld1             {v2.4S, v3.4S},  [x2], x4
++        ld1             {v0.4S, v1.4S},  [x1], #32
++        st1             {v16.4S, v17.4S},  [x0], #32
++        b               1b
++2:      st1             {v16.4S, v17.4S},  [x0], #32
++        ret
++endfunc
++
++function ff_butterflies_float_neon, export=1
++1:      ld1             {v0.4S}, [x0]
++        ld1             {v1.4S}, [x1]
++        subs            w2,  w2,  #4
++        fsub            v2.4S,   v0.4S,  v1.4S
++        fadd            v3.4S,   v0.4S,  v1.4S
++        st1             {v2.4S}, [x1],   #16
++        st1             {v3.4S}, [x0],   #16
++        b.gt            1b
++        ret
++endfunc
++
++function ff_scalarproduct_float_neon, export=1
++        movi            v2.4S,  #0
++1:      ld1             {v0.4S}, [x0],   #16
++        ld1             {v1.4S}, [x1],   #16
++        subs            w2,      w2,     #4
++        fmla            v2.4S,   v0.4S,  v1.4S
++        b.gt            1b
++        faddp           v0.4S,   v2.4S,  v2.4S
++        faddp           s0,      v0.2S
++        ret
++endfunc
+diff --git a/media/ffvpx/libavutil/aarch64/timer.h b/media/ffvpx/libavutil/aarch64/timer.h
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/timer.h
+@@ -0,0 +1,44 @@
++/*
++ * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVUTIL_AARCH64_TIMER_H
++#define AVUTIL_AARCH64_TIMER_H
++
++#include <stdint.h>
++#include "config.h"
++
++#if HAVE_INLINE_ASM
++
++#define AV_READ_TIME read_time
++
++static inline uint64_t read_time(void)
++{
++    uint64_t cycle_counter;
++    __asm__ volatile(
++        "isb                   \t\n"
++        "mrs %0, pmccntr_el0       "
++        : "=r"(cycle_counter) :: "memory" );
++
++    return cycle_counter;
++}
++
++#endif /* HAVE_INLINE_ASM */
++
++#endif /* AVUTIL_AARCH64_TIMER_H */
+

+ 1016 - 0
mozilla-release/patches/1540760-6-68a1.patch

@@ -0,0 +1,1016 @@
+# HG changeset patch
+# User Dan Minor <dminor@mozilla.com>
+# Date 1556723090 0
+# Node ID d000d40067de32c45c46b39a413ad6a9d2949411
+# Parent  a73351520d7b4e2532abbf7ea81767b1c778c0ce
+Bug 1540760 - Build system changes for aarch64-win64 support in ffvpx; r=jya
+
+Differential Revision: https://phabricator.services.mozilla.com/D27790
+
+diff --git a/media/ffvpx/config.h b/media/ffvpx/config.h
+--- a/media/ffvpx/config.h
++++ b/media/ffvpx/config.h
+@@ -14,21 +14,25 @@
+ #endif
+ #else // MOZ_FFVPX_FLACONLY
+ #if defined(XP_WIN)
+ // Avoid conflicts with mozilla-config.h
+ #if !defined(_MSC_VER)
+ #undef HAVE_DIRENT_H
+ #undef HAVE_UNISTD_H
+ #endif
++#if defined(_ARM64_)
++#include "config_aarch64_win64.h"
++#else
+ #if defined(HAVE_64BIT_BUILD)
+ #include "config_win64.h"
+ #else
+ #include "config_win32.h"
+ #endif
++#endif
+ // Adjust configure defines for GCC
+ #if !defined(_MSC_VER)
+ #if !defined(HAVE_64BIT_BUILD)
+ #undef HAVE_MM_EMPTY
+ #define HAVE_MM_EMPTY 0
+ #endif
+ #undef HAVE_LIBC_MSVCRT
+ #define HAVE_LIBC_MSVCRT 0
+diff --git a/media/ffvpx/config_aarch64_win64.h b/media/ffvpx/config_aarch64_win64.h
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/config_aarch64_win64.h
+@@ -0,0 +1,665 @@
++/* Automatically generated by configure - do not modify! */
++#ifndef FFMPEG_CONFIG_H
++#define FFMPEG_CONFIG_H
++#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --toolchain=msvc --disable-cuda --disable-cuvid"
++#define FFMPEG_LICENSE "LGPL version 2.1 or later"
++#define CONFIG_THIS_YEAR 2018
++#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
++#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
++#define CC_IDENT "Microsoft (R) C/C++ Optimizing Compiler Version 19.15.26726 for x64"
++#define av_restrict __restrict
++#define EXTERN_PREFIX ""
++#define EXTERN_ASM
++#define BUILDSUF ""
++#define SLIBSUF ".dll"
++#define HAVE_MMX2 HAVE_MMXEXT
++#define SWS_MAX_FILTER_SIZE 256
++#define ARCH_AARCH64 1
++#define ARCH_ALPHA 0
++#define ARCH_ARM 0
++#define ARCH_AVR32 0
++#define ARCH_AVR32_AP 0
++#define ARCH_AVR32_UC 0
++#define ARCH_BFIN 0
++#define ARCH_IA64 0
++#define ARCH_M68K 0
++#define ARCH_MIPS 0
++#define ARCH_MIPS64 0
++#define ARCH_PARISC 0
++#define ARCH_PPC 0
++#define ARCH_PPC64 0
++#define ARCH_S390 0
++#define ARCH_SH4 0
++#define ARCH_SPARC 0
++#define ARCH_SPARC64 0
++#define ARCH_TILEGX 0
++#define ARCH_TILEPRO 0
++#define ARCH_TOMI 0
++#define ARCH_X86 0
++#define ARCH_X86_32 0
++#define ARCH_X86_64 0
++#define HAVE_ARMV5TE 0
++#define HAVE_ARMV6 0
++#define HAVE_ARMV6T2 0
++#define HAVE_ARMV8 0
++#define HAVE_NEON 0
++#define HAVE_VFP 0
++#define HAVE_VFPV3 0
++#define HAVE_SETEND 0
++#define HAVE_ALTIVEC 0
++#define HAVE_DCBZL 0
++#define HAVE_LDBRX 0
++#define HAVE_POWER8 0
++#define HAVE_PPC4XX 0
++#define HAVE_VSX 0
++#define HAVE_AESNI 0
++#define HAVE_AMD3DNOW 0
++#define HAVE_AMD3DNOWEXT 0
++#define HAVE_AVX 1
++#define HAVE_AVX2 1
++#define HAVE_AVX512 1
++#define HAVE_FMA3 1
++#define HAVE_FMA4 1
++#define HAVE_MMX 1
++#define HAVE_MMXEXT 1
++#define HAVE_SSE 1
++#define HAVE_SSE2 1
++#define HAVE_SSE3 1
++#define HAVE_SSE4 1
++#define HAVE_SSE42 1
++#define HAVE_SSSE3 1
++#define HAVE_XOP 1
++#define HAVE_CPUNOP 0
++#define HAVE_I686 1
++#define HAVE_MIPSFPU 0
++#define HAVE_MIPS32R2 0
++#define HAVE_MIPS32R5 0
++#define HAVE_MIPS64R2 0
++#define HAVE_MIPS32R6 0
++#define HAVE_MIPS64R6 0
++#define HAVE_MIPSDSP 0
++#define HAVE_MIPSDSPR2 0
++#define HAVE_MSA 0
++#define HAVE_LOONGSON2 0
++#define HAVE_LOONGSON3 0
++#define HAVE_MMI 0
++#define HAVE_ARMV5TE_EXTERNAL 0
++#define HAVE_ARMV6_EXTERNAL 0
++#define HAVE_ARMV6T2_EXTERNAL 0
++#define HAVE_ARMV8_EXTERNAL 0
++#define HAVE_NEON_EXTERNAL 0
++#define HAVE_VFP_EXTERNAL 0
++#define HAVE_VFPV3_EXTERNAL 0
++#define HAVE_SETEND_EXTERNAL 0
++#define HAVE_ALTIVEC_EXTERNAL 0
++#define HAVE_DCBZL_EXTERNAL 0
++#define HAVE_LDBRX_EXTERNAL 0
++#define HAVE_POWER8_EXTERNAL 0
++#define HAVE_PPC4XX_EXTERNAL 0
++#define HAVE_VSX_EXTERNAL 0
++#define HAVE_AESNI_EXTERNAL 0
++#define HAVE_AMD3DNOW_EXTERNAL 0
++#define HAVE_AMD3DNOWEXT_EXTERNAL 0
++#define HAVE_AVX_EXTERNAL 0
++#define HAVE_AVX2_EXTERNAL 0
++#define HAVE_AVX512_EXTERNAL 0
++#define HAVE_FMA3_EXTERNAL 0
++#define HAVE_FMA4_EXTERNAL 0
++#define HAVE_MMX_EXTERNAL 0 
++#define HAVE_MMXEXT_EXTERNAL 0
++#define HAVE_SSE_EXTERNAL 0
++#define HAVE_SSE2_EXTERNAL 0
++#define HAVE_SSE3_EXTERNAL 0
++#define HAVE_SSE4_EXTERNAL 0
++#define HAVE_SSE42_EXTERNAL 0
++#define HAVE_SSSE3_EXTERNAL 0
++#define HAVE_XOP_EXTERNAL 0
++#define HAVE_CPUNOP_EXTERNAL 0
++#define HAVE_I686_EXTERNAL 0
++#define HAVE_MIPSFPU_EXTERNAL 0
++#define HAVE_MIPS32R2_EXTERNAL 0
++#define HAVE_MIPS32R5_EXTERNAL 0
++#define HAVE_MIPS64R2_EXTERNAL 0
++#define HAVE_MIPS32R6_EXTERNAL 0
++#define HAVE_MIPS64R6_EXTERNAL 0
++#define HAVE_MIPSDSP_EXTERNAL 0
++#define HAVE_MIPSDSPR2_EXTERNAL 0
++#define HAVE_MSA_EXTERNAL 0
++#define HAVE_LOONGSON2_EXTERNAL 0
++#define HAVE_LOONGSON3_EXTERNAL 0
++#define HAVE_MMI_EXTERNAL 0
++#define HAVE_ARMV5TE_INLINE 0
++#define HAVE_ARMV6_INLINE 0
++#define HAVE_ARMV6T2_INLINE 0
++#define HAVE_ARMV8_INLINE 0
++#define HAVE_NEON_INLINE 1
++#define HAVE_VFP_INLINE 0
++#define HAVE_VFPV3_INLINE 0
++#define HAVE_SETEND_INLINE 0
++#define HAVE_ALTIVEC_INLINE 0
++#define HAVE_DCBZL_INLINE 0
++#define HAVE_LDBRX_INLINE 0
++#define HAVE_POWER8_INLINE 0
++#define HAVE_PPC4XX_INLINE 0
++#define HAVE_VSX_INLINE 0
++#define HAVE_AESNI_INLINE 0
++#define HAVE_AMD3DNOW_INLINE 0
++#define HAVE_AMD3DNOWEXT_INLINE 0
++#define HAVE_AVX_INLINE 0
++#define HAVE_AVX2_INLINE 0
++#define HAVE_AVX512_INLINE 0
++#define HAVE_FMA3_INLINE 0
++#define HAVE_FMA4_INLINE 0
++#define HAVE_MMX_INLINE 0
++#define HAVE_MMXEXT_INLINE 0
++#define HAVE_SSE_INLINE 0
++#define HAVE_SSE2_INLINE 0
++#define HAVE_SSE3_INLINE 0
++#define HAVE_SSE4_INLINE 0
++#define HAVE_SSE42_INLINE 0
++#define HAVE_SSSE3_INLINE 0
++#define HAVE_XOP_INLINE 0
++#define HAVE_CPUNOP_INLINE 0
++#define HAVE_I686_INLINE 0
++#define HAVE_MIPSFPU_INLINE 0
++#define HAVE_MIPS32R2_INLINE 0
++#define HAVE_MIPS32R5_INLINE 0
++#define HAVE_MIPS64R2_INLINE 0
++#define HAVE_MIPS32R6_INLINE 0
++#define HAVE_MIPS64R6_INLINE 0
++#define HAVE_MIPSDSP_INLINE 0
++#define HAVE_MIPSDSPR2_INLINE 0
++#define HAVE_MSA_INLINE 0
++#define HAVE_LOONGSON2_INLINE 0
++#define HAVE_LOONGSON3_INLINE 0
++#define HAVE_MMI_INLINE 0
++#define HAVE_ALIGNED_STACK 1
++#define HAVE_FAST_64BIT 1
++#define HAVE_FAST_CLZ 0
++#define HAVE_FAST_CMOV 1
++#define HAVE_LOCAL_ALIGNED 1
++#define HAVE_SIMD_ALIGN_16 1
++#define HAVE_SIMD_ALIGN_32 1
++#define HAVE_SIMD_ALIGN_64 1
++#define HAVE_ATOMIC_CAS_PTR 0
++#define HAVE_MACHINE_RW_BARRIER 0
++#define HAVE_MEMORYBARRIER 1
++#define HAVE_MM_EMPTY 0
++#define HAVE_RDTSC 1
++#define HAVE_SEM_TIMEDWAIT 0
++#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 0
++#define HAVE_CABS 0
++#define HAVE_CEXP 0
++#define HAVE_INLINE_ASM 0
++#define HAVE_SYMVER 0
++#define HAVE_X86ASM 0
++#define HAVE_BIGENDIAN 0
++#define HAVE_FAST_UNALIGNED 1
++#define HAVE_ARPA_INET_H 0
++#define HAVE_ASM_TYPES_H 0
++#define HAVE_CDIO_PARANOIA_H 0
++#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
++#define HAVE_CUDA_H 0
++#define HAVE_DISPATCH_DISPATCH_H 0
++#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
++#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
++#define HAVE_DEV_IC_BT8XX_H 0
++#define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0
++#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
++#define HAVE_DIRECT_H 1
++#define HAVE_DIRENT_H 0
++#define HAVE_DXGIDEBUG_H 1
++#define HAVE_DXVA_H 1
++#define HAVE_ES2_GL_H 0
++#define HAVE_GSM_H 0
++#define HAVE_IO_H 1
++#define HAVE_LINUX_PERF_EVENT_H 0
++#define HAVE_MACHINE_IOCTL_BT848_H 0
++#define HAVE_MACHINE_IOCTL_METEOR_H 0
++#define HAVE_OPENCV2_CORE_CORE_C_H 0
++#define HAVE_OPENGL_GL3_H 0
++#define HAVE_POLL_H 0
++#define HAVE_SYS_PARAM_H 0
++#define HAVE_SYS_RESOURCE_H 0
++#define HAVE_SYS_SELECT_H 0
++#define HAVE_SYS_SOUNDCARD_H 0
++#define HAVE_SYS_TIME_H 0
++#define HAVE_SYS_UN_H 0
++#define HAVE_SYS_VIDEOIO_H 0
++#define HAVE_TERMIOS_H 0
++#define HAVE_UDPLITE_H 0
++#define HAVE_UNISTD_H 0
++#define HAVE_VALGRIND_VALGRIND_H 0
++#define HAVE_WINDOWS_H 1
++#define HAVE_WINSOCK2_H 1
++#define HAVE_INTRINSICS_NEON 0
++#define HAVE_ATANF 1
++#define HAVE_ATAN2F 1
++#define HAVE_CBRT 1
++#define HAVE_CBRTF 1
++#define HAVE_COPYSIGN 1
++#define HAVE_COSF 1
++#define HAVE_ERF 1
++#define HAVE_EXP2 1
++#define HAVE_EXP2F 1
++#define HAVE_EXPF 1
++#define HAVE_HYPOT 1
++#define HAVE_ISFINITE 1
++#define HAVE_ISINF 1
++#define HAVE_ISNAN 1
++#define HAVE_LDEXPF 1
++#define HAVE_LLRINT 1
++#define HAVE_LLRINTF 1
++#define HAVE_LOG2 1
++#define HAVE_LOG2F 1
++#define HAVE_LOG10F 1
++#define HAVE_LRINT 1
++#define HAVE_LRINTF 1
++#define HAVE_POWF 1
++#define HAVE_RINT 1
++#define HAVE_ROUND 1
++#define HAVE_ROUNDF 1
++#define HAVE_SINF 1
++#define HAVE_TRUNC 1
++#define HAVE_TRUNCF 1
++#define HAVE_DOS_PATHS 1
++#define HAVE_LIBC_MSVCRT 1
++#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
++#define HAVE_SECTION_DATA_REL_RO 0
++#define HAVE_THREADS 1
++#define HAVE_UWP 0
++#define HAVE_WINRT 0
++#define HAVE_ACCESS 1
++#define HAVE_ALIGNED_MALLOC 1
++#define HAVE_CLOCK_GETTIME 0
++#define HAVE_CLOSESOCKET 1
++#define HAVE_COMMANDLINETOARGVW 1
++#define HAVE_FCNTL 0
++#define HAVE_GETADDRINFO 1
++#define HAVE_GETHRTIME 0
++#define HAVE_GETOPT 0
++#define HAVE_GETPROCESSAFFINITYMASK 1
++#define HAVE_GETPROCESSMEMORYINFO 1
++#define HAVE_GETPROCESSTIMES 1
++#define HAVE_GETRUSAGE 0
++#define HAVE_GETSYSTEMTIMEASFILETIME 1
++#define HAVE_GETTIMEOFDAY 0
++#define HAVE_GLOB 0
++#define HAVE_GLXGETPROCADDRESS 0
++#define HAVE_GMTIME_R 0
++#define HAVE_INET_ATON 0
++#define HAVE_ISATTY 1
++#define HAVE_KBHIT 1
++#define HAVE_LSTAT 0
++#define HAVE_LZO1X_999_COMPRESS 0
++#define HAVE_MACH_ABSOLUTE_TIME 0
++#define HAVE_MAPVIEWOFFILE 1
++#define HAVE_MKSTEMP 0
++#define HAVE_MMAP 0
++#define HAVE_MPROTECT 0
++#define HAVE_NANOSLEEP 0
++#define HAVE_PEEKNAMEDPIPE 1
++#define HAVE_PTHREAD_CANCEL 0
++#define HAVE_SCHED_GETAFFINITY 0
++#define HAVE_SECITEMIMPORT 0
++#define HAVE_SETCONSOLETEXTATTRIBUTE 1
++#define HAVE_SETCONSOLECTRLHANDLER 1
++#define HAVE_SETMODE 1
++#define HAVE_SETRLIMIT 0
++#define HAVE_SLEEP 1
++#define HAVE_STRERROR_R 0
++#define HAVE_SYSCONF 0
++#define HAVE_SYSCTL 0
++#define HAVE_USLEEP 0
++#define HAVE_UTGETOSTYPEFROMSTRING 0
++#define HAVE_VIRTUALALLOC 1
++#define HAVE_WGLGETPROCADDRESS 0
++#define HAVE_BCRYPT 1
++#define HAVE_VAAPI_DRM 0
++#define HAVE_VAAPI_X11 0
++#define HAVE_VDPAU_X11 0
++#define HAVE_PTHREADS 0
++#define HAVE_OS2THREADS 0
++#define HAVE_W32THREADS 1
++#define HAVE_AS_ARCH_DIRECTIVE 0
++#define HAVE_AS_DN_DIRECTIVE 0
++#define HAVE_AS_FPU_DIRECTIVE 0
++#define HAVE_AS_FUNC 0
++#define HAVE_AS_OBJECT_ARCH 0
++#define HAVE_ASM_MOD_Q 0
++#define HAVE_BLOCKS_EXTENSION 0
++#define HAVE_EBP_AVAILABLE 0
++#define HAVE_EBX_AVAILABLE 0
++#define HAVE_GNU_AS 0
++#define HAVE_GNU_WINDRES 0
++#define HAVE_IBM_ASM 0
++#define HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS 0
++#define HAVE_INLINE_ASM_LABELS 0
++#define HAVE_INLINE_ASM_NONLOCAL_LABELS 0
++#define HAVE_PRAGMA_DEPRECATED 1
++#define HAVE_RSYNC_CONTIMEOUT 0
++#define HAVE_SYMVER_ASM_LABEL 0
++#define HAVE_SYMVER_GNU_ASM 0
++#define HAVE_VFP_ARGS 0
++#define HAVE_XFORM_ASM 0
++#define HAVE_XMM_CLOBBERS 0
++#define HAVE_KCMVIDEOCODECTYPE_HEVC 0
++#define HAVE_SOCKLEN_T 1
++#define HAVE_STRUCT_ADDRINFO 1
++#define HAVE_STRUCT_GROUP_SOURCE_REQ 1
++#define HAVE_STRUCT_IP_MREQ_SOURCE 1
++#define HAVE_STRUCT_IPV6_MREQ 1
++#define HAVE_STRUCT_MSGHDR_MSG_FLAGS 0
++#define HAVE_STRUCT_POLLFD 1
++#define HAVE_STRUCT_RUSAGE_RU_MAXRSS 0
++#define HAVE_STRUCT_SCTP_EVENT_SUBSCRIBE 0
++#define HAVE_STRUCT_SOCKADDR_IN6 1
++#define HAVE_STRUCT_SOCKADDR_SA_LEN 0
++#define HAVE_STRUCT_SOCKADDR_STORAGE 1
++#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
++#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
++#define HAVE_MAKEINFO 1
++#define HAVE_MAKEINFO_HTML 0
++#define HAVE_OPENCL_D3D11 0
++#define HAVE_OPENCL_DRM_ARM 0
++#define HAVE_OPENCL_DRM_BEIGNET 0
++#define HAVE_OPENCL_DXVA2 0
++#define HAVE_OPENCL_VAAPI_BEIGNET 0
++#define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
++#define HAVE_PERL 1
++#define HAVE_POD2MAN 1
++#define HAVE_TEXI2HTML 0
++#define CONFIG_DOC 0
++#define CONFIG_HTMLPAGES 0
++#define CONFIG_MANPAGES 1
++#define CONFIG_PODPAGES 1
++#define CONFIG_TXTPAGES 1
++#define CONFIG_AVIO_DIR_CMD_EXAMPLE 1
++#define CONFIG_AVIO_READING_EXAMPLE 1
++#define CONFIG_DECODE_AUDIO_EXAMPLE 1
++#define CONFIG_DECODE_VIDEO_EXAMPLE 1
++#define CONFIG_DEMUXING_DECODING_EXAMPLE 0
++#define CONFIG_ENCODE_AUDIO_EXAMPLE 1
++#define CONFIG_ENCODE_VIDEO_EXAMPLE 1
++#define CONFIG_EXTRACT_MVS_EXAMPLE 0
++#define CONFIG_FILTER_AUDIO_EXAMPLE 0
++#define CONFIG_FILTERING_AUDIO_EXAMPLE 0
++#define CONFIG_FILTERING_VIDEO_EXAMPLE 0
++#define CONFIG_HTTP_MULTICLIENT_EXAMPLE 0
++#define CONFIG_HW_DECODE_EXAMPLE 0
++#define CONFIG_METADATA_EXAMPLE 0
++#define CONFIG_MUXING_EXAMPLE 0
++#define CONFIG_QSVDEC_EXAMPLE 0
++#define CONFIG_REMUXING_EXAMPLE 0
++#define CONFIG_RESAMPLING_AUDIO_EXAMPLE 0
++#define CONFIG_SCALING_VIDEO_EXAMPLE 0
++#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
++#define CONFIG_TRANSCODING_EXAMPLE 0
++#define CONFIG_VAAPI_ENCODE_EXAMPLE 0
++#define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
++#define CONFIG_AVISYNTH 0
++#define CONFIG_FREI0R 0
++#define CONFIG_LIBCDIO 0
++#define CONFIG_LIBDAVS2 0
++#define CONFIG_LIBRUBBERBAND 0
++#define CONFIG_LIBVIDSTAB 0
++#define CONFIG_LIBX264 0
++#define CONFIG_LIBX265 0
++#define CONFIG_LIBXAVS 0
++#define CONFIG_LIBXVID 0
++#define CONFIG_DECKLINK 0
++#define CONFIG_LIBNDI_NEWTEK 0
++#define CONFIG_LIBFDK_AAC 0
++#define CONFIG_OPENSSL 0
++#define CONFIG_LIBTLS 0
++#define CONFIG_GMP 0
++#define CONFIG_LIBLENSFUN 0
++#define CONFIG_LIBOPENCORE_AMRNB 0
++#define CONFIG_LIBOPENCORE_AMRWB 0
++#define CONFIG_LIBVMAF 0
++#define CONFIG_LIBVO_AMRWBENC 0
++#define CONFIG_MBEDTLS 0
++#define CONFIG_RKMPP 0
++#define CONFIG_LIBSMBCLIENT 0
++#define CONFIG_CHROMAPRINT 0
++#define CONFIG_GCRYPT 0
++#define CONFIG_GNUTLS 0
++#define CONFIG_JNI 0
++#define CONFIG_LADSPA 0
++#define CONFIG_LIBAOM 0
++#define CONFIG_LIBASS 0
++#define CONFIG_LIBBLURAY 0
++#define CONFIG_LIBBS2B 0
++#define CONFIG_LIBCACA 0
++#define CONFIG_LIBCELT 0
++#define CONFIG_LIBCODEC2 0
++#define CONFIG_LIBDC1394 0
++#define CONFIG_LIBDRM 0
++#define CONFIG_LIBFLITE 0
++#define CONFIG_LIBFONTCONFIG 0
++#define CONFIG_LIBFREETYPE 0
++#define CONFIG_LIBFRIBIDI 0
++#define CONFIG_LIBGME 0
++#define CONFIG_LIBGSM 0
++#define CONFIG_LIBIEC61883 0
++#define CONFIG_LIBILBC 0
++#define CONFIG_LIBJACK 0
++#define CONFIG_LIBKVAZAAR 0
++#define CONFIG_LIBMODPLUG 0
++#define CONFIG_LIBMP3LAME 0
++#define CONFIG_LIBMYSOFA 0
++#define CONFIG_LIBOPENCV 0
++#define CONFIG_LIBOPENH264 0
++#define CONFIG_LIBOPENJPEG 0
++#define CONFIG_LIBOPENMPT 0
++#define CONFIG_LIBOPUS 0
++#define CONFIG_LIBPULSE 0
++#define CONFIG_LIBRSVG 0
++#define CONFIG_LIBRTMP 0
++#define CONFIG_LIBSHINE 0
++#define CONFIG_LIBSMBCLIENT 0
++#define CONFIG_LIBSNAPPY 0
++#define CONFIG_LIBSOXR 0
++#define CONFIG_LIBSPEEX 0
++#define CONFIG_LIBSRT 0
++#define CONFIG_LIBSSH 0
++#define CONFIG_LIBTENSORFLOW 0
++#define CONFIG_LIBTESSERACT 0
++#define CONFIG_LIBTHEORA 0
++#define CONFIG_LIBTWOLAME 0
++#define CONFIG_LIBV4L2 0
++#define CONFIG_LIBVORBIS 0
++#define CONFIG_LIBVPX 0
++#define CONFIG_LIBWAVPACK 0
++#define CONFIG_LIBWEBP 0
++#define CONFIG_LIBXML2 0
++#define CONFIG_LIBZIMG 0
++#define CONFIG_LIBZMQ 0
++#define CONFIG_LIBZVBI 0
++#define CONFIG_LV2 0
++#define CONFIG_MEDIACODEC 0
++#define CONFIG_OPENAL 0
++#define CONFIG_OPENGL 0
++#define CONFIG_VAPOURSYNTH 0
++#define CONFIG_ALSA 0
++#define CONFIG_APPKIT 0
++#define CONFIG_AVFOUNDATION 0
++#define CONFIG_BZLIB 0
++#define CONFIG_COREIMAGE 0
++#define CONFIG_ICONV 0
++#define CONFIG_LIBXCB 0
++#define CONFIG_LIBXCB_SHM 0
++#define CONFIG_LIBXCB_SHAPE 0
++#define CONFIG_LIBXCB_XFIXES 0
++#define CONFIG_LZMA 0
++#define CONFIG_SCHANNEL 1
++#define CONFIG_SDL2 0
++#define CONFIG_SECURETRANSPORT 0
++#define CONFIG_SNDIO 0
++#define CONFIG_XLIB 0
++#define CONFIG_ZLIB 0
++#define CONFIG_CUDA_SDK 0
++#define CONFIG_LIBNPP 0
++#define CONFIG_LIBMFX 0
++#define CONFIG_MMAL 0
++#define CONFIG_OMX 0
++#define CONFIG_OPENCL 0
++#define CONFIG_AMF 0
++#define CONFIG_AUDIOTOOLBOX 0
++#define CONFIG_CRYSTALHD 0
++#define CONFIG_CUDA 0
++#define CONFIG_CUVID 0
++#define CONFIG_D3D11VA 0
++#define CONFIG_DXVA2 0
++#define CONFIG_FFNVCODEC 0
++#define CONFIG_NVDEC 0
++#define CONFIG_NVENC 0
++#define CONFIG_VAAPI 0
++#define CONFIG_VDPAU 0
++#define CONFIG_VIDEOTOOLBOX 0
++#define CONFIG_V4L2_M2M 0
++#define CONFIG_XVMC 0
++#define CONFIG_FTRAPV 0
++#define CONFIG_GRAY 0
++#define CONFIG_HARDCODED_TABLES 0
++#define CONFIG_OMX_RPI 0
++#define CONFIG_RUNTIME_CPUDETECT 1
++#define CONFIG_SAFE_BITSTREAM_READER 1
++#define CONFIG_SHARED 1
++#define CONFIG_SMALL 0
++#define CONFIG_STATIC 0
++#define CONFIG_SWSCALE_ALPHA 1
++#define CONFIG_GPL 0
++#define CONFIG_NONFREE 0
++#define CONFIG_VERSION3 0
++#define CONFIG_AVDEVICE 0
++#define CONFIG_AVFILTER 0
++#define CONFIG_SWSCALE 0
++#define CONFIG_POSTPROC 0
++#define CONFIG_AVFORMAT 0
++#define CONFIG_AVCODEC 1
++#define CONFIG_SWRESAMPLE 0
++#define CONFIG_AVRESAMPLE 0
++#define CONFIG_AVUTIL 1
++#define CONFIG_FFPLAY 0
++#define CONFIG_FFPROBE 0
++#define CONFIG_FFMPEG 0
++#define CONFIG_DCT 0
++#define CONFIG_DWT 0
++#define CONFIG_ERROR_RESILIENCE 0
++#define CONFIG_FAAN 1
++#define CONFIG_FAST_UNALIGNED 1
++#define CONFIG_FFT 0
++#define CONFIG_LSP 0
++#define CONFIG_LZO 0
++#define CONFIG_MDCT 0
++#define CONFIG_PIXELUTILS 0
++#define CONFIG_NETWORK 0
++#define CONFIG_RDFT 0
++#define CONFIG_AUTODETECT 0
++#define CONFIG_FONTCONFIG 0
++#define CONFIG_LINUX_PERF 0
++#define CONFIG_MEMORY_POISONING 0
++#define CONFIG_NEON_CLOBBER_TEST 0
++#define CONFIG_OSSFUZZ 0
++#define CONFIG_PIC 1
++#define CONFIG_THUMB 0
++#define CONFIG_VALGRIND_BACKTRACE 0
++#define CONFIG_XMM_CLOBBER_TEST 0
++#define CONFIG_BSFS 1
++#define CONFIG_DECODERS 1
++#define CONFIG_PARSERS 1
++#define CONFIG_AANDCTTABLES 0
++#define CONFIG_AC3DSP 0
++#define CONFIG_ADTS_HEADER 0
++#define CONFIG_AUDIO_FRAME_QUEUE 0
++#define CONFIG_AUDIODSP 0
++#define CONFIG_BLOCKDSP 0
++#define CONFIG_BSWAPDSP 0
++#define CONFIG_CABAC 0
++#define CONFIG_CBS 0
++#define CONFIG_CBS_H264 0
++#define CONFIG_CBS_H265 0
++#define CONFIG_CBS_MPEG2 0
++#define CONFIG_CBS_VP9 0
++#define CONFIG_DIRAC_PARSE 0
++#define CONFIG_DNN 0
++#define CONFIG_DVPROFILE 0
++#define CONFIG_EXIF 0
++#define CONFIG_FAANDCT 1
++#define CONFIG_FAANIDCT 1
++#define CONFIG_FDCTDSP 1
++#define CONFIG_FLACDSP 1
++#define CONFIG_FMTCONVERT 0
++#define CONFIG_G722DSP 0
++#define CONFIG_GOLOMB 0
++#define CONFIG_GPLV3 0
++#define CONFIG_H263DSP 0
++#define CONFIG_H264CHROMA 0
++#define CONFIG_H264DSP 0
++#define CONFIG_H264PARSE 0
++#define CONFIG_H264PRED 1
++#define CONFIG_H264QPEL 0
++#define CONFIG_HEVCPARSE 0
++#define CONFIG_HPELDSP 0
++#define CONFIG_HUFFMAN 0
++#define CONFIG_HUFFYUVDSP 0
++#define CONFIG_HUFFYUVENCDSP 0
++#define CONFIG_IDCTDSP 1
++#define CONFIG_IIRFILTER 0
++#define CONFIG_MDCT15 0
++#define CONFIG_INTRAX8 0
++#define CONFIG_ISO_MEDIA 0
++#define CONFIG_IVIDSP 0
++#define CONFIG_JPEGTABLES 0
++#define CONFIG_LGPLV3 0
++#define CONFIG_LIBX262 0
++#define CONFIG_LLAUDDSP 0
++#define CONFIG_LLVIDDSP 0
++#define CONFIG_LLVIDENCDSP 0
++#define CONFIG_LPC 0
++#define CONFIG_LZF 0
++#define CONFIG_ME_CMP 0
++#define CONFIG_MPEG_ER 0
++#define CONFIG_MPEGAUDIO 0
++#define CONFIG_MPEGAUDIODSP 0
++#define CONFIG_MPEGAUDIOHEADER 0
++#define CONFIG_MPEGVIDEO 0
++#define CONFIG_MPEGVIDEOENC 0
++#define CONFIG_MSS34DSP 0
++#define CONFIG_PIXBLOCKDSP 0
++#define CONFIG_QPELDSP 0
++#define CONFIG_QSV 0
++#define CONFIG_QSVDEC 0
++#define CONFIG_QSVENC 0
++#define CONFIG_QSVVPP 0
++#define CONFIG_RANGECODER 0
++#define CONFIG_RIFFDEC 0
++#define CONFIG_RIFFENC 0
++#define CONFIG_RTPDEC 0
++#define CONFIG_RTPENC_CHAIN 0
++#define CONFIG_RV34DSP 0
++#define CONFIG_SINEWIN 0
++#define CONFIG_SNAPPY 0
++#define CONFIG_SRTP 0
++#define CONFIG_STARTCODE 0
++#define CONFIG_TEXTUREDSP 0
++#define CONFIG_TEXTUREDSPENC 0
++#define CONFIG_TPELDSP 0
++#define CONFIG_VAAPI_1 0
++#define CONFIG_VAAPI_ENCODE 0
++#define CONFIG_VC1DSP 0
++#define CONFIG_VIDEODSP 1
++#define CONFIG_VP3DSP 0
++#define CONFIG_VP56DSP 0
++#define CONFIG_VP8DSP 1
++#define CONFIG_WMA_FREQS 0
++#define CONFIG_WMV2DSP 0
++#define CONFIG_NULL_BSF 1
++#define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
++#define CONFIG_VP8_DECODER 1
++#define CONFIG_VP9_DECODER 1
++#define CONFIG_FLAC_DECODER 1
++#define CONFIG_VP8_PARSER 1
++#define CONFIG_VP9_PARSER 1
++#endif /* FFMPEG_CONFIG_H */
+diff --git a/media/ffvpx/ffvpxcommon.mozbuild b/media/ffvpx/ffvpxcommon.mozbuild
+--- a/media/ffvpx/ffvpxcommon.mozbuild
++++ b/media/ffvpx/ffvpxcommon.mozbuild
+@@ -1,19 +1,20 @@
+ # -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+ # vim: set filetype=python:
+ # This Source Code Form is subject to the terms of the Mozilla Public
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ 
+ # Add assembler flags and includes
+-ASFLAGS += CONFIG['FFVPX_ASFLAGS']
+-ASFLAGS += ['-I%s/media/ffvpx/' % TOPSRCDIR]
+-ASFLAGS += ['-I%s/media/ffvpx/libavcodec/x86/' % TOPSRCDIR]
+-ASFLAGS += ['-I%s/media/ffvpx/libavutil/x86/' % TOPSRCDIR]
++if CONFIG['CPU_ARCH'] != 'aarch64':
++    ASFLAGS += CONFIG['FFVPX_ASFLAGS']
++    ASFLAGS += ['-I%s/media/ffvpx/' % TOPSRCDIR]
++    ASFLAGS += ['-I%s/media/ffvpx/libavcodec/x86/' % TOPSRCDIR]
++    ASFLAGS += ['-I%s/media/ffvpx/libavutil/x86/' % TOPSRCDIR]
+ 
+ if CONFIG['FFVPX_ASFLAGS']:
+     if CONFIG['FFVPX_USE_YASM']:
+         USE_YASM = True
+ 
+     if CONFIG['OS_ARCH'] == 'WINNT':
+        # Fix inline symbols and math defines for windows.
+         DEFINES['_USE_MATH_DEFINES'] = True
+diff --git a/media/ffvpx/libavcodec/aarch64/moz.build b/media/ffvpx/libavcodec/aarch64/moz.build
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavcodec/aarch64/moz.build
+@@ -0,0 +1,47 @@
++## -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
++## vim: set filetype=python:
++## This Source Code Form is subject to the terms of the Mozilla Public
++## License, v. 2.0. If a copy of the MPL was not distributed with this
++## file, You can obtain one at http://mozilla.org/MPL/2.0/.
++
++SOURCES += [
++    'h264chroma_init_aarch64.c',
++    'h264cmc_neon.S',
++    'h264dsp_init_aarch64.c',
++    'h264dsp_neon.S',
++    'h264idct_neon.S',
++    'h264pred_init.c',
++    'h264pred_neon.S',
++    'hpeldsp_init_aarch64.c',
++    'hpeldsp_neon.S',
++    'idctdsp_init_aarch64.c',
++    'mdct_neon.S',
++    'neon.S',
++    'simple_idct_neon.S',
++    'videodsp.S',
++    'videodsp_init.c',
++    'vp9dsp_init_10bpp_aarch64.c',
++    'vp9dsp_init_12bpp_aarch64.c',
++    'vp9dsp_init_16bpp_aarch64_template.c',
++    'vp9dsp_init_aarch64.c',
++    'vp9itxfm_16bpp_neon.S',
++    'vp9itxfm_neon.S',
++    'vp9lpf_16bpp_neon.S',
++    'vp9lpf_neon.S',
++    'vp9mc_16bpp_neon.S',
++    'vp9mc_neon.S',
++]
++
++if CONFIG['OS_ARCH'] == 'WINNT':
++    USE_INTEGRATED_CLANGCL_AS = True
++    DEFINES['EXTERN_ASM'] = ''
++
++if CONFIG['MOZ_LIBAV_FFT']:
++    SOURCES += [
++        'fft_init_aarch64.c',
++        'fft_neon.S',
++    ]
++
++FINAL_LIBRARY = 'mozavcodec'
++
++include('/media/ffvpx/ffvpxcommon.mozbuild')
+diff --git a/media/ffvpx/libavcodec/dummy_funcs.c b/media/ffvpx/libavcodec/dummy_funcs.c
+--- a/media/ffvpx/libavcodec/dummy_funcs.c
++++ b/media/ffvpx/libavcodec/dummy_funcs.c
+@@ -814,43 +814,37 @@ AVBitStreamFilter ff_mjpeg2jpeg_bsf;
+ AVBitStreamFilter ff_mjpega_dump_header_bsf;
+ AVBitStreamFilter ff_mp3_header_decompress_bsf;
+ AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf;
+ AVBitStreamFilter ff_mov2textsub_bsf;
+ AVBitStreamFilter ff_noise_bsf;
+ AVBitStreamFilter ff_remove_extradata_bsf;
+ AVBitStreamFilter ff_text2movsub_bsf;
+ 
+-void ff_fft_init_aarch64(FFTContext *s) {}
+ void ff_fft_init_arm(FFTContext *s) {}
+ void ff_fft_init_mips(FFTContext *s) {}
+ void ff_fft_init_ppc(FFTContext *s) {}
+ void ff_rdft_init_arm(RDFTContext *s) {}
+-void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
+-                               const int bit_depth,
+-                               const int chroma_format_idc) {}
+ void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
+                            const int bit_depth, const int chroma_format_idc) {}
+ void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                             const int bit_depth, const int chroma_format_idc) {}
+ void ff_me_cmp_init_static(void) {}
+ int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options) { return 0; }
+ void ff_frame_thread_encoder_free(AVCodecContext *avctx) {}
+ int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr) { return 0; }
+-void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc) {}
+ void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc) {}
+ void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc) {}
+ void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc) {}
+ void ff_vp7dsp_init(VP8DSPContext *c) {}
+ void ff_vp78dsp_init_arm(VP8DSPContext *c) {}
+ void ff_vp78dsp_init_ppc(VP8DSPContext *c) {}
+ void ff_vp8dsp_init_arm(VP8DSPContext *c) {}
+ void ff_vp8dsp_init_mips(VP8DSPContext *c) {}
+ void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp) {}
+-void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) {}
+ void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp) {}
+ #if !defined(__arm__)
+ void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps) {}
+ #endif
+ #if !defined(HAVE_64BIT_BUILD)
+ void ff_flac_decorrelate_indep8_16_sse2(uint8_t **out, int32_t **in, int channels, int len, int shift) {}
+ void ff_flac_decorrelate_indep8_32_avx(uint8_t **out, int32_t **in, int channels, int len, int shift) {}
+ void ff_flac_decorrelate_indep8_16_avx(uint8_t **out, int32_t **in, int channels, int len, int shift) {}
+diff --git a/media/ffvpx/libavcodec/moz.build b/media/ffvpx/libavcodec/moz.build
+--- a/media/ffvpx/libavcodec/moz.build
++++ b/media/ffvpx/libavcodec/moz.build
+@@ -4,16 +4,18 @@
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ 
+ # Due to duplicate file names, we compile libavutil/x86 in its own
+ # moz.build file.
+ if CONFIG['FFVPX_ASFLAGS']:
+     if CONFIG['CPU_ARCH'] == 'x86' or CONFIG['CPU_ARCH'] == 'x86_64':
+         DIRS += ['x86']
++    elif CONFIG['CPU_ARCH'] == 'aarch64':
++        DIRS += ['aarch64']
+     elif CONFIG['CPU_ARCH'] == 'arm':
+         DIRS += ['arm']
+ 
+ SharedLibrary('mozavcodec')
+ SOURCES += [
+     'allcodecs.c',
+     'avpacket.c',
+     'bitstream_filters.c',
+diff --git a/media/ffvpx/libavutil/aarch64/moz.build b/media/ffvpx/libavutil/aarch64/moz.build
+new file mode 100644
+--- /dev/null
++++ b/media/ffvpx/libavutil/aarch64/moz.build
+@@ -0,0 +1,19 @@
++# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
++# vim: set filetype=python:
++# This Source Code Form is subject to the terms of the Mozilla Public
++# License, v. 2.0. If a copy of the MPL was not distributed with this
++# file, You can obtain one at http://mozilla.org/MPL/2.0/.
++
++SOURCES += [
++    'cpu.c',
++    'float_dsp_init.c',
++    'float_dsp_neon.S',
++]
++
++if CONFIG['OS_ARCH'] == 'WINNT':
++    USE_INTEGRATED_CLANGCL_AS = True
++    DEFINES['EXTERN_ASM'] = ''
++
++FINAL_LIBRARY = 'mozavutil'
++
++include('/media/ffvpx/ffvpxcommon.mozbuild')
+diff --git a/media/ffvpx/libavutil/avutil.symbols b/media/ffvpx/libavutil/avutil.symbols
+--- a/media/ffvpx/libavutil/avutil.symbols
++++ b/media/ffvpx/libavutil/avutil.symbols
+@@ -306,17 +306,17 @@ avpriv_alloc_fixed_dsp
+ avpriv_float_dsp_alloc
+ avpriv_report_missing_feature
+ avpriv_request_sample
+ avpriv_scalarproduct_float_c
+ avpriv_set_systematic_pal2
+ avutil_configuration
+ avutil_license
+ avutil_version
+-#ifdef XP_WIN
++#if defined(XP_WIN) && !defined(_ARM64_)
+ avpriv_emms_asm
+ #endif
+ avpriv_slicethread_create
+ avpriv_slicethread_execute
+ avpriv_slicethread_free
+ av_hwdevice_get_type_name
+ av_hwframe_ctx_alloc
+ av_hwframe_ctx_init
+diff --git a/media/ffvpx/libavutil/dummy_funcs.c b/media/ffvpx/libavutil/dummy_funcs.c
+--- a/media/ffvpx/libavutil/dummy_funcs.c
++++ b/media/ffvpx/libavutil/dummy_funcs.c
+@@ -3,29 +3,35 @@
+ /* This Source Code Form is subject to the terms of the Mozilla Public
+  * License, v. 2.0. If a copy of the MPL was not distributed with this
+  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+ 
+ #include "avutil.h"
+ #include "hwcontext.h"
+ 
+ // cpu_internal.c
++#if !defined(_ARM64_)
+ int ff_get_cpu_flags_aarch64(void) { return 0; }
++#endif
+ #if !defined(__arm__)
+ int ff_get_cpu_flags_arm(void) { return 0; }
+ #endif
+ int ff_get_cpu_flags_ppc(void) { return 0; }
+ 
+ // float_dsp.c
+ #include "float_dsp.h"
++#if !defined(_ARM64_)
+ void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp) {}
++#endif
+ void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int strict) {}
+ void ff_float_dsp_init_mips(AVFloatDSPContext *fdsp) {}
+ #if !defined(__arm__)
+ void ff_float_dsp_init_arm(AVFloatDSPContext *fdsp) {}
+ #endif
+ 
+ // cpu.c
++#if !defined(_ARM64_)
+ size_t ff_get_cpu_max_align_aarch64() { return 0; }
++#endif
+ size_t ff_get_cpu_max_align_ppc() { return 0; }
+ #if !defined(__arm__)
+ size_t ff_get_cpu_max_align_arm() { return 0; }
+ #endif
+diff --git a/media/ffvpx/libavutil/moz.build b/media/ffvpx/libavutil/moz.build
+--- a/media/ffvpx/libavutil/moz.build
++++ b/media/ffvpx/libavutil/moz.build
+@@ -6,16 +6,18 @@
+ 
+ # Due to duplicate file names, we compile libavutil/x86 in its own
+ # moz.build file.
+ if CONFIG['FFVPX_ASFLAGS']:
+     if CONFIG['CPU_ARCH'] == 'x86' or CONFIG['CPU_ARCH'] == 'x86_64':
+         DIRS += ['x86']
+     elif CONFIG['CPU_ARCH'] == 'arm':
+         DIRS += ['arm']
++    elif CONFIG['CPU_ARCH'] == 'aarch64':
++        DIRS += ['aarch64']
+ 
+ SharedLibrary('mozavutil')
+ SOURCES += [
+     'avstring.c',
+     'bprint.c',
+     'buffer.c',
+     'channel_layout.c',
+     'cpu.c',
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1432,16 +1432,18 @@ with only_when(compile_environment):
+ # Libav-fft Support
+ # ==============================================================
+ with only_when(compile_environment):
+     @depends(target)
+     def libav_fft(target):
+         flags = None
+         if target.kernel == 'WINNT' and target.cpu == 'x86':
+             flags = ['-DPIC', '-DWIN32']
++        elif target.kernel == 'WINNT' and target.cpu == 'aarch64':
++            flags = ['-DPIC', '-DWIN64']
+         elif target.cpu == 'x86_64':
+             if target.kernel == 'Darwin':
+                 flags = ['-D__x86_64__', '-DPIC', '-DMACHO']
+             elif target.kernel == 'WINNT':
+                 flags = ['-D__x86_64__', '-DPIC', '-DWIN64', '-DMSVC']
+             else:
+                 flags = ['-D__x86_64__', '-DPIC', '-DELF']
+         if flags:
+@@ -1467,25 +1469,28 @@ with only_when(compile_environment):
+ 
+     set_config('YASM_HAS_AVX2', yasm_has_avx2)
+ 
+ 
+     @depends(yasm_has_avx2, libav_fft, vpx_as_flags, target)
+     def ffvpx(yasm_has_avx2, libav_fft, vpx_as_flags, target):
+         enable = flac_only = use_yasm = False
+         flags = []
+-        if target.cpu in ('x86', 'x86_64'):
++        if target.cpu in ('x86', 'x86_64') or \
++                target.cpu == 'aarch64' and target.kernel == 'WINNT':
+             enable = True
+             if libav_fft and libav_fft.flags:
+                 use_yasm = True
+                 flags.extend(libav_fft.flags)
+                 if target.kernel == 'WINNT':
+                     if target.cpu == 'x86':
+                         # 32-bit windows need to prefix symbols with an underscore.
+                         flags.extend(('-DPREFIX', '-Pconfig_win32.asm'))
++                    elif target.cpu == 'aarch64':
++                        use_yasm = False
+                     else:
+                         flags.append('-Pconfig_win64.asm')
+                 elif target.kernel == 'Darwin':
+                     # 32/64-bit macosx assemblers need to prefix symbols with an
+                     # underscore.
+                     flags.extend(('-DPREFIX', '-Pconfig_darwin64.asm'))
+                 else:
+                     # Default to unix.

+ 2 - 2
mozilla-release/patches/1585358-71a1.patch

@@ -2,7 +2,7 @@
 # User Tom Ritter <tom@mozilla.com>
 # Date 1570732256 0
 # Node ID a02ea11484ab8ca20eab416d14527fcd2c1cfd8f
-# Parent  03909ce40c67b3e86d9e0d5f1ecb36268bb4bec7
+# Parent  5abcb2db682734c9aa8d61033d248d56dec4bce4
 Bug 1585358 - Remove mingw-gcc configuration stuf from libvpx r=jya
 
 mingw-gcc is no longer supported.
@@ -4184,7 +4184,7 @@ diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_s
  gen_config_files win/x64 "--target=x86_64-win64-vs12 ${all_platforms} ${x86_platforms}"
  gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}"
 -gen_config_files win/mingw32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}"
--gen_config_files win/aarch64 "--target=aarch64-win64-vs12 ${all_platforms}"
+-gen_config_files win/aarch64 "--target=aarch64-win64-vs12 ${all_platforms} ${arm64_platforms}"
  
  gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}"
  gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}"

+ 6250 - 0
mozilla-release/patches/1585359-71a1.patch

@@ -0,0 +1,6250 @@
+# HG changeset patch
+# User Tom Ritter <tom@mozilla.com>
+# Date 1570732256 0
+# Node ID b86bb62c23b1495ebfe4a2a78508408bfd1b722e
+# Parent  de69b48bbeb0bc9623fb257c45b663fc67c405a3
+Bug 1585359 - Remove mingw-gcc configuration stuff from libaom r=jya
+
+mingw-gcc is no longer supported.
+
+Differential Revision: https://phabricator.services.mozilla.com/D48578
+
+diff --git a/media/libaom/config/win/mingw32/config/aom_config.asm b/media/libaom/config/win/mingw32/config/aom_config.asm
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw32/config/aom_config.asm
++++ /dev/null
+@@ -1,76 +0,0 @@
+-;
+-; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+-;
+-; This source code is subject to the terms of the BSD 2 Clause License and
+-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+-; was not distributed with this source code in the LICENSE file, you can
+-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+-; Media Patent License 1.0 was not distributed with this source code in the
+-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+-;
+-
+-ARCH_ARM equ 0
+-ARCH_MIPS equ 0
+-ARCH_PPC equ 0
+-ARCH_X86 equ 1
+-ARCH_X86_64 equ 0
+-CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+-CONFIG_ACCOUNTING equ 0
+-CONFIG_ANALYZER equ 0
+-CONFIG_AV1_DECODER equ 1
+-CONFIG_AV1_ENCODER equ 0
+-CONFIG_BIG_ENDIAN equ 0
+-CONFIG_BITSTREAM_DEBUG equ 0
+-CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+-CONFIG_COLLECT_RD_STATS equ 0
+-CONFIG_DEBUG equ 0
+-CONFIG_DENOISE equ 1
+-CONFIG_DIST_8X8 equ 0
+-CONFIG_ENTROPY_STATS equ 0
+-CONFIG_FILEOPTIONS equ 1
+-CONFIG_FIX_GF_LENGTH equ 1
+-CONFIG_FP_MB_STATS equ 0
+-CONFIG_GCC equ 1
+-CONFIG_GCOV equ 0
+-CONFIG_GLOBAL_MOTION_SEARCH equ 1
+-CONFIG_GPROF equ 0
+-CONFIG_INSPECTION equ 0
+-CONFIG_INTERNAL_STATS equ 0
+-CONFIG_INTER_STATS_ONLY equ 0
+-CONFIG_LIBYUV equ 0
+-CONFIG_LOWBITDEPTH equ 1
+-CONFIG_MAX_DECODE_PROFILE equ 2
+-CONFIG_MISMATCH_DEBUG equ 0
+-CONFIG_MULTITHREAD equ 1
+-CONFIG_NORMAL_TILE_MODE equ 0
+-CONFIG_OS_SUPPORT equ 1
+-CONFIG_PIC equ 0
+-CONFIG_RD_DEBUG equ 0
+-CONFIG_REDUCED_ENCODER_BORDER equ 0
+-CONFIG_RUNTIME_CPU_DETECT equ 1
+-CONFIG_SHARED equ 0
+-CONFIG_SHARP_SETTINGS equ 0
+-CONFIG_SIZE_LIMIT equ 0
+-CONFIG_SPATIAL_RESAMPLING equ 1
+-CONFIG_STATIC equ 1
+-CONFIG_WEBM_IO equ 0
+-DECODE_HEIGHT_LIMIT equ 0
+-DECODE_WIDTH_LIMIT equ 0
+-HAVE_AVX equ 1
+-HAVE_AVX2 equ 1
+-HAVE_DSPR2 equ 0
+-HAVE_FEXCEPT equ 1
+-HAVE_MIPS32 equ 0
+-HAVE_MIPS64 equ 0
+-HAVE_MMX equ 1
+-HAVE_MSA equ 0
+-HAVE_NEON equ 0
+-HAVE_SSE equ 1
+-HAVE_SSE2 equ 1
+-HAVE_SSE3 equ 1
+-HAVE_SSE4_1 equ 1
+-HAVE_SSE4_2 equ 1
+-HAVE_SSSE3 equ 1
+-HAVE_VSX equ 0
+-HAVE_WXWIDGETS equ 0
+diff --git a/media/libaom/config/win/mingw32/config/aom_config.h b/media/libaom/config/win/mingw32/config/aom_config.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw32/config/aom_config.h
++++ /dev/null
+@@ -1,82 +0,0 @@
+-/*
+- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+- *
+- * This source code is subject to the terms of the BSD 2 Clause License and
+- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+- * was not distributed with this source code in the LICENSE file, you can
+- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+- * Media Patent License 1.0 was not distributed with this source code in the
+- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+- */
+-#ifndef AOM_CONFIG_H_
+-#define AOM_CONFIG_H_
+-
+-#define ARCH_ARM 0
+-#define ARCH_MIPS 0
+-#define ARCH_PPC 0
+-#define ARCH_X86 1
+-#define ARCH_X86_64 0
+-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+-#define CONFIG_ACCOUNTING 0
+-#define CONFIG_ANALYZER 0
+-#define CONFIG_AV1_DECODER 1
+-#define CONFIG_AV1_ENCODER 0
+-#define CONFIG_BIG_ENDIAN 0
+-#define CONFIG_BITSTREAM_DEBUG 0
+-#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+-#define CONFIG_COLLECT_RD_STATS 0
+-#define CONFIG_DEBUG 0
+-#define CONFIG_DENOISE 1
+-#define CONFIG_DIST_8X8 0
+-#define CONFIG_ENTROPY_STATS 0
+-#define CONFIG_FILEOPTIONS 1
+-#define CONFIG_FIX_GF_LENGTH 1
+-#define CONFIG_FP_MB_STATS 0
+-#define CONFIG_GCC 1
+-#define CONFIG_GCOV 0
+-#define CONFIG_GLOBAL_MOTION_SEARCH 1
+-#define CONFIG_GPROF 0
+-#define CONFIG_INSPECTION 0
+-#define CONFIG_INTERNAL_STATS 0
+-#define CONFIG_INTER_STATS_ONLY 0
+-#define CONFIG_LIBYUV 0
+-#define CONFIG_LOWBITDEPTH 1
+-#define CONFIG_MAX_DECODE_PROFILE 2
+-#define CONFIG_MISMATCH_DEBUG 0
+-#define CONFIG_MULTITHREAD 1
+-#define CONFIG_NORMAL_TILE_MODE 0
+-#define CONFIG_OS_SUPPORT 1
+-#define CONFIG_PIC 0
+-#define CONFIG_RD_DEBUG 0
+-#define CONFIG_REDUCED_ENCODER_BORDER 0
+-#define CONFIG_RUNTIME_CPU_DETECT 1
+-#define CONFIG_SHARED 0
+-#define CONFIG_SHARP_SETTINGS 0
+-#define CONFIG_SIZE_LIMIT 0
+-#define CONFIG_SPATIAL_RESAMPLING 1
+-#define CONFIG_STATIC 1
+-#define CONFIG_WEBM_IO 0
+-#define DECODE_HEIGHT_LIMIT 0
+-#define DECODE_WIDTH_LIMIT 0
+-#define HAVE_AVX 1
+-#define HAVE_AVX2 1
+-#define HAVE_DSPR2 0
+-#define HAVE_FEXCEPT 1
+-#define HAVE_MIPS32 0
+-#define HAVE_MIPS64 0
+-#define HAVE_MMX 1
+-#define HAVE_MSA 0
+-#define HAVE_NEON 0
+-#define HAVE_SSE 1
+-#define HAVE_SSE2 1
+-#define HAVE_SSE3 1
+-#define HAVE_SSE4_1 1
+-#define HAVE_SSE4_2 1
+-#define HAVE_SSSE3 1
+-#define HAVE_VSX 0
+-#define HAVE_WXWIDGETS 0
+-#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+-#define INLINE inline
+-#define LIB_INSTALL_DIR INSTALLDIR/lib
+-#endif /* AOM_CONFIG_H_ */
+diff --git a/media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h b/media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h
++++ /dev/null
+@@ -1,2379 +0,0 @@
+-// This file is generated. Do not edit.
+-#ifndef AOM_DSP_RTCD_H_
+-#define AOM_DSP_RTCD_H_
+-
+-#ifdef RTCD_C
+-#define RTCD_EXTERN
+-#else
+-#define RTCD_EXTERN extern
+-#endif
+-
+-/*
+- * DSP
+- */
+-
+-#include "aom/aom_integer.h"
+-#include "aom_dsp/aom_dsp_common.h"
+-#include "av1/common/enums.h"
+-#include "av1/common/blockd.h"
+-
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-
+-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-
+-void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-
+-void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-
+-void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-
+-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-RTCD_EXTERN void (*aom_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-
+-void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+-
+-void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+-
+-void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+-
+-void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+-
+-void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+-
+-void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+-#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+-
+-void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-
+-void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+-void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+-RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+-
+-void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-
+-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-
+-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-
+-void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-
+-void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+-
+-void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+-
+-void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+-
+-void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+-
+-void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+-
+-void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+-
+-void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+-
+-void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+-
+-void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+-
+-void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+-
+-void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+-
+-void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+-
+-void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+-
+-void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+-
+-void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+-
+-void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+-
+-void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+-
+-void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+-
+-void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+-
+-void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+-
+-void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+-
+-void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+-
+-void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+-
+-void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+-
+-void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+-
+-void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+-
+-void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+-
+-void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+-
+-void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+-
+-void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+-
+-void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+-
+-void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+-
+-void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+-
+-void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+-
+-void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+-
+-void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+-
+-void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+-
+-void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+-
+-void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+-
+-void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+-
+-void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+-
+-void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+-
+-void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+-
+-void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+-
+-void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+-
+-void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+-
+-void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+-
+-void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+-
+-void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+-
+-void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+-
+-void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_h_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-
+-void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-
+-void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+-
+-void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+-
+-void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+-
+-void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+-
+-void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+-
+-void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+-
+-void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+-
+-void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+-
+-void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+-
+-void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+-
+-void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+-
+-void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+-
+-void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+-
+-void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+-
+-void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+-
+-void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+-
+-void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+-
+-void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+-
+-void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+-
+-void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+-
+-void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+-
+-void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+-
+-void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+-
+-void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+-
+-void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+-
+-void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+-
+-void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+-
+-void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+-
+-void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+-
+-void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+-
+-void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+-
+-void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+-
+-void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+-
+-void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+-
+-void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+-
+-void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+-
+-void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+-
+-void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+-
+-void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+-
+-void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+-
+-void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+-
+-void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+-
+-void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+-
+-void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+-
+-void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+-
+-void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+-
+-void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+-
+-void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+-
+-void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+-
+-void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+-
+-void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+-
+-void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+-
+-void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+-
+-void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+-
+-void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+-
+-void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+-
+-void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+-
+-void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+-
+-void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+-
+-void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+-
+-void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+-
+-void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+-
+-void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+-
+-void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+-
+-void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+-
+-void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+-
+-void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+-
+-void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+-
+-void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+-
+-void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+-
+-void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+-
+-void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+-
+-void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+-
+-void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+-
+-void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+-
+-void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+-
+-void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+-
+-void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+-
+-void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+-
+-void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+-
+-void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+-
+-void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+-
+-void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+-
+-void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+-
+-void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+-
+-void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+-
+-void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+-
+-void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+-
+-void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+-
+-void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+-
+-void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-RTCD_EXTERN void (*aom_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-
+-void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-
+-void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_horizontal_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_horizontal_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_horizontal_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_horizontal_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_vertical_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_vertical_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_vertical_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_vertical_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-RTCD_EXTERN void (*aom_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-
+-void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-RTCD_EXTERN void (*aom_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-
+-void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+-
+-void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+-
+-void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+-
+-void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+-
+-void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+-
+-void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+-void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+-RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+-
+-void aom_dsp_rtcd(void);
+-
+-#ifdef RTCD_C
+-#include "aom_ports/x86.h"
+-static void setup_rtcd_internal(void)
+-{
+-    int flags = x86_simd_caps();
+-
+-    (void)flags;
+-
+-    aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+-    if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+-    aom_blend_a64_mask = aom_blend_a64_mask_c;
+-    if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+-    if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+-    aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+-    if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+-    aom_convolve8_horiz = aom_convolve8_horiz_c;
+-    if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+-    if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+-    if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+-    aom_convolve8_vert = aom_convolve8_vert_c;
+-    if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
+-    if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+-    if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+-    aom_convolve_copy = aom_convolve_copy_c;
+-    if (flags & HAS_SSE2) aom_convolve_copy = aom_convolve_copy_sse2;
+-    aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_sse2;
+-    aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_sse2;
+-    aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_sse2;
+-    aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_sse2;
+-    aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_sse2;
+-    aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+-    aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+-    aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+-    aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_sse2;
+-    aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_sse2;
+-    aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_sse2;
+-    aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_sse2;
+-    aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+-    aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+-    aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+-    aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_sse2;
+-    aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_sse2;
+-    aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_sse2;
+-    aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_sse2;
+-    aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_sse2;
+-    aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_sse2;
+-    aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_sse2;
+-    aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_sse2;
+-    aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_sse2;
+-    aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+-    aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+-    aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+-    aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_sse2;
+-    aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_sse2;
+-    aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_sse2;
+-    aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_sse2;
+-    aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+-    aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+-    aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+-    aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_sse2;
+-    aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_sse2;
+-    aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_sse2;
+-    aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_sse2;
+-    aom_dc_predictor_16x16 = aom_dc_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_16x16 = aom_dc_predictor_16x16_sse2;
+-    aom_dc_predictor_16x32 = aom_dc_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_16x32 = aom_dc_predictor_16x32_sse2;
+-    aom_dc_predictor_16x4 = aom_dc_predictor_16x4_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_16x4 = aom_dc_predictor_16x4_sse2;
+-    aom_dc_predictor_16x64 = aom_dc_predictor_16x64_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_16x64 = aom_dc_predictor_16x64_sse2;
+-    aom_dc_predictor_16x8 = aom_dc_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_16x8 = aom_dc_predictor_16x8_sse2;
+-    aom_dc_predictor_32x16 = aom_dc_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+-    aom_dc_predictor_32x32 = aom_dc_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+-    aom_dc_predictor_32x64 = aom_dc_predictor_32x64_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+-    aom_dc_predictor_32x8 = aom_dc_predictor_32x8_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_32x8 = aom_dc_predictor_32x8_sse2;
+-    aom_dc_predictor_4x16 = aom_dc_predictor_4x16_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_4x16 = aom_dc_predictor_4x16_sse2;
+-    aom_dc_predictor_4x4 = aom_dc_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_4x4 = aom_dc_predictor_4x4_sse2;
+-    aom_dc_predictor_4x8 = aom_dc_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_4x8 = aom_dc_predictor_4x8_sse2;
+-    aom_dc_predictor_64x16 = aom_dc_predictor_64x16_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+-    aom_dc_predictor_64x32 = aom_dc_predictor_64x32_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+-    aom_dc_predictor_64x64 = aom_dc_predictor_64x64_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+-    aom_dc_predictor_8x16 = aom_dc_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_8x16 = aom_dc_predictor_8x16_sse2;
+-    aom_dc_predictor_8x32 = aom_dc_predictor_8x32_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_8x32 = aom_dc_predictor_8x32_sse2;
+-    aom_dc_predictor_8x4 = aom_dc_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_8x4 = aom_dc_predictor_8x4_sse2;
+-    aom_dc_predictor_8x8 = aom_dc_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_dc_predictor_8x8 = aom_dc_predictor_8x8_sse2;
+-    aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_sse2;
+-    aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_sse2;
+-    aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_sse2;
+-    aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_sse2;
+-    aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_sse2;
+-    aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+-    aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+-    aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+-    aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_sse2;
+-    aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_sse2;
+-    aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_sse2;
+-    aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_sse2;
+-    aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+-    aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+-    aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+-    aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_sse2;
+-    aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_sse2;
+-    aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_sse2;
+-    aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_sse2;
+-    aom_h_predictor_16x16 = aom_h_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_16x16 = aom_h_predictor_16x16_sse2;
+-    aom_h_predictor_16x32 = aom_h_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_16x32 = aom_h_predictor_16x32_sse2;
+-    aom_h_predictor_16x4 = aom_h_predictor_16x4_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_16x4 = aom_h_predictor_16x4_sse2;
+-    aom_h_predictor_16x64 = aom_h_predictor_16x64_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_16x64 = aom_h_predictor_16x64_sse2;
+-    aom_h_predictor_16x8 = aom_h_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_16x8 = aom_h_predictor_16x8_sse2;
+-    aom_h_predictor_32x16 = aom_h_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_32x16 = aom_h_predictor_32x16_sse2;
+-    aom_h_predictor_32x32 = aom_h_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+-    aom_h_predictor_32x64 = aom_h_predictor_32x64_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_32x64 = aom_h_predictor_32x64_sse2;
+-    aom_h_predictor_32x8 = aom_h_predictor_32x8_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_32x8 = aom_h_predictor_32x8_sse2;
+-    aom_h_predictor_4x16 = aom_h_predictor_4x16_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_4x16 = aom_h_predictor_4x16_sse2;
+-    aom_h_predictor_4x4 = aom_h_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_4x4 = aom_h_predictor_4x4_sse2;
+-    aom_h_predictor_4x8 = aom_h_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_4x8 = aom_h_predictor_4x8_sse2;
+-    aom_h_predictor_64x16 = aom_h_predictor_64x16_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_64x16 = aom_h_predictor_64x16_sse2;
+-    aom_h_predictor_64x32 = aom_h_predictor_64x32_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_64x32 = aom_h_predictor_64x32_sse2;
+-    aom_h_predictor_64x64 = aom_h_predictor_64x64_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_64x64 = aom_h_predictor_64x64_sse2;
+-    aom_h_predictor_8x16 = aom_h_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_8x16 = aom_h_predictor_8x16_sse2;
+-    aom_h_predictor_8x32 = aom_h_predictor_8x32_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_8x32 = aom_h_predictor_8x32_sse2;
+-    aom_h_predictor_8x4 = aom_h_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_8x4 = aom_h_predictor_8x4_sse2;
+-    aom_h_predictor_8x8 = aom_h_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_h_predictor_8x8 = aom_h_predictor_8x8_sse2;
+-    aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+-    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+-    aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+-    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+-    aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+-    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+-    aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_c;
+-    if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+-    aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_c;
+-    if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+-    aom_highbd_convolve_copy = aom_highbd_convolve_copy_c;
+-    if (flags & HAS_SSE2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+-    aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_sse2;
+-    aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_sse2;
+-    aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_sse2;
+-    aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_sse2;
+-    aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_sse2;
+-    aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_sse2;
+-    aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_sse2;
+-    aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_sse2;
+-    aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_sse2;
+-    aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_sse2;
+-    aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_sse2;
+-    aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_sse2;
+-    aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_sse2;
+-    aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_sse2;
+-    aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_sse2;
+-    aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_sse2;
+-    aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_sse2;
+-    aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_sse2;
+-    aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_sse2;
+-    aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_sse2;
+-    aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_sse2;
+-    aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_sse2;
+-    aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_sse2;
+-    aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_sse2;
+-    aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_sse2;
+-    aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_sse2;
+-    aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_sse2;
+-    aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_sse2;
+-    aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_sse2;
+-    aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_sse2;
+-    aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_sse2;
+-    aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_sse2;
+-    aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_sse2;
+-    aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_sse2;
+-    aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_sse2;
+-    aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_sse2;
+-    aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_sse2;
+-    aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_sse2;
+-    aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_sse2;
+-    aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_sse2;
+-    aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_sse2;
+-    aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_sse2;
+-    aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_sse2;
+-    aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_sse2;
+-    aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_sse2;
+-    aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_sse2;
+-    aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_sse2;
+-    aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_sse2;
+-    aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_sse2;
+-    aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_sse2;
+-    aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_sse2;
+-    aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+-    aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_sse2;
+-    aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+-    aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_sse2;
+-    aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_sse2;
+-    aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_sse2;
+-    aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+-    aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_sse2;
+-    aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+-    aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_sse2;
+-    aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+-    aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_sse2;
+-    aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_sse2;
+-    aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_sse2;
+-    aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_c;
+-    if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+-    aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_sse2;
+-    aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_sse2;
+-    aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_sse2;
+-    aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_sse2;
+-    aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_sse2;
+-    aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_sse2;
+-    aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_sse2;
+-    aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_sse2;
+-    aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_sse2;
+-    aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_sse2;
+-    aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+-    if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+-    if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+-    aom_lpf_horizontal_14 = aom_lpf_horizontal_14_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_14 = aom_lpf_horizontal_14_sse2;
+-    aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_sse2;
+-    aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_sse2;
+-    aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_sse2;
+-    aom_lpf_horizontal_6 = aom_lpf_horizontal_6_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_6 = aom_lpf_horizontal_6_sse2;
+-    aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_sse2;
+-    aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_sse2;
+-    aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_sse2;
+-    aom_lpf_vertical_14 = aom_lpf_vertical_14_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_14 = aom_lpf_vertical_14_sse2;
+-    aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_sse2;
+-    aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_4 = aom_lpf_vertical_4_sse2;
+-    aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_sse2;
+-    aom_lpf_vertical_6 = aom_lpf_vertical_6_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_6 = aom_lpf_vertical_6_sse2;
+-    aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_sse2;
+-    aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_8 = aom_lpf_vertical_8_sse2;
+-    aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_c;
+-    if (flags & HAS_SSE2) aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_sse2;
+-    aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+-    aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+-    aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+-    aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+-    aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+-    aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+-    aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+-    aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+-    aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+-    aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+-    aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+-    aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+-    aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+-    aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+-    aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+-    aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+-    aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+-    aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+-    aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+-    aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+-    aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+-    aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+-    aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+-    aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+-    aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+-    aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+-    aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+-    aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+-    aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+-    aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+-    aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+-    aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+-    aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+-    aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+-    aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+-    aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+-    aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+-    aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+-    aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+-    aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+-    aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+-    aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+-    aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+-    aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+-    aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+-    aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+-    aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+-    aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+-    aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+-    aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+-    aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+-    aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+-    aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+-    aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+-    aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+-    aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+-    aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+-    aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+-    aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+-    aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+-    aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+-    aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+-    aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+-    aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+-    aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+-    aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+-    aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+-    aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+-    aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+-    aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+-    aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+-    aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+-    aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+-    aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+-    aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+-    aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+-    aom_v_predictor_16x16 = aom_v_predictor_16x16_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_16x16 = aom_v_predictor_16x16_sse2;
+-    aom_v_predictor_16x32 = aom_v_predictor_16x32_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_16x32 = aom_v_predictor_16x32_sse2;
+-    aom_v_predictor_16x4 = aom_v_predictor_16x4_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_16x4 = aom_v_predictor_16x4_sse2;
+-    aom_v_predictor_16x64 = aom_v_predictor_16x64_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_16x64 = aom_v_predictor_16x64_sse2;
+-    aom_v_predictor_16x8 = aom_v_predictor_16x8_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_16x8 = aom_v_predictor_16x8_sse2;
+-    aom_v_predictor_32x16 = aom_v_predictor_32x16_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+-    aom_v_predictor_32x32 = aom_v_predictor_32x32_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+-    aom_v_predictor_32x64 = aom_v_predictor_32x64_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+-    aom_v_predictor_32x8 = aom_v_predictor_32x8_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_32x8 = aom_v_predictor_32x8_sse2;
+-    aom_v_predictor_4x16 = aom_v_predictor_4x16_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_4x16 = aom_v_predictor_4x16_sse2;
+-    aom_v_predictor_4x4 = aom_v_predictor_4x4_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_4x4 = aom_v_predictor_4x4_sse2;
+-    aom_v_predictor_4x8 = aom_v_predictor_4x8_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_4x8 = aom_v_predictor_4x8_sse2;
+-    aom_v_predictor_64x16 = aom_v_predictor_64x16_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+-    aom_v_predictor_64x32 = aom_v_predictor_64x32_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+-    aom_v_predictor_64x64 = aom_v_predictor_64x64_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+-    aom_v_predictor_8x16 = aom_v_predictor_8x16_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_8x16 = aom_v_predictor_8x16_sse2;
+-    aom_v_predictor_8x32 = aom_v_predictor_8x32_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_8x32 = aom_v_predictor_8x32_sse2;
+-    aom_v_predictor_8x4 = aom_v_predictor_8x4_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_8x4 = aom_v_predictor_8x4_sse2;
+-    aom_v_predictor_8x8 = aom_v_predictor_8x8_c;
+-    if (flags & HAS_SSE2) aom_v_predictor_8x8 = aom_v_predictor_8x8_sse2;
+-    av1_round_shift_array = av1_round_shift_array_c;
+-    if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+-}
+-#endif
+-
+-#ifdef __cplusplus
+-}  // extern "C"
+-#endif
+-
+-#endif
+diff --git a/media/libaom/config/win/mingw32/config/aom_scale_rtcd.h b/media/libaom/config/win/mingw32/config/aom_scale_rtcd.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw32/config/aom_scale_rtcd.h
++++ /dev/null
+@@ -1,88 +0,0 @@
+-// This file is generated. Do not edit.
+-#ifndef AOM_SCALE_RTCD_H_
+-#define AOM_SCALE_RTCD_H_
+-
+-#ifdef RTCD_C
+-#define RTCD_EXTERN
+-#else
+-#define RTCD_EXTERN extern
+-#endif
+-
+-struct yv12_buffer_config;
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+-#define aom_extend_frame_borders aom_extend_frame_borders_c
+-
+-void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+-#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+-
+-void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+-#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+-
+-void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+-#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+-
+-void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+-#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+-
+-void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+-#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+-
+-void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+-
+-void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+-
+-void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+-
+-void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+-
+-void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+-#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+-
+-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+-#define aom_yv12_copy_u aom_yv12_copy_u_c
+-
+-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+-#define aom_yv12_copy_v aom_yv12_copy_v_c
+-
+-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+-#define aom_yv12_copy_y aom_yv12_copy_y_c
+-
+-void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+-#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+-
+-void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+-#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+-
+-void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+-#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+-
+-void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+-#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+-
+-void aom_scale_rtcd(void);
+-
+-#ifdef RTCD_C
+-#include "aom_ports/x86.h"
+-static void setup_rtcd_internal(void)
+-{
+-    int flags = x86_simd_caps();
+-
+-    (void)flags;
+-
+-}
+-#endif
+-
+-#ifdef __cplusplus
+-}  // extern "C"
+-#endif
+-
+-#endif
+diff --git a/media/libaom/config/win/mingw32/config/av1_rtcd.h b/media/libaom/config/win/mingw32/config/av1_rtcd.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw32/config/av1_rtcd.h
++++ /dev/null
+@@ -1,605 +0,0 @@
+-// This file is generated. Do not edit.
+-#ifndef AV1_RTCD_H_
+-#define AV1_RTCD_H_
+-
+-#ifdef RTCD_C
+-#define RTCD_EXTERN
+-#else
+-#define RTCD_EXTERN extern
+-#endif
+-
+-/*
+- * AV1
+- */
+-
+-#include "aom/aom_integer.h"
+-#include "aom_dsp/txfm_common.h"
+-#include "av1/common/common.h"
+-#include "av1/common/enums.h"
+-#include "av1/common/quant_common.h"
+-#include "av1/common/filter.h"
+-#include "av1/common/convolve.h"
+-#include "av1/common/av1_txfm.h"
+-#include "av1/common/odintrin.h"
+-#include "av1/common/restoration.h"
+-
+-struct macroblockd;
+-
+-/* Encoder forward decls */
+-struct macroblock;
+-struct txfm_param;
+-struct aom_variance_vtable;
+-struct search_site_config;
+-struct yv12_buffer_config;
+-
+-/* Function pointers return by CfL functions */
+-typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+-                                     uint16_t *output_q3);
+-
+-typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+-                                     uint16_t *output_q3);
+-
+-typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+-
+-typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+-                                   int dst_stride, int alpha_q3);
+-
+-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+-                                   int dst_stride, int alpha_q3, int bd);
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-
+-void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-
+-void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-
+-void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-
+-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+-
+-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+-void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+-RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+-
+-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+-#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+-
+-void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+-#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+-
+-void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+-#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+-
+-void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+-void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+-RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+-
+-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+-void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+-RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+-
+-void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+-void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+-RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+-
+-void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve8 av1_highbd_convolve8_c
+-
+-void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_c
+-
+-void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
+-
+-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+-
+-void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+-
+-void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+-void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+-
+-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+-#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+-
+-void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+-#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+-
+-void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+-#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+-
+-void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+-#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+-
+-void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+-#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+-
+-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-
+-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-
+-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+-
+-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+-
+-void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+-
+-void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+-
+-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+-
+-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+-
+-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+-
+-void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+-
+-void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+-
+-void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+-
+-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-
+-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+-
+-void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+-
+-void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+-
+-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+-
+-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+-
+-void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+-
+-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+-
+-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-
+-void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-
+-void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+-void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+-RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+-
+-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+-void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+-RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+-
+-void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-
+-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-
+-void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-
+-int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+-
+-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-
+-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-
+-cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+-cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+-cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+-
+-cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+-cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+-cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+-
+-cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+-cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+-cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+-
+-void av1_rtcd(void);
+-
+-#ifdef RTCD_C
+-#include "aom_ports/x86.h"
+-static void setup_rtcd_internal(void)
+-{
+-    int flags = x86_simd_caps();
+-
+-    (void)flags;
+-
+-    apply_selfguided_restoration = apply_selfguided_restoration_c;
+-    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+-    if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+-    av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+-    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+-    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+-    av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+-    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+-    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+-    av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+-    if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+-    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+-    av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_c;
+-    if (flags & HAS_SSE2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+-    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+-    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+-    av1_convolve_2d_sr = av1_convolve_2d_sr_c;
+-    if (flags & HAS_SSE2) av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+-    av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+-    if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+-    av1_convolve_x_sr = av1_convolve_x_sr_c;
+-    if (flags & HAS_SSE2) av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+-    av1_convolve_y_sr = av1_convolve_y_sr_c;
+-    if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+-    av1_filter_intra_edge = av1_filter_intra_edge_c;
+-    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+-    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+-    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+-    av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+-    if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+-    av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_c;
+-    if (flags & HAS_SSE2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+-    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+-    av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+-    if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+-    av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+-    av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+-    if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+-    av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+-    if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+-    av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+-    av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+-    av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+-    av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+-    av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+-    av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+-    av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+-    av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+-    av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+-    av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+-    av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+-    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+-    av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+-    if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+-    av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+-    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+-    av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+-    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+-    av1_inv_txfm_add = av1_inv_txfm_add_c;
+-    if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+-    if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+-    av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+-    if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+-    av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_c;
+-    if (flags & HAS_SSE2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+-    av1_jnt_convolve_x = av1_jnt_convolve_x_c;
+-    if (flags & HAS_SSE2) av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+-    av1_jnt_convolve_y = av1_jnt_convolve_y_c;
+-    if (flags & HAS_SSE2) av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+-    av1_selfguided_restoration = av1_selfguided_restoration_c;
+-    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+-    if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+-    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+-    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+-    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+-    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+-    av1_warp_affine = av1_warp_affine_c;
+-    if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+-    av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_c;
+-    if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+-    if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+-    cdef_filter_block = cdef_filter_block_c;
+-    if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+-    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+-    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+-    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+-    cdef_find_dir = cdef_find_dir_c;
+-    if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+-    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+-    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+-    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+-    cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+-    cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+-    cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+-    cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+-    cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+-    cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+-    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
+-    if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+-    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+-    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+-    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+-    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
+-    if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+-    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+-    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+-    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+-    get_predict_hbd_fn = get_predict_hbd_fn_c;
+-    if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+-    if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+-    get_predict_lbd_fn = get_predict_lbd_fn_c;
+-    if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+-    if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+-    get_subtract_average_fn = get_subtract_average_fn_c;
+-    if (flags & HAS_SSE2) get_subtract_average_fn = get_subtract_average_fn_sse2;
+-    if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+-}
+-#endif
+-
+-#ifdef __cplusplus
+-}  // extern "C"
+-#endif
+-
+-#endif
+diff --git a/media/libaom/config/win/mingw64/config/aom_config.asm b/media/libaom/config/win/mingw64/config/aom_config.asm
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw64/config/aom_config.asm
++++ /dev/null
+@@ -1,76 +0,0 @@
+-;
+-; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+-;
+-; This source code is subject to the terms of the BSD 2 Clause License and
+-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+-; was not distributed with this source code in the LICENSE file, you can
+-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+-; Media Patent License 1.0 was not distributed with this source code in the
+-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+-;
+-
+-ARCH_ARM equ 0
+-ARCH_MIPS equ 0
+-ARCH_PPC equ 0
+-ARCH_X86 equ 0
+-ARCH_X86_64 equ 1
+-CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+-CONFIG_ACCOUNTING equ 0
+-CONFIG_ANALYZER equ 0
+-CONFIG_AV1_DECODER equ 1
+-CONFIG_AV1_ENCODER equ 0
+-CONFIG_BIG_ENDIAN equ 0
+-CONFIG_BITSTREAM_DEBUG equ 0
+-CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+-CONFIG_COLLECT_RD_STATS equ 0
+-CONFIG_DEBUG equ 0
+-CONFIG_DENOISE equ 1
+-CONFIG_DIST_8X8 equ 0
+-CONFIG_ENTROPY_STATS equ 0
+-CONFIG_FILEOPTIONS equ 1
+-CONFIG_FIX_GF_LENGTH equ 1
+-CONFIG_FP_MB_STATS equ 0
+-CONFIG_GCC equ 1
+-CONFIG_GCOV equ 0
+-CONFIG_GLOBAL_MOTION_SEARCH equ 1
+-CONFIG_GPROF equ 0
+-CONFIG_INSPECTION equ 0
+-CONFIG_INTERNAL_STATS equ 0
+-CONFIG_INTER_STATS_ONLY equ 0
+-CONFIG_LIBYUV equ 0
+-CONFIG_LOWBITDEPTH equ 1
+-CONFIG_MAX_DECODE_PROFILE equ 2
+-CONFIG_MISMATCH_DEBUG equ 0
+-CONFIG_MULTITHREAD equ 1
+-CONFIG_NORMAL_TILE_MODE equ 0
+-CONFIG_OS_SUPPORT equ 1
+-CONFIG_PIC equ 0
+-CONFIG_RD_DEBUG equ 0
+-CONFIG_REDUCED_ENCODER_BORDER equ 0
+-CONFIG_RUNTIME_CPU_DETECT equ 1
+-CONFIG_SHARED equ 0
+-CONFIG_SHARP_SETTINGS equ 0
+-CONFIG_SIZE_LIMIT equ 0
+-CONFIG_SPATIAL_RESAMPLING equ 1
+-CONFIG_STATIC equ 1
+-CONFIG_WEBM_IO equ 0
+-DECODE_HEIGHT_LIMIT equ 0
+-DECODE_WIDTH_LIMIT equ 0
+-HAVE_AVX equ 1
+-HAVE_AVX2 equ 1
+-HAVE_DSPR2 equ 0
+-HAVE_FEXCEPT equ 1
+-HAVE_MIPS32 equ 0
+-HAVE_MIPS64 equ 0
+-HAVE_MMX equ 1
+-HAVE_MSA equ 0
+-HAVE_NEON equ 0
+-HAVE_SSE equ 1
+-HAVE_SSE2 equ 1
+-HAVE_SSE3 equ 1
+-HAVE_SSE4_1 equ 1
+-HAVE_SSE4_2 equ 1
+-HAVE_SSSE3 equ 1
+-HAVE_VSX equ 0
+-HAVE_WXWIDGETS equ 0
+diff --git a/media/libaom/config/win/mingw64/config/aom_config.h b/media/libaom/config/win/mingw64/config/aom_config.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw64/config/aom_config.h
++++ /dev/null
+@@ -1,82 +0,0 @@
+-/*
+- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+- *
+- * This source code is subject to the terms of the BSD 2 Clause License and
+- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+- * was not distributed with this source code in the LICENSE file, you can
+- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+- * Media Patent License 1.0 was not distributed with this source code in the
+- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+- */
+-#ifndef AOM_CONFIG_H_
+-#define AOM_CONFIG_H_
+-
+-#define ARCH_ARM 0
+-#define ARCH_MIPS 0
+-#define ARCH_PPC 0
+-#define ARCH_X86 0
+-#define ARCH_X86_64 1
+-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+-#define CONFIG_ACCOUNTING 0
+-#define CONFIG_ANALYZER 0
+-#define CONFIG_AV1_DECODER 1
+-#define CONFIG_AV1_ENCODER 0
+-#define CONFIG_BIG_ENDIAN 0
+-#define CONFIG_BITSTREAM_DEBUG 0
+-#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+-#define CONFIG_COLLECT_RD_STATS 0
+-#define CONFIG_DEBUG 0
+-#define CONFIG_DENOISE 1
+-#define CONFIG_DIST_8X8 0
+-#define CONFIG_ENTROPY_STATS 0
+-#define CONFIG_FILEOPTIONS 1
+-#define CONFIG_FIX_GF_LENGTH 1
+-#define CONFIG_FP_MB_STATS 0
+-#define CONFIG_GCC 1
+-#define CONFIG_GCOV 0
+-#define CONFIG_GLOBAL_MOTION_SEARCH 1
+-#define CONFIG_GPROF 0
+-#define CONFIG_INSPECTION 0
+-#define CONFIG_INTERNAL_STATS 0
+-#define CONFIG_INTER_STATS_ONLY 0
+-#define CONFIG_LIBYUV 0
+-#define CONFIG_LOWBITDEPTH 1
+-#define CONFIG_MAX_DECODE_PROFILE 2
+-#define CONFIG_MISMATCH_DEBUG 0
+-#define CONFIG_MULTITHREAD 1
+-#define CONFIG_NORMAL_TILE_MODE 0
+-#define CONFIG_OS_SUPPORT 1
+-#define CONFIG_PIC 0
+-#define CONFIG_RD_DEBUG 0
+-#define CONFIG_REDUCED_ENCODER_BORDER 0
+-#define CONFIG_RUNTIME_CPU_DETECT 1
+-#define CONFIG_SHARED 0
+-#define CONFIG_SHARP_SETTINGS 0
+-#define CONFIG_SIZE_LIMIT 0
+-#define CONFIG_SPATIAL_RESAMPLING 1
+-#define CONFIG_STATIC 1
+-#define CONFIG_WEBM_IO 0
+-#define DECODE_HEIGHT_LIMIT 0
+-#define DECODE_WIDTH_LIMIT 0
+-#define HAVE_AVX 1
+-#define HAVE_AVX2 1
+-#define HAVE_DSPR2 0
+-#define HAVE_FEXCEPT 1
+-#define HAVE_MIPS32 0
+-#define HAVE_MIPS64 0
+-#define HAVE_MMX 1
+-#define HAVE_MSA 0
+-#define HAVE_NEON 0
+-#define HAVE_SSE 1
+-#define HAVE_SSE2 1
+-#define HAVE_SSE3 1
+-#define HAVE_SSE4_1 1
+-#define HAVE_SSE4_2 1
+-#define HAVE_SSSE3 1
+-#define HAVE_VSX 0
+-#define HAVE_WXWIDGETS 0
+-#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+-#define INLINE inline
+-#define LIB_INSTALL_DIR INSTALLDIR/lib
+-#endif /* AOM_CONFIG_H_ */
+diff --git a/media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h b/media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h
++++ /dev/null
+@@ -1,2001 +0,0 @@
+-// This file is generated. Do not edit.
+-#ifndef AOM_DSP_RTCD_H_
+-#define AOM_DSP_RTCD_H_
+-
+-#ifdef RTCD_C
+-#define RTCD_EXTERN
+-#else
+-#define RTCD_EXTERN extern
+-#endif
+-
+-/*
+- * DSP
+- */
+-
+-#include "aom/aom_integer.h"
+-#include "aom_dsp/aom_dsp_common.h"
+-#include "av1/common/enums.h"
+-#include "av1/common/blockd.h"
+-
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-
+-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+-
+-void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+-
+-void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-
+-void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-
+-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+-#define aom_convolve_copy aom_convolve_copy_sse2
+-
+-void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_sse2
+-
+-void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_sse2
+-
+-void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_sse2
+-
+-void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_sse2
+-
+-void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_sse2
+-
+-void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+-
+-void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_sse2
+-
+-void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_sse2
+-
+-void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_sse2
+-
+-void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_sse2
+-
+-void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_sse2
+-
+-void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_sse2
+-
+-void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_sse2
+-
+-void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_sse2
+-
+-void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_sse2
+-
+-void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_sse2
+-
+-void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_sse2
+-
+-void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_sse2
+-
+-void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_sse2
+-
+-void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+-
+-void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_sse2
+-
+-void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_sse2
+-
+-void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_sse2
+-
+-void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_sse2
+-
+-void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_sse2
+-
+-void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_sse2
+-
+-void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_sse2
+-
+-void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_sse2
+-
+-void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_sse2
+-
+-void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_sse2
+-
+-void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_sse2
+-
+-void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_sse2
+-
+-void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_sse2
+-
+-void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+-
+-void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_sse2
+-
+-void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_sse2
+-
+-void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_sse2
+-
+-void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_sse2
+-
+-void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_sse2
+-
+-void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_sse2
+-
+-void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_sse2
+-
+-void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_sse2
+-
+-void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_sse2
+-
+-void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_sse2
+-
+-void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_sse2
+-
+-void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_sse2
+-
+-void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_sse2
+-
+-void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+-
+-void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_sse2
+-
+-void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_sse2
+-
+-void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_sse2
+-
+-void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_sse2
+-
+-void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_sse2
+-
+-void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_sse2
+-
+-void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_sse2
+-
+-void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_sse2
+-
+-void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_16x16 aom_h_predictor_16x16_sse2
+-
+-void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_16x32 aom_h_predictor_16x32_sse2
+-
+-void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_16x4 aom_h_predictor_16x4_sse2
+-
+-void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_16x64 aom_h_predictor_16x64_sse2
+-
+-void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_16x8 aom_h_predictor_16x8_sse2
+-
+-void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+-
+-void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_32x16 aom_h_predictor_32x16_sse2
+-
+-void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_32x64 aom_h_predictor_32x64_sse2
+-
+-void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_32x8 aom_h_predictor_32x8_sse2
+-
+-void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_4x16 aom_h_predictor_4x16_sse2
+-
+-void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_4x4 aom_h_predictor_4x4_sse2
+-
+-void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_4x8 aom_h_predictor_4x8_sse2
+-
+-void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_64x16 aom_h_predictor_64x16_sse2
+-
+-void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_64x32 aom_h_predictor_64x32_sse2
+-
+-void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_64x64 aom_h_predictor_64x64_sse2
+-
+-void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_8x16 aom_h_predictor_8x16_sse2
+-
+-void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_8x32 aom_h_predictor_8x32_sse2
+-
+-void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_8x4 aom_h_predictor_8x4_sse2
+-
+-void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
+-
+-void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+-#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+-
+-void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-
+-void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+-void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+-RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+-
+-void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+-
+-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-
+-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-
+-void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-
+-void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_sse2
+-
+-void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_sse2
+-
+-void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+-
+-void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+-
+-void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_sse2
+-
+-void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+-
+-void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_sse2
+-
+-void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_sse2
+-
+-void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+-
+-void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+-
+-void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+-
+-void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_sse2
+-
+-void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_sse2
+-
+-void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+-
+-void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+-
+-void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+-
+-void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_sse2
+-
+-void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+-
+-void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_sse2
+-
+-void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_sse2
+-
+-void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_sse2
+-
+-void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_sse2
+-
+-void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+-
+-void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+-
+-void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_sse2
+-
+-void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+-
+-void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_sse2
+-
+-void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_sse2
+-
+-void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+-
+-void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+-
+-void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+-
+-void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_sse2
+-
+-void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_sse2
+-
+-void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+-
+-void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+-
+-void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+-
+-void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_sse2
+-
+-void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+-
+-void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_sse2
+-
+-void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_sse2
+-
+-void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_sse2
+-
+-void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_sse2
+-
+-void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+-
+-void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+-
+-void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_sse2
+-
+-void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+-
+-void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_sse2
+-
+-void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_sse2
+-
+-void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+-
+-void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+-
+-void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+-
+-void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_sse2
+-
+-void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_sse2
+-
+-void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+-
+-void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+-
+-void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+-
+-void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_sse2
+-
+-void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+-
+-void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_sse2
+-
+-void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_sse2
+-
+-void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_sse2
+-
+-void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_sse2
+-
+-void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+-
+-void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+-
+-void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_sse2
+-
+-void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+-
+-void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_sse2
+-
+-void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_sse2
+-
+-void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+-
+-void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+-
+-void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+-
+-void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_sse2
+-
+-void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_sse2
+-
+-void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+-
+-void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+-
+-void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+-
+-void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_sse2
+-
+-void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+-
+-void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_sse2
+-
+-void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_sse2
+-
+-void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_sse2
+-
+-void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_sse2
+-
+-void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+-
+-void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+-
+-void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_sse2
+-
+-void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+-
+-void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_sse2
+-
+-void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_sse2
+-
+-void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+-
+-void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+-
+-void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+-
+-void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_sse2
+-
+-void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_sse2
+-
+-void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+-
+-void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+-
+-void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+-
+-void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_sse2
+-
+-void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+-
+-void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_sse2
+-
+-void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_sse2
+-
+-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
+-
+-void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd);
+-
+-void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
+-
+-void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
+-
+-void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
+-
+-void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
+-
+-void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
+-
+-void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
+-
+-void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
+-
+-void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
+-
+-void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+-#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
+-
+-void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+-
+-void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+-
+-void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+-
+-void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+-
+-void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+-
+-void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+-
+-void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+-
+-void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+-
+-void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+-
+-void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+-
+-void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+-
+-void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+-
+-void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+-
+-void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+-
+-void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+-
+-void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+-
+-void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+-
+-void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+-
+-void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+-
+-void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+-
+-void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+-
+-void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+-
+-void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+-
+-void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+-
+-void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+-
+-void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+-
+-void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+-
+-void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+-
+-void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+-
+-void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+-
+-void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+-
+-void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+-
+-void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+-
+-void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+-
+-void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+-
+-void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+-
+-void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+-
+-void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+-
+-void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+-
+-void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+-
+-void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+-
+-void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+-
+-void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+-
+-void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+-
+-void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+-
+-void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+-
+-void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+-
+-void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+-
+-void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+-
+-void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+-
+-void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+-
+-void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+-
+-void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+-
+-void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+-
+-void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+-
+-void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+-
+-void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+-
+-void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+-
+-void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+-
+-void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+-
+-void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+-
+-void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+-
+-void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+-
+-void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+-
+-void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+-
+-void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+-
+-void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+-
+-void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+-
+-void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+-
+-void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+-
+-void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+-
+-void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+-
+-void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+-
+-void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+-
+-void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+-
+-void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+-
+-void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+-
+-void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+-
+-void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+-
+-void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+-
+-void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+-
+-void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_sse2
+-
+-void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_sse2
+-
+-void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+-
+-void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+-
+-void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_sse2
+-
+-void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+-
+-void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_sse2
+-
+-void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_sse2
+-
+-void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+-
+-void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+-
+-void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+-
+-void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_sse2
+-
+-void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_sse2
+-
+-void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+-
+-void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+-
+-void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+-
+-void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_sse2
+-
+-void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+-
+-void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_sse2
+-
+-void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+-#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
+-
+-void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+-
+-void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
+-
+-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
+-
+-void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
+-
+-void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
+-
+-void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
+-
+-void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
+-
+-void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
+-
+-void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
+-
+-void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
+-
+-void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
+-
+-void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
+-
+-void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
+-
+-void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
+-
+-void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
+-
+-void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+-#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
+-
+-void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+-#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
+-
+-void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+-
+-void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+-
+-void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+-
+-void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+-
+-void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_16x16 aom_v_predictor_16x16_sse2
+-
+-void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_16x32 aom_v_predictor_16x32_sse2
+-
+-void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_16x4 aom_v_predictor_16x4_sse2
+-
+-void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_16x64 aom_v_predictor_16x64_sse2
+-
+-void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_16x8 aom_v_predictor_16x8_sse2
+-
+-void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+-
+-void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_32x8 aom_v_predictor_32x8_sse2
+-
+-void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_4x16 aom_v_predictor_4x16_sse2
+-
+-void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_4x4 aom_v_predictor_4x4_sse2
+-
+-void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_4x8 aom_v_predictor_4x8_sse2
+-
+-void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-
+-void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_8x16 aom_v_predictor_8x16_sse2
+-
+-void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_8x32 aom_v_predictor_8x32_sse2
+-
+-void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_8x4 aom_v_predictor_8x4_sse2
+-
+-void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+-#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
+-
+-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+-void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+-RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+-
+-void aom_dsp_rtcd(void);
+-
+-#ifdef RTCD_C
+-#include "aom_ports/x86.h"
+-static void setup_rtcd_internal(void)
+-{
+-    int flags = x86_simd_caps();
+-
+-    (void)flags;
+-
+-    aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+-    if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+-    aom_blend_a64_mask = aom_blend_a64_mask_c;
+-    if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+-    if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+-    aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+-    if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+-    aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+-    if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+-    if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+-    aom_convolve8_vert = aom_convolve8_vert_sse2;
+-    if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+-    if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+-    aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+-    aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+-    aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+-    aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+-    aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+-    aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+-    aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+-    aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+-    aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+-    aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+-    aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+-    aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+-    aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+-    aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+-    aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+-    aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+-    aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+-    aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+-    aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+-    aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+-    aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+-    aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+-    aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+-    aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+-    aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+-    aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+-    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+-    aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+-    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+-    aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+-    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+-    aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+-    aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+-    aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+-    aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+-    aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+-    aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+-    aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+-    aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+-    aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+-    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+-    aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+-    if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+-    if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+-    aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+-    aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+-    aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+-    aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+-    aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+-    aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+-    aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+-    aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+-    aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+-    aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+-    aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+-    aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+-    aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+-    aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+-    aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+-    if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+-    aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+-    aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+-    aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+-    aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+-    aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+-    aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+-    aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+-    aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+-    aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+-    aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+-    aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+-    aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+-    aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+-    aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+-    aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+-    aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+-    aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+-    aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+-    aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+-    aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+-    aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+-    aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+-    aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+-    aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+-    aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+-    aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+-    aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+-    aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+-    aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+-    aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+-    aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+-    aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+-    aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+-    aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+-    aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+-    aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+-    aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+-    aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+-    aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+-    aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+-    aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+-    aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+-    aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+-    aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+-    aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+-    aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+-    aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+-    aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+-    aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+-    aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+-    aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+-    aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+-    aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+-    aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+-    aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+-    aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+-    aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+-    aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+-    aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+-    aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+-    aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+-    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+-    aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+-    aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+-    aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+-    aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+-    aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+-    aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+-    if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+-    av1_round_shift_array = av1_round_shift_array_c;
+-    if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+-}
+-#endif
+-
+-#ifdef __cplusplus
+-}  // extern "C"
+-#endif
+-
+-#endif
+diff --git a/media/libaom/config/win/mingw64/config/aom_scale_rtcd.h b/media/libaom/config/win/mingw64/config/aom_scale_rtcd.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw64/config/aom_scale_rtcd.h
++++ /dev/null
+@@ -1,88 +0,0 @@
+-// This file is generated. Do not edit.
+-#ifndef AOM_SCALE_RTCD_H_
+-#define AOM_SCALE_RTCD_H_
+-
+-#ifdef RTCD_C
+-#define RTCD_EXTERN
+-#else
+-#define RTCD_EXTERN extern
+-#endif
+-
+-struct yv12_buffer_config;
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+-#define aom_extend_frame_borders aom_extend_frame_borders_c
+-
+-void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+-#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+-
+-void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+-#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+-
+-void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+-#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+-
+-void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+-#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+-
+-void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+-#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+-
+-void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+-
+-void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+-
+-void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+-
+-void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+-#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+-
+-void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+-#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+-
+-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+-#define aom_yv12_copy_u aom_yv12_copy_u_c
+-
+-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+-#define aom_yv12_copy_v aom_yv12_copy_v_c
+-
+-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+-#define aom_yv12_copy_y aom_yv12_copy_y_c
+-
+-void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+-#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+-
+-void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+-#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+-
+-void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+-#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+-
+-void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+-#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+-
+-void aom_scale_rtcd(void);
+-
+-#ifdef RTCD_C
+-#include "aom_ports/x86.h"
+-static void setup_rtcd_internal(void)
+-{
+-    int flags = x86_simd_caps();
+-
+-    (void)flags;
+-
+-}
+-#endif
+-
+-#ifdef __cplusplus
+-}  // extern "C"
+-#endif
+-
+-#endif
+diff --git a/media/libaom/config/win/mingw64/config/av1_rtcd.h b/media/libaom/config/win/mingw64/config/av1_rtcd.h
+deleted file mode 100644
+--- a/media/libaom/config/win/mingw64/config/av1_rtcd.h
++++ /dev/null
+@@ -1,594 +0,0 @@
+-// This file is generated. Do not edit.
+-#ifndef AV1_RTCD_H_
+-#define AV1_RTCD_H_
+-
+-#ifdef RTCD_C
+-#define RTCD_EXTERN
+-#else
+-#define RTCD_EXTERN extern
+-#endif
+-
+-/*
+- * AV1
+- */
+-
+-#include "aom/aom_integer.h"
+-#include "aom_dsp/txfm_common.h"
+-#include "av1/common/common.h"
+-#include "av1/common/enums.h"
+-#include "av1/common/quant_common.h"
+-#include "av1/common/filter.h"
+-#include "av1/common/convolve.h"
+-#include "av1/common/av1_txfm.h"
+-#include "av1/common/odintrin.h"
+-#include "av1/common/restoration.h"
+-
+-struct macroblockd;
+-
+-/* Encoder forward decls */
+-struct macroblock;
+-struct txfm_param;
+-struct aom_variance_vtable;
+-struct search_site_config;
+-struct yv12_buffer_config;
+-
+-/* Function pointers return by CfL functions */
+-typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+-                                     uint16_t *output_q3);
+-
+-typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+-                                     uint16_t *output_q3);
+-
+-typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+-
+-typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+-                                   int dst_stride, int alpha_q3);
+-
+-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+-                                   int dst_stride, int alpha_q3, int bd);
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+-
+-void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+-
+-void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+-
+-void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+-
+-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+-
+-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+-void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+-RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+-
+-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+-#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+-
+-void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+-#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+-
+-void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+-#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+-
+-void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+-void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+-RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+-
+-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+-void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+-RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+-
+-void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+-void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+-RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+-
+-void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void av1_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve8 av1_highbd_convolve8_sse2
+-
+-void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void av1_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_sse2
+-
+-void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2
+-
+-void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+-
+-void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+-#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+-
+-void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+-void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+-
+-void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+-#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+-
+-void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+-#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+-
+-void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+-#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+-
+-void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+-#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+-
+-void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+-#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+-
+-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+-
+-void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-
+-void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+-
+-void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+-
+-void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+-
+-void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+-
+-void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+-
+-void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+-
+-void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+-
+-void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+-
+-void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+-
+-void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+-
+-void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+-
+-void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-
+-void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+-
+-void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+-
+-void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+-
+-void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+-
+-void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+-
+-void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+-
+-void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+-
+-void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+-
+-void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+-
+-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+-
+-int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+-                                 int sgr_params_idx, int bit_depth, int highbd);
+-
+-void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+-void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+-RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+-
+-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+-void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+-RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+-
+-void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+-
+-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+-
+-void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+-
+-int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+-
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+-
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+-
+-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+-
+-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+-
+-cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+-cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+-cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+-
+-cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+-cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+-cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+-
+-cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+-cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+-cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+-RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+-
+-void av1_rtcd(void);
+-
+-#ifdef RTCD_C
+-#include "aom_ports/x86.h"
+-static void setup_rtcd_internal(void)
+-{
+-    int flags = x86_simd_caps();
+-
+-    (void)flags;
+-
+-    apply_selfguided_restoration = apply_selfguided_restoration_c;
+-    if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+-    if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+-    av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+-    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+-    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+-    av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+-    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+-    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+-    av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+-    if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+-    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+-    av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+-    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+-    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+-    av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+-    av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+-    if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+-    av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+-    av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+-    if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+-    av1_filter_intra_edge = av1_filter_intra_edge_c;
+-    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+-    av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+-    if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+-    av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+-    if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+-    av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+-    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+-    av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+-    if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+-    av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+-    av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+-    if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+-    av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+-    if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+-    av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+-    av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+-    av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+-    av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+-    av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+-    av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+-    av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+-    av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+-    av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+-    av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+-    av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+-    if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+-    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+-    if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+-    av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+-    if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+-    if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+-    av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+-    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+-    av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+-    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+-    av1_inv_txfm_add = av1_inv_txfm_add_c;
+-    if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+-    if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+-    av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+-    if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+-    av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+-    av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+-    av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+-    if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+-    av1_selfguided_restoration = av1_selfguided_restoration_c;
+-    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+-    if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+-    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+-    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+-    av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+-    if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+-    av1_warp_affine = av1_warp_affine_c;
+-    if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+-    av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+-    if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+-    cdef_filter_block = cdef_filter_block_sse2;
+-    if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+-    if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+-    if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+-    cdef_find_dir = cdef_find_dir_sse2;
+-    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+-    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+-    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+-    cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+-    cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+-    cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+-    cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+-    cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+-    cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+-    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+-    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+-    copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+-    if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+-    if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+-    if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+-    copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+-    if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+-    if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+-    if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+-    get_predict_hbd_fn = get_predict_hbd_fn_c;
+-    if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+-    if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+-    get_predict_lbd_fn = get_predict_lbd_fn_c;
+-    if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+-    if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+-    get_subtract_average_fn = get_subtract_average_fn_sse2;
+-    if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+-}
+-#endif
+-
+-#ifdef __cplusplus
+-}  // extern "C"
+-#endif
+-
+-#endif
+diff --git a/media/libaom/generate_sources_mozbuild.py b/media/libaom/generate_sources_mozbuild.py
+--- a/media/libaom/generate_sources_mozbuild.py
++++ b/media/libaom/generate_sources_mozbuild.py
+@@ -84,36 +84,34 @@ if __name__ == '__main__':
+     f = open('sources.mozbuild', 'w')
+     f.write('# This file is generated. Do not edit.\n\n')
+     f.write('files = {\n')
+ 
+     platforms = [
+         ('armv7', 'linux', 'arm', True),
+         ('generic', '', 'generic', True),
+         ('x86', 'linux', 'ia32', True),
+-        ('x86', 'win', 'mingw32', False),
+         ('x86', 'win', 'ia32', False),
+         ('x86_64', 'linux', 'x64', True),
+         ('x86_64', 'mac', 'x64', False),
+         ('x86_64', 'win', 'x64', False),
+-        ('x86_64', 'win', 'mingw64', False),
+     ]
+     for cpu, system, arch, generate_sources in platforms:
+         print('Running CMake for %s (%s)' % (cpu, system))
+         variables = shared_variables.copy()
+         variables['AOM_TARGET_CPU'] = cpu
+ 
+         # We skip compiling test programs that detect these
+         variables['HAVE_FEXCEPT'] = 1
+         variables['INLINE'] = 'inline'
+         if cpu == 'x86' and system == 'linux':
+             variables['CONFIG_PIC'] = 1
+         if cpu == 'armv7':
+             variables['CONFIG_PIC'] = 1
+-        if system == 'win' and not arch.startswith('mingw'):
++        if system == 'win':
+             variables['MSVC'] = 1
+ 
+         cache_variables = []
+         sources = cp.parse(variables, cache_variables,
+                            os.path.join(AOM_DIR, 'CMakeLists.txt'))
+ 
+         # Disable HAVE_UNISTD_H.
+         cache_variables.remove('HAVE_UNISTD_H')
+diff --git a/media/libaom/generate_sources_mozbuild.sh b/media/libaom/generate_sources_mozbuild.sh
+--- a/media/libaom/generate_sources_mozbuild.sh
++++ b/media/libaom/generate_sources_mozbuild.sh
+@@ -63,18 +63,16 @@ python generate_sources_mozbuild.py
+ # Copy aom_version.h once. The file is the same for all platforms.
+ cp aom_version.h $BASE_DIR/$LIBAOM_CONFIG_DIR
+ 
+ gen_rtcd_header linux/x64 x86_64
+ gen_rtcd_header linux/ia32 x86
+ gen_rtcd_header mac/x64 x86_64
+ gen_rtcd_header win/x64 x86_64
+ gen_rtcd_header win/ia32 x86
+-gen_rtcd_header win/mingw32 x86
+-gen_rtcd_header win/mingw64 x86_64
+ 
+ gen_rtcd_header linux/arm armv7
+ 
+ gen_rtcd_header generic generic
+ 
+ cd $BASE_DIR/$LIBAOM_SRC_DIR
+ 
+ cd $BASE_DIR
+diff --git a/media/libaom/moz.build b/media/libaom/moz.build
+--- a/media/libaom/moz.build
++++ b/media/libaom/moz.build
+@@ -10,45 +10,35 @@ with Files('*'):
+ include('sources.mozbuild')
+ 
+ # Linux, Mac and Win share file lists for x86* but not configurations.
+ if CONFIG['CPU_ARCH'] == 'x86_64':
+     EXPORTS.aom += files['X64_EXPORTS']
+     SOURCES += files['X64_SOURCES']
+     USE_YASM = True
+     if CONFIG['OS_TARGET'] == 'WINNT':
+-        if CONFIG['CC_TYPE'] == 'gcc':
+-            ASFLAGS += [ '-I%s/media/libaom/config/win/mingw64/' % TOPSRCDIR ]
+-            LOCAL_INCLUDES += [ '/media/libaom/config/win/mingw64/' ]
+-            EXPORTS.aom += [ 'config/win/mingw64/config/aom_config.h' ]
+-        else:
+-            ASFLAGS += [ '-I%s/media/libaom/config/win/x64/' % TOPSRCDIR ]
+-            LOCAL_INCLUDES += [ '/media/libaom/config/win/x64/' ]
+-            EXPORTS.aom += [ 'config/win/x64/config/aom_config.h' ]
++        ASFLAGS += [ '-I%s/media/libaom/config/win/x64/' % TOPSRCDIR ]
++        LOCAL_INCLUDES += [ '/media/libaom/config/win/x64/' ]
++        EXPORTS.aom += [ 'config/win/x64/config/aom_config.h' ]
+     elif CONFIG['OS_TARGET'] == 'Darwin':
+         ASFLAGS += [ '-I%s/media/libaom/config/mac/x64/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/mac/x64/' ]
+         EXPORTS.aom += [ 'config/mac/x64/config/aom_config.h' ]
+     else: # Android, Linux, BSDs, etc.
+         ASFLAGS += [ '-I%s/media/libaom/config/linux/x64/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/linux/x64/' ]
+         EXPORTS.aom += [ 'config/linux/x64/config/aom_config.h' ]
+ elif CONFIG['CPU_ARCH'] == 'x86':
+     EXPORTS.aom += files['IA32_EXPORTS']
+     SOURCES += files['IA32_SOURCES']
+     USE_YASM = True
+     if CONFIG['OS_TARGET'] == 'WINNT':
+-        if CONFIG['CC_TYPE'] == 'gcc':
+-            ASFLAGS += [ '-I%s/media/libaom/config/win/mingw32/' % TOPSRCDIR ]
+-            LOCAL_INCLUDES += [ '/media/libaom/config/win/mingw32/' ]
+-            EXPORTS.aom += [ 'config/win/mingw32/config/aom_config.h' ]
+-        else:
+-            ASFLAGS += [ '-I%s/media/libaom/config/win/ia32/' % TOPSRCDIR ]
+-            LOCAL_INCLUDES += [ '/media/libaom/config/win/ia32/' ]
+-            EXPORTS.aom += [ 'config/win/ia32/config/aom_config.h' ]
++        ASFLAGS += [ '-I%s/media/libaom/config/win/ia32/' % TOPSRCDIR ]
++        LOCAL_INCLUDES += [ '/media/libaom/config/win/ia32/' ]
++        EXPORTS.aom += [ 'config/win/ia32/config/aom_config.h' ]
+     else: # Android, Linux, BSDs, etc.
+         ASFLAGS += [ '-I%s/media/libaom/config/linux/ia32/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/linux/ia32/' ]
+         EXPORTS.aom += [ 'config/linux/ia32/config/aom_config.h' ]
+ elif CONFIG['CPU_ARCH'] == 'arm':
+     EXPORTS.aom += files['ARM_EXPORTS']
+     ASFLAGS += [
+         '-I%s/media/libaom/config/linux/arm/' % TOPSRCDIR,

+ 204 - 0
mozilla-release/patches/1650299-80a1.patch

@@ -0,0 +1,204 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1594076273 0
+# Node ID 1773b1745a8c4b938fde6d3dea9a58cf941cff73
+# Parent  c6000c544a4c7cc3a7048ca08bcb540c22e2310c
+Bug 1650299 - Unify the inclusion of the ICU data file. r=froydnj
+
+All the supported compilers support a GNU AS-like syntax, with only a
+few details varying. It means we can use a single, simpler, way to
+include the ICU data file, instead of 3 different ways, including one
+that uses armasm64.exe, possibly wrapped with Wine.
+
+Differential Revision: https://phabricator.services.mozilla.com/D82144
+
+Changed for keeping intel msvc support.
+
+diff --git a/config/external/icu/data/genicudata.py b/config/external/icu/data/genicudata.py
+deleted file mode 100644
+--- a/config/external/icu/data/genicudata.py
++++ /dev/null
+@@ -1,16 +0,0 @@
+-# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+-# vim: set filetype=python:
+-# This Source Code Form is subject to the terms of the Mozilla Public
+-# License, v. 2.0. If a copy of the MPL was not distributed with this
+-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+-
+-from __future__ import absolute_import
+-
+-
+-def main(output, data_file, data_symbol):
+-    output.write('''    AREA |.rdata|,ALIGN=4,DATA,READONLY
+-    EXPORT |{data_symbol}|[DATA]
+-|{data_symbol}|
+-    INCBIN {data_file}
+-    END
+-'''.format(data_file=data_file, data_symbol=data_symbol))
+diff --git a/config/external/icu/data/icudata.c b/config/external/icu/data/icudata.c
+new file mode 100644
+--- /dev/null
++++ b/config/external/icu/data/icudata.c
+@@ -0,0 +1,21 @@
++/* This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifdef __APPLE__
++#  define RODATA ".data\n.const"
++#else
++#  define RODATA ".section .rodata"
++#endif
++
++#define DATA(sym, file) DATA2(sym, file)
++// clang-format off
++#define DATA2(sym, file)              \
++  __asm__(".global " #sym "\n"        \
++          RODATA "\n"                 \
++          ".balign 16\n"              \
++          #sym ":\n"                  \
++          "    .incbin " #file "\n")
++// clang-format on
++
++DATA(ICU_DATA_SYMBOL, ICU_DATA_FILE);
+diff --git a/config/external/icu/data/icudata.s b/config/external/icu/data/icudata.s
+--- a/config/external/icu/data/icudata.s
++++ b/config/external/icu/data/icudata.s
+@@ -1,31 +1,11 @@
+ ;; This Source Code Form is subject to the terms of the Mozilla Public
+ ;; License, v. 2.0. If a copy of the MPL was not distributed with this
+ ;; file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ 
+-%ifdef PREFIX
+-    %define DATA_SYMBOL _ %+ ICU_DATA_SYMBOL
+-%else
+     %define DATA_SYMBOL ICU_DATA_SYMBOL
+-%endif
+ 
+-%ifidn __OUTPUT_FORMAT__,elf
+-    %define FORMAT_ELF 1
+-%elifidn __OUTPUT_FORMAT__,elf32
+-    %define FORMAT_ELF 1
+-%elifidn __OUTPUT_FORMAT__,elf64
+-    %define FORMAT_ELF 1
+-%else
+-    %define FORMAT_ELF 0
+-%endif
+-
+-%if FORMAT_ELF
+-    global DATA_SYMBOL:data hidden
+-    ; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+-    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+-%else
+     global DATA_SYMBOL
+-%endif
+ 
+ SECTION .rodata align=16
+ DATA_SYMBOL:
+         incbin ICU_DATA_FILE
+diff --git a/config/external/icu/data/icudata_gas.S b/config/external/icu/data/icudata_gas.S
+deleted file mode 100644
+--- a/config/external/icu/data/icudata_gas.S
++++ /dev/null
+@@ -1,12 +0,0 @@
+-# This Source Code Form is subject to the terms of the Mozilla Public
+-# License, v. 2.0. If a copy of the MPL was not distributed with this
+-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+-
+-#if defined(__linux__) && defined(__ELF__)
+-.section .note.GNU-stack,"",%progbits
+-#endif
+-.global ICU_DATA_SYMBOL
+-.data
+-.balign 16
+-ICU_DATA_SYMBOL:
+-        .incbin ICU_DATA_FILE
+diff --git a/config/external/icu/data/moz.build b/config/external/icu/data/moz.build
+--- a/config/external/icu/data/moz.build
++++ b/config/external/icu/data/moz.build
+@@ -3,34 +3,34 @@
+ # This Source Code Form is subject to the terms of the Mozilla Public
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ 
+ # Build a library containing the ICU data for use in the JS shell, so that
+ # JSAPI consumers don't have to deal with setting ICU's data path.
+ Library('icudata')
+ 
+-if CONFIG['OS_ARCH'] == 'WINNT':
+-    if CONFIG['CPU_ARCH'] == 'x86':
+-        ASFLAGS += ['-DPREFIX']
+-elif CONFIG['OS_ARCH'] == 'Darwin':
+-    ASFLAGS += ['-DPREFIX']
+-
+-data_symbol = 'icudt%s_dat' % CONFIG['MOZ_ICU_VERSION']
+-asflags = [
+-    '-I%s/config/external/icu/data/' % TOPSRCDIR,
+-    '-DICU_DATA_FILE="%s"' % CONFIG['ICU_DATA_FILE'],
+-    '-DICU_DATA_SYMBOL=%s' % data_symbol,
+-]
+ LOCAL_INCLUDES += ['.']
+ 
+-if CONFIG['OS_TARGET'] == 'WINNT' and CONFIG['CPU_ARCH'] == 'aarch64':
+-    icudata = 'icudata.asm'
+-    GeneratedFile(icudata, script='genicudata.py',
+-                  inputs=[CONFIG['ICU_DATA_FILE']], flags=[data_symbol])
+-    SOURCES += ['!%s' % icudata]
+-elif CONFIG['HAVE_YASM']:
+-    USE_YASM = True
+-    SOURCES += ['icudata.s']
+-    ASFLAGS += asflags
+-elif CONFIG['GNU_AS']:
+-    SOURCES += ['icudata_gas.S']
+-    ASFLAGS += asflags
++prefix = ''
++if (CONFIG['OS_ARCH'] == 'WINNT' and CONFIG['CPU_ARCH'] == 'x86') or CONFIG['OS_ARCH'] == 'Darwin':
++    prefix = '_'
++
++data_file = '"%s/icudt%sl.dat"' % (SRCDIR, CONFIG['MOZ_ICU_VERSION'])
++data_symbol = '%sicudt%s_dat' % (prefix, CONFIG['MOZ_ICU_VERSION'])
++
++if CONFIG['OS_TARGET'] == 'WINNT' and CONFIG['CC_TYPE'] == 'msvc' and CONFIG['INTEL_ARCHITECTURE']:
++    USE_NASM = True
++    ASFLAGS += [
++        '-I%s/config/external/icu/data/' % TOPSRCDIR,
++        '-DICU_DATA_FILE=%s' % data_file,
++        '-DICU_DATA_SYMBOL=%s' % data_symbol,
++    ]
++    SOURCES += [
++        'icudata.s',
++    ]
++else:
++    DEFINES['ICU_DATA_FILE'] = data_file
++    DEFINES['ICU_DATA_SYMBOL'] = data_symbol
++
++    SOURCES += [
++        'icudata.c',
++    ]
+diff --git a/js/moz.configure b/js/moz.configure
+--- a/js/moz.configure
++++ b/js/moz.configure
+@@ -517,23 +517,16 @@ def icu_version(build_env):
+                     try:
+                         return str(int(define[2]))
+                     except ValueError:
+                         pass
+     die('Cannot determine ICU version number from uvernum.h header file')
+ 
+ set_config('MOZ_ICU_VERSION', icu_version)
+ 
+-@depends(icu_version, target, when='--with-intl-api')
+-def icu_data_file(version, target):
+-    # target.endianness is always 'big' or 'little'
+-    return 'icudt%s%s.dat' % (version, target.endianness[0])
+-
+-set_config('ICU_DATA_FILE', icu_data_file)
+-
+ # Source files that use ICU should have control over which parts of the ICU
+ # namespace they want to use.
+ set_define('U_USING_ICU_NAMESPACE', '0', when='--with-intl-api')
+ 
+ # We build ICU as a static library.
+ set_define('U_STATIC_IMPLEMENTATION', True, when=depends(system_icu)(lambda x: not x))
+ 
+ @depends(yasm, gnu_as, target, compile_environment)

+ 33 - 0
mozilla-release/patches/1656063-81a1.patch

@@ -0,0 +1,33 @@
+# HG changeset patch
+# User Paul Adenot <paul@paul.cx>
+# Date 1596198541 0
+# Node ID af3620fc2747c1b4dbb72504fd731e9e4e3779d3
+# Parent  a73351520d7b4e2532abbf7ea81767b1c778c0ce
+Bug 1656063 - Only load function pointer in FFTBlock once. r=karlt
+
+Differential Revision: https://phabricator.services.mozilla.com/D85389
+
+diff --git a/dom/media/webaudio/FFTBlock.h b/dom/media/webaudio/FFTBlock.h
+--- a/dom/media/webaudio/FFTBlock.h
++++ b/dom/media/webaudio/FFTBlock.h
+@@ -40,17 +40,19 @@ class FFTBlock final
+     };
+   };
+ 
+ public:
+   static void MainThreadInit()
+   {
+ #ifdef MOZ_LIBAV_FFT
+     FFVPXRuntimeLinker::Init();
+-    FFVPXRuntimeLinker::GetRDFTFuncs(&sRDFTFuncs);
++    if (!sRDFTFuncs.init) {
++      FFVPXRuntimeLinker::GetRDFTFuncs(&sRDFTFuncs);
++    }
+ #endif
+   }
+ 
+   explicit FFTBlock(uint32_t aFFTSize)
+ #if defined(MOZ_LIBAV_FFT)
+     : mAvRDFT(nullptr)
+     , mAvIRDFT(nullptr)
+ #else

+ 57 - 0
mozilla-release/patches/1669888-83a1.patch

@@ -0,0 +1,57 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1602134908 0
+# Node ID 28f9b51522350fb69977f08869eee9636e586bee
+# Parent  519ca1b069e5305d4f901539ea0f4c5ad9d8f54f
+Bug 1669888 - Enable ffvpx on all Mac platforms. r=jya
+
+Differential Revision: https://phabricator.services.mozilla.com/D92875
+
+diff --git a/media/ffvpx/libavcodec/avcodec.symbols b/media/ffvpx/libavcodec/avcodec.symbols
+--- a/media/ffvpx/libavcodec/avcodec.symbols
++++ b/media/ffvpx/libavcodec/avcodec.symbols
+@@ -46,19 +46,21 @@ av_packet_side_data_name
+ av_packet_split_side_data
+ av_packet_unpack_dictionary
+ av_packet_unref
+ av_parser_change
+ av_parser_close
+ av_parser_init
+ av_parser_next
+ av_parser_parse2
++#ifdef MOZ_LIBAV_FFT
+ av_rdft_calc
+ av_rdft_end
+ av_rdft_init
++#endif
+ av_register_codec_parser
+ av_register_hwaccel
+ av_shrink_packet
+ av_vorbis_parse_frame
+ av_vorbis_parse_frame_flags
+ av_vorbis_parse_free
+ av_vorbis_parse_init
+ av_vorbis_parse_reset
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1605,17 +1605,17 @@ with only_when(compile_environment | art
+     set_config('YASM_HAS_AVX2', yasm_has_avx2)
+ 
+ 
+     @depends(yasm_has_avx2, libav_fft, vpx_as_flags, target)
+     def ffvpx(yasm_has_avx2, libav_fft, vpx_as_flags, target):
+         enable = flac_only = use_yasm = False
+         flags = []
+         if target.cpu in ('x86', 'x86_64') or \
+-                target.cpu == 'aarch64' and target.kernel == 'WINNT':
++                target.cpu == 'aarch64' and target.kernel in ('WINNT', 'Darwin'):
+             enable = True
+             if libav_fft and libav_fft.flags:
+                 use_yasm = True
+                 flags.extend(libav_fft.flags)
+                 if target.kernel == 'WINNT':
+                     if target.cpu == 'x86':
+                         # 32-bit windows need to prefix symbols with an underscore.
+                         flags.extend(('-DPREFIX', '-Pconfig_win32.asm'))
+

+ 44 - 0
mozilla-release/patches/1692940-01-88a1.patch

@@ -0,0 +1,44 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043603 0
+#      Tue Feb 23 01:26:43 2021 +0000
+# Node ID 8b391b7adca21255b4367d2435ba109ae481301b
+# Parent  46eab535e154186b019026fb7c56fe776cb5ab97
+Bug 1692940 - Revert bug 1508419. r=firefox-build-system-reviewers,andi,dmajor
+
+Back when bug 1508419 landed, we weren't using a bootstrapped nasm. It
+is less useful now that we are.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105424
+
+diff --git a/build/moz.configure/toolchain.configure b/build/moz.configure/toolchain.configure
+--- a/build/moz.configure/toolchain.configure
++++ b/build/moz.configure/toolchain.configure
+@@ -2156,22 +2156,21 @@ add_old_configure_assignment('ENABLE_MOZ
+ # nasm detection
+ # ==============================================================
+ nasm = check_prog('NASM', ['nasm'], allow_missing=True, paths=toolchain_search_path)
+ 
+ 
+ @depends_if(nasm)
+ @checking('nasm version')
+ def nasm_version(nasm):
+-    (retcode, stdout, _) = get_cmd_output(nasm, '-v')
+-    if retcode:
+-        # mac stub binary
+-        return None
+-
+-    version = stdout.splitlines()[0].split()[2]
++    version = (
++        check_cmd_output(nasm, "-v", onerror=lambda: die("Failed to get nasm version."))
++        .splitlines()[0]
++        .split()[2]
++    )
+     return Version(version)
+ 
+ 
+ @depends(nasm, target)
+ def nasm_asflags(nasm, target):
+     if nasm:
+         asflags = {
+             ('OSX', 'x86'): ['-f', 'macho32'],

+ 208 - 0
mozilla-release/patches/1692940-02-88a1.patch

@@ -0,0 +1,208 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043603 0
+#      Tue Feb 23 01:26:43 2021 +0000
+# Node ID a22f5d28effbce01de85f81f00339389727e29ff
+# Parent  9d7ea1896cd11743c773a1d72b1e00161c0632d8
+Bug 1692940 - Change the logic to check for nasm. r=firefox-build-system-reviewers,dmajor
+
+Instead of preemptively check for it, and then check if it's good enough to
+build AV1, only check for (and bootstrap) nasm when building AV1 requires
+it.
+
+At the same time, we future-proof the code to be able to handle multiple
+things requiring nasm, which we're going to add shortly.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105425
+
+diff --git a/build/moz.configure/toolchain.configure b/build/moz.configure/toolchain.configure
+--- a/build/moz.configure/toolchain.configure
++++ b/build/moz.configure/toolchain.configure
+@@ -2148,53 +2148,16 @@ add_old_configure_assignment('ENABLE_CLA
+                              depends_if('--enable-clang-plugin')(lambda _: True))
+ 
+ js_option('--enable-mozsearch-plugin', env='ENABLE_MOZSEARCH_PLUGIN',
+           help="Enable building with the mozsearch indexer plugin")
+ 
+ add_old_configure_assignment('ENABLE_MOZSEARCH_PLUGIN',
+                              depends_if('--enable-mozsearch-plugin')(lambda _: True))
+ 
+-# nasm detection
+-# ==============================================================
+-nasm = check_prog('NASM', ['nasm'], allow_missing=True, paths=toolchain_search_path)
+-
+-
+-@depends_if(nasm)
+-@checking('nasm version')
+-def nasm_version(nasm):
+-    version = (
+-        check_cmd_output(nasm, "-v", onerror=lambda: die("Failed to get nasm version."))
+-        .splitlines()[0]
+-        .split()[2]
+-    )
+-    return Version(version)
+-
+-
+-@depends(nasm, target)
+-def nasm_asflags(nasm, target):
+-    if nasm:
+-        asflags = {
+-            ('OSX', 'x86'): ['-f', 'macho32'],
+-            ('OSX', 'x86_64'): ['-f', 'macho64'],
+-            ('WINNT', 'x86'): ['-f', 'win32'],
+-            ('WINNT', 'x86_64'): ['-f', 'win64'],
+-        }.get((target.os, target.cpu), None)
+-        if asflags is None:
+-            # We're assuming every x86 platform we support that's
+-            # not Windows or Mac is ELF.
+-            if target.cpu == 'x86':
+-                asflags = ['-f', 'elf32']
+-            elif target.cpu == 'x86_64':
+-                asflags = ['-f', 'elf64']
+-        return asflags
+-
+-
+-set_config('NASM_ASFLAGS', nasm_asflags)
+-
+ 
+ # clang-cl integrated assembler support
+ # ==============================================================
+ @depends(target)
+ def clangcl_asflags(target):
+     asflags = None
+     if target.os == 'WINNT' and target.cpu == 'aarch64':
+         asflags = ['--target=aarch64-windows-msvc']
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -403,26 +403,27 @@ imply_option('--enable-fmp4', ffmpeg, '-
+ option('--disable-av1',
+         help='Disable av1 video support')
+ 
+ @depends('--enable-av1')
+ def av1(value):
+     if value:
+         return True
+ 
+-@depends(target, nasm_version, when=av1 & compile_environment)
+-def dav1d_asm(target, nasm_version):
+-    if target.os != 'Android':
+-        if target.cpu == 'aarch64':
+-            return True
+-        elif target.cpu in ('x86', 'x86_64'):
+-            if nasm_version < '2.14':
+-                die('nasm 2.14 or greater is required for AV1 support. '
+-                    'Either install nasm or add --disable-av1 to your configure options.')
+-            return True
++
++@depends(target, when=av1 & compile_environment)
++def dav1d_asm(target):
++    if target.cpu in ("aarch64", "x86", "x86_64"):
++        return True
++
++
++@depends(target, when=av1 & compile_environment)
++def dav1d_nasm(target):
++    if target.cpu in ("x86", "x86_64"):
++        return namespace(version="2.14", what="AV1")
+ 
+ 
+ set_config('MOZ_DAV1D_ASM', dav1d_asm)
+ set_define('MOZ_DAV1D_ASM', dav1d_asm)
+ set_config('MOZ_AV1', av1)
+ set_define('MOZ_AV1', av1)
+ 
+ # Built-in fragmented MP4 support.
+@@ -1533,16 +1534,92 @@ def valid_yasm_version(yasm_version, for
+     by_version = sorted(versioned.items(), key=lambda x: x[1])
+     if by_version:
+         what, version = by_version[-1]
+         if yasm_version < version:
+             die('Yasm version %s or greater is required to build with %s.'
+                 % (version, what))
+ 
+ 
++# nasm detection
++# ==============================================================
++@depends(dav1d_nasm)
++def need_nasm(*requirements):
++    requires = {
++        x.what: x.version if hasattr(x, "version") else True for x in requirements if x
++    }
++    if requires:
++        items = sorted(requires.keys())
++        if len(items) > 1:
++            what = " and ".join((", ".join(items[:-1]), items[-1]))
++        else:
++            what = items[0]
++        versioned = {k: v for (k, v) in requires.items() if v is not True}
++        return namespace(what=what, versioned=versioned)
++
++
++nasm = check_prog(
++    "NASM",
++    ["nasm"],
++    allow_missing=True,
++    paths=bootstrap_search_path("nasm", when=need_nasm),
++    when=need_nasm,
++)
++
++
++@depends(nasm, need_nasm.what)
++def check_nasm(nasm, what):
++    if not nasm and what:
++        die("Nasm is required to build with %s, but it was not found." % what)
++    return nasm
++
++
++@depends_if(check_nasm)
++@checking("nasm version")
++def nasm_version(nasm):
++    version = (
++        check_cmd_output(nasm, "-v", onerror=lambda: die("Failed to get nasm version."))
++        .splitlines()[0]
++        .split()[2]
++    )
++    return Version(version)
++
++
++@depends(nasm_version, need_nasm.versioned, when=need_nasm.versioned)
++def check_nasm_version(nasm_version, versioned):
++    by_version = sorted(versioned.items(), key=lambda x: x[1])
++    what, version = by_version[-1]
++    if nasm_version < version:
++        die(
++            "Nasm version %s or greater is required to build with %s." % (version, what)
++        )
++    return nasm_version
++
++
++@depends(target, when=check_nasm_version)
++def nasm_asflags(target):
++    asflags = {
++        ("OSX", "x86"): ["-f", "macho32"],
++        ("OSX", "x86_64"): ["-f", "macho64"],
++        ("WINNT", "x86"): ["-f", "win32"],
++        ("WINNT", "x86_64"): ["-f", "win64"],
++    }.get((target.os, target.cpu), None)
++    if asflags is None:
++        # We're assuming every x86 platform we support that's
++        # not Windows or Mac is ELF.
++        if target.cpu == "x86":
++            asflags = ["-f", "elf32"]
++        elif target.cpu == "x86_64":
++            asflags = ["-f", "elf64"]
++    return asflags
++
++
++set_config("NASM_ASFLAGS", nasm_asflags)
++
++
+ # ANGLE OpenGL->D3D translator for WebGL
+ # ==============================================================
+ 
+ with only_when(compile_environment & target_is_windows):
+     def d3d_compiler_dll_result(value):
+         if not value.path:
+             return 'provided by the OS'
+         return value.path

+ 120 - 0
mozilla-release/patches/1692940-03-88a1.patch

@@ -0,0 +1,120 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043604 0
+#      Tue Feb 23 01:26:44 2021 +0000
+# Node ID 157125c6c140d4438c2745355307e4dfb58da631
+# Parent  ec793c77c014306408e31e9fa13a2a21d3f16641
+Bug 1692940 - Remove test case for yasm version validation. r=firefox-build-system-reviewers,dmajor
+
+We're going to remove arguments to the function, and eventually remove
+it.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105426
+
+diff --git a/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py b/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py
+--- a/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py
++++ b/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py
+@@ -6,17 +6,16 @@ from __future__ import absolute_import, 
+ 
+ import os
+ 
+ from buildconfig import topsrcdir
+ from common import BaseConfigureTest
+ from six import StringIO
+ from mozunit import main
+ from mozbuild.configure.options import InvalidOptionError
+-from mozbuild.configure.util import Version
+ from mozpack import path as mozpath
+ 
+ 
+ class TestToolkitMozConfigure(BaseConfigureTest):
+     def test_moz_configure_options(self):
+         def get_value_for(args=[], environ={}, mozconfig=''):
+             sandbox = self.get_sandbox({}, {}, args, environ, mozconfig)
+ 
+@@ -82,85 +81,11 @@ class TestToolkitMozConfigure(BaseConfig
+         self.assertEqual(get_value(['--enable-release'],
+                                    environ={'MOZILLA_OFFICIAL': 1}), None)
+ 
+         with self.assertRaises(InvalidOptionError):
+             get_value(['--disable-release'], environ={'MOZILLA_OFFICIAL': 1})
+ 
+         self.assertEqual(get_value(environ={'MOZ_AUTOMATION': 1}), None)
+ 
+-    def test_valid_yasm_version(self):
+-        out = StringIO()
+-        sandbox = self.get_sandbox({}, {}, out=out)
+-        func = sandbox._depends[sandbox['valid_yasm_version']]._func
+-
+-        # Missing yasm is not an error when nothing requires it.
+-        func(None, False, False)
+-
+-        # Any version of yasm works when nothing requires it.
+-        func(Version('1.0'), False, False)
+-
+-        # Any version of yasm works when something requires any version.
+-        func(Version('1.0'), True, False)
+-        func(Version('1.0'), True, True)
+-        func(Version('1.0'), False, True)
+-
+-        # A version of yasm greater than any requirement works.
+-        func(Version("1.5"), Version("1.0"), True)
+-        func(Version("1.5"), True, Version("1.0"))
+-        func(Version("1.5"), Version("1.1"), Version("1.0"))
+-
+-        out.truncate(0)
+-        out.seek(0)
+-        with self.assertRaises(SystemExit):
+-            func(None, Version('1.0'), False)
+-
+-        self.assertEqual(
+-            out.getvalue(),
+-            ('ERROR: Yasm is required to build with vpx, but you do not appear '
+-             'to have Yasm installed.\n'),
+-        )
+-
+-        out.truncate(0)
+-        out.seek(0)
+-        with self.assertRaises(SystemExit):
+-            func(None, Version('1.0'), Version('1.0'))
+-
+-        self.assertEqual(
+-            out.getvalue(),
+-            ('ERROR: Yasm is required to build with jpeg and vpx, but you do not appear '
+-             'to have Yasm installed.\n'),
+-        )
+-
+-        out.truncate(0)
+-        out.seek(0)
+-        with self.assertRaises(SystemExit):
+-            func(None, Version('1.0'), Version('1.0'))
+-
+-        self.assertEqual(
+-            out.getvalue(),
+-            ('ERROR: Yasm is required to build with jpeg, libav and vpx, but you do not appear '
+-             'to have Yasm installed.\n'),
+-        )
+-
+-        out.truncate(0)
+-        out.seek(0)
+-        with self.assertRaises(SystemExit):
+-            func(Version('1.0'), Version('1.1'), Version('1.0'))
+-
+-        self.assertEqual(
+-            out.getvalue(),
+-            'ERROR: Yasm version 1.1 or greater is required to build with vpx.\n'
+-        )
+-
+-        out.truncate(0)
+-        out.seek(0)
+-        with self.assertRaises(SystemExit):
+-            func(Version('1.0'), True, Version('1.0.1'))
+-
+-        self.assertEqual(
+-            out.getvalue(),
+-            'ERROR: Yasm version 1.0.1 or greater is required to build with jpeg.\n'
+-        )
+-
+ 
+ if __name__ == '__main__':
+     main()

+ 166 - 0
mozilla-release/patches/1692940-04-88a1.patch

@@ -0,0 +1,166 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043604 0
+#      Tue Feb 23 01:26:44 2021 +0000
+# Node ID 2d3e201b3724cc09a368f6b2ff47ff2364d6694b
+# Parent  13824fb0add8cd564564c907fec87cba0785966a
+Bug 1692940 - Switch vpx build to nasm instead of yasm. r=firefox-build-system-reviewers,dmajor
+
+We also remove the dependency on the check for GNU as, because all the
+build environments we support for arm use GNU as, and the dependency
+causes complications.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105427
+
+diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
+--- a/media/libvpx/moz.build
++++ b/media/libvpx/moz.build
+@@ -4,18 +4,18 @@
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ 
+ with Files('*'):
+     BUG_COMPONENT = ('Core', 'Audio/Video')
+ 
+ include('sources.mozbuild')
+ 
+-if CONFIG['VPX_USE_YASM']:
+-    USE_YASM = True
++if CONFIG['VPX_USE_NASM']:
++    USE_NASM = True
+ 
+ # Linux, Mac and Win share file lists for x86* but not configurations.
+ if CONFIG['CPU_ARCH'] == 'x86_64':
+     EXPORTS.vpx += files['X64_EXPORTS']
+     SOURCES += files['X64_SOURCES']
+     if CONFIG['OS_TARGET'] == 'WINNT':
+         ASFLAGS += [ '-I%s/media/libvpx/config/win/x64/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libvpx/config/win/x64/' ]
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1232,32 +1232,31 @@ with only_when(compile_environment):
+         check_symbol('vpx_codec_dec_init_ver', flags=vpx.libs, onerror=lambda: die(
+             "--with-system-libvpx requested but symbol vpx_codec_dec_init_ver "
+             "not found"
+         ))
+ 
+         set_config('MOZ_SYSTEM_LIBVPX', True)
+ 
+ 
+-    @depends('--with-system-libvpx', target, gnu_as)
+-    def in_tree_vpx(system_libvpx, target, gnu_as):
++    @depends('--with-system-libvpx', target)
++    def in_tree_vpx(system_libvpx, target):
+         if system_libvpx:
+             return
+ 
+-        use_yasm = (target.cpu in ('x86', 'x86_64')) or None
+-        need_yasm = False
+-        arm_asm = (target.cpu == 'arm' and gnu_as) or None
++        arm_asm = (target.cpu == "arm") or None
++        return namespace(arm_asm=arm_asm)
+ 
+-        if use_yasm:
+-            need_yasm = True
+-            if target.kernel == 'WINNT':
+-                need_yasm = Version('1.1')
+-
+-        return namespace(arm_asm=arm_asm, use_yasm=use_yasm, need_yasm=need_yasm)
+-
++    @depends(target, when=in_tree_vpx)
++    def vpx_nasm(target):
++        if target.cpu in ("x86", "x86_64"):
++            if target.kernel == "WINNT":
++                # Version 2.03 is needed for automatic safeseh support.
++                return namespace(version="2.03", what="VPX")
++            return namespace(what="VPX")
+ 
+     # Building with -mfpu=neon requires either the "softfp" or the
+     # "hardfp" ABI. Depending on the compiler's default target, and the
+     # CFLAGS, the default ABI might be neither, in which case it is the
+     # "softfloat" ABI.
+     # The "softfloat" ABI is binary-compatible with the "softfp" ABI, so
+     # we can safely mix code built with both ABIs. So, if we detect
+     # that compiling uses the "softfloat" ABI, force the use of the
+@@ -1266,31 +1265,31 @@ with only_when(compile_environment):
+     # "softfloat" ABI, not the "softfp" ABI.
+     # Note: VPX_ASFLAGS is also used in CFLAGS.
+     softfp = cxx_compiler.try_compile(body='''
+         #ifndef __SOFTFP__
+         #error "compiler target supports -mfpu=neon, so we don't have to add extra flags"
+         #endif''', when=in_tree_vpx.arm_asm)
+ 
+ 
+-    @depends(in_tree_vpx, softfp, target)
+-    def vpx_as_flags(vpx, softfp, target):
++    @depends(in_tree_vpx, vpx_nasm, softfp, target)
++    def vpx_as_flags(vpx, vpx_nasm, softfp, target):
+         flags = []
+         if vpx and vpx.arm_asm:
+             # These flags are a lie; they're just used to enable the requisite
+             # opcodes; actual arch detection is done at runtime.
+             flags = ['-march=armv7-a', '-mfpu=neon']
+             if softfp:
+                 flags.append('-mfloat-abi=softfp')
+-        elif vpx and vpx.use_yasm and target.os != 'WINNT' and target.cpu != 'x86_64':
++        elif vpx and vpx_nasm and target.os != "WINNT" and target.cpu != "x86_64":
+             flags = ['-DPIC']
+         return flags
+ 
+ 
+-    set_config('VPX_USE_YASM', in_tree_vpx.use_yasm)
++    set_config("VPX_USE_NASM", True, when=vpx_nasm)
+     set_config('VPX_ASFLAGS', vpx_as_flags)
+ 
+ 
+ # JPEG
+ # ====
+ 
+ with only_when(compile_environment):
+     option('--with-system-jpeg', nargs='?',
+@@ -1505,23 +1504,22 @@ with only_when(compile_environment):
+     set_config('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+     set_config("FFVPX_USE_YASM", True, when=ffvpx.need_yasm)
+ 
+ 
+-@depends(yasm_version, in_tree_vpx.need_yasm, in_tree_jpeg.use_yasm,
++@depends(yasm_version, in_tree_jpeg.use_yasm,
+          ffvpx.need_yasm)
+ @imports(_from='__builtin__', _import='sorted')
+-def valid_yasm_version(yasm_version, for_vpx, for_jpeg, for_ffvpx=False):
++def valid_yasm_version(yasm_version, for_jpeg, for_ffvpx=False):
+     # Note: the default for for_ffvpx above only matters for unit tests.
+     requires = {
+-        'vpx': for_vpx,
+         'jpeg': for_jpeg,
+         'ffvpx': for_ffvpx,
+     }
+     requires = {k: v for (k, v) in requires.items() if v}
+     if requires and not yasm_version:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = ' and '.join((', '.join(items[:-1]), items[-1]))
+@@ -1536,17 +1534,17 @@ def valid_yasm_version(yasm_version, for
+         what, version = by_version[-1]
+         if yasm_version < version:
+             die('Yasm version %s or greater is required to build with %s.'
+                 % (version, what))
+ 
+ 
+ # nasm detection
+ # ==============================================================
+-@depends(dav1d_nasm)
++@depends(dav1d_nasm, vpx_nasm)
+ def need_nasm(*requirements):
+     requires = {
+         x.what: x.version if hasattr(x, "version") else True for x in requirements if x
+     }
+     if requires:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = " and ".join((", ".join(items[:-1]), items[-1]))

+ 73 - 38
mozilla-release/patches/1692940-5-88a1.patch → mozilla-release/patches/1692940-05-88a1.patch

@@ -3,7 +3,7 @@
 # Date 1614043605 0
 #      Tue Feb 23 01:26:45 2021 +0000
 # Node ID 2d78e4bc3367320976d35629044085e8ee26a1fe
-# Parent  9ca7d0aae25674fb8e42387fead0b7ca3571a3be
+# Parent  e42b1e9178d2725962db0e7bb52a3880f3242f23
 Bug 1692940 - Switch jpeg build to nasm instead of yasm. r=firefox-build-system-reviewers,dmajor
 
 Differential Revision: https://phabricator.services.mozilla.com/D105428
@@ -35,7 +35,7 @@ diff --git a/media/libjpeg/moz.build b/media/libjpeg/moz.build
 diff --git a/toolkit/moz.configure b/toolkit/moz.configure
 --- a/toolkit/moz.configure
 +++ b/toolkit/moz.configure
-@@ -1354,51 +1354,45 @@ with only_when(compile_environment):
+@@ -1348,51 +1348,45 @@ with only_when(compile_environment):
          set_config('MOZ_JPEG_CFLAGS', jpeg_flags.cflags)
          set_config('MOZ_JPEG_LIBS', jpeg_flags.ldflags)
  
@@ -47,56 +47,44 @@ diff --git a/toolkit/moz.configure b/toolkit/moz.configure
 -        flags = ()
 -        use_yasm = None
 -        need_yasm = False
--        if target.kernel == 'Darwin':
--            if target.cpu == 'x86':
+         if target.kernel == 'Darwin':
+             if target.cpu == 'x86':
 -                flags = ('-DPIC', '-DMACHO')
--            elif target.cpu == 'x86_64':
--                flags = ('-D__x86_64__', '-DPIC', '-DMACHO')
--        elif target.kernel == 'WINNT':
--            if target.cpu == 'x86':
--                flags = ('-DPIC', '-DWIN32')
--            elif target.cpu == 'x86_64':
--                flags = ('-D__x86_64__', '-DPIC', '-DWIN64', '-DMSVC')
--        elif target.cpu == 'arm':
--            flags = ('-march=armv7-a', '-mfpu=neon')
--        elif target.cpu == 'aarch64':
--            flags = ('-march=armv8-a',)
--        elif target.cpu == 'mips32':
--            flags = ('-mdspr2',)
--        elif target.cpu == 'x86':
--            flags = ('-DPIC', '-DELF')
--        elif target.cpu == 'x86_64':
--            flags = ('-D__x86_64__', '-DPIC', '-DELF')
-+        if target.kernel == "Darwin":
-+            if target.cpu == "x86":
 +                return ("-DPIC", "-DMACHO")
-+            elif target.cpu == "x86_64":
+             elif target.cpu == 'x86_64':
+-                flags = ('-D__x86_64__', '-DPIC', '-DMACHO')
 +                return ("-D__x86_64__", "-DPIC", "-DMACHO")
-+        elif target.kernel == "WINNT":
-+            if target.cpu == "x86":
+         elif target.kernel == 'WINNT':
+             if target.cpu == 'x86':
+-                flags = ('-DPIC', '-DWIN32')
 +                return ("-DPIC", "-DWIN32")
-+            elif target.cpu == "x86_64":
+             elif target.cpu == 'x86_64':
+-                flags = ('-D__x86_64__', '-DPIC', '-DWIN64', '-DMSVC')
 +                return ("-D__x86_64__", "-DPIC", "-DWIN64", "-DMSVC")
-+        elif target.cpu == "arm":
+         elif target.cpu == 'arm':
+-            flags = ('-march=armv7-a', '-mfpu=neon')
 +            return ("-march=armv7-a", "-mfpu=neon")
-+        elif target.cpu == "aarch64":
+         elif target.cpu == 'aarch64':
+-            flags = ('-march=armv8-a',)
 +            return ("-march=armv8-a",)
-+        elif target.cpu == "mips32":
+         elif target.cpu == 'mips32':
+-            flags = ('-mdspr2',)
 +            return ("-mdspr2",)
-+        elif target.cpu == "x86":
+         elif target.cpu == 'x86':
+-            flags = ('-DPIC', '-DELF')
 +            return ("-DPIC", "-DELF")
-+        elif target.cpu == "x86_64":
+         elif target.cpu == 'x86_64':
+-            flags = ('-D__x86_64__', '-DPIC', '-DELF')
 +            return ("-D__x86_64__", "-DPIC", "-DELF")
  
--        if target.cpu in ('x86', 'x86_64'):
++    @depends(target, when=in_tree_jpeg)
++    def jpeg_nasm(target):
+         if target.cpu in ('x86', 'x86_64'):
 -            use_yasm = True
 -            if target.kernel == 'Linux' and target.os == 'GNU':
 -                need_yasm = Version('1.0.1')
 -            else:
 -                need_yasm = Version('1.1')
-+    @depends(target, when=in_tree_jpeg)
-+    def jpeg_nasm(target):
-+        if target.cpu in ("x86", "x86_64"):
 +            # libjpeg-turbo 2.0.6 requires nasm 2.10.
 +            return namespace(version="2.10", what="JPEG")
  
@@ -108,9 +96,56 @@ diff --git a/toolkit/moz.configure b/toolkit/moz.configure
 +    set_config("LIBJPEG_TURBO_ASFLAGS", in_tree_jpeg)
  
  
- # Libav-fft Support
+ # FFmpeg's ffvpx configuration
  # ==============================================================
  with only_when(compile_environment):
      @depends(target)
      def libav_fft(target):
-         flags = None
+         return target.kernel == "WINNT" or target.cpu == "x86_64"
+@@ -1465,23 +1459,23 @@ with only_when(compile_environment):
+     set_config('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+     set_config("FFVPX_USE_YASM", True, when=ffvpx.need_yasm)
+ 
+ 
+-@depends(yasm_version, in_tree_jpeg.use_yasm,
+-         ffvpx.need_yasm)
++@depends(yasm_version,
++         ffvpx.use_yasm,
++)
+ @imports(_from='__builtin__', _import='sorted')
+-def valid_yasm_version(yasm_version, for_jpeg, for_ffvpx=False):
++def valid_yasm_version(yasm_version, for_ffvpx=False):
+     # Note: the default for for_ffvpx above only matters for unit tests.
+     requires = {
+-        'jpeg': for_jpeg,
+         'ffvpx': for_ffvpx,
+     }
+     requires = {k: v for (k, v) in requires.items() if v}
+     if requires and not yasm_version:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = ' and '.join((', '.join(items[:-1]), items[-1]))
+         else:
+@@ -1495,17 +1489,17 @@ def valid_yasm_version(yasm_version, for
+         what, version = by_version[-1]
+         if yasm_version < version:
+             die('Yasm version %s or greater is required to build with %s.'
+                 % (version, what))
+ 
+ 
+ # nasm detection
+ # ==============================================================
+-@depends(dav1d_nasm, vpx_nasm)
++@depends(dav1d_nasm, vpx_nasm, jpeg_nasm)
+ def need_nasm(*requirements):
+     requires = {
+         x.what: x.version if hasattr(x, "version") else True for x in requirements if x
+     }
+     if requires:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = " and ".join((", ".join(items[:-1]), items[-1]))
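
Editorial note: the hunk above drops the per-library yasm bookkeeping for jpeg and routes it through the single need_nasm() aggregator, which collects one namespace per consumer (dav1d, vpx, jpeg) with a label and an optional minimum version. As a rough illustration of that pattern — plain Python, not the actual moz.configure code; the Requirement tuple, the vkey() helper and the dav1d version number are invented for this sketch — the aggregation boils down to:

    # Each consumer either contributes nothing (None) or a label plus an
    # optional minimum nasm version; the strictest version requirement wins.
    from collections import namedtuple

    Requirement = namedtuple("Requirement", "what version")

    def vkey(v):
        # crude stand-in for mozbuild's Version class
        return tuple(int(p) for p in v.split("."))

    def summarize(*requirements):
        requires = {r.what: r.version for r in requirements if r}
        if not requires:
            return "nasm is not required"
        items = sorted(requires)
        if len(items) > 1:
            what = " and ".join((", ".join(items[:-1]), items[-1]))
        else:
            what = items[0]
        versioned = [v for v in requires.values() if v]
        minimum = max(versioned, key=vkey) if versioned else "any"
        return "nasm %s or greater is required to build with %s" % (minimum, what)

    print(summarize(Requirement("JPEG", "2.10"), None, Requirement("dav1d", "2.14")))
    # -> nasm 2.14 or greater is required to build with JPEG and dav1d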

+ 218 - 0
mozilla-release/patches/1692940-06-88a1.patch

@@ -0,0 +1,218 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043605 0
+#      Tue Feb 23 01:26:45 2021 +0000
+# Node ID da0ee340f69903904c61da6b2d1cfac2d3aca4f2
+# Parent  d17a4e2acf84047fcb1a47598747760378dbc8a4
+Bug 1692940 - Switch ffvpx build to nasm instead of yasm. r=firefox-build-system-reviewers,dmajor
+
+nasm doesn't like compiling simple_idct10.asm on x86
+(https://bugzilla.nasm.us/show_bug.cgi?id=3392738), which is empty once
+preprocessed for x86, so exclude it there.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105429
+
+diff --git a/media/ffvpx/ffvpxcommon.mozbuild b/media/ffvpx/ffvpxcommon.mozbuild
+--- a/media/ffvpx/ffvpxcommon.mozbuild
++++ b/media/ffvpx/ffvpxcommon.mozbuild
+@@ -7,18 +7,18 @@
+ # Add assembler flags and includes
+ if CONFIG['CPU_ARCH'] != 'aarch64':
+     ASFLAGS += CONFIG['FFVPX_ASFLAGS']
+     ASFLAGS += ['-I%s/media/ffvpx/' % TOPSRCDIR]
+     ASFLAGS += ['-I%s/media/ffvpx/libavcodec/x86/' % TOPSRCDIR]
+     ASFLAGS += ['-I%s/media/ffvpx/libavutil/x86/' % TOPSRCDIR]
+ 
+ if CONFIG['FFVPX_ASFLAGS']:
+-    if CONFIG['FFVPX_USE_YASM']:
+-        USE_YASM = True
++    if CONFIG['FFVPX_USE_NASM']:
++        USE_NASM = True
+ 
+     if CONFIG['OS_ARCH'] == 'WINNT':
+        # Fix inline symbols and math defines for windows.
+         DEFINES['_USE_MATH_DEFINES'] = True
+         DEFINES['inline'] = "__inline"
+ 
+ LOCAL_INCLUDES += ['/media/ffvpx']
+ 
+diff --git a/media/ffvpx/libavcodec/x86/moz.build b/media/ffvpx/libavcodec/x86/moz.build
+--- a/media/ffvpx/libavcodec/x86/moz.build
++++ b/media/ffvpx/libavcodec/x86/moz.build
+@@ -6,16 +6,18 @@
+ 
+ SOURCES += [
+     'constants.c',
+     'flacdsp.asm',
+     'flacdsp_init.c',
+     'h264_intrapred.asm',
+     'h264_intrapred_10bit.asm',
+     'h264_intrapred_init.c',
++# Bug 1582271
++#    -    'simple_idct10.asm',
+     'videodsp.asm',
+     'videodsp_init.c',
+     'vp8dsp.asm',
+     'vp8dsp_init.c',
+     'vp8dsp_loopfilter.asm',
+     'vp9dsp_init.c',
+     'vp9dsp_init_10bpp.c',
+     'vp9dsp_init_12bpp.c',
+@@ -25,16 +27,22 @@ SOURCES += [
+     'vp9itxfm.asm',
+     'vp9itxfm_16bpp.asm',
+     'vp9lpf.asm',
+     'vp9lpf_16bpp.asm',
+     'vp9mc.asm',
+     'vp9mc_16bpp.asm',
+ ]
+ 
++# Bug 1582271
++# if CONFIG['CPU_ARCH'] == "x86_64":
++#     SOURCES += [
++#         'simple_idct10.asm',
++#     ]
++
+ if CONFIG['MOZ_LIBAV_FFT']:
+     SOURCES += [
+         'fft.asm',
+         'fft_init.c',
+     ]
+ 
+ FINAL_LIBRARY = 'mozavcodec'
+ 
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1392,19 +1392,19 @@ with only_when(compile_environment):
+         return target.kernel == "WINNT" or target.cpu == "x86_64"
+ 
+     set_config('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+     set_define('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+ 
+ 
+ with only_when(compile_environment):
+ 
+-    @depends(vpx_as_flags, target)
+-    def ffvpx(vpx_as_flags, target):
+-        enable = use_yasm = True
++    @depends(target)
++    def ffvpx(target):
++        enable = use_nasm = True
+         flac_only = False
+         flags = []
+ 
+         if target.kernel == "WINNT":
+             if target.cpu == "x86":
+                 # 32-bit windows need to prefix symbols with an underscore.
+                 flags = ["-DPIC", "-DWIN32", "-DPREFIX", "-Pconfig_win32.asm"]
+             elif target.cpu == "x86_64":
+@@ -1412,17 +1412,17 @@ with only_when(compile_environment):
+                     "-D__x86_64__",
+                     "-DPIC",
+                     "-DWIN64",
+                     "-DMSVC",
+                     "-Pconfig_win64.asm",
+                 ]
+             elif target.cpu == "aarch64":
+                 flags = ["-DPIC", "-DWIN64"]
+-                use_yasm = False
++                use_nasm = False
+         elif target.kernel == "Darwin":
+             if target.cpu == "x86_64":
+                 # 32/64-bit macosx asemblers need to prefix symbols with an
+                 # underscore.
+                 flags = [
+                     "-D__x86_64__",
+                     "-DPIC",
+                     "-DMACHO",
+@@ -1430,53 +1430,59 @@ with only_when(compile_environment):
+                     "-Pconfig_darwin64.asm",
+                 ]
+             else:
+                 flac_only = True
+         elif target.cpu == "x86_64":
+             flags = ["-D__x86_64__", "-DPIC", "-DELF", "-Pconfig_unix64.asm"]
+         elif target.cpu == "x86":
+             flac_only = True
+-        elif target.cpu in ("arm", "aarch64"):
+-            flac_only = True
+-            flags.extend(vpx_as_flags)
+         else:
+             enable = False
+ 
+         if flac_only or not enable:
+-            use_yasm = False
++            use_nasm = False
+ 
+-        if use_yasm:
++        if use_nasm:
+             # default disabled components
+             flags.append('-Pdefaults_disabled.asm')
+ 
+         return namespace(
+             enable=enable,
+-            need_yasm="1.2" if use_yasm else False,
++            use_nasm=use_nasm,
+             flac_only=flac_only,
+             flags=flags,
+         )
+ 
++    @depends(when=ffvpx.use_nasm)
++    def ffvpx_nasm():
++        # nasm 2.10 for AVX-2 support.
++        return namespace(version="2.10", what="FFVPX")
++
++    # ffvpx_nasm can't indirectly depend on vpx_as_flags, because it depends
++    # on a compiler test, so we have to do a little bit of dance here.
++    @depends(ffvpx, vpx_as_flags, target)
++    def ffvpx(ffvpx, vpx_as_flags, target):
++        if ffvpx and target.cpu in ("arm", "aarch64"):
++            ffvpx.flags.extend(vpx_as_flags)
++        return ffvpx
+ 
+     set_config('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+-    set_config("FFVPX_USE_YASM", True, when=ffvpx.need_yasm)
++    set_config("FFVPX_USE_NASM", True, when=ffvpx.use_nasm)
+ 
+ 
+ @depends(yasm_version,
+-         ffvpx.use_yasm,
+ )
+ @imports(_from='__builtin__', _import='sorted')
+-def valid_yasm_version(yasm_version, for_ffvpx=False):
+-    # Note: the default for for_ffvpx above only matters for unit tests.
++def valid_yasm_version(yasm_version):
+     requires = {
+-        'ffvpx': for_ffvpx,
+     }
+     requires = {k: v for (k, v) in requires.items() if v}
+     if requires and not yasm_version:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = ' and '.join((', '.join(items[:-1]), items[-1]))
+         else:
+             what = items[0]
+@@ -1489,17 +1495,17 @@ def valid_yasm_version(yasm_version, for
+         what, version = by_version[-1]
+         if yasm_version < version:
+             die('Yasm version %s or greater is required to build with %s.'
+                 % (version, what))
+ 
+ 
+ # nasm detection
+ # ==============================================================
+-@depends(dav1d_nasm, vpx_nasm, jpeg_nasm)
++@depends(dav1d_nasm, vpx_nasm, jpeg_nasm, ffvpx_nasm)
+ def need_nasm(*requirements):
+     requires = {
+         x.what: x.version if hasattr(x, "version") else True for x in requirements if x
+     }
+     if requires:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = " and ".join((", ".join(items[:-1]), items[-1]))
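
Editorial note on the FFVPX_ASFLAGS handled earlier in this patch: nasm's -P option pre-includes the named file ahead of the translation unit, so config_win64.asm (or its unix/darwin counterparts) and defaults_disabled.asm reach every assembly file without each one having to %include them. A sketch of the resulting command line follows — assembled here by hand for illustration only; the build system generates it, and the object file name is hypothetical:

    # Not a literal build-log line, just the shape of the invocation on win64.
    asflags = ["-D__x86_64__", "-DPIC", "-DWIN64", "-DMSVC",
               "-Pconfig_win64.asm", "-Pdefaults_disabled.asm"]
    cmd = ["nasm", "-f", "win64"] + asflags + ["-o", "vp8dsp.obj", "vp8dsp.asm"]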

+ 80 - 0
mozilla-release/patches/1692940-07-88a1.patch

@@ -0,0 +1,80 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043606 0
+#      Tue Feb 23 01:26:46 2021 +0000
+# Node ID 3b1e09a1e421f6767391ff03b91550a2ffb7f66f
+# Parent  bfa6009b2b705ad4064c1b74cfa7d65bd854fcc6
+Bug 1692940 - Switch aom build to nasm instead of yasm. r=firefox-build-system-reviewers,jbauman,dmajor,andi
+
+nasm doesn't like compiling x86_abi_support.asm
+(https://bugzilla.nasm.us/show_bug.cgi?id=3392738), which is actually an
+include file, rather than a source file, so it shouldn't have been in
+the list of sources in the first place (libvpx has a similar file that
+is excluded already, for instance).
+
+I was considering updating the vendoring script, but it turns out it
+doesn't produce the current contents in-tree (which even breaks the
+build), and aom is set to be removed (bug 1635296)...
+
+Differential Revision: https://phabricator.services.mozilla.com/D105430
+
+diff --git a/media/libaom/moz.build b/media/libaom/moz.build
+--- a/media/libaom/moz.build
++++ b/media/libaom/moz.build
+@@ -8,33 +8,33 @@ with Files('*'):
+     BUG_COMPONENT = ('Core', 'Audio/Video')
+ 
+ include('sources.mozbuild')
+ 
+ # Linux, Mac and Win share file lists for x86* but not configurations.
+ if CONFIG['CPU_ARCH'] == 'x86_64':
+     EXPORTS.aom += files['X64_EXPORTS']
+     SOURCES += files['X64_SOURCES']
+-    USE_YASM = True
++    USE_NASM = True
+     if CONFIG['OS_TARGET'] == 'WINNT':
+         ASFLAGS += [ '-I%s/media/libaom/config/win/x64/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/win/x64/' ]
+         EXPORTS.aom += [ 'config/win/x64/config/aom_config.h' ]
+     elif CONFIG['OS_TARGET'] == 'Darwin':
+         ASFLAGS += [ '-I%s/media/libaom/config/mac/x64/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/mac/x64/' ]
+         EXPORTS.aom += [ 'config/mac/x64/config/aom_config.h' ]
+     else: # Android, Linux, BSDs, etc.
+         ASFLAGS += [ '-I%s/media/libaom/config/linux/x64/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/linux/x64/' ]
+         EXPORTS.aom += [ 'config/linux/x64/config/aom_config.h' ]
+ elif CONFIG['CPU_ARCH'] == 'x86':
+     EXPORTS.aom += files['IA32_EXPORTS']
+     SOURCES += files['IA32_SOURCES']
+-    USE_YASM = True
++    USE_NASM = True
+     if CONFIG['OS_TARGET'] == 'WINNT':
+         ASFLAGS += [ '-I%s/media/libaom/config/win/ia32/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/win/ia32/' ]
+         EXPORTS.aom += [ 'config/win/ia32/config/aom_config.h' ]
+     else: # Android, Linux, BSDs, etc.
+         ASFLAGS += [ '-I%s/media/libaom/config/linux/ia32/' % TOPSRCDIR ]
+         LOCAL_INCLUDES += [ '/media/libaom/config/linux/ia32/' ]
+         EXPORTS.aom += [ 'config/linux/ia32/config/aom_config.h' ]
+diff --git a/media/libaom/sources.mozbuild b/media/libaom/sources.mozbuild
+--- a/media/libaom/sources.mozbuild
++++ b/media/libaom/sources.mozbuild
+@@ -271,17 +271,16 @@ files = {
+     '../../third_party/aom/aom_dsp/x86/intrapred_avx2.c',
+     '../../third_party/aom/aom_dsp/x86/intrapred_sse2.c',
+     '../../third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm',
+     '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.c',
+     '../../third_party/aom/aom_dsp/x86/inv_wht_sse2.asm',
+     '../../third_party/aom/aom_dsp/x86/loopfilter_sse2.c',
+     '../../third_party/aom/aom_mem/aom_mem.c',
+     '../../third_party/aom/aom_ports/emms.asm',
+-    '../../third_party/aom/aom_ports/x86_abi_support.asm',
+     '../../third_party/aom/aom_scale/aom_scale_rtcd.c',
+     '../../third_party/aom/aom_scale/generic/aom_scale.c',
+     '../../third_party/aom/aom_scale/generic/gen_scalers.c',
+     '../../third_party/aom/aom_scale/generic/yv12config.c',
+     '../../third_party/aom/aom_scale/generic/yv12extend.c',
+     '../../third_party/aom/aom_util/aom_thread.c',
+     '../../third_party/aom/aom_util/debug_util.c',
+     '../../third_party/aom/av1/av1_dx_iface.c',
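
Editorial note: the removal above matches the commit message. x86_abi_support.asm is only ever consumed by the other assembly files (typically via a nasm/yasm directive along the lines of %include "aom_ports/x86_abi_support.asm"), so listing it in SOURCES made the build try to assemble it on its own, which is what nasm rejected. Dropping it from the source list — as libvpx already does for its equivalent file — is the actual fix rather than a nasm workaround.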

+ 89 - 0
mozilla-release/patches/1692940-08-88a1.patch

@@ -0,0 +1,89 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043606 0
+#      Tue Feb 23 01:26:46 2021 +0000
+# Node ID d74f2f0996f984e0cfdf6a07421a6fd50dc25db2
+# Parent  9fc987186e4c2d1319a3a36b1625eb2484c2cc81
+Bug 1692940 - Turn the USE_YASM unit test into a USE_NASM one. r=firefox-build-system-reviewers,dmajor

+
+Differential Revision: https://phabricator.services.mozilla.com/D105431
+
+diff --git a/python/mozbuild/mozbuild/test/frontend/data/use-yasm/moz.build b/python/mozbuild/mozbuild/test/frontend/data/use-nasm/moz.build
+rename from python/mozbuild/mozbuild/test/frontend/data/use-yasm/moz.build
+rename to python/mozbuild/mozbuild/test/frontend/data/use-nasm/moz.build
+--- a/python/mozbuild/mozbuild/test/frontend/data/use-yasm/moz.build
++++ b/python/mozbuild/mozbuild/test/frontend/data/use-nasm/moz.build
+@@ -3,11 +3,11 @@
+ # http://creativecommons.org/publicdomain/zero/1.0/
+ 
+ @template
+ def Library(name):
+     LIBRARY_NAME = name
+ 
+ Library('dummy')
+ 
+-USE_YASM = True
++USE_NASM = True
+ 
+ SOURCES += ['test1.S']
+diff --git a/python/mozbuild/mozbuild/test/frontend/data/use-nasm/test1.S b/python/mozbuild/mozbuild/test/frontend/data/use-nasm/test1.S
+new file mode 100644
+diff --git a/python/mozbuild/mozbuild/test/frontend/data/use-yasm/test1.S b/python/mozbuild/mozbuild/test/frontend/data/use-yasm/test1.S
+deleted file mode 100644
+diff --git a/python/mozbuild/mozbuild/test/frontend/test_emitter.py b/python/mozbuild/mozbuild/test/frontend/test_emitter.py
+--- a/python/mozbuild/mozbuild/test/frontend/test_emitter.py
++++ b/python/mozbuild/mozbuild/test/frontend/test_emitter.py
+@@ -425,43 +425,43 @@ class TestEmitterBasic(unittest.TestCase
+ 
+     def test_disable_compiler_warnings(self):
+         reader = self.reader('disable-compiler-warnings', extra_substs={
+             'WARNINGS_CFLAGS': '-Wall',
+         })
+         sources, ldflags, lib, flags = self.read_topsrcdir(reader)
+         self.assertEqual(flags.flags['WARNINGS_CFLAGS'], [])
+ 
+-    def test_use_yasm(self):
+-        # When yasm is not available, this should raise.
+-        reader = self.reader('use-yasm')
++    def test_use_nasm(self):
++        # When nasm is not available, this should raise.
++        reader = self.reader("use-nasm")
+         with six.assertRaisesRegex(self, SandboxValidationError,
+-                                   'yasm is not available'):
++                                   'nasm is not available'):
+             self.read_topsrcdir(reader)
+ 
+-        # When yasm is available, this should work.
+-        reader = self.reader('use-yasm',
++        # When nasm is available, this should work.
++        reader = self.reader('use-nasm',
+                              extra_substs=dict(
+-                                 YASM='yasm',
+-                                 YASM_ASFLAGS='-foo',
++                                 NASM='nasm',
++                                 NASM_ASFLAGS='-foo',
+                              ))
+ 
+         sources, passthru, ldflags, lib, flags, asflags = self.read_topsrcdir(reader)
+ 
+         self.assertIsInstance(passthru, VariablePassthru)
+         self.assertIsInstance(ldflags, ComputedFlags)
+         self.assertIsInstance(flags, ComputedFlags)
+         self.assertIsInstance(asflags, ComputedFlags)
+ 
+-        self.assertEqual(asflags.flags['OS'], reader.config.substs['YASM_ASFLAGS'])
++        self.assertEqual(asflags.flags['OS'], reader.config.substs['NASM_ASFLAGS'])
+ 
+         maxDiff = self.maxDiff
+         self.maxDiff = None
+         self.assertEqual(passthru.variables,
+-                         {'AS': 'yasm',
++                         {'AS': 'nasm',
+                           'AS_DASH_C_FLAG': '',
+                           'ASOUTOPTION': '-o '})
+         self.maxDiff = maxDiff
+ 
+     def test_generated_files(self):
+         reader = self.reader('generated-files')
+         objs = self.read_topsrcdir(reader)
+ 

+ 197 - 0
mozilla-release/patches/1692940-09-88a1.patch

@@ -0,0 +1,197 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043606 0
+#      Tue Feb 23 01:26:46 2021 +0000
+# Node ID fdba4b86b00aa04bbc8b1c42862b2f4e37b2b101
+# Parent  f2b3e8c02580d35fbabd8580c4573dc7bd49ffc0
+Bug 1692940 - Remove build system support for yasm. r=firefox-build-system-reviewers,dmajor
+
+Differential Revision: https://phabricator.services.mozilla.com/D105432
+
+diff --git a/build/moz.configure/toolchain.configure b/build/moz.configure/toolchain.configure
+--- a/build/moz.configure/toolchain.configure
++++ b/build/moz.configure/toolchain.configure
+@@ -54,54 +54,16 @@ def moz_optimize(option, _):
+         flags=flags,
+     )
+ 
+ 
+ set_config('MOZ_OPTIMIZE', moz_optimize.optimize)
+ add_old_configure_assignment('MOZ_OPTIMIZE', moz_optimize.optimize)
+ add_old_configure_assignment('MOZ_CONFIGURE_OPTIMIZE_FLAGS', moz_optimize.flags)
+ 
+-# yasm detection
+-# ==============================================================
+-yasm = check_prog('YASM', ['yasm'], allow_missing=True)
+-
+-
+-@depends_if(yasm)
+-@checking('yasm version')
+-def yasm_version(yasm):
+-    version = check_cmd_output(
+-        yasm, '--version',
+-        onerror=lambda: die('Failed to get yasm version.')
+-    ).splitlines()[0].split()[1]
+-    return Version(version)
+-
+-
+-@depends(yasm, target)
+-def yasm_asflags(yasm, target):
+-    if yasm:
+-        asflags = {
+-            ('OSX', 'x86'): ['-f', 'macho32'],
+-            ('OSX', 'x86_64'): ['-f', 'macho64'],
+-            ('WINNT', 'x86'): ['-f', 'win32'],
+-            ('WINNT', 'x86_64'): ['-f', 'x64'],
+-        }.get((target.os, target.cpu), None)
+-        if asflags is None:
+-            # We're assuming every x86 platform we support that's
+-            # not Windows or Mac is ELF.
+-            if target.cpu == 'x86':
+-                asflags = ['-f', 'elf32']
+-            elif target.cpu == 'x86_64':
+-                asflags = ['-f', 'elf64']
+-        if asflags:
+-            asflags += ['-rnasm', '-pnasm']
+-        return asflags
+-
+-
+-set_config('YASM_ASFLAGS', yasm_asflags)
+-
+ 
+ # Android NDK
+ # ==============================================================
+ 
+ 
+ @depends('--disable-compile-environment', build_project)
+ def compiling_android(compile_env, build_project):
+     return compile_env and build_project in ('mobile/android', 'js')
+diff --git a/python/mozbuild/mozbuild/frontend/context.py b/python/mozbuild/mozbuild/frontend/context.py
+--- a/python/mozbuild/mozbuild/frontend/context.py
++++ b/python/mozbuild/mozbuild/frontend/context.py
+@@ -398,22 +398,16 @@ class AsmFlags(BaseCompileFlags):
+         if (self._context.config.substs.get('MOZ_DEBUG') or
+             self._context.config.substs.get('MOZ_DEBUG_SYMBOLS')):
+             if self._context.get('USE_NASM'):
+                 if (self._context.config.substs.get('OS_ARCH') == 'WINNT' and
+                     not self._context.config.substs.get('GNU_CC')):
+                     debug_flags += ['-F', 'cv8']
+                 elif self._context.config.substs.get('OS_ARCH') != 'Darwin':
+                     debug_flags += ['-F', 'dwarf']
+-            elif self._context.get('USE_YASM'):
+-                if (self._context.config.substs.get('OS_ARCH') == 'WINNT' and
+-                    not self._context.config.substs.get('GNU_CC')):
+-                    debug_flags += ['-g', 'cv8']
+-                elif self._context.config.substs.get('OS_ARCH') != 'Darwin':
+-                    debug_flags += ['-g', 'dwarf2']
+             elif (self._context.config.substs.get('OS_ARCH') == 'WINNT' and
+                   self._context.config.substs.get('CPU_ARCH') == 'aarch64'):
+                 # armasm64 accepts a paucity of options compared to ml/ml64.
+                 pass
+             else:
+                 debug_flags += self._context.config.substs.get('MOZ_DEBUG_FLAGS', '').split()
+         return debug_flags
+ 
+@@ -2264,27 +2258,16 @@ VARIABLES = {
+         By default, the build will use the toolchain assembler, $(AS), to
+         assemble source files in assembly language (.s or .asm files). Setting
+         this value to ``True`` will cause it to use nasm instead.
+ 
+         If nasm is not available on this system, or does not support the
+         current target architecture, an error will be raised.
+         """),
+ 
+-    'USE_YASM': (bool, bool,
+-                 """Use the yasm assembler to assemble assembly files from SOURCES.
+-
+-        By default, the build will use the toolchain assembler, $(AS), to
+-        assemble source files in assembly language (.s or .asm files). Setting
+-        this value to ``True`` will cause it to use yasm instead.
+-
+-        If yasm is not available on this system, or does not support the
+-        current target architecture, an error will be raised.
+-        """),
+-
+     'USE_INTEGRATED_CLANGCL_AS': (bool, bool,
+         """Use the integrated clang-cl assembler to assemble assembly files from SOURCES.
+ 
+         This allows using clang-cl to assemble assembly files which is useful
+         on platforms like aarch64 where the alternative is to have to run a
+         pre-processor to generate files with suitable syntax.
+         """),
+ }
+diff --git a/python/mozbuild/mozbuild/frontend/emitter.py b/python/mozbuild/mozbuild/frontend/emitter.py
+--- a/python/mozbuild/mozbuild/frontend/emitter.py
++++ b/python/mozbuild/mozbuild/frontend/emitter.py
+@@ -1307,26 +1307,16 @@ class TreeMetadataEmitter(LoggingMixin):
+             yield obj
+ 
+         for obj in self._process_jar_manifests(context):
+             yield obj
+ 
+         computed_as_flags.resolve_flags('MOZBUILD',
+                                         context.get('ASFLAGS'))
+ 
+-        if context.get('USE_YASM') is True:
+-            yasm = context.config.substs.get('YASM')
+-            if not yasm:
+-                raise SandboxValidationError('yasm is not available', context)
+-            passthru.variables['AS'] = yasm
+-            passthru.variables['AS_DASH_C_FLAG'] = ''
+-            passthru.variables['ASOUTOPTION'] = '-o '
+-            computed_as_flags.resolve_flags('OS',
+-                                            context.config.substs.get('YASM_ASFLAGS', []))
+-
+         if context.get('USE_NASM') is True:
+             nasm = context.config.substs.get('NASM')
+             if not nasm:
+                 raise SandboxValidationError('nasm is not available', context)
+             passthru.variables['AS'] = nasm
+             passthru.variables['AS_DASH_C_FLAG'] = ''
+             passthru.variables['ASOUTOPTION'] = '-o '
+             computed_as_flags.resolve_flags('OS',
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1468,41 +1468,16 @@ with only_when(compile_environment):
+     set_config('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+     set_config("FFVPX_USE_NASM", True, when=ffvpx.use_nasm)
+ 
+ 
+-@depends(yasm_version,
+-)
+-@imports(_from='__builtin__', _import='sorted')
+-def valid_yasm_version(yasm_version):
+-    requires = {
+-    }
+-    requires = {k: v for (k, v) in requires.items() if v}
+-    if requires and not yasm_version:
+-        items = sorted(requires.keys())
+-        if len(items) > 1:
+-            what = ' and '.join((', '.join(items[:-1]), items[-1]))
+-        else:
+-            what = items[0]
+-        die('Yasm is required to build with %s, but you do not appear to have '
+-            'Yasm installed.' % what)
+-
+-    versioned = {k: v for (k, v) in requires.items() if v is not True}
+-    by_version = sorted(versioned.items(), key=lambda x: x[1])
+-    if by_version:
+-        what, version = by_version[-1]
+-        if yasm_version < version:
+-            die('Yasm version %s or greater is required to build with %s.'
+-                % (version, what))
+-
+-
+ # nasm detection
+ # ==============================================================
+ @depends(dav1d_nasm, vpx_nasm, jpeg_nasm, ffvpx_nasm)
+ def need_nasm(*requirements):
+     requires = {
+         x.what: x.version if hasattr(x, "version") else True for x in requirements if x
+     }
+     if requires:
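
Editorial note on the yasm_asflags block deleted in this hunk: the removed flags ran yasm in its NASM-compatible mode — -pnasm selects yasm's NASM-syntax parser and -rnasm its NASM-style preprocessor — which is why the consumers migrated earlier in this series could move to real nasm without touching the assembly sources. A sketch of the old invocation, with a hypothetical file name:

    # Shape of the command the removed yasm_asflags used to produce on linux64.
    cmd = ["yasm", "-f", "elf64", "-rnasm", "-pnasm", "-o", "foo.o", "foo.asm"]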

+ 397 - 0
mozilla-release/patches/1692940-10no11-88a1.patch

@@ -0,0 +1,397 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614043607 0
+#      Tue Feb 23 01:26:47 2021 +0000
+# Node ID 2310bd4635a257928870e576995dd76151f2983e
+# Parent  aa2eda8a84e69b50cd81a13ea5e28fd445c9f834
+Bug 1692940 - Don't bootstrap yasm. r=firefox-build-system-reviewers,dmajor
+
+Differential Revision: https://phabricator.services.mozilla.com/D105599
+
+diff --git a/python/mozboot/mozboot/archlinux.py b/python/mozboot/mozboot/archlinux.py
+--- a/python/mozboot/mozboot/archlinux.py
++++ b/python/mozboot/mozboot/archlinux.py
+@@ -44,17 +44,16 @@ class ArchlinuxBootstrapper(
+         'libvpx',
+         'libxt',
+         'mime-types',
+         'nasm',
+         'startup-notification',
+         'gst-plugins-base-libs',
+         'libpulse',
+         'xorg-server-xvfb',
+-        'yasm',
+         'gst-libav',
+         'gst-plugins-good',
+     ]
+ 
+     BROWSER_AUR_PACKAGES = [
+         'https://aur.archlinux.org/cgit/aur.git/snapshot/uuid.tar.gz',
+     ]
+ 
+diff --git a/python/mozboot/mozboot/centosfedora.py b/python/mozboot/mozboot/centosfedora.py
+--- a/python/mozboot/mozboot/centosfedora.py
++++ b/python/mozboot/mozboot/centosfedora.py
+@@ -1,16 +1,14 @@
+ # This Source Code Form is subject to the terms of the Mozilla Public
+ # License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ # You can obtain one at http://mozilla.org/MPL/2.0/.
+ 
+ from __future__ import absolute_import, print_function, unicode_literals
+ 
+-import platform
+-
+ from mozboot.base import BaseBootstrapper
+ from mozboot.linux_common import LinuxBootstrapper
+ 
+ 
+ class CentOSFedoraBootstrapper(
+         LinuxBootstrapper,
+         BaseBootstrapper):
+     def __init__(self, distro, version, dist_id, **kwargs):
+@@ -36,17 +34,16 @@ class CentOSFedoraBootstrapper(
+             'alsa-lib-devel',
+             'dbus-glib-devel',
+             'glibc-static',
+             'libstdc++-static',
+             'libXt-devel',
+             'nasm',
+             'pulseaudio-libs-devel',
+             'wireless-tools-devel',
+-            'yasm',
+         ]
+ 
+         self.mobile_android_packages = [
+             'java-1.8.0-openjdk-devel',
+             # For downloading the Android SDK and NDK.
+             'wget',
+         ]
+ 
+@@ -114,25 +111,16 @@ class CentOSFedoraBootstrapper(
+     def install_mobile_android_artifact_mode_packages(self):
+         self.ensure_mobile_android_packages(artifact_mode=True)
+ 
+     def ensure_browser_packages(self, artifact_mode=False):
+         # TODO: Figure out what not to install for artifact mode
+         self.dnf_groupinstall(*self.browser_group_packages)
+         self.dnf_install(*self.browser_packages)
+ 
+-        if self.distro in ('centos') and self.version == 6:
+-            yasm = ('http://dl.fedoraproject.org/pub/epel/6/i386/'
+-                    'Packages/y/yasm-1.2.0-1.el6.i686.rpm')
+-            if platform.architecture()[0] == '64bit':
+-                yasm = ('http://dl.fedoraproject.org/pub/epel/6/x86_64/'
+-                        'Packages/y/yasm-1.2.0-1.el6.x86_64.rpm')
+-
+-            self.run_as_root(['rpm', '-ivh', yasm])
+-
+     def ensure_mobile_android_packages(self, artifact_mode=False):
+         # Install Android specific packages.
+         self.dnf_install(*self.mobile_android_packages)
+ 
+         self.ensure_java()
+         from mozboot import android
+         android.ensure_android('linux', artifact_mode=artifact_mode,
+                                no_interactive=self.no_interactive)
+diff --git a/python/mozboot/mozboot/debian.py b/python/mozboot/mozboot/debian.py
+--- a/python/mozboot/mozboot/debian.py
++++ b/python/mozboot/mozboot/debian.py
+@@ -60,17 +60,16 @@ class DebianBootstrapper(
+         'libdbus-1-dev',
+         'libdbus-glib-1-dev',
+         'libgtk-3-dev',
+         'libpulse-dev',
+         'libx11-xcb-dev',
+         'libxt-dev',
+         'python-dbus',
+         'xvfb',
+-        'yasm',
+     ]
+ 
+     # Subclasses can add packages to this variable to have them installed.
+     BROWSER_DISTRO_PACKAGES = []
+ 
+     # These are common packages for building Firefox for Android
+     # (mobile/android) for all Debian-derived distros (such as Ubuntu).
+     MOBILE_ANDROID_COMMON_PACKAGES = [
+diff --git a/python/mozboot/mozboot/freebsd.py b/python/mozboot/mozboot/freebsd.py
+--- a/python/mozboot/mozboot/freebsd.py
++++ b/python/mozboot/mozboot/freebsd.py
+@@ -27,17 +27,16 @@ class FreeBSDBootstrapper(BaseBootstrapp
+         self.browser_packages = [
+             'dbus-glib',
+             'gtk3',
+             'libXt',
+             'mesa-dri',  # depends on llvm*
+             'nasm',
+             'pulseaudio',
+             'v4l_compat',
+-            'yasm',
+         ]
+ 
+         if not self.which('as'):
+             self.packages.append('binutils')
+ 
+         if not self.which('unzip'):
+             self.packages.append('unzip')
+ 
+diff --git a/python/mozboot/mozboot/gentoo.py b/python/mozboot/mozboot/gentoo.py
+--- a/python/mozboot/mozboot/gentoo.py
++++ b/python/mozboot/mozboot/gentoo.py
+@@ -38,17 +38,16 @@ class GentooBootstrapper(
+                           'app-arch/zip',
+                           'sys-devel/autoconf:2.1'
+                           ])
+ 
+     def ensure_browser_packages(self, artifact_mode=False):
+         # TODO: Figure out what not to install for artifact mode
+         self.run_as_root(['emerge',
+                           '--oneshot', '--noreplace', '--quiet', '--newuse',
+-                          'dev-lang/yasm',
+                           'dev-libs/dbus-glib',
+                           'media-sound/pulseaudio',
+                           'x11-libs/gtk+:3',
+                           'x11-libs/libXt'
+                           ])
+ 
+     def ensure_mobile_android_packages(self, artifact_mode=False):
+         self.run_as_root(['emerge', '--noreplace', '--quiet',
+diff --git a/python/mozboot/mozboot/gentoo.py.1672894.later b/python/mozboot/mozboot/gentoo.py.1672894.later
+--- a/python/mozboot/mozboot/gentoo.py.1672894.later
++++ b/python/mozboot/mozboot/gentoo.py.1672894.later
+@@ -11,10 +11,9 @@
+                            'app-arch/zip',
+ -                          'sys-devel/autoconf:2.1'
+                            ])
+  
+      def ensure_browser_packages(self, artifact_mode=False):
+          # TODO: Figure out what not to install for artifact mode
+          self.run_as_root(['emerge',
+                            '--oneshot', '--noreplace', '--quiet', '--newuse',
+-                           'dev-lang/yasm',
+                            'dev-libs/dbus-glib',
+diff --git a/python/mozboot/mozboot/openbsd.py b/python/mozboot/mozboot/openbsd.py
+--- a/python/mozboot/mozboot/openbsd.py
++++ b/python/mozboot/mozboot/openbsd.py
+@@ -18,17 +18,16 @@ class OpenBSDBootstrapper(BaseBootstrapp
+             'wget',
+             'unzip',
+             'zip',
+         ]
+ 
+         self.browser_packages = [
+             'llvm',
+             'nasm',
+-            'yasm',
+             'gtk+3',
+             'dbus-glib',
+             'pulseaudio',
+         ]
+ 
+     def install_system_packages(self):
+         # we use -z because there's no other way to say "any autoconf-2.13"
+         self.run_as_root(['pkg_add', '-z'] + self.packages)
+diff --git a/python/mozboot/mozboot/opensuse.py b/python/mozboot/mozboot/opensuse.py
+--- a/python/mozboot/mozboot/opensuse.py
++++ b/python/mozboot/mozboot/opensuse.py
+@@ -30,17 +30,16 @@ class OpenSUSEBootstrapper(
+         'gtk3-devel',
+         'dbus-1-glib-devel',
+         'gconf2-devel',
+         'glibc-devel-static',
+         'libstdc++-devel',
+         'libXt-devel',
+         'libproxy-devel',
+         'libuuid-devel',
+-        'yasm',
+         'clang-devel',
+         'patterns-gnome-devel_gnome',
+     ]
+ 
+     BROWSER_GROUP_PACKAGES = [
+         'devel_C_C++',
+         'devel_gnome',
+     ]
+diff --git a/python/mozboot/mozboot/osx.py b/python/mozboot/mozboot/osx.py
+--- a/python/mozboot/mozboot/osx.py
++++ b/python/mozboot/mozboot/osx.py
+@@ -344,17 +344,16 @@ class OSXBootstrapper(BaseBootstrapper):
+             'watchman',
+         ]
+         self._ensure_homebrew_packages(packages)
+ 
+     def ensure_homebrew_browser_packages(self, artifact_mode=False):
+         # TODO: Figure out what not to install for artifact mode
+         packages = [
+             'nasm',
+-            'yasm',
+         ]
+         self._ensure_homebrew_packages(packages)
+ 
+     def ensure_homebrew_mobile_android_packages(self, artifact_mode=False):
+         # Multi-part process:
+         # 1. System packages.
+         # 2. Android SDK. Android NDK only if we are not in artifact mode. Android packages.
+ 
+@@ -423,17 +422,16 @@ class OSXBootstrapper(BaseBootstrapper):
+             self.run_as_root([self.port, 'select', '--set', 'python', 'python27'])
+         else:
+             print('The right python version is already active.')
+ 
+     def ensure_macports_browser_packages(self, artifact_mode=False):
+         # TODO: Figure out what not to install for artifact mode
+         packages = [
+             'nasm',
+-            'yasm',
+             'llvm-7.0',
+             'clang-7.0',
+         ]
+ 
+         self._ensure_macports_packages(packages)
+ 
+     def ensure_macports_mobile_android_packages(self, artifact_mode=False):
+         # Multi-part process:
+diff --git a/python/mozboot/mozboot/osx.py.1692940-10.later b/python/mozboot/mozboot/osx.py.1692940-10.later
+new file mode 100644
+--- /dev/null
++++ b/python/mozboot/mozboot/osx.py.1692940-10.later
+@@ -0,0 +1,76 @@
++--- osx.py
+++++ osx.py
++@@ -207,22 +207,20 @@ class OSXBootstrapper(BaseBootstrapper):
++         if not hg_modern:
++             print(
++                 "Mercurial wasn't found or is not sufficiently modern. "
++                 "It will be installed with %s" % self.package_manager
++             )
++         getattr(self, "ensure_%s_system_packages" % self.package_manager)(not hg_modern)
++ 
++     def install_browser_packages(self, mozconfig_builder):
++-        getattr(self, "ensure_%s_browser_packages" % self.package_manager)()
+++        pass
++ 
++     def install_browser_artifact_mode_packages(self, mozconfig_builder):
++-        getattr(self, "ensure_%s_browser_packages" % self.package_manager)(
++-            artifact_mode=True
++-        )
+++        pass
++ 
++     def install_mobile_android_packages(self, mozconfig_builder):
++         getattr(self, "ensure_%s_mobile_android_packages" % self.package_manager)(
++             mozconfig_builder
++         )
++ 
++     def install_mobile_android_artifact_mode_packages(self, mozconfig_builder):
++         getattr(self, "ensure_%s_mobile_android_packages" % self.package_manager)(
++@@ -384,23 +382,16 @@ class OSXBootstrapper(BaseBootstrapper):
++             "gnu-tar",
++             "terminal-notifier",
++             "watchman",
++         ]
++         if install_mercurial:
++             packages.append("mercurial")
++         self._ensure_homebrew_packages(packages)
++ 
++-    def ensure_homebrew_browser_packages(self, artifact_mode=False):
++-        # TODO: Figure out what not to install for artifact mode
++-        packages = [
++-            "yasm",
++-        ]
++-        self._ensure_homebrew_packages(packages)
++-
++     def ensure_homebrew_mobile_android_packages(
++         self, mozconfig_builder, artifact_mode=False
++     ):
++         # Multi-part process:
++         # 1. System packages.
++         # 2. Android SDK. Android NDK only if we are not in artifact mode. Android packages.
++ 
++         # 1. System packages.
++@@ -462,24 +453,16 @@ class OSXBootstrapper(BaseBootstrapper):
++         for python in pythons:
++             if "active" in python:
++                 active = python
++         if "python27" not in active:
++             self.run_as_root([self.port, "select", "--set", "python", "python27"])
++         else:
++             print("The right python version is already active.")
++ 
++-    def ensure_macports_browser_packages(self, artifact_mode=False):
++-        # TODO: Figure out what not to install for artifact mode
++-        packages = [
++-            "yasm",
++-        ]
++-
++-        self._ensure_macports_packages(packages)
++-
++     def ensure_macports_mobile_android_packages(
++         self, mozconfig_builder, artifact_mode=False
++     ):
++         # Multi-part process:
++         # 1. System packages.
++         # 2. Android SDK. Android NDK only if we are not in artifact mode. Android packages.
++ 
++         # 1. System packages.
+diff --git a/python/mozboot/mozboot/solus.py b/python/mozboot/mozboot/solus.py
+--- a/python/mozboot/mozboot/solus.py
++++ b/python/mozboot/mozboot/solus.py
+@@ -41,17 +41,16 @@ class SolusBootstrapper(
+         'libvpx',
+         'libxt',
+         'nasm',
+         'libstartup-notification',
+         'gst-plugins-base',
+         'gst-plugins-good',
+         'pulseaudio',
+         'xorg-server-xvfb',
+-        'yasm',
+     ]
+ 
+     MOBILE_ANDROID_COMMON_PACKAGES = [
+         'openjdk-8',
+         # For downloading the Android SDK and NDK.
+         'wget',
+         # See comment about 32 bit binaries and multilib below.
+         'ncurses-32bit',
+diff --git a/python/mozboot/mozboot/void.py b/python/mozboot/mozboot/void.py
+--- a/python/mozboot/mozboot/void.py
++++ b/python/mozboot/mozboot/void.py
+@@ -30,17 +30,16 @@ class VoidBootstrapper(
+         'dbus-devel',
+         'dbus-glib-devel',
+         'gtk+3-devel',
+         'pulseaudio',
+         'pulseaudio-devel',
+         'libcurl-devel',
+         'libxcb-devel',
+         'libXt-devel',
+-        'yasm',
+     ]
+ 
+     MOBILE_ANDROID_PACKAGES = [
+         'openjdk8',  # Android's `sdkmanager` requires Java 1.8 exactly.
+         'wget',  # For downloading the Android SDK and NDK.
+     ]
+ 
+     def __init__(self, version, dist_id, **kwargs):
+diff --git a/python/mozboot/mozboot/windows.py b/python/mozboot/mozboot/windows.py
+--- a/python/mozboot/mozboot/windows.py
++++ b/python/mozboot/mozboot/windows.py
+@@ -52,17 +52,16 @@ class WindowsBootstrapper(BaseBootstrapp
+         'zip',
+         'unzip',
+         'mingw-w64-x86_64-toolchain',  # TODO: Remove when Mercurial is installable from a wheel.
+         'mingw-w64-i686-toolchain'
+     ]
+ 
+     BROWSER_PACKAGES = [
+         'mingw-w64-x86_64-nasm',
+-        'mingw-w64-x86_64-yasm',
+         'mingw-w64-i686-nsis'
+     ]
+ 
+     MOBILE_ANDROID_COMMON_PACKAGES = [
+         'wget'
+     ]
+ 
+     def __init__(self, **kwargs):

+ 32 - 0
mozilla-release/patches/1692940-12-88a1.patch

@@ -0,0 +1,32 @@
+# HG changeset patch
+# User Mihai Alexandru Michis <malexandru@mozilla.com>
+# Date 1614049543 -7200
+#      Tue Feb 23 05:05:43 2021 +0200
+# Node ID 07420cd0943250559d722febc1bee339929ccd24
+# Parent  263520df233676201b8e5ba481a2e9a24e26f083
+Bug 1692940 - Fix bustages on Win 2012 AArch64. a=bustage-fix
+
+CLOSED TREE
+
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1456,17 +1456,17 @@ with only_when(compile_environment):
+     def ffvpx_nasm():
+         # nasm 2.10 for AVX-2 support.
+         return namespace(version="2.10", what="FFVPX")
+ 
+     # ffvpx_nasm can't indirectly depend on vpx_as_flags, because it depends
+     # on a compiler test, so we have to do a little bit of dance here.
+     @depends(ffvpx, vpx_as_flags, target)
+     def ffvpx(ffvpx, vpx_as_flags, target):
+-        if ffvpx and target.cpu in ("arm", "aarch64"):
++        if ffvpx and vpx_as_flags and target.cpu in ("arm", "aarch64"):
+             ffvpx.flags.extend(vpx_as_flags)
+         return ffvpx
+ 
+     set_config('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
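
Editorial note: the one-line change above is easy to miss. The fix implies that on the Windows 2012 AArch64 builders the vpx_as_flags dependency resolves to something falsy (most likely None, since no assembler flags apply there), and extending a list with None raises, which is what broke the tree. A minimal illustration with made-up values:

    flags = ["-DPIC", "-DWIN64"]
    vpx_as_flags = None           # what the dependency yields when no flags apply

    if vpx_as_flags:              # the added guard
        flags.extend(vpx_as_flags)
    # without the guard: flags.extend(None) raises
    # TypeError: 'NoneType' object is not iterable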

+ 33 - 0
mozilla-release/patches/1692945-1-87a1.patch

@@ -0,0 +1,33 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1613510225 0
+# Node ID 36fd576c711b0d343d59acdfbcb9adf9b1714f91
+# Parent  bb9883884262859c95df32b8d1e0ed62ca38bf11
+Bug 1692945 - Remove check for yasm/gas in js. r=firefox-build-system-reviewers,andi,dmajor
+
+Building the ICU data file hasn't required yasm since bug 1650299, and
+all the compilers we support now have a GAS-like assembler that we use
+unconditionally.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105268
+
+diff --git a/js/moz.configure b/js/moz.configure
+--- a/js/moz.configure
++++ b/js/moz.configure
+@@ -523,16 +523,8 @@ def icu_version(build_env):
+ set_config('MOZ_ICU_VERSION', icu_version)
+ 
+ # Source files that use ICU should have control over which parts of the ICU
+ # namespace they want to use.
+ set_define('U_USING_ICU_NAMESPACE', '0', when='--with-intl-api')
+ 
+ # We build ICU as a static library.
+ set_define('U_STATIC_IMPLEMENTATION', True, when=depends(system_icu)(lambda x: not x))
+-
+-@depends(yasm, gnu_as, target, compile_environment)
+-def can_build_data_file(yasm, gnu_as, target, compile_environment):
+-    if not compile_environment or (target.kernel == 'WINNT' and target.cpu == 'aarch64'):
+-        return
+-    if not yasm and not gnu_as:
+-        die('Building ICU requires either yasm or a GNU assembler. If you do not have '
+-            'either of those available for this platform you must use --without-intl-api')

+ 80 - 0
mozilla-release/patches/1692945-2-87a1.patch

@@ -0,0 +1,80 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1613510226 0
+# Node ID 44e6140070a0b4088e1ca8017611af569e57c6d7
+# Parent  4e0e7d9feca0f1152637d1add13c3d145d5eca46
+Bug 1692945 - Remove unused [YN]ASM variables. r=firefox-build-system-reviewers,andi,dmajor
+
+None of HAVE_NASM, HAVE_YASM, NASM_MAJOR_VERSION and NASM_MINOR_VERSION are
+used. Also, the YASM variable is not necessary for old-configure anymore.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105269
+
+diff --git a/build/moz.configure/toolchain.configure b/build/moz.configure/toolchain.configure
+--- a/build/moz.configure/toolchain.configure
++++ b/build/moz.configure/toolchain.configure
+@@ -2165,30 +2165,16 @@ def nasm_version(nasm):
+     if retcode:
+         # mac stub binary
+         return None
+ 
+     version = stdout.splitlines()[0].split()[2]
+     return Version(version)
+ 
+ 
+-@depends_if(nasm_version)
+-def nasm_major_version(nasm_version):
+-    return str(nasm_version.major)
+-
+-
+-@depends_if(nasm_version)
+-def nasm_minor_version(nasm_version):
+-    return str(nasm_version.minor)
+-
+-
+-set_config('NASM_MAJOR_VERSION', nasm_major_version)
+-set_config('NASM_MINOR_VERSION', nasm_minor_version)
+-
+-
+ @depends(nasm, target)
+ def nasm_asflags(nasm, target):
+     if nasm:
+         asflags = {
+             ('OSX', 'x86'): ['-f', 'macho32'],
+             ('OSX', 'x86_64'): ['-f', 'macho64'],
+             ('WINNT', 'x86'): ['-f', 'win32'],
+             ('WINNT', 'x86_64'): ['-f', 'win64'],
+@@ -2200,33 +2186,16 @@ def nasm_asflags(nasm, target):
+                 asflags = ['-f', 'elf32']
+             elif target.cpu == 'x86_64':
+                 asflags = ['-f', 'elf64']
+         return asflags
+ 
+ 
+ set_config('NASM_ASFLAGS', nasm_asflags)
+ 
+-@depends(nasm_asflags)
+-def have_nasm(value):
+-    if value:
+-        return True
+-
+-
+-@depends(yasm_asflags)
+-def have_yasm(yasm_asflags):
+-    if yasm_asflags:
+-        return True
+-
+-set_config('HAVE_NASM', have_nasm)
+-
+-set_config('HAVE_YASM', have_yasm)
+-# Until the YASM variable is not necessary in old-configure.
+-add_old_configure_assignment('YASM', have_yasm)
+-
+ 
+ # clang-cl integrated assembler support
+ # ==============================================================
+ @depends(target)
+ def clangcl_asflags(target):
+     asflags = None
+     if target.os == 'WINNT' and target.cpu == 'aarch64':
+         asflags = ['--target=aarch64-windows-msvc']

+ 122 - 0
mozilla-release/patches/1693215-1-88a1.patch

@@ -0,0 +1,122 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614031885 0
+# Node ID cda1a9bb647e59bfb0c10821daa02c9f1728ed5a
+# Parent  495ef3b30ea9fbd85f2c69fe0970c6cc7a817935
+Bug 1693215 - Always depend on yasm >= 1.2 for ffvpx. r=firefox-build-system-reviewers,dmajor
+
+Yasm 1.2 was released in October 2011. Let's just assume everyone can
+use that now.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105398
+
+diff --git a/media/ffvpx/ffvpxcommon.mozbuild b/media/ffvpx/ffvpxcommon.mozbuild
+--- a/media/ffvpx/ffvpxcommon.mozbuild
++++ b/media/ffvpx/ffvpxcommon.mozbuild
+@@ -15,20 +15,16 @@ if CONFIG['FFVPX_ASFLAGS']:
+     if CONFIG['FFVPX_USE_YASM']:
+         USE_YASM = True
+ 
+     if CONFIG['OS_ARCH'] == 'WINNT':
+        # Fix inline symbols and math defines for windows.
+         DEFINES['_USE_MATH_DEFINES'] = True
+         DEFINES['inline'] = "__inline"
+ 
+-    if USE_YASM and not CONFIG['YASM_HAS_AVX2']:
+-        DEFINES['YASM_MISSING_AVX2'] = True
+-
+-
+ LOCAL_INCLUDES += ['/media/ffvpx']
+ 
+ # We allow warnings for third-party code that can be updated from upstream.
+ AllowCompilerWarnings()
+ 
+ # Suppress warnings in third-party code.
+ if CONFIG['CC_TYPE'] in ('clang', 'clang-cl', 'gcc'):
+     CFLAGS += [
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1448,35 +1448,27 @@ with only_when(compile_environment):
+                 flags = ['-D__x86_64__', '-DPIC', '-DELF']
+         if flags:
+             if target.kernel == 'Linux' and target.os == 'GNU':
+                 need_yasm = Version('1.0.1')
+             else:
+                 need_yasm = Version('1.1')
+             return namespace(flags=flags, need_yasm=need_yasm)
+ 
+-
+     set_config('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+     set_define('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+     set_config('LIBAV_FFT_ASFLAGS', libav_fft.flags)
+ 
+ 
+ # FFmpeg's ffvpx configuration
+ # ==============================================================
+ with only_when(compile_environment):
+-    @depends_if(yasm_version)
+-    def yasm_has_avx2(yasm_version):
+-        return yasm_version >= '1.2'
+ 
+-
+-    set_config('YASM_HAS_AVX2', yasm_has_avx2)
+-
+-
+-    @depends(yasm_has_avx2, libav_fft, vpx_as_flags, target)
+-    def ffvpx(yasm_has_avx2, libav_fft, vpx_as_flags, target):
++    @depends(libav_fft, vpx_as_flags, target)
++    def ffvpx(libav_fft, vpx_as_flags, target):
+         enable = flac_only = use_yasm = False
+         flags = []
+         if target.cpu in ('x86', 'x86_64') or \
+                 target.cpu == 'aarch64' and target.kernel in ('WINNT', 'Darwin'):
+             enable = True
+             if libav_fft and libav_fft.flags:
+                 use_yasm = True
+                 flags.extend(libav_fft.flags)
+@@ -1500,41 +1492,35 @@ with only_when(compile_environment):
+         elif target.cpu in ('arm', 'aarch64') and \
+                 target.kernel not in ('WINNT', 'Darwin'):
+             enable = flac_only = True
+             flags.extend(vpx_as_flags)
+ 
+         if use_yasm:
+             # default disabled components
+             flags.append('-Pdefaults_disabled.asm')
+-            if not yasm_has_avx2:
+-                flags.extend((
+-                    '-DHAVE_AVX2=0',
+-                    '-DHAVE_AVX2_INTERNAL=0',
+-                    '-DHAVE_AVX2_EXTERNAL=0',
+-                ))
+ 
+         return namespace(
+             enable=enable,
+-            use_yasm=use_yasm,
++            need_yasm="1.2" if use_yasm else False,
+             flac_only=flac_only,
+             flags=flags,
+         )
+ 
+ 
+     set_config('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+-    set_config('FFVPX_USE_YASM', True, when=ffvpx.use_yasm)
++    set_config("FFVPX_USE_YASM", True, when=ffvpx.need_yasm)
+ 
+ 
+ @depends(yasm_version, in_tree_vpx.need_yasm, in_tree_jpeg.use_yasm,
+-         libav_fft.need_yasm, ffvpx.use_yasm)
++         libav_fft.need_yasm, ffvpx.need_yasm)
+ @imports(_from='__builtin__', _import='sorted')
+ def valid_yasm_version(yasm_version, for_vpx, for_jpeg, for_libav,
+                        for_ffvpx=False):
+     # Note: the default for for_ffvpx above only matters for unit tests.
+     requires = {
+         'vpx': for_vpx,
+         'jpeg': for_jpeg,
+         'libav': for_libav,

+ 168 - 0
mozilla-release/patches/1693215-2-88a1.patch

@@ -0,0 +1,168 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614031886 0
+#      Mon Feb 22 22:11:26 2021 +0000
+# Node ID 8680ed398ecf9bf61073ecd1cbf9756ddbdc00e4
+# Parent  0227a9e4d949c52b44cc2f5fdf3bc93d575d08f2
+Bug 1693215 - Don't check for yasm for libav. r=firefox-build-system-reviewers,dmajor
+
+Bug 1476231 actually removed libav, so we don't build it, and we need
+neither the yasm check nor the LIBAV_FFT_ASFLAGS variable.
+However, we still have checks, both in moz.build and code, for
+MOZ_LIBAV_FFT, so we need to keep that.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105399
+
+diff --git a/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py b/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py
+--- a/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py
++++ b/python/mozbuild/mozbuild/test/configure/test_toolkit_moz_configure.py
+@@ -88,78 +88,78 @@ class TestToolkitMozConfigure(BaseConfig
+         self.assertEqual(get_value(environ={'MOZ_AUTOMATION': 1}), None)
+ 
+     def test_valid_yasm_version(self):
+         out = StringIO()
+         sandbox = self.get_sandbox({}, {}, out=out)
+         func = sandbox._depends[sandbox['valid_yasm_version']]._func
+ 
+         # Missing yasm is not an error when nothing requires it.
+-        func(None, False, False, False)
++        func(None, False, False)
+ 
+         # Any version of yasm works when nothing requires it.
+-        func(Version('1.0'), False, False, False)
++        func(Version('1.0'), False, False)
+ 
+         # Any version of yasm works when something requires any version.
+-        func(Version('1.0'), True, False, False)
+-        func(Version('1.0'), True, True, False)
+-        func(Version('1.0'), False, True, False)
++        func(Version('1.0'), True, False)
++        func(Version('1.0'), True, True)
++        func(Version('1.0'), False, True)
+ 
+         # A version of yasm greater than any requirement works.
+-        func(Version('1.5'), Version('1.0'), True, False)
+-        func(Version('1.5'), True, Version('1.0'), False)
+-        func(Version('1.5'), Version('1.1'), Version('1.0'), False)
++        func(Version("1.5"), Version("1.0"), True)
++        func(Version("1.5"), True, Version("1.0"))
++        func(Version("1.5"), Version("1.1"), Version("1.0"))
+ 
+         out.truncate(0)
+         out.seek(0)
+         with self.assertRaises(SystemExit):
+-            func(None, Version('1.0'), False, False)
++            func(None, Version('1.0'), False)
+ 
+         self.assertEqual(
+             out.getvalue(),
+             ('ERROR: Yasm is required to build with vpx, but you do not appear '
+              'to have Yasm installed.\n'),
+         )
+ 
+         out.truncate(0)
+         out.seek(0)
+         with self.assertRaises(SystemExit):
+-            func(None, Version('1.0'), Version('1.0'), False)
++            func(None, Version('1.0'), Version('1.0'))
+ 
+         self.assertEqual(
+             out.getvalue(),
+             ('ERROR: Yasm is required to build with jpeg and vpx, but you do not appear '
+              'to have Yasm installed.\n'),
+         )
+ 
+         out.truncate(0)
+         out.seek(0)
+         with self.assertRaises(SystemExit):
+-            func(None, Version('1.0'), Version('1.0'), Version('1.0'))
++            func(None, Version('1.0'), Version('1.0'))
+ 
+         self.assertEqual(
+             out.getvalue(),
+             ('ERROR: Yasm is required to build with jpeg, libav and vpx, but you do not appear '
+              'to have Yasm installed.\n'),
+         )
+ 
+         out.truncate(0)
+         out.seek(0)
+         with self.assertRaises(SystemExit):
+-            func(Version('1.0'), Version('1.1'), Version('1.0'), False)
++            func(Version('1.0'), Version('1.1'), Version('1.0'))
+ 
+         self.assertEqual(
+             out.getvalue(),
+             'ERROR: Yasm version 1.1 or greater is required to build with vpx.\n'
+         )
+ 
+         out.truncate(0)
+         out.seek(0)
+         with self.assertRaises(SystemExit):
+-            func(Version('1.0'), True, Version('1.0.1'), False)
++            func(Version('1.0'), True, Version('1.0.1'))
+ 
+         self.assertEqual(
+             out.getvalue(),
+             'ERROR: Yasm version 1.0.1 or greater is required to build with jpeg.\n'
+         )
+ 
+ 
+ if __name__ == '__main__':
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1442,25 +1442,20 @@ with only_when(compile_environment):
+         elif target.cpu == 'x86_64':
+             if target.kernel == 'Darwin':
+                 flags = ['-D__x86_64__', '-DPIC', '-DMACHO']
+             elif target.kernel == 'WINNT':
+                 flags = ['-D__x86_64__', '-DPIC', '-DWIN64', '-DMSVC']
+             else:
+                 flags = ['-D__x86_64__', '-DPIC', '-DELF']
+         if flags:
+-            if target.kernel == 'Linux' and target.os == 'GNU':
+-                need_yasm = Version('1.0.1')
+-            else:
+-                need_yasm = Version('1.1')
+-            return namespace(flags=flags, need_yasm=need_yasm)
++            return namespace(flags=flags)
+ 
+     set_config('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+     set_define('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+-    set_config('LIBAV_FFT_ASFLAGS', libav_fft.flags)
+ 
+ 
+ # FFmpeg's ffvpx configuration
+ # ==============================================================
+ with only_when(compile_environment):
+ 
+     @depends(libav_fft, vpx_as_flags, target)
+     def ffvpx(libav_fft, vpx_as_flags, target):
+@@ -1510,25 +1505,23 @@ with only_when(compile_environment):
+     set_define('MOZ_FFVPX', True, when=ffvpx.enable)
+     set_config('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+     set_config("FFVPX_USE_YASM", True, when=ffvpx.need_yasm)
+ 
+ 
+ @depends(yasm_version, in_tree_vpx.need_yasm, in_tree_jpeg.use_yasm,
+-         libav_fft.need_yasm, ffvpx.need_yasm)
++         ffvpx.need_yasm)
+ @imports(_from='__builtin__', _import='sorted')
+-def valid_yasm_version(yasm_version, for_vpx, for_jpeg, for_libav,
+-                       for_ffvpx=False):
++def valid_yasm_version(yasm_version, for_vpx, for_jpeg, for_ffvpx=False):
+     # Note: the default for for_ffvpx above only matters for unit tests.
+     requires = {
+         'vpx': for_vpx,
+         'jpeg': for_jpeg,
+-        'libav': for_libav,
+         'ffvpx': for_ffvpx,
+     }
+     requires = {k: v for (k, v) in requires.items() if v}
+     if requires and not yasm_version:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = ' and '.join((', '.join(items[:-1]), items[-1]))
+         else:
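
For reference only, a minimal standalone sketch of the requirement-joining logic that valid_yasm_version keeps once the libav argument is dropped, matching the messages the tests above assert on. The name describe_missing and the plain-dict input are illustrative stand-ins; the real logic lives inside valid_yasm_version in toolkit/moz.configure.

    # Sketch: build the "jpeg and vpx" style component list used in the
    # "Yasm is required to build with ..." error message.
    def describe_missing(requires):
        # Keep only the components that actually require yasm.
        requires = {k: v for (k, v) in requires.items() if v}
        if not requires:
            return None
        items = sorted(requires.keys())
        if len(items) > 1:
            what = ' and '.join((', '.join(items[:-1]), items[-1]))
        else:
            what = items[0]
        return ('Yasm is required to build with %s, but you do not appear '
                'to have Yasm installed.' % what)

    # describe_missing({'jpeg': '1.0.1', 'vpx': '1.1', 'ffvpx': False})
    # -> 'Yasm is required to build with jpeg and vpx, but you do not appear ...'
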

+ 152 - 0
mozilla-release/patches/1693215-3-88a1.patch

@@ -0,0 +1,152 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1614031886 0
+# Node ID 5d84014a7bff591657ed6cc47c1d8b108121d4a5
+# Parent  09c864df41d9750f21bcc6a541303ee03a99902d
+Bug 1693215 - Simplify the ffvpx configuration. r=firefox-build-system-reviewers,dmajor
+
+Since MOZ_LIBAV_FFT doesn't need the flags, we simplify the libav_fft
+function to return true in the conditions it used to return flags for,
+which is clearer.
+We then move all these flags to the ffvpx function, and rearrange the
+tests to be less convoluted, and with hope, more readable.
+
+This has been verified to not change the outcomes on the following
+targets:
+- i686-pc-linux-gnu
+- x86_64-pc-linux-gnu
+- aarch64-unknown-linux-gnu
+- arm-unknown-linux-gnueabi
+- s390x-unknown-linux-gnu
+- i686-apple-darwin
+- x86_64-apple-darwin
+- aarch64-apple-darwin
+- i686-pc-mingw32
+- x86_64-pc-mingw32
+- aarch64-pc-mingw32
+
+Differential Revision: https://phabricator.services.mozilla.com/D105400
+
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1385,75 +1385,75 @@ with only_when(compile_environment):
+                 need_yasm = Version('1.1')
+ 
+         return namespace(flags=flags, use_yasm=use_yasm, need_yasm=need_yasm)
+ 
+     set_config('LIBJPEG_TURBO_USE_YASM', in_tree_jpeg.use_yasm)
+     set_config('LIBJPEG_TURBO_ASFLAGS', in_tree_jpeg.flags)
+ 
+ 
+-# Libav-fft Support
++# FFmpeg's ffvpx configuration
+ # ==============================================================
+ with only_when(compile_environment):
+     @depends(target)
+     def libav_fft(target):
+-        flags = None
+-        if target.kernel == 'WINNT' and target.cpu == 'x86':
+-            flags = ['-DPIC', '-DWIN32']
+-        elif target.kernel == 'WINNT' and target.cpu == 'aarch64':
+-            flags = ['-DPIC', '-DWIN64']
+-        elif target.cpu == 'x86_64':
+-            if target.kernel == 'Darwin':
+-                flags = ['-D__x86_64__', '-DPIC', '-DMACHO']
+-            elif target.kernel == 'WINNT':
+-                flags = ['-D__x86_64__', '-DPIC', '-DWIN64', '-DMSVC']
+-            else:
+-                flags = ['-D__x86_64__', '-DPIC', '-DELF']
+-        if flags:
+-            return namespace(flags=flags)
++        return target.kernel == "WINNT" or target.cpu == "x86_64"
+ 
+     set_config('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+     set_define('MOZ_LIBAV_FFT', depends(when=libav_fft)(lambda: True))
+ 
+ 
+-# FFmpeg's ffvpx configuration
+-# ==============================================================
+ with only_when(compile_environment):
+ 
+-    @depends(libav_fft, vpx_as_flags, target)
+-    def ffvpx(libav_fft, vpx_as_flags, target):
+-        enable = flac_only = use_yasm = False
++    @depends(vpx_as_flags, target)
++    def ffvpx(vpx_as_flags, target):
++        enable = use_yasm = True
++        flac_only = False
+         flags = []
+-        if target.cpu in ('x86', 'x86_64') or \
+-                target.cpu == 'aarch64' and target.kernel in ('WINNT', 'Darwin'):
+-            enable = True
+-            if libav_fft and libav_fft.flags:
+-                use_yasm = True
+-                flags.extend(libav_fft.flags)
+-                if target.kernel == 'WINNT':
+-                    if target.cpu == 'x86':
+-                        # 32-bit windows need to prefix symbols with an underscore.
+-                        flags.extend(('-DPREFIX', '-Pconfig_win32.asm'))
+-                    elif target.cpu == 'aarch64':
+-                        use_yasm = False
+-                    else:
+-                        flags.append('-Pconfig_win64.asm')
+-                elif target.kernel == 'Darwin':
+-                    # 32/64-bit macosx assemblers need to prefix symbols with an
+-                    # underscore.
+-                    flags.extend(('-DPREFIX', '-Pconfig_darwin64.asm'))
+-                else:
+-                    # Default to unix.
+-                    flags.append('-Pconfig_unix64.asm')
++
++        if target.kernel == "WINNT":
++            if target.cpu == "x86":
++                # 32-bit windows need to prefix symbols with an underscore.
++                flags = ["-DPIC", "-DWIN32", "-DPREFIX", "-Pconfig_win32.asm"]
++            elif target.cpu == "x86_64":
++                flags = [
++                    "-D__x86_64__",
++                    "-DPIC",
++                    "-DWIN64",
++                    "-DMSVC",
++                    "-Pconfig_win64.asm",
++                ]
++            elif target.cpu == "aarch64":
++                flags = ["-DPIC", "-DWIN64"]
++                use_yasm = False
++        elif target.kernel == "Darwin":
++            if target.cpu == "x86_64":
++                # 32/64-bit macosx asemblers need to prefix symbols with an
++                # underscore.
++                flags = [
++                    "-D__x86_64__",
++                    "-DPIC",
++                    "-DMACHO",
++                    "-DPREFIX",
++                    "-Pconfig_darwin64.asm",
++                ]
+             else:
+                 flac_only = True
+-        elif target.cpu in ('arm', 'aarch64') and \
+-                target.kernel not in ('WINNT', 'Darwin'):
+-            enable = flac_only = True
++        elif target.cpu == "x86_64":
++            flags = ["-D__x86_64__", "-DPIC", "-DELF", "-Pconfig_unix64.asm"]
++        elif target.cpu == "x86":
++            flac_only = True
++        elif target.cpu in ("arm", "aarch64"):
++            flac_only = True
+             flags.extend(vpx_as_flags)
++        else:
++            enable = False
++
++        if flac_only or not enable:
++            use_yasm = False
+ 
+         if use_yasm:
+             # default disabled components
+             flags.append('-Pdefaults_disabled.asm')
+ 
+         return namespace(
+             enable=enable,
+             need_yasm="1.2" if use_yasm else False,
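
As a rough cross-check of the rearranged dispatch above, here is a standalone sketch of the assembler flags the new ffvpx function selects for the x86_64 targets only; Target is a stand-in namedtuple for the real configure target object, ffvpx_asflags is a made-up name, and the x86/arm/aarch64 and flac-only branches are collapsed into an empty list.

    from collections import namedtuple

    Target = namedtuple('Target', 'kernel cpu')

    def ffvpx_asflags(target):
        # Mirrors the x86_64 branches of the hunk above; the real function
        # also appends -Pdefaults_disabled.asm whenever yasm is used.
        if target.cpu == 'x86_64':
            if target.kernel == 'WINNT':
                return ['-D__x86_64__', '-DPIC', '-DWIN64', '-DMSVC',
                        '-Pconfig_win64.asm']
            if target.kernel == 'Darwin':
                return ['-D__x86_64__', '-DPIC', '-DMACHO', '-DPREFIX',
                        '-Pconfig_darwin64.asm']
            return ['-D__x86_64__', '-DPIC', '-DELF', '-Pconfig_unix64.asm']
        return []  # win32 x86, arm/aarch64 and flac-only targets differ; see above

    assert '-Pconfig_unix64.asm' in ffvpx_asflags(Target('Linux', 'x86_64'))
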

+ 89 - 0
mozilla-release/patches/1693498-1-88a1.patch

@@ -0,0 +1,89 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1613684055 0
+# Node ID 339e0d88e58b5c20ebe3218d9284d1c3cc5131fa
+# Parent  f055ab9e4d98a103f41ec832fd7e42ec548a8804
+Bug 1693498 - Remove avoid_avx2 option when building openh264. r=mjf
+
+Back when it was added, the nasm used for Linux builds was old enough
+that it didn't support it. We've been using a version that supports avx2
+for a while now.
+
+Differential Revision: https://phabricator.services.mozilla.com/D105618
+
+diff --git a/testing/mozharness/configs/openh264/linux32.py b/testing/mozharness/configs/openh264/linux32.py
+--- a/testing/mozharness/configs/openh264/linux32.py
++++ b/testing/mozharness/configs/openh264/linux32.py
+@@ -13,11 +13,10 @@ config = {
+     'exes': {
+         'gittool.py': [os.path.join(external_tools_path, 'gittool.py')],
+         'tooltool.py': "/builds/tooltool.py",
+         'python2.7': "/tools/python27/bin/python2.7",
+     },
+     'dump_syms_binary': 'dump_syms',
+     'arch': 'x86',
+     'use_mock': True,
+-    'avoid_avx2': True,
+     'operating_system': 'linux',
+ }
+diff --git a/testing/mozharness/configs/openh264/linux64.py b/testing/mozharness/configs/openh264/linux64.py
+--- a/testing/mozharness/configs/openh264/linux64.py
++++ b/testing/mozharness/configs/openh264/linux64.py
+@@ -13,11 +13,10 @@ config = {
+     'exes': {
+         'gittool.py': [os.path.join(external_tools_path, 'gittool.py')],
+         'tooltool.py': "/builds/tooltool.py",
+         'python2.7': "/tools/python27/bin/python2.7",
+     },
+     'dump_syms_binary': 'dump_syms',
+     'arch': 'x64',
+     'use_mock': True,
+-    'avoid_avx2': True,
+     'operating_system': 'linux',
+ }
+diff --git a/testing/mozharness/scripts/openh264_build.py b/testing/mozharness/scripts/openh264_build.py
+--- a/testing/mozharness/scripts/openh264_build.py
++++ b/testing/mozharness/scripts/openh264_build.py
+@@ -77,22 +77,16 @@ class OpenH264Build(MockMixin, TransferM
+             "action": "store_true",
+             "default": False,
+         }],
+         [["--use-yasm"], {
+             "dest": "use_yasm",
+             "help": "use yasm instead of nasm",
+             "action": "store_true",
+             "default": False,
+-        }],
+-        [["--avoid-avx2"], {
+-            "dest": "avoid_avx2",
+-            "help": "Pass HAVE_AVX2='false' through to Make to support older nasm",
+-            "action": "store_true",
+-            "default": False,
+         }]
+     ]
+ 
+     def __init__(self, require_config_file=False, config={},
+                  all_actions=all_actions,
+                  default_actions=default_actions):
+ 
+         # Default configuration
+@@ -160,19 +154,16 @@ class OpenH264Build(MockMixin, TransferM
+         self.fatal("can't determine platform")
+ 
+     def query_make_params(self):
+         dirs = self.query_abs_dirs()
+         retval = []
+         if self.config['debug_build']:
+             retval.append('BUILDTYPE=Debug')
+ 
+-        if self.config['avoid_avx2']:
+-            retval.append('HAVE_AVX2=false')
+-
+         if self.config['arch'] in ('x64', 'aarch64'):
+             retval.append('ENABLE64BIT=Yes')
+         else:
+             retval.append('ENABLE64BIT=No')
+ 
+         if "operating_system" in self.config:
+             retval.append("OS=%s" % self.config['operating_system'])
+             if self.config["operating_system"] == "android":

+ 258 - 0
mozilla-release/patches/1693498-2-88a1.patch

@@ -0,0 +1,258 @@
+# HG changeset patch
+# User Mike Hommey <mh+mozilla@glandium.org>
+# Date 1613684056 0
+# Node ID 1c5498cdf4c308a63cd6353d4b2498d7ed222024
+# Parent  e72c04c64d9fcb9ea87945171d27defd1ede17ec
+Bug 1693498 - Build openh264 with nasm on all platforms that need nasm or yasm. r=mjf
+
+Differential Revision: https://phabricator.services.mozilla.com/D105619
+
+diff --git a/testing/mozharness/configs/openh264/macosx64-aarch64.py.1693498-2.later b/testing/mozharness/configs/openh264/macosx64-aarch64.py.1693498-2.later
+new file mode 100644
+--- /dev/null
++++ b/testing/mozharness/configs/openh264/macosx64-aarch64.py.1693498-2.later
+@@ -0,0 +1,20 @@
++--- macosx64-aarch64.py
+++++ macosx64-aarch64.py
++@@ -14,17 +14,16 @@ external_tools_path = os.path.join(
++ 
++ config = {
++     "exes": {
++         "gittool.py": [os.path.join(external_tools_path, "gittool.py")],
++         "python2.7": "python2.7",
++     },
++     "dump_syms_binary": "{}/dump_syms/dump_syms".format(os.environ["MOZ_FETCHES_DIR"]),
++     "arch": "aarch64",
++-    "use_yasm": True,
++     "operating_system": "darwin",
++     "partial_env": {
++         "CFLAGS": (
++             "-target aarch64-apple-darwin -mcpu=apple-a12 "
++             "-isysroot {MOZ_FETCHES_DIR}/MacOSX11.0.sdk".format(
++                 MOZ_FETCHES_DIR=os.environ["MOZ_FETCHES_DIR"]
++             )
++         ),
+diff --git a/testing/mozharness/configs/openh264/macosx64.py b/testing/mozharness/configs/openh264/macosx64.py
+--- a/testing/mozharness/configs/openh264/macosx64.py
++++ b/testing/mozharness/configs/openh264/macosx64.py
+@@ -12,10 +12,9 @@ config = {
+     'tooltool_cache': "/builds/tooltool_cache",
+     'exes': {
+         'gittool.py': [os.path.join(external_tools_path, 'gittool.py')],
+         'tooltool.py': "/builds/tooltool.py",
+         'python2.7': "/tools/python27/bin/python2.7",
+     },
+     'dump_syms_binary': 'dump_syms',
+     'arch': 'x64',
+-    'use_yasm': True,
+ }
+diff --git a/testing/mozharness/configs/openh264/macosx64.py.1693498-2.later b/testing/mozharness/configs/openh264/macosx64.py.1693498-2.later
+new file mode 100644
+--- /dev/null
++++ b/testing/mozharness/configs/openh264/macosx64.py.1693498-2.later
+@@ -0,0 +1,19 @@
++--- macosx64.py
++@@ -32,14 +31,16 @@ config = {
++         "LDFLAGS": (
++             "-target x86_64-apple-darwin "
++             "-isysroot {MOZ_FETCHES_DIR}/MacOSX10.11.sdk "
++             "-mmacosx-version-min=10.11".format(
++                 MOZ_FETCHES_DIR=os.environ["MOZ_FETCHES_DIR"]
++             )
++         ),
++         "PATH": (
++-            "{MOZ_FETCHES_DIR}/clang/bin/:{MOZ_FETCHES_DIR}/cctools/bin/:%(PATH)s".format(
+++            "{MOZ_FETCHES_DIR}/clang/bin:"
+++            "{MOZ_FETCHES_DIR}/cctools/bin:"
+++            "{MOZ_FETCHES_DIR}/nasm:%(PATH)s".format(
++                 MOZ_FETCHES_DIR=os.environ["MOZ_FETCHES_DIR"]
++             )
++         ),
++     },
++ }
+diff --git a/testing/mozharness/configs/openh264/win32.py b/testing/mozharness/configs/openh264/win32.py
+--- a/testing/mozharness/configs/openh264/win32.py
++++ b/testing/mozharness/configs/openh264/win32.py
+@@ -13,17 +13,16 @@ config = {
+    'tooltool_manifest_file': "win.manifest",
+    'exes': {
+        'gittool.py': [sys.executable, os.path.join(external_tools_path, 'gittool.py')],
+        'python2.7': 'c:\\mozilla-build\\python27\\python2.7.exe',
+        'tooltool.py': [sys.executable, "c:\\mozilla-build\\tooltool.py"],
+    },
+    'dump_syms_binary': 'dump_syms.exe',
+    'arch': 'x86',
+-   'use_yasm': True,
+    'operating_system': 'msvc',
+    'partial_env': {
+        'PATH': '%s;%s;%s' % (
+            '{_VSPATH}/VC/redist/x86/Microsoft.VC140.CRT;{_VSPATH}/VC/redist/x64/Microsoft.VC140.CRT;{_VSPATH}/SDK/Redist/ucrt/DLLs/x86;{_VSPATH}/SDK/Redist/ucrt/DLLs/x64;{_VSPATH}/VC/bin/amd64_x86;{_VSPATH}/VC/bin/amd64;{_VSPATH}/VC/bin;{_VSPATH}/SDK/bin/x86;{_VSPATH}/SDK/bin/x64;{_VSPATH}/DIA SDK/bin'.format(_VSPATH=VSPATH),
+            os.environ['PATH'],
+            'C:\\mozilla-build\\Git\\bin',
+        ),
+        'WIN32_REDIST_DIR': '{_VSPATH}/VC/redist/x86/Microsoft.VC140.CRT'.format(_VSPATH=VSPATH),
+diff --git a/testing/mozharness/configs/openh264/win32.py.1693498-2.later b/testing/mozharness/configs/openh264/win32.py.1693498-2.later
+new file mode 100644
+--- /dev/null
++++ b/testing/mozharness/configs/openh264/win32.py.1693498-2.later
+@@ -0,0 +1,24 @@
++--- win32.py
+++++ win32.py
++@@ -19,20 +19,20 @@ config = {
++     "exes": {
++         "gittool.py": [sys.executable, os.path.join(external_tools_path, "gittool.py")],
++         "python2.7": "c:\\mozilla-build\\python\\python.exe",
++     },
++     "dump_syms_binary": "{}/dump_syms/dump_syms.exe".format(
++         os.environ["MOZ_FETCHES_DIR"]
++     ),
++     "arch": "x86",
++-    "use_yasm": True,
++     "partial_env": {
++         "PATH": (
++             "{MOZ_FETCHES_DIR}\\clang\\bin\\;"
+++            "{MOZ_FETCHES_DIR}\\nasm;"
++             "{_VSPATH}\\VC\\bin\\Hostx64\\x64;%(PATH)s"
++             # 32-bit redist here for our dump_syms.exe
++             "{_VSPATH}/VC/redist/x86/Microsoft.VC141.CRT;"
++             "{_VSPATH}/SDK/Redist/ucrt/DLLs/x86;"
++             "{_VSPATH}/DIA SDK/bin"
++         ).format(_VSPATH=VSPATH, MOZ_FETCHES_DIR=os.environ["MOZ_FETCHES_DIR"]),
++         "INCLUDES": (
++             "-I{_VSPATH}\\VC\\include "
+diff --git a/testing/mozharness/configs/openh264/win64-aarch64.py.1693498-2.later b/testing/mozharness/configs/openh264/win64-aarch64.py.1693498-2.later
+new file mode 100644
+--- /dev/null
++++ b/testing/mozharness/configs/openh264/win64-aarch64.py.1693498-2.later
+@@ -0,0 +1,20 @@
++--- win64-aarch64.py
+++++ win64-aarch64.py
++@@ -19,17 +19,16 @@ config = {
++     "exes": {
++         "gittool.py": [sys.executable, os.path.join(external_tools_path, "gittool.py")],
++         "python2.7": "c:\\mozilla-build\\python\\python.exe",
++     },
++     "dump_syms_binary": "{}/dump_syms/dump_syms.exe".format(
++         os.environ["MOZ_FETCHES_DIR"]
++     ),
++     "arch": "aarch64",
++-    "use_yasm": False,
++     "partial_env": {
++         "PATH": (
++             "%(abs_work_dir)s\\openh264;"
++             "{MOZ_FETCHES_DIR}\\clang\\bin\\;"
++             "{_VSPATH}\\VC\\bin\\Hostx64\\arm64;"
++             "{_VSPATH}\\VC\\bin\\Hostx64\\x64;"
++             # 32-bit redist here for our dump_syms.exe
++             "{_VSPATH}/VC/redist/x86/Microsoft.VC141.CRT;"
+diff --git a/testing/mozharness/configs/openh264/win64.py b/testing/mozharness/configs/openh264/win64.py
+--- a/testing/mozharness/configs/openh264/win64.py
++++ b/testing/mozharness/configs/openh264/win64.py
+@@ -13,17 +13,16 @@ config = {
+    'tooltool_manifest_file': "win.manifest",
+    'exes': {
+         'gittool.py': [sys.executable, os.path.join(external_tools_path, 'gittool.py')],
+         'python2.7': 'c:\\mozilla-build\\python27\\python2.7.exe',
+         'tooltool.py': [sys.executable, "c:\\mozilla-build\\tooltool.py"],
+    },
+    'dump_syms_binary': 'dump_syms.exe',
+    'arch': 'x64',
+-   'use_yasm': True,
+    'operating_system': 'msvc',
+    'partial_env': {
+        'PATH': '%s;%s;%s' % (
+            '{_VSPATH}/VC/bin/amd64;{_VSPATH}/VC/bin;{_VSPATH}/SDK/bin/x64;{_VSPATH}/VC/redist/x64/Microsoft.VC140.CRT;{_VSPATH}/SDK/Redist/ucrt/DLLs/x64;{_VSPATH}/VC/redist/x86/Microsoft.VC140.CRT;{_VSPATH}/SDK/Redist/ucrt/DLLs/x86;{_VSPATH}/DIA SDK/bin'.format(_VSPATH=VSPATH),
+            os.environ['PATH'],
+            'C:\\mozilla-build\\Git\\bin',
+        ),
+        'WIN32_REDIST_DIR': '{_VSPATH}/VC/redist/x64/Microsoft.VC140.CRT'.format(_VSPATH=VSPATH),
+diff --git a/testing/mozharness/configs/openh264/win64.py.1693498-2.later b/testing/mozharness/configs/openh264/win64.py.1693498-2.later
+new file mode 100644
+--- /dev/null
++++ b/testing/mozharness/configs/openh264/win64.py.1693498-2.later
+@@ -0,0 +1,24 @@
++--- win64.py
+++++ win64.py
++@@ -19,20 +19,20 @@ config = {
++     "exes": {
++         "gittool.py": [sys.executable, os.path.join(external_tools_path, "gittool.py")],
++         "python2.7": "c:\\mozilla-build\\python\\python.exe",
++     },
++     "dump_syms_binary": "{}/dump_syms/dump_syms.exe".format(
++         os.environ["MOZ_FETCHES_DIR"]
++     ),
++     "arch": "x64",
++-    "use_yasm": True,
++     "partial_env": {
++         "PATH": (
++             "{MOZ_FETCHES_DIR}\\clang\\bin\\;"
+++            "{MOZ_FETCHES_DIR}\\nasm;"
++             "{_VSPATH}\\VC\\bin\\Hostx64\\x64;%(PATH)s;"
++             # 32-bit redist here for our dump_syms.exe
++             "{_VSPATH}/VC/redist/x86/Microsoft.VC141.CRT;"
++             "{_VSPATH}/SDK/Redist/ucrt/DLLs/x86;"
++             "{_VSPATH}/DIA SDK/bin"
++         ).format(_VSPATH=VSPATH, MOZ_FETCHES_DIR=os.environ["MOZ_FETCHES_DIR"]),
++         "INCLUDES": (
++             "-I{_VSPATH}\\VC\\include "
+diff --git a/testing/mozharness/scripts/openh264_build.py b/testing/mozharness/scripts/openh264_build.py
+--- a/testing/mozharness/scripts/openh264_build.py
++++ b/testing/mozharness/scripts/openh264_build.py
+@@ -71,37 +71,30 @@ class OpenH264Build(MockMixin, TransferM
+             "dest": "operating_system",
+             "help": "Specify the operating system to build for",
+         }],
+         [["--use-mock"], {
+             "dest": "use_mock",
+             "help": "use mock to set up build environment",
+             "action": "store_true",
+             "default": False,
+-        }],
+-        [["--use-yasm"], {
+-            "dest": "use_yasm",
+-            "help": "use yasm instead of nasm",
+-            "action": "store_true",
+-            "default": False,
+         }]
+     ]
+ 
+     def __init__(self, require_config_file=False, config={},
+                  all_actions=all_actions,
+                  default_actions=default_actions):
+ 
+         # Default configuration
+         default_config = {
+             'debug_build': False,
+             'upload_ssh_key': "~/.ssh/ffxbld_rsa",
+             'upload_ssh_user': 'ffxbld',
+             'upload_ssh_host': 'upload.ffxbld.productdelivery.prod.mozaws.net',
+             'upload_path_base': '/tmp/openh264',
+-            'use_yasm': False,
+         }
+         default_config.update(config)
+ 
+         VCSScript.__init__(
+             self,
+             config_options=self.config_options,
+             require_config_file=require_config_file,
+             config=default_config,
+@@ -172,19 +165,16 @@ class OpenH264Build(MockMixin, TransferM
+                 elif self.config['arch'] == 'aarch64':
+                     retval.append("ARCH=arm64")
+                 else:
+                     retval.append("ARCH=arm")
+                 retval.append('TARGET=invalid')
+                 retval.append('NDKLEVEL=%s' % self.config['min_sdk'])
+                 retval.append('NDKROOT=%s/android-ndk-r11c' % dirs['abs_work_dir'])
+ 
+-        if self.config['use_yasm']:
+-            retval.append('ASM=yasm')
+-
+         return retval
+ 
+     def query_upload_ssh_key(self):
+         return self.config['upload_ssh_key']
+ 
+     def query_upload_ssh_host(self):
+         return self.config['upload_ssh_host']
+ 

+ 16 - 16
mozilla-release/patches/1709303-1-94a1.patch

@@ -3,7 +3,7 @@
 # Date 1631570112 0
 #      Mon Sep 13 21:55:12 2021 +0000
 # Node ID da5b090edd2b4a9d46125c5af15773fff2d9c025
-# Parent  1e913f3aee6dd92c10c3847a78555bfea9d3f003
+# Parent  8412cbbecb76b52d971cc80034969758a9c1180c
 Bug 1709303 - Part 1. Prepare scripts and patches for libjpeg-turbo update. r=jrmuizel,tnikkel
 
 media/libjpeg/1050342.diff is no longer necessary and a correction
@@ -4652,7 +4652,7 @@ diff --git a/media/update-libjpeg.sh b/media/update-libjpeg.sh
 diff --git a/toolkit/moz.configure b/toolkit/moz.configure
 --- a/toolkit/moz.configure
 +++ b/toolkit/moz.configure
-@@ -1349,50 +1349,95 @@ with only_when(compile_environment):
+@@ -1343,50 +1343,95 @@ with only_when(compile_environment):
              check_msg='for sufficient libjpeg-turbo JCS_EXTENSIONS',
              onerror=lambda: die('libjpeg-turbo JCS_EXTENSIONS required for '
                                   '--with-system-jpeg'),
@@ -4677,37 +4677,37 @@ diff --git a/toolkit/moz.configure b/toolkit/moz.configure
          if system_jpeg:
              return
  
--        if target.kernel == "Darwin":
+-        if target.kernel == 'Darwin':
 +        if target.cpu == "arm":
 +            return ("-march=armv7-a", "-mfpu=neon")
 +        elif target.cpu == "aarch64":
 +            return ("-march=armv8-a",)
 +        elif target.kernel == "Darwin":
-             if target.cpu == "x86":
+             if target.cpu == 'x86':
                  return ("-DPIC", "-DMACHO")
-             elif target.cpu == "x86_64":
+             elif target.cpu == 'x86_64':
                  return ("-D__x86_64__", "-DPIC", "-DMACHO")
-         elif target.kernel == "WINNT":
-             if target.cpu == "x86":
+         elif target.kernel == 'WINNT':
+             if target.cpu == 'x86':
                  return ("-DPIC", "-DWIN32")
-             elif target.cpu == "x86_64":
+             elif target.cpu == 'x86_64':
                  return ("-D__x86_64__", "-DPIC", "-DWIN64", "-DMSVC")
--        elif target.cpu == "arm":
+-        elif target.cpu == 'arm':
 -            return ("-march=armv7-a", "-mfpu=neon")
--        elif target.cpu == "aarch64":
+-        elif target.cpu == 'aarch64':
 -            return ("-march=armv8-a",)
-         elif target.cpu == "mips32":
+         elif target.cpu == 'mips32':
              return ("-mdspr2",)
 +        elif target.cpu == "mips64":
 +            return ("-Wa,-mloongson-mmi", "-mloongson-ext")
-         elif target.cpu == "x86":
+         elif target.cpu == 'x86':
              return ("-DPIC", "-DELF")
-         elif target.cpu == "x86_64":
+         elif target.cpu == 'x86_64':
              return ("-D__x86_64__", "-DPIC", "-DELF")
  
      @depends(target, when=in_tree_jpeg)
      def jpeg_nasm(target):
-         if target.cpu in ("x86", "x86_64"):
+         if target.cpu in ('x86', 'x86_64'):
              # libjpeg-turbo 2.0.6 requires nasm 2.10.
              return namespace(version="2.10", what="JPEG")
  
@@ -4749,9 +4749,9 @@ diff --git a/toolkit/moz.configure b/toolkit/moz.configure
 +    )
  
  
- # Libav-fft Support
+ # FFmpeg's ffvpx configuration
  # ==============================================================
  with only_when(compile_environment):
      @depends(target)
      def libav_fft(target):
-         flags = None
+         return target.kernel == "WINNT" or target.cpu == "x86_64"

+ 0 - 29
mozilla-release/patches/NOBUG-nasm-icu-25320.patch

@@ -1,29 +0,0 @@
-# HG changeset patch
-# User Frank-Rainer Grahl <frgrahl@gmx.net>
-# Date 1726194014 -7200
-# Parent  ba57bae2b93ce0551eca096a3eee2e240090603d
-No Bug - Use nasm for icu data file. r=me a=me
-
-Replacement for Bug 1650299 while we keep msvc support.
-
-diff --git a/config/external/icu/data/moz.build b/config/external/icu/data/moz.build
---- a/config/external/icu/data/moz.build
-+++ b/config/external/icu/data/moz.build
-@@ -22,15 +22,15 @@ asflags = [
- ]
- LOCAL_INCLUDES += ['.']
- 
- if CONFIG['OS_TARGET'] == 'WINNT' and CONFIG['CPU_ARCH'] == 'aarch64':
-     icudata = 'icudata.asm'
-     GeneratedFile(icudata, script='genicudata.py',
-                   inputs=[CONFIG['ICU_DATA_FILE']], flags=[data_symbol])
-     SOURCES += ['!%s' % icudata]
--elif CONFIG['HAVE_YASM']:
--    USE_YASM = True
-+elif CONFIG['HAVE_NASM']:
-+    USE_NASM = True
-     SOURCES += ['icudata.s']
-     ASFLAGS += asflags
- elif CONFIG['GNU_AS']:
-     SOURCES += ['icudata_gas.S']
-     ASFLAGS += asflags

+ 0 - 28
mozilla-release/patches/TOP-1445683-14-PLASTER-aom-fix-win32-bustage-2535.patch

@@ -1,28 +0,0 @@
-# HG changeset patch
-# User Bill Gianopoulos <wgianopoulos@gmail.com>
-# Date 1598375293 14400
-#      Tue Aug 25 13:08:13 2020 -0400
-# Parent  d922838bb5be1339946ccb2e0f01644a0ba82572
-Bug 1445683 - Don't include x86_abi_support.asm in libaom build to avoid clang ICE/error. r=frg a=frg
-
-diff --git a/media/libaom/sources.mozbuild b/media/libaom/sources.mozbuild
---- a/media/libaom/sources.mozbuild
-+++ b/media/libaom/sources.mozbuild
-@@ -271,17 +271,16 @@ files = {
-     '../../third_party/aom/aom_dsp/x86/intrapred_avx2.c',
-     '../../third_party/aom/aom_dsp/x86/intrapred_sse2.c',
-     '../../third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm',
-     '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.c',
-     '../../third_party/aom/aom_dsp/x86/inv_wht_sse2.asm',
-     '../../third_party/aom/aom_dsp/x86/loopfilter_sse2.c',
-     '../../third_party/aom/aom_mem/aom_mem.c',
-     '../../third_party/aom/aom_ports/emms.asm',
--    '../../third_party/aom/aom_ports/x86_abi_support.asm',
-     '../../third_party/aom/aom_scale/aom_scale_rtcd.c',
-     '../../third_party/aom/aom_scale/generic/aom_scale.c',
-     '../../third_party/aom/aom_scale/generic/gen_scalers.c',
-     '../../third_party/aom/aom_scale/generic/yv12config.c',
-     '../../third_party/aom/aom_scale/generic/yv12extend.c',
-     '../../third_party/aom/aom_util/aom_thread.c',
-     '../../third_party/aom/aom_util/debug_util.c',
-     '../../third_party/aom/av1/av1_dx_iface.c',

+ 66 - 0
mozilla-release/patches/TOP-NOBUG-fixnasmcheck-25320.patch

@@ -0,0 +1,66 @@
+# HG changeset patch
+# User Frank-Rainer Grahl <frgrahl@gmx.net>
+# Date 1726338514 -7200
+# Parent  d71e0e729ecc13156dc0c7c28310aa42c115703d
+No Bug - Fix nasm check. r=me a=me
+
+We don't have a bootstrap path and need sorted defined.
+
+diff --git a/toolkit/moz.configure b/toolkit/moz.configure
+--- a/toolkit/moz.configure
++++ b/toolkit/moz.configure
+@@ -1516,16 +1516,17 @@ with only_when(compile_environment):
+     set_define('MOZ_FFVPX_FLACONLY', True, when=ffvpx.flac_only)
+     set_config('FFVPX_ASFLAGS', ffvpx.flags)
+     set_config("FFVPX_USE_NASM", True, when=ffvpx.use_nasm)
+ 
+ 
+ # nasm detection
+ # ==============================================================
+ @depends(dav1d_nasm, vpx_nasm, jpeg_nasm, ffvpx_nasm)
++@imports(_from='__builtin__', _import='sorted')
+ def need_nasm(*requirements):
+     requires = {
+         x.what: x.version if hasattr(x, "version") else True for x in requirements if x
+     }
+     if requires:
+         items = sorted(requires.keys())
+         if len(items) > 1:
+             what = " and ".join((", ".join(items[:-1]), items[-1]))
+@@ -1534,17 +1535,17 @@ def need_nasm(*requirements):
+         versioned = {k: v for (k, v) in requires.items() if v is not True}
+         return namespace(what=what, versioned=versioned)
+ 
+ 
+ nasm = check_prog(
+     "NASM",
+     ["nasm"],
+     allow_missing=True,
+-    paths=bootstrap_search_path("nasm", when=need_nasm),
++    paths=toolchain_search_path,
+     when=need_nasm,
+ )
+ 
+ 
+ @depends(nasm, need_nasm.what)
+ def check_nasm(nasm, what):
+     if not nasm and what:
+         die("Nasm is required to build with %s, but it was not found." % what)
+@@ -1558,16 +1559,17 @@ def nasm_version(nasm):
+         check_cmd_output(nasm, "-v", onerror=lambda: die("Failed to get nasm version."))
+         .splitlines()[0]
+         .split()[2]
+     )
+     return Version(version)
+ 
+ 
+ @depends(nasm_version, need_nasm.versioned, when=need_nasm.versioned)
++@imports(_from='__builtin__', _import='sorted')
+ def check_nasm_version(nasm_version, versioned):
+     by_version = sorted(versioned.items(), key=lambda x: x[1])
+     what, version = by_version[-1]
+     if nasm_version < version:
+         die(
+             "Nasm version %s or greater is required to build with %s." % (version, what)
+         )
+     return nasm_version
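
For illustration, a tiny standalone sketch of why check_nasm_version also needs sorted: the versioned requirements collected by need_nasm are sorted so that the strictest minimum version wins. min_nasm_for is a made-up name, plain version strings stand in for the configure Version objects, and the example mapping is only an example.

    def min_nasm_for(versioned):
        # versioned maps component name -> minimum nasm version, as collected
        # by need_nasm above, e.g. {'JPEG': '2.10', 'FFVPX': '2.00'}.
        by_version = sorted(versioned.items(), key=lambda x: x[1])
        return by_version[-1]  # the (what, version) pair with the highest minimum

    # min_nasm_for({'JPEG': '2.10', 'FFVPX': '2.00'}) -> ('JPEG', '2.10')
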

+ 35 - 8
mozilla-release/patches/series

@@ -5918,7 +5918,6 @@ NOBUG-removenonascii67a1-25314.patch
 1578348-71a1.patch
 1576859-71a1.patch
 1587187-71a1.patch
-1585358-71a1.patch
 1587206-1-71a1.patch
 1583582-71a1.patch
 1579758-71a1.patch
@@ -6708,7 +6707,6 @@ NOBUG-removenonascii67a1-25314.patch
 1696581-2-88a1.patch
 1696935-88a1.patch
 1690604-88a1.patch
-1692940-5-88a1.patch
 1513184-88a1.patch
 1698827-88a1.patch
 1698592-89a1.patch
@@ -6817,7 +6815,6 @@ NOBUG-removenonascii67a1-25314.patch
 1730397-4only-94a1.patch
 1730712-1-94a1.patch
 1723031-1-94a1.patch
-1709303-1-94a1.patch
 1709303-2-94a1.patch
 1738598-PARTIAL-95a1.patch
 1730048-913.patch
@@ -6906,7 +6903,6 @@ NOBUG-removenonascii67a1-25314.patch
 1750760-1-99a1.patch
 1750760-3-99a1.patch
 1750760-4-99a1.patch
-1757308-99a1.patch
 1758062-9162.patch
 NOBUG-removemobilepromo-25312.patch
 NOBUG-disableupdates-25312.patch
@@ -7026,7 +7022,6 @@ NOBUG-ppcheck-25315.patch
 1801893-webp-109a1.patch
 1801583-109a1.patch
 1803469-webp-109a1.patch
-1787515-109a1.patch
 1466443-110a1.patch
 1782344-1-110a1.patch
 1782344-2-110a1.patch
@@ -7038,7 +7033,6 @@ NOBUG-ppcheck-25315.patch
 1760633-2-110a1.patch
 1760633-3-110a1.patch
 1810078-webp-111a1.patch
-1815737-111a1.patch
 1816737-112a1.patch
 1817900-13-112a1.patch
 1819374-4-112a1.patch
@@ -7165,7 +7159,6 @@ PPPPPPP-NSSgetentropy.patch
 WIP-1729459-comment25.patch
 TOP-1294490-7-PLASTER-webp-2535.patch
 TOP-1493400-6-PLASTER-dav1d-avoid-mColorDepth-2535.patch
-TOP-1445683-14-PLASTER-aom-fix-win32-bustage-2535.patch
 TOP-1683545-PLASTER-webrender-2536.patch
 TOP-1667581-3-PLASTER-2537.patch
 TOP-1469021-PLASTER-2538.patch
@@ -7364,7 +7357,41 @@ TOP-NOBUG-blockquad0-25319.patch
 1903254-129a1.patch
 1903021-129a1.patch
 1519636-80-elfhack-130a1.patch
-NOBUG-nasm-icu-25320.patch
 1902935-seamonkey-credits-25320.patch
 1862395-incorrect-version-resistfingerprinting-v2-25320.patch
 1737436-use-mozilla-compat-version-define-25320.patch
+1540760-1-68a1.patch
+1540760-2-68a1.patch
+1540760-3-68a1.patch
+1540760-4-68a1.patch
+1540760-5-68a1.patch
+1540760-6-68a1.patch
+1585358-71a1.patch
+1585359-71a1.patch
+1525393-1-75a1.patch
+1650299-80a1.patch
+1656063-81a1.patch
+1669888-83a1.patch
+1692945-1-87a1.patch
+1692945-2-87a1.patch
+1693215-1-88a1.patch
+1693215-2-88a1.patch
+1693215-3-88a1.patch
+1693498-1-88a1.patch
+1693498-2-88a1.patch
+1692940-01-88a1.patch
+1692940-02-88a1.patch
+1692940-03-88a1.patch
+1692940-04-88a1.patch
+1692940-05-88a1.patch
+1692940-06-88a1.patch
+1692940-07-88a1.patch
+1692940-08-88a1.patch
+1692940-09-88a1.patch
+1692940-10no11-88a1.patch
+1692940-12-88a1.patch
+1709303-1-94a1.patch
+1757308-99a1.patch
+1787515-109a1.patch
+1815737-111a1.patch
+TOP-NOBUG-fixnasmcheck-25320.patch