diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake
index 7ffe397..4034aeb 100644
--- a/cmake/toolchains/aarch64-linux-gnu.cmake
+++ b/cmake/toolchains/aarch64-linux-gnu.cmake
@@ -23,6 +23,8 @@ if("${CROSS}" STREQUAL "")
   set(CROSS aarch64-linux-gnu-)
 endif()
 
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
+if(NOT CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
 set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
 set(CMAKE_SYSTEM_PROCESSOR "aarch64")
diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake
index 8051f0d..0f36c98 100644
--- a/cmake/toolchains/arm-linux-gnueabihf.cmake
+++ b/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -23,7 +23,9 @@ if("${CROSS}" STREQUAL "")
   set(CROSS arm-linux-gnueabihf-)
 endif()
 
-set(CMAKE_CXX_COMPILER ${CROSS}g++)
+if(NOT CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
 set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/debian/changelog b/debian/changelog
index 353be9f..9291baa 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+libgav1 (0.16.3+git20210615.1.86f25fa-1) UNRELEASED; urgency=low
+
+  * New upstream snapshot.
+
+ -- Debian Janitor <janitor@jelmer.uk>  Thu, 08 Jul 2021 09:21:29 -0000
+
 libgav1 (0.16.3-2) unstable; urgency=medium
 
   * Uploading to sid.
diff --git a/debian/patches/Unbundling-abseil.patch b/debian/patches/Unbundling-abseil.patch
index 83672d4..8ee32db 100644
--- a/debian/patches/Unbundling-abseil.patch
+++ b/debian/patches/Unbundling-abseil.patch
@@ -1,7 +1,9 @@
 Unbundling abseil
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -100,7 +100,6 @@
+Index: libgav1/CMakeLists.txt
+===================================================================
+--- libgav1.orig/CMakeLists.txt
++++ libgav1/CMakeLists.txt
+@@ -100,7 +100,6 @@ if(LIBGAV1_VERBOSE)
    libgav1_dump_options()
  endif()
  
@@ -9,7 +11,7 @@ Unbundling abseil
  set(libgav1_gtest_build "${libgav1_build}/gtest")
  
  # Compiler/linker flags must be lists, but come in from the environment as
-@@ -115,18 +114,8 @@
+@@ -115,18 +114,8 @@ endif()
  # Set test-only flags based on LIBGAV1_CXX_FLAGS.
  libgav1_set_test_flags()
  
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
index e23903c..9aab6d1 100644
--- a/src/decoder_impl.cc
+++ b/src/decoder_impl.cc
@@ -1232,7 +1232,7 @@ StatusCode DecoderImpl::DecodeTiles(
     LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
     return kStatusOutOfMemory;
   }
-  if (sequence_header.enable_cdef) {
+  if (frame_header.cdef.bits > 0) {
     if (!frame_scratch_buffer->cdef_index.Reset(
             DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
             DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
@@ -1241,6 +1241,15 @@ StatusCode DecoderImpl::DecodeTiles(
       return kStatusOutOfMemory;
     }
   }
+  if (do_cdef) {
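+    // Each cdef_skip entry covers 2 rows4x4 by 16 columns4x4, i.e. an
+    // 8x64 pixel area.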
+    if (!frame_scratch_buffer->cdef_skip.Reset(
+            DivideBy2(frame_header.rows4x4 + kMaxBlockHeight4x4),
+            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+            /*zero_initialize=*/true)) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef skip.");
+      return kStatusOutOfMemory;
+    }
+  }
   if (!frame_scratch_buffer->inter_transform_sizes.Reset(
           frame_header.rows4x4 + kMaxBlockHeight4x4,
           frame_header.columns4x4 + kMaxBlockWidth4x4,
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
index 5b4c094..3603750 100644
--- a/src/dsp/arm/average_blend_neon.cc
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -40,17 +40,19 @@ constexpr int kInterPostRoundBit =
 namespace low_bitdepth {
 namespace {
 
-inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
-                                  const int16_t* prediction_1) {
+inline uint8x8_t AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                  const int16_t* LIBGAV1_RESTRICT
+                                      prediction_1) {
   const int16x8_t pred0 = vld1q_s16(prediction_0);
   const int16x8_t pred1 = vld1q_s16(prediction_1);
   const int16x8_t res = vaddq_s16(pred0, pred1);
   return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
 }
 
-inline void AverageBlendLargeRow(const int16_t* prediction_0,
-                                 const int16_t* prediction_1, const int width,
-                                 uint8_t* dest) {
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint8_t* LIBGAV1_RESTRICT dest) {
   int x = width;
   do {
     const int16x8_t pred_00 = vld1q_s16(prediction_0);
@@ -71,8 +73,10 @@ inline void AverageBlendLargeRow(const int16_t* prediction_0,
   } while (x != 0);
 }
 
-void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
-                       const int width, const int height, void* const dest,
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                       const void* LIBGAV1_RESTRICT prediction_1,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT const dest,
                        const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
@@ -139,10 +143,10 @@ void Init8bpp() {
 namespace high_bitdepth {
 namespace {
 
-inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
-                                   const uint16_t* prediction_1,
-                                   const int32x4_t compound_offset,
-                                   const uint16x8_t v_bitdepth) {
+inline uint16x8_t AverageBlend8Row(
+    const uint16_t* LIBGAV1_RESTRICT prediction_0,
+    const uint16_t* LIBGAV1_RESTRICT prediction_1,
+    const int32x4_t compound_offset, const uint16x8_t v_bitdepth) {
   const uint16x8_t pred0 = vld1q_u16(prediction_0);
   const uint16x8_t pred1 = vld1q_u16(prediction_1);
   const uint32x4_t pred_lo =
@@ -158,9 +162,10 @@ inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
   return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
 }
 
-inline void AverageBlendLargeRow(const uint16_t* prediction_0,
-                                 const uint16_t* prediction_1, const int width,
-                                 uint16_t* dest,
+inline void AverageBlendLargeRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const uint16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint16_t* LIBGAV1_RESTRICT dest,
                                  const int32x4_t compound_offset,
                                  const uint16x8_t v_bitdepth) {
   int x = width;
@@ -181,8 +186,10 @@ inline void AverageBlendLargeRow(const uint16_t* prediction_0,
   } while (x != 0);
 }
 
-void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
-                       const int width, const int height, void* const dest,
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                       const void* LIBGAV1_RESTRICT prediction_1,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT const dest,
                        const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
index 60c72d6..c65ea50 100644
--- a/src/dsp/arm/cdef_neon.cc
+++ b/src/dsp/arm/cdef_neon.cc
@@ -234,7 +234,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
   *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
 }
 
-LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* LIBGAV1_RESTRICT const source,
                                       ptrdiff_t stride, uint16x8_t* partial_lo,
                                       uint16x8_t* partial_hi) {
   const auto* src = static_cast<const uint8_t*>(source);
@@ -358,8 +358,10 @@ uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
   return SumVector(c);
 }
 
-void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
-                        uint8_t* const direction, int* const variance) {
+void CdefDirection_NEON(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -407,8 +409,9 @@ void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
 // CdefFilter
 
 // Load 4 vectors based on the given |direction|.
-void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
-                   uint16x8_t* output, const int direction) {
+void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                   const ptrdiff_t stride, uint16x8_t* output,
+                   const int direction) {
   // Each |direction| describes a different set of source values. Expand this
   // set by negating each set. For |direction| == 0 this gives a diagonal line
   // from top right to bottom left. The first value is y, the second x. Negative
@@ -432,8 +435,9 @@ void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
 
 // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
 // do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
-                    uint16x8_t* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, uint16x8_t* output,
+                    const int direction) {
   const int y_0 = kCdefDirections[direction][0][0];
   const int x_0 = kCdefDirections[direction][0][1];
   const int y_1 = kCdefDirections[direction][1][0];
@@ -470,11 +474,11 @@ int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
 }
 
 template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride,
-                     const int height, const int primary_strength,
-                     const int secondary_strength, const int damping,
-                     const int direction, void* dest,
-                     const ptrdiff_t dst_stride) {
+void CdefFilter_NEON(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
   static_assert(width == 8 || width == 4, "");
   static_assert(enable_primary || enable_secondary, "");
   constexpr bool clipping_required = enable_primary && enable_secondary;
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index 05e0d05..157aa18 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -210,6 +210,14 @@ inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
       vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
 }
 
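+// Load 2 uint16_t values into the 32-bit lane |lane| of |val|, leaving the
+// other lanes unchanged.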
+template <int lane>
+inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u16_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
+}
+
 // Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
 // register before loading the values. Use caution when using this in loops
 // because it will re-zero the register before loading on every iteration.
@@ -229,6 +237,15 @@ inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
       vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
 }
 
+// Convenience functions for 16-bit loads from a uint8_t* source.
+inline uint16x4_t Load4U16(const void* const buf) {
+  return vld1_u16(static_cast<const uint16_t*>(buf));
+}
+
+inline uint16x8_t Load8U16(const void* const buf) {
+  return vld1q_u16(static_cast<const uint16_t*>(buf));
+}
+
 //------------------------------------------------------------------------------
 // Store functions.
 
@@ -315,6 +332,17 @@ inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
 #endif
 }
 
+// Shim vqtbl2_u8 for armv7.
+inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl2_u8(a, index);
+#else
+  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+  return vtbl4_u8(b, index);
+#endif
+}
+
 // Shim vqtbl1_s8 for armv7.
 inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
 #if defined(__aarch64__)
@@ -325,6 +353,26 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
 #endif
 }
 
+//------------------------------------------------------------------------------
+// Saturation helpers.
+
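+// Clamp the lanes of |val| to the inclusive range [low, high].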
+inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+  return vmin_s16(vmax_s16(val, low), high);
+}
+
+inline int16x8_t Clip3S16(const int16x8_t value, const int16x8_t low,
+                          const int16x8_t high) {
+  const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+  return vmaxq_s16(low, clipped_to_ceiling);
+}
+
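+// Clamp |val| to [0, (1 << bitdepth) - 1] and reinterpret it as unsigned.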
+inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+  const int16x8_t low = vdupq_n_s16(0);
+  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
 //------------------------------------------------------------------------------
 // Interleave.
 
@@ -439,6 +487,9 @@ inline uint8x8_t Transpose32(const uint8x8_t a) {
   return vreinterpret_u8_u32(b);
 }
 
+// Swap high and low halves.
+inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
+
 // Implement vtrnq_s64().
 // Input:
 // a0: 00 01 02 03 04 05 06 07
@@ -512,6 +563,108 @@ inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
   *b = e.val[1];
 }
 
+// 4x8 Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 00 10 20 30 04 14 24 34
+// a[1]: 01 11 21 31 05 15 25 35
+// a[2]: 02 12 22 32 06 16 26 36
+// a[3]: 03 13 23 33 07 17 27 37
+inline void Transpose4x8(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  a[0] = vreinterpretq_u16_u32(c0.val[0]);
+  a[1] = vreinterpretq_u16_u32(c1.val[0]);
+  a[2] = vreinterpretq_u16_u32(c0.val[1]);
+  a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34  p0q0
+// a[1]: 02 12 22 32 05 15 25 35  p1q1
+// a[2]: 01 11 21 31 06 16 26 36  p2q2
+// a[3]: 00 10 20 30 07 17 27 37  p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard Transpose4x8 will produce the same reversals, but with the
+// order of the low halves also restored relative to the high halves. This is
+// preferable because it puts all values from the same source row back together,
+// but some post-processing is inevitable.
+inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // Reverse odd vectors to bring the appropriate items to the front of zips.
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // r0       : 03 13 01 11 07 17 05 15
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // r1       : 23 33 21 31 27 37 25 35
+  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+  // Zip to complete the halves.
+  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
+  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
+  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
+  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
+  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
+  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
+  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
+  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
+  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
+  // The third row of c comes first here to swap p2 with q0.
+  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);
+
+  // 8x4 Output:
+  // a[0]: 03 13 23 33 04 14 24 34  p0q0
+  // a[1]: 02 12 22 32 05 15 25 35  p1q1
+  // a[2]: 01 11 21 31 06 16 26 36  p2q2
+  // a[3]: 00 10 20 30 07 17 27 37  p3q3
+  a[0] = d1.val[0];  // p0q0
+  a[1] = d0.val[1];  // p1q1
+  a[2] = d1.val[1];  // p2q2
+  a[3] = d0.val[0];  // p3q3
+}
+
 // Reversible if the x4 values are packed next to each other.
 // x4 input / x8 output:
 // a0: 00 01 02 03 40 41 42 43 44
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
new file mode 100644
index 0000000..3e3e740
--- /dev/null
+++ b/src/dsp/arm/convolve_10bit_neon.cc
@@ -0,0 +1,908 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
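+// Multiply each source vector by its tap and accumulate into 32-bit sums.
+// |filter_index| selects the number of participating taps: 6 (0, 1), 8 (2),
+// 2 (3) or 4 (4, 5).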
+template <int filter_index>
+int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
+                           const int16x4_t* const taps) {
+  const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
+  int32x4x2_t sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[6]), taps[6]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[7]), taps[7]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+  } else {
+    // 4 taps.
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+  }
+  return sum;
+}
+
+template <int filter_index>
+int32x4_t SumOnePassTaps(const uint16x4_t* const src,
+                         const int16x4_t* const taps) {
+  const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
+  int32x4_t sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+    sum = vmlal_s16(sum, ssrc[4], taps[4]);
+    sum = vmlal_s16(sum, ssrc[5], taps[5]);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+    sum = vmlal_s16(sum, ssrc[4], taps[4]);
+    sum = vmlal_s16(sum, ssrc[5], taps[5]);
+    sum = vmlal_s16(sum, ssrc[6], taps[6]);
+    sum = vmlal_s16(sum, ssrc[7], taps[7]);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+  } else {
+    // 4 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+  }
+  return sum;
+}
+
+template <int filter_index, bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+                                 const ptrdiff_t src_stride,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t pred_stride, const int width,
+                                 const int height,
+                                 const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t src_long = vld1q_u16(src + x);
+      const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
+      uint16x8_t v_src[8];
+      int32x4x2_t v_sum;
+      if (filter_index < 2) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+        v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
+      } else if (filter_index == 2) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+        v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+        v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+        v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+      } else if (filter_index == 3) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+      } else if (filter_index > 3) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+      }
+      if (is_compound) {
+        const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(&dest16[x],
+                 vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+        vst1_u16(&dest16[x + 4],
+                 vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+      } else {
+        // Normally the Horizontal pass does the downshift in two passes:
+        // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+        // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+        // Combining them requires adding the rounding offset from the skipped
+        // shift.
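+        // E.g. with the 10bpp constants (kInterRoundBitsHorizontal == 3,
+        // kFilterBits == 7) the two stages compute
+        // (((x + 2) >> 2) + 8) >> 4, which equals (x + 2 + 32) >> 6: an
+        // extra +2 (added here) followed by the single rounding shift by
+        // kFilterBits - 1 in vqrshrun_n_s32() below.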
+        const int32x4_t v_first_shift_rounding_bit =
+            vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+        v_sum.val[0] = vaddq_s32(v_sum.val[0], v_first_shift_rounding_bit);
+        v_sum.val[1] = vaddq_s32(v_sum.val[1], v_first_shift_rounding_bit);
+        const uint16x4_t d0 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+        const uint16x4_t d1 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+        vst1_u16(&dest16[x], d0);
+        vst1_u16(&dest16[x + 4], d1);
+      }
+      x += 8;
+    } while (x < width);
+    src += src_stride >> 1;
+    dest16 += is_compound ? pred_stride : pred_stride >> 1;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_compound>
+void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height;
+  do {
+    const uint16x8_t v_zero = vdupq_n_u16(0);
+    uint16x4_t v_src[4];
+    int32x4_t v_sum;
+    const uint16x8_t src_long = vld1q_u16(src);
+    v_src[0] = vget_low_u16(src_long);
+    if (filter_index == 3) {
+      v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+      v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+      v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
+      v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
+      v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+    }
+    if (is_compound) {
+      const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+      vst1_u16(&dest16[0],
+               vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+    } else {
+      const int32x4_t v_first_shift_rounding_bit =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+      v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      vst1_u16(&dest16[0], d0);
+    }
+    src += src_stride >> 1;
+    dest16 += is_compound ? pred_stride : pred_stride >> 1;
+  } while (--y != 0);
+}
+
+template <int filter_index>
+void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height >> 1;
+  do {
+    const int16x8_t v_zero = vdupq_n_s16(0);
+    const int16x8_t input0 = vreinterpretq_s16_u16(vld1q_u16(src));
+    const int16x8_t input1 =
+        vreinterpretq_s16_u16(vld1q_u16(src + (src_stride >> 1)));
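+    // Interleave the two rows so that a single filter pass computes both
+    // output rows at once.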
+    const int16x8x2_t input = vzipq_s16(input0, input1);
+    int32x4_t v_sum;
+    if (filter_index == 3) {
+      v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
+      v_sum = vmlal_s16(v_sum,
+                        vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
+                        v_tap[4]);
+    } else {
+      v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[2]);
+      v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 2)),
+                        v_tap[3]);
+      v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 4)),
+                        v_tap[4]);
+      v_sum = vmlal_s16(v_sum,
+                        vget_low_s16(vextq_s16(input.val[0], input.val[1], 6)),
+                        v_tap[5]);
+    }
+    // Normally the Horizontal pass does the downshift in two passes:
+    // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+    // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+    // Combining them requires adding the rounding offset from the skipped
+    // shift.
+    const int32x4_t v_first_shift_rounding_bit =
+        vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+    v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+    const uint16x4_t d0 =
+        vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+    dest16[0] = vget_lane_u16(d0, 0);
+    dest16[1] = vget_lane_u16(d0, 2);
+    dest16 += pred_stride >> 1;
+    dest16[0] = vget_lane_u16(d0, 1);
+    dest16[1] = vget_lane_u16(d0, 3);
+    dest16 += pred_stride >> 1;
+    src += src_stride;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_compound>
+void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const int16x4_t* const v_tap) {
+  assert(width < 8 || filter_index <= 3);
+  // Don't simplify the redundant if conditions with the template parameters;
+  // keeping them helps the compiler generate compact code.
+  if (width >= 8 && filter_index <= 3) {
+    FilterHorizontalWidth8AndUp<filter_index, is_compound>(
+        src, src_stride, dest, pred_stride, width, height, v_tap);
+    return;
+  }
+
+  // Horizontal passes only need to account for 2 and 4 tap filters when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(filter_index >= 3 && filter_index <= 5);
+  if (filter_index >= 3 && filter_index <= 5) {
+    if (width == 4) {
+      FilterHorizontalWidth4<filter_index, is_compound>(
+          src, src_stride, dest, pred_stride, height, v_tap);
+      return;
+    }
+    assert(width == 2);
+    if (!is_compound) {
+      FilterHorizontalWidth2<filter_index>(src, src_stride, dest, pred_stride,
+                                           height, v_tap);
+    }
+  }
+}
+
+template <bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  // Duplicate the signed value of each tap. The signed 16-bit multiplies
+  // (vmull_s16/vmlal_s16) handle negative taps directly, so no separate
+  // correction step is needed.
+  int16x4_t v_tap[kSubPixelTaps];
+  assert(filter_id != 0);
+
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
+
+  if (filter_index == 2) {  // 8 tap.
+    FilterHorizontal<2, is_compound>(src, src_stride, dst, dst_stride, width,
+                                     height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    FilterHorizontal<1, is_compound>(src + 1, src_stride, dst, dst_stride,
+                                     width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    FilterHorizontal<0, is_compound>(src + 1, src_stride, dst, dst_stride,
+                                     width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    FilterHorizontal<4, is_compound>(src + 2, src_stride, dst, dst_stride,
+                                     width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<5, is_compound>(src + 2, src_stride, dst, dst_stride,
+                                     width, height, v_tap);
+  } else {  // 2 tap.
+    FilterHorizontal<3, is_compound>(src + 3, src_stride, dst, dst_stride,
+                                     width, height, v_tap);
+  }
+}
+
+void ConvolveHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* const src =
+      static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* const src =
+      static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_compound=*/true>(src, reference_stride, dest, width,
+                                         width, height, horizontal_filter_id,
+                                         filter_index);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const int16x4_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint16_t* src_x = src + x;
+    uint16x8_t srcs[8];
+    srcs[0] = vld1q_u16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vld1q_u16(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1q_u16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vld1q_u16(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1q_u16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vld1q_u16(src_x);
+          src_x += src_stride;
+          srcs[6] = vld1q_u16(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop; it generates too much code and makes the
+    // decoder slower.
+    int y = 0;
+    do {
+      srcs[next_row] = vld1q_u16(src_x);
+      src_x += src_stride;
+
+      const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+      if (is_compound) {
+        const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(dst16 + x + y * dst_stride,
+                 vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+        vst1_u16(dst16 + x + 4 + y * dst_stride,
+                 vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+      } else {
+        const uint16x4_t d0 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+        const uint16x4_t d1 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+        vst1_u16(dst16 + x + y * dst_stride, d0);
+        vst1_u16(dst16 + x + 4 + y * dst_stride, d1);
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const int16x4_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
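+  // Two output rows are computed per iteration, so one extra source row
+  // (srcs[num_taps]) is loaded beyond the filter length.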
+  uint16x4_t srcs[9];
+  srcs[0] = vld1_u16(src);
+  src += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = vld1_u16(src);
+    src += src_stride;
+    srcs[2] = vld1_u16(src);
+    src += src_stride;
+    if (num_taps >= 6) {
+      srcs[3] = vld1_u16(src);
+      src += src_stride;
+      srcs[4] = vld1_u16(src);
+      src += src_stride;
+      if (num_taps == 8) {
+        srcs[5] = vld1_u16(src);
+        src += src_stride;
+        srcs[6] = vld1_u16(src);
+        src += src_stride;
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = vld1_u16(src);
+    src += src_stride;
+    srcs[num_taps] = vld1_u16(src);
+    src += src_stride;
+
+    const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+    const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps);
+    if (is_compound) {
+      const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+      const int16x4_t d1 =
+          vqrshrn_n_s32(v_sum_1, kInterRoundBitsHorizontal - 1);
+      vst1_u16(dst16,
+               vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+      dst16 += dst_stride;
+      vst1_u16(dst16,
+               vreinterpret_u16_s16(vadd_s16(d1, vdup_n_s16(kCompoundOffset))));
+      dst16 += dst_stride;
+    } else {
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      const uint16x4_t d1 =
+          vmin_u16(vqrshrun_n_s32(v_sum_1, kFilterBits - 1), v_max_bitdepth);
+      vst1_u16(dst16, d0);
+      dst16 += dst_stride;
+      vst1_u16(dst16, d1);
+      dst16 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int filter_index>
+void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const int16x4_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  const uint16x4_t v_zero = vdup_n_u16(0);
+
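+  // Each vector holds two rows of two pixels: Load2 fills the 32-bit lanes
+  // with one row each, and vext_u16 builds the overlapping pairs so that
+  // srcs[i] begins at source row i.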
+  uint16x4_t srcs[9];
+  srcs[0] = Load2<0>(src, v_zero);
+  src += src_stride;
+  if (num_taps >= 4) {
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load2<0>(src, v_zero);
+    src += src_stride;
+    srcs[1] = vext_u16(srcs[0], srcs[2], 2);
+    if (num_taps >= 6) {
+      srcs[2] = Load2<1>(src, srcs[2]);
+      src += src_stride;
+      srcs[4] = Load2<0>(src, v_zero);
+      src += src_stride;
+      srcs[3] = vext_u16(srcs[2], srcs[4], 2);
+      if (num_taps == 8) {
+        srcs[4] = Load2<1>(src, srcs[4]);
+        src += src_stride;
+        srcs[6] = Load2<0>(src, v_zero);
+        src += src_stride;
+        srcs[5] = vext_u16(srcs[4], srcs[6], 2);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = Load2<1>(src, srcs[next_row - 1]);
+    src += src_stride;
+    srcs[num_taps] = Load2<0>(src, v_zero);
+    src += src_stride;
+    srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);
+
+    const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+    const uint16x4_t d0 =
+        vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+    Store2<0>(dst16, d0);
+    dst16 += dst_stride;
+    Store2<1>(dst16, d0);
+    dst16 += dst_stride;
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+void ConvolveVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* src = static_cast<const uint16_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride >> 1;
+  assert(vertical_filter_id != 0);
+
+  int16x4_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((vertical_filter_id == 1) | (vertical_filter_id == 7) |
+              (vertical_filter_id == 8) | (vertical_filter_id == 9) |
+              (vertical_filter_id == 15))) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 2) {
+      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else if (width == 4) {
+      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else {
+      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 3);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 || filter_index == 4 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 0 || vertical_filter_id == 2 ||
+             vertical_filter_id == 3 || vertical_filter_id == 4 ||
+             vertical_filter_id == 5 || vertical_filter_id == 6 ||
+             vertical_filter_id == 10 || vertical_filter_id == 11 ||
+             vertical_filter_id == 12 || vertical_filter_id == 13 ||
+             vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 2) {
+      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* src = static_cast<const uint16_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  int16x4_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((vertical_filter_id == 1) | (vertical_filter_id == 7) |
+              (vertical_filter_id == 8) | (vertical_filter_id == 9) |
+              (vertical_filter_id == 15))) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 4) {
+      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 3);
+    } else {
+      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 3);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 || filter_index == 4 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 4) {
+      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const auto* src = static_cast<const uint16_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int final_shift =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
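+  // Shifted left by |final_shift| below, this offset becomes
+  // (1 << (kBitdepth10 + 4)) + (1 << (kBitdepth10 + 3)), matching the
+  // kCompoundOffset value used by the other compound paths.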
+  const uint16x8_t offset =
+      vdupq_n_u16((1 << kBitdepth10) + (1 << (kBitdepth10 - 1)));
+
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      int w = width;
+      do {
+        const uint16x8_t v_src_lo = vld1q_u16(&src[x]);
+        const uint16x8_t v_src_hi = vld1q_u16(&src[x + 8]);
+        const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
+        const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
+        const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
+        const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
+        vst1q_u16(&dest[x], v_dest_lo);
+        vst1q_u16(&dest[x + 8], v_dest_hi);
+        x += 16;
+        w -= 16;
+      } while (w != 0);
+      src += src_stride;
+      dest += width;
+    } while (--y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint16x8_t v_src_lo = vld1q_u16(&src[0]);
+      const uint16x8_t v_src_hi = vld1q_u16(&src[src_stride]);
+      const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
+      const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
+      const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
+      const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
+      vst1q_u16(&dest[0], v_dest_lo);
+      vst1q_u16(&dest[8], v_dest_hi);
+      src += src_stride << 1;
+      dest += 16;
+      y -= 2;
+    } while (y != 0);
+  } else {  // width == 4
+    int y = height;
+    do {
+      const uint16x4_t v_src_lo = vld1_u16(&src[0]);
+      const uint16x4_t v_src_hi = vld1_u16(&src[src_stride]);
+      const uint16x4_t v_sum_lo = vadd_u16(v_src_lo, vget_low_u16(offset));
+      const uint16x4_t v_sum_hi = vadd_u16(v_src_hi, vget_low_u16(offset));
+      const uint16x4_t v_dest_lo = vshl_n_u16(v_sum_lo, final_shift);
+      const uint16x4_t v_dest_hi = vshl_n_u16(v_sum_hi, final_shift);
+      vst1_u16(&dest[0], v_dest_lo);
+      vst1_u16(&dest[4], v_dest_hi);
+      src += src_stride << 1;
+      dest += 8;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
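+  // convolve[is_intra_block_copy][is_compound][has_vertical][has_horizontal].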
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+}
+
+}  // namespace
+
+void ConvolveInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index 331bfe2..2373ec9 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -103,9 +103,11 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src,
 
 template <int filter_index, bool negative_outside_taps, bool is_2d,
           bool is_compound>
-void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
-                                 void* const dest, const ptrdiff_t pred_stride,
-                                 const int width, const int height,
+void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src,
+                                 const ptrdiff_t src_stride,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t pred_stride, const int width,
+                                 const int height,
                                  const uint8x8_t* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
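
The bulk of this file's diff threads LIBGAV1_RESTRICT through every source/destination pointer pair, letting the compiler assume the two buffers never alias, so vector loads can be hoisted and stores do not force re-reads. The macro lives in libgav1's compiler-attribute header; a plausible definition (an assumption, not quoted from the tree) looks like:

```cpp
// Sketch of a restrict-qualifier macro; the actual LIBGAV1_RESTRICT
// definition in libgav1 may differ in detail.
#if defined(_MSC_VER)
#define LIBGAV1_RESTRICT __restrict
#elif defined(__GNUC__) || defined(__clang__)
#define LIBGAV1_RESTRICT __restrict__
#else
#define LIBGAV1_RESTRICT
#endif
```
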
@@ -220,9 +222,11 @@ void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 template <int filter_index, bool is_2d, bool is_compound>
-void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
-                            void* const dest, const ptrdiff_t pred_stride,
-                            const int height, const uint8x8_t* const v_tap) {
+void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const uint8x8_t* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
   int y = height;
@@ -257,9 +261,11 @@ void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 template <int filter_index, bool is_2d>
-void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
-                            void* const dest, const ptrdiff_t pred_stride,
-                            const int height, const uint8x8_t* const v_tap) {
+void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const uint8x8_t* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
   int y = height >> 1;
@@ -345,10 +351,11 @@ void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
 
 template <int filter_index, bool negative_outside_taps, bool is_2d,
           bool is_compound>
-void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const uint8x8_t* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const uint8x8_t* const v_tap) {
   assert(width < 8 || filter_index <= 3);
   // Don't simplify the redundant if conditions with the template parameters;
   // keeping them redundant helps the compiler generate compact code.
@@ -484,7 +491,8 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
 }
 
 template <int num_taps, bool is_compound = false>
-void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+                                 void* LIBGAV1_RESTRICT const dst,
                                  const ptrdiff_t dst_stride, const int width,
                                  const int height, const int16x8_t taps) {
   assert(width >= 8);
@@ -560,7 +568,8 @@ void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
 
 // Take advantage of |src_stride| == |width| to process two rows at a time.
 template <int num_taps, bool is_compound = false>
-void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
                             const ptrdiff_t dst_stride, const int height,
                             const int16x8_t taps) {
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -626,7 +635,8 @@ void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
 
 // Take advantage of |src_stride| == |width| to process four rows at a time.
 template <int num_taps>
-void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
                             const ptrdiff_t dst_stride, const int height,
                             const int16x8_t taps) {
   constexpr int next_row = (num_taps < 6) ? 4 : 8;
@@ -699,9 +709,10 @@ void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
 
 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   // Duplicate the absolute value for each tap.  Negative taps are corrected
   // by using the vmlsl_u8 instruction.  Positive taps use vmlal_u8.
   uint8x8_t v_tap[kSubPixelTaps];
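
The comment in the context above is the key trick in the horizontal filters: taps are stored as magnitudes, and the sign is re-applied by choosing between multiply-accumulate and multiply-subtract. A scalar equivalent of that pattern (hypothetical helper, not from the source):

```cpp
#include <cstdint>

// Scalar equivalent of the vmlal_u8/vmlsl_u8 tap handling: multiply by the
// tap magnitude (an unsigned widening multiply), then add or subtract the
// product according to the tap's sign.
int32_t SumTapsAbs(const uint8_t* src, const uint8_t* abs_taps,
                   const bool* is_negative, int num_taps) {
  int32_t sum = 0;
  for (int k = 0; k < num_taps; ++k) {
    const int32_t product = src[k] * abs_taps[k];  // widening multiply
    sum += is_negative[k] ? -product : product;    // vmlsl_u8 vs. vmlal_u8
  }
  return sum;
}
```
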
@@ -739,9 +750,10 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
 }
 
 template <int vertical_taps>
-void Filter2DVertical(const uint16_t* const intermediate_result,
-                      const int width, const int height, const int16x8_t taps,
-                      void* const prediction, const ptrdiff_t pred_stride) {
+void Filter2DVertical(
+    const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   auto* const dest = static_cast<uint8_t*>(prediction);
   if (width >= 8) {
     Filter2DVerticalWidth8AndUp<vertical_taps>(
@@ -756,13 +768,13 @@ void Filter2DVertical(const uint16_t* const intermediate_result,
   }
 }
 
-void Convolve2D_NEON(const void* const reference,
+void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
                      const ptrdiff_t reference_stride,
                      const int horizontal_filter_index,
                      const int vertical_filter_index,
                      const int horizontal_filter_id,
                      const int vertical_filter_id, const int width,
-                     const int height, void* const prediction,
+                     const int height, void* LIBGAV1_RESTRICT const prediction,
                      const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
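
Convolve2D_NEON, whose signature changes above, is the separable two-pass path: the horizontal filter writes height + taps - 1 rows into an intermediate int16 buffer, then the vertical filter consumes columns of it. A simplified scalar model; the shift amounts assume libgav1's 8bpp parameters (3 horizontal, 11 vertical), and truncating shifts stand in for the rounded shifts the real kernels use:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Two-pass separable 2D convolution, scalar sketch. |src| must already point
// at the outermost tap position (up and left of the block).
void Convolve2D_Scalar(const uint8_t* src, ptrdiff_t src_stride,
                       const int16_t* h_taps, const int16_t* v_taps,
                       int num_taps, int width, int height,
                       int16_t* intermediate,  // width * (height+num_taps-1)
                       uint8_t* dst, ptrdiff_t dst_stride) {
  const int intermediate_height = height + num_taps - 1;
  // Pass 1: horizontal filter into the intermediate buffer.
  for (int y = 0; y < intermediate_height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < num_taps; ++k) sum += h_taps[k] * src[x + k];
      intermediate[y * width + x] = static_cast<int16_t>(sum >> 3);
    }
    src += src_stride;
  }
  // Pass 2: vertical filter over intermediate columns; |width| is the stride.
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < num_taps; ++k) {
        sum += v_taps[k] * intermediate[(y + k) * width + x];
      }
      dst[y * dst_stride + x] =
          static_cast<uint8_t>(std::clamp(sum >> 11, 0, 255));
    }
  }
}
```
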
@@ -833,12 +845,10 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
 }
 
 template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
-                                         const ptrdiff_t src_stride,
-                                         const int width, const int subpixel_x,
-                                         const int step_x,
-                                         const int intermediate_height,
-                                         int16_t* intermediate) {
+inline void ConvolveKernelHorizontal2Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
   // Account for the 0-taps that precede the 2 nonzero taps.
   const int kernel_offset = 3;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -943,8 +953,9 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
 
 // This filter is only possible when width <= 4.
 void ConvolveKernelHorizontalPositive4Tap(
-    const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
-    const int step_x, const int intermediate_height, int16_t* intermediate) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
   const int kernel_offset = 2;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1010,8 +1021,9 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
 
 // This filter is only possible when width <= 4.
 inline void ConvolveKernelHorizontalSigned4Tap(
-    const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
-    const int step_x, const int intermediate_height, int16_t* intermediate) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
   const int kernel_offset = 2;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1085,9 +1097,10 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalSigned6Tap(
-    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
-    const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* const intermediate) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
   const int kernel_offset = 1;
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1178,9 +1191,10 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) {
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalMixed6Tap(
-    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
-    const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* const intermediate) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
   const int kernel_offset = 1;
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1272,9 +1286,10 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalSigned8Tap(
-    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
-    const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* const intermediate) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1336,9 +1351,10 @@ inline void ConvolveKernelHorizontalSigned8Tap(
 
 // This function handles blocks of width 2 or 4.
 template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
-                              const int filter_index, const int step_y,
-                              const int height, void* const dest,
+void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src,
+                              const int subpixel_y, const int filter_index,
+                              const int step_y, const int height,
+                              void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   const int16_t* src_y = src;
@@ -1408,10 +1424,11 @@ void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
 }
 
 template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* const src, const int width,
-                                  const int subpixel_y, const int filter_index,
-                                  const int step_y, const int height,
-                                  void* const dest,
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const src,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
                                   const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   // A possible improvement is to use arithmetic to decide how many times to
@@ -1477,13 +1494,14 @@ inline void ConvolveVerticalScale(const int16_t* const src, const int width,
 }
 
 template <bool is_compound>
-void ConvolveScale2D_NEON(const void* const reference,
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
                           const ptrdiff_t reference_stride,
                           const int horizontal_filter_index,
                           const int vertical_filter_index, const int subpixel_x,
                           const int subpixel_y, const int step_x,
                           const int step_y, const int width, const int height,
-                          void* const prediction, const ptrdiff_t pred_stride) {
+                          void* LIBGAV1_RESTRICT const prediction,
+                          const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   assert(step_x <= 2048);
@@ -1714,14 +1732,12 @@ void ConvolveScale2D_NEON(const void* const reference,
   }
 }
 
-void ConvolveHorizontal_NEON(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int /*vertical_filter_index*/,
-                             const int horizontal_filter_id,
-                             const int /*vertical_filter_id*/, const int width,
-                             const int height, void* const prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* const src =
@@ -1741,10 +1757,11 @@ uint16x8_t Compound1DShift(const int16x8_t sum) {
 
 template <int filter_index, bool is_compound = false,
           bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
-                    void* const dst, const ptrdiff_t dst_stride,
-                    const int width, const int height,
-                    const uint8x8_t* const taps) {
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const uint8x8_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* const dst8 = static_cast<uint8_t*>(dst);
@@ -1814,9 +1831,11 @@ void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
 
 template <int filter_index, bool is_compound = false,
           bool negative_outside_taps = false>
-void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int height, const uint8x8_t* const taps) {
+void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const uint8x8_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -2001,9 +2020,11 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 template <int filter_index, bool negative_outside_taps = false>
-void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int height, const uint8x8_t* const taps) {
+void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const uint8x8_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   auto* dst8 = static_cast<uint8_t*>(dst);
 
@@ -2205,14 +2226,12 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
 // filtering is required.
 // The output is the single prediction of the block, clipped to valid pixel
 // range.
-void ConvolveVertical_NEON(const void* const reference,
-                           const ptrdiff_t reference_stride,
-                           const int /*horizontal_filter_index*/,
-                           const int vertical_filter_index,
-                           const int /*horizontal_filter_id*/,
-                           const int vertical_filter_id, const int width,
-                           const int height, void* const prediction,
-                           const ptrdiff_t pred_stride) {
+void ConvolveVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -2325,11 +2344,11 @@ void ConvolveVertical_NEON(const void* const reference,
 }
 
 void ConvolveCompoundCopy_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -2381,11 +2400,11 @@ void ConvolveCompoundCopy_NEON(
 }
 
 void ConvolveCompoundVertical_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*horizontal_filter_id*/, const int vertical_filter_id,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -2476,11 +2495,11 @@ void ConvolveCompoundVertical_NEON(
 }
 
 void ConvolveCompoundHorizontal_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   const auto* const src =
       static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -2492,9 +2511,10 @@ void ConvolveCompoundHorizontal_NEON(
 }
 
 template <int vertical_taps>
-void Compound2DVertical(const uint16_t* const intermediate_result,
-                        const int width, const int height, const int16x8_t taps,
-                        void* const prediction) {
+void Compound2DVertical(
+    const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction) {
   auto* const dest = static_cast<uint16_t*>(prediction);
   if (width == 4) {
     Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
@@ -2505,14 +2525,12 @@ void Compound2DVertical(const uint16_t* const intermediate_result,
   }
 }
 
-void ConvolveCompound2D_NEON(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int vertical_filter_index,
-                             const int horizontal_filter_id,
-                             const int vertical_filter_id, const int width,
-                             const int height, void* const prediction,
-                             const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
   uint16_t
@@ -2551,16 +2569,18 @@ void ConvolveCompound2D_NEON(const void* const reference,
   }
 }
 
-inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+                              uint8_t* LIBGAV1_RESTRICT const dst) {
   const uint8x16_t left = vld1q_u8(src);
   const uint8x16_t right = vld1q_u8(src + 1);
   vst1q_u8(dst, vrhaddq_u8(left, right));
 }
 
 template <int width>
-inline void IntraBlockCopyHorizontal(const uint8_t* src,
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                                      const ptrdiff_t src_stride,
-                                     const int height, uint8_t* dst,
+                                     const int height,
+                                     uint8_t* LIBGAV1_RESTRICT dst,
                                      const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
@@ -2601,10 +2621,11 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
 }
 
 void ConvolveIntraBlockCopyHorizontal_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+    const int /*subpixel_y*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
 
@@ -2675,9 +2696,10 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
 }
 
 template <int width>
-inline void IntraBlockCopyVertical(const uint8_t* src,
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
                                    const ptrdiff_t src_stride, const int height,
-                                   uint8_t* dst, const ptrdiff_t dst_stride) {
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
   uint8x16_t row[8], below[8];
@@ -2764,11 +2786,11 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
 }
 
 void ConvolveIntraBlockCopyVertical_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
 
@@ -2834,8 +2856,9 @@ void ConvolveIntraBlockCopyVertical_NEON(
 }
 
 template <int width>
-inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
-                             const int height, uint8_t* dst,
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+                             const ptrdiff_t src_stride, const int height,
+                             uint8_t* LIBGAV1_RESTRICT dst,
                              const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
@@ -2996,11 +3019,11 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 void ConvolveIntraBlockCopy2D_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
   // Note: allow vertical access to height + 1. Because this function is only
diff --git a/src/dsp/arm/convolve_neon.h b/src/dsp/arm/convolve_neon.h
index 948ef4d..1cac618 100644
--- a/src/dsp/arm/convolve_neon.h
+++ b/src/dsp/arm/convolve_neon.h
@@ -25,6 +25,7 @@ namespace dsp {
 
 // Initializes Dsp::convolve. This function is not thread-safe.
 void ConvolveInit_NEON();
+void ConvolveInit10bpp_NEON();
 
 }  // namespace dsp
 }  // namespace libgav1
@@ -45,6 +46,13 @@ void ConvolveInit_NEON();
 
 #define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
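
The new LIBGAV1_Dsp10bpp_Convolve* defines keep the generic layer from installing C fallbacks for the entries that Init10bpp() now fills. How such an entry is consumed, assuming libgav1's index convention convolve[is_intra_block_copy][is_compound][has_vertical_filter][has_horizontal_filter]; this is a sketch, not a verbatim decoder call site:

```cpp
#include <cstddef>

#include "src/dsp/dsp.h"  // internal header path, assumed

// Hypothetical dispatch through the table filled by Init10bpp(). GetDspTable()
// is libgav1's per-bitdepth lookup for the read-only Dsp table.
void CallConvolve10bpp(const void* reference, ptrdiff_t reference_stride,
                       bool is_compound, int horizontal_filter_index,
                       int vertical_filter_index, int horizontal_filter_id,
                       int vertical_filter_id, int width, int height,
                       void* prediction, ptrdiff_t pred_stride) {
  const libgav1::dsp::Dsp* const dsp = libgav1::dsp::GetDspTable(10);
  const bool has_horizontal = horizontal_filter_id != 0;
  const bool has_vertical = vertical_filter_id != 0;
  dsp->convolve[/*intra_block_copy=*/0][is_compound][has_vertical]
               [has_horizontal](reference, reference_stride,
                                horizontal_filter_index, vertical_filter_index,
                                horizontal_filter_id, vertical_filter_id,
                                width, height, prediction, pred_stride);
}
```
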
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
index a0cd0ac..7d287c8 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -52,11 +52,10 @@ inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
 }
 
 template <int width, int height>
-inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
-                                            const int16_t* prediction_1,
-                                            const int16x4_t weights[2],
-                                            void* const dest,
-                                            const ptrdiff_t dest_stride) {
+inline void DistanceWeightedBlendSmall_NEON(
+    const int16_t* LIBGAV1_RESTRICT prediction_0,
+    const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   constexpr int step = 16 / width;
 
@@ -94,12 +93,11 @@ inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
   }
 }
 
-inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
-                                            const int16_t* prediction_1,
-                                            const int16x4_t weights[2],
-                                            const int width, const int height,
-                                            void* const dest,
-                                            const ptrdiff_t dest_stride) {
+inline void DistanceWeightedBlendLarge_NEON(
+    const int16_t* LIBGAV1_RESTRICT prediction_0,
+    const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
+    const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
 
   int y = height;
@@ -127,12 +125,11 @@ inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
   } while (--y != 0);
 }
 
-inline void DistanceWeightedBlend_NEON(const void* prediction_0,
-                                       const void* prediction_1,
-                                       const uint8_t weight_0,
-                                       const uint8_t weight_1, const int width,
-                                       const int height, void* const dest,
-                                       const ptrdiff_t dest_stride) {
+inline void DistanceWeightedBlend_NEON(
+    const void* LIBGAV1_RESTRICT prediction_0,
+    const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
@@ -267,11 +264,12 @@ inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
   return x;
 }
 
-void DistanceWeightedBlend_NEON(const void* prediction_0,
-                                const void* prediction_1,
+void DistanceWeightedBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                                const void* LIBGAV1_RESTRICT prediction_1,
                                 const uint8_t weight_0, const uint8_t weight_1,
                                 const int width, const int height,
-                                void* const dest, const ptrdiff_t dest_stride) {
+                                void* LIBGAV1_RESTRICT const dest,
+                                const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   auto* dst = static_cast<uint16_t*>(dest);
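
Both bit depths of DistanceWeightedBlend_NEON reduce to the same per-pixel expression over compound-domain predictions; the 10bpp variant additionally has to remove the compound offset added during prediction. A scalar sketch of the 8bpp case, assuming AV1 distance weights that sum to 16 and a post-round shift of 4; both shift values are inferred from AV1's rounding derivation, not copied from this patch:

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of the 8bpp blend: weight the two compound predictions,
// normalize by the weight sum (16), apply the rounded post-shift, and clip.
uint8_t DistanceWeightedBlendPixel(int16_t pred_0, int16_t pred_1,
                                   uint8_t weight_0, uint8_t weight_1) {
  const int weighted = (pred_0 * weight_0 + pred_1 * weight_1) >> 4;
  const int res = (weighted + 8) >> 4;  // round-to-nearest post-shift
  return static_cast<uint8_t>(std::clamp(res, 0, 255));
}
```
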
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 8ee3745..7c7c6ea 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -84,8 +84,10 @@ inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
 // compute pixels that come after in the row, we have to finish the calculations
 // one at a time.
 template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
-                                     const int8_t* coeffs, int pos, int shift) {
+inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
+                                     int32x4x2_t sum,
+                                     const int8_t* LIBGAV1_RESTRICT coeffs,
+                                     int pos, int shift) {
   int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
 
   for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
@@ -99,8 +101,10 @@ inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
 template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
-                                     const int8_t* coeffs, int pos, int shift) {
+inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
+                                     int32x4x2_t sum,
+                                     const int8_t* LIBGAV1_RESTRICT coeffs,
+                                     int pos, int shift) {
   int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
 
   for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
@@ -117,12 +121,11 @@ inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
 // compute pixels that come after in the row, we have to finish the calculations
 // one at a time.
 template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
-                                           int8_t* v_grain_cursor,
-                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
-                                           const int8_t* coeffs_u,
-                                           const int8_t* coeffs_v, int pos,
-                                           int shift) {
+inline void WriteFinalAutoRegressionChroma(
+    int8_t* LIBGAV1_RESTRICT u_grain_cursor,
+    int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
   WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
       u_grain_cursor, sum_u, coeffs_u, pos, shift);
   WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
@@ -131,12 +134,11 @@ inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
 template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
-                                           int16_t* v_grain_cursor,
-                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
-                                           const int8_t* coeffs_u,
-                                           const int8_t* coeffs_v, int pos,
-                                           int shift) {
+inline void WriteFinalAutoRegressionChroma(
+    int16_t* LIBGAV1_RESTRICT u_grain_cursor,
+    int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
   WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
       u_grain_cursor, sum_u, coeffs_u, pos, shift);
   WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
@@ -224,12 +226,11 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
 
 template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
           bool use_luma>
-void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params,
-                                                  const void* luma_grain_buffer,
-                                                  int subsampling_x,
-                                                  int subsampling_y,
-                                                  void* u_grain_buffer,
-                                                  void* v_grain_buffer) {
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(
+    const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+    void* LIBGAV1_RESTRICT v_grain_buffer) {
   static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
   const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
   auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
@@ -686,11 +687,11 @@ inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
 
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageLuma_NEON(
-    const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
-    int width, int height, int start_height,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
     const uint8_t scaling_lut_y[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
-    ptrdiff_t dest_stride_y) {
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    void* LIBGAV1_RESTRICT dest_plane_y, ptrdiff_t dest_stride_y) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
   const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
@@ -741,9 +742,10 @@ void BlendNoiseWithImageLuma_NEON(
 
 template <int bitdepth, typename GrainType, typename Pixel>
 inline int16x8_t BlendChromaValsWithCfl(
-    const Pixel* average_luma_buffer,
+    const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
     const uint8_t scaling_lut[kScalingLookupTableSize],
-    const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+    const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
     const int16x8_t scaling_shift_vect16,
     const int32x4_t scaling_shift_vect32) {
   const int16x8_t scaling =
@@ -763,10 +765,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
     const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
     int width, int height, int start_height, int subsampling_x,
     int subsampling_y, int scaling_shift,
-    const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
-    ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
-    ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
-    ptrdiff_t dest_stride) {
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const Pixel* LIBGAV1_RESTRICT in_chroma_row, ptrdiff_t source_stride_chroma,
+    Pixel* LIBGAV1_RESTRICT out_chroma_row, ptrdiff_t dest_stride) {
   const int16x8_t floor = vdupq_n_s16(min_value);
   const int16x8_t ceiling = vdupq_n_s16(max_chroma);
   Pixel luma_buffer[16];
@@ -842,13 +844,13 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageChromaWithCfl_NEON(
-    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
-    int min_value, int max_chroma, int width, int height, int start_height,
-    int subsampling_x, int subsampling_y,
-    const uint8_t scaling_lut[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y,
-    const void* source_plane_uv, ptrdiff_t source_stride_uv,
-    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* LIBGAV1_RESTRICT source_plane_uv, ptrdiff_t source_stride_uv,
+    void* LIBGAV1_RESTRICT dest_plane_uv, ptrdiff_t dest_stride_uv) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
   const auto* in_y = static_cast<const Pixel*>(source_plane_y);
@@ -873,7 +875,8 @@ namespace {
 
 inline int16x8_t BlendChromaValsNoCfl(
     const uint8_t scaling_lut[kScalingLookupTableSize],
-    const uint8_t* chroma_cursor, const int8_t* noise_image_cursor,
+    const uint8_t* LIBGAV1_RESTRICT chroma_cursor,
+    const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
     const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
     const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
   uint8_t merged_buffer[8];
@@ -898,9 +901,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
     int width, int height, int start_height, int subsampling_x,
     int subsampling_y, int scaling_shift, int chroma_offset,
     int chroma_multiplier, int luma_multiplier,
-    const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
-    ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
-    ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const uint8_t* LIBGAV1_RESTRICT in_chroma_row,
+    ptrdiff_t source_stride_chroma, uint8_t* LIBGAV1_RESTRICT out_chroma_row,
     ptrdiff_t dest_stride) {
   const int16x8_t floor = vdupq_n_s16(min_value);
   const int16x8_t ceiling = vdupq_n_s16(max_chroma);
@@ -963,13 +967,13 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
 
 // This function is for the case params_.chroma_scaling_from_luma == false.
 void BlendNoiseWithImageChroma8bpp_NEON(
-    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
-    int min_value, int max_chroma, int width, int height, int start_height,
-    int subsampling_x, int subsampling_y,
-    const uint8_t scaling_lut[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y,
-    const void* source_plane_uv, ptrdiff_t source_stride_uv,
-    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* LIBGAV1_RESTRICT source_plane_uv, ptrdiff_t source_stride_uv,
+    void* LIBGAV1_RESTRICT dest_plane_uv, ptrdiff_t dest_stride_uv) {
   assert(plane == kPlaneU || plane == kPlaneV);
   const auto* noise_image =
       static_cast<const Array2D<int8_t>*>(noise_image_ptr);
@@ -989,12 +993,11 @@ void BlendNoiseWithImageChroma8bpp_NEON(
                             in_uv, source_stride_uv, out_uv, dest_stride_uv);
 }
 
-inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
-                                      const int8_t* noise_stripe_row_prev,
-                                      int plane_width,
-                                      const int8x8_t grain_coeff,
-                                      const int8x8_t old_coeff,
-                                      int8_t* noise_image_row) {
+inline void WriteOverlapLine8bpp_NEON(
+    const int8_t* LIBGAV1_RESTRICT noise_stripe_row,
+    const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    const int8x8_t grain_coeff, const int8x8_t old_coeff,
+    int8_t* LIBGAV1_RESTRICT noise_image_row) {
   int x = 0;
   do {
     // Note that these reads may exceed noise_stripe_row's width by up to 7
@@ -1009,10 +1012,10 @@ inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
   } while (x < plane_width);
 }
 
-void ConstructNoiseImageOverlap8bpp_NEON(const void* noise_stripes_buffer,
-                                         int width, int height,
-                                         int subsampling_x, int subsampling_y,
-                                         void* noise_image_buffer) {
+void ConstructNoiseImageOverlap8bpp_NEON(
+    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+    int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_image_buffer) {
   const auto* noise_stripes =
       static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
   auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
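
The film-grain changes are again almost entirely restrict annotations. Per pixel, the blend that BlendNoiseWithImageLuma_NEON vectorizes is a lut-scaled grain sample added to the source and clipped; a scalar sketch for 8bpp, assuming the 256-entry scaling lut of this snapshot (the 10bpp path additionally interpolates lut entries):

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of the 8bpp luma grain blend: scale the grain sample by the
// strength the lut assigns to this pixel's intensity, round by
// |scaling_shift|, add to the source, and clip to the legal range.
uint8_t BlendGrainPixel(uint8_t source, int8_t noise,
                        const uint8_t scaling_lut_y[256], int scaling_shift,
                        int min_value, int max_luma) {
  const int scaled =
      (scaling_lut_y[source] * noise + (1 << (scaling_shift - 1))) >>
      scaling_shift;
  return static_cast<uint8_t>(std::clamp(source + scaled, min_value, max_luma));
}
```
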
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
index 8d8748f..5e49db9 100644
--- a/src/dsp/arm/intrapred_cfl_neon.cc
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -76,7 +76,7 @@ template <int block_width, int block_height>
 void CflSubsampler420_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, const ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) {
   const auto* src = static_cast<const uint8_t*>(source);
   uint32_t sum;
   if (block_width == 4) {
@@ -173,7 +173,7 @@ template <int block_width, int block_height>
 void CflSubsampler444_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, const ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) {
   const auto* src = static_cast<const uint8_t*>(source);
   uint32_t sum;
   if (block_width == 4) {
@@ -276,7 +276,7 @@ inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
 // uint8_t. Saturated int16_t >> 6 outranges uint8_t.
 template <int block_height>
 inline void CflIntraPredictor4xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
@@ -295,7 +295,7 @@ inline void CflIntraPredictor4xN_NEON(
 
 template <int block_height>
 inline void CflIntraPredictor8xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
@@ -310,7 +310,7 @@ inline void CflIntraPredictor8xN_NEON(
 
 template <int block_height>
 inline void CflIntraPredictor16xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
@@ -328,7 +328,7 @@ inline void CflIntraPredictor16xN_NEON(
 
 template <int block_height>
 inline void CflIntraPredictor32xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
@@ -507,7 +507,8 @@ inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
 template <int block_height_log2, bool is_inside>
 void CflSubsampler444_4xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   static_assert(block_height_log2 <= 4, "");
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
@@ -568,7 +569,7 @@ template <int block_height_log2>
 void CflSubsampler444_4xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_cast<void>(max_luma_width);
   static_cast<void>(max_luma_height);
   static_assert(block_height_log2 <= 4, "");
@@ -588,7 +589,8 @@ void CflSubsampler444_4xH_NEON(
 template <int block_height_log2, bool is_inside>
 void CflSubsampler444_8xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
   const auto* src = static_cast<const uint16_t*>(source);
@@ -643,7 +645,7 @@ template <int block_height_log2>
 void CflSubsampler444_8xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_cast<void>(max_luma_width);
   static_cast<void>(max_luma_height);
   static_assert(block_height_log2 <= 5, "");
@@ -667,7 +669,7 @@ template <int block_width_log2, int block_height_log2, bool is_inside>
 void CflSubsampler444_WxH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
   const int block_width = 1 << block_width_log2;
@@ -751,7 +753,7 @@ template <int block_width_log2, int block_height_log2>
 void CflSubsampler444_WxH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_width_log2 == 4 || block_width_log2 == 5,
                 "This function will only work for block_width 16 and 32.");
   static_assert(block_height_log2 <= 5, "");
@@ -773,7 +775,7 @@ template <int block_height_log2>
 void CflSubsampler420_4xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int /*max_luma_width*/, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const auto* src = static_cast<const uint16_t*>(source);
   const ptrdiff_t src_stride = stride / sizeof(src[0]);
@@ -839,7 +841,8 @@ void CflSubsampler420_4xH_NEON(
 template <int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_8xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const auto* src = static_cast<const uint16_t*>(source);
   const ptrdiff_t src_stride = stride / sizeof(src[0]);
@@ -944,7 +947,7 @@ template <int block_height_log2>
 void CflSubsampler420_8xH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   if (max_luma_width == 8) {
     CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
                                                         source, stride);
@@ -957,7 +960,8 @@ void CflSubsampler420_8xH_NEON(
 template <int block_width_log2, int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_WxH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   const auto* src = static_cast<const uint16_t*>(source);
   const ptrdiff_t src_stride = stride / sizeof(src[0]);
   const int block_height = 1 << block_height_log2;
@@ -1062,7 +1066,7 @@ template <int block_width_log2, int block_height_log2>
 void CflSubsampler420_WxH_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   switch (max_luma_width) {
     case 8:
       CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
@@ -1109,7 +1113,7 @@ inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
 
 template <int block_height, int bitdepth = 10>
 inline void CflIntraPredictor4xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint16_t*>(dest);
@@ -1133,7 +1137,7 @@ inline void CflIntraPredictor4xN_NEON(
 
 template <int block_height, int bitdepth = 10>
 inline void CflIntraPredictor8xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint16_t*>(dest);
@@ -1153,7 +1157,7 @@ inline void CflIntraPredictor8xN_NEON(
 
 template <int block_height, int bitdepth = 10>
 inline void CflIntraPredictor16xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint16_t*>(dest);
@@ -1177,7 +1181,7 @@ inline void CflIntraPredictor16xN_NEON(
 
 template <int block_height, int bitdepth = 10>
 inline void CflIntraPredictor32xN_NEON(
-    void* const dest, const ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint16_t*>(dest);
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 3f5edbd..8ecf875 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -40,9 +40,9 @@ inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                                const uint8x8_t a_weight,
                                const uint8x8_t b_weight) {
   const uint16x8_t a_product = vmull_u8(a, a_weight);
-  const uint16x8_t b_product = vmull_u8(b, b_weight);
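+  // vmlal_u8 multiplies and accumulates in one instruction, replacing the
+  // separate vmull_u8 + vaddq_u16 pair.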
+  const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);
 
-  return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
+  return vrshrn_n_u16(sum, 5 /*log2(32)*/);
 }
 
 // For vertical operations the weights are one constant value.
@@ -52,9 +52,9 @@ inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
 }
 
 // Fill |left| and |right| with the appropriate values for a given |base_step|.
-inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
-                         const uint8x8_t right_step, uint8x8_t* left,
-                         uint8x8_t* right) {
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step, const uint8x8_t right_step,
+                         uint8x8_t* left, uint8x8_t* right) {
   const uint8x16_t mixed = vld1q_u8(source);
   *left = VQTbl1U8(mixed, left_step);
   *right = VQTbl1U8(mixed, right_step);
@@ -62,17 +62,18 @@ inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
 
 // Handle signed step arguments by ignoring the sign. Negative values are
 // considered out of range and overwritten later.
-inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step,
-                         const int8x8_t right_step, uint8x8_t* left,
-                         uint8x8_t* right) {
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+                         const int8x8_t left_step, const int8x8_t right_step,
+                         uint8x8_t* left, uint8x8_t* right) {
   LoadStepwise(source, vreinterpret_u8_s8(left_step),
                vreinterpret_u8_s8(right_step), left, right);
 }
 
 // Process 4 or 8 |width| by any |height|.
 template <int width>
-inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
-                                 const int height, const uint8_t* const top,
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint8_t* LIBGAV1_RESTRICT const top,
                                  const int xstep, const bool upsampled) {
   assert(width == 4 || width == 8);
 
@@ -142,10 +143,11 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
 
 // Process a multiple of 8 |width| by any |height|. Processes horizontally
 // before vertically in the hopes of being a little more cache friendly.
-inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
-                                 const int width, const int height,
-                                 const uint8_t* const top, const int xstep,
-                                 const bool upsampled) {
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint8_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep, const bool upsampled) {
   assert(width % 8 == 0);
   const int upsample_shift = static_cast<int>(upsampled);
   const int scale_bits = 6 - upsample_shift;
@@ -203,12 +205,10 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
   } while (++y < height);
 }
 
-void DirectionalIntraPredictorZone1_NEON(void* const dest,
-                                         const ptrdiff_t stride,
-                                         const void* const top_row,
-                                         const int width, const int height,
-                                         const int xstep,
-                                         const bool upsampled_top) {
+void DirectionalIntraPredictorZone1_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   uint8_t* dst = static_cast<uint8_t*>(dest);
 
@@ -282,11 +282,10 @@ void DirectionalIntraPredictorZone1_NEON(void* const dest,
 
 // Process 4 or 8 |width| by 4 or 8 |height|.
 template <int width>
-inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
-                                 const int height,
-                                 const uint8_t* const left_column,
-                                 const int base_left_y, const int ystep,
-                                 const int upsample_shift) {
+inline void DirectionalZone3_WxH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
+    const int ystep, const int upsample_shift) {
   assert(width == 4 || width == 8);
   assert(height == 4 || height == 8);
   const int scale_bits = 6 - upsample_shift;
@@ -417,12 +416,10 @@ constexpr int kPositiveIndexOffset = 15;
 
 // Process 4 or 8 |width| by any |height|.
 template <int width>
-inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
-                                            const ptrdiff_t stride,
-                                            const int height,
-                                            const uint8_t* const left_column,
-                                            const int16x8_t left_y,
-                                            const int upsample_shift) {
+inline void DirectionalZone2FromLeftCol_WxH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+    const int upsample_shift) {
   assert(width == 4 || width == 8);
 
   // The shift argument must be a constant.
@@ -468,12 +465,10 @@ inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
 
 // Process 4 or 8 |width| by any |height|.
 template <int width>
-inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride,
-                                      const int height,
-                                      const uint8_t* const top_row,
-                                      int zone_bounds, int top_x,
-                                      const int xstep,
-                                      const int upsample_shift) {
+inline void DirectionalZone1Blend_WxH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep, const int upsample_shift) {
   assert(width == 4 || width == 8);
 
   const int scale_bits_x = 6 - upsample_shift;
@@ -523,12 +518,12 @@ constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
 // then handle only blocks that take from |left_ptr|. Additionally, a fast
 // index-shuffle approach is used for pred values from |left_column| in sections
 // that permit it.
-inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
-                                 const uint8_t* const top_row,
-                                 const uint8_t* const left_column,
-                                 const int height, const int xstep,
-                                 const int ystep, const bool upsampled_top,
-                                 const bool upsampled_left) {
+inline void DirectionalZone2_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const bool upsampled_top,
+    const bool upsampled_left) {
   const int upsample_left_shift = static_cast<int>(upsampled_left);
   const int upsample_top_shift = static_cast<int>(upsampled_top);
 
@@ -564,8 +559,8 @@ inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
   // If the 64 scaling is regarded as a decimal point, the first value of the
   // left_y vector omits the portion which is covered under the left_column
   // offset. The following values need the full ystep as a relative offset.
-  int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
-  left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
 
   // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
   // The first stage, before the first y-loop, covers blocks that are only
@@ -639,13 +634,12 @@ inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
 }
 
 // Process a multiple of 8 |width|.
-inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
-                               const uint8_t* const top_row,
-                               const uint8_t* const left_column,
-                               const int width, const int height,
-                               const int xstep, const int ystep,
-                               const bool upsampled_top,
-                               const bool upsampled_left) {
+inline void DirectionalZone2_8(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
   const int upsample_left_shift = static_cast<int>(upsampled_left);
   const int upsample_top_shift = static_cast<int>(upsampled_top);
 
@@ -687,8 +681,8 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
   // If the 64 scaling is regarded as a decimal point, the first value of the
   // left_y vector omits the portion which is covered under the left_column
   // offset. Following values need the full ystep as a relative offset.
-  int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
-  left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
 
   // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
   // The first stage, before the first y-loop, covers blocks that are only
@@ -770,10 +764,11 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
 }
 
 void DirectionalIntraPredictorZone2_NEON(
-    void* const dest, const ptrdiff_t stride, const void* const top_row,
-    const void* const left_column, const int width, const int height,
-    const int xstep, const int ystep, const bool upsampled_top,
-    const bool upsampled_left) {
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
   // Increasing the negative buffer for this function allows more rows to be
   // processed at a time without branching in an inner loop to check the base.
   uint8_t top_buffer[288];
@@ -793,12 +788,10 @@ void DirectionalIntraPredictorZone2_NEON(
   }
 }
 
-void DirectionalIntraPredictorZone3_NEON(void* const dest,
-                                         const ptrdiff_t stride,
-                                         const void* const left_column,
-                                         const int width, const int height,
-                                         const int ystep,
-                                         const bool upsampled_left) {
+void DirectionalIntraPredictorZone3_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
   const auto* const left = static_cast<const uint8_t*>(left_column);
 
   assert(ystep > 0);
@@ -934,7 +927,8 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
 }
 
 // Each element of |dest| contains values associated with one weight value.
-inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+inline void LoadEdgeVals(uint16x4x2_t* dest,
+                         const uint16_t* LIBGAV1_RESTRICT const source,
                          const bool upsampled) {
   if (upsampled) {
     *dest = vld2_u16(source);
@@ -945,7 +939,8 @@ inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
 }
 
 // Each element of |dest| contains values associated with one weight value.
-inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+inline void LoadEdgeVals(uint16x8x2_t* dest,
+                         const uint16_t* LIBGAV1_RESTRICT const source,
                          const bool upsampled) {
   if (upsampled) {
     *dest = vld2q_u16(source);
@@ -956,8 +951,9 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
 }
 
 template <bool upsampled>
-inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
-                                 const int height, const uint16_t* const top,
+inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const top,
                                  const int xstep) {
   const int upsample_shift = static_cast<int>(upsampled);
   const int index_scale_bits = 6 - upsample_shift;
@@ -1007,9 +1003,11 @@ inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
 // Process a multiple of 8 |width| by any |height|. Processes horizontally
 // before vertically in the hopes of being a little more cache friendly.
 template <bool upsampled>
-inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
-                                 const int width, const int height,
-                                 const uint16_t* const top, const int xstep) {
+inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep) {
   assert(width % 8 == 0);
   const int upsample_shift = static_cast<int>(upsampled);
   const int index_scale_bits = 6 - upsample_shift;
@@ -1068,10 +1066,11 @@ inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
 
 // Process a multiple of 8 |width| by any |height|. Processes horizontally
 // before vertically in the hopes of being a little more cache friendly.
-inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
-                                   const int width, const int height,
-                                   const uint16_t* const top, const int xstep,
-                                   const bool upsampled) {
+inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t stride, const int width,
+                                   const int height,
+                                   const uint16_t* LIBGAV1_RESTRICT const top,
+                                   const int xstep, const bool upsampled) {
   assert(width % 8 == 0);
   const int upsample_shift = static_cast<int>(upsampled);
   const int index_scale_bits = 6 - upsample_shift;
@@ -1156,11 +1155,10 @@ inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
   }
 }
 
-void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
-                                         const void* const top_row,
-                                         const int width, const int height,
-                                         const int xstep,
-                                         const bool upsampled_top) {
+void DirectionalIntraPredictorZone1_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
   const uint16_t* const top = static_cast<const uint16_t*>(top_row);
   uint16_t* dst = static_cast<uint16_t*>(dest);
   stride /= sizeof(top[0]);
@@ -1225,9 +1223,10 @@ void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
 // 42 52 62 72             60 61 62 63
 // 43 53 63 73             70 71 72 73
 template <bool upsampled>
-inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
-                                 const uint16_t* const left, const int ystep,
-                                 const int base_left_y = 0) {
+inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
   const int upsample_shift = static_cast<int>(upsampled);
   const int index_scale_bits = 6 - upsample_shift;
 
@@ -1278,8 +1277,9 @@ inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
 }
 
 template <bool upsampled>
-inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
-                                 const int height, const uint16_t* const left,
+inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                  const int ystep) {
   const int upsample_shift = static_cast<int>(upsampled);
   int y = 0;
@@ -1292,8 +1292,9 @@ inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
 }
 
 template <bool upsampled>
-inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
-                                 const int width, const uint16_t* const left,
+inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int width,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                  const int ystep) {
   int x = 0;
   int base_left_y = 0;
@@ -1308,9 +1309,10 @@ inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
 }
 
 template <bool upsampled>
-inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
-                                 const uint16_t* const left, const int ystep,
-                                 const int base_left_y = 0) {
+inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
   const int upsample_shift = static_cast<int>(upsampled);
   const int index_scale_bits = 6 - upsample_shift;
 
@@ -1400,9 +1402,11 @@ inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
 }
 
 template <bool upsampled>
-inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
-                                 const int width, const int height,
-                                 const uint16_t* const left, const int ystep) {
+inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
   const int upsample_shift = static_cast<int>(upsampled);
   // Zone3 never runs out of left_column values.
   assert((width + height - 1) << upsample_shift >  // max_base_y
@@ -1424,12 +1428,10 @@ inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
   } while (y < height);
 }
 
-void DirectionalIntraPredictorZone3_NEON(void* const dest,
-                                         const ptrdiff_t stride,
-                                         const void* const left_column,
-                                         const int width, const int height,
-                                         const int ystep,
-                                         const bool upsampled_left) {
+void DirectionalIntraPredictorZone3_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
   const uint16_t* const left = static_cast<const uint16_t*>(left_column);
   uint8_t* dst = static_cast<uint8_t*>(dest);
 
@@ -1472,10 +1474,668 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest,
   }
 }
 
+// -----------------------------------------------------------------------------
+// Zone2
+// This function deals with cases not found in zone 1 or zone 3. The extreme
+// angles are 93, which makes for sharp ascents along |left_column| with each
+// successive dest row element until reaching |top_row|, and 177, with a shallow
+// ascent up |left_column| until reaching large jumps along |top_row|. In the
+// extremely steep cases, source vectors can only be loaded one lane at a time.
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step, const uint8x8_t right_step,
+                         uint16x4_t* left, uint16x4_t* right) {
+  const uint8x16x2_t mixed = {
+      vld1q_u8(static_cast<const uint8_t*>(source)),
+      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
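+  // The table lookup operates on bytes, so each 16-bit pixel is assembled
+  // from two adjacent byte indices in the step vectors.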
+  *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
+  *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
+}
+
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step_0,
+                         const uint8x8_t right_step_0,
+                         const uint8x8_t left_step_1,
+                         const uint8x8_t right_step_1, uint16x8_t* left,
+                         uint16x8_t* right) {
+  const uint8x16x2_t mixed = {
+      vld1q_u8(static_cast<const uint8_t*>(source)),
+      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+  const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0));
+  const uint16x4_t left_high =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1));
+  *left = vcombine_u16(left_low, left_high);
+  const uint16x4_t right_low =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0));
+  const uint16x4_t right_high =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1));
+  *right = vcombine_u16(right_low, right_high);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+                                const uint16x4_t a_weight,
+                                const uint16x4_t b_weight) {
+  const uint16x4_t a_product = vmul_u16(a, a_weight);
+  const uint16x4_t sum = vmla_u16(a_product, b, b_weight);
+
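+  // The weights sum to 32, so the rounding shift by 5 renormalizes the blend.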
+  return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16x8_t a_weight,
+                                const uint16x8_t b_weight) {
+  const uint16x8_t a_product = vmulq_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative in localized functions.
+// This is accommodated by making sure the relative indices are within [-15, 0]
+// when the function is called, and sliding them into the inclusive range
+// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
+// the byte offset for table lookups.
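+// For example, a relative pixel index of -3 becomes 12 after adding the
+// offset, which is byte index 24 for the table lookups.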
+
+constexpr int kPositiveIndexOffsetPixels = 15;
+constexpr int kPositiveIndexOffsetBytes = 30;
+
+inline void DirectionalZone2FromLeftCol_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
+    const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+
+  const int index_scale_bits = 6;
+  // The values in |offset_y| are negative, except for the first element, which
+  // is zero.
+  int16x4_t offset_y;
+  int16x4_t shift_upsampled = left_y;
+  // The shift intrinsics require a compile-time constant, so branch on
+  // |upsample_shift| rather than folding it into the shift amount.
+  if (upsample_shift) {
+    offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
+    shift_upsampled = vshl_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshr_n_s16(left_y, index_scale_bits);
+  }
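+  // Double the pixel offsets to form byte offsets for the table lookups.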
+  offset_y = vshl_n_s16(offset_y, 1);
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the
+  // right. With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by
+  // doing separate loads for |left_values| and |right_values|. vtbl
+  // supports 2 Q registers as input which would allow for cumulative
+  // offsets of 32.
+  // |sampler_0| indexes the first byte of each 16-bit value.
+  const int16x4_t sampler_0 =
+      vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
+  // |sampler_1| indexes the second byte of each 16-bit value.
+  const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
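+  // Interleave the first- and second-byte indices so the two bytes of each
+  // pixel are adjacent after narrowing to uint8.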
+  const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
+  const uint8x8_t left_indices =
+      vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
+  const uint8x8_t right_indices =
+      vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));
+
+  const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
+  const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
+  const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);
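+  // |shift_0| and |shift_1| are complementary blend weights that sum to 32.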
+
+  int y = 0;
+  do {
+    uint16x4_t src_left, src_right;
+    LoadStepwise(
+        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+        left_indices, right_indices, &src_left, &src_right);
+    const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+    Store4(dst, val);
+    dst += stride;
+  } while (++y < height);
+}
+
+inline void DirectionalZone2FromLeftCol_8xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+    const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+
+  const int index_scale_bits = 6;
+  // The values in |offset_y| are negative, except for the first element, which
+  // is zero.
+  int16x8_t offset_y = left_y;
+  int16x8_t shift_upsampled = left_y;
+  // The shift intrinsics require a compile-time constant, so branch on
+  // |upsample_shift| rather than folding it into the shift amount.
+  if (upsample_shift) {
+    offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
+    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshrq_n_s16(left_y, index_scale_bits);
+  }
+  offset_y = vshlq_n_s16(offset_y, 1);
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the right.
+  // With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by doing
+  // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+  // registers as input which would allow for cumulative offsets of 32.
+  // |sampler_0| indexes the first byte of each 16-bit value.
+  const int16x8_t sampler_0 =
+      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
+  // |sampler_1| indexes the second byte of each 16-bit value.
+  const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
+  const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
+  const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
+  const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
+  const uint8x8_t right_values_0 =
+      vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
+  const uint8x8_t right_values_1 =
+      vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));
+
+  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+  const uint16x8_t shift_0 =
+      vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
+  const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
+
+  int y = 0;
+  do {
+    uint16x8_t src_left, src_right;
+    LoadStepwise(
+        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+        left_values_0, right_values_0, left_values_1, right_values_1, &src_left,
+        &src_right);
+    const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+    Store8(dst, val);
+    dst += stride;
+  } while (++y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_4xH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  // Positions along the row that |zone_bounds| is compared against to find
+  // the blending boundary.
+  const int16x4_t indices = {0, 1, 2, 3};
+
+  uint16x4x2_t top_vals;
+  int y = height;
+  do {
+    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+    LoadEdgeVals(&top_vals, src, upsampled);
+
+    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    const uint16x4_t val =
+        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+    const uint16x4_t dst_blend = Load4U16(dest);
+    // |zone_bounds| values can be negative.
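+    // Lanes at or beyond the zone boundary take the top-row blend; earlier
+    // lanes keep the value already computed from the left column.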
+    const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
+    const uint16x4_t output = vbsl_u16(blend, val, dst_blend);
+
+    Store4(dest, output);
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (--y != 0);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_8xH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  // Positions along the row that |zone_bounds| is compared against to find
+  // the blending boundary.
+  const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  uint16x8x2_t top_vals;
+  int y = height;
+  do {
+    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+    LoadEdgeVals(&top_vals, src, upsampled);
+
+    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    const uint16x8_t val =
+        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+    const uint16x8_t dst_blend = Load8U16(dest);
+    // |zone_bounds| values can be negative.
+    const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
+    const uint16x8_t output = vbslq_u16(blend, val, dst_blend);
+
+    Store8(dest, output);
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (--y != 0);
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
+// that do not correspond to angle derivatives are left at zero.
+// Notably, in cases with upsampling, the shuffle-invalid height is always
+// greater than the prediction height (which is 8 at maximum).
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+
+  // Helper vector for index computation.
+  const int16x4_t zero_to_three = {0, 1, 2, 3};
+
+  // Loop increments for moving by block (4xN). The vertical step is still 8;
+  // if the height is only 4, the loop finishes in the first iteration.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+  // at least 3.
+  assert(xstep >= 3);
+
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  const int16x4_t left_y =
+      vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);
+
+  // This loop treats the 4 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  // Round down to the nearest multiple of 8.
+  // TODO(petersonab): Check if rounding to the nearest 4 is okay.
+  const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
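+  // |stride| is in bytes here; the uint16_t helpers take an element stride.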
+  DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
+                                      stride >> 1, max_top_only_y, top_row,
+                                      -xstep);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute.
+  const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  // +8 increment is OK because if height is 4 this only runs once.
+  for (; y < min_left_only_y;
+       y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    DirectionalZone2FromLeftCol_4xH(
+        dst, stride, min_height,
+        left_column + ((y - left_base_increment) << upsample_left_shift),
+        left_y, upsampled_left);
+
+    DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
+                                             xstep_bounds, top_x, xstep);
+  }
+
+  // Loop over y for left-only rows.
+  for (; y < height; y += 8, dst += stride8) {
+    // Angle expected by Zone3 is flipped about the 180 degree vector, which
+    // is the x-axis.
+    DirectionalZone3_4xH<upsampled_left>(
+        dst, stride, min_height, left_column + (y << upsample_left_shift),
+        -ystep);
+  }
+}
+
+// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies
+// address safety.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_Wx4(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int xstep, const int ystep) {
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int min_top_only_x = std::min((4 * xstep) >> 6, width);
+  int x = 0;
+  for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
+    uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+    // Round down to the nearest multiple of 4.
+    const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
+    if (max_top_only_y != 0) {
+      DirectionalZone1_4xH<upsampled_top>(
+          reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
+          top_row + (x << upsample_top_shift), -xstep);
+      continue;
+    }
+
+    DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
+                                         -ystep * x);
+
+    const int min_left_only_y = ((x + 4) << 6) / xstep;
+    if (min_left_only_y != 0) {
+      const int top_x = -xstep;
+      DirectionalZone1Blend_4xH<upsampled_top>(
+          dst_x, stride, 4, top_row + (x << upsample_top_shift),
+          xstep_bounds_base, top_x, xstep);
+    }
+  }
+  // Reached |min_top_only_x|.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
+        top_row + (x << upsample_top_shift), -xstep);
+  }
+}
+
+// Process a multiple of 8 |width|.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep) {
+  if (height == 4) {
+    DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, width, xstep, ystep);
+    return;
+  }
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector for index computation.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop increments for moving by block (8x8). Blocks with height 4 were
+  // already dispatched to DirectionalZone2_Wx4 above, so they never reach
+  // this point.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+  const int ystep8 = ystep << 3;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+  // at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  // For steep angles, the source pixels from |left_column| may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  const int max_shuffle_height =
+      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  int16x8_t left_y =
+      vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
+
+  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  int x = 0;
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_WxH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+        top_row + (x << upsample_top_shift), -xstep);
+
+    if (max_top_only_y == height) continue;
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip over
+    // the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_8xH(
+          dst_x, stride, 8,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsample_left_shift);
+
+      DirectionalZone1Blend_8xH<upsampled_top>(
+          dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
+          top_x, xstep);
+    }
+
+    // Pick up from the last y-value, using the slower but secure method for
+    // left prediction.
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone3_8x8<upsampled_left>(
+          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+          -ystep * x);
+
+      DirectionalZone1Blend_8xH<upsampled_top>(
+          dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
+          top_x, xstep);
+    }
+    // Loop over y for left-only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_8x8<upsampled_left>(
+          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+          -ystep * x);
+    }
+  }
+  // Reached |min_top_only_x|.
+  if (x < width) {
+    DirectionalZone1_WxH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
+        top_row + (x << upsample_top_shift), -xstep);
+  }
+}
+
+// At this angle, neither edge is upsampled.
+// |min_width| is either 4 or 8.
+template <int min_width>
+void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+                         const uint16_t* LIBGAV1_RESTRICT const top,
+                         const uint16_t* LIBGAV1_RESTRICT const left,
+                         const int width, const int height) {
+  // Row 0 is a simple copy from |top|, beginning at the corner pixel.
+  memcpy(dst, top - 1, width * sizeof(top[0]));
+  dst += stride;
+
+  // If |height| > |width|, rows from |width| onward take no pixels from
+  // |top_row|.
+  const int min_left_only_y = std::min(width, height);
+
+  int y = 1;
+  do {
+    // Example: If y is 4 (min_width), the dest row starts with left[3],
+    // left[2], left[1], left[0], because the angle points up. Therefore, load
+    // starts at left[0] and is then reversed. If y is 2, the load starts at
+    // left[-2], and is reversed to store left[1], left[0], with negative values
+    // overwritten from |top_row|.
+    const uint16_t* const load_left = left + y - min_width;
+    uint16_t* dst16 = reinterpret_cast<uint16_t*>(dst);
+
+    // Some values will be overwritten when |y| is not a multiple of
+    // |min_width|.
+    if (min_width == 4) {
+      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
+      vst1_u16(dst16, left_toward_corner);
+    } else {
+      int x = 0;
+      do {
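+        // vrev64q_u16 reverses within each 64-bit half; storing the high half
+        // first completes the full 8-element reversal.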
+        const uint16x8_t left_toward_corner =
+            vrev64q_u16(vld1q_u16(load_left - x));
+        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+        x += 8;
+      } while (x < y);
+    }
+    // Entering |top|.
+    memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
+    dst += stride;
+  } while (++y < min_left_only_y);
+
+  // Left only.
+  for (; y < height; ++y, dst += stride) {
+    uint16_t* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t* const load_left = left + y - min_width;
+
+    int x = 0;
+    if (min_width == 4) {
+      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
+      vst1_u16(dst16 + x, left_toward_corner);
+    } else {
+      do {
+        const uint16x8_t left_toward_corner =
+            vrev64q_u16(vld1q_u16(load_left - x));
+        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+        x += 8;
+      } while (x < width);
+    }
+  }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+    void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint16_t top_buffer[288];
+  uint16_t left_buffer[288];
+  memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
+         160);
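+  // Each copy is 80 pixels (160 bytes), including the 16 pixels preceding
+  // the edge so that negative indices used by steep angles stay in bounds.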
+  const uint16_t* top_ptr = top_buffer + 144;
+  const uint16_t* left_ptr = left_buffer + 144;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (width == 4) {
+    if (xstep == 64) {
+      assert(ystep == 64);
+      DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
+      return;
+    }
+    if (upsampled_top) {
+      if (upsampled_left) {
+        DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height,
+                                         xstep, ystep);
+      } else {
+        DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
+                                          height, xstep, ystep);
+      }
+    } else if (upsampled_left) {
+      DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
+                                        xstep, ystep);
+    } else {
+      DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height,
+                                         xstep, ystep);
+    }
+    return;
+  }
+
+  if (xstep == 64) {
+    assert(ystep == 64);
+    DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
+    return;
+  }
+  if (upsampled_top) {
+    if (upsampled_left) {
+      DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
+                                     height, xstep, ystep);
+    } else {
+      DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
+                                      height, xstep, ystep);
+    }
+  } else if (upsampled_left) {
+    DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
+                                    height, xstep, ystep);
+  } else {
+    DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
+                                     height, xstep, ystep);
+  }
+}
+
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
   dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
   dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
 }
 
diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h
index f7d6235..310d90b 100644
--- a/src/dsp/arm/intrapred_directional_neon.h
+++ b/src/dsp/arm/intrapred_directional_neon.h
@@ -47,6 +47,10 @@ void IntraPredDirectionalInit_NEON();
 #define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
 #endif
 
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
 #ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
 #define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
 #endif
diff --git a/src/dsp/arm/intrapred_filter_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc
index bd9f61d..f024e4f 100644
--- a/src/dsp/arm/intrapred_filter_neon.cc
+++ b/src/dsp/arm/intrapred_filter_neon.cc
@@ -85,9 +85,10 @@ alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] =
       {14, 12, 11, 10, 0, 0, 1, 1},
       {0, 0, 0, 0, 14, 12, 11, 9}}};
 
-void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column,
+void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column,
                                FilterIntraPredictor pred, int width,
                                int height) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index c143648..8ef7594 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/common.h"
 #include "src/utils/constants.h"
 
 namespace libgav1 {
@@ -56,10 +57,10 @@ struct DcPredFuncs_NEON {
 
 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
           DcStoreFunc storefn>
-void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
-                      storefn>::DcTop(void* const dest, ptrdiff_t stride,
-                                      const void* const top_row,
-                                      const void* /*left_column*/) {
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+    DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+          const void* LIBGAV1_RESTRICT const top_row,
+          const void* /*left_column*/) {
   const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
   const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
   storefn(dest, stride, dc);
@@ -67,10 +68,10 @@ void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
 
 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
           DcStoreFunc storefn>
-void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
-                      storefn>::DcLeft(void* const dest, ptrdiff_t stride,
-                                       const void* /*top_row*/,
-                                       const void* const left_column) {
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+    DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+           const void* /*top_row*/,
+           const void* LIBGAV1_RESTRICT const left_column) {
   const uint32x2_t sum =
       sumfn(left_column, block_height_log2, false, nullptr, 0);
   const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
@@ -80,8 +81,9 @@ void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
           DcStoreFunc storefn>
 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const uint32x2_t sum =
       sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
   if (block_width_log2 == block_height_log2) {
@@ -154,8 +156,9 @@ inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
 // If |use_ref_1| is false then only sum |ref_0|.
 // For |ref[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to
 // uint32_t.
-inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
-                             const bool use_ref_1, const void* ref_1,
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+                             const int ref_0_size_log2, const bool use_ref_1,
+                             const void* LIBGAV1_RESTRICT ref_1,
                              const int ref_1_size_log2) {
   const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
   const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
@@ -318,9 +321,10 @@ inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int width, int height>
-inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
-                             const void* const top_row,
-                             const void* const left_column) {
+inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const top_row,
+                             const void* LIBGAV1_RESTRICT const left_column) {
   auto* dest_u8 = static_cast<uint8_t*>(dest);
   const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
   const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
@@ -425,9 +429,10 @@ inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
       top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
 
 template <int width, int height>
-inline void Paeth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   auto* dest_u8 = static_cast<uint8_t*>(dest);
   const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
   const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
@@ -769,8 +774,9 @@ inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
 
 // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
 // If |use_ref_1| is false then only sum |ref_0|.
-inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
-                             const bool use_ref_1, const void* ref_1,
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+                             const int ref_0_size_log2, const bool use_ref_1,
+                             const void* LIBGAV1_RESTRICT ref_1,
                              const int ref_1_size_log2) {
   const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
   const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
@@ -968,9 +974,9 @@ struct DcDefs {
 // IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
 
 template <int block_height>
-void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                         const void* /*top_row*/,
-                        const void* const left_column) {
+                        const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint16_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   int y = 0;
@@ -983,9 +989,9 @@ void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                         const void* /*top_row*/,
-                        const void* const left_column) {
+                        const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint16_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   int y = 0;
@@ -998,9 +1004,9 @@ void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                          const void* /*top_row*/,
-                         const void* const left_column) {
+                         const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint16_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   int y = 0;
@@ -1020,9 +1026,9 @@ void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                          const void* /*top_row*/,
-                         const void* const left_column) {
+                         const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint16_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   int y = 0;
@@ -1048,8 +1054,8 @@ void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
 // IntraPredFuncs_NEON::Vertical -- copy top row to all rows
 
 template <int block_height>
-void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
-                      const void* const top_row,
+void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
                       const void* const /*left_column*/) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1062,8 +1068,8 @@ void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
-                      const void* const top_row,
+void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
                       const void* const /*left_column*/) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1076,8 +1082,8 @@ void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
-                       const void* const top_row,
+void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
                        const void* const /*left_column*/) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1096,8 +1102,8 @@ void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
-                       const void* const top_row,
+void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
                        const void* const /*left_column*/) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1122,8 +1128,8 @@ void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int block_height>
-void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
-                       const void* const top_row,
+void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT const top_row,
                        const void* const /*left_column*/) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1159,6 +1165,145 @@ void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
   } while (y != 0);
 }
 
+template <int height>
+inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                          const void* LIBGAV1_RESTRICT const top_ptr,
+                          const void* LIBGAV1_RESTRICT const left_ptr) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
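+  // By convention, the sample above and to the left of the block is stored at
+  // |top_row[-1]|.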
+  const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
+  const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
+  const uint16x4_t top = vld1_u16(top_row);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x4_t left = vdup_n_u16(left_col[y]);
+
+    const uint16x4_t left_dist = vabd_u16(top, top_left);
+    const uint16x4_t top_dist = vabd_u16(left, top_left);
+    const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
+
+    const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
+    const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
+    const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill the lanes not selected by |left_mask| with |top| for now; the
+    // lanes that should hold |top_left| are overwritten below.
+    uint16x4_t result = vbsl_u16(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // OR this condition into the mask; the |top| values are already in place.
+    const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbsl_u16(left_or_top_mask, result, top_left);
+
+    vst1_u16(dst16, result);
+    dst += stride;
+  }
+}
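+
+// A scalar sketch of the branchless selection above, using the Paeth base
+// value base = left + top - top_left:
+//   if (|top - top_left| <= |left - top_left| &&
+//       |top - top_left| <= |top + left - 2 * top_left|) {
+//     pred = left;       // |base - left| is smallest.
+//   } else if (|left - top_left| <= |top + left - 2 * top_left|) {
+//     pred = top;        // |base - top| is smallest.
+//   } else {
+//     pred = top_left;
+//   }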
+
+template <int height>
+inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                          const void* LIBGAV1_RESTRICT const top_ptr,
+                          const void* LIBGAV1_RESTRICT const left_ptr) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+  const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+  const uint16x8_t top = vld1q_u16(top_row);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x8_t left = vdupq_n_u16(left_col[y]);
+
+    const uint16x8_t left_dist = vabdq_u16(top, top_left);
+    const uint16x8_t top_dist = vabdq_u16(left, top_left);
+    const uint16x8_t top_left_dist =
+        vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill the lanes not selected by |left_mask| with |top| for now; the
+    // lanes that should hold |top_left| are overwritten below.
+    uint16x8_t result = vbslq_u16(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // OR this condition into the mask; the |top| values are already in place.
+    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbslq_u16(left_or_top_mask, result, top_left);
+
+    vst1q_u16(dst16, result);
+    dst += stride;
+  }
+}
+
+// For 16xH and above.
+template <int width, int height>
+inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                          const void* LIBGAV1_RESTRICT const top_ptr,
+                          const void* LIBGAV1_RESTRICT const left_ptr) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+  const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+
+  uint16x8_t top[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    top[i] = vld1q_u16(top_row + (i << 3));
+  }
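+  // |top| can now stay resident in registers for the whole block; only the
+  // |left| value changes from row to row.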
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    const uint16x8_t left = vdupq_n_u16(left_col[y]);
+    const uint16x8_t top_dist = vabdq_u16(left, top_left);
+
+    for (int i = 0; i < (width >> 3); ++i) {
+      const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
+      const uint16x8_t top_left_dist =
+          vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
+
+      const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+      const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+      const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+      // if (left_dist <= top_dist && left_dist <= top_left_dist)
+      const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+      //   dest[x] = left_column[y];
+      // Fill the lanes not selected by |left_mask| with |top| for now; the
+      // lanes that should hold |top_left| are overwritten below.
+      uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
+      // else if (top_dist <= top_left_dist)
+      //   dest[x] = top_row[x];
+      // OR this condition into the mask; the |top| values are already in place.
+      const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+      // else
+      //   dest[x] = top_left;
+      result = vbslq_u16(left_or_top_mask, result, top_left);
+
+      vst1q_u16(dst_x, result);
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
@@ -1170,6 +1315,8 @@ void Init10bpp() {
       DcDefs::_4x4::Dc;
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
       Vertical4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Paeth4xH_NEON<4>;
 
   // 4x8
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -1182,6 +1329,8 @@ void Init10bpp() {
       Horizontal4xH_NEON<8>;
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
       Vertical4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      Paeth4xH_NEON<8>;
 
   // 4x16
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -1194,6 +1343,8 @@ void Init10bpp() {
       Horizontal4xH_NEON<16>;
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
       Vertical4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      Paeth4xH_NEON<16>;
 
   // 8x4
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -1204,6 +1355,8 @@ void Init10bpp() {
       DcDefs::_8x4::Dc;
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
       Vertical8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<4>;
 
   // 8x8
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1216,6 +1369,8 @@ void Init10bpp() {
       Horizontal8xH_NEON<8>;
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
       Vertical8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<8>;
 
   // 8x16
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1226,6 +1381,8 @@ void Init10bpp() {
       DcDefs::_8x16::Dc;
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
       Vertical8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<16>;
 
   // 8x32
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1238,6 +1395,8 @@ void Init10bpp() {
       Horizontal8xH_NEON<32>;
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
       Vertical8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      Paeth8xH_NEON<32>;
 
   // 16x4
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1248,6 +1407,8 @@ void Init10bpp() {
       DcDefs::_16x4::Dc;
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
       Vertical16xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 4>;
 
   // 16x8
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1260,6 +1421,8 @@ void Init10bpp() {
       Horizontal16xH_NEON<8>;
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
       Vertical16xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 8>;
 
   // 16x16
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1270,6 +1433,8 @@ void Init10bpp() {
       DcDefs::_16x16::Dc;
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
       Vertical16xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 16>;
 
   // 16x32
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1280,6 +1445,8 @@ void Init10bpp() {
       DcDefs::_16x32::Dc;
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
       Vertical16xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 32>;
 
   // 16x64
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1290,6 +1457,8 @@ void Init10bpp() {
       DcDefs::_16x64::Dc;
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
       Vertical16xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      PaethWxH_NEON<16, 64>;
 
   // 32x8
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1300,6 +1469,8 @@ void Init10bpp() {
       DcDefs::_32x8::Dc;
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
       Vertical32xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 8>;
 
   // 32x16
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1310,6 +1481,8 @@ void Init10bpp() {
       DcDefs::_32x16::Dc;
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
       Vertical32xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 16>;
 
   // 32x32
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1320,6 +1493,8 @@ void Init10bpp() {
       DcDefs::_32x32::Dc;
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
       Vertical32xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 32>;
 
   // 32x64
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1332,6 +1507,8 @@ void Init10bpp() {
       Horizontal32xH_NEON<64>;
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
       Vertical32xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      PaethWxH_NEON<32, 64>;
 
   // 64x16
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1342,6 +1519,8 @@ void Init10bpp() {
       DcDefs::_64x16::Dc;
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
       Vertical64xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      PaethWxH_NEON<64, 16>;
 
   // 64x32
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1352,6 +1531,8 @@ void Init10bpp() {
       DcDefs::_64x32::Dc;
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
       Vertical64xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      PaethWxH_NEON<64, 32>;
 
   // 64x64
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1362,6 +1543,8 @@ void Init10bpp() {
       DcDefs::_64x64::Dc;
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
       Vertical64xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      PaethWxH_NEON<64, 64>;
 }
 
 }  // namespace
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
index b27f29f..5a56924 100644
--- a/src/dsp/arm/intrapred_neon.h
+++ b/src/dsp/arm/intrapred_neon.h
@@ -152,6 +152,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 4x8
 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -161,6 +162,7 @@ void IntraPredInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 4x16
 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -170,6 +172,7 @@ void IntraPredInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 8x4
 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -177,6 +180,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 8x8
 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -186,6 +190,7 @@ void IntraPredInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 8x16
 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -193,6 +198,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 8x32
 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -202,6 +208,7 @@ void IntraPredInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 16x4
 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -209,6 +216,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 16x8
 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -218,6 +226,7 @@ void IntraPredInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 16x16
 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -226,6 +235,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 16x32
 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -234,6 +244,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 16x64
 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -242,6 +253,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 32x8
 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -249,6 +261,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 32x16
 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -257,6 +270,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 32x32
 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -265,6 +279,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 32x64
 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -275,6 +290,7 @@ void IntraPredInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 64x16
 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -283,6 +299,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 64x32
 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -291,6 +308,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
 
 // 64x64
 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
@@ -299,6 +317,7 @@ void IntraPredInit_NEON();
 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
   LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index c33f333..854bd73 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -26,6 +26,7 @@
 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/common.h"
 #include "src/utils/constants.h"
 
 namespace libgav1 {
@@ -38,20 +39,8 @@ namespace {
 // to have visibility of the values. This helps reduce loads and aids the
 // creation of the inverse weights.
 constexpr uint8_t kSmoothWeights[] = {
-    // block dimension = 4
-    255, 149, 85, 64,
-    // block dimension = 8
-    255, 197, 146, 105, 73, 50, 37, 32,
-    // block dimension = 16
-    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-    // block dimension = 32
-    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-    // block dimension = 64
-    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
-    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
-    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+#include "src/dsp/smooth_weights.inc"
+};
 
 // TODO(b/150459137): Keeping the intermediate values in uint16_t would allow
 // processing more values at once. At the high end, it could do 4x4 or 8x2 at a
@@ -67,9 +56,10 @@ inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
 }
 
 template <int width, int height>
-inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
-                              const void* const top_row,
-                              const void* const left_column) {
+inline void Smooth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   const uint8_t* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t top_right = top[width - 1];
@@ -148,9 +138,10 @@ inline uint8x16_t CalculateWeightsAndPred(
 }
 
 template <int width, int height>
-inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+inline void Smooth16PlusxN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   const uint8_t* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t top_right = top[width - 1];
@@ -229,9 +220,10 @@ inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int width, int height>
-inline void SmoothVertical4Or8xN_NEON(void* const dest, ptrdiff_t stride,
-                                      const void* const top_row,
-                                      const void* const left_column) {
+inline void SmoothVertical4Or8xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   const uint8_t* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t bottom_left = left[height - 1];
@@ -279,9 +271,10 @@ inline uint8x16_t CalculateVerticalWeightsAndPred(
 }
 
 template <int width, int height>
-inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride,
-                                        const void* const top_row,
-                                        const void* const left_column) {
+inline void SmoothVertical16PlusxN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   const uint8_t* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t bottom_left = left[height - 1];
@@ -330,9 +323,10 @@ inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride,
 }
 
 template <int width, int height>
-inline void SmoothHorizontal4Or8xN_NEON(void* const dest, ptrdiff_t stride,
-                                        const void* const top_row,
-                                        const void* const left_column) {
+inline void SmoothHorizontal4Or8xN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   const uint8_t* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t top_right = top[width - 1];
@@ -382,9 +376,10 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred(
 }
 
 template <int width, int height>
-inline void SmoothHorizontal16PlusxN_NEON(void* const dest, ptrdiff_t stride,
-                                          const void* const top_row,
-                                          const void* const left_column) {
+inline void SmoothHorizontal16PlusxN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const uint8_t* const top = static_cast<const uint8_t*>(top_row);
   const uint8_t* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t top_right = top[width - 1];
@@ -601,7 +596,535 @@ void Init8bpp() {
 }  // namespace
 }  // namespace low_bitdepth
 
-void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and aids the
+// creation of the inverse weights.
+constexpr uint16_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+template <int height>
+inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const void* LIBGAV1_RESTRICT const top_row,
+                           const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[3];
+  const uint16_t bottom_left = left[height - 1];
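+  // Weights for block dimension N start at offset 4 + 8 + ... + N/2 == N - 4
+  // in the cumulative table, hence |kSmoothWeights + height - 4| here.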
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_v = vld1_u16(top);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
+  const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v);
+
+  // Weighted top right doesn't change with each row.
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+  for (int y = 0; y < height; ++y) {
+    // Each variable in the running summation is named for the last item to be
+    // accumulated.
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_top, weights_x_v, left[y]);
+    const uint32x4_t weighted_bl =
+        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
+    const uint16x4_t pred = vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1);
+    vst1_u16(reinterpret_cast<uint16_t*>(dst), pred);
+    dst += stride;
+  }
+}
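+
+// A scalar sketch of the blend above: the weights and their complements sum
+// to 256 (1 << kSmoothWeightScale), and two weighted averages are added
+// together, so the rounding shift is kSmoothWeightScale + 1:
+//   pred[x] = RightShiftWithRounding(
+//       weights_y[y] * top[x] + (256 - weights_y[y]) * bottom_left +
+//       weights_x[x] * left[y] + (256 - weights_x[x]) * top_right,
+//       kSmoothWeightScale + 1);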
+
+// Common code between 8xH and [16|32|64]xH.
+inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
+                           const uint32x4_t& weighted_corners_low,
+                           const uint32x4_t& weighted_corners_high,
+                           const uint16x4x2_t& top_vals,
+                           const uint16x4x2_t& weights_x, const uint16_t left_y,
+                           const uint16_t weight_y) {
+  // Each variable in the running summation is named for the last item to be
+  // accumulated.
+  const uint32x4_t weighted_top_low =
+      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+  const uint32x4_t weighted_edges_low =
+      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+  const uint16x4_t pred_low =
+      vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1);
+  vst1_u16(dst, pred_low);
+
+  const uint32x4_t weighted_top_high =
+      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+  const uint32x4_t weighted_edges_high =
+      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+  const uint16x4_t pred_high =
+      vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1);
+  vst1_u16(dst + 4, pred_high);
+}
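+
+// Callers must fold both corner products (the |top_right| and |bottom_left|
+// terms) into |weighted_corners_low/high| before each call; CalculatePred8
+// only accumulates the row-varying |top| and |left| terms.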
+
+template <int height>
+inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const void* LIBGAV1_RESTRICT const top_row,
+                           const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[7];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)};
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+                                  vld1_u16(kSmoothWeights + 8)};
+  // Weighted top right doesn't change with each row.
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_corners_low =
+        vaddq_u32(weighted_bl, weighted_tr_low);
+    const uint32x4_t weighted_corners_high =
+        vaddq_u32(weighted_bl, weighted_tr_high);
+    CalculatePred8(reinterpret_cast<uint16_t*>(dst), weighted_corners_low,
+                   weighted_corners_high, top_vals, weights_x, left[y],
+                   weights_y[y]);
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const void* LIBGAV1_RESTRICT const top_row,
+                           const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[width - 1];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weight_scaling = vdup_n_u16(256);
+  // Precompute weighted values that don't vary with |y|.
+  uint32x4_t weighted_tr_low[width >> 3];
+  uint32x4_t weighted_tr_high[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
+    weighted_tr_low[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right);
+    const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
+    weighted_tr_high[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right);
+  }
+
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  for (int y = 0; y < height; ++y) {
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    uint16_t* dst_x = reinterpret_cast<uint16_t*>(dst);
+    for (int i = 0; i < width >> 3; ++i) {
+      const int x = i << 3;
+      const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+      const uint32x4_t weighted_corners_low =
+          vaddq_u32(weighted_bl, weighted_tr_low[i]);
+      const uint32x4_t weighted_corners_high =
+          vaddq_u32(weighted_bl, weighted_tr_high[i]);
+      // Accumulate weighted edge values and store.
+      const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x),
+                                      vld1_u16(kSmoothWeights + width + x)};
+      CalculatePred8(dst_x, weighted_corners_low, weighted_corners_high,
+                     top_vals, weights_x, left[y], weights_y[y]);
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void SmoothVertical4xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_v = vld1_u16(top);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    uint16_t* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_top, kSmoothWeightScale));
+
+    dst += stride;
+  }
+}
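+
+// Scalar sketch of the vertical-only blend above:
+//   pred[x] = RightShiftWithRounding(
+//       weights_y[y] * top[x] + (256 - weights_y[y]) * bottom_left,
+//       kSmoothWeightScale);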
+
+template <int height>
+inline void SmoothVertical8xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_low = vld1_u16(top);
+  const uint16x4_t top_high = vld1_u16(top + 4);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    uint16_t* dst16 = reinterpret_cast<uint16_t*>(dst);
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    const uint32x4_t weighted_top_low =
+        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+    const uint32x4_t weighted_top_high =
+        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+    vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+inline void SmoothVerticalWxH_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  uint16x4x2_t top_vals[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    top_vals[i] = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+  }
+
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  for (int y = 0; y < height; ++y) {
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    uint16_t* dst_x = reinterpret_cast<uint16_t*>(dst);
+    for (int i = 0; i < width >> 3; ++i) {
+      const uint32x4_t weighted_top_low =
+          vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);
+      vst1_u16(dst_x, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+      const uint32x4_t weighted_top_high =
+          vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);
+      vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void SmoothHorizontal4xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[3];
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
+  const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x);
+
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+  for (int y = 0; y < height; ++y) {
+    uint16_t* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_tr, weights_x, left[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_left, kSmoothWeightScale));
+    dst += stride;
+  }
+}
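+
+// Scalar sketch of the horizontal-only blend above, which mixes |left| with
+// |top_right| and ignores the rest of the top row entirely:
+//   pred[x] = RightShiftWithRounding(
+//       weights_x[x] * left[y] + (256 - weights_x[x]) * top_right,
+//       kSmoothWeightScale);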
+
+template <int height>
+inline void SmoothHorizontal8xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[7];
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+                                  vld1_u16(kSmoothWeights + 8)};
+
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    uint16_t* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t left_y = left[y];
+    const uint32x4_t weighted_left_low =
+        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+    const uint32x4_t weighted_left_high =
+        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+    vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+inline void SmoothHorizontalWxH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[width - 1];
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weight_scaling = vdup_n_u16(256);
+
+  uint16x4_t weights_x_low[width >> 3];
+  uint16x4_t weights_x_high[width >> 3];
+  uint32x4_t weighted_tr_low[width >> 3];
+  uint32x4_t weighted_tr_high[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
+    weighted_tr_low[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right);
+    weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
+    weighted_tr_high[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right);
+  }
+
+  for (int y = 0; y < height; ++y) {
+    uint16_t* dst_x = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t left_y = left[y];
+    for (int i = 0; i < width >> 3; ++i) {
+      const uint32x4_t weighted_left_low =
+          vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);
+      vst1_u16(dst_x, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+      const uint32x4_t weighted_left_high =
+          vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);
+      vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 64>;
+}
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredSmoothInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h
index edd01be..28b5bd5 100644
--- a/src/dsp/arm/intrapred_smooth_neon.h
+++ b/src/dsp/arm/intrapred_smooth_neon.h
@@ -144,6 +144,131 @@ void IntraPredSmoothInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
   LIBGAV1_CPU_NEON
+
+// 10bpp
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
index ff184a1..f153e31 100644
--- a/src/dsp/arm/inverse_transform_10bit_neon.cc
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -67,7 +67,8 @@ LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
 
 //------------------------------------------------------------------------------
 template <int store_count>
-LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
                                     const int32x4_t* const s) {
   assert(store_count % 4 == 0);
   for (int i = 0; i < store_count; i += 4) {
@@ -79,8 +80,8 @@ LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
 }
 
 template <int load_count>
-LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
-                                   int32_t idx, int32x4_t* x) {
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, int32x4_t* x) {
   assert(load_count % 4 == 0);
   for (int i = 0; i < load_count; i += 4) {
     x[i] = vld1q_s32(&src[i * stride + idx]);
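
The LIBGAV1_RESTRICT qualifiers threaded through the pointer parameters in
these hunks let the compiler assume the source and destination buffers never
alias, which helps it keep values in registers across the NEON loads and
stores. A minimal sketch of such a macro, assuming it simply wraps the
compiler's nonstandard restrict keyword (the real definition lives elsewhere
in the tree):

// Sketch only, not the library's verbatim definition.
#if defined(_MSC_VER)
#define LIBGAV1_RESTRICT __restrict
#else
#define LIBGAV1_RESTRICT __restrict__
#endif
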
@@ -1517,59 +1518,23 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
 template <int identity_size>
 LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
     Array2DView<uint16_t> frame, const int start_x, const int start_y,
-    const int tx_width, const int tx_height, const int32_t* source) {
-  static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+    const int tx_width, const int tx_height,
+    const int32_t* LIBGAV1_RESTRICT source) {
+  static_assert(identity_size == 4 || identity_size == 8 ||
+                    identity_size == 16 || identity_size == 32,
                 "Invalid identity_size.");
   const int stride = frame.columns();
-  uint16_t* dst = frame[start_y] + start_x;
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
   const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
 
-  if (tx_width == 4) {
-    int i = 0;
-    do {
-      int32x4x2_t v_src, v_dst_i, a, b;
-      v_src.val[0] = vld1q_s32(&source[i * 4]);
-      v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
-      if (identity_size == 4) {
-        v_dst_i.val[0] =
-            vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
-        v_dst_i.val[1] =
-            vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
-        a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
-        a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
-      } else if (identity_size == 8) {
-        v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
-        v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
-        a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
-        a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
-      } else {  // identity_size == 16
-        v_dst_i.val[0] =
-            vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
-        v_dst_i.val[1] =
-            vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
-        a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
-        a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
-      }
-      uint16x4x2_t frame_data;
-      frame_data.val[0] = vld1_u16(dst);
-      frame_data.val[1] = vld1_u16(dst + stride);
-      b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
-      b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
-      vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
-      vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
-      dst += stride << 1;
-      i += 2;
-    } while (i < tx_height);
-  } else {
-    int i = 0;
-    do {
-      const int row = i * tx_width;
-      int j = 0;
+  if (identity_size < 32) {
+    if (tx_width == 4) {
+      int i = 0;
       do {
         int32x4x2_t v_src, v_dst_i, a, b;
-        v_src.val[0] = vld1q_s32(&source[row + j]);
-        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+        v_src.val[0] = vld1q_s32(&source[i * 4]);
+        v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
         if (identity_size == 4) {
           v_dst_i.val[0] =
               vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
@@ -1591,13 +1556,72 @@ LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
           a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
         }
         uint16x4x2_t frame_data;
-        frame_data.val[0] = vld1_u16(dst + j);
-        frame_data.val[1] = vld1_u16(dst + j + 4);
+        frame_data.val[0] = vld1_u16(dst);
+        frame_data.val[1] = vld1_u16(dst + stride);
         b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
         b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
-        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
-        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
-        j += 8;
+        vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+        vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+        dst += stride << 1;
+        i += 2;
+      } while (i < tx_height);
+    } else {
+      int i = 0;
+      do {
+        const int row = i * tx_width;
+        int j = 0;
+        do {
+          int32x4x2_t v_src, v_dst_i, a, b;
+          v_src.val[0] = vld1q_s32(&source[row + j]);
+          v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+          if (identity_size == 4) {
+            v_dst_i.val[0] =
+                vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+            v_dst_i.val[1] =
+                vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+            a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+            a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+          } else if (identity_size == 8) {
+            v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+            v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+            a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+            a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+          } else {  // identity_size == 16
+            v_dst_i.val[0] =
+                vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+            v_dst_i.val[1] =
+                vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+            a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+            a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+          }
+          uint16x4x2_t frame_data;
+          frame_data.val[0] = vld1_u16(dst + j);
+          frame_data.val[1] = vld1_u16(dst + j + 4);
+          b.val[0] =
+              vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+          b.val[1] =
+              vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+          vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+          vst1_u16(dst + j + 4,
+                   vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+          j += 8;
+        } while (j < tx_width);
+        dst += stride;
+      } while (++i < tx_height);
+    }
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const int32x4_t v_dst_i = vld1q_s32(&source[row + j]);
+        const uint16x4_t frame_data = vld1_u16(dst + j);
+        const int32x4_t a = vrshrq_n_s32(v_dst_i, 2);
+        const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+        const uint16x4_t d = vmin_u16(vqmovun_s32(b), v_max_bitdepth);
+        vst1_u16(dst + j, d);
+        j += 4;
       } while (j < tx_width);
       dst += stride;
     } while (++i < tx_height);
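
The new identity_size == 32 branch is simpler than the 4/8/16 paths: the
identity32 scale has already been applied in the row pass, so the column pass
reduces to a rounding shift, an accumulate into the frame, and a clamp. A
scalar model of what each lane computes (a sketch, assuming the usual 10-bit
clamp; the helper name is illustrative):

#include <algorithm>
#include <cstdint>

// Scalar model of the identity_size == 32 column store above.
void Identity32ColumnScalar(uint16_t* dst, int stride, const int32_t* source,
                            int tx_width, int tx_height) {
  for (int i = 0; i < tx_height; ++i) {
    for (int j = 0; j < tx_width; ++j) {
      const int32_t a = (source[i * tx_width + j] + 2) >> 2;  // vrshrq_n_s32
      const int32_t b = dst[j] + a;                           // vaddw_s16
      dst[j] = static_cast<uint16_t>(std::clamp(b, 0, (1 << 10) - 1));
    }
    dst += stride;
  }
}
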
@@ -1606,9 +1630,10 @@ LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
 
 LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
     Array2DView<uint16_t> frame, const int start_x, const int start_y,
-    const int tx_width, const int tx_height, const int32_t* source) {
+    const int tx_width, const int tx_height,
+    const int32_t* LIBGAV1_RESTRICT source) {
   const int stride = frame.columns();
-  uint16_t* dst = frame[start_y] + start_x;
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
   const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
 
@@ -1747,6 +1772,119 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
   return true;
 }
 
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+                                                const int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // ((A * 4) + 1) >> 1 to (A * 2).
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 32; j += 4) {
+      const int32x4_t v_src = vld1q_s32(&dst[i * step + j]);
+      const int32x4_t v_dst_i = vqaddq_s32(v_src, v_src);
+      vst1q_s32(&dst[i * step + j], v_dst_i);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x2_t v_src0 = vdup_n_s32(dst[0]);
+  const int32x2_t v_src =
+      vqrdmulh_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // ((A * 4) + 1) >> 1 to (A * 2).
+  const int32x2_t v_dst_0 = vqadd_s32(v_src, v_src);
+  vst1_lane_s32(dst, v_dst_0, 0);
+  return true;
+}
+
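The simplification both functions rely on is exact rather than approximate:
4 * A is always even, so (4 * A + 1) >> 1 floors back to exactly 2 * A, and
the vector code can double with a saturating add instead of multiplying and
shifting. A quick compile-time check:

// Sketch: the identity32 row simplification holds for any integer A.
static_assert(((7 * 4) + 1) >> 1 == 7 * 2, "(4A + 1) >> 1 == 2A");
static_assert(((100 * 4) + 1) >> 1 == 100 * 2, "(4A + 1) >> 1 == 2A");
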
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint16_t* LIBGAV1_RESTRICT dst,
+                                     const int dst_stride,
+                                     const void* LIBGAV1_RESTRICT source,
+                                     const int adjusted_tx_height) {
+  const auto* const src = static_cast<const int32_t*>(source);
+  int32x4_t s[4];
+
+  if (adjusted_tx_height == 1) {
+    // Special case: only src[0] is nonzero.
+    //   src[0]  0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //       0   0   0   0
+    //
+    // After the row and column transforms are applied, we have:
+    //       f   h   h   h
+    //       g   i   i   i
+    //       g   i   i   i
+    //       g   i   i   i
+    // where f, g, h, i are computed as follows.
+    int32_t f = (src[0] >> 2) - (src[0] >> 3);
+    const int32_t g = f >> 1;
+    f = f - (f >> 1);
+    const int32_t h = (src[0] >> 3) - (src[0] >> 4);
+    const int32_t i = (src[0] >> 4);
+    s[0] = vdupq_n_s32(h);
+    s[0] = vsetq_lane_s32(f, s[0], 0);
+    s[1] = vdupq_n_s32(i);
+    s[1] = vsetq_lane_s32(g, s[1], 0);
+    s[2] = s[3] = s[1];
+  } else {
+    // Load the 4x4 source in transposed form.
+    int32x4x4_t columns = vld4q_s32(src);
+
+    // Shift right and permute the columns for the WHT.
+    s[0] = vshrq_n_s32(columns.val[0], 2);
+    s[2] = vshrq_n_s32(columns.val[1], 2);
+    s[3] = vshrq_n_s32(columns.val[2], 2);
+    s[1] = vshrq_n_s32(columns.val[3], 2);
+
+    // Row transforms.
+    s[0] = vaddq_s32(s[0], s[2]);
+    s[3] = vsubq_s32(s[3], s[1]);
+    int32x4_t e = vhsubq_s32(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsubq_s32(e, s[1]);
+    s[2] = vsubq_s32(e, s[2]);
+    s[0] = vsubq_s32(s[0], s[1]);
+    s[3] = vaddq_s32(s[3], s[2]);
+
+    int32x4_t x[4];
+    Transpose4x4(s, x);
+
+    s[0] = x[0];
+    s[2] = x[1];
+    s[3] = x[2];
+    s[1] = x[3];
+
+    // Column transforms.
+    s[0] = vaddq_s32(s[0], s[2]);
+    s[3] = vsubq_s32(s[3], s[1]);
+    e = vhsubq_s32(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsubq_s32(e, s[1]);
+    s[2] = vsubq_s32(e, s[2]);
+    s[0] = vsubq_s32(s[0], s[1]);
+    s[3] = vaddq_s32(s[3], s[2]);
+  }
+
+  // Store to frame.
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  for (int row = 0; row < 4; row += 1) {
+    const uint16x4_t frame_data = vld1_u16(dst);
+    const int32x4_t b = vaddw_s16(s[row], vreinterpret_s16_u16(frame_data));
+    vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+    dst += dst_stride;
+  }
+}
+
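For reference, one 1-D pass of the WHT4 butterfly that the vector code runs
on four rows at a time, written out in scalar form (a sketch; the input
permutation matches the loads above, where column 0 lands in s[0], column 1
in s[2], column 2 in s[3], and column 3 in s[1]):

// Scalar sketch of a single 1-D WHT4 pass as implemented above.
inline void Wht4Pass(int32_t s[4]) {
  s[0] += s[2];
  s[3] -= s[1];
  const int32_t e = (s[0] - s[3]) >> 1;  // vhsubq_s32
  s[1] = e - s[1];
  s[2] = e - s[2];
  s[0] -= s[1];
  s[3] += s[2];
}
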
 //------------------------------------------------------------------------------
 // row/column transform loops
 
@@ -1837,11 +1975,12 @@ LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
 template <int tx_height, bool enable_flip_rows = false>
 LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
     Array2DView<uint16_t> frame, const int start_x, const int start_y,
-    const int tx_width, const int32_t* source, TransformType tx_type) {
+    const int tx_width, const int32_t* LIBGAV1_RESTRICT source,
+    TransformType tx_type) {
   const bool flip_rows =
       enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
   const int stride = frame.columns();
-  uint16_t* dst = frame[start_y] + start_x;
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
 
   if (tx_width == 4) {
     for (int i = 0; i < tx_height; ++i) {
@@ -1909,8 +2048,10 @@ void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
 }
 
 void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                  int adjusted_tx_height, void* src_buffer,
-                                  int start_x, int start_y, void* dst_frame) {
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -1962,8 +2103,10 @@ void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
 }
 
 void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                  int adjusted_tx_height, void* src_buffer,
-                                  int start_x, int start_y, void* dst_frame) {
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2014,8 +2157,10 @@ void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2066,8 +2211,10 @@ void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2117,8 +2264,10 @@ void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2168,8 +2317,10 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2222,8 +2373,10 @@ void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2275,8 +2428,10 @@ void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
 
 void Adst16TransformLoopColumn_NEON(TransformType tx_type,
                                     TransformSize tx_size,
-                                    int adjusted_tx_height, void* src_buffer,
-                                    int start_x, int start_y, void* dst_frame) {
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2335,9 +2490,10 @@ void Identity4TransformLoopRow_NEON(TransformType tx_type,
 
 void Identity4TransformLoopColumn_NEON(TransformType tx_type,
                                        TransformSize tx_size,
-                                       int adjusted_tx_height, void* src_buffer,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
                                        int start_x, int start_y,
-                                       void* dst_frame) {
+                                       void* LIBGAV1_RESTRICT dst_frame) {
   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
@@ -2416,9 +2572,10 @@ void Identity8TransformLoopRow_NEON(TransformType tx_type,
 
 void Identity8TransformLoopColumn_NEON(TransformType tx_type,
                                        TransformSize tx_size,
-                                       int adjusted_tx_height, void* src_buffer,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
                                        int start_x, int start_y,
-                                       void* dst_frame) {
+                                       void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2457,8 +2614,9 @@ void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
 void Identity16TransformLoopColumn_NEON(TransformType tx_type,
                                         TransformSize tx_size,
                                         int adjusted_tx_height,
-                                        void* src_buffer, int start_x,
-                                        int start_y, void* dst_frame) {
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int32_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2470,6 +2628,80 @@ void Identity16TransformLoopColumn_NEON(TransformType tx_type,
                                  adjusted_tx_height, src);
 }
 
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from ((A * 4) + 2) >> 2 to A.
+  if ((tx_height & 0x28) != 0) {
+    return;
+  }
+
+  // Process kTransformSize32x16. The src is always rounded before the identity
+  // transform and shifted by 1 afterwards.
+  auto* src = static_cast<int32_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = adjusted_tx_height;
+  do {
+    Identity32Row16_NEON(src, /*step=*/32);
+    src += 128;
+    i -= 4;
+  } while (i != 0);
+}
+
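The 0x28 test above is a bit mask rather than a threshold: 0x28 == 8 | 32, so
it fires exactly for tx_height 8 and 32, where (4 * A + 2) >> 2 == A means the
row pass is a no-op; only 32x16 falls through to Identity32Row16_NEON. A quick
check:

// Sketch: the early-return mask and the no-op identity hold exactly.
static_assert((8 & 0x28) != 0 && (32 & 0x28) != 0 && (16 & 0x28) == 0,
              "0x28 selects tx_height 8 and 32");
static_assert(((9 * 4) + 2) >> 2 == 9, "(4A + 2) >> 2 == A");
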
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+                               int /*adjusted_tx_height*/, void* /*src_buffer*/,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<int32_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  uint16_t* dst = frame[start_y] + start_x;
+  const int dst_stride = frame.columns();
+  Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
 //------------------------------------------------------------------------------
 
 void Init10bpp() {
@@ -2524,6 +2756,16 @@ void Init10bpp() {
       Identity16TransformLoopRow_NEON;
   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
       Identity16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+      Identity32TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      Identity32TransformLoopColumn_NEON;
+
+  // Maximum transform size for Wht is 4.
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+      Wht4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      Wht4TransformLoopColumn_NEON;
 }
 
 }  // namespace
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 315d5e9..7795144 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -1805,9 +1805,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
 template <int identity_size>
 LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
     Array2DView<uint8_t> frame, const int start_x, const int start_y,
-    const int tx_width, const int tx_height, const int16_t* source) {
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
   const int stride = frame.columns();
-  uint8_t* dst = frame[start_y] + start_x;
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
 
   if (identity_size < 32) {
     if (tx_width == 4) {
@@ -1891,9 +1892,10 @@ LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
 
 LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
     Array2DView<uint8_t> frame, const int start_x, const int start_y,
-    const int tx_width, const int tx_height, const int16_t* source) {
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
   const int stride = frame.columns();
-  uint8_t* dst = frame[start_y] + start_x;
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
 
   if (tx_width == 4) {
     uint8x8_t frame_data = vdup_n_u8(0);
@@ -2106,8 +2108,9 @@ LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput(
 }
 
 // Process 4 wht4 rows and columns.
-LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* dst, const int dst_stride,
-                                     const void* source,
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* LIBGAV1_RESTRICT dst,
+                                     const int dst_stride,
+                                     const void* LIBGAV1_RESTRICT source,
                                      const int adjusted_tx_height) {
   const auto* const src = static_cast<const int16_t*>(source);
   int16x4_t s[4];
@@ -2273,11 +2276,12 @@ LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
 template <int tx_height, bool enable_flip_rows = false>
 LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
     Array2DView<uint8_t> frame, const int start_x, const int start_y,
-    const int tx_width, const int16_t* source, TransformType tx_type) {
+    const int tx_width, const int16_t* LIBGAV1_RESTRICT source,
+    TransformType tx_type) {
   const bool flip_rows =
       enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
   const int stride = frame.columns();
-  uint8_t* dst = frame[start_y] + start_x;
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
 
   // Enable for 4x4, 4x8, 4x16
   if (tx_height < 32 && tx_width == 4) {
@@ -2368,8 +2372,10 @@ void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
 }
 
 void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                  int adjusted_tx_height, void* src_buffer,
-                                  int start_x, int start_y, void* dst_frame) {
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2435,8 +2441,10 @@ void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
 }
 
 void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                  int adjusted_tx_height, void* src_buffer,
-                                  int start_x, int start_y, void* dst_frame) {
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2497,8 +2505,10 @@ void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2551,8 +2561,10 @@ void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2594,8 +2606,10 @@ void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2645,8 +2659,10 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2707,8 +2723,10 @@ void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
 }
 
 void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                   int adjusted_tx_height, void* src_buffer,
-                                   int start_x, int start_y, void* dst_frame) {
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2771,8 +2789,10 @@ void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
 
 void Adst16TransformLoopColumn_NEON(TransformType tx_type,
                                     TransformSize tx_size,
-                                    int adjusted_tx_height, void* src_buffer,
-                                    int start_x, int start_y, void* dst_frame) {
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2844,9 +2864,10 @@ void Identity4TransformLoopRow_NEON(TransformType tx_type,
 
 void Identity4TransformLoopColumn_NEON(TransformType tx_type,
                                        TransformSize tx_size,
-                                       int adjusted_tx_height, void* src_buffer,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
                                        int start_x, int start_y,
-                                       void* dst_frame) {
+                                       void* LIBGAV1_RESTRICT dst_frame) {
   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
@@ -2919,9 +2940,10 @@ void Identity8TransformLoopRow_NEON(TransformType tx_type,
 
 void Identity8TransformLoopColumn_NEON(TransformType tx_type,
                                        TransformSize tx_size,
-                                       int adjusted_tx_height, void* src_buffer,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
                                        int start_x, int start_y,
-                                       void* dst_frame) {
+                                       void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -2960,8 +2982,9 @@ void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
 void Identity16TransformLoopColumn_NEON(TransformType tx_type,
                                         TransformSize tx_size,
                                         int adjusted_tx_height,
-                                        void* src_buffer, int start_x,
-                                        int start_y, void* dst_frame) {
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
 
@@ -3007,8 +3030,9 @@ void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
 void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
                                         TransformSize tx_size,
                                         int adjusted_tx_height,
-                                        void* src_buffer, int start_x,
-                                        int start_y, void* dst_frame) {
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
@@ -3029,8 +3053,10 @@ void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
 }
 
 void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
-                                  int adjusted_tx_height, void* src_buffer,
-                                  int start_x, int start_y, void* dst_frame) {
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
   assert(tx_type == kTransformTypeDctDct);
   assert(tx_size == kTransformSize4x4);
   static_cast<void>(tx_type);
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
index 91e0e83..ecb1575 100644
--- a/src/dsp/arm/inverse_transform_neon.h
+++ b/src/dsp/arm/inverse_transform_neon.h
@@ -62,6 +62,9 @@ void InverseTransformInit10bpp_NEON();
 #define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
 
 #endif  // LIBGAV1_ENABLE_NEON
 
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 8d72892..2487ba5 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -50,7 +50,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
 }
 
 // abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-//   OuterThreshhold()
+//   OuterThreshold()
 inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
                               const uint8x8_t p0q0, const uint8x8_t p1q1,
                               const uint8_t inner_thresh,
@@ -65,6 +65,7 @@ inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
                          const uint8_t hev_thresh, const uint8_t outer_thresh,
                          const uint8_t inner_thresh, uint8x8_t* const hev_mask,
                          uint8x8_t* const needs_filter4_mask) {
+  // First half is |p0 - p1|, second half is |q0 - q1|.
   const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
   // This includes cases where NeedsFilter4() is not true and so Filter2() will
   // not be applied.
@@ -256,7 +257,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
 
 // abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
 //   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
-//   OuterThreshhold()
+//   OuterThreshold()
 inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
                               const uint8x8_t abd_p1p2_q1q2,
                               const uint8x8_t p0q0, const uint8x8_t p1q1,
@@ -288,26 +289,26 @@ inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
   // Sum p1 and q1 output from opposite directions
   // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
   //      ^^^^^^^^
-  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q3)
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
   //                                 ^^^^^^^^
   const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
   uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
 
   // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
   //                 ^^^^^^^^
-  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q3)
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
   //                      ^^^^^^^^
   sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
 
   // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
   //                            ^^^^^^^^
-  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q3)
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
   //           ^^^^^^^^
   sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
 
   // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
   //                                       ^^
-  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q3)
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
   //      ^^
   const uint8x8_t q0p0 = Transpose32(p0q0);
   sum = vaddw_u8(sum, q0p0);
@@ -488,7 +489,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
 // abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
 //   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
 //   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
-//   OuterThreshhold()
+//   OuterThreshold()
 inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
                               const uint8x8_t abd_p1p2_q1q2,
                               const uint8x8_t abd_p2p3_q2q3,
@@ -1174,7 +1175,1266 @@ void Init8bpp() {
 }  // namespace
 }  // namespace low_bitdepth
 
-void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+                                 const uint16x4_t q0, const uint16x4_t q1,
+                                 const uint16_t outer_thresh) {
+  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+  return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
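Each lane of these helpers evaluates one per-pixel predicate; the scalar
equivalents below may make the vector code easier to audit (a sketch, with
std::abs from <cstdlib>):

#include <cstdlib>

// Scalar reference for Hev() and OuterThreshold() above (sketch only).
inline bool HevScalar(int p1, int p0, int q0, int q1, int thresh) {
  return std::abs(p1 - p0) > thresh || std::abs(q1 - q0) > thresh;
}
inline bool OuterThresholdScalar(int p1, int p0, int q0, int q1,
                                 int outer_thresh) {
  return std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
}
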
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16x8_t abd_p2p3_q2q3,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+  return vand_u16(inner_mask, outer_mask);
+}
+
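Rather than comparing every absolute difference against inner_thresh
separately, NeedsFilter6() and NeedsFilter8() fold the difference vectors with
vmaxq_u16 first so a single vcleq_u16 covers all taps. In scalar terms (a
sketch; NeedsFilter8 adds the |p3 - p2| and |q3 - q2| terms to the max):

#include <algorithm>
#include <cstdlib>

// Scalar reference for NeedsFilter6() above (sketch only).
inline bool NeedsFilter6Scalar(int p2, int p1, int p0, int q0, int q1, int q2,
                               int inner_thresh, bool outer_mask) {
  const int m = std::max({std::abs(p1 - p0), std::abs(q1 - q0),
                          std::abs(p2 - p1), std::abs(q2 - q1)});
  return m <= inner_thresh && outer_mask;
}
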
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const hev_mask,
+                         uint16x4_t* const needs_filter4_mask) {
+  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+                          const uint16x8_t abd_p0p2_q0q2) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                         const uint16x8_t p0q0, const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter6_mask,
+                         uint16x4_t* const is_flat3_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+                                     inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+//   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+                          const uint16x8_t abd_pn1p0_qn1q0,
+                          const uint16x8_t abd_pn2p0_qn2q0) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                         const uint16x8_t p1q1, const uint16x8_t p0q0,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter8_mask,
+                         uint16x4_t* const is_flat4_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t is_flat4 =
+      IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+                   inner_thresh, outer_mask);
+  // |is_flat4_mask| is used to decide where to use the result of Filter8.
+  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+  // overriding the question of whether to use Filter8. Because Filter4 doesn't
+  // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+  // source value. To be correct, the mask must account for this override.
+  *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+                    const uint16x8_t p1q1, const uint16x4_t hev_mask,
+                    uint16x8_t* const p1q1_result,
+                    uint16x8_t* const p0q0_result) {
+  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // q0mp0 means "q0 minus p0".
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int16x4_t p1mq1_saturated =
+      Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+  const int16x4_t hev_option =
+      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Note: some of the steps below are tricks inherited from the 8bpp code,
+  // where the smallest vector is 8x8; they may be unnecessary here.
+
+  // We cannot shift with rounding because the clamp comes *before* the
+  // shifting:
+  //   a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  //   a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four =
+      Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t plus_three =
+      Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+  // a3 = (a1 + 1) >> 1;
+  const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // The second term must be -a1, not -a2, or we would end up with a2_ma2.
+  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+  *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+  *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
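Because the vector code interleaves the p and q halves, the underlying filter
is easy to lose track of. The same arithmetic for one pixel position in scalar
form, for a lane where NeedsFilter4() already holds (a sketch, assuming
ConvertToUnsignedPixelU16 clamps to [0, (1 << 10) - 1]):

#include <algorithm>
#include <cstdint>

// Scalar model of the Filter4()/Filter2() arithmetic above (sketch only).
inline void Filter4Scalar(uint16_t* p1, uint16_t* p0, uint16_t* q0,
                          uint16_t* q1, bool hev) {
  const auto clip_signed = [](int v) {
    return std::min(std::max(v, -(1 << 9)), (1 << 9) - 1);
  };
  const auto clip_pixel = [](int v) {
    return static_cast<uint16_t>(std::min(std::max(v, 0), (1 << 10) - 1));
  };
  // Filter2 keeps the clamped p1 - q1 term; Filter4 zeroes it.
  const int a = 3 * (*q0 - *p0) + (hev ? clip_signed(*p1 - *q1) : 0);
  const int a1 = clip_signed(a + 4) >> 3;
  const int a2 = clip_signed(a + 3) >> 3;
  const int a3 = (a1 + 1) >> 1;  // vrshr_n_s16(a1, 1)
  *p0 = clip_pixel(*p0 + a2);
  *q0 = clip_pixel(*q0 - a1);
  if (!hev) {  // p1/q1 are left unmodified when Hev() is true.
    *p1 = clip_pixel(*p1 + a3);
    *q1 = clip_pixel(*q1 - a3);
  }
}
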
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+  const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+                             vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  // This early exit provides a good speedup in the unit test, but the all-zero
+  // case may not come up often enough in real streams to warrant it.
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  const uint64x1_t needs_filter4_mask64 =
+      vreinterpret_u64_u16(needs_filter4_mask);
+  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because |hev_mask|
+  // was and'd with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
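+  // Net effect of the two selections above, per lane (|hev_mask| is already
+  // limited to lanes where |needs_filter4_mask| is set, per the comment
+  // above):
+  //   needs_filter=0        -> p0q0 and p1q1 unchanged
+  //   needs_filter=1, hev=0 -> f_p0q0 and f_p1q1
+  //   needs_filter=1, hev=1 -> f_p0q0 only; p1q1 unchanged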
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Offset by 2 uint16_t values to load from first p1 position.
+  auto* dst = static_cast<uint8_t*>(dest) - 4;
+  auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+  auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+  auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+  uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+                       vld1_u16(dst_q1)};
+  Transpose4x4(src);
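+  // src[0..3] now hold the p1, p0, q0 and q1 columns (4 pixels each), the
+  // same layout the horizontal filter loads directly.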
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  // This early exit provides a good speedup in the unit test, though it is
+  // unclear how often it applies to valid streams.
+  // Consider doing this on armv7 if there is a quick way to check whether a
+  // vector is zero.
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  const uint64x1_t needs_filter4_mask64 =
+      vreinterpret_u64_u16(needs_filter4_mask);
+  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because |hev_mask|
+  // was and'd with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  vst1_u16(dst_p1, output[0]);
+  vst1_u16(dst_p0, output[1]);
+  vst1_u16(dst_q0, output[2]);
+  vst1_u16(dst_q1, output[3]);
+}
+
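+// Expanding the grouping used in Filter6() below for the p side, as a check:
+//   p2 + 2 * (p2 + p1 + p0) + q0 == (3 * p2) + (2 * p1) + (2 * p0) + q0,
+// which is exactly the 6-tap flat tap set for p1; the q side is symmetric.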
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                    const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // The formula is regrouped to allow 3 doubling operations to be combined.
+  // TODO(petersonab): Apply grouping to 8bpp.
+  //
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                    ^^^^^^^^^^^
+  uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                                ^^^^^^
+  sum = vaddq_u16(sum, p0q0);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //               ^^^^^
+  sum = vshlq_n_u16(sum, 1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //        ^^^^^^                          ^^^^^^
+  // Should dual issue with the left shift.
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+  sum = vaddq_u16(sum, outer_sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //                ^^^^^^^^
+  const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //        ^^^^^^^^
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+  const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+                             vld1_u16(dst_p0), vld1_u16(dst_q0),
+                             vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general register
+  // has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Left side of the filter window.
+  auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Overread by 2 values. These overreads become the high halves of src_raw[2]
+  // and src_raw[3] after transpose.
+  uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                           vld1q_u16(dst_3)};
+  Transpose4x8(src_raw);
+  // p2, p1, p0, q0, q1, q2
+  const uint16x4_t src[6] = {
+      vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
+      vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
+      vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+  };
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general register
+  // has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  // dst_n starts at p2, so adjust to p1.
+  vst1_u16(dst_0 + 1, output[0]);
+  vst1_u16(dst_1 + 1, output[1]);
+  vst1_u16(dst_2 + 1, output[2]);
+  vst1_u16(dst_3 + 1, output[3]);
+}
+
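+// For reference, the complete tap sets produced via the running |sum| in
+// Filter8() below (p side shown; the q side is symmetric):
+//   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
+//   p1' = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
+//   p0' = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3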
+inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                    const uint16x8_t p1q1, const uint16x8_t p0q0,
+                    uint16x8_t* const p2q2_output,
+                    uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // The formula is regrouped to allow 2 doubling operations to be combined.
+  // TODO(petersonab): Apply grouping to 8bpp.
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                 ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddq_u16(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                               ^^^^^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p2q2_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  const uint16x4_t src[8] = {
+      vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+      vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+  const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+  const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+  const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general register
+  // has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
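+// e.g. with 4-value halves, ReverseLowHalf([a b c d | e f g h]) returns
+// [d c b a | e f g h].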
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
+  return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+  // To get the desired pairs after the transpose, one half must be reversed.
+  uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                       vld1q_u16(dst_3)};
+
+  // src[0] = p0q0
+  // src[1] = p1q1
+  // src[2] = p2q2
+  // src[3] = p3q3
+  LoopFilterTranspose4x8(src);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+      vget_high_u16(src[1]), outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = src[0];
+  const uint16x8_t p1q1 = src[1];
+  const uint16x8_t p2q2 = src[2];
+  const uint16x8_t p3q3 = src[3];
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general register
+  // has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
+  // After transpose, |output| will contain rows of the form:
+  // p0 p1 p2 p3 q0 q1 q2 q3
+  Transpose4x8(output);
+
+  // Reverse p values to produce original order:
+  // p3 p2 p1 p0 q0 q1 q2 q3
+  vst1q_u16(dst_0, ReverseLowHalf(output[0]));
+  vst1q_u16(dst_1, ReverseLowHalf(output[1]));
+  vst1q_u16(dst_2, ReverseLowHalf(output[2]));
+  vst1q_u16(dst_3, ReverseLowHalf(output[3]));
+}
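+
+// Filter14() computes the 14-tap flat outputs with a sliding window: each
+// narrower output reuses the previous |sum|, subtracting the taps that leave
+// the window and adding those that enter, so each extra output costs only a
+// few adds. 7 * p6q6 is formed as (p6q6 << 3) - p6q6.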
+inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
+                     const uint16x8_t p4q4, const uint16x8_t p3q3,
+                     const uint16x8_t p2q2, const uint16x8_t p1q1,
+                     const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
+                     uint16x8_t* const p4q4_output,
+                     uint16x8_t* const p3q3_output,
+                     uint16x8_t* const p2q2_output,
+                     uint16x8_t* const p1q1_output,
+                     uint16x8_t* const p0q0_output) {
+  // Sum p5 and q5 output from opposite directions.
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                                     ^^^^^^^^
+  const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                 ^^^^^^^^^^^^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                               ^^^^^^^^^^^^^^^^^^^
+  uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+  sum = vaddq_u16(sum, p6q6_x7);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                       ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                     ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                 ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //           ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                           ^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p5q5_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p4 and q4 output:
+  // p4 = p5 - (2 * p6) + p3 + q1
+  // q4 = q5 - (2 * q6) + q3 + p1
+  sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+  *p4q4_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p3 and q3 output:
+  // p3 = p4 - p6 - p5 + p2 + q2
+  // q3 = q4 - q6 - q5 + q2 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+  *p3q3_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p2 and q2 output:
+  // p2 = p3 - p6 - p4 + p1 + q3
+  // q2 = q3 - q6 - q4 + q1 + p3
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+  const uint16x8_t q3p3 = Transpose64(p3q3);
+  sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+  const uint16x8_t q4p4 = Transpose64(p4q4);
+  sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+  const uint16x8_t q5p5 = Transpose64(p5q5);
+  sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+  auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+  auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+  auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+  auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+  auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
+
+  const uint16x4_t src[14] = {
+      vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+      vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+      vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+      vld1_u16(dst_q5), vld1_u16(dst_q6)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+  const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+  const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+  const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general register
+  // has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+  const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+  const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+
+  vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+  vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+  vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+  vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+  vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+  vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
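+// e.g. with 64-bit halves a, b, c, d: PermuteACDB64([a|b], [c|d]) returns
+// {[a|c], [d|b]}.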
+inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
+  uint16x8x2_t acdb;
+#if defined(__aarch64__)
+  // a[b] <- [c]d
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+  // [a]b <- c[d]
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+  // a[b] <- [c]d
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+                     vreinterpretq_u64_u16(ab), 1));
+  // [a]b <- c[d]
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+                     vreinterpretq_u64_u16(ab), 0));
+#endif  // defined(__aarch64__)
+  return acdb;
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                     int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Low halves:  p7 p6 p5 p4
+  // High halves: p3 p2 p1 p0
+  uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                         vld1q_u16(dst_3)};
+  // p7 will be the low half of src_p[0]. Not used until the end.
+  Transpose4x8(src_p);
+
+  // Low halves:  q0 q1 q2 q3
+  // High halves: q4 q5 q6 q7
+  uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+                         vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+  // q7 will be the high half of src_q[3]. Not used until the end.
+  Transpose4x8(src_q);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+      vget_low_u16(src_q[1]), outer_thresh);
+  const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+  const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+  const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+  const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general register
+  // has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 =
+      vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+  const uint16x8_t p5q5 =
+      vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+  const uint16x8_t p6q6 =
+      vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+  const uint16x8_t p7q7 =
+      vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+  // To get the correctly ordered rows from the transpose, we need:
+  // p7p3 p6p2 p5p1 p4p0
+  // q0q4 q1q5 q2q6 q3q7
+  const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+  const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+  const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+  const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+  uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+                            p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+  Transpose4x8(output_p);
+  uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+                            p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+  Transpose4x8(output_q);
+
+  // The permute and transpose above already restore the original order:
+  // p7 p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 q7
+  vst1q_u16(dst_0, output_p[0]);
+  vst1q_u16(dst_0 + 8, output_q[0]);
+  vst1q_u16(dst_1, output_p[1]);
+  vst1q_u16(dst_1 + 8, output_q[1]);
+  vst1q_u16(dst_2, output_p[2]);
+  vst1q_u16(dst_2 + 8, output_q[2]);
+  vst1q_u16(dst_3, output_p[3]);
+  vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Horizontal4_NEON;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Horizontal6_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Horizontal8_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Vertical14_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void LoopFilterInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h
index 5f79200..540defc 100644
--- a/src/dsp/arm/loop_filter_neon.h
+++ b/src/dsp/arm/loop_filter_neon.h
@@ -48,6 +48,23 @@ void LoopFilterInit_NEON();
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON
 
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+  LIBGAV1_CPU_NEON
+
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
diff --git a/src/dsp/arm/loop_restoration_10bit_neon.cc b/src/dsp/arm/loop_restoration_10bit_neon.cc
new file mode 100644
index 0000000..a2fe6d5
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_10bit_neon.cc
@@ -0,0 +1,2542 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Wiener
+
+// Make a local copy of the coefficients so the compiler knows they do not
+// overlap other buffers; the 'const' keyword alone is not enough. In practice
+// the compiler does not actually make a copy, since there are enough registers
+// in this case.
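+//
+// The Wiener kernel is symmetric, so only 4 coefficients are stored per
+// direction. WienerHorizontalTap7() applies them as
+//   filter[0] * (s[-3] + s[3]) + filter[1] * (s[-2] + s[2]) +
+//   filter[2] * (s[-1] + s[1]) + filter[3] * s[0];
+// the Tap5/Tap3 variants drop the outer pair(s), and the vertical kernels
+// mirror this across rows.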
+inline void PopulateWienerCoefficients(
+    const RestorationUnitInfo& restoration_info, const int direction,
+    int16_t filter[4]) {
+  for (int i = 0; i < 4; ++i) {
+    filter[i] = restoration_info.wiener_info.filter[direction][i];
+  }
+}
+
+inline int32x4x2_t WienerHorizontal2(const uint16x8_t s0, const uint16x8_t s1,
+                                     const int16_t filter,
+                                     const int32x4x2_t sum) {
+  const int16x8_t ss = vreinterpretq_s16_u16(vaddq_u16(s0, s1));
+  int32x4x2_t res;
+  res.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(ss), filter);
+  res.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(ss), filter);
+  return res;
+}
+
+inline void WienerHorizontalSum(const uint16x8_t s[3], const int16_t filter[4],
+                                int32x4x2_t sum, int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (kBitdepth10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
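+  // With kBitdepth10 == 10, kWienerFilterBits == 7 and
+  // kInterRoundBitsHorizontal == 3 (libgav1's values for this path), |offset|
+  // is 8192 and |limit| is 32767, so the clamped sums below lie in
+  // [-8192, 24575] and fit in int16_t.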
+  const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[2]));
+  const int16x8_t s_1 = vreinterpretq_s16_u16(s[1]);
+  int16x4x2_t sum16;
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_0_2), filter[2]);
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_1), filter[3]);
+  sum16.val[0] = vqshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal);
+  sum16.val[0] = vmax_s16(sum16.val[0], vdup_n_s16(-offset));
+  sum16.val[0] = vmin_s16(sum16.val[0], vdup_n_s16(limit - offset));
+  vst1_s16(wiener_buffer, sum16.val[0]);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_0_2), filter[2]);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_1), filter[3]);
+  sum16.val[1] = vqshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal);
+  sum16.val[1] = vmax_s16(sum16.val[1], vdup_n_s16(-offset));
+  sum16.val[1] = vmin_s16(sum16.val[1], vdup_n_s16(limit - offset));
+  vst1_s16(wiener_buffer + 4, sum16.val[1]);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint16_t* src_ptr = src;
+    uint16x8_t s[8];
+    s[0] = vld1q_u16(src_ptr);
+    ptrdiff_t x = width;
+    do {
+      src_ptr += 8;
+      s[7] = vld1q_u16(src_ptr);
+      s[1] = vextq_u16(s[0], s[7], 1);
+      s[2] = vextq_u16(s[0], s[7], 2);
+      s[3] = vextq_u16(s[0], s[7], 3);
+      s[4] = vextq_u16(s[0], s[7], 4);
+      s[5] = vextq_u16(s[0], s[7], 5);
+      s[6] = vextq_u16(s[0], s[7], 6);
+
+      int32x4x2_t sum;
+      sum.val[0] = sum.val[1] =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+      sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+      sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+      WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+      s[0] = s[7];
+      *wiener_buffer += 8;
+      x -= 8;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint16_t* src_ptr = src;
+    uint16x8_t s[6];
+    s[0] = vld1q_u16(src_ptr);
+    ptrdiff_t x = width;
+    do {
+      src_ptr += 8;
+      s[5] = vld1q_u16(src_ptr);
+      s[1] = vextq_u16(s[0], s[5], 1);
+      s[2] = vextq_u16(s[0], s[5], 2);
+      s[3] = vextq_u16(s[0], s[5], 3);
+      s[4] = vextq_u16(s[0], s[5], 4);
+
+      int32x4x2_t sum;
+      sum.val[0] = sum.val[1] =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+      sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+      WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+      s[0] = s[5];
+      *wiener_buffer += 8;
+      x -= 8;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int16_t filter[4],
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    const uint16_t* src_ptr = src;
+    uint16x8_t s[3];
+    ptrdiff_t x = width;
+    do {
+      s[0] = vld1q_u16(src_ptr);
+      s[1] = vld1q_u16(src_ptr + 1);
+      s[2] = vld1q_u16(src_ptr + 2);
+
+      int32x4x2_t sum;
+      sum.val[0] = sum.val[1] =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+      WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+      src_ptr += 8;
+      *wiener_buffer += 8;
+      x -= 8;
+    } while (x != 0);
+    src += src_stride;
+  }
+}
+
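+// The single-tap kernel is the identity: its center tap is
+// 1 << kWienerFilterBits (128), and the horizontal rounding shift
+// (>> kInterRoundBitsHorizontal) leaves a gain of 16, so the whole pass
+// reduces to the left shift by 4 below.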
+inline void WienerHorizontalTap1(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src + x);
+      const int16x8_t d = vreinterpretq_s16_u16(vshlq_n_u16(s, 4));
+      vst1q_s16(*wiener_buffer + x, d);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+                                   const int16_t filter,
+                                   const int32x4x2_t sum) {
+  int32x4x2_t d;
+  d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a0), filter);
+  d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a0), filter);
+  d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a1), filter);
+  d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a1), filter);
+  return d;
+}
+
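+// The shift by 11 below undoes the accumulated gain: the horizontal pass left
+// a gain of 16 and the Wiener taps sum to 1 << kWienerFilterBits (128), so
+// 16 * 128 == 1 << 11 restores pixel scale before the saturating narrow.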
+inline uint16x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+                                 const int32x4x2_t sum) {
+  int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+  d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+  d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
+  const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+  const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+  return vcombine_u16(sum_lo_16, sum_hi_16);
+}
+
+inline uint16x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+                                           const ptrdiff_t wiener_stride,
+                                           const int16_t filter[4],
+                                           int16x8_t a[7]) {
+  int32x4x2_t sum;
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+  a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[0], a[6], filter[0], sum);
+  sum = WienerVertical2(a[1], a[5], filter[1], sum);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+  return WienerVertical(a + 2, filter, sum);
+}
+
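+// Computes two vertically adjacent output rows at once: the second row reuses
+// a[1..6] loaded for the first row and only fetches one additional row (a[7]).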
+inline uint16x8x2_t WienerVerticalTap7Kernel2(
+    const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+    const int16_t filter[4]) {
+  int16x8_t a[8];
+  int32x4x2_t sum;
+  uint16x8x2_t d;
+  d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[1], a[7], filter[0], sum);
+  sum = WienerVertical2(a[2], a[6], filter[1], sum);
+  d.val[1] = WienerVertical(a + 3, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint16x8x2_t d[2];
+      d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8 + dst_stride,
+                vminq_u16(d[1].val[1], v_max_bitdepth));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[7];
+      const uint16x8_t d0 =
+          WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+      const uint16x8_t d1 =
+          WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+      vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline uint16x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+                                           const ptrdiff_t wiener_stride,
+                                           const int16_t filter[4],
+                                           int16x8_t a[5]) {
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+  int32x4x2_t sum;
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[0], a[4], filter[1], sum);
+  return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap5Kernel2(
+    const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+    const int16_t filter[4]) {
+  int16x8_t a[6];
+  int32x4x2_t sum;
+  uint16x8x2_t d;
+  d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  sum = WienerVertical2(a[1], a[5], filter[1], sum);
+  d.val[1] = WienerVertical(a + 2, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint16x8x2_t d[2];
+      d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+      vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8 + dst_stride,
+                vminq_u16(d[1].val[1], v_max_bitdepth));
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[5];
+      const uint16x8_t d0 =
+          WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+      const uint16x8_t d1 =
+          WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+      vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline uint16x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+                                           const ptrdiff_t wiener_stride,
+                                           const int16_t filter[4],
+                                           int16x8_t a[3]) {
+  a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+  a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+  a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+  int32x4x2_t sum;
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  return WienerVertical(a, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap3Kernel2(
+    const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+    const int16_t filter[4]) {
+  int16x8_t a[4];
+  int32x4x2_t sum;
+  uint16x8x2_t d;
+  d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+  sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+  d.val[1] = WienerVertical(a + 1, filter, sum);
+  return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t filter[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      uint16x8x2_t d[2];
+      d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+      d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+
+      vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+      vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+      vst1q_u16(dst_ptr + 8 + dst_stride,
+                vminq_u16(d[1].val[1], v_max_bitdepth));
+
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[3];
+      const uint16x8_t d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+      const uint16x8_t d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+      vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
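+// When every vertical tap except the center one is zero, the filter reduces
+// to undoing the horizontal stage's scaling: a rounding right shift by 4,
+// clamped to [0, (1 << kBitdepth10) - 1].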
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+  const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+  const int16x8_t d0 = vrshrq_n_s16(a0, 4);
+  const int16x8_t d1 = vrshrq_n_s16(a1, 4);
+  vst1q_u16(dst, vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d0, vdupq_n_s16(0))),
+                           v_max_bitdepth));
+  vst1q_u16(dst + 8,
+            vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d1, vdupq_n_s16(0))),
+                      v_max_bitdepth));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+      WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst);
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+// For width 16 and up, store the horizontal results and then apply the
+// vertical filter row by row, which is more cache-friendly than filtering
+// column by column.
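+// The horizontal pass writes up to height + kWienerFilterTaps - 1 rows of
+// intermediates into |wiener_buffer|, each |wiener_stride| (the width aligned
+// to 16) elements wide; the vertical pass then consumes them top to bottom.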
+void WienerFilter_NEON(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+  int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+  int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+                             filter_horizontal);
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+                             filter_vertical);
+
+  // Horizontal filtering.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // Vertical filtering.
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, the top
+    // and bottom rows of |wiener_buffer| can be duplicated accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
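+// Self-guided restoration is driven by box sums: for each pass the code
+// maintains running 3x3/5x5 sums and squared sums of the source, derives a
+// per-pixel multiplier |ma| and offset |b| from them, and finally filters the
+// source against those intermediates.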
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               uint16x8_t dst[2]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               uint16x8_t dst[3]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+  dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4_t dst[2]) {
+  dst[0] = vld1q_u32(src + 0);
+  dst[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               uint32x4_t dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               uint32x4_t dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+  vst1q_u16(dst + 0, src[0]);
+  vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4_t src[2]) {
+  vst1q_u32(dst + 0, src[0]);
+  vst1q_u32(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4_t src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
+inline uint16x8_t VaddwLo8(const uint16x8_t src0, const uint8x16_t src1) {
+  const uint8x8_t s1 = vget_low_u8(src1);
+  return vaddw_u8(src0, s1);
+}
+
+inline uint16x8_t VaddwHi8(const uint16x8_t src0, const uint8x16_t src1) {
+  const uint8x8_t s1 = vget_high_u8(src1);
+  return vaddw_u8(src0, s1);
+}
+
+inline uint32x4_t VmullLo16(const uint16x8_t src0, const uint16x8_t src1) {
+  return vmull_u16(vget_low_u16(src0), vget_low_u16(src1));
+}
+
+inline uint32x4_t VmullHi16(const uint16x8_t src0, const uint16x8_t src1) {
+  return vmull_u16(vget_high_u16(src0), vget_high_u16(src1));
+}
+
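+// VshrU128<bytes>() behaves like a byte-wise right shift of the 128-bit
+// concatenation of its two inputs, implemented with vext. The 16-bit
+// overloads shift by whole elements, hence bytes / 2.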
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+  return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+  return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+  return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+  return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+  return vextq_u16(src[0], src[1], bytes / 2);
+}
+
+inline uint32x4_t Square(uint16x4_t s) { return vmull_u16(s, s); }
+
+inline void Square(const uint16x8_t src, uint32x4_t dst[2]) {
+  const uint16x4_t s_lo = vget_low_u16(src);
+  const uint16x4_t s_hi = vget_high_u16(src);
+  dst[0] = Square(s_lo);
+  dst[1] = Square(s_hi);
+}
+
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x8_t dst[3]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u16(src[0], src[1], 1);
+  dst[2] = vextq_u16(src[0], src[1], 2);
+}
+
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+  dst[3] = VshrU128<offset + 3>(src);
+  dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x8_t dst[5]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u16(src[0], src[1], 1);
+  dst[2] = vextq_u16(src[0], src[1], 2);
+  dst[3] = vextq_u16(src[0], src[1], 3);
+  dst[4] = vextq_u16(src[0], src[1], 4);
+}
+
+inline void Prepare3_32(const uint32x4_t src[2], uint32x4_t dst[3]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u32(src[0], src[1], 1);
+  dst[2] = vextq_u32(src[0], src[1], 2);
+}
+
+inline void Prepare5_32(const uint32x4_t src[2], uint32x4_t dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = vextq_u32(src[0], src[1], 3);
+  dst[4] = src[1];
+}
+
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+                          const uint16x8_t src2) {
+  const uint16x8_t sum = vaddq_u16(src0, src1);
+  return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+                          const uint32x4_t src2) {
+  const uint32x4_t sum = vaddq_u32(src0, src1);
+  return vaddq_u32(sum, src2);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const uint32x4_t src[3][2], uint32x4_t dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+  const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+  const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t* src0, const uint32x4_t* src1,
+                          const uint32x4_t* src2, const uint32x4_t* src3,
+                          const uint32x4_t* src4) {
+  const uint32x4_t sum01 = vaddq_u32(*src0, *src1);
+  const uint32x4_t sum23 = vaddq_u32(*src2, *src3);
+  const uint32x4_t sum = vaddq_u32(sum01, sum23);
+  return vaddq_u32(sum, *src4);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const uint32x4_t src[5][2], uint32x4_t dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline uint16x8_t Sum3Horizontal16(const uint16x8_t src[2]) {
+  uint16x8_t s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline uint16x8_t Sum5Horizontal16(const uint16x8_t src[2]) {
+  uint16x8_t s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const uint16x8_t src[2], uint16x8_t* const row3,
+                     uint16x8_t* const row5) {
+  uint16x8_t s[5];
+  Prepare5_16(src, s);
+  const uint16x8_t sum04 = vaddq_u16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = vaddq_u16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16x8_t src[3], uint16x8_t* const row3_0,
+                            uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+                            uint16x8_t* const row5_1) {
+  SumHorizontal16(src + 0, row3_0, row5_0);
+  SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const uint32x4_t src[5], uint32x4_t* const row_sq3,
+                     uint32x4_t* const row_sq5) {
+  const uint32x4_t sum04 = vaddq_u32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const uint32x4_t src[3],
+                            uint32x4_t* const row_sq3_0,
+                            uint32x4_t* const row_sq3_1,
+                            uint32x4_t* const row_sq5_0,
+                            uint32x4_t* const row_sq5_1) {
+  uint32x4_t s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
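+// Sum343*() compute the 3-4-3 weighted cross sums used by pass 2:
+// 3 * (src[0] + src[1] + src[2]) + src[1], i.e.
+// 3 * src[0] + 4 * src[1] + 3 * src[2].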
+inline uint16x8_t Sum343Lo(const uint8x16_t ma3[3]) {
+  const uint16x8_t sum = Sum3WLo16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline uint16x8_t Sum343Hi(const uint8x16_t ma3[3]) {
+  const uint16x8_t sum = Sum3WHi16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline uint32x4_t Sum343(const uint32x4_t src[3]) {
+  const uint32x4_t sum = Sum3_32(src);
+  const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+  return vaddq_u32(sum3, src[1]);
+}
+
+inline void Sum343(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
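+// Sum565*() compute the corresponding 5-6-5 weighted cross sums used by
+// pass 1: 5 * (src[0] + src[1] + src[2]) + src[1].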
+inline uint16x8_t Sum565Lo(const uint8x16_t src[3]) {
+  const uint16x8_t sum = Sum3WLo16(src);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline uint16x8_t Sum565Hi(const uint8x16_t src[3]) {
+  const uint16x8_t sum = Sum3WHi16(src);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline uint32x4_t Sum565(const uint32x4_t src[3]) {
+  const uint32x4_t sum = Sum3_32(src);
+  const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+  const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+  return vaddq_u32(sum5, src[1]);
+}
+
+inline void Sum565(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t /*width*/, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  do {
+    uint16x8_t s[3];
+    uint32x4_t sq[6];
+    s[0] = vld1q_u16(src);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row3[2], row5[2];
+      uint32x4_t row_sq3[2], row_sq5[2];
+      s[1] = vld1q_u16(src + 8);
+      x -= 16;
+      src += 16;
+      s[2] = vld1q_u16(src);
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t /*width*/, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int y = 2;
+  do {
+    uint16x8_t s[3];
+    uint32x4_t sq[6];
+    s[0] = vld1q_u16(src);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row[2];
+      uint32x4_t row_sq[4];
+      s[1] = vld1q_u16(src + 8);
+      x -= 16;
+      src += 16;
+      s[2] = vld1q_u16(src);
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(s + 0);
+        row[1] = Sum3Horizontal16(s + 1);
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 2, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(s + 0);
+        row[1] = Sum5Horizontal16(s + 1);
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 2, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+                              const uint32_t scale) {
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const uint32x4_t dxd = vmull_u16(sum, sum);
+  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+  // Ensure |p| does not underflow by using saturating subtraction.
+  const uint32x4_t p = vqsubq_u32(axn, dxd);
+  const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits
+  // is 20.
+  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+  return vmovn_u32(shifted);
+}
+
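+// The 10-bit box sums are rounded down by 2 bits (and the squared sums by 4)
+// before computing |ma|, so the resulting index stays within the 8-bit range
+// covered by kSgrMaLookup.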
+template <int n>
+inline uint16x8_t CalculateMa(const uint16x8_t sum, const uint32x4_t sum_sq[2],
+                              const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const uint16x8_t b = vrshrq_n_u16(sum, 2);
+  const uint16x4_t sum_lo = vget_low_u16(b);
+  const uint16x4_t sum_hi = vget_high_u16(b);
+  const uint16x4_t z0 =
+      CalculateMa<n>(sum_lo, vrshrq_n_u32(sum_sq[0], 4), scale);
+  const uint16x4_t z1 =
+      CalculateMa<n>(sum_hi, vrshrq_n_u32(sum_sq[1], 4), scale);
+  return vcombine_u16(z0, z1);
+}
+
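+// Multiplying by |one_over_n| == 164 and rounding-shifting by
+// kSgrProjReciprocalBits (12) is exactly equivalent to multiplying by
+// |one_over_n_quarter| == 41 and shifting by 10:
+// (m * 164 + 2048) >> 12 == (m * 41 + 512) >> 10.
+// The smaller constant keeps the intermediate 32-bit products smaller.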
+inline void CalculateB5(const uint16x8_t sum, const uint16x8_t ma,
+                        uint32x4_t b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const uint32x4_t m2 = VmullLo16(ma, sum);
+  const uint32x4_t m3 = VmullHi16(ma, sum);
+  const uint32x4_t m0 = vmulq_n_u32(m2, one_over_n_quarter);
+  const uint32x4_t m1 = vmulq_n_u32(m3, one_over_n_quarter);
+  b[0] = vrshrq_n_u32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = vrshrq_n_u32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const uint16x8_t sum, const uint16x8_t ma,
+                        uint32x4_t b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const uint32x4_t m0 = VmullLo16(ma, sum);
+  const uint32x4_t m1 = VmullHi16(ma, sum);
+  const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+  const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+  b[0] = vrshrq_n_u32(m2, kSgrProjReciprocalBits);
+  b[1] = vrshrq_n_u32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex3(const uint16x8_t s3[3],
+                                  const uint32x4_t sq3[3][2],
+                                  const uint32_t scale, uint16x8_t* const sum,
+                                  uint16x8_t* const index) {
+  uint32x4_t sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const uint16x8_t s5[5],
+                                  const uint32x4_t sq5[5][2],
+                                  const uint32_t scale, uint16x8_t* const sum,
+                                  uint16x8_t* const index) {
+  uint32x4_t sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
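+// NEON has no gather instruction, and kSgrMaLookup has 256 entries, too many
+// for vtbl, so the saturated indices are spilled to a small buffer and looked
+// up one lane at a time. CalculateIntermediate() below instead tables the
+// first 48 entries and derives the rest arithmetically.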
+template <int n, int offset>
+inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index,
+                               uint8x16_t* const ma, uint32x4_t b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+
+  const uint8x8_t idx = vqmovn_u16(index);
+  uint8_t temp[8];
+  vst1_u8(temp, idx);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[4]], *ma, offset + 4);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[5]], *ma, offset + 5);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[6]], *ma, offset + 6);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[7]], *ma, offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const uint16x8_t maq =
+      vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+                             const int threshold) {
+  const uint8x8_t thresholds = vdup_n_u8(threshold);
+  const uint8x8_t offset = vcgt_u8(index, thresholds);
+  // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+  return vadd_u8(value, offset);
+}
+
+inline uint8x8_t MaLookupAndAdjust(const uint8x8x4_t table0,
+                                   const uint8x8x2_t table1,
+                                   const uint16x8_t index) {
+  const uint8x8_t idx = vqmovn_u16(index);
+  // All elements whose indices are outside the range [0, 47] are set to 0.
+  uint8x8_t val = vtbl4_u8(table0, idx);  // Range [0, 31].
+  // Subtract 32 so the next index range maps onto |table1|.
+  const uint8x8_t sub_idx = vsub_u8(idx, vdup_n_u8(32));
+  const uint8x8_t res = vtbl2_u8(table1, sub_idx);  // Range [32, 47].
+  // Use OR instruction to combine shuffle results together.
+  val = vorr_u8(val, res);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so they are computed with comparisons
+  // and arithmetic instead of further lookups.
+  // Elements whose indices are larger than 47 (currently 0) are set to 5.
+  val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
+  return val;
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+                                  const uint16x8_t index[2],
+                                  uint8x16_t* const ma, uint32x4_t b0[2],
+                                  uint32x4_t b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+  // using two uint8x8x3_t vectors.
+  uint8x8x4_t table0;
+  uint8x8x2_t table1;
+  table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+  table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+  table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+  table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+  table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+  table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  const uint8x8_t ma_lo = MaLookupAndAdjust(table0, table1, index[0]);
+  const uint8x8_t ma_hi = MaLookupAndAdjust(table0, table1, index[1]);
+  *ma = vcombine_u8(ma_lo, ma_hi);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const uint16x8_t maq0 = vmovl_u8(vget_low_u8(*ma));
+  CalculateB3(sum[0], maq0, b0);
+  const uint16x8_t maq1 = vmovl_u8(vget_high_u8(*ma));
+  CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+                                  const uint16x8_t index[2], uint8x16_t ma[2],
+                                  uint32x4_t b[4]) {
+  uint8x16_t mas;
+  CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+  ma[0] = vcombine_u8(vget_low_u8(ma[0]), vget_low_u8(mas));
+  ma[1] = vextq_u8(mas, vdupq_n_u8(0), 8);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+                                   const uint32x4_t sq5[5][2],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint32x4_t b[2]) {
+  static_assert(offset == 0 || offset == 8, "");
+  uint16x8_t sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+                                   const uint32x4_t sq3[3][2],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint32x4_t b[2]) {
+  uint16x8_t sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
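+// Computes both the 4-4-4 (4 * (b0 + b1 + b2)) and 3-4-3
+// (3 * (b0 + b1 + b2) + b1) weighted sums of |b3| from a shared 1-1-1 sum and
+// stores them to |b444| and |b343|.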
+inline void Store343_444(const uint32x4_t b3[3], const ptrdiff_t x,
+                         uint32x4_t sum_b343[2], uint32x4_t sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  uint32x4_t b[3], sum_b111[2];
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = vshlq_n_u32(sum_b111[0], 2);
+  sum_b343[0] = vsubq_u32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = vaddq_u32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = vshlq_n_u32(sum_b111[1], 2);
+  sum_b343[1] = vsubq_u32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = vaddq_u32(sum_b343[1], b[1]);
+  StoreAligned32U32(b444 + x, sum_b444);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+                           uint32x4_t sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+                           uint32x4_t sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint32x4_t sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4_t sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint32x4_t sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4_t sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4_t sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4_t sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
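+// The BoxFilterPreProcess* functions share one pattern: the *Lo variants
+// handle the leftmost 8 pixels, the unsuffixed variants the 16-pixel steady
+// state, and the *LastRow variants the final row, where the row below the
+// frame is treated as a duplicate of the last row.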
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const uint16x8_t s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t* const ma,
+    uint32x4_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  vst1q_u16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  vst1q_u16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16x8_t s[2][4], const ptrdiff_t /*sum_width*/, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  s5[0][3] = Sum5Horizontal16(s[0] + 1);
+  s5[1][3] = Sum5Horizontal16(s[0] + 2);
+  vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  s5[0][4] = Sum5Horizontal16(s[1] + 1);
+  s5[1][4] = Sum5Horizontal16(s[1] + 2);
+  vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  Sum5Horizontal32(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const uint16x8_t s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[4],
+    uint8x16_t* const ma, uint32x4_t b[2]) {
+  uint16x8_t s5[5];
+  uint32x4_t sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint16x8_t s[4], const ptrdiff_t /*sum_width*/, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[2], sq + 4);
+  s5[0][3] = Sum5Horizontal16(s + 1);
+  s5[1][3] = Sum5Horizontal16(s + 2);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[3], sq + 6);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const uint16x8_t s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint32x4_t sq[4], uint8x16_t* const ma,
+    uint32x4_t b[2]) {
+  uint16x8_t s3[3];
+  uint32x4_t sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  vst1q_u16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16x8_t s[4], const ptrdiff_t x, const ptrdiff_t /*sum_width*/,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint32x4_t sq[8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s3[4], sum[2], index[2];
+  uint32x4_t sq3[3][2];
+
+  Square(s[2], sq + 4);
+  s3[2] = Sum3Horizontal16(s + 1);
+  s3[3] = Sum3Horizontal16(s + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16(sum3, x + 8, s3 + 1);
+  LoadAligned32x2U32(square_sum3, x + 8, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const uint16x8_t s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+    uint32x4_t b3[2][6], uint8x16_t* const ma5, uint32x4_t b5[2]) {
+  uint16x8_t s3[4], s5[5], sum[2], index[2];
+  uint32x4_t sq3[4][2], sq5[5][2];
+
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  vst1q_u16(sum3[2], s3[2]);
+  vst1q_u16(sum3[3], s3[3]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = vextq_u8(ma3[0][0], vdupq_n_u8(0), 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint16x8_t s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t /*sum_width*/, uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+    uint32x4_t b3[2][6], uint8x16_t ma5[2], uint32x4_t b5[6]) {
+  uint16x8_t s3[2][4], s5[2][5], sum[2][2], index[2][2];
+  uint32x4_t sq3[4][2], sq5[5][2];
+
+  SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  vst1q_u16(sum3[2] + x + 0, s3[0][2]);
+  vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+  vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  vst1q_u16(sum3[3] + x + 0, s3[0][3]);
+  vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+  vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16(sum3, x + 8, s3[1]);
+  LoadAligned32x2U32(square_sum3, x + 8, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const uint16x8_t s[2], const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint32x4_t sq[4], uint8x16_t* const ma3, uint8x16_t* const ma5,
+    uint32x4_t b3[2], uint32x4_t b5[2]) {
+  uint16x8_t s3[3], s5[5];
+  uint32x4_t sq3[3][2], sq5[5][2];
+
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint16x8_t s[4], const ptrdiff_t /*sum_width*/, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma3[2],
+    uint8x16_t ma5[2], uint32x4_t b3[6], uint32x4_t b5[6]) {
+  uint16x8_t s3[2][3], s5[2][5], sum[2], index[2];
+  uint32x4_t sq3[3][2], sq5[5][2];
+
+  Square(s[2], sq + 4);
+  SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x + 8, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32(square_sum5, x + 8, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+  LoadAligned16x2U16(sum3, x + 8, s3[1]);
+  LoadAligned32x2U32(square_sum3, x + 8, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
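+// Primes the pass 1 intermediates: using the first two source rows, fills the
+// initial |sum5|/|square_sum5| rows and the first row of 565-weighted
+// |ma565|/|b565| values.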
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  uint16x8_t s[2][4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[2][8], bs[6];
+
+  s[0][0] = vld1q_u16(src0 + 0);
+  s[0][1] = vld1q_u16(src0 + 8);
+  s[1][0] = vld1q_u16(src1 + 0);
+  s[1][1] = vld1q_u16(src1 + 8);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint8x16_t ma5[3];
+    uint16x8_t ma[2];
+    uint32x4_t b[4];
+
+    s[0][2] = vld1q_u16(src0 + x + 16);
+    s[0][3] = vld1q_u16(src0 + x + 24);
+    s[1][2] = vld1q_u16(src1 + x + 16);
+    s[1][3] = vld1q_u16(src1 + x + 24);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = vld1q_u16(src + 0);
+  s[1] = vld1q_u16(src + 8);
+  Square(s[0], sq);
+  // Quiet "may be used uninitialized" warning.
+  mas[0] = mas[1] = vdupq_n_u8(0);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = vld1q_u16(src + x + 16);
+    s[3] = vld1q_u16(src + x + 24);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    uint8x16_t ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      uint16x8_t ma[2];
+      uint32x4_t b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 2, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  uint16x8_t s[2][4];
+  uint8x16_t ma3[2][2], ma5[2];
+  uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+  s[0][0] = vld1q_u16(src0 + 0);
+  s[0][1] = vld1q_u16(src0 + 8);
+  s[1][0] = vld1q_u16(src1 + 0);
+  s[1][1] = vld1q_u16(src1 + 8);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[4];
+    uint8x16_t ma3x[3], ma5x[3];
+
+    s[0][2] = vld1q_u16(src0 + x + 16);
+    s[0][3] = vld1q_u16(src0 + x + 24);
+    s[1][2] = vld1q_u16(src1 + x + 16);
+    s[1][3] = vld1q_u16(src1 + x + 24);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343(b3[0] + 0, b + 0);
+    Sum343(b3[0] + 2, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565(b5 + 0, b + 0);
+    Sum565(b5 + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
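+// Computes the per-pixel filter value (b - ma * src), rounded and shifted so
+// that kSgrProjRestoreBits of precision remain for the final blend.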
+template <int shift>
+inline int16x4_t FilterOutput(const uint32x4_t ma_x_src, const uint32x4_t b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const int32x4_t v = vreinterpretq_s32_u32(vsubq_u32(b, ma_x_src));
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return vqrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint16x8_t src,
+                                         const uint16x8_t ma,
+                                         const uint32x4_t b[2]) {
+  const uint32x4_t ma_x_src_lo = VmullLo16(ma, src);
+  const uint32x4_t ma_x_src_hi = VmullHi16(ma, src);
+  const int16x4_t dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const int16x4_t dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return vcombine_s16(dst_lo, dst_hi);  // 13 bits
+}
+
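+// Sums the 565-weighted contributions of two rows, then filters with
+// shift = 5.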
+inline int16x8_t CalculateFilteredOutputPass1(const uint16x8_t src,
+                                              const uint16x8_t ma[2],
+                                              const uint32x4_t b[2][2]) {
+  const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+  uint32x4_t b_sum[2];
+  b_sum[0] = vaddq_u32(b[0][0], b[1][0]);
+  b_sum[1] = vaddq_u32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
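+// Sums the 343/444/343-weighted contributions of three rows, then filters
+// with shift = 5.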
+inline int16x8_t CalculateFilteredOutputPass2(const uint16x8_t src,
+                                              const uint16x8_t ma[3],
+                                              const uint32x4_t b[3][2]) {
+  const uint16x8_t ma_sum = Sum3_16(ma);
+  uint32x4_t b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline int16x8_t SelfGuidedFinal(const uint16x8_t src, const int32x4_t v[2]) {
+  const int16x4_t v_lo =
+      vqrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x4_t v_hi =
+      vqrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+  return vaddq_s16(vreinterpretq_s16_u16(src), vv);
+}
+
+inline int16x8_t SelfGuidedDoubleMultiplier(const uint16x8_t src,
+                                            const int16x8_t filter[2],
+                                            const int w0, const int w2) {
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+  v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+  v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+  return SelfGuidedFinal(src, v);
+}
+
+inline int16x8_t SelfGuidedSingleMultiplier(const uint16x8_t src,
+                                            const int16x8_t filter,
+                                            const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+  return SelfGuidedFinal(src, v);
+}
+
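+// Clamps |val| to the 10-bit pixel range [0, 1023] and stores 8 pixels.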
+inline void ClipAndStore(uint16_t* const dst, const int16x8_t val) {
+  const uint16x8_t val0 = vreinterpretq_u16_s16(vmaxq_s16(val, vdupq_n_s16(0)));
+  const uint16x8_t val1 = vminq_u16(val0, vdupq_n_u16((1 << kBitdepth10) - 1));
+  vst1q_u16(dst, val1);
+}
+
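+// Pass 1 (5x5 box) filtering for a pair of rows.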
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  uint16x8_t s[2][4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[2][8], bs[6];
+
+  s[0][0] = vld1q_u16(src0 + 0);
+  s[0][1] = vld1q_u16(src0 + 8);
+  s[1][0] = vld1q_u16(src1 + 0);
+  s[1][1] = vld1q_u16(src1 + 8);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[2][2];
+    uint8x16_t ma5[3];
+    int16x8_t p[2];
+
+    s[0][2] = vld1q_u16(src0 + x + 16);
+    s[0][3] = vld1q_u16(src0 + x + 24);
+    s[1][2] = vld1q_u16(src1 + x + 16);
+    s[1][3] = vld1q_u16(src1 + x + 24);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    vst1q_u16(ma565[1] + x, ma[1]);
+    Sum565(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    const uint16x8_t sr0_lo = vld1q_u16(src + x + 0);
+    const uint16x8_t sr1_lo = vld1q_u16(src + stride + x + 0);
+    ma[0] = vld1q_u16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const int16x8_t d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const int16x8_t d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma5);
+    vst1q_u16(ma565[1] + x + 8, ma[1]);
+    Sum565(bs + 2, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const uint16x8_t sr0_hi = vld1q_u16(src + x + 8);
+    const uint16x8_t sr1_hi = vld1q_u16(src + stride + x + 8);
+    ma[0] = vld1q_u16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const int16x8_t d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    const int16x8_t d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
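+// Pass 1 filtering for the last row when |height| is odd.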
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = vld1q_u16(src0 + 0);
+  s[1] = vld1q_u16(src0 + 8);
+  Square(s[0], sq);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[2][2];
+    uint8x16_t ma5[3];
+
+    s[2] = vld1q_u16(src0 + x + 16);
+    s[3] = vld1q_u16(src0 + x + 24);
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565(bs, b[1]);
+    ma[0] = vld1q_u16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    int16x8_t p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+    ma[1] = Sum565Hi(ma5);
+    Sum565(bs + 2, b[1]);
+    ma[0] = vld1q_u16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
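+// Pass 2 (3x3 box) filtering for a single row.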
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = vld1q_u16(src0 + 0);
+  s[1] = vld1q_u16(src0 + 8);
+  Square(s[0], sq);
+  // Quiet "may be used uninitialized" warning.
+  mas[0] = mas[1] = vdupq_n_u8(0);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = vld1q_u16(src0 + x + 16);
+    s[3] = vld1q_u16(src0 + x + 24);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    uint16x8_t ma[3];
+    uint32x4_t b[3][2];
+    uint8x16_t ma3[3];
+
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    ma[0] = vld1q_u16(ma343[0] + x);
+    ma[1] = vld1q_u16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const int16x8_t p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    ma[0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1] = vld1q_u16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const int16x8_t p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
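+// Combined filtering for a pair of rows: the pass 1 and pass 2 outputs are
+// blended with weights |w0| and |w2|.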
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  uint16x8_t s[2][4];
+  uint8x16_t ma3[2][2], ma5[2];
+  uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+  s[0][0] = vld1q_u16(src0 + 0);
+  s[0][1] = vld1q_u16(src0 + 8);
+  s[1][0] = vld1q_u16(src1 + 0);
+  s[1][1] = vld1q_u16(src1 + 8);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[3][3];
+    uint32x4_t b[3][3][2];
+    uint8x16_t ma3x[2][3], ma5x[3];
+    int16x8_t p[2][2];
+
+    s[0][2] = vld1q_u16(src0 + x + 16);
+    s[0][3] = vld1q_u16(src0 + x + 24);
+    s[1][2] = vld1q_u16(src1 + x + 16);
+    s[1][3] = vld1q_u16(src1 + x + 24);
+
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    vst1q_u16(ma565[1] + x, ma[0][1]);
+    Sum565(b5, b[0][1]);
+    StoreAligned32U32(b565[1] + x, b[0][1]);
+    const uint16x8_t sr0_lo = vld1q_u16(src + x);
+    const uint16x8_t sr1_lo = vld1q_u16(src + stride + x);
+    ma[0][0] = vld1q_u16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x);
+    ma[1][1] = vld1q_u16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[1][0]);
+    LoadAligned32U32(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const int16x8_t d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = vld1q_u16(ma343[1] + x);
+    LoadAligned32U32(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const int16x8_t d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565Hi(ma5x);
+    vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+    Sum565(b5 + 2, b[0][1]);
+    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+    const uint16x8_t sr0_hi = vld1q_u16(src + x + 8);
+    const uint16x8_t sr1_hi = vld1q_u16(src + stride + x + 8);
+    ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+    const int16x8_t d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+    const int16x8_t d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
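+// Combined filtering for the last row when |height| is odd.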
+inline void BoxFilterLastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint16_t* const dst) {
+  uint16x8_t s[4];
+  uint8x16_t ma3[2], ma5[2];
+  uint32x4_t sq[8], b3[6], b5[6];
+  uint16x8_t ma[3];
+  uint32x4_t b[3][2];
+
+  s[0] = vld1q_u16(src0 + 0);
+  s[1] = vld1q_u16(src0 + 8);
+  Square(s[0], sq);
+  // Quiet "may be used uninitialized" warning.
+  ma3[0] = ma3[1] = vdupq_n_u8(0);
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq, &ma3[0], &ma5[0], b3, b5);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3], ma5x[3];
+    int16x8_t p[2];
+
+    s[2] = vld1q_u16(src0 + x + 16);
+    s[3] = vld1q_u16(src0 + x + 24);
+    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8<0>(ma3, ma3x);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343(b3, b[2]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    ma[0] = vld1q_u16(ma565 + x);
+    LoadAligned32U32(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = vld1q_u16(ma343 + x);
+    ma[1] = vld1q_u16(ma444 + x);
+    LoadAligned32U32(b343 + x, b[0]);
+    LoadAligned32U32(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const int16x8_t d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    ma[1] = Sum565Hi(ma5x);
+    Sum565(b5 + 2, b[1]);
+    ma[2] = Sum343Hi(ma3x);
+    Sum343(b3 + 2, b[2]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    ma[0] = vld1q_u16(ma565 + x + 8);
+    LoadAligned32U32(b565 + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    ma[0] = vld1q_u16(ma343 + x + 8);
+    ma[1] = vld1q_u16(ma444 + x + 8);
+    LoadAligned32U32(b343 + x + 8, b[0]);
+    LoadAligned32U32(b444 + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const int16x8_t d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[4];
+    b3[1] = b3[5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
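+// Driver for the combined filter (both radii non-zero): sets up the circular
+// row buffers in |sgr_buffer| and filters two rows per iteration.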
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
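+// Driver for pass 1 only (radius 2).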
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
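+// Driver for pass 2 only (radius 1).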
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_NEON(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* top = static_cast<const uint16_t*>(top_border);
+  const auto* bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, hence the
+    // assertion below.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->loop_restorations[0] = WienerFilter_NEON;
+  dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+}  // namespace
+
+void LoopRestorationInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/loop_restoration_neon.h b/src/dsp/arm/loop_restoration_neon.h
index b551610..b9a4803 100644
--- a/src/dsp/arm/loop_restoration_neon.h
+++ b/src/dsp/arm/loop_restoration_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
 // Initializes Dsp::loop_restorations, see the defines below for specifics.
 // This function is not thread-safe.
 void LoopRestorationInit_NEON();
+void LoopRestorationInit10bpp_NEON();
 
 }  // namespace dsp
 }  // namespace libgav1
@@ -35,6 +36,9 @@ void LoopRestorationInit_NEON();
 #define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
 
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
index ee50923..90166fc 100644
--- a/src/dsp/arm/mask_blend_neon.cc
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -79,10 +79,11 @@ inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
   return vreinterpretq_s16_u16(vmovl_u8(mask_val));
 }
 
-inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
-                                  const int16_t* const pred_1,
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+                                  const int16_t* LIBGAV1_RESTRICT const pred_1,
                                   const int16x8_t pred_mask_0,
-                                  const int16x8_t pred_mask_1, uint8_t* dst,
+                                  const int16x8_t pred_mask_1,
+                                  uint8_t* LIBGAV1_RESTRICT dst,
                                   const ptrdiff_t dst_stride) {
   const int16x8_t pred_val_0 = vld1q_s16(pred_0);
   const int16x8_t pred_val_1 = vld1q_s16(pred_1);
@@ -109,9 +110,11 @@ inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4x4_NEON(const int16_t* pred_0, const int16_t* pred_1,
-                                 const uint8_t* mask,
-                                 const ptrdiff_t mask_stride, uint8_t* dst,
+inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT mask,
+                                 const ptrdiff_t mask_stride,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
                                  const ptrdiff_t dst_stride) {
   const int16x8_t mask_inverter = vdupq_n_s16(64);
   int16x8_t pred_mask_0 =
@@ -133,10 +136,12 @@ inline void MaskBlending4x4_NEON(const int16_t* pred_0, const int16_t* pred_1,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4xH_NEON(const int16_t* pred_0, const int16_t* pred_1,
-                                 const uint8_t* const mask_ptr,
+inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
                                  const ptrdiff_t mask_stride, const int height,
-                                 uint8_t* dst, const ptrdiff_t dst_stride) {
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
   const uint8_t* mask = mask_ptr;
   if (height == 4) {
     MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
@@ -188,11 +193,12 @@ inline void MaskBlending4xH_NEON(const int16_t* pred_0, const int16_t* pred_1,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void MaskBlend_NEON(const void* prediction_0, const void* prediction_1,
+inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
                            const ptrdiff_t /*prediction_stride_1*/,
-                           const uint8_t* const mask_ptr,
+                           const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
                            const ptrdiff_t mask_stride, const int width,
-                           const int height, void* dest,
+                           const int height, void* LIBGAV1_RESTRICT dest,
                            const ptrdiff_t dst_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
@@ -302,11 +308,10 @@ inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
   return vld1_u8(mask);
 }
 
-inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
-                                                uint8_t* const pred_1,
-                                                const ptrdiff_t pred_stride_1,
-                                                const uint8x8_t pred_mask_0,
-                                                const uint8x8_t pred_mask_1) {
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+    const uint8_t* LIBGAV1_RESTRICT const pred_0,
+    uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+    const uint8x8_t pred_mask_0, const uint8x8_t pred_mask_1) {
   const uint8x8_t pred_val_0 = vld1_u8(pred_0);
   uint8x8_t pred_val_1 = Load4(pred_1);
   pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1);
@@ -320,11 +325,10 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4x4_NEON(const uint8_t* pred_0,
-                                               uint8_t* pred_1,
-                                               const ptrdiff_t pred_stride_1,
-                                               const uint8_t* mask,
-                                               const ptrdiff_t mask_stride) {
+inline void InterIntraMaskBlending8bpp4x4_NEON(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride) {
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   uint8x8_t pred_mask_1 =
       GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
@@ -344,8 +348,9 @@ inline void InterIntraMaskBlending8bpp4x4_NEON(const uint8_t* pred_0,
 
 template <int subsampling_x, int subsampling_y>
 inline void InterIntraMaskBlending8bpp4xH_NEON(
-    const uint8_t* pred_0, uint8_t* pred_1, const ptrdiff_t pred_stride_1,
-    const uint8_t* mask, const ptrdiff_t mask_stride, const int height) {
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride, const int height) {
   if (height == 4) {
     InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
         pred_0, pred_1, pred_stride_1, mask, mask_stride);
@@ -369,12 +374,11 @@ inline void InterIntraMaskBlending8bpp4xH_NEON(
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlend8bpp_NEON(const uint8_t* prediction_0,
-                                         uint8_t* prediction_1,
-                                         const ptrdiff_t prediction_stride_1,
-                                         const uint8_t* const mask_ptr,
-                                         const ptrdiff_t mask_stride,
-                                         const int width, const int height) {
+inline void InterIntraMaskBlend8bpp_NEON(
+    const uint8_t* LIBGAV1_RESTRICT prediction_0,
+    uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+    const int width, const int height) {
   if (width == 4) {
     InterIntraMaskBlending8bpp4xH_NEON<subsampling_x, subsampling_y>(
         prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
index da3ba17..b117274 100644
--- a/src/dsp/arm/motion_vector_search_neon.cc
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -61,8 +61,8 @@ inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
 }
 
 inline int16x8_t MvProjectionCompoundClip(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
     const int reference_offsets[2]) {
   const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
   const int32x2_t temporal_mv = vld1_s32(tmvs);
@@ -76,9 +76,9 @@ inline int16x8_t MvProjectionCompoundClip(
 }
 
 inline int16x8_t MvProjectionSingleClip(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets, const int reference_offset,
-    int16x4_t* const lookup) {
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, int16x4_t* const lookup) {
   const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
   const int16x8_t temporal_mv = vld1q_s16(tmvs);
   *lookup = vld1_lane_s16(
@@ -116,9 +116,10 @@ inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
 }
 
 void MvProjectionCompoundLowPrecision_NEON(
-    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
     const int reference_offsets[2], const int count,
-    CompoundMotionVector* candidate_mvs) {
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
   // |reference_offsets| non-zero check usually equals true and is ignored.
   // To facilitate the compilers, make a local copy of |reference_offsets|.
   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
@@ -135,9 +136,10 @@ void MvProjectionCompoundLowPrecision_NEON(
 }
 
 void MvProjectionCompoundForceInteger_NEON(
-    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
     const int reference_offsets[2], const int count,
-    CompoundMotionVector* candidate_mvs) {
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
   // |reference_offsets| non-zero check usually equals true and is ignored.
   // To facilitate the compilers, make a local copy of |reference_offsets|.
   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
@@ -154,9 +156,10 @@ void MvProjectionCompoundForceInteger_NEON(
 }
 
 void MvProjectionCompoundHighPrecision_NEON(
-    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
     const int reference_offsets[2], const int count,
-    CompoundMotionVector* candidate_mvs) {
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
   // |reference_offsets| non-zero check usually equals true and is ignored.
   // To facilitate the compilers, make a local copy of |reference_offsets|.
   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
@@ -173,8 +176,10 @@ void MvProjectionCompoundHighPrecision_NEON(
 }
 
 void MvProjectionSingleLowPrecision_NEON(
-    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
-    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
   // Up to three more elements could be calculated.
   int loop_count = (count + 3) >> 2;
   int16x4_t lookup = vdup_n_s16(0);
@@ -189,8 +194,10 @@ void MvProjectionSingleLowPrecision_NEON(
 }
 
 void MvProjectionSingleForceInteger_NEON(
-    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
-    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
   // Up to three more elements could be calculated.
   int loop_count = (count + 3) >> 2;
   int16x4_t lookup = vdup_n_s16(0);
@@ -205,8 +212,10 @@ void MvProjectionSingleForceInteger_NEON(
 }
 
 void MvProjectionSingleHighPrecision_NEON(
-    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
-    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
   // Up to three more elements could be calculated.
   int loop_count = (count + 3) >> 2;
   int16x4_t lookup = vdup_n_s16(0);
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
index 1111a90..c032b31 100644
--- a/src/dsp/arm/obmc_neon.cc
+++ b/src/dsp/arm/obmc_neon.cc
@@ -36,7 +36,8 @@ namespace {
 
 #include "src/dsp/obmc.inc"
 
-inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
+inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
+                           const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
                            const uint8x8_t pred_mask,
                            const uint8x8_t obmc_pred_mask) {
   const uint8x8_t pred_val = Load4(pred);
@@ -47,35 +48,17 @@ inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
   StoreLo4(pred, result);
 }
 
-template <bool from_left>
-inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
-                                 const ptrdiff_t prediction_stride,
-                                 const int height,
-                                 const uint8_t* const obmc_prediction,
-                                 const ptrdiff_t obmc_prediction_stride) {
-  uint8_t* pred = prediction;
+inline void OverlapBlendFromLeft2xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
   const uint8x8_t mask_inverter = vdup_n_u8(64);
-  const uint8_t* obmc_pred = obmc_prediction;
-  uint8x8_t pred_mask;
-  uint8x8_t obmc_pred_mask;
-  int compute_height;
-  const int mask_offset = height - 2;
-  if (from_left) {
-    pred_mask = Load2(kObmcMask);
-    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
-    compute_height = height;
-  } else {
-    // Weights for the last line are all 64, which is a no-op.
-    compute_height = height - 1;
-  }
+  const uint8x8_t pred_mask = Load2(kObmcMask);
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   uint8x8_t pred_val = vdup_n_u8(0);
   uint8x8_t obmc_pred_val = vdup_n_u8(0);
   int y = 0;
   do {
-    if (!from_left) {
-      pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]);
-      obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
-    }
     pred_val = Load2<0>(pred, pred_val);
     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
     obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
@@ -85,16 +68,13 @@ inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
 
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
-  } while (++y != compute_height);
+  } while (++y != height);
 }
 
 inline void OverlapBlendFromLeft4xH_NEON(
-    uint8_t* const prediction, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* const obmc_prediction,
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
     const ptrdiff_t obmc_prediction_stride) {
-  uint8_t* pred = prediction;
-  const uint8_t* obmc_pred = obmc_prediction;
-
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8x8_t pred_mask = Load4(kObmcMask + 2);
   // 64 - mask
@@ -114,11 +94,9 @@ inline void OverlapBlendFromLeft4xH_NEON(
 }
 
 inline void OverlapBlendFromLeft8xH_NEON(
-    uint8_t* const prediction, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* const obmc_prediction,
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
     const ptrdiff_t obmc_prediction_stride) {
-  uint8_t* pred = prediction;
-  const uint8_t* obmc_pred = obmc_prediction;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
   // 64 - mask
@@ -137,17 +115,19 @@ inline void OverlapBlendFromLeft8xH_NEON(
   } while (++y != height);
 }
 
-void OverlapBlendFromLeft_NEON(void* const prediction,
-                               const ptrdiff_t prediction_stride,
-                               const int width, const int height,
-                               const void* const obmc_prediction,
-                               const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendFromLeft_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
   auto* pred = static_cast<uint8_t*>(prediction);
   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 2);
+  assert(height >= 4);
 
   if (width == 2) {
-    OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred,
-                               obmc_prediction_stride);
+    OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
     return;
   }
   if (width == 4) {
@@ -194,13 +174,10 @@ void OverlapBlendFromLeft_NEON(void* const prediction,
   } while (x < width);
 }
 
-inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
-                                        const ptrdiff_t prediction_stride,
-                                        const uint8_t* const obmc_prediction,
-                                        const ptrdiff_t obmc_prediction_stride,
-                                        const int height) {
-  uint8_t* pred = prediction;
-  const uint8_t* obmc_pred = obmc_prediction;
+inline void OverlapBlendFromTop4x4_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride, const int height) {
   uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
@@ -224,16 +201,14 @@ inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
 }
 
 inline void OverlapBlendFromTop4xH_NEON(
-    uint8_t* const prediction, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* const obmc_prediction,
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
     const ptrdiff_t obmc_prediction_stride) {
   if (height < 8) {
-    OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction,
+    OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred,
                                 obmc_prediction_stride, height);
     return;
   }
-  uint8_t* pred = prediction;
-  const uint8_t* obmc_pred = obmc_prediction;
   const uint8_t* mask = kObmcMask + height - 2;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   int y = 0;
@@ -282,11 +257,9 @@ inline void OverlapBlendFromTop4xH_NEON(
 }
 
 inline void OverlapBlendFromTop8xH_NEON(
-    uint8_t* const prediction, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* const obmc_prediction,
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
     const ptrdiff_t obmc_prediction_stride) {
-  uint8_t* pred = prediction;
-  const uint8_t* obmc_pred = obmc_prediction;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8_t* mask = kObmcMask + height - 2;
   const int compute_height = height - (height >> 2);
@@ -307,19 +280,16 @@ inline void OverlapBlendFromTop8xH_NEON(
   } while (++y != compute_height);
 }
 
-void OverlapBlendFromTop_NEON(void* const prediction,
-                              const ptrdiff_t prediction_stride,
-                              const int width, const int height,
-                              const void* const obmc_prediction,
-                              const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendFromTop_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
   auto* pred = static_cast<uint8_t*>(prediction);
   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 4);
+  assert(height >= 2);
 
-  if (width == 2) {
-    OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred,
-                                obmc_prediction_stride);
-    return;
-  }
   if (width == 4) {
     OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
                                 obmc_prediction_stride);
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
index 91537c4..88b51bd 100644
--- a/src/dsp/arm/super_res_neon.cc
+++ b/src/dsp/arm/super_res_neon.cc
@@ -81,11 +81,13 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
   return vqrshrn_n_u16(res, kFilterBits);
 }
 
-void SuperRes_NEON(const void* const coefficients, void* const source,
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+                   void* LIBGAV1_RESTRICT const source,
                    const ptrdiff_t source_stride, const int height,
                    const int downscaled_width, const int upscaled_width,
                    const int initial_subpixel_x, const int step,
-                   void* const dest, const ptrdiff_t dest_stride) {
+                   void* LIBGAV1_RESTRICT const dest,
+                   const ptrdiff_t dest_stride) {
   auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
   auto* dst = static_cast<uint8_t*>(dest);
   int y = height;
@@ -234,11 +236,13 @@ inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
 }
 
 template <int bitdepth>
-void SuperRes_NEON(const void* const coefficients, void* const source,
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+                   void* LIBGAV1_RESTRICT const source,
                    const ptrdiff_t source_stride, const int height,
                    const int downscaled_width, const int upscaled_width,
                    const int initial_subpixel_x, const int step,
-                   void* const dest, const ptrdiff_t dest_stride) {
+                   void* LIBGAV1_RESTRICT const dest,
+                   const ptrdiff_t dest_stride) {
   auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
   auto* dst = static_cast<uint16_t*>(dest);
   int y = height;
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
index c7fb739..a8d0091 100644
--- a/src/dsp/arm/warp_neon.cc
+++ b/src/dsp/arm/warp_neon.cc
@@ -103,13 +103,15 @@ void HorizontalFilter(const int sx4, const int16_t alpha,
 }
 
 template <bool is_compound>
-void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
-               const int source_width, const int source_height,
-               const int* const warp_params, const int subsampling_x,
-               const int subsampling_y, const int block_start_x,
-               const int block_start_y, const int block_width,
-               const int block_height, const int16_t alpha, const int16_t beta,
-               const int16_t gamma, const int16_t delta, void* dest,
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+               const ptrdiff_t source_stride, const int source_width,
+               const int source_height,
+               const int* LIBGAV1_RESTRICT const warp_params,
+               const int subsampling_x, const int subsampling_y,
+               const int block_start_x, const int block_start_y,
+               const int block_width, const int block_height,
+               const int16_t alpha, const int16_t beta, const int16_t gamma,
+               const int16_t delta, void* LIBGAV1_RESTRICT dest,
                const ptrdiff_t dest_stride) {
   constexpr int kRoundBitsVertical =
       is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
index 7e5bff0..965b904 100644
--- a/src/dsp/arm/weight_mask_neon.cc
+++ b/src/dsp/arm/weight_mask_neon.cc
@@ -38,8 +38,9 @@ namespace {
 constexpr int kRoundingBits8bpp = 4;
 
 template <bool mask_is_inverse>
-inline void WeightMask8_NEON(const int16_t* prediction_0,
-                             const int16_t* prediction_1, uint8_t* mask) {
+inline void WeightMask8_NEON(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                             const int16_t* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask) {
   const int16x8_t pred_0 = vld1q_s16(prediction_0);
   const int16x8_t pred_1 = vld1q_s16(prediction_1);
   const uint8x8_t difference_offset = vdup_n_u8(38);
@@ -67,8 +68,9 @@ inline void WeightMask8_NEON(const int16_t* prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask8x8_NEON(const void* prediction_0, const void* prediction_1,
-                        uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                        const void* LIBGAV1_RESTRICT prediction_1,
+                        uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y = 0;
@@ -79,8 +81,10 @@ void WeightMask8x8_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask8x16_NEON(const void* prediction_0, const void* prediction_1,
-                         uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask8x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -93,8 +97,10 @@ void WeightMask8x16_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask8x32_NEON(const void* prediction_0, const void* prediction_1,
-                         uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask8x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 0;
@@ -120,8 +126,10 @@ void WeightMask8x32_NEON(const void* prediction_0, const void* prediction_1,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask16x8_NEON(const void* prediction_0, const void* prediction_1,
-                         uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y = 0;
@@ -132,8 +140,10 @@ void WeightMask16x8_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x16_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -146,8 +156,10 @@ void WeightMask16x16_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x32_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 0;
@@ -163,8 +175,10 @@ void WeightMask16x32_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x64_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -189,8 +203,10 @@ void WeightMask16x64_NEON(const void* prediction_0, const void* prediction_1,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask32x8_NEON(const void* prediction_0, const void* prediction_1,
-                         uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         uint8_t* LIBGAV1_RESTRICT mask,
+                         ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   WEIGHT32_AND_STRIDE;
@@ -204,8 +220,10 @@ void WeightMask32x8_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x16_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -218,8 +236,10 @@ void WeightMask32x16_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x32_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 0;
@@ -235,8 +255,10 @@ void WeightMask32x32_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x64_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -265,8 +287,10 @@ void WeightMask32x64_NEON(const void* prediction_0, const void* prediction_1,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask64x16_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -279,8 +303,10 @@ void WeightMask64x16_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x32_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 0;
@@ -296,8 +322,10 @@ void WeightMask64x32_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x64_NEON(const void* prediction_0, const void* prediction_1,
-                          uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -310,8 +338,10 @@ void WeightMask64x64_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x128_NEON(const void* prediction_0, const void* prediction_1,
-                           uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x128_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -325,8 +355,10 @@ void WeightMask64x128_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask128x64_NEON(const void* prediction_0, const void* prediction_1,
-                           uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask128x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -367,8 +399,10 @@ void WeightMask128x64_NEON(const void* prediction_0, const void* prediction_1,
 }
 
 template <bool mask_is_inverse>
-void WeightMask128x128_NEON(const void* prediction_0, const void* prediction_1,
-                            uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask128x128_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
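
For reference, a hypothetical scalar model of one 8bpp mask value, assuming
the 38 offset and the kRoundingBits8bpp(4) shift visible in this file, and
ignoring the rounding the NEON shift applies; mask_is_inverse selects 64 - m:

  #include <algorithm>
  #include <cstdint>
  #include <cstdlib>

  uint8_t WeightMaskScalar(int16_t pred_0, int16_t pred_1, bool is_inverse) {
    // Offset the scaled prediction difference by 38 and cap at 64.
    const int m = std::min(64, 38 + (std::abs(pred_0 - pred_1) >> 4));
    return static_cast<uint8_t>(is_inverse ? 64 - m : m);
  }
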
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
index d3ec21f..273b355 100644
--- a/src/dsp/average_blend.cc
+++ b/src/dsp/average_blend.cc
@@ -27,8 +27,9 @@ namespace dsp {
 namespace {
 
 template <int bitdepth, typename Pixel>
-void AverageBlend_C(const void* prediction_0, const void* prediction_1,
-                    const int width, const int height, void* const dest,
+void AverageBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                    const void* LIBGAV1_RESTRICT prediction_1, const int width,
+                    const int height, void* const dest,
                     const ptrdiff_t dest_stride) {
   // 7.11.3.2 Rounding variables derivation process
   //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
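
The derivation in this comment produces the constants the code actually uses.
A compile-time restatement (not part of the patch), with InterRound0 being 3
for 8/10bpp and 5 for 12bpp; the results match the inter_post_round_bits
constant in the distance_weighted_blend.cc hunk further down:

  constexpr int kFilterBits = 7;
  constexpr int kInterRound1 = 7;
  constexpr int PostRoundBits(int inter_round_0) {
    return 2 * kFilterBits - (inter_round_0 + kInterRound1);
  }
  static_assert(PostRoundBits(3) == 4, "8bpp and 10bpp");
  static_assert(PostRoundBits(5) == 2, "12bpp");
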
diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc
index fe8a9d6..04e24e5 100644
--- a/src/dsp/average_blend_test.cc
+++ b/src/dsp/average_blend_test.cc
@@ -14,13 +14,13 @@
 
 #include "src/dsp/average_blend.h"
 
+#include <cassert>
 #include <cstdint>
 #include <ostream>
 #include <string>
 #include <type_traits>
 
 #include "absl/strings/match.h"
-#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
@@ -55,18 +55,8 @@ constexpr int kCompoundPredictionRange[3][2] = {
     {3974, 61559},
 };
 
-struct TestParam {
-  TestParam(int width, int height) : width(width), height(height) {}
-  int width;
-  int height;
-};
-
-std::ostream& operator<<(std::ostream& os, const TestParam& param) {
-  return os << "BlockSize" << param.width << "x" << param.height;
-}
-
 template <int bitdepth, typename Pixel>
-class AverageBlendTest : public testing::TestWithParam<TestParam>,
+class AverageBlendTest : public testing::TestWithParam<BlockSize>,
                          public test_utils::MaxAlignedAllocable {
  public:
   AverageBlendTest() = default;
@@ -105,8 +95,8 @@ class AverageBlendTest : public testing::TestWithParam<TestParam>,
   using PredType =
       typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
   static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
-  const int width_ = GetParam().width;
-  const int height_ = GetParam().height;
+  const int width_ = kBlockWidthPixels[GetParam()];
+  const int height_ = kBlockHeightPixels[GetParam()];
   alignas(kMaxAlignment) PredType
       source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
   alignas(kMaxAlignment) PredType
@@ -158,69 +148,54 @@ void AverageBlendTest<bitdepth, Pixel>::Test(const char* digest, int num_tests,
                                           kDestStride, kDestStride, false));
   }
 
-  test_utils::CheckMd5Digest(
-      kAverageBlend, absl::StrFormat("%dx%d", width_, height_).c_str(), digest,
-      dest_, sizeof(dest_[0]) * kDestStride * height_, elapsed_time);
+  test_utils::CheckMd5Digest(kAverageBlend, ToString(GetParam()), digest, dest_,
+                             sizeof(dest_[0]) * kDestStride * height_,
+                             elapsed_time);
 }
 
-const TestParam kTestParam[] = {
-    TestParam(4, 4),    TestParam(4, 8),    TestParam(8, 8),
-    TestParam(8, 16),   TestParam(16, 8),   TestParam(16, 16),
-    TestParam(16, 32),  TestParam(32, 16),  TestParam(32, 32),
-    TestParam(32, 64),  TestParam(64, 32),  TestParam(64, 64),
-    TestParam(64, 128), TestParam(128, 64), TestParam(128, 128),
+const BlockSize kTestParam[] = {
+    kBlock4x4,    kBlock4x8,     kBlock4x16,  kBlock8x4,   kBlock8x8,
+    kBlock8x16,   kBlock8x32,    kBlock16x4,  kBlock16x8,  kBlock16x16,
+    kBlock16x32,  kBlock16x64,   kBlock32x8,  kBlock32x16, kBlock32x32,
+    kBlock32x64,  kBlock64x16,   kBlock64x32, kBlock64x64, kBlock64x128,
+    kBlock128x64, kBlock128x128,
 };
 
 using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>;
 
-const char* GetAverageBlendDigest8bpp(const TestParam block_size) {
-  static const char* const kDigestsWidth4[] = {
+const char* GetAverageBlendDigest8bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
       "152bcc35946900b1ed16369b3e7a81b7",
       "c23e9b5698f7384eaae30a3908118b77",
-  };
-  static const char* const kDigestsWidth8[] = {
+      "f2da31d940f62490c368c03d32d3ede8",
+      // 8xN
+      "73c95485ef956e1d9ab914e88e6a202b",
       "d90d3abd368e58c513070a88b34649ba",
       "77f7d53d0edeffb3537afffd9ff33a4a",
-  };
-  static const char* const kDigestsWidth16[] = {
+      "460b9b1e6b83f65f013cfcaf67ec0122",
+      // 16xN
+      "96454a56de940174ff92e9bb686d6d38",
       "a50e268e93b48ae39cc2a47d377410e2",
       "65c8502ff6d78065d466f9911ed6bb3e",
       "bc2c873b9f5d74b396e1df705e87f699",
-  };
-  static const char* const kDigestsWidth32[] = {
+      "b4dae656484b2d255d1e09b7f34e12c1",
+      // 32xN
+      "7e1e5db92b22a96e5226a23de883d766",
       "ca40d46d89773e7f858b15fcecd43cc0",
       "bfdc894707323f4dc43d1326309f8368",
       "f4733417621719b7feba3166ec0da5b9",
-  };
-  static const char* const kDigestsWidth64[] = {
+      // 64xN
+      "378fa0594d22f01c8e8931c2a908d7c4",
       "db38fe2e082bd4a09acb3bb1d52ee11e",
       "3ad44401cc731215c46c9b7d96f7e4ae",
       "6c43267be5ed03d204a05fe36090f870",
-  };
-  static const char* const kDigestsWidth128[] = {
+      // 128xN
       "c8cfe46ebf166c1cbf08e8804206aadb",
       "b0557b5156d2334c8ce4a7ee12f9d6b4",
   };
-  // height < width implies 0.
-  // height == width implies 1.
-  // height > width implies 2.
-  const int height_index = block_size.height / block_size.width;
-  switch (block_size.width) {
-    case 4:
-      return kDigestsWidth4[height_index - 1];
-    case 8:
-      return kDigestsWidth8[height_index - 1];
-    case 16:
-      return kDigestsWidth16[height_index];
-    case 32:
-      return kDigestsWidth32[height_index];
-    case 64:
-      return kDigestsWidth64[height_index];
-    default:
-      EXPECT_EQ(block_size.width, 128)
-          << "Unknown width parameter: " << block_size.width;
-      return kDigestsWidth128[height_index];
-  }
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
 }
 
 TEST_P(AverageBlendTest8bpp, Blending) {
@@ -229,7 +204,9 @@ TEST_P(AverageBlendTest8bpp, Blending) {
 
 TEST_P(AverageBlendTest8bpp, DISABLED_Speed) {
   Test(GetAverageBlendDigest8bpp(GetParam()),
-       kNumSpeedTests / (GetParam().height * GetParam().width), false);
+       kNumSpeedTests /
+           (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]),
+       false);
 }
 
 INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp,
@@ -246,54 +223,39 @@ INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp,
 #if LIBGAV1_MAX_BITDEPTH >= 10
 using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>;
 
-const char* GetAverageBlendDigest10bpp(const TestParam block_size) {
-  static const char* const kDigestsWidth4[] = {
+const char* GetAverageBlendDigest10bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
       "98c0671c092b4288adcaaa17362cc4a3",
       "7083f3def8bfb63ab3a985ef5616a923",
-  };
-  static const char* const kDigestsWidth8[] = {
+      "a7211ee2eaa6f88e08875b377d17b0f1",
+      // 8xN
+      "11f9ab881700f2ef0f82d8d4662868c6",
       "3bee144b9ea6f4288b860c24f88a22f3",
       "27113bd17bf95034f100e9046c7b59d2",
-  };
-  static const char* const kDigestsWidth16[] = {
+      "c42886a5e16e23a81e43833d34467558",
+      // 16xN
+      "b0ac2eb0a7a6596d6d1339074c7f8771",
       "24c9e079b9a8647a6ee03f5441f2cdd9",
       "dd05777751ccdb4356856c90e1176e53",
       "27b1d69d035b1525c013b7373cfe3875",
-  };
-  static const char* const kDigestsWidth32[] = {
+      "08c46403afe19e6b008ccc8f56633da9",
+      // 32xN
+      "36d434db11298aba76166df06e9b8125",
       "efd24dd7b555786bff1a482e51170ea3",
       "3b37ddac87de443cd18784f02c2d1dd5",
       "80d8070939a743a20689a65bf5dc0a68",
-  };
-  static const char* const kDigestsWidth64[] = {
+      // 64xN
+      "88e747246237c6408d0bd4cc3ecc8396",
       "af1fe8c52487c9f2951c3ea516828abb",
       "ea6f18ff56b053748c18032b7e048e83",
       "af0cb87fe27d24c2e0afd2c90a8533a6",
-  };
-  static const char* const kDigestsWidth128[] = {
+      // 128xN
       "16a83b19911d6dc7278a694b8baa9901",
       "bd22e77ce6fa727267ff63eeb4dcb19c",
   };
-  // (height  < width) -> 0
-  // (height == width) -> 1
-  // (height  > width) -> 2
-  const int height_index = block_size.height / block_size.width;
-  switch (block_size.width) {
-    case 4:
-      return kDigestsWidth4[height_index - 1];
-    case 8:
-      return kDigestsWidth8[height_index - 1];
-    case 16:
-      return kDigestsWidth16[height_index];
-    case 32:
-      return kDigestsWidth32[height_index];
-    case 64:
-      return kDigestsWidth64[height_index];
-    default:
-      EXPECT_EQ(block_size.width, 128)
-          << "Unknown width parameter: " << block_size.width;
-      return kDigestsWidth128[height_index];
-  }
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
 }
 
 TEST_P(AverageBlendTest10bpp, Blending) {
@@ -302,7 +264,10 @@ TEST_P(AverageBlendTest10bpp, Blending) {
 
 TEST_P(AverageBlendTest10bpp, DISABLED_Speed) {
   Test(GetAverageBlendDigest10bpp(GetParam()),
-       kNumSpeedTests / (GetParam().height * GetParam().width) / 2, false);
+       kNumSpeedTests /
+           (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]) /
+           2,
+       false);
 }
 
 INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp,
@@ -319,4 +284,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp,
 
 }  // namespace
 }  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const BlockSize param) {
+  return os << ToString(param);
+}
+
 }  // namespace libgav1
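
The refactored lookup indexes the digest table directly with the BlockSize
enumerator, so it depends on the enumerators being contiguous from zero in
exactly the order the table is written. A hypothetical guard (not part of the
patch), assuming kBlock4x4 is the first enumerator in src/utils/constants.h
and that AV1 defines 22 block sizes:

  static_assert(kBlock4x4 == 0, "kDigests is indexed by BlockSize");
  static_assert(kMaxBlockSizes == 22, "one digest per block size");
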
diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc
index 0b50517..ca2adfd 100644
--- a/src/dsp/cdef.cc
+++ b/src/dsp/cdef.cc
@@ -40,8 +40,10 @@ constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
 int32_t Square(int32_t x) { return x * x; }
 
 template <int bitdepth, typename Pixel>
-void CdefDirection_C(const void* const source, ptrdiff_t stride,
-                     uint8_t* const direction, int* const variance) {
+void CdefDirection_C(const void* LIBGAV1_RESTRICT const source,
+                     ptrdiff_t stride,
+                     uint8_t* LIBGAV1_RESTRICT const direction,
+                     int* LIBGAV1_RESTRICT const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const Pixel*>(source);
@@ -121,10 +123,11 @@ int Constrain(int diff, int threshold, int damping) {
 // constant large value (kCdefLargeValue) if at the boundary.
 template <int block_width, int bitdepth, typename Pixel,
           bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_C(const uint16_t* src, const ptrdiff_t src_stride,
-                  const int block_height, const int primary_strength,
-                  const int secondary_strength, const int damping,
-                  const int direction, void* const dest,
+void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src,
+                  const ptrdiff_t src_stride, const int block_height,
+                  const int primary_strength, const int secondary_strength,
+                  const int damping, const int direction,
+                  void* LIBGAV1_RESTRICT const dest,
                   const ptrdiff_t dest_stride) {
   static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width.");
   static_assert(enable_primary || enable_secondary, "");
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
index 727b4af..f11b45e 100644
--- a/src/dsp/convolve.cc
+++ b/src/dsp/convolve.cc
@@ -33,34 +33,39 @@ constexpr int kHorizontalOffset = 3;
 constexpr int kVerticalOffset = 3;
 
 // Compound prediction output ranges from ConvolveTest.ShowRange.
+// In some cases, the horizontal or vertical filter will be omitted. This table
+// shows the general case, where the downscaled horizontal output is input to
+// the vertical filter via the |intermediate_result| array. The final output is
+// either Pixel or compound values, depending on the |is_compound| variable.
 // Bitdepth:  8 Input range:            [       0,      255]
-//   intermediate range:                [   -7140,    23460]
-//   first pass output range:           [   -1785,     5865]
-//   intermediate range:                [ -328440,   589560]
-//   second pass output range:          [       0,      255]
-//   compound second pass output range: [   -5132,     9212]
+//   Horizontal upscaled range:         [   -7140,    23460]
+//   Horizontal downscaled range:       [   -1785,     5865]
+//   Vertical upscaled range:           [ -328440,   589560]
+//   Pixel output range:                [       0,      255]
+//   Compound output range:             [   -5132,     9212]
 //
 // Bitdepth: 10 Input range:            [       0,     1023]
-//   intermediate range:                [  -28644,    94116]
-//   first pass output range:           [   -7161,    23529]
-//   intermediate range:                [-1317624,  2365176]
-//   second pass output range:          [       0,     1023]
-//   compound second pass output range: [    3988,    61532]
+//   Horizontal upscaled range:         [  -28644,    94116]
+//   Horizontal downscaled range:       [   -7161,    23529]
+//   Vertical upscaled range:           [-1317624,  2365176]
+//   Pixel output range:                [       0,     1023]
+//   Compound output range:             [    3988,    61532]
 //
 // Bitdepth: 12 Input range:            [       0,     4095]
-//   intermediate range:                [ -114660,   376740]
-//   first pass output range:           [   -7166,    23546]
-//   intermediate range:                [-1318560,  2366880]
-//   second pass output range:          [       0,     4095]
-//   compound second pass output range: [    3974,    61559]
+//   Horizontal upscaled range:         [ -114660,   376740]
+//   Horizontal downscaled range:       [   -7166,    23546]
+//   Vertical upscaled range:           [-1318560,  2366880]
+//   Pixel output range:                [       0,     4095]
+//   Compound output range:             [    3974,    61559]
 
 template <int bitdepth, typename Pixel>
-void ConvolveScale2D_C(const void* const reference,
+void ConvolveScale2D_C(const void* LIBGAV1_RESTRICT const reference,
                        const ptrdiff_t reference_stride,
                        const int horizontal_filter_index,
                        const int vertical_filter_index, const int subpixel_x,
                        const int subpixel_y, const int step_x, const int step_y,
-                       const int width, const int height, void* prediction,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT prediction,
                        const ptrdiff_t pred_stride) {
   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                            ? kInterRoundBitsHorizontal12bpp
@@ -137,14 +142,12 @@ void ConvolveScale2D_C(const void* const reference,
 }
 
 template <int bitdepth, typename Pixel>
-void ConvolveCompoundScale2D_C(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int horizontal_filter_index,
-                               const int vertical_filter_index,
-                               const int subpixel_x, const int subpixel_y,
-                               const int step_x, const int step_y,
-                               const int width, const int height,
-                               void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundScale2D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int subpixel_x, const int subpixel_y,
+    const int step_x, const int step_y, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
   assert(pred_stride == width);
@@ -223,13 +226,13 @@ void ConvolveCompoundScale2D_C(const void* const reference,
 }
 
 template <int bitdepth, typename Pixel>
-void ConvolveCompound2D_C(const void* const reference,
+void ConvolveCompound2D_C(const void* LIBGAV1_RESTRICT const reference,
                           const ptrdiff_t reference_stride,
                           const int horizontal_filter_index,
                           const int vertical_filter_index,
                           const int horizontal_filter_id,
                           const int vertical_filter_id, const int width,
-                          const int height, void* prediction,
+                          const int height, void* LIBGAV1_RESTRICT prediction,
                           const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
@@ -307,11 +310,13 @@ void ConvolveCompound2D_C(const void* const reference,
 // The output is the single prediction of the block, clipped to valid pixel
 // range.
 template <int bitdepth, typename Pixel>
-void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
+void Convolve2D_C(const void* LIBGAV1_RESTRICT const reference,
+                  const ptrdiff_t reference_stride,
                   const int horizontal_filter_index,
                   const int vertical_filter_index,
                   const int horizontal_filter_id, const int vertical_filter_id,
-                  const int width, const int height, void* prediction,
+                  const int width, const int height,
+                  void* LIBGAV1_RESTRICT prediction,
                   const ptrdiff_t pred_stride) {
   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                            ? kInterRoundBitsHorizontal12bpp
@@ -385,13 +390,13 @@ void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
 // The output is the single prediction of the block, clipped to valid pixel
 // range.
 template <int bitdepth, typename Pixel>
-void ConvolveHorizontal_C(const void* const reference,
+void ConvolveHorizontal_C(const void* LIBGAV1_RESTRICT const reference,
                           const ptrdiff_t reference_stride,
                           const int horizontal_filter_index,
                           const int /*vertical_filter_index*/,
                           const int horizontal_filter_id,
                           const int /*vertical_filter_id*/, const int width,
-                          const int height, void* prediction,
+                          const int height, void* LIBGAV1_RESTRICT prediction,
                           const ptrdiff_t pred_stride) {
   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                            ? kInterRoundBitsHorizontal12bpp
@@ -427,13 +432,13 @@ void ConvolveHorizontal_C(const void* const reference,
 // The output is the single prediction of the block, clipped to valid pixel
 // range.
 template <int bitdepth, typename Pixel>
-void ConvolveVertical_C(const void* const reference,
+void ConvolveVertical_C(const void* LIBGAV1_RESTRICT const reference,
                         const ptrdiff_t reference_stride,
                         const int /*horizontal_filter_index*/,
                         const int vertical_filter_index,
                         const int /*horizontal_filter_id*/,
                         const int vertical_filter_id, const int width,
-                        const int height, void* prediction,
+                        const int height, void* LIBGAV1_RESTRICT prediction,
                         const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
@@ -464,13 +469,13 @@ void ConvolveVertical_C(const void* const reference,
 }
 
 template <int bitdepth, typename Pixel>
-void ConvolveCopy_C(const void* const reference,
+void ConvolveCopy_C(const void* LIBGAV1_RESTRICT const reference,
                     const ptrdiff_t reference_stride,
                     const int /*horizontal_filter_index*/,
                     const int /*vertical_filter_index*/,
                     const int /*horizontal_filter_id*/,
                     const int /*vertical_filter_id*/, const int width,
-                    const int height, void* prediction,
+                    const int height, void* LIBGAV1_RESTRICT prediction,
                     const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
@@ -483,13 +488,13 @@ void ConvolveCopy_C(const void* const reference,
 }
 
 template <int bitdepth, typename Pixel>
-void ConvolveCompoundCopy_C(const void* const reference,
+void ConvolveCompoundCopy_C(const void* LIBGAV1_RESTRICT const reference,
                             const ptrdiff_t reference_stride,
                             const int /*horizontal_filter_index*/,
                             const int /*vertical_filter_index*/,
                             const int /*horizontal_filter_id*/,
                             const int /*vertical_filter_id*/, const int width,
-                            const int height, void* prediction,
+                            const int height, void* LIBGAV1_RESTRICT prediction,
                             const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
@@ -523,11 +528,11 @@ void ConvolveCompoundCopy_C(const void* const reference,
 // blended with another predictor to generate the final prediction of the block.
 template <int bitdepth, typename Pixel>
 void ConvolveCompoundHorizontal_C(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
   assert(pred_stride == width);
@@ -567,14 +572,12 @@ void ConvolveCompoundHorizontal_C(
 // The output is not clipped to valid pixel range. Its output will be
 // blended with another predictor to generate the final prediction of the block.
 template <int bitdepth, typename Pixel>
-void ConvolveCompoundVertical_C(const void* const reference,
-                                const ptrdiff_t reference_stride,
-                                const int /*horizontal_filter_index*/,
-                                const int vertical_filter_index,
-                                const int /*horizontal_filter_id*/,
-                                const int vertical_filter_id, const int width,
-                                const int height, void* prediction,
-                                const ptrdiff_t pred_stride) {
+void ConvolveCompoundVertical_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
   assert(pred_stride == width);
@@ -615,14 +618,12 @@ void ConvolveCompoundVertical_C(const void* const reference,
 // The output is the single prediction of the block, clipped to valid pixel
 // range.
 template <int bitdepth, typename Pixel>
-void ConvolveIntraBlockCopy2D_C(const void* const reference,
-                                const ptrdiff_t reference_stride,
-                                const int /*horizontal_filter_index*/,
-                                const int /*vertical_filter_index*/,
-                                const int /*horizontal_filter_id*/,
-                                const int /*vertical_filter_id*/,
-                                const int width, const int height,
-                                void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveIntraBlockCopy2D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
   assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
   const auto* src = static_cast<const Pixel*>(reference);
@@ -670,14 +671,12 @@ void ConvolveIntraBlockCopy2D_C(const void* const reference,
 // The filtering of intra block copy is simply the average of current and
 // the next pixel.
 template <int bitdepth, typename Pixel, bool is_horizontal>
-void ConvolveIntraBlockCopy1D_C(const void* const reference,
-                                const ptrdiff_t reference_stride,
-                                const int /*horizontal_filter_index*/,
-                                const int /*vertical_filter_index*/,
-                                const int /*horizontal_filter_id*/,
-                                const int /*vertical_filter_id*/,
-                                const int width, const int height,
-                                void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveIntraBlockCopy1D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
   assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
   const auto* src = static_cast<const Pixel*>(reference);
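
The renamed range table at the top of this file can be reproduced with a
rounding right shift. A sketch modeled on libgav1's RightShiftWithRounding
helper (not part of the patch); the 2-bit shift is inferred from the 8bpp rows
of the table, and negative inputs rely on arithmetic right shift:

  constexpr int RightShiftWithRounding(int value, int bits) {
    // Add half of 2^bits before shifting.
    return (value + ((1 << bits) >> 1)) >> bits;
  }
  static_assert(RightShiftWithRounding(23460, 2) == 5865, "downscaled max");
  static_assert(RightShiftWithRounding(-7140, 2) == -1785, "downscaled min");
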
diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc
index 4a2a9f1..52732f4 100644
--- a/src/dsp/convolve_test.cc
+++ b/src/dsp/convolve_test.cc
@@ -776,6 +776,9 @@ class ConvolveTest
       }
     } else if (absl::StartsWith(test_case, "NEON/")) {
       ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ConvolveInit10bpp_NEON();
+#endif
     } else {
       FAIL() << "Unrecognized architecture prefix in test case name: "
              << test_case;
@@ -1210,7 +1213,7 @@ void ShowRange() {
     assert(max > INT16_MAX && max < INT32_MAX);
   }
 
-  printf("  intermediate range:                [%8d, %8d]\n", min, max);
+  printf("  Horizontal upscaled range:         [%8d, %8d]\n", min, max);
 
   const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
   const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
@@ -1219,7 +1222,7 @@ void ShowRange() {
   assert(first_pass_min > INT16_MIN);
   assert(first_pass_max < INT16_MAX);
 
-  printf("  first pass output range:           [%8d, %8d]\n", first_pass_min,
+  printf("  Horizontal downscaled range:       [%8d, %8d]\n", first_pass_min,
          first_pass_max);
 
   // Second pass.
@@ -1230,14 +1233,14 @@ void ShowRange() {
   assert(min < INT16_MIN && min > INT32_MIN);
   assert(max > INT16_MAX && max < INT32_MAX);
 
-  printf("  intermediate range:                [%8d, %8d]\n", min, max);
+  printf("  Vertical upscaled range:           [%8d, %8d]\n", min, max);
 
   // Second pass non-compound output is clipped to Pixel values.
   const int second_pass_min =
       Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
   const int second_pass_max =
       Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
-  printf("  second pass output range:          [%8d, %8d]\n", second_pass_min,
+  printf("  Pixel output range:                [%8d, %8d]\n", second_pass_min,
          second_pass_max);
 
   // Output is Pixel so matches Pixel values.
@@ -1249,7 +1252,7 @@ void ShowRange() {
   const int compound_second_pass_max =
       RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
 
-  printf("  compound second pass output range: [%8d, %8d]\n",
+  printf("  Compound output range:             [%8d, %8d]\n",
          compound_second_pass_min, compound_second_pass_max);
 
   if (bitdepth == 8) {
@@ -1366,6 +1369,14 @@ INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp,
                          testing::Combine(testing::ValuesIn(kConvolveParam),
                                           testing::ValuesIn(kConvolveTypeParam),
                                           testing::Bool()));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest10bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveParam),
+                                          testing::ValuesIn(kConvolveTypeParam),
+                                          testing::Bool()));
+#endif  // LIBGAV1_ENABLE_NEON
+
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
 }  // namespace
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
index a035fbe..34d10fc 100644
--- a/src/dsp/distance_weighted_blend.cc
+++ b/src/dsp/distance_weighted_blend.cc
@@ -27,10 +27,12 @@ namespace dsp {
 namespace {
 
 template <int bitdepth, typename Pixel>
-void DistanceWeightedBlend_C(const void* prediction_0, const void* prediction_1,
+void DistanceWeightedBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
                              const uint8_t weight_0, const uint8_t weight_1,
                              const int width, const int height,
-                             void* const dest, const ptrdiff_t dest_stride) {
+                             void* LIBGAV1_RESTRICT const dest,
+                             const ptrdiff_t dest_stride) {
   // 7.11.3.2 Rounding variables derivation process
   //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
   constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
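
Each weight pair in the test's kQuantizedDistanceLookup table (shown in the
next file) sums to 16, so the blend is conceptually a Q4 fixed-point weighted
average. A hypothetical scalar model (not part of the patch), ignoring the
compound prediction offset and the inter_post_round_bits shift the real code
applies:

  int BlendQ4(int pred_0, int pred_1, int weight_0, int weight_1) {
    // Assumes weight_0 + weight_1 == 16; the +8 rounds the 4-bit shift.
    return (pred_0 * weight_0 + pred_1 * weight_1 + 8) >> 4;
  }
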
diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc
index b3f3a2e..fdf058e 100644
--- a/src/dsp/distance_weighted_blend_test.cc
+++ b/src/dsp/distance_weighted_blend_test.cc
@@ -14,13 +14,13 @@
 
 #include "src/dsp/distance_weighted_blend.h"
 
+#include <cassert>
 #include <cstdint>
 #include <ostream>
 #include <string>
 #include <type_traits>
 
 #include "absl/strings/match.h"
-#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
@@ -43,18 +43,8 @@ constexpr int kNumSpeedTests = 500000;
 constexpr int kQuantizedDistanceLookup[4][2] = {
     {9, 7}, {11, 5}, {12, 4}, {13, 3}};
 
-struct TestParam {
-  TestParam(int width, int height) : width(width), height(height) {}
-  int width;
-  int height;
-};
-
-std::ostream& operator<<(std::ostream& os, const TestParam& param) {
-  return os << "BlockSize" << param.width << "x" << param.height;
-}
-
 template <int bitdepth, typename Pixel>
-class DistanceWeightedBlendTest : public testing::TestWithParam<TestParam>,
+class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>,
                                   public test_utils::MaxAlignedAllocable {
  public:
   DistanceWeightedBlendTest() = default;
@@ -91,8 +81,8 @@ class DistanceWeightedBlendTest : public testing::TestWithParam<TestParam>,
   using PredType =
       typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
   static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
-  const int width_ = GetParam().width;
-  const int height_ = GetParam().height;
+  const int width_ = kBlockWidthPixels[GetParam()];
+  const int height_ = kBlockHeightPixels[GetParam()];
   alignas(kMaxAlignment) PredType
       source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
   alignas(kMaxAlignment) PredType
@@ -149,74 +139,51 @@ void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest,
     elapsed_time += absl::Now() - start;
   }
 
-  test_utils::CheckMd5Digest(
-      "DistanceWeightedBlend",
-      absl::StrFormat("BlockSize%dx%d", width_, height_).c_str(), digest, dest_,
-      sizeof(dest_), elapsed_time);
+  test_utils::CheckMd5Digest("DistanceWeightedBlend", ToString(GetParam()),
+                             digest, dest_, sizeof(dest_), elapsed_time);
 }
 
-const TestParam kTestParam[] = {
-    TestParam(4, 4),    TestParam(4, 8),    TestParam(4, 16),
-    TestParam(8, 4),    TestParam(8, 8),    TestParam(8, 16),
-    TestParam(8, 32),   TestParam(16, 4),   TestParam(16, 8),
-    TestParam(16, 16),  TestParam(16, 32),  TestParam(16, 64),
-    TestParam(32, 8),   TestParam(32, 16),  TestParam(32, 32),
-    TestParam(32, 64),  TestParam(32, 128), TestParam(64, 16),
-    TestParam(64, 32),  TestParam(64, 64),  TestParam(64, 128),
-    TestParam(128, 32), TestParam(128, 64), TestParam(128, 128),
+const BlockSize kTestParam[] = {
+    kBlock4x4,    kBlock4x8,     kBlock4x16,  kBlock8x4,   kBlock8x8,
+    kBlock8x16,   kBlock8x32,    kBlock16x4,  kBlock16x8,  kBlock16x16,
+    kBlock16x32,  kBlock16x64,   kBlock32x8,  kBlock32x16, kBlock32x32,
+    kBlock32x64,  kBlock64x16,   kBlock64x32, kBlock64x64, kBlock64x128,
+    kBlock128x64, kBlock128x128,
 };
 
-const char* GetDistanceWeightedBlendDigest8bpp(const TestParam block_size) {
-  static const char* const kDigestsWidth4[] = {
+const char* GetDistanceWeightedBlendDigest8bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
       "ebf389f724f8ab46a2cac895e4e073ca",
       "09acd567b6b12c8cf8eb51d8b86eb4bf",
       "57bb4d65695d8ec6752f2bd8686b64fd",
-  };
-  static const char* const kDigestsWidth8[] = {
+      // 8xN
       "270905ac76f9a2cba8a552eb0bf7c8c1",
       "f0801c8574d2c271ef2bbea77a1d7352",
       "e761b580e3312be33a227492a233ce72",
       "ff214dab1a7e98e2285961d6421720c6",
-  };
-  static const char* const kDigestsWidth16[] = {
-      "4f712609a36e817f9752326d58562ff8", "14243f5c5f7c7104160c1f2cef0a0fbc",
-      "3ac3f3161b7c8dd8436b02abfdde104a", "81a00b704e0e41a5dbe6436ac70c098d",
+      // 16xN
+      "4f712609a36e817f9752326d58562ff8",
+      "14243f5c5f7c7104160c1f2cef0a0fbc",
+      "3ac3f3161b7c8dd8436b02abfdde104a",
+      "81a00b704e0e41a5dbe6436ac70c098d",
       "af8fd02017c7acdff788be742d700baa",
-  };
-  static const char* const kDigestsWidth32[] = {
-      "ee34332c66a6d6ed8ce64031aafe776c", "b5e3d22bd2dbdb624c8b86a1afb5ce6d",
-      "607ffc22098d81b7e37a7bf62f4af5d3", "3823dbf043b4682f56d5ca698e755ea5",
-      "57f7e8d1e67645269ce760a2c8da4afc",
-  };
-  static const char* const kDigestsWidth64[] = {
+      // 32xN
+      "ee34332c66a6d6ed8ce64031aafe776c",
+      "b5e3d22bd2dbdb624c8b86a1afb5ce6d",
+      "607ffc22098d81b7e37a7bf62f4af5d3",
+      "3823dbf043b4682f56d5ca698e755ea5",
+      // 64xN
       "4acf556b921956c2bc24659cd5128401",
       "a298c544c9c3b27924b4c23cc687ea5a",
       "539e2df267782ce61c70103b23b7d922",
       "3b0cb2a0b5d384efee4d81401025bec1",
-  };
-  static const char* const kDigestsWidth128[] = {
-      "d71ee689a40ff5f390d07717df4b7233",
+      // 128xN
       "8b56b636dd712c2f8d138badb7219991",
       "8cfc8836908902b8f915639b7bff45b3",
   };
-  const int height_index =
-      FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2;
-  switch (block_size.width) {
-    case 4:
-      return kDigestsWidth4[height_index - 2];
-    case 8:
-      return kDigestsWidth8[height_index - 1];
-    case 16:
-      return kDigestsWidth16[height_index];
-    case 32:
-      return kDigestsWidth32[height_index];
-    case 64:
-      return kDigestsWidth64[height_index];
-    default:
-      EXPECT_EQ(block_size.width, 128)
-          << "Unknown width parameter: " << block_size.width;
-      return kDigestsWidth128[height_index];
-  }
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
 }
 
 using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>;
@@ -243,57 +210,39 @@ INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp,
 #endif
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
-const char* GetDistanceWeightedBlendDigest10bpp(const TestParam block_size) {
-  static const char* const kDigestsWidth4[] = {
+const char* GetDistanceWeightedBlendDigest10bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
       "55f594b56e16d5c401274affebbcc3d3",
       "69df14da4bb33a8f7d7087921008e919",
       "1b61f33604c54015794198a13bfebf46",
-  };
-  static const char* const kDigestsWidth8[] = {
+      // 8xN
       "825a938185b152f7cf09bf1c0723ce2b",
       "85ea315c51d979bc9b45834d6b40ec6f",
       "92ebde208e8c39f7ec6de2de82182dbb",
       "520f84716db5b43684dbb703806383fe",
-  };
-  static const char* const kDigestsWidth16[] = {
-      "12ca23e3e2930005a0511646e8c83da4", "6208694a6744f4a3906f58c1add670e3",
-      "a33d63889df989a3bbf84ff236614267", "34830846ecb0572a98bbd192fed02b16",
+      // 16xN
+      "12ca23e3e2930005a0511646e8c83da4",
+      "6208694a6744f4a3906f58c1add670e3",
+      "a33d63889df989a3bbf84ff236614267",
+      "34830846ecb0572a98bbd192fed02b16",
       "34bb2f79c0bd7f9a80691b8af597f2a8",
-  };
-  static const char* const kDigestsWidth32[] = {
-      "fa97f2d0e3143f1f44d3ac018b0d696d", "3df4a22456c9ab6ed346ab1b9750ae7d",
-      "6276a058b35c6131bc0c94a4b4a37ebc", "9ca42da5d2d5eb339df03ae2c7a26914",
-      "2ff0dc010a7b40830fb47423a9beb894",
-  };
-  static const char* const kDigestsWidth64[] = {
+      // 32xN
+      "fa97f2d0e3143f1f44d3ac018b0d696d",
+      "3df4a22456c9ab6ed346ab1b9750ae7d",
+      "6276a058b35c6131bc0c94a4b4a37ebc",
+      "9ca42da5d2d5eb339df03ae2c7a26914",
+      // 64xN
       "800e692c520f99223bc24c1ac95a0166",
       "818b6d20426585ef7fe844015a03aaf5",
       "fb48691ccfff083e01d74826e88e613f",
       "0bd350bc5bc604a224d77a5f5a422698",
-  };
-  static const char* const kDigestsWidth128[] = {
-      "02aac5d5669c1245da876c5440c4d829",
+      // 128xN
       "a130840813cd6bd69d09bcf5f8d0180f",
       "6ece1846bea55e8f8f2ed7fbf73718de",
   };
-  const int height_index =
-      FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2;
-  switch (block_size.width) {
-    case 4:
-      return kDigestsWidth4[height_index - 2];
-    case 8:
-      return kDigestsWidth8[height_index - 1];
-    case 16:
-      return kDigestsWidth16[height_index];
-    case 32:
-      return kDigestsWidth32[height_index];
-    case 64:
-      return kDigestsWidth64[height_index];
-    default:
-      EXPECT_EQ(block_size.width, 128)
-          << "Unknown width parameter: " << block_size.width;
-      return kDigestsWidth128[height_index];
-  }
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
 }
 
 using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>;
@@ -321,4 +270,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp,
 
 }  // namespace
 }  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const BlockSize param) {
+  return os << ToString(param);
+}
+
 }  // namespace libgav1
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
index a3d7701..aac0ca0 100644
--- a/src/dsp/dsp.cc
+++ b/src/dsp/dsp.cc
@@ -155,7 +155,9 @@ void DspInit() {
     WarpInit_NEON();
     WeightMaskInit_NEON();
 #if LIBGAV1_MAX_BITDEPTH >= 10
+    ConvolveInit10bpp_NEON();
     InverseTransformInit10bpp_NEON();
+    LoopRestorationInit10bpp_NEON();
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 #endif  // LIBGAV1_ENABLE_NEON
   });
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 153db7f..298b6d8 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -194,6 +194,7 @@ inline const char* ToString(const LoopFilterType filter_type) {
 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
 // the row above |dst|. |left| is an aligned vector of the column to the left
 // of |dst|. top-left and bottom-left may be accessed.
+// The pointer arguments do not alias one another.
 using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
                                     const void* top, const void* left);
 using IntraPredictorFuncs =
@@ -209,6 +210,7 @@ using IntraPredictorFuncs =
 // |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
 // process'. This can occur in cases with |width| + |height| <= 16. top-right
 // is accessed.
+// The pointer arguments do not alias one another.
 using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
                                                     const void* top, int width,
                                                     int height, int xstep,
@@ -226,6 +228,7 @@ using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
 // described in '7.11.2.11. Intra edge upsample process'. This can occur in
 // cases with |width| + |height| <= 16. top-left and upper-left are accessed,
 // up to [-2] in each if |upsampled_top/left| are set.
+// The pointer arguments do not alias one another.
 using DirectionalIntraPredictorZone2Func = void (*)(
     void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
     int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
@@ -240,6 +243,7 @@ using DirectionalIntraPredictorZone2Func = void (*)(
 // |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
 // process'. This can occur in cases with |width| + |height| <= 16. bottom-left
 // is accessed.
+// The pointer arguments do not alias one another.
 using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
                                                     const void* left, int width,
                                                     int height, int ystep,
@@ -250,6 +254,7 @@ using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
 // the row above |dst|. |left| is an aligned vector of the column to the left
 // of |dst|. |width| and |height| are the size of the block in pixels.
+// The pointer arguments do not alias one another.
 using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
                                           const void* top, const void* left,
                                           FilterIntraPredictor pred, int width,
@@ -303,11 +308,14 @@ using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
 // 7.13.3).
 // Apply the inverse transforms and add the residual to the destination frame
 // for the transform type and block size |tx_size| starting at position
-// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
-// |adjusted_tx_height| is the number of rows to process based on the non-zero
-// coefficient count in the block. It will be 1 (non-zero coefficient count ==
-// 1), 4 or a multiple of 8 up to 32 or the original transform height,
-// whichever is less.
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D of Pixel
+// values. |adjusted_tx_height| is the number of rows to process based on the
+// non-zero coefficient count in the block. It will be 1 (non-zero coefficient
+// count == 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less. |src_buffer| is a pointer to an Array2D of Residual
+// values. On input |src_buffer| contains the dequantized values, on output it
+// contains the residual.
+// The pointer arguments do not alias one another.
 using InverseTransformAddFunc = void (*)(TransformType tx_type,
                                          TransformSize tx_size,
                                          int adjusted_tx_height,
@@ -324,6 +332,13 @@ using InverseTransformAddFuncs =
 // Loop filter function signature. Section 7.14.
 // |dst| is an unaligned pointer to the output block. Pixel size is determined
 // by bitdepth with |stride| given in bytes.
+// <threshold param> <spec name> <range>
+// |outer_thresh|    blimit      [7, 193]
+// |inner_thresh|    limit       [1, 63]
+// |hev_thresh|      thresh      [0, 63]
+// These are scaled by the implementation by 'bitdepth - 8' to produce
+// the spec variables blimitBd, limitBd and threshBd.
+// Note these functions are not called when the loop filter level is 0.
 using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
                                 int inner_thresh, int hev_thresh);
 using LoopFilterFuncs =
@@ -333,6 +348,7 @@ using LoopFilterFuncs =
 // |src| is a pointer to the source block. Pixel size is determined by bitdepth
 // with |stride| given in bytes. |direction| and |variance| are output
 // parameters and must not be nullptr.
+// The pointer arguments do not alias one another.
 using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
                                    uint8_t* direction, int* variance);
 
@@ -344,6 +360,7 @@ using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
 // parameters.
 // |direction| is the filtering direction.
 // |dest| is the output buffer. |dest_stride| is given in bytes.
+// The pointer arguments do not alias one another.
 using CdefFilteringFunc = void (*)(const uint16_t* source,
                                    ptrdiff_t source_stride, int block_height,
                                    int primary_strength, int secondary_strength,
@@ -381,6 +398,7 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width,
 // |step| is the number of subpixels to move the kernel for the next destination
 // pixel.
 // |initial_subpixel_x| is a base offset from which |step| increments.
+// The pointer arguments do not alias one another.
 using SuperResFunc = void (*)(const void* coefficients, void* source,
                               ptrdiff_t source_stride, int height,
                               int downscaled_width, int upscaled_width,
@@ -397,6 +415,7 @@ using SuperResFunc = void (*)(const void* coefficients, void* source,
 // |top_border_stride| and |bottom_border_stride| are given in pixels.
 // |restoration_buffer| contains buffers required for self guided filter and
 // wiener filter. They must be initialized before calling.
+// The pointer arguments do not alias one another.
 using LoopRestorationFunc = void (*)(
     const RestorationUnitInfo& restoration_info, const void* source,
     ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
@@ -425,6 +444,7 @@ using LoopRestorationFuncs = LoopRestorationFunc[2];
 // used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
 // used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
 // be used.
+// The pointer arguments do not alias one another.
 using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
                               int horizontal_filter_index,
                               int vertical_filter_index,
@@ -462,6 +482,7 @@ using ConvolveFuncs = ConvolveFunc[2][2][2][2];
 // used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
 // used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
 // be used.
+// The pointer arguments do not alias one another.
 using ConvolveScaleFunc = void (*)(const void* reference,
                                    ptrdiff_t reference_stride,
                                    int horizontal_filter_index,
@@ -482,6 +503,7 @@ using ConvolveScaleFuncs = ConvolveScaleFunc[2];
 // The stride for the input buffers is equal to |width|.
 // The valid range of block size is [8x8, 128x128] for the luma plane.
 // |mask| is the output buffer. |mask_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
 using WeightMaskFunc = void (*)(const void* prediction_0,
                                 const void* prediction_1, uint8_t* mask,
                                 ptrdiff_t mask_stride);
@@ -504,6 +526,7 @@ using WeightMaskFuncs = WeightMaskFunc[6][6][2];
 // The stride for the input buffers is equal to |width|.
 // The valid range of block size is [8x8, 128x128] for the luma plane.
 // |dest| is the output buffer. |dest_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
 using AverageBlendFunc = void (*)(const void* prediction_0,
                                   const void* prediction_1, int width,
                                   int height, void* dest,
@@ -525,6 +548,7 @@ using AverageBlendFunc = void (*)(const void* prediction_0,
 // The stride for the input buffers is equal to |width|.
 // The valid range of block size is [8x8, 128x128] for the luma plane.
 // |dest| is the output buffer. |dest_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
 using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
                                            const void* prediction_1,
                                            uint8_t weight_0, uint8_t weight_1,
@@ -550,17 +574,18 @@ using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
 // |mask_stride| is corresponding stride.
 // |width|, |height| are the same for both input blocks.
 // If it's inter_intra (or wedge_inter_intra), the valid range of block size is
-// [8x8, 32x32]. Otherwise (including difference weighted prediction and
-// compound average prediction), the valid range is [8x8, 128x128].
+// [8x8, 32x32], no 4:1/1:4 blocks (Section 5.11.28). Otherwise (including
+// difference weighted prediction and compound average prediction), the valid
+// range is [8x8, 128x128].
 // If there's subsampling, the corresponding width and height are halved for
 // chroma planes.
-// |subsampling_x|, |subsampling_y| are the subsampling factors.
 // |is_inter_intra| stands for the prediction mode. If it is true, one of the
 // prediction blocks is from intra prediction of current frame. Otherwise, two
 // prediction blocks are both inter frame predictions.
 // |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
 // |dest| is the output block.
 // |dest_stride| is the corresponding stride for dest.
+// The pointer arguments do not alias one another.
 using MaskBlendFunc = void (*)(const void* prediction_0,
                                const void* prediction_1,
                                ptrdiff_t prediction_stride_1,
@@ -577,6 +602,7 @@ using MaskBlendFuncs = MaskBlendFunc[3][2];
 // |is_inter_intra| is true and |bitdepth| == 8.
 // |prediction_[01]| are Pixel values (uint8_t).
 // |prediction_1| is also the output buffer.
+// The pointer arguments do not alias one another.
 using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
                                              uint8_t* prediction_1,
                                              ptrdiff_t prediction_stride_1,
@@ -600,9 +626,12 @@ using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
 // clipped. Therefore obmc blending process doesn't need to clip the output.
 // |prediction| is the first input block, which will be overwritten.
 // |prediction_stride| is the stride, given in bytes.
-// |width|, |height| are the same for both input blocks.
+// |width|, |height| are the same for both input blocks. The range is [4x2,
+// 32x32] for kObmcDirectionVertical and [2x4, 32x32] for
+// kObmcDirectionHorizontal, see Section 7.11.3.9.
 // |obmc_prediction| is the second input block.
 // |obmc_prediction_stride| is its stride, given in bytes.
+// The pointer arguments do not alias one another.
 using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
                                int width, int height,
                                const void* obmc_prediction,
@@ -645,6 +674,7 @@ using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
 //   Therefore, there must be at least one extra padding byte after the right
 //   border of the last row in the source buffer.
 // * The top and bottom borders must be at least 13 pixels high.
+// The pointer arguments do not alias one another.
 using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
                           int source_width, int source_height,
                           const int* warp_params, int subsampling_x,
@@ -686,6 +716,7 @@ using LumaAutoRegressionFuncs =
 // from frame header, mainly providing auto_regression_coeff_u and
 // auto_regression_coeff_v for each chroma plane's filter, and
 // auto_regression_shift to right shift the filter sums by.
+// The pointer arguments do not alias one another.
 using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
                                           const void* luma_grain_buffer,
                                           int subsampling_x, int subsampling_y,
@@ -704,6 +735,7 @@ using ChromaAutoRegressionFuncs =
 // Because this function treats all planes identically and independently, it is
 // simplified to take one grain buffer at a time. This means duplicating some
 // random number generations, but that work can be reduced in other ways.
+// The pointer arguments do not alias one another.
 using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
                                            int grain_seed, int width,
                                            int height, int subsampling_x,
@@ -720,6 +752,7 @@ using ConstructNoiseStripesFuncs =
 // Array2D containing the allocated plane for this frame. Because this function
 // treats all planes identically and independently, it is simplified to take one
 // grain buffer at a time.
+// The pointer arguments do not alias one another.
 using ConstructNoiseImageOverlapFunc =
     void (*)(const void* noise_stripes_buffer, int width, int height,
              int subsampling_x, int subsampling_y, void* noise_image_buffer);
@@ -730,6 +763,7 @@ using ConstructNoiseImageOverlapFunc =
 // |num_points| can be between 0 and 15. When 0, the lookup table is set to
 // zero.
 // |point_value| and |point_scaling| have |num_points| valid elements.
+// The pointer arguments do not alias one another.
 using InitializeScalingLutFunc = void (*)(
     int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
     uint8_t scaling_lut[kScalingLookupTableSize]);
@@ -749,6 +783,7 @@ using InitializeScalingLutFunc = void (*)(
 // |scaling_shift| is applied as a right shift after scaling, so that scaling
 // down is possible. It is found in FilmGrainParams, but supplied directly to
 // BlendNoiseWithImageLumaFunc because it's the only member used.
+// The pointer arguments do not alias one another.
 using BlendNoiseWithImageLumaFunc =
     void (*)(const void* noise_image_ptr, int min_value, int max_value,
              int scaling_shift, int width, int height, int start_height,
@@ -797,13 +832,14 @@ using MotionFieldProjectionKernelFunc = void (*)(
 
 // Compound temporal motion vector projection function signature.
 // Section 7.9.3 and 7.10.2.10.
-// |temporal_mvs| is the set of temporal reference motion vectors.
+// |temporal_mvs| is the aligned set of temporal reference motion vectors.
 // |temporal_reference_offsets| specifies the number of frames covered by the
 // original motion vector.
 // |reference_offsets| specifies the number of frames to be covered by the
 // projected motion vector.
 // |count| is the number of the temporal motion vectors.
-// |candidate_mvs| is the set of projected motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
 using MvProjectionCompoundFunc = void (*)(
     const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
     const int reference_offsets[2], int count,
@@ -811,13 +847,14 @@ using MvProjectionCompoundFunc = void (*)(
 
 // Single temporal motion vector projection function signature.
 // Section 7.9.3 and 7.10.2.10.
-// |temporal_mvs| is the set of temporal reference motion vectors.
+// |temporal_mvs| is the aligned set of temporal reference motion vectors.
 // |temporal_reference_offsets| specifies the number of frames covered by the
 // original motion vector.
 // |reference_offset| specifies the number of frames to be covered by the
 // projected motion vector.
 // |count| is the number of the temporal motion vectors.
-// |candidate_mvs| is the set of projected motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
 using MvProjectionSingleFunc = void (*)(
     const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
     int reference_offset, int count, MotionVector* candidate_mvs);
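
The new threshold table pins the function arguments to their spec names, and the scaling note is easy to check: each threshold is shifted left by bitdepth - 8 bits. A worked check of the documented ranges (my own sketch, not part of the patch):

    // blimitBd = blimit << (bitdepth - 8), and likewise for limit/thresh.
    constexpr int ScaleThreshold(int threshold, int bitdepth) {
      return threshold << (bitdepth - 8);
    }
    static_assert(ScaleThreshold(193, 10) == 772, "max blimitBd at 10bpp");
    static_assert(ScaleThreshold(63, 12) == 1008, "max limitBd at 12bpp");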
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
index 41d1dd0..0905fd4 100644
--- a/src/dsp/film_grain.cc
+++ b/src/dsp/film_grain.cc
@@ -153,12 +153,11 @@ void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
 
 template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
           bool use_luma>
-void ApplyAutoRegressiveFilterToChromaGrains_C(const FilmGrainParams& params,
-                                               const void* luma_grain_buffer,
-                                               int subsampling_x,
-                                               int subsampling_y,
-                                               void* u_grain_buffer,
-                                               void* v_grain_buffer) {
+void ApplyAutoRegressiveFilterToChromaGrains_C(
+    const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+    void* LIBGAV1_RESTRICT v_grain_buffer) {
   static_assert(
       auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
       "Unsupported autoregression lag for chroma.");
@@ -227,9 +226,10 @@ void ApplyAutoRegressiveFilterToChromaGrains_C(const FilmGrainParams& params,
 
 // This implementation is for the condition overlap_flag == false.
 template <int bitdepth, typename GrainType>
-void ConstructNoiseStripes_C(const void* grain_buffer, int grain_seed,
-                             int width, int height, int subsampling_x,
-                             int subsampling_y, void* noise_stripes_buffer) {
+void ConstructNoiseStripes_C(const void* LIBGAV1_RESTRICT grain_buffer,
+                             int grain_seed, int width, int height,
+                             int subsampling_x, int subsampling_y,
+                             void* LIBGAV1_RESTRICT noise_stripes_buffer) {
   auto* noise_stripes =
       static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
   const auto* grain = static_cast<const GrainType*>(grain_buffer);
@@ -291,10 +291,10 @@ void ConstructNoiseStripes_C(const void* grain_buffer, int grain_seed,
 
 // This implementation is for the condition overlap_flag == true.
 template <int bitdepth, typename GrainType>
-void ConstructNoiseStripesWithOverlap_C(const void* grain_buffer,
-                                        int grain_seed, int width, int height,
-                                        int subsampling_x, int subsampling_y,
-                                        void* noise_stripes_buffer) {
+void ConstructNoiseStripesWithOverlap_C(
+    const void* LIBGAV1_RESTRICT grain_buffer, int grain_seed, int width,
+    int height, int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_stripes_buffer) {
   auto* noise_stripes =
       static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
   const auto* grain = static_cast<const GrainType*>(grain_buffer);
@@ -417,10 +417,11 @@ void ConstructNoiseStripesWithOverlap_C(const void* grain_buffer,
 }
 
 template <int bitdepth, typename GrainType>
-inline void WriteOverlapLine_C(const GrainType* noise_stripe_row,
-                               const GrainType* noise_stripe_row_prev,
-                               int plane_width, int grain_coeff, int old_coeff,
-                               GrainType* noise_image_row) {
+inline void WriteOverlapLine_C(
+    const GrainType* LIBGAV1_RESTRICT noise_stripe_row,
+    const GrainType* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    int grain_coeff, int old_coeff,
+    GrainType* LIBGAV1_RESTRICT noise_image_row) {
   int x = 0;
   do {
     int grain = noise_stripe_row[x];
@@ -433,9 +434,10 @@ inline void WriteOverlapLine_C(const GrainType* noise_stripe_row,
 }
 
 template <int bitdepth, typename GrainType>
-void ConstructNoiseImageOverlap_C(const void* noise_stripes_buffer, int width,
-                                  int height, int subsampling_x,
-                                  int subsampling_y, void* noise_image_buffer) {
+void ConstructNoiseImageOverlap_C(
+    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+    int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_image_buffer) {
   const auto* noise_stripes =
       static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
   auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
@@ -496,11 +498,11 @@ void ConstructNoiseImageOverlap_C(const void* noise_stripes_buffer, int width,
 
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageLuma_C(
-    const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
-    int width, int height, int start_height,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
     const uint8_t scaling_lut_y[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
-    ptrdiff_t dest_stride_y) {
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    void* LIBGAV1_RESTRICT dest_plane_y, ptrdiff_t dest_stride_y) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
   const auto* in_y = static_cast<const Pixel*>(source_plane_y);
@@ -524,13 +526,13 @@ void BlendNoiseWithImageLuma_C(
 // This function is for the case params_.chroma_scaling_from_luma == false.
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageChroma_C(
-    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
-    int min_value, int max_chroma, int width, int height, int start_height,
-    int subsampling_x, int subsampling_y,
-    const uint8_t scaling_lut_uv[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y,
-    const void* source_plane_uv, ptrdiff_t source_stride_uv,
-    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const uint8_t scaling_lut_uv[kScalingLookupTableSize],
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* LIBGAV1_RESTRICT source_plane_uv, ptrdiff_t source_stride_uv,
+    void* LIBGAV1_RESTRICT dest_plane_uv, ptrdiff_t dest_stride_uv) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
 
@@ -586,13 +588,13 @@ void BlendNoiseWithImageChroma_C(
 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageChromaWithCfl_C(
-    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
-    int min_value, int max_chroma, int width, int height, int start_height,
-    int subsampling_x, int subsampling_y,
-    const uint8_t scaling_lut[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y,
-    const void* source_plane_uv, ptrdiff_t source_stride_uv,
-    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* LIBGAV1_RESTRICT source_plane_uv, ptrdiff_t source_stride_uv,
+    void* LIBGAV1_RESTRICT dest_plane_uv, ptrdiff_t dest_stride_uv) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
   const auto* in_y = static_cast<const Pixel*>(source_plane_y);
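
Like the rest of the patch, these film grain hunks annotate pointer parameters with LIBGAV1_RESTRICT to encode the "do not alias" contracts now documented in dsp.h, which lets compilers keep values in registers instead of reloading around stores. A sketch of how such a macro is commonly defined (an assumption; the patch does not show libgav1's actual definition):

    // Assumed shape only; the real definition may differ.
    #if defined(_MSC_VER)
    #define LIBGAV1_RESTRICT __restrict
    #elif defined(__GNUC__) || defined(__clang__)
    #define LIBGAV1_RESTRICT __restrict__
    #else
    #define LIBGAV1_RESTRICT
    #endif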
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
index 4520c2c..75af279 100644
--- a/src/dsp/intrapred.cc
+++ b/src/dsp/intrapred.cc
@@ -63,8 +63,8 @@ struct IntraPredBppFuncs_C {
 
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* /*left_column*/) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
   int sum = block_width >> 1;  // rounder
   const auto* const top = static_cast<const Pixel*>(top_row);
   for (int x = 0; x < block_width; ++x) sum += top[x];
@@ -80,8 +80,8 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
 
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
-    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
   int sum = block_height >> 1;  // rounder
   const auto* const left = static_cast<const Pixel*>(left_column);
   for (int y = 0; y < block_height; ++y) sum += left[y];
@@ -132,8 +132,9 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
 
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const int divisor = block_width + block_height;
   int sum = divisor >> 1;  // rounder
 
@@ -158,8 +159,8 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
 // IntraPredFuncs_C::Vertical -- apply top row vertically
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* /*left_column*/) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
   auto* dst = static_cast<uint8_t*>(dest);
   for (int y = 0; y < block_height; ++y) {
     memcpy(dst, top_row, block_width * sizeof(Pixel));
@@ -170,8 +171,8 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
 // IntraPredFuncs_C::Horizontal -- apply left column horizontally
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
-    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const Pixel*>(left_column);
   auto* dst = static_cast<Pixel*>(dest);
   stride /= sizeof(Pixel);
@@ -184,8 +185,9 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
 // IntraPredFuncs_C::Paeth
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const Pixel*>(top_row);
   const auto* const left = static_cast<const Pixel*>(left_column);
   const Pixel top_left = top[-1];
diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc
index 948c0c0..0f7f4f2 100644
--- a/src/dsp/intrapred_cfl.cc
+++ b/src/dsp/intrapred_cfl.cc
@@ -41,7 +41,7 @@ constexpr TransformSize kTransformSizesLargerThan32x32[] = {
 // |alpha| can be -16 to 16 (inclusive).
 template <int block_width, int block_height, int bitdepth, typename Pixel>
 void CflIntraPredictor_C(
-    void* const dest, ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<Pixel*>(dest);
@@ -66,7 +66,8 @@ template <int block_width, int block_height, int bitdepth, typename Pixel,
           int subsampling_x, int subsampling_y>
 void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
                      const int max_luma_width, const int max_luma_height,
-                     const void* const source, ptrdiff_t stride) {
+                     const void* LIBGAV1_RESTRICT const source,
+                     ptrdiff_t stride) {
   assert(max_luma_width >= 4);
   assert(max_luma_height >= 4);
   const auto* src = static_cast<const Pixel*>(source);
diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc
index e670769..21a40b5 100644
--- a/src/dsp/intrapred_directional.cc
+++ b/src/dsp/intrapred_directional.cc
@@ -33,11 +33,10 @@ namespace {
 // 7.11.2.4. Directional intra prediction process
 
 template <typename Pixel>
-void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
-                                      const void* const top_row,
-                                      const int width, const int height,
-                                      const int xstep,
-                                      const bool upsampled_top) {
+void DirectionalIntraPredictorZone1_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
   const auto* const top = static_cast<const Pixel*>(top_row);
   auto* dst = static_cast<Pixel*>(dest);
   stride /= sizeof(Pixel);
@@ -96,13 +95,12 @@ void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
 }
 
 template <typename Pixel>
-void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
-                                      const void* const top_row,
-                                      const void* const left_column,
-                                      const int width, const int height,
-                                      const int xstep, const int ystep,
-                                      const bool upsampled_top,
-                                      const bool upsampled_left) {
+void DirectionalIntraPredictorZone2_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
   const auto* const top = static_cast<const Pixel*>(top_row);
   const auto* const left = static_cast<const Pixel*>(left_column);
   auto* dst = static_cast<Pixel*>(dest);
@@ -146,11 +144,10 @@ void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
 }
 
 template <typename Pixel>
-void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
-                                      const void* const left_column,
-                                      const int width, const int height,
-                                      const int ystep,
-                                      const bool upsampled_left) {
+void DirectionalIntraPredictorZone3_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
   const auto* const left = static_cast<const Pixel*>(left_column);
   stride /= sizeof(Pixel);
 
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
index ebf9da0..40668c2 100644
--- a/src/dsp/intrapred_directional_test.cc
+++ b/src/dsp/intrapred_directional_test.cc
@@ -702,7 +702,11 @@ const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
 }
 
 TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
-  const auto num_runs = static_cast<int>(5e7 / (block_width_ * block_height_));
+#if LIBGAV1_ENABLE_NEON
+  const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+#else
+  const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
   for (int i = kZone1; i < kNumZones; ++i) {
     TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
               static_cast<Zone>(i), num_runs);
@@ -867,7 +871,11 @@ const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
 }
 
 TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) {
-  const auto num_runs = static_cast<int>(5e7 / (block_width_ * block_height_));
+#if LIBGAV1_ENABLE_NEON
+  const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+#else
+  const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
   for (int i = kZone1; i < kNumZones; ++i) {
     TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
               static_cast<Zone>(i), num_runs);
@@ -882,6 +890,7 @@ TEST_P(DirectionalIntraPredTest10bpp, FixedInput) {
 }
 
 TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest10bpp, Random) { TestRandomValues(); }
 
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
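
The speed tests size their run counts inversely to block area, and this patch halves the budget further on NEON targets so wall time stays comparable across architectures; the same LIBGAV1_ENABLE_NEON split recurs in the loop filter and loop restoration tests below. As a worked example with the new constants, a 4x4 block gets:

    NEON:  2e7 / (4 * 4) = 1,250,000 runs
    other: 4e7 / (4 * 4) = 2,500,000 runs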
 
diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc
index f4bd296..9a45eff 100644
--- a/src/dsp/intrapred_filter.cc
+++ b/src/dsp/intrapred_filter.cc
@@ -40,9 +40,9 @@ namespace {
 // adjacent to the |top_row| or |left_column|. The set of 8 filters is selected
 // according to |pred|.
 template <int bitdepth, typename Pixel>
-void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
-                            const void* const top_row,
-                            const void* const left_column,
+void FilterIntraPredictor_C(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column,
                             const FilterIntraPredictor pred, const int width,
                             const int height) {
   const int kMaxPixel = (1 << bitdepth) - 1;
diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc
index 83c005e..0c7f272 100644
--- a/src/dsp/intrapred_smooth.cc
+++ b/src/dsp/intrapred_smooth.cc
@@ -42,26 +42,15 @@ struct SmoothFuncs_C {
 };
 
 constexpr uint8_t kSmoothWeights[] = {
-    // block dimension = 4
-    255, 149, 85, 64,
-    // block dimension = 8
-    255, 197, 146, 105, 73, 50, 37, 32,
-    // block dimension = 16
-    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-    // block dimension = 32
-    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-    // block dimension = 64
-    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
-    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
-    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+#include "src/dsp/smooth_weights.inc"
+};
 
 // SmoothFuncs_C::Smooth
 template <int block_width, int block_height, typename Pixel>
 void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const Pixel*>(top_row);
   const auto* const left = static_cast<const Pixel*>(left_column);
   const Pixel top_right = top[block_width - 1];
@@ -94,8 +83,9 @@ void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
 // SmoothFuncs_C::SmoothVertical
 template <int block_width, int block_height, typename Pixel>
 void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const Pixel*>(top_row);
   const auto* const left = static_cast<const Pixel*>(left_column);
   const Pixel bottom_left = left[block_height - 1];
@@ -121,8 +111,9 @@ void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
 // SmoothFuncs_C::SmoothHorizontal
 template <int block_width, int block_height, typename Pixel>
 void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const Pixel*>(top_row);
   const auto* const left = static_cast<const Pixel*>(left_column);
   const Pixel top_right = top[block_width - 1];
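
Moving the weights into smooth_weights.inc (also added to the CMake source list later in this patch) gives the C and SIMD implementations one shared copy of the table instead of duplicated literals. The include pattern used above:

    // The .inc file holds only the comma-separated weight values; each
    // consumer materializes the table in its own translation unit.
    constexpr uint8_t kSmoothWeights[] = {
    #include "src/dsp/smooth_weights.inc"
    };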
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
index ed984d8..5e410e7 100644
--- a/src/dsp/inverse_transform.cc
+++ b/src/dsp/inverse_transform.cc
@@ -532,8 +532,8 @@ void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
 }
 
 template <typename Residual>
-void AdstInputPermutation(int32_t* const dst, const Residual* const src,
-                          int n) {
+void AdstInputPermutation(int32_t* LIBGAV1_RESTRICT const dst,
+                          const Residual* LIBGAV1_RESTRICT const src, int n) {
   assert(n == 8 || n == 16);
   for (int i = 0; i < n; ++i) {
     dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
@@ -544,8 +544,8 @@ constexpr int8_t kAdstOutputPermutationLookup[16] = {
     0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1};
 
 template <typename Residual>
-void AdstOutputPermutation(Residual* const dst, const int32_t* const src,
-                           int n) {
+void AdstOutputPermutation(Residual* LIBGAV1_RESTRICT const dst,
+                           const int32_t* LIBGAV1_RESTRICT const src, int n) {
   assert(n == 8 || n == 16);
   const auto shift = static_cast<int8_t>(n == 8);
   for (int i = 0; i < n; ++i) {
@@ -1106,8 +1106,9 @@ template <int bitdepth, typename Residual, typename Pixel,
           InverseTransformDcOnlyFunc dconly_transform1d,
           InverseTransform1DFunc transform1d_func, bool is_row>
 void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
-                     int adjusted_tx_height, void* src_buffer, int start_x,
-                     int start_y, void* dst_frame) {
+                     int adjusted_tx_height, void* LIBGAV1_RESTRICT src_buffer,
+                     int start_x, int start_y,
+                     void* LIBGAV1_RESTRICT dst_frame) {
   constexpr bool lossless = transform1d_type == k1DTransformWht;
   constexpr bool is_identity = transform1d_type == k1DTransformIdentity;
   // The transform size of the WHT is always 4x4. Setting tx_width and
diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc
index 623e203..cc64ff9 100644
--- a/src/dsp/inverse_transform_test.cc
+++ b/src/dsp/inverse_transform_test.cc
@@ -281,10 +281,14 @@ void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues(
     int num_tests) {
   libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
 
-  for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
-    const TransformType tx_type = kLibgav1TxType[tx_type_idx];
-    const Transform1D row_transform = kRowTransform[tx_type];
-    const Transform1D column_transform = kColumnTransform[tx_type];
+  for (int tx_type_idx = -1; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+    const TransformType tx_type = (tx_type_idx == -1)
+                                      ? kTransformTypeDctDct
+                                      : kLibgav1TxType[tx_type_idx];
+    const Transform1D row_transform =
+        (tx_type_idx == -1) ? k1DTransformWht : kRowTransform[tx_type];
+    const Transform1D column_transform =
+        (tx_type_idx == -1) ? k1DTransformWht : kColumnTransform[tx_type];
 
     // Skip the 'C' test case as this is used as the reference.
     if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
@@ -349,7 +353,7 @@ void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues(
                       << ToString(
                              static_cast<TransformSize1D>(tx_size_1d_column_))
                       << " differs from reference in iteration #" << n
-                      << "tx_type_idx:" << tx_type_idx;
+                      << " tx_type_idx:" << tx_type_idx;
         break;
       }
     }
@@ -360,19 +364,22 @@ void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues(
       const auto cur_row_elapsed_time_us =
           static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
       printf("TxType %30s[%19s]:: base_row: %5d us  cur_row: %5d us  %2.2fx \n",
-             ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_],
-             base_row_elapsed_time_us, cur_row_elapsed_time_us,
+             (tx_type_idx == -1) ? ToString(row_transform) : ToString(tx_type),
+             kTransformSize1DNames[tx_size_1d_row_], base_row_elapsed_time_us,
+             cur_row_elapsed_time_us,
              static_cast<float>(base_row_elapsed_time_us) /
                  static_cast<float>(cur_row_elapsed_time_us));
       const auto base_column_elapsed_time_us = static_cast<int>(
           absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
       const auto cur_column_elapsed_time_us = static_cast<int>(
           absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
-      printf("TxType %30s[%19s]:: base_col: %5d us  cur_col: %5d us  %2.2fx \n",
-             ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_],
-             base_column_elapsed_time_us, cur_column_elapsed_time_us,
-             static_cast<float>(base_column_elapsed_time_us) /
-                 static_cast<float>(cur_column_elapsed_time_us));
+      printf(
+          "TxType %30s[%19s]:: base_col: %5d us  cur_col: %5d us  %2.2fx \n",
+          (tx_type_idx == -1) ? ToString(column_transform) : ToString(tx_type),
+          kTransformSize1DNames[tx_size_1d_column_],
+          base_column_elapsed_time_us, cur_column_elapsed_time_us,
+          static_cast<float>(base_column_elapsed_time_us) /
+              static_cast<float>(cur_column_elapsed_time_us));
     }
   }
 }
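
Starting the loop at -1 adds a synthetic case for the 4x4 Walsh-Hadamard (lossless) path, which has no entry in kLibgav1TxType; the sentinel maps both 1D transforms to k1DTransformWht and reuses kTransformTypeDctDct as a placeholder type. Condensed, the pattern is:

    // -1 is a sentinel for the WHT-only case; other indices go through the
    // usual per-transform-type lookup tables.
    const bool is_wht = (tx_type_idx == -1);
    const Transform1D row_transform =
        is_wht ? k1DTransformWht : kRowTransform[tx_type];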
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
index a28334d..4bd1443 100644
--- a/src/dsp/libgav1_dsp.cmake
+++ b/src/dsp/libgav1_dsp.cmake
@@ -66,6 +66,7 @@ list(APPEND libgav1_dsp_sources
             "${libgav1_source}/dsp/obmc.cc"
             "${libgav1_source}/dsp/obmc.h"
             "${libgav1_source}/dsp/obmc.inc"
+            "${libgav1_source}/dsp/smooth_weights.inc"
             "${libgav1_source}/dsp/super_res.cc"
             "${libgav1_source}/dsp/super_res.h"
             "${libgav1_source}/dsp/warp.cc"
@@ -90,6 +91,7 @@ list(APPEND libgav1_dsp_sources_neon
             "${libgav1_source}/dsp/arm/cdef_neon.cc"
             "${libgav1_source}/dsp/arm/cdef_neon.h"
             "${libgav1_source}/dsp/arm/common_neon.h"
+            "${libgav1_source}/dsp/arm/convolve_10bit_neon.cc"
             "${libgav1_source}/dsp/arm/convolve_neon.cc"
             "${libgav1_source}/dsp/arm/convolve_neon.h"
             "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc"
@@ -113,6 +115,7 @@ list(APPEND libgav1_dsp_sources_neon
             "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
             "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
             "${libgav1_source}/dsp/arm/loop_filter_neon.h"
+            "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc"
             "${libgav1_source}/dsp/arm/loop_restoration_neon.cc"
             "${libgav1_source}/dsp/arm/loop_restoration_neon.h"
             "${libgav1_source}/dsp/arm/mask_blend_neon.cc"
diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc
index 6cad97d..14d47bf 100644
--- a/src/dsp/loop_filter.cc
+++ b/src/dsp/loop_filter.cc
@@ -56,6 +56,9 @@ struct LoopFilterFuncs_C {
 
 inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
                              int* const inner_thresh, int* const hev_thresh) {
+  assert(*outer_thresh >= 7 && *outer_thresh <= 3 * kMaxLoopFilterValue + 4);
+  assert(*inner_thresh >= 1 && *inner_thresh <= kMaxLoopFilterValue);
+  assert(*hev_thresh >= 0 && *hev_thresh <= 3);
   *outer_thresh <<= bitdepth - 8;
   *inner_thresh <<= bitdepth - 8;
   *hev_thresh <<= bitdepth - 8;
diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc
index ca5107a..d013a1b 100644
--- a/src/dsp/loop_filter_test.cc
+++ b/src/dsp/loop_filter_test.cc
@@ -46,8 +46,6 @@ constexpr int kBlockStride = 32;
 constexpr int kNumTests = 50000;
 constexpr int kNumSpeedTests = 500000;
 
-constexpr int kMaxLoopFilter = 63;
-
 template <typename Pixel>
 void InitInput(Pixel* dst, const int stride, const int bitdepth,
                libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh,
@@ -172,11 +170,12 @@ void LoopFilterTest<bitdepth, Pixel>::TestRandomValues(
     absl::Duration elapsed_time;
     for (int n = 0; n < num_runs; ++n) {
       Pixel dst[kNumPixels];
-      const auto outer_thresh =
-          static_cast<uint8_t>(rnd(3 * kMaxLoopFilter + 5));
-      const auto inner_thresh = static_cast<uint8_t>(rnd(kMaxLoopFilter + 1));
+      const auto outer_thresh = static_cast<uint8_t>(
+          rnd(3 * kMaxLoopFilterValue - 2) + 7);  // [7, 193].
+      const auto inner_thresh =
+          static_cast<uint8_t>(rnd(kMaxLoopFilterValue) + 1);  // [1, 63].
       const auto hev_thresh =
-          static_cast<uint8_t>(rnd(kMaxLoopFilter + 1) >> 4);
+          static_cast<uint8_t>(rnd(kMaxLoopFilterValue + 1) >> 4);  // [0, 3].
       InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0);
 
       const absl::Time start = absl::Now();
@@ -228,20 +227,20 @@ using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>;
 
 const char* const* GetDigests8bpp(LoopFilterSize size) {
   static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
-      "2e07bdb04b363d4ce69c7d738b1ee01a",
-      "7ff41f2ffa809a2016d342d92afa7f89",
+      "6ba725d697d6209cb36dd199b8ffb47a",
+      "7dbb20e456ed0501fb4e7954f49f5e18",
   };
   static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
-      "2cd4d9ee7497ed67e38fad9cbeb7e278",
-      "75c57a30a927d1aca1ac5c4f175712ca",
+      "89bb757faa44298b7f6e9c1a67f455a5",
+      "be75d5a2fcd83709ff0845f7d83f7006",
   };
   static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
-      "854860a272d58ace223454ea727a6fe4",
-      "4129ee49b047777583c0e9b2006c87bf",
+      "b09137d68c7b4f8a8a15e33b4b69828f",
+      "ef8a7f1aa073805516d3518a82a5cfa4",
   };
   static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
-      "6eb768620b7ccc84b6f88b9193b02ad2",
-      "56e034d9edbe0d5a3cae69b2d9b3486e",
+      "6a7bc061ace0888275af88093f82ca08",
+      "a957ddae005839aa41ba7691788b01e4",
   };
 
   switch (size) {
@@ -290,20 +289,20 @@ using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>;
 
 const char* const* GetDigests10bpp(LoopFilterSize size) {
   static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
-      "657dd0f612734c9c1fb50a2313567af4",
-      "b1c0a0a0b35bad1589badf3c291c0461",
+      "72e75c478bb130ff1ebfa75f3a70b1a2",
+      "f32d67b611080e0bf1a9d162ff47c133",
   };
   static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
-      "d41906d4830157052d5bde417d9df9fc",
-      "451490def78bd649d16d64db4e665a62",
+      "8aec73c60c87ac7cc6bc9cc5157a2795",
+      "0e4385d3a0cbb2b1551e05ad2b0f07fb",
   };
   static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
-      "a763127680f31db7184f2a63ee140268",
-      "1f413bebacaa2435f0e07963a9095243",
+      "85cb2928fae43e1a27b2fe1b78ba7534",
+      "d044fad9d7c64b93ecb60c88ac48e55f",
   };
   static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
-      "f0e61add3e5856657c4055751a6dd6e2",
-      "44da25d613ea601bf5f6e2a42d329cf0",
+      "ebca95ec0db6efbac7ff7cbeabc0e6d0",
+      "754ffaf0ac26a5953a029653bb5dd275",
   };
 
   switch (size) {
@@ -336,6 +335,10 @@ INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp,
 INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp,
                          testing::ValuesIn(kLoopFilterSizes));
 #endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
 #endif
 
 }  // namespace
diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc
index 97a05d4..4c54bc6 100644
--- a/src/dsp/loop_restoration_test.cc
+++ b/src/dsp/loop_restoration_test.cc
@@ -83,6 +83,9 @@ class SelfGuidedFilterTest : public testing::TestWithParam<int>,
       }
     } else if (absl::StartsWith(test_case, "NEON/")) {
       LoopRestorationInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_NEON();
+#endif
     } else {
       FAIL() << "Unrecognized architecture prefix in test case name: "
              << test_case;
@@ -228,7 +231,11 @@ void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
   if (target_self_guided_filter_func_ == nullptr) return;
   constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
   const int num_inputs = speed ? 1 : 5;
-  const int num_tests = speed ? 20000 : 1;
+#if LIBGAV1_ENABLE_NEON
+  const int num_tests = speed ? 4000 : 1;
+#else
+  const int num_tests = speed ? 10000 : 1;
+#endif
   libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
   const Pixel* const src = src_ + kOffset;
   Pixel* const dst = dst_ + kOffset;
@@ -310,6 +317,10 @@ INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp,
 INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp,
                          testing::ValuesIn(kUnitWidths));
 #endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
 
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
@@ -348,6 +359,9 @@ class WienerFilterTest : public testing::TestWithParam<int>,
       }
     } else if (absl::StartsWith(test_case, "NEON/")) {
       LoopRestorationInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_NEON();
+#endif
     } else {
       FAIL() << "Unrecognized architecture prefix in test case name: "
              << test_case;
@@ -477,7 +491,11 @@ void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
         "3c91bf1a34672cd40bf261c5820d3ec3"}}};
   if (target_wiener_filter_func_ == nullptr) return;
   constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
-  const int num_tests = speed ? 100000 : 1;
+#if LIBGAV1_ENABLE_NEON
+  const int num_tests = speed ? 5000 : 1;
+#else
+  const int num_tests = speed ? 10000 : 1;
+#endif
   const Pixel* const src = src_ + kOffset;
   Pixel* const dst = dst_ + kOffset;
   for (const auto vertical_order : kWienerOrders) {
@@ -545,7 +563,7 @@ void WienerFilterTest<bitdepth, Pixel>::TestCompare2C() {
                                    kStride, unit_width_, unit_height_,
                                    &restoration_buffer_, tmp);
         if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_,
-                                       kStride, kStride, false, false)) {
+                                       kStride, kStride, false, true)) {
           ADD_FAILURE() << "Mismatch -- wiener taps min/max";
         }
       }
@@ -608,6 +626,10 @@ INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp,
 INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp,
                          testing::ValuesIn(kUnitWidths));
 #endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest10bpp,
+                         testing::ValuesIn(kUnitWidths));
+#endif
 
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
index 15ef821..207fde0 100644
--- a/src/dsp/mask_blend.cc
+++ b/src/dsp/mask_blend.cc
@@ -25,7 +25,8 @@ namespace libgav1 {
 namespace dsp {
 namespace {
 
-uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x,
+uint8_t GetMaskValue(const uint8_t* LIBGAV1_RESTRICT mask,
+                     const uint8_t* LIBGAV1_RESTRICT mask_next_row, int x,
                      int subsampling_x, int subsampling_y) {
   if ((subsampling_x | subsampling_y) == 0) {
     return mask[x];
@@ -43,10 +44,12 @@ uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x,
 
 template <int bitdepth, typename Pixel, bool is_inter_intra, int subsampling_x,
           int subsampling_y>
-void MaskBlend_C(const void* prediction_0, const void* prediction_1,
-                 const ptrdiff_t prediction_stride_1, const uint8_t* mask,
+void MaskBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                 const void* LIBGAV1_RESTRICT prediction_1,
+                 const ptrdiff_t prediction_stride_1,
+                 const uint8_t* LIBGAV1_RESTRICT mask,
                  const ptrdiff_t mask_stride, const int width, const int height,
-                 void* dest, const ptrdiff_t dest_stride) {
+                 void* LIBGAV1_RESTRICT dest, const ptrdiff_t dest_stride) {
   static_assert(!(bitdepth == 8 && is_inter_intra), "");
   assert(mask != nullptr);
   using PredType =
@@ -85,11 +88,12 @@ void MaskBlend_C(const void* prediction_0, const void* prediction_1,
 }
 
 template <int subsampling_x, int subsampling_y>
-void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0,
-                               uint8_t* prediction_1,
+void InterIntraMaskBlend8bpp_C(const uint8_t* LIBGAV1_RESTRICT prediction_0,
+                               uint8_t* LIBGAV1_RESTRICT prediction_1,
                                const ptrdiff_t prediction_stride_1,
-                               const uint8_t* mask, const ptrdiff_t mask_stride,
-                               const int width, const int height) {
+                               const uint8_t* LIBGAV1_RESTRICT mask,
+                               const ptrdiff_t mask_stride, const int width,
+                               const int height) {
   assert(mask != nullptr);
   constexpr int step_y = subsampling_y ? 2 : 1;
   const uint8_t* mask_next_row = mask + mask_stride;
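
GetMaskValue collapses the mask to the prediction grid under chroma subsampling; with 4:2:0 this is a rounded average of a 2x2 neighborhood. A hedged sketch of that case as a standalone helper (illustrative, consistent with the parameters above but not the patch's code):

    // 4:2:0: average four mask samples, rounding to nearest.
    inline uint8_t MaskValue420(const uint8_t* LIBGAV1_RESTRICT mask,
                                const uint8_t* LIBGAV1_RESTRICT mask_next_row,
                                int x) {
      const int x2 = x << 1;
      return static_cast<uint8_t>(
          (mask[x2] + mask[x2 + 1] + mask_next_row[x2] +
           mask_next_row[x2 + 1] + 2) >>
          2);
    }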
diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc
index b5e7e60..6d30f48 100644
--- a/src/dsp/mask_blend_test.cc
+++ b/src/dsp/mask_blend_test.cc
@@ -22,7 +22,6 @@
 #include <type_traits>
 
 #include "absl/strings/match.h"
-#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
@@ -57,46 +56,52 @@ constexpr int kCompoundPredictionRange[3][2] = {
 const char* GetDigest8bpp(int id) {
   static const char* const kDigest[] = {
       "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58",
-      "c490478208374a43765900ef7115c264", "b98f222eb70ef8589da2d6c839ca22b8",
-      "54752ca05f67b5af571bc311aa4e3de3", "344b2dab7accd8bd0a255bee16207336",
-      "0b2f6f755d1547eea7e0172f8133ea01", "310dc6364fdacba186c01f0e8ac4fcb7",
+      "2cd162cebf99724a3fc22d501bd8c8e4", "c490478208374a43765900ef7115c264",
+      "b98f222eb70ef8589da2d6c839ca22b8", "54752ca05f67b5af571bc311aa4e3de3",
+      "5ae48814dd285bfca4f5ee8e339dca99", "383f3f4f47563f065d1b6068e5931a24",
+      "344b2dab7accd8bd0a255bee16207336", "0b2f6f755d1547eea7e0172f8133ea01",
+      "310dc6364fdacba186c01f0e8ac4fcb7", "c2ee4673078d34971319c77ca77b23d1",
       "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827",
       "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7",
-      "4346c2860b23f0072b6b288f14c1df36", "8f8dd3eeed74ef115ca8a2f82ebff0ba",
-      "42e8872a81647767636f4c75609e0e2f", "1ff2526547d59557f7bb458249e34527",
-      "cd303d685268aebd2919dd468928d0ba", "254fb3ad990f9d408d252c70dd682e27",
-      "ba8d99c62853d14855f5d93e9574c97b", "e8ab744348681d6aa1043080efa86fc9",
-      "2fa919ca1f54b4336de878ff4015c352", "18e47c9809b909c2bfad08e00dffc635",
-      "9a90c843f06f0b662c509c26f5dd5054", "f89c608f884f37b064fc2b49eb2690a9",
-      "2448734d948ca6ddeb0ce8038a4ab2cf", "a3e0f86b7a5cb49716a424709c00b5a4",
-      "eb84dba768b54da10cded2f932f0aab7", "d6e8fdeb6875b70488f25d7f7ed9423f",
-      "1ca0822febce19c02ddc42a7b3331257", "a9259bb9b87ad002619eb47b907d7226",
-      "6408c5f327f1a9a390fb0046d4bc112b", "dba612489f87d00a82f2735fbcb98dcc",
-      "e8626a97699fbd247d6358ad5f766bee", "5e638a6897d7a2950f3512f871fa19e6",
-      "45a58708939779413f8e0e1de2ee5e6f", "079ae4682d398f0a7e4b66059589586d",
-      "6a06e617308409f9181b59bdd4f63d83", "b05ade2c1a572fc5fcca92b4163d9afb",
-      "30e955c3f86111207d5922575602e90a", "af5e6c65ed48a0eb7d509f7036398728",
-      "f9da3310d7dc75910483dfdd2af6ee62", "a9423b4d67bee5e7c7bc3baa7a9c017a",
-      "6b90a04333407013dd011c1af582e79f", "e658088a74bfb7cc57a2faa74a6f8689",
-      "6eedf27126eba6915035f9f701a1b992", "89116a7c6ad3f70a5b3f3105d04ad1a8",
-      "f41e5e166b049d0006d8b2cab56523b3", "3bed57a684075bbe3c25fd0c3e5520c3",
-      "85c0b21af2afb18ce948abfe3e23c85b", "bd8aaa3602d6b42438f8449f8adb52cb",
-      "1266bad904caad2c6d4047abefc2393d", "6573f2fe2a14c9ab7d5e192742388489",
-      "6b9b443f6306059fa3fe18df9de6dc48", "c9a91ee6ae8b653f552866e4073dd097",
-      "fa58938384198f7709d4871d155ba100", "033d121fc782e83ff94c31e73407d2a8",
-      "7ea268d79f7b8c75a4feeb24e892471a", "73a376bb3e07172d1e094ab8e01a7d42",
-      "13c366e0da1663fac126ea3d3876c110", "2f5eb5fcdf953c63fee2b8c75a6e5568",
-      "2054b197f002223f2d75699884279511", "67ce53e6991657a922d77cc8a23f1e07",
-      "f48e6d666435e7a917d6f90539b0d557", "21d03669d8d255e43552f8fb90724717",
-      "43dbaa1a7aaf2a01764e78e041b6763b", "a8173347ea861ecee6da54f81df73951",
-      "6b97ec4e4647a8de026d693059b855b7", "a85bf4c4b48791ac4971339877e4bc8a",
-      "04cf84d020a60ce3ce53845255ca8ec9", "ddd87035b960499b883d0aefcf96b6b2",
-      "278c5dd102474d598bf788cd66977ba9", "78b3790785811516142d417a49177c8c",
-      "7883ea9c2df0b4f5797cba31f4352678", "727004811025ac97b04940e2eaf68f94",
-      "7ffa3f97ec13dc8b6225550133a392bc", "6f5f2cb7a44aa0daea5c6b3315110591",
-      "88a59d68875fb44ec3be9d3fa293bccb", "0516e71f76b9d998794d3d63e480fa2f",
-      "193793d42f0964b4a958a68d9d7eb4ba", "4d259c7c6a95744e4ebaaa5361befb11",
-      "c090155b997dc103203bcb5a9dcc6282",
+      "4346c2860b23f0072b6b288f14c1df36", "1cdace53543063e129a125c4084ca5d7",
+      "1ae5328e0c0f4f2bec640d1af03b2978", "3860e040fbee0c5f68f0b4af769209b3",
+      "e9480ded15d9c38ee19bf5fa816dd296", "4e17c222b64f428df29938a8120ca256",
+      "2a943bc6de9b29c8bcae189ad3bec276", "b5a6bc02c76fa61040678fb2c6c112d2",
+      "2c11bb9bd29c5577194edb77cfd1c614", "31ed1832810ae385f4ad8f57795dde1e",
+      "eb87d647839c33984dfb25bac0e7cdb3", "f652ec2b1478e35acb19cf28042ee849",
+      "0cfb18ac0cb94af1447bcac32ac20c36", "e152bbbf5ee4b40b7b41ec1f2e901aaa",
+      "f17f78fd485f7beafa8126c1cda801d7", "9f9fbee0cc9d99435efd3dff644be273",
+      "9b498843d66440c1e68dc7ab04f57d42", "2f2b0beceb31b79ccb9179991629e4b8",
+      "e06a6ebb6791529bb23fe5b0a9914220", "2b3d1ff19812a17c17b1be1f1727815e",
+      "d0bbdecec414950ed63a8a35c2bae397", "8e53906c6513058d7f17013fe0d32bf1",
+      "be0690efd31f0bf3c2adcd27ca011ed5", "c2b26243c5f147fdeadf52735aa68fb5",
+      "94bb83e774d9189c5ee04fb361855e19", "dad6441e723791a91f31a56b2136fd33",
+      "10ccac76a2debb842a0685a527b6a659", "346fb0a4914b64dda3ca0f521412b999",
+      "d7e400b855502bbb4f2b8294e207bb96", "3487503f2d73ec52f25b5e8d06c81da4",
+      "3f49c096acfcf46d44ce18b48debca7c", "8ed6a745a2b5457ac7f3ac145ce57e72",
+      "21f9dda5ef934a5ee6274b22cc22f93b", "507b60611afeb373384d9b7606f7ea46",
+      "ac766fadcdb85a47ad14a6846b9e5c36", "fde149bc2162e02bbc5fa85cc41641a5",
+      "f5f094b5742d0a920ba734b017452d24", "c90d06b0c76a0983bd1428df2a1b64b3",
+      "3649e6a6ed9f69e3f78e0b75160fb82a", "1d44b7649497e651216db50d325e3073",
+      "948fa112e90e3ca4d15f3d2f2acfab9a", "9bb54c0f7d07c0b44c44ba09379a04ff",
+      "228261ab6f098f489a8968cff1e1f7ae", "5e128db7462164f7327d1d8feeb2e4c7",
+      "9e8b97f6d9d482d5770b138bd1077747", "81563d505a4e8dd779a089abf2a28b77",
+      "b7157451de7cfa161dff1afd7f9b8622", "6a25cc0a4aaf8a315d1158dbb0ec2966",
+      "303867ee010ba51da485ee10149c6f9b", "63b64b7527d2476e9ae5139b8166e8c9",
+      "cfa93c2aeeb27a1190a445a6fee61e15", "804bcff8709665eed6830e24346101be",
+      "829947ed3e90776cda4ae82918461497", "1df10a1cb80c1a81f521e7e0f80b4f99",
+      "3c9593e42ac574f3555bb8511d438a54", "eecef71492c0626685815e646f728f79",
+      "0c43d59f456ddca2449e016ae4e34be7", "207d4ac2579f1271fc9eca8d743917b3",
+      "3c472bb0b1c891ffda19077ebb659e48", "a4ae7a0d25113bc0238fa27409f9c0dd",
+      "e8ad037ca81f46774bb01d20f46671ce", "b22741e4fe0e4062e40a2decec102ffd",
+      "c72f9e7bc0170163cb94da0faa0d3ffb", "accaf5d475d155cbd3a8c113f90718bc",
+      "2fd31e72444ea258380c16881580de81", "8a6a2a253f6f5b0ff75ba39488e6b082",
+      "c5e8159c0f3ebb7536e84ab3dadac1b3", "ef7ec20b46c7dcf16591835642bd68ef",
+      "0c3425399dc64870d726c2837666a55e", "0365029ffbfc4cedf3bf2d757ea5b9df",
+      "836aa403254af2e04d4b7a7c4db8bfc5", "7f2f3f9c91677b233795169f9a88b2b2",
+      "9fc8bbe787244dac638c367b9c611d13", "f66ef45fae8e163ab0f0f393531dad26",
+      "beb984e88b6f9b96ae6efe5da23ad16b", "1083b829ea766b1d4eb0bb96e9fb3bff",
+      "be8abad1da69e4d238a45fc02a0061cf",
   };
   return kDigest[id];
 }
@@ -105,61 +110,68 @@ const char* GetDigest8bpp(int id) {
 const char* GetDigest10bpp(int id) {
   static const char* const kDigest[] = {
       "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121",
-      "53a3a76bf2bcd5761cd15fc739a4f4e1", "7597f69dc19a584280be0d67911db6a6",
-      "e1221c172843dc6c1b345bcd370771cc", "2ccbe012ca167114b14c3ba70befa960",
-      "0f68632d7e5faddb4554ca430d1df822", "8caa0061a26e142b783951d5abd7bf5d",
+      "e767350f150a84ac5a06dc348e815d62", "53a3a76bf2bcd5761cd15fc739a4f4e1",
+      "7597f69dc19a584280be0d67911db6a6", "e1221c172843dc6c1b345bcd370771cc",
+      "1a640c71ff9bb45505d89761f19efa8f", "e192f64322e0edb250b52f63aaa4de97",
+      "2ccbe012ca167114b14c3ba70befa960", "0f68632d7e5faddb4554ca430d1df822",
+      "8caa0061a26e142b783951d5abd7bf5d", "b01eeed3ec549e4a593100d9c5ba587a",
       "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882",
       "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952",
-      "346a797b7cb4de10759e329f8b49e077", "8f4aa102e9b1ac430bdb9ebd4ec4cfca",
-      "5886397456b15e504ad55d8e0ce71e0e", "2a78b52ce43dc28606e83521963c00fa",
-      "8d3ef5280063337b0df97f91251bb8fc", "81f0ceada000ce40586be828a2045430",
-      "edb7b70a473392148bc419a44385326b", "97abe2eecaf9158a0529b234a241a57a",
-      "65729d750aa1258e4a7eccef247ac8c2", "78cc995e81188b9e8b29fa58796a3313",
-      "a1eb6a8c2f7c77e30e739a1b3b07cc74", "805b0f2f4b9d80f118d800b5ab4f603e",
-      "12610c83533f7170149390ba581f70b2", "cba20deed43b49ada3f626c91510995d",
-      "ba7ea35410b746fcbcf56c24ccb56d59", "933b2235b9b943984607d87f0bce1067",
-      "7ae59015295db8983bc8472429076464", "c18cce63327b367c0a260e9cbf4222b9",
-      "7c9672a7dfa964cb3ed3f2b4b443d2b6", "b29bcf1cc5369702e0179db1198db531",
-      "412326aff6c89116240b5d3ef63fa5cc", "3d854589fd171e42d118be4627ec5330",
-      "9a157e51e39ed314031224f074193791", "c645cdc63d3112f27b90cc9080c6d071",
-      "3f360cc336a4ee9a9bd78bde1a6e9eb3", "37b40fa8674d03a7cd66afdee939b9bf",
-      "cd6c7b98fe71b533c6a06d6d9122a6d0", "c26e0a0e90a969d762edcab770bed3b7",
-      "e517967d2cf4f1b0fff09d334475e2ae", "bc760a328a0a4b2d75593667adfa2a0e",
-      "b6239fdeeccc462640047cb2e2c2be96", "bc01f6a232ef9f0d9e57301779edd67f",
-      "cf6e8c1823c5498fa5589db40406a6ad", "2a9a4bd0bd84f0b85225a5b30f5eaa16",
-      "56f7bb2265dbd8a563bb269aa527c8a3", "fcbed0f0350be5a1384f95f8090d262e",
-      "f3ecf2e5747ebff65ac78ecbe7cc5e6a", "1d57d1371ad2f5f320cc4de789665f7c",
-      "e9f400fee64673b0f6313400fe449135", "5dfdc4a8376740011c777df46418b5d2",
-      "a4eb2c077300c0d8eeda028c9db3a63a", "90551259280c2b2150f018304204f072",
-      "4cbcd76496fc5b841cd164b6067b9c0b", "895964acc7b7e7d084de2266421c351b",
-      "af2e05159d369d0e3b72707f242b2845", "c7d393cef751950df3b9ed8056a9ffce",
-      "788541c0807aed47b863d47e5912555d", "163a06512f48c1b0f2535c8c50815bcc",
-      "dc5e723bab9fbfd7074a62e05b6b3c2b", "bf91200ce1bf97b4642a601adc13d700",
-      "d93fcefa6b9004baaab76d436e7ac931", "e89a2111caecc6bcf5f2b42ea0167ab4",
-      "e04a058df9b87878ca97edc1c42e76e1", "5d1f60876147edd6ed29d1fb50172464",
-      "655fb228aa410fd244c58c87fe510bec", "639a8a0a8f62d628136f5a97b3728b69",
-      "5b60f2428b092a502d6471fa09befd7f", "40601555ac945b4d37d3434b6e5619be",
-      "02be23bf1f89d5f5af02a39b98f96142", "9347a45bd54d28d8105f8183996b3505",
-      "d8429cc7b0b388981861a0fdd40289f0", "c4b7fab3b044486f663e160c07805e0a",
-      "f5f5d513b1f1c13d0abc70fc18afea48", "f236795ea30f1b8761b268734a245ba1",
-      "c7b7452ea8247a3a40248278d08953d5", "ddd6ba3c5ec56cc7a0b0161ae67001fa",
-      "94675749f2db46a8ade6f2f211db9a32", "3d165364ff96a5ef39e67a53fe3ed3be",
-      "3d1d66a9401fd7e78050724ca1fa0419",
+      "346a797b7cb4de10759e329f8b49e077", "d4929154275255f2d786d6fc42c7c5d3",
+      "18a6af6f36ca1ea4ab6f5a76505de040", "0c43e68414bfc02f9b20e796506f643b",
+      "9f483f543f6b1d58e23abf9337ed6fe6", "e114860c2538b63f1be4a23560420cdc",
+      "da8680798f96572c46155c7838b452c3", "20b47a27617297231843c0f2ed7b559b",
+      "16fa4a4f33a32e28c79da83dca63fd41", "76e2c1d3c323777a3c478e11e1ba6bf2",
+      "dccdfd52a71855cc4da18af52bda4c03", "121befbd6c246e85a34225241b8bcaf1",
+      "5780757555fd87ca1ff3f1b498a1d6e9", "6b0be2256285694b1edc0201608e1326",
+      "b7ef338c58d17f69426b5a99170c7295", "b92b84b5b3d01afac02fb9c092e84b06",
+      "e6ef7fea8b183f871c4306c4f49370c5", "c1bf95c05774d8471504e57a3efa66e4",
+      "bbacdbdafc625a139361ec22fe2cf003", "5fbbb2d6ca8fc6d07ca8d4105fda4a01",
+      "c1cbb295d9f00aa865d91a95e96f99b2", "1490e4f2c874a76ecc2bbf35dce446c3",
+      "c3bd73daaeec39895a8b64812773c93c", "6d385068ef3afbd821183d36851f709b",
+      "a34c52ef7f2fd04d1cd420238641ef48", "45d10029358c6835cf968a30605659ea",
+      "a72c1bb18cf9312c5713ce0de370743d", "df7368db2a7515a1c06a4c9dd9e32ebf",
+      "52782632271caccfa9a35ed7533e2052", "6f0ef9b62d2b9956a6464694b7a86b79",
+      "814dbc176f7201725a1cfd1cf668b4b9", "065ffbee984f4b9343c8acb0eb04fcbe",
+      "0915d76ce458d5164e3c90c1ce150795", "bf2b431d9bfa7a9925ea6f6509267ae9",
+      "d3df8c0c940a01b7bf3c3afb80b6dcd4", "15ab86216c9856a8427a51fe599258a3",
+      "2cb078484472c88e26b7401c9f11cf51", "7c5f68cc098c8adabc9e26f9cd549151",
+      "a8e47da1fcc91c2bc74d030892621576", "71af422ba2d86a401f8278591c0ef540",
+      "964c902bb4698ce82f4aa0a1edc80cd6", "78271c37d62af86576dab72ed59746b3",
+      "7247c3a7534a41137027e7d3f255f5ef", "8e529ab964f5f9d0f7c3ced98239cfc8",
+      "2481ed50bff6b36a3cac6dca2aca5ae5", "78a1ff18bf217d45f5170675dee26948",
+      "00fc534119c13aa7af4b818cad9218a2", "67501a83c93f2f9debfa86955bdffde5",
+      "2a512ef738e33a4d8476f72654deffb4", "f4eef28078bbc12de9cfb5bc2fef6238",
+      "b7ac3a35205a978bed587356155bae0e", "51ea101f09c4de2f754b61ab5aff1526",
+      "2bd689d7ec964ee8c8f6f0682f93f5ca", "eecac8dbdaa73b8b3c2234892c444147",
+      "cb7086f44ef70ef919086a3d200d8c13", "0abe35e3c796c2de1e550426b2b19441",
+      "0eb140561e1ea3843464a5247d8ecb18", "d908f7317f00daacbe3dd43495db64ad",
+      "d4d677c4b347de0a13ccab7bc16b8e6e", "26523c2c2df7f31896a3ae5aa24d5ada",
+      "0ebb9f816684769816b2ae0b1f94e3a4", "fd938d0577e3687b0a810e199f69f0bb",
+      "eb8fb832e72030e2aa214936ae0effe4", "56631887763f7daf6e1e73783e5ff656",
+      "590a25cc722c2aa4d885eede5ef09f20", "80944a218ed9b9b0374cde72914449eb",
+      "d9cbc2f1e0e56cdd6722310932db1981", "a88eb213b7a6767bbe639cda120a4ab6",
+      "9972ecbadfdf3ed0b3fedf435c5a804f", "01fdf7e22405a1b17a8d275b7451094f",
+      "6a7824e10406fade0d032e886bbc76b6", "76fefadd793ec3928e915d92782bc7e1",
+      "0fbd6b076752c9f5c926ca5c1df892ac", "aac9457239f07ad633fcd45c1465af2a",
+      "56823ef9a8e21c9c7441cc9ed870d648", "52f4c7a0b7177175302652cbc482f442",
+      "f4a4f4d7c8b93c0486cf3cbaa26fbc19",
   };
   return kDigest[id];
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
 struct MaskBlendTestParam {
-  MaskBlendTestParam(int width, int height, int subsampling_x,
-                     int subsampling_y, bool is_inter_intra,
-                     bool is_wedge_inter_intra)
-      : width(width),
-        height(height),
+  MaskBlendTestParam(BlockSize block_size, int subsampling_x, int subsampling_y,
+                     bool is_inter_intra, bool is_wedge_inter_intra)
+      : block_size(block_size),
+        width(kBlockWidthPixels[block_size]),
+        height(kBlockHeightPixels[block_size]),
         subsampling_x(subsampling_x),
         subsampling_y(subsampling_y),
         is_inter_intra(is_inter_intra),
         is_wedge_inter_intra(is_wedge_inter_intra) {}
+  BlockSize block_size;
   int width;
   int height;
   int subsampling_x;
@@ -169,7 +181,7 @@ struct MaskBlendTestParam {
 };
 
 std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) {
-  return os << "BlockSize" << param.width << "x" << param.height
+  return os << ToString(param.block_size)
             << ", subsampling(x/y): " << param.subsampling_x << "/"
             << param.subsampling_y
             << ", is_inter_intra: " << param.is_inter_intra
@@ -215,40 +227,44 @@ class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>,
  protected:
   int GetDigestIdOffset() const {
     // id is for retrieving the corresponding digest from the lookup table given
-    // the set of input parameters. id can be figured out by its width, height
-    // and an offset (id_offset).
+    // the set of input parameters. id is determined by the block size and
+    // an offset (id_offset).
     // For example, in kMaskBlendTestParam, this set of parameters
     // (8, 8, 0, 0, false, false) corresponds to the first entry in the
     // digest lookup table, where id == 0.
-    // (8, 8, 1, 0, false, false) corresponds to id == 13.
-    // (8, 8, 1, 1, false, false) corresponds to id == 26.
-    // (8, 8, 0, 0, true, false) corresponds to id == 39.
+    // (8, 8, 1, 0, false, false) corresponds to id == 17.
+    // (8, 8, 1, 1, false, false) corresponds to id == 34.
+    // (8, 8, 0, 0, true, false) corresponds to id == 51.
     // id_offset denotes the offset for different modes (is_inter_intra,
-    // is_wedge_inter_intra). Width and height help to figure out id:
-    // width = 8, height = 8, id = id_offset + log2(8) - 3.
-    // width = 8, height = 16, id = id_offset + log2(min(width, height) - 3 + 1.
+    // is_wedge_inter_intra).
     // ...
     if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) {
-      return param_.subsampling_x * 13 + param_.subsampling_y * 13;
+      return param_.subsampling_x * 17 + param_.subsampling_y * 17;
     }
     if (param_.is_inter_intra && !param_.is_wedge_inter_intra) {
-      return 39 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+      return 51 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
     }
     if (param_.is_inter_intra && param_.is_wedge_inter_intra) {
-      return 60 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+      return 72 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
     }
     return 0;
   }
 
   int GetDigestId() const {
-    int id = GetDigestIdOffset();
-    if (param_.width == param_.height) {
-      return id + 3 * (FloorLog2(param_.width) - 3);
+    // Only 8x8 and larger blocks are tested.
+    int block_size_adjustment =
+        static_cast<int>(param_.block_size > kBlock16x4);
+    if (param_.is_inter_intra || param_.is_wedge_inter_intra) {
+      // 4:1/1:4 blocks are invalid for these modes.
+      block_size_adjustment += static_cast<int>(param_.block_size > kBlock8x32);
+      block_size_adjustment +=
+          static_cast<int>(param_.block_size > kBlock16x64);
+      block_size_adjustment += static_cast<int>(param_.block_size > kBlock32x8);
+      block_size_adjustment +=
+          static_cast<int>(param_.block_size > kBlock64x16);
     }
-    if (param_.width < param_.height) {
-      return id + 1 + 3 * (FloorLog2(param_.width) - 3);
-    }
-    return id + 2 + 3 * (FloorLog2(param_.height) - 3);
+    return GetDigestIdOffset() + param_.block_size - kBlock8x8 -
+           block_size_adjustment;
   }
 
   void Test(const char* digest, int num_runs);
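The offsets in GetDigestIdOffset come from counting table entries: 17 block
sizes per (subsampling_x, subsampling_y) combination when is_inter_intra is
false, hence 0/17/34; the inter-intra table then begins at 51 = 3 * 17, and
the wedge table at 72 = 51 + 3 * 7. A minimal sketch of the non-inter-intra
mapping, using a hypothetical contiguous enum of only the 17 tested sizes
(libgav1's real BlockSize enum interleaves untested sizes such as kBlock16x4,
which is exactly what block_size_adjustment compensates for):

```cpp
#include <cassert>

// Hypothetical, contiguous stand-in for the 17 tested block sizes, in the
// order of kMaskBlendTestParam. libgav1's real enum is not contiguous here.
enum BlockSize {
  kBlock8x8, kBlock8x16, kBlock8x32, kBlock16x8, kBlock16x16, kBlock16x32,
  kBlock16x64, kBlock32x8, kBlock32x16, kBlock32x32, kBlock32x64, kBlock64x16,
  kBlock64x32, kBlock64x64, kBlock64x128, kBlock128x64, kBlock128x128,
};

// Non-inter-intra mapping: 17 entries per (subsampling_x, subsampling_y).
int DigestId(BlockSize size, int subsampling_x, int subsampling_y) {
  return subsampling_x * 17 + subsampling_y * 17 + (size - kBlock8x8);
}

int main() {
  assert(DigestId(kBlock8x8, 0, 0) == 0);   // first table entry
  assert(DigestId(kBlock8x8, 1, 0) == 17);  // matches the comment's id == 17
  assert(DigestId(kBlock8x8, 1, 1) == 34);  // matches the comment's id == 34
  assert(DigestId(kBlock128x128, 1, 1) == 50);  // last entry before 51
  return 0;
}
```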
@@ -353,100 +369,112 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
     elapsed_time += absl::Now() - start;
   }
 
-  test_utils::CheckMd5Digest(
-      "MaskBlend",
-      absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), digest,
-      dest_, sizeof(dest_), elapsed_time);
+  test_utils::CheckMd5Digest("MaskBlend", ToString(param_.block_size), digest,
+                             dest_, sizeof(dest_), elapsed_time);
 }
 
 const MaskBlendTestParam kMaskBlendTestParam[] = {
     // is_inter_intra = false, is_wedge_inter_intra = false.
     // block size range is from 8x8 to 128x128.
-    MaskBlendTestParam(8, 8, 0, 0, false, false),
-    MaskBlendTestParam(8, 16, 0, 0, false, false),
-    MaskBlendTestParam(16, 8, 0, 0, false, false),
-    MaskBlendTestParam(16, 16, 0, 0, false, false),
-    MaskBlendTestParam(16, 32, 0, 0, false, false),
-    MaskBlendTestParam(32, 16, 0, 0, false, false),
-    MaskBlendTestParam(32, 32, 0, 0, false, false),
-    MaskBlendTestParam(32, 64, 0, 0, false, false),
-    MaskBlendTestParam(64, 32, 0, 0, false, false),
-    MaskBlendTestParam(64, 64, 0, 0, false, false),
-    MaskBlendTestParam(64, 128, 0, 0, false, false),
-    MaskBlendTestParam(128, 64, 0, 0, false, false),
-    MaskBlendTestParam(128, 128, 0, 0, false, false),
-    MaskBlendTestParam(8, 8, 1, 0, false, false),
-    MaskBlendTestParam(8, 16, 1, 0, false, false),
-    MaskBlendTestParam(16, 8, 1, 0, false, false),
-    MaskBlendTestParam(16, 16, 1, 0, false, false),
-    MaskBlendTestParam(16, 32, 1, 0, false, false),
-    MaskBlendTestParam(32, 16, 1, 0, false, false),
-    MaskBlendTestParam(32, 32, 1, 0, false, false),
-    MaskBlendTestParam(32, 64, 1, 0, false, false),
-    MaskBlendTestParam(64, 32, 1, 0, false, false),
-    MaskBlendTestParam(64, 64, 1, 0, false, false),
-    MaskBlendTestParam(64, 128, 1, 0, false, false),
-    MaskBlendTestParam(128, 64, 1, 0, false, false),
-    MaskBlendTestParam(128, 128, 1, 0, false, false),
-    MaskBlendTestParam(8, 8, 1, 1, false, false),
-    MaskBlendTestParam(8, 16, 1, 1, false, false),
-    MaskBlendTestParam(16, 8, 1, 1, false, false),
-    MaskBlendTestParam(16, 16, 1, 1, false, false),
-    MaskBlendTestParam(16, 32, 1, 1, false, false),
-    MaskBlendTestParam(32, 16, 1, 1, false, false),
-    MaskBlendTestParam(32, 32, 1, 1, false, false),
-    MaskBlendTestParam(32, 64, 1, 1, false, false),
-    MaskBlendTestParam(64, 32, 1, 1, false, false),
-    MaskBlendTestParam(64, 64, 1, 1, false, false),
-    MaskBlendTestParam(64, 128, 1, 1, false, false),
-    MaskBlendTestParam(128, 64, 1, 1, false, false),
-    MaskBlendTestParam(128, 128, 1, 1, false, false),
+    MaskBlendTestParam(kBlock8x8, 0, 0, false, false),
+    MaskBlendTestParam(kBlock8x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock8x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x8, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock16x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x8, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock32x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x16, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x32, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock64x128, 0, 0, false, false),
+    MaskBlendTestParam(kBlock128x64, 0, 0, false, false),
+    MaskBlendTestParam(kBlock128x128, 0, 0, false, false),
+    MaskBlendTestParam(kBlock8x8, 1, 0, false, false),
+    MaskBlendTestParam(kBlock8x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock8x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x8, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock16x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x8, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock32x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x16, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x32, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock64x128, 1, 0, false, false),
+    MaskBlendTestParam(kBlock128x64, 1, 0, false, false),
+    MaskBlendTestParam(kBlock128x128, 1, 0, false, false),
+    MaskBlendTestParam(kBlock8x8, 1, 1, false, false),
+    MaskBlendTestParam(kBlock8x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock8x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x8, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock16x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x8, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock32x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x16, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x32, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock64x128, 1, 1, false, false),
+    MaskBlendTestParam(kBlock128x64, 1, 1, false, false),
+    MaskBlendTestParam(kBlock128x128, 1, 1, false, false),
     // is_inter_intra = true, is_wedge_inter_intra = false.
-    // block size range is from 8x8 to 32x32.
-    MaskBlendTestParam(8, 8, 0, 0, true, false),
-    MaskBlendTestParam(8, 16, 0, 0, true, false),
-    MaskBlendTestParam(16, 8, 0, 0, true, false),
-    MaskBlendTestParam(16, 16, 0, 0, true, false),
-    MaskBlendTestParam(16, 32, 0, 0, true, false),
-    MaskBlendTestParam(32, 16, 0, 0, true, false),
-    MaskBlendTestParam(32, 32, 0, 0, true, false),
-    MaskBlendTestParam(8, 8, 1, 0, true, false),
-    MaskBlendTestParam(8, 16, 1, 0, true, false),
-    MaskBlendTestParam(16, 8, 1, 0, true, false),
-    MaskBlendTestParam(16, 16, 1, 0, true, false),
-    MaskBlendTestParam(16, 32, 1, 0, true, false),
-    MaskBlendTestParam(32, 16, 1, 0, true, false),
-    MaskBlendTestParam(32, 32, 1, 0, true, false),
-    MaskBlendTestParam(8, 8, 1, 1, true, false),
-    MaskBlendTestParam(8, 16, 1, 1, true, false),
-    MaskBlendTestParam(16, 8, 1, 1, true, false),
-    MaskBlendTestParam(16, 16, 1, 1, true, false),
-    MaskBlendTestParam(16, 32, 1, 1, true, false),
-    MaskBlendTestParam(32, 16, 1, 1, true, false),
-    MaskBlendTestParam(32, 32, 1, 1, true, false),
+    // block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks, Section 5.11.28
+    // Read inter intra syntax).
+    MaskBlendTestParam(kBlock8x8, 0, 0, true, false),
+    MaskBlendTestParam(kBlock8x16, 0, 0, true, false),
+    MaskBlendTestParam(kBlock16x8, 0, 0, true, false),
+    MaskBlendTestParam(kBlock16x16, 0, 0, true, false),
+    MaskBlendTestParam(kBlock16x32, 0, 0, true, false),
+    MaskBlendTestParam(kBlock32x16, 0, 0, true, false),
+    MaskBlendTestParam(kBlock32x32, 0, 0, true, false),
+    MaskBlendTestParam(kBlock8x8, 1, 0, true, false),
+    MaskBlendTestParam(kBlock8x16, 1, 0, true, false),
+    MaskBlendTestParam(kBlock16x8, 1, 0, true, false),
+    MaskBlendTestParam(kBlock16x16, 1, 0, true, false),
+    MaskBlendTestParam(kBlock16x32, 1, 0, true, false),
+    MaskBlendTestParam(kBlock32x16, 1, 0, true, false),
+    MaskBlendTestParam(kBlock32x32, 1, 0, true, false),
+    MaskBlendTestParam(kBlock8x8, 1, 1, true, false),
+    MaskBlendTestParam(kBlock8x16, 1, 1, true, false),
+    MaskBlendTestParam(kBlock16x8, 1, 1, true, false),
+    MaskBlendTestParam(kBlock16x16, 1, 1, true, false),
+    MaskBlendTestParam(kBlock16x32, 1, 1, true, false),
+    MaskBlendTestParam(kBlock32x16, 1, 1, true, false),
+    MaskBlendTestParam(kBlock32x32, 1, 1, true, false),
     // is_inter_intra = true, is_wedge_inter_intra = true.
-    // block size range is from 8x8 to 32x32.
-    MaskBlendTestParam(8, 8, 0, 0, true, true),
-    MaskBlendTestParam(8, 16, 0, 0, true, true),
-    MaskBlendTestParam(16, 8, 0, 0, true, true),
-    MaskBlendTestParam(16, 16, 0, 0, true, true),
-    MaskBlendTestParam(16, 32, 0, 0, true, true),
-    MaskBlendTestParam(32, 16, 0, 0, true, true),
-    MaskBlendTestParam(32, 32, 0, 0, true, true),
-    MaskBlendTestParam(8, 8, 1, 0, true, true),
-    MaskBlendTestParam(8, 16, 1, 0, true, true),
-    MaskBlendTestParam(16, 8, 1, 0, true, true),
-    MaskBlendTestParam(16, 16, 1, 0, true, true),
-    MaskBlendTestParam(16, 32, 1, 0, true, true),
-    MaskBlendTestParam(32, 16, 1, 0, true, true),
-    MaskBlendTestParam(32, 32, 1, 0, true, true),
-    MaskBlendTestParam(8, 8, 1, 1, true, true),
-    MaskBlendTestParam(8, 16, 1, 1, true, true),
-    MaskBlendTestParam(16, 8, 1, 1, true, true),
-    MaskBlendTestParam(16, 16, 1, 1, true, true),
-    MaskBlendTestParam(16, 32, 1, 1, true, true),
-    MaskBlendTestParam(32, 16, 1, 1, true, true),
-    MaskBlendTestParam(32, 32, 1, 1, true, true),
+    // block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks, Section 5.11.28
+    // Read inter intra syntax).
+    MaskBlendTestParam(kBlock8x8, 0, 0, true, true),
+    MaskBlendTestParam(kBlock8x16, 0, 0, true, true),
+    MaskBlendTestParam(kBlock16x8, 0, 0, true, true),
+    MaskBlendTestParam(kBlock16x16, 0, 0, true, true),
+    MaskBlendTestParam(kBlock16x32, 0, 0, true, true),
+    MaskBlendTestParam(kBlock32x16, 0, 0, true, true),
+    MaskBlendTestParam(kBlock32x32, 0, 0, true, true),
+    MaskBlendTestParam(kBlock8x8, 1, 0, true, true),
+    MaskBlendTestParam(kBlock8x16, 1, 0, true, true),
+    MaskBlendTestParam(kBlock16x8, 1, 0, true, true),
+    MaskBlendTestParam(kBlock16x16, 1, 0, true, true),
+    MaskBlendTestParam(kBlock16x32, 1, 0, true, true),
+    MaskBlendTestParam(kBlock32x16, 1, 0, true, true),
+    MaskBlendTestParam(kBlock32x32, 1, 0, true, true),
+    MaskBlendTestParam(kBlock8x8, 1, 1, true, true),
+    MaskBlendTestParam(kBlock8x16, 1, 1, true, true),
+    MaskBlendTestParam(kBlock16x8, 1, 1, true, true),
+    MaskBlendTestParam(kBlock16x16, 1, 1, true, true),
+    MaskBlendTestParam(kBlock16x32, 1, 1, true, true),
+    MaskBlendTestParam(kBlock32x16, 1, 1, true, true),
+    MaskBlendTestParam(kBlock32x32, 1, 1, true, true),
 };
 
 using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>;
diff --git a/src/dsp/motion_vector_search.cc b/src/dsp/motion_vector_search.cc
index 9402302..2f6fcfd 100644
--- a/src/dsp/motion_vector_search.cc
+++ b/src/dsp/motion_vector_search.cc
@@ -35,10 +35,10 @@ namespace {
      !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
 
 void MvProjectionCompoundLowPrecision_C(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
     const int reference_offsets[2], const int count,
-    CompoundMotionVector* const candidate_mvs) {
+    CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
   // To help the compiler, make a local copy of |reference_offsets|.
   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
   int index = 0;
@@ -62,10 +62,10 @@ void MvProjectionCompoundLowPrecision_C(
 }
 
 void MvProjectionCompoundForceInteger_C(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
     const int reference_offsets[2], const int count,
-    CompoundMotionVector* const candidate_mvs) {
+    CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
   // To help the compiler, make a local copy of |reference_offsets|.
   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
   int index = 0;
@@ -91,10 +91,10 @@ void MvProjectionCompoundForceInteger_C(
 }
 
 void MvProjectionCompoundHighPrecision_C(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets,
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
     const int reference_offsets[2], const int count,
-    CompoundMotionVector* const candidate_mvs) {
+    CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
   // To help the compiler, make a local copy of |reference_offsets|.
   const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
   int index = 0;
@@ -113,9 +113,10 @@ void MvProjectionCompoundHighPrecision_C(
 }
 
 void MvProjectionSingleLowPrecision_C(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets, const int reference_offset,
-    const int count, MotionVector* const candidate_mvs) {
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
   int index = 0;
   do {
     GetMvProjection(
@@ -131,9 +132,10 @@ void MvProjectionSingleLowPrecision_C(
 }
 
 void MvProjectionSingleForceInteger_C(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets, const int reference_offset,
-    const int count, MotionVector* const candidate_mvs) {
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
   int index = 0;
   do {
     GetMvProjection(
@@ -151,9 +153,10 @@ void MvProjectionSingleForceInteger_C(
 }
 
 void MvProjectionSingleHighPrecision_C(
-    const MotionVector* const temporal_mvs,
-    const int8_t* const temporal_reference_offsets, const int reference_offset,
-    const int count, MotionVector* const candidate_mvs) {
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
   int index = 0;
   do {
     GetMvProjection(
diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc
index 46d1b5b..6b5c6e3 100644
--- a/src/dsp/obmc.cc
+++ b/src/dsp/obmc.cc
@@ -30,15 +30,18 @@ namespace {
 
 // 7.11.3.10 (from top samples).
 template <typename Pixel>
-void OverlapBlendVertical_C(void* const prediction,
+void OverlapBlendVertical_C(void* LIBGAV1_RESTRICT const prediction,
                             const ptrdiff_t prediction_stride, const int width,
-                            const int height, const void* const obmc_prediction,
+                            const int height,
+                            const void* LIBGAV1_RESTRICT const obmc_prediction,
                             const ptrdiff_t obmc_prediction_stride) {
   auto* pred = static_cast<Pixel*>(prediction);
   const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
   const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
   const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
   const uint8_t* const mask = kObmcMask + height - 2;
+  assert(width >= 4);
+  assert(height >= 2);
 
   for (int y = 0; y < height; ++y) {
     const uint8_t mask_value = mask[y];
@@ -53,16 +56,19 @@ void OverlapBlendVertical_C(void* const prediction,
 
 // 7.11.3.10 (from left samples).
 template <typename Pixel>
-void OverlapBlendHorizontal_C(void* const prediction,
-                              const ptrdiff_t prediction_stride,
-                              const int width, const int height,
-                              const void* const obmc_prediction,
-                              const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendHorizontal_C(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
   auto* pred = static_cast<Pixel*>(prediction);
   const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
   const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
   const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
   const uint8_t* const mask = kObmcMask + width - 2;
+  assert(width >= 2);
+  assert(height >= 4);
+
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       const uint8_t mask_value = mask[x];
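The new asserts encode the AV1 constraints on OBMC regions: blending from the
top samples requires at least 4 columns and 2 rows, while blending from the
left requires at least 2 columns and 4 rows, which is also why the 2x2 and
direction-mismatched sizes disappear from the test tables below. For
reference, the per-pixel operation inside these loops is a 6-bit mask
interpolation; a scalar sketch, with the rounding assumed to match libgav1's
RightShiftWithRounding convention:

```cpp
#include <cstdint>

// Assumed rounding helper, equivalent in spirit to libgav1's.
inline int RightShiftWithRounding(int value, int bits) {
  return (value + (1 << (bits - 1))) >> bits;
}

// Per-pixel OBMC blend at 8bpp: mask_value is in [0, 64] and the two weights
// sum to 64, so the result never leaves the 8-bit range.
inline uint8_t ObmcBlendPixel(uint8_t pred, uint8_t obmc_pred,
                              uint8_t mask_value) {
  return static_cast<uint8_t>(RightShiftWithRounding(
      mask_value * pred + (64 - mask_value) * obmc_pred, 6));
}
```

At higher bitdepths the Pixel type widens but the 6-bit weights are unchanged.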
diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc
index 60b10c6..386d1a6 100644
--- a/src/dsp/obmc_test.cc
+++ b/src/dsp/obmc_test.cc
@@ -42,18 +42,16 @@ namespace {
 #include "src/dsp/obmc.inc"
 
 constexpr int kMaxBlendingBlockSize = 64;
-constexpr int kNumSpeedTests = 1000000;
+constexpr int kNumSpeedTests = 2e8;
 
 const char* GetDigest8bpp(int id) {
   static const char* const kDigest[] = {
-      "76906f87892c30c7059a5c97e4838c42", "0b8670d937217c66425f2662b51eebbe",
       "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f",
       "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b",
       "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c",
       "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519",
       "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b",
-      "e2152dea159249149ff4151111b73ed6", "6b44c0052789ce2fa4df882f35618e7d",
-      "1edd570bec7e63780d83588f6aacda25", "b04b81c9e52c58885907dc7f1ef2c11c",
+      "e2152dea159249149ff4151111b73ed6", "1edd570bec7e63780d83588f6aacda25",
       "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14",
       "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546",
       "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2",
@@ -65,14 +63,12 @@ const char* GetDigest8bpp(int id) {
 
 const char* GetDigestSpeed8bpp(int id) {
   static const char* const kDigest[] = {
-      "c5b532f5960477bdd50684ab25fae0f4", "bf76ed404bc5674e0a4ff238efceb62b",
       "5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9",
       "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345",
       "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c",
       "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049",
       "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b",
-      "e3a1960b0a7238db1184a3f9d8e9a4b2", "721c7e8ec3aa0608b64f10f7ff5427db",
-      "ba9938553703d520bc0ade427c397140", "8b6e15e8ecd234363f70f51c64b0aea1",
+      "e3a1960b0a7238db1184a3f9d8e9a4b2", "ba9938553703d520bc0ade427c397140",
       "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e",
       "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70",
       "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad",
@@ -85,14 +81,12 @@ const char* GetDigestSpeed8bpp(int id) {
 #if LIBGAV1_MAX_BITDEPTH >= 10
 const char* GetDigest10bpp(int id) {
   static const char* const kDigest[] = {
-      "6ab8f28e8fb3c4b10b23efee38d4154e", "d4374005d34e43e06c1b0c906289dadd",
       "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075",
       "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3",
       "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d",
       "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca",
       "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6",
-      "d2e70b81882aeb3d9fccef89e7552a9d", "4a692ddf91905727bc524d91735cf93c",
-      "f5d882ee6d9ae6f7dfa467ca99301424", "58821b87e7d9d4388d6003ffcb3723d1",
+      "d2e70b81882aeb3d9fccef89e7552a9d", "f5d882ee6d9ae6f7dfa467ca99301424",
       "824ddb98eb4129b3d254c0bc7a64cd73", "5eaaafa8ef9b7ba5e2856a947e5b33df",
       "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78",
       "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc",
@@ -104,14 +98,12 @@ const char* GetDigest10bpp(int id) {
 
 const char* GetDigestSpeed10bpp(int id) {
   static const char* const kDigest[] = {
-      "df59e5fd6e0237a56381f3a516806eb8", "f478bdf43e0b91b8dc9b2661eb207e49",
       "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5",
       "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225",
       "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246",
       "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c",
       "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863",
-      "a148bbafdc4466fbb700b31acccca8ac", "ff0566921ff2d5145f79fbf409508fb2",
-      "5da9d960988549f53b817003b93e4d01", "fa9028b2ed049ad71b5fd15f2daacbe5",
+      "a148bbafdc4466fbb700b31acccca8ac", "5da9d960988549f53b817003b93e4d01",
       "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089",
       "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d",
       "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed",
@@ -165,16 +157,19 @@ class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
 
  protected:
   int GetDigestId() const {
-    // blending_direction_ == 0:
+    // blending_direction_ == kObmcDirectionVertical:
     // (width, height):
-    // (2, 2), id = 0. (2, 4), id = 1. (4, 2), id = 2.
-    // (4, 4), id = 3. (4, 8), id = 4. (8, 4), id = 5.
+    // (4, 2), id = 0. (4, 4), id = 1. (4, 8), id = 2. (8, 4), id = 3.
     // ...
-    // blending_direction_ == 1: id starts from 13.
-    const int id = (blending_direction_ == kObmcDirectionVertical) ? 0 : 13;
-    if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1);
-    if (width_ < height_) return id + 1 + 3 * (FloorLog2(width_) - 1);
-    return id + 2 + 3 * (FloorLog2(height_) - 1);
+    // blending_direction_ == kObmcDirectionHorizontal: id starts from 11.
+    // Vertical skips (2, 4) while horizontal skips (4, 2), creating a gap
+    // after (2, 4).
+    const int id = (blending_direction_ == kObmcDirectionVertical) ? 0
+                   : (width_ == 2)                                 ? 12
+                                                                   : 11;
+    if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1) - 2;
+    if (width_ < height_) return id + 3 * (FloorLog2(width_) - 1) - 1;
+    return id + 3 * (FloorLog2(height_) - 1);
   }
 
   // Note |digest| is only used when |use_fixed_values| is false.
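The remapped ids are easiest to sanity-check at the boundary cases: a
self-contained sketch mirroring the arithmetic above (FloorLog2 reimplemented
locally), with assertions for the first and last vertical entries and the
horizontal (2, 4) special case:

```cpp
#include <cassert>

int FloorLog2(int n) {
  int result = 0;
  while (n > 1) { n >>= 1; ++result; }
  return result;
}

// Mirrors ObmcBlendTest::GetDigestId() as revised in the hunk above.
int ObmcDigestId(int width, int height, bool vertical) {
  const int id = vertical ? 0 : (width == 2) ? 12 : 11;
  if (width == height) return id + 3 * (FloorLog2(width) - 1) - 2;
  if (width < height) return id + 3 * (FloorLog2(width) - 1) - 1;
  return id + 3 * (FloorLog2(height) - 1);
}

int main() {
  assert(ObmcDigestId(4, 2, true) == 0);     // first vertical entry
  assert(ObmcDigestId(32, 32, true) == 10);  // last of 11 vertical entries
  assert(ObmcDigestId(2, 4, false) == 11);   // horizontal starts at 11
  assert(ObmcDigestId(4, 4, false) == 12);   // the 12 base covers the gap
  return 0;
}
```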
@@ -184,7 +179,7 @@ class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
  private:
   const int width_ = GetParam().width;
   const int height_ = GetParam().height;
-  const int blending_direction_ = GetParam().blending_direction;
+  const ObmcDirection blending_direction_ = GetParam().blending_direction;
   Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
   Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
   dsp::ObmcBlendFunc func_;
@@ -223,8 +218,9 @@ void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest,
     EXPECT_TRUE(success);
   } else {
     test_utils::CheckMd5Digest(
-        "Obmc", absl::StrFormat("%dx%d", width_, height_).c_str(), digest,
-        source1_, sizeof(source1_), absl::Duration());
+        ToString(blending_direction_),
+        absl::StrFormat("%dx%d", width_, height_).c_str(), digest, source1_,
+        sizeof(source1_), absl::Duration());
   }
 }
 
@@ -256,14 +252,12 @@ void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
   }
   memcpy(source1_, dest,
          sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
-  test_utils::CheckMd5Digest("Obmc",
+  test_utils::CheckMd5Digest(ToString(blending_direction_),
                              absl::StrFormat("%dx%d", width_, height_).c_str(),
                              digest, source1_, sizeof(source1_), elapsed_time);
 }
 
 const ObmcTestParam kObmcTestParam[] = {
-    ObmcTestParam(2, 2, kObmcDirectionVertical),
-    ObmcTestParam(2, 4, kObmcDirectionVertical),
     ObmcTestParam(4, 2, kObmcDirectionVertical),
     ObmcTestParam(4, 4, kObmcDirectionVertical),
     ObmcTestParam(4, 8, kObmcDirectionVertical),
@@ -275,9 +269,7 @@ const ObmcTestParam kObmcTestParam[] = {
     ObmcTestParam(16, 32, kObmcDirectionVertical),
     ObmcTestParam(32, 16, kObmcDirectionVertical),
     ObmcTestParam(32, 32, kObmcDirectionVertical),
-    ObmcTestParam(2, 2, kObmcDirectionHorizontal),
     ObmcTestParam(2, 4, kObmcDirectionHorizontal),
-    ObmcTestParam(4, 2, kObmcDirectionHorizontal),
     ObmcTestParam(4, 4, kObmcDirectionHorizontal),
     ObmcTestParam(4, 8, kObmcDirectionHorizontal),
     ObmcTestParam(8, 4, kObmcDirectionHorizontal),
@@ -301,9 +293,8 @@ TEST_P(ObmcBlendTest8bpp, Blending) {
 }
 
 TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) {
-  TestSpeed(
-      GetDigestSpeed8bpp(GetDigestId()),
-      (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width));
+  TestSpeed(GetDigestSpeed8bpp(GetDigestId()),
+            kNumSpeedTests / (GetParam().height * GetParam().width));
 }
 
 INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp,
@@ -331,9 +322,8 @@ TEST_P(ObmcBlendTest10bpp, Blending) {
 }
 
 TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) {
-  TestSpeed(
-      GetDigestSpeed10bpp(GetDigestId()),
-      (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width));
+  TestSpeed(GetDigestSpeed10bpp(GetDigestId()),
+            kNumSpeedTests / (GetParam().height * GetParam().width));
 }
 
 INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp,
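The speed tests previously scaled a 1e6 base count by 32 * 32; the new code
folds the area normalization into kNumSpeedTests = 2e8 and divides by the
block area, so every block size still does a comparable amount of pixel work
while the base count shrinks. A worked check of the arithmetic:

```cpp
// Worked check: every block size processes roughly the same number of pixels.
constexpr int kNumSpeedTests = 2e8;
static_assert(kNumSpeedTests / (4 * 2) == 25000000,
              "4x2 blocks run 25M iterations");
static_assert(kNumSpeedTests / (32 * 32) == 195312,
              "32x32 blocks run ~195k iterations");
// 25000000 * 8 pixels and 195312 * 1024 pixels are both ~2e8 pixels.
```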
diff --git a/src/dsp/smooth_weights.inc b/src/dsp/smooth_weights.inc
new file mode 100644
index 0000000..d4ee8a6
--- /dev/null
+++ b/src/dsp/smooth_weights.inc
@@ -0,0 +1,35 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Each row below contains weights used for a corresponding block size. Because
+// the dimensions are adjacent powers of 2, the starting index of each row is
+// the sum of the sizes of the preceding rows, i.e. the block dimension minus 4.
+// The weights are declared as uint8_t or uint16_t, depending on the bitdepth,
+// so this file holds the values in a single canonical place.
+// clang-format off
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+    // clang-format on
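Because the block dimensions are powers of two, the sum 4 + 8 + ... + d/2
telescopes to d - 4, so the weights for block dimension d start at index
d - 4 (4 -> 0, 8 -> 4, 16 -> 12, 32 -> 28, 64 -> 60). A sketch of how such an
include is typically consumed; the array name and include path here are
assumptions, not the project's verbatim declarations:

```cpp
#include <cstdint>

// Assumed array name and include path; the real declarations live in the
// intra-prediction sources and may use uint16_t for higher bitdepths.
const uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};

// The row for block dimension d starts at index d - 4.
inline const uint8_t* SmoothWeightRow(int block_dimension) {
  return &kSmoothWeights[block_dimension - 4];
}
```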
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
index abb01a1..570ba73 100644
--- a/src/dsp/super_res.cc
+++ b/src/dsp/super_res.cc
@@ -25,11 +25,12 @@ namespace dsp {
 namespace {
 
 template <int bitdepth, typename Pixel>
-void SuperRes_C(const void* /*coefficients*/, void* const source,
+void SuperRes_C(const void* /*coefficients*/,
+                void* LIBGAV1_RESTRICT const source,
                 const ptrdiff_t source_stride, const int height,
                 const int downscaled_width, const int upscaled_width,
-                const int initial_subpixel_x, const int step, void* const dest,
-                ptrdiff_t dest_stride) {
+                const int initial_subpixel_x, const int step,
+                void* LIBGAV1_RESTRICT const dest, ptrdiff_t dest_stride) {
   assert(step <= 1 << kSuperResScaleBits);
   auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
   auto* dst = static_cast<Pixel*>(dest);
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc
index fbde65a..dd467ea 100644
--- a/src/dsp/warp.cc
+++ b/src/dsp/warp.cc
@@ -59,14 +59,14 @@ constexpr int kWarpedDiffPrecisionBits = 10;
 //   compound second pass output range: [    8129,    57403]
 
 template <bool is_compound, int bitdepth, typename Pixel>
-void Warp_C(const void* const source, ptrdiff_t source_stride,
+void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
             const int source_width, const int source_height,
-            const int* const warp_params, const int subsampling_x,
-            const int subsampling_y, const int block_start_x,
-            const int block_start_y, const int block_width,
-            const int block_height, const int16_t alpha, const int16_t beta,
-            const int16_t gamma, const int16_t delta, void* dest,
-            ptrdiff_t dest_stride) {
+            const int* LIBGAV1_RESTRICT const warp_params,
+            const int subsampling_x, const int subsampling_y,
+            const int block_start_x, const int block_start_y,
+            const int block_width, const int block_height, const int16_t alpha,
+            const int16_t beta, const int16_t gamma, const int16_t delta,
+            void* LIBGAV1_RESTRICT dest, ptrdiff_t dest_stride) {
   assert(block_width >= 8 && block_height >= 8);
   if (is_compound) {
     assert(dest_stride == block_width);
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
index 15d6bc6..41f4c70 100644
--- a/src/dsp/weight_mask.cc
+++ b/src/dsp/weight_mask.cc
@@ -29,8 +29,9 @@ namespace dsp {
 namespace {
 
 template <int width, int height, int bitdepth, bool mask_is_inverse>
-void WeightMask_C(const void* prediction_0, const void* prediction_1,
-                  uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask_C(const void* LIBGAV1_RESTRICT prediction_0,
+                  const void* LIBGAV1_RESTRICT prediction_1,
+                  uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
   using PredType =
       typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
   const auto* pred_0 = static_cast<const PredType*>(prediction_0);
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index ec9f589..911c5a9 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,8 +35,9 @@ namespace {
 
 constexpr int kInterPostRoundBit = 4;
 
-inline void AverageBlend4Row(const int16_t* prediction_0,
-                             const int16_t* prediction_1, uint8_t* dest) {
+inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                             const int16_t* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT dest) {
   const __m128i pred_0 = LoadLo8(prediction_0);
   const __m128i pred_1 = LoadLo8(prediction_1);
   __m128i res = _mm_add_epi16(pred_0, pred_1);
@@ -44,8 +45,9 @@ inline void AverageBlend4Row(const int16_t* prediction_0,
   Store4(dest, _mm_packus_epi16(res, res));
 }
 
-inline void AverageBlend8Row(const int16_t* prediction_0,
-                             const int16_t* prediction_1, uint8_t* dest) {
+inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                             const int16_t* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT dest) {
   const __m128i pred_0 = LoadAligned16(prediction_0);
   const __m128i pred_1 = LoadAligned16(prediction_1);
   __m128i res = _mm_add_epi16(pred_0, pred_1);
@@ -53,9 +55,10 @@ inline void AverageBlend8Row(const int16_t* prediction_0,
   StoreLo8(dest, _mm_packus_epi16(res, res));
 }
 
-inline void AverageBlendLargeRow(const int16_t* prediction_0,
-                                 const int16_t* prediction_1, const int width,
-                                 uint8_t* dest) {
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint8_t* LIBGAV1_RESTRICT dest) {
   int x = 0;
   do {
     const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
@@ -71,8 +74,10 @@ inline void AverageBlendLargeRow(const int16_t* prediction_0,
   } while (x < width);
 }
 
-void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
-                         const int width, const int height, void* const dest,
+void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         const int width, const int height,
+                         void* LIBGAV1_RESTRICT const dest,
                          const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
@@ -148,11 +153,11 @@ namespace {
 constexpr int kInterPostRoundBitPlusOne = 5;
 
 template <const int width, const int offset>
-inline void AverageBlendRow(const uint16_t* prediction_0,
-                            const uint16_t* prediction_1,
+inline void AverageBlendRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                            const uint16_t* LIBGAV1_RESTRICT prediction_1,
                             const __m128i& compound_offset,
                             const __m128i& round_offset, const __m128i& max,
-                            const __m128i& zero, uint16_t* dst,
+                            const __m128i& zero, uint16_t* LIBGAV1_RESTRICT dst,
                             const ptrdiff_t dest_stride) {
   // pred_0/1 max range is 16b.
   const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
@@ -182,9 +187,10 @@ inline void AverageBlendRow(const uint16_t* prediction_0,
   StoreHi8(dst + dest_stride, result);
 }
 
-void AverageBlend10bpp_SSE4_1(const void* prediction_0,
-                              const void* prediction_1, const int width,
-                              const int height, void* const dest,
+void AverageBlend10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                              const void* LIBGAV1_RESTRICT prediction_1,
+                              const int width, const int height,
+                              void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t dst_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
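These helpers vectorize the standard compound average: sum the two
predictions, then apply a rounding right shift of kInterPostRoundBit + 1 bits
(the extra bit is the divide-by-two of the average; the 10bpp path's
kInterPostRoundBitPlusOne below is the same constant spelled out), with
_mm_packus_epi16 supplying the clip for free. A scalar model at 8bpp, with
the rounding assumed to match libgav1's convention:

```cpp
#include <algorithm>
#include <cstdint>

constexpr int kInterPostRoundBit = 4;

// Scalar model of AverageBlend4Row/AverageBlend8Row at 8bpp: sum the two
// predictions, round-shift by kInterPostRoundBit + 1, then clip to [0, 255].
inline uint8_t AverageBlendPixel(int16_t pred_0, int16_t pred_1) {
  const int sum = pred_0 + pred_1;
  const int rounded =
      (sum + (1 << kInterPostRoundBit)) >> (kInterPostRoundBit + 1);
  return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
}
```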
diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc
index d41dc38..01a2b9f 100644
--- a/src/dsp/x86/cdef_avx2.cc
+++ b/src/dsp/x86/cdef_avx2.cc
@@ -269,8 +269,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
       _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
 }
 
-LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
-                                      __m256i* partial) {
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m256i* partial) {
   // 8x8 input
   // 00 01 02 03 04 05 06 07
   // 10 11 12 13 14 15 16 17
@@ -451,8 +451,10 @@ inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
   cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
 }
 
-void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
-                        uint8_t* const direction, int* const variance) {
+void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -500,8 +502,9 @@ void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
 // CdefFilter
 
 // Load 4 vectors based on the given |direction|.
-inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
-                          __m128i* output, const int direction) {
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
   // Each |direction| describes a different set of source values. Expand this
   // set by negating each entry. For |direction| == 0 this gives a diagonal line
   // from top right to bottom left. The first value is y, the second x. Negative
@@ -525,8 +528,9 @@ inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
 
 // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
 // do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
-                    __m128i* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
   const int y_0 = kCdefDirections[direction][0][0];
   const int x_0 = kCdefDirections[direction][0][1];
   const int y_1 = kCdefDirections[direction][1][0];
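As the indexing above shows, kCdefDirections stores a (y, x) offset for each
of the two taps of each direction; the four loaded vectors are those taps
plus their negations, giving samples on both sides of the filtered pixel
along the chosen direction. A sketch of the addressing only, with the extern
declaration as a stand-in for the real table:

```cpp
#include <cstddef>
#include <cstdint>

// Stand-in declaration; the real table lives in libgav1's dsp constants and
// is indexed as [direction][tap][0 = y, 1 = x] in the code above.
extern const int8_t kCdefDirections[8][2][2];

// Sketch of the addressing done by LoadDirection: fetch the two taps at
// +(y, x) offsets and their negations.
inline void DirectionTapPointers(const uint16_t* src, ptrdiff_t stride,
                                 int direction, const uint16_t* taps[4]) {
  const int y_0 = kCdefDirections[direction][0][0];
  const int x_0 = kCdefDirections[direction][0][1];
  const int y_1 = kCdefDirections[direction][1][0];
  const int x_1 = kCdefDirections[direction][1][1];
  taps[0] = src + y_0 * stride + x_0;
  taps[1] = src - y_0 * stride - x_0;
  taps[2] = src + y_1 * stride + x_1;
  taps[3] = src - y_1 * stride - x_1;
}
```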
@@ -569,11 +573,11 @@ inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
 }
 
 template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride,
-                     const int height, const int primary_strength,
-                     const int secondary_strength, const int damping,
-                     const int direction, void* dest,
-                     const ptrdiff_t dst_stride) {
+void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
   static_assert(width == 8 || width == 4, "Invalid CDEF width.");
   static_assert(enable_primary || enable_secondary, "");
   constexpr bool clipping_required = enable_primary && enable_secondary;
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
index 6ede778..6c48844 100644
--- a/src/dsp/x86/cdef_sse4.cc
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -241,8 +241,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
 }
 
-LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
-                                      __m128i* partial_lo,
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m128i* partial_lo,
                                       __m128i* partial_hi) {
   // 8x8 input
   // 00 01 02 03 04 05 06 07
@@ -395,8 +395,10 @@ inline uint32_t SquareSum_S16(const __m128i a) {
   return SumVector_S32(square);
 }
 
-void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
-                          uint8_t* const direction, int* const variance) {
+void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
+                          ptrdiff_t stride,
+                          uint8_t* LIBGAV1_RESTRICT const direction,
+                          int* LIBGAV1_RESTRICT const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -438,8 +440,9 @@ void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
 // CdefFilter
 
 // Load 4 vectors based on the given |direction|.
-inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
-                          __m128i* output, const int direction) {
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
   // Each |direction| describes a different set of source values. Expand this
   // set by negating each set. For |direction| == 0 this gives a diagonal line
   // from top right to bottom left. The first value is y, the second x. Negative
@@ -463,8 +466,9 @@ inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
 
 // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
 // do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
-                    __m128i* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
   const int y_0 = kCdefDirections[direction][0][0];
   const int x_0 = kCdefDirections[direction][0][1];
   const int y_1 = kCdefDirections[direction][1][0];
@@ -507,10 +511,11 @@ inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
 }
 
 template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
-                       const int height, const int primary_strength,
-                       const int secondary_strength, const int damping,
-                       const int direction, void* dest,
+void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride, const int height,
+                       const int primary_strength, const int secondary_strength,
+                       const int damping, const int direction,
+                       void* LIBGAV1_RESTRICT dest,
                        const ptrdiff_t dst_stride) {
   static_assert(width == 8 || width == 4, "Invalid CDEF width.");
   static_assert(enable_primary || enable_secondary, "");
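
The LoadDirection helpers implement the gather described in the comment
above: take the two (y, x) tap offsets that kCdefDirections stores for
|direction|, then load one row at each offset and one at its negation. A
scalar sketch of the addressing only (sign conventions illustrative, not
copied from libgav1):

  // Hypothetical scalar equivalent of LoadDirection's address math.
  void LoadDirectionAddresses(const uint16_t* src, ptrdiff_t stride,
                              const uint16_t* out[4], int direction) {
    const int y_0 = kCdefDirections[direction][0][0];
    const int x_0 = kCdefDirections[direction][0][1];
    const int y_1 = kCdefDirections[direction][1][0];
    const int x_1 = kCdefDirections[direction][1][1];
    out[0] = src + y_0 * stride + x_0;  // first tap
    out[1] = src - y_0 * stride - x_0;  // first tap, negated
    out[2] = src + y_1 * stride + x_1;  // second tap
    out[3] = src - y_1 * stride - x_1;  // second tap, negated
  }

The LoadDirection4 variant does the same addressing but packs two rows per
vector, so a 4-wide block is filtered two rows at a time.
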
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 2ecb77c..4126ca9 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -127,10 +127,11 @@ __m256i HorizontalTaps8To16(const __m256i* const src,
 // Filter 2xh sizes.
 template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int /*width*/, const int height,
-                      const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int /*width*/,
+                      const int height, const __m128i* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
 
@@ -195,10 +196,11 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
 // Filter widths >= 4.
 template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const __m256i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m256i* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
 
@@ -467,7 +469,8 @@ __m256i SimpleSum2DVerticalTaps(const __m256i* const src,
 }
 
 template <int num_taps, bool is_compound = false>
-void Filter2DVertical16xH(const uint16_t* src, void* const dst,
+void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
+                          void* LIBGAV1_RESTRICT const dst,
                           const ptrdiff_t dst_stride, const int width,
                           const int height, const __m256i* const taps) {
   assert(width >= 8);
@@ -542,9 +545,10 @@ void Filter2DVertical16xH(const uint16_t* src, void* const dst,
 
 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   assert(filter_id != 0);
   __m128i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -567,9 +571,10 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
 
 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   assert(filter_id != 0);
   __m256i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -602,13 +607,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
   }
 }
 
-void Convolve2D_AVX2(const void* const reference,
+void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
                      const ptrdiff_t reference_stride,
                      const int horizontal_filter_index,
                      const int vertical_filter_index,
                      const int horizontal_filter_id,
                      const int vertical_filter_id, const int width,
-                     const int height, void* prediction,
+                     const int height, void* LIBGAV1_RESTRICT prediction,
                      const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -774,10 +779,11 @@ __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
 }
 
 template <int filter_index, bool is_compound = false>
-void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
-                        void* const dst, const ptrdiff_t dst_stride,
-                        const int width, const int height,
-                        const __m256i* const v_tap) {
+void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int width,
+                        const int height, const __m256i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -856,10 +862,11 @@ void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 template <int filter_index, bool is_compound = false>
-void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
-                        void* const dst, const ptrdiff_t dst_stride,
-                        const int /*width*/, const int height,
-                        const __m256i* const v_tap) {
+void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int /*width*/,
+                        const int height, const __m256i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -958,10 +965,11 @@ void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 template <int filter_index, bool is_compound = false>
-void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int /*width*/, const int height,
-                       const __m256i* const v_tap) {
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m256i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -1055,10 +1063,11 @@ void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 template <int filter_index, bool is_compound = false>
-void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int /*width*/, const int height,
-                       const __m128i* const v_tap) {
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m128i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -1119,13 +1128,13 @@ void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
   } while (--y != 0);
 }
 
-void ConvolveVertical_AVX2(const void* const reference,
+void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
                            const ptrdiff_t reference_stride,
                            const int /*horizontal_filter_index*/,
                            const int vertical_filter_index,
                            const int /*horizontal_filter_id*/,
                            const int vertical_filter_id, const int width,
-                           const int height, void* prediction,
+                           const int height, void* LIBGAV1_RESTRICT prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
@@ -1257,11 +1266,11 @@ void ConvolveVertical_AVX2(const void* const reference,
 }
 
 void ConvolveCompoundVertical_AVX2(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*horizontal_filter_id*/, const int vertical_filter_id,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -1366,14 +1375,12 @@ void ConvolveCompoundVertical_AVX2(
   }
 }
 
-void ConvolveHorizontal_AVX2(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int /*vertical_filter_index*/,
-                             const int horizontal_filter_id,
-                             const int /*vertical_filter_id*/, const int width,
-                             const int height, void* prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -1390,11 +1397,11 @@ void ConvolveHorizontal_AVX2(const void* const reference,
 }
 
 void ConvolveCompoundHorizontal_AVX2(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -1415,14 +1422,12 @@ void ConvolveCompoundHorizontal_AVX2(
       filter_index);
 }
 
-void ConvolveCompound2D_AVX2(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int vertical_filter_index,
-                             const int horizontal_filter_id,
-                             const int vertical_filter_id, const int width,
-                             const int height, void* prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveCompound2D_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
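
The reason the convolve kernels are annotated is aliasing: without the
qualifier the compiler must assume a store through |prediction| can modify
the bytes |reference| points at, and so must reload source data after every
store. A tiny illustration of the promise being made (not libgav1 code):

  // With LIBGAV1_RESTRICT the compiler may load src[0] once and keep it in
  // a register for the whole loop; without it, every store to dst could
  // alias src and force a reload.
  void AddFirst(const uint8_t* LIBGAV1_RESTRICT src,
                uint8_t* LIBGAV1_RESTRICT dst, int n) {
    for (int i = 0; i < n; ++i) {
      dst[i] = static_cast<uint8_t>(dst[i] + src[0]);
    }
  }

The flip side is that restrict is a promise, not a check: calling one of
these kernels with overlapping reference and prediction buffers becomes
undefined behavior, so the annotations are only safe where callers
guarantee distinct buffers.
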
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index 9b72fe4..36bd06b 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -37,7 +37,7 @@ namespace {
 #include "src/dsp/x86/convolve_sse4.inc"
 
 template <int filter_index>
-__m128i SumHorizontalTaps(const uint8_t* const src,
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                           const __m128i* const v_tap) {
   __m128i v_src[4];
   const __m128i src_long = LoadUnaligned16(src);
@@ -68,7 +68,7 @@ __m128i SumHorizontalTaps(const uint8_t* const src,
 }
 
 template <int filter_index>
-__m128i SimpleHorizontalTaps(const uint8_t* const src,
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                              const __m128i* const v_tap) {
   __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
 
@@ -84,7 +84,7 @@ __m128i SimpleHorizontalTaps(const uint8_t* const src,
 }
 
 template <int filter_index>
-__m128i HorizontalTaps8To16(const uint8_t* const src,
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
                             const __m128i* const v_tap) {
   const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
 
@@ -93,10 +93,11 @@ __m128i HorizontalTaps8To16(const uint8_t* const src,
 
 template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m128i* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
 
@@ -206,9 +207,10 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
 
 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   assert(filter_id != 0);
   __m128i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -241,13 +243,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
   }
 }
 
-void Convolve2D_SSE4_1(const void* const reference,
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
                        const ptrdiff_t reference_stride,
                        const int horizontal_filter_index,
                        const int vertical_filter_index,
                        const int horizontal_filter_id,
                        const int vertical_filter_id, const int width,
-                       const int height, void* prediction,
+                       const int height, void* LIBGAV1_RESTRICT prediction,
                        const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -328,10 +330,11 @@ void Convolve2D_SSE4_1(const void* const reference,
 }
 
 template <int filter_index, bool is_compound = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
-                    void* const dst, const ptrdiff_t dst_stride,
-                    const int width, const int height,
-                    const __m128i* const v_tap) {
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const __m128i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -400,14 +403,12 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
   } while (x < width);
 }
 
-void ConvolveVertical_SSE4_1(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int /*horizontal_filter_index*/,
-                             const int vertical_filter_index,
-                             const int /*horizontal_filter_id*/,
-                             const int vertical_filter_id, const int width,
-                             const int height, void* prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveVertical_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -477,14 +478,12 @@ void ConvolveVertical_SSE4_1(const void* const reference,
   }
 }
 
-void ConvolveCompoundCopy_SSE4(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int /*horizontal_filter_index*/,
-                               const int /*vertical_filter_index*/,
-                               const int /*horizontal_filter_id*/,
-                               const int /*vertical_filter_id*/,
-                               const int width, const int height,
-                               void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundCopy_SSE4(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -539,11 +538,11 @@ void ConvolveCompoundCopy_SSE4(const void* const reference,
 }
 
 void ConvolveCompoundVertical_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*horizontal_filter_id*/, const int vertical_filter_id,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -608,14 +607,12 @@ void ConvolveCompoundVertical_SSE4_1(
   }
 }
 
-void ConvolveHorizontal_SSE4_1(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int horizontal_filter_index,
-                               const int /*vertical_filter_index*/,
-                               const int horizontal_filter_id,
-                               const int /*vertical_filter_id*/,
-                               const int width, const int height,
-                               void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -626,11 +623,11 @@ void ConvolveHorizontal_SSE4_1(const void* const reference,
 }
 
 void ConvolveCompoundHorizontal_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -640,14 +637,12 @@ void ConvolveCompoundHorizontal_SSE4_1(
       filter_index);
 }
 
-void ConvolveCompound2D_SSE4_1(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int horizontal_filter_index,
-                               const int vertical_filter_index,
-                               const int horizontal_filter_id,
-                               const int vertical_filter_id, const int width,
-                               const int height, void* prediction,
-                               const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
   alignas(16) uint16_t
@@ -835,7 +830,8 @@ inline void GetHalfSubPixelFilter(__m128i* output) {
 // exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
 // |step_x|.
 template <int num_taps, int grade_x>
-inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+                                 const __m128i src_indices,
                                  __m128i* const source /*[num_taps >> 1]*/) {
   // |used_bytes| is only computed in msan builds. Mask away unused bytes for
   // msan because it incorrectly models the outcome of the shuffles in some
@@ -900,10 +896,11 @@ inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
 }
 
 template <int grade_x, int filter_index, int num_taps>
-inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
-                                    int width, int subpixel_x, int step_x,
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+                                    ptrdiff_t src_stride, int width,
+                                    int subpixel_x, int step_x,
                                     int intermediate_height,
-                                    int16_t* intermediate) {
+                                    int16_t* LIBGAV1_RESTRICT intermediate) {
   // Account for the 0-taps that precede the 2 nonzero taps.
   const int kernel_offset = (8 - num_taps) >> 1;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -976,7 +973,8 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
 }
 
 template <int num_taps>
-inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+                                __m128i* output) {
   // Avoid overreading the filter due to starting at kernel_offset.
   // The only danger of overread is in the final filter, which has 4 taps.
   const __m128i filter =
@@ -1072,10 +1070,11 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
 // |width_class| is 2, 4, or 8, according to the Store function that should be
 // used.
 template <int num_taps, int width_class, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
-                                  const int subpixel_y, const int filter_index,
-                                  const int step_y, const int height,
-                                  void* dest, const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height, void* LIBGAV1_RESTRICT dest,
+                                  const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   constexpr int kernel_offset = (8 - num_taps) / 2;
   const int16_t* src_y = src;
@@ -1168,13 +1167,13 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
 }
 
 template <bool is_compound>
-void ConvolveScale2D_SSE4_1(const void* const reference,
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
                             const ptrdiff_t reference_stride,
                             const int horizontal_filter_index,
                             const int vertical_filter_index,
                             const int subpixel_x, const int subpixel_y,
                             const int step_x, const int step_y, const int width,
-                            const int height, void* prediction,
+                            const int height, void* LIBGAV1_RESTRICT prediction,
                             const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -1342,16 +1341,18 @@ void ConvolveScale2D_SSE4_1(const void* const reference,
   }
 }
 
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                              uint8_t* LIBGAV1_RESTRICT dst) {
   const __m128i left = LoadUnaligned16(src);
   const __m128i right = LoadUnaligned16(src + 1);
   StoreUnaligned16(dst, _mm_avg_epu8(left, right));
 }
 
 template <int width>
-inline void IntraBlockCopyHorizontal(const uint8_t* src,
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                                      const ptrdiff_t src_stride,
-                                     const int height, uint8_t* dst,
+                                     const int height,
+                                     uint8_t* LIBGAV1_RESTRICT dst,
                                      const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
@@ -1392,10 +1393,11 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
 }
 
 void ConvolveIntraBlockCopyHorizontal_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+    const int /*subpixel_y*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
 
@@ -1464,9 +1466,10 @@ void ConvolveIntraBlockCopyHorizontal_SSE4_1(
 }
 
 template <int width>
-inline void IntraBlockCopyVertical(const uint8_t* src,
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
                                    const ptrdiff_t src_stride, const int height,
-                                   uint8_t* dst, const ptrdiff_t dst_stride) {
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
   __m128i row[8], below[8];
@@ -1553,11 +1556,11 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
 }
 
 void ConvolveIntraBlockCopyVertical_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
 
@@ -1622,7 +1625,8 @@ void ConvolveIntraBlockCopyVertical_SSE4_1(
 }
 
 // Load then add two uint8_t vectors. Return the uint16_t vector result.
-inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
+                                const uint8_t* LIBGAV1_RESTRICT src1) {
   const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
   const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
   return _mm_add_epi16(a, b);
@@ -1637,8 +1641,9 @@ inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
 }
 
 template <int width>
-inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
-                             const int height, uint8_t* dst,
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+                             const ptrdiff_t src_stride, const int height,
+                             uint8_t* LIBGAV1_RESTRICT dst,
                              const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
@@ -1793,11 +1798,11 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
 }
 
 void ConvolveIntraBlockCopy2D_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
   // Note: allow vertical access to height + 1. Because this function is only
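
The IntraBlockCopy helpers above vectorize AV1's half-pel intra block copy
filter, which is a plain (1, 1) average: the horizontal path computes
(left + right + 1) >> 1, which is exactly what _mm_avg_epu8 does, and the
2D path sums the 2x2 neighborhood with round-to-nearest. A scalar sketch of
the 2D per-pixel math, assuming the +2 rounding implied by the
AddU16RightShift2AndPack helper:

  // Hypothetical scalar form of IntraBlockCopy2D's per-pixel computation.
  uint8_t IntraBlockCopy2DPixel(const uint8_t* src, ptrdiff_t stride, int x) {
    const int sum = src[x] + src[x + 1] +                   // current row
                    src[stride + x] + src[stride + x + 1];  // row below
    return static_cast<uint8_t>((sum + 2) >> 2);
  }
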
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index 3c29b19..c813df4 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -54,8 +54,10 @@ inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
 
 template <int height>
 inline void DistanceWeightedBlend4xH_SSE4_1(
-    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
 
@@ -98,8 +100,10 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
 
 template <int height>
 inline void DistanceWeightedBlend8xH_SSE4_1(
-    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
 
@@ -125,9 +129,10 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
 }
 
 inline void DistanceWeightedBlendLarge_SSE4_1(
-    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, const int width, const int height, void* const dest,
-    const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
 
@@ -154,11 +159,12 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
   } while (--y != 0);
 }
 
-void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
-                                  const void* prediction_1,
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
                                   const uint8_t weight_0,
                                   const uint8_t weight_1, const int width,
-                                  const int height, void* const dest,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
                                   const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -257,8 +263,10 @@ inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
 
 template <int height>
 inline void DistanceWeightedBlend4xH_SSE4_1(
-    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const __m128i weight0 = _mm_set1_epi32(weight_0);
   const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -301,8 +309,10 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
 
 template <int height>
 inline void DistanceWeightedBlend8xH_SSE4_1(
-    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const __m128i weight0 = _mm_set1_epi32(weight_0);
   const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -332,9 +342,10 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
 }
 
 inline void DistanceWeightedBlendLarge_SSE4_1(
-    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, const int width, const int height, void* const dest,
-    const ptrdiff_t dest_stride) {
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const __m128i weight0 = _mm_set1_epi32(weight_0);
   const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -364,11 +375,12 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
   } while (--y != 0);
 }
 
-void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
-                                  const void* prediction_1,
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
                                   const uint8_t weight_0,
                                   const uint8_t weight_1, const int width,
-                                  const int height, void* const dest,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
                                   const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
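
Both bitdepth variants in this file compute the same distance-weighted
compound blend: a weighted average of two higher-precision compound
predictions, rounded and clipped back to the pixel range. A scalar sketch
under assumed constants (kRoundBits and the clip bound are stand-ins, not
libgav1's exact rounding derivation):

  #include <algorithm>
  #include <cstdint>

  // Hypothetical scalar form of the blend for 8bpp. In AV1 the two weights
  // sum to 16; kRoundBits folds that /16 together with an assumed
  // post-rounding shift.
  uint8_t BlendPixel(int pred_0, int pred_1, int weight_0, int weight_1) {
    constexpr int kRoundBits = 8;  // assumption, see note above
    const int blended = (pred_0 * weight_0 + pred_1 * weight_1 +
                         (1 << (kRoundBits - 1))) >> kRoundBits;
    return static_cast<uint8_t>(std::clamp(blended, 0, 255));
  }
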
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
index 745c1ca..f2b8c21 100644
--- a/src/dsp/x86/film_grain_sse4.cc
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -162,11 +162,11 @@ inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
 
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageLuma_SSE4_1(
-    const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
-    int width, int height, int start_height,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
     const uint8_t scaling_lut_y[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
-    ptrdiff_t dest_stride_y) {
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    void* LIBGAV1_RESTRICT dest_plane_y, ptrdiff_t dest_stride_y) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
   const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
@@ -216,9 +216,10 @@ void BlendNoiseWithImageLuma_SSE4_1(
 
 template <int bitdepth, typename GrainType, typename Pixel>
 inline __m128i BlendChromaValsWithCfl(
-    const Pixel* average_luma_buffer,
+    const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
     const uint8_t scaling_lut[kScalingLookupTableSize],
-    const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+    const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
     const __m128i scaling_shift) {
   const __m128i scaling =
       GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
@@ -233,10 +234,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
     const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
     int width, int height, int start_height, int subsampling_x,
     int subsampling_y, int scaling_shift,
-    const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
-    ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
-    ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
-    ptrdiff_t dest_stride) {
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const Pixel* LIBGAV1_RESTRICT in_chroma_row, ptrdiff_t source_stride_chroma,
+    Pixel* LIBGAV1_RESTRICT out_chroma_row, ptrdiff_t dest_stride) {
   const __m128i floor = _mm_set1_epi16(min_value);
   const __m128i ceiling = _mm_set1_epi16(max_chroma);
   alignas(16) Pixel luma_buffer[16];
@@ -306,13 +307,13 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
 template <int bitdepth, typename GrainType, typename Pixel>
 void BlendNoiseWithImageChromaWithCfl_SSE4_1(
-    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
-    int min_value, int max_chroma, int width, int height, int start_height,
-    int subsampling_x, int subsampling_y,
-    const uint8_t scaling_lut[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y,
-    const void* source_plane_uv, ptrdiff_t source_stride_uv,
-    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* LIBGAV1_RESTRICT source_plane_uv, ptrdiff_t source_stride_uv,
+    void* LIBGAV1_RESTRICT dest_plane_uv, ptrdiff_t dest_stride_uv) {
   const auto* noise_image =
       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
   const auto* in_y = static_cast<const Pixel*>(source_plane_y);
@@ -336,9 +337,9 @@ namespace {
 // |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
 inline __m128i BlendChromaValsNoCfl8bpp(
     const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig,
-    const int8_t* noise_image_cursor, const __m128i& average_luma,
-    const __m128i& scaling_shift, const __m128i& offset,
-    const __m128i& weights) {
+    const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+    const __m128i& average_luma, const __m128i& scaling_shift,
+    const __m128i& offset, const __m128i& weights) {
   uint8_t merged_buffer[8];
   const __m128i combined_lo =
       _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
@@ -362,9 +363,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
     int width, int height, int start_height, int subsampling_x,
     int subsampling_y, int scaling_shift, int chroma_offset,
     int chroma_multiplier, int luma_multiplier,
-    const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
-    ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
-    ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const uint8_t* LIBGAV1_RESTRICT in_chroma_row,
+    ptrdiff_t source_stride_chroma, uint8_t* LIBGAV1_RESTRICT out_chroma_row,
     ptrdiff_t dest_stride) {
   const __m128i floor = _mm_set1_epi16(min_value);
   const __m128i ceiling = _mm_set1_epi16(max_chroma);
@@ -432,13 +434,13 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
 
 // This function is for the case params_.chroma_scaling_from_luma == false.
 void BlendNoiseWithImageChroma8bpp_SSE4_1(
-    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
-    int min_value, int max_chroma, int width, int height, int start_height,
-    int subsampling_x, int subsampling_y,
-    const uint8_t scaling_lut[kScalingLookupTableSize],
-    const void* source_plane_y, ptrdiff_t source_stride_y,
-    const void* source_plane_uv, ptrdiff_t source_stride_uv,
-    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+    const void* LIBGAV1_RESTRICT source_plane_uv, ptrdiff_t source_stride_uv,
+    void* LIBGAV1_RESTRICT dest_plane_uv, ptrdiff_t dest_stride_uv) {
   assert(plane == kPlaneU || plane == kPlaneV);
   const auto* noise_image =
       static_cast<const Array2D<int8_t>*>(noise_image_ptr);
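
All of the film grain blend kernels in this file share one per-pixel
formula: look up a scaling factor from the source pixel, scale the grain
sample by it, round by scaling_shift, add to the source, and clip. A scalar
sketch for 8bpp luma, assuming a 256-entry scaling table (the chroma paths
differ only in how the lookup index is derived):

  #include <algorithm>
  #include <cstdint>

  // Hypothetical scalar form of the luma noise blend.
  uint8_t BlendLumaPixel(uint8_t orig, int8_t noise,
                         const uint8_t scaling_lut[256], int scaling_shift,
                         int min_value, int max_luma) {
    const int scaled = (scaling_lut[orig] * noise +
                        (1 << (scaling_shift - 1))) >> scaling_shift;
    return static_cast<uint8_t>(
        std::clamp(orig + scaled, min_value, max_luma));
  }
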
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
index f2dcfdb..eb7e466 100644
--- a/src/dsp/x86/intrapred_cfl_sse4.cc
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -88,7 +88,7 @@ inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
 
 template <int width, int height>
 void CflIntraPredictor_SSE4_1(
-    void* const dest, ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
@@ -127,7 +127,8 @@ void CflIntraPredictor_SSE4_1(
 template <int block_height_log2, bool is_inside>
 void CflSubsampler444_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   static_assert(block_height_log2 <= 4, "");
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
@@ -189,7 +190,7 @@ template <int block_height_log2>
 void CflSubsampler444_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_height_log2 <= 4, "");
   assert(max_luma_width >= 4);
   assert(max_luma_height >= 4);
@@ -209,7 +210,7 @@ template <int block_height_log2, bool inside>
 void CflSubsampler444_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_height_log2 <= 5, "");
   const int block_height = 1 << block_height_log2, block_width = 8;
   const int visible_height = max_luma_height;
@@ -292,7 +293,7 @@ template <int block_height_log2>
 void CflSubsampler444_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_height_log2 <= 5, "");
   assert(max_luma_width >= 4);
   assert(max_luma_height >= 4);
@@ -315,7 +316,7 @@ template <int block_width_log2, int block_height_log2, bool inside>
 void CflSubsampler444_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
   static_assert(block_height_log2 <= 5, "");
   assert(max_luma_width >= 4);
@@ -418,7 +419,7 @@ template <int block_width_log2, int block_height_log2>
 void CflSubsampler444_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
   static_assert(block_height_log2 <= 5, "");
   assert(max_luma_width >= 4);
@@ -441,7 +442,7 @@ template <int block_height_log2>
 void CflSubsampler420_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int /*max_luma_width*/, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const auto* src = static_cast<const uint8_t*>(source);
   int16_t* luma_ptr = luma[0];
@@ -511,7 +512,7 @@ template <int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int /*max_luma_width*/, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const auto* src = static_cast<const uint8_t*>(source);
   const __m128i zero = _mm_setzero_si128();
@@ -620,7 +621,7 @@ template <int block_height_log2>
 void CflSubsampler420_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   if (max_luma_width == 8) {
     CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
         luma, max_luma_width, max_luma_height, source, stride);
@@ -634,7 +635,7 @@ template <int block_width_log2, int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_WxH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int /*max_luma_width*/, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const auto* src = static_cast<const uint8_t*>(source);
   const __m128i zero = _mm_setzero_si128();
   __m128i final_sum = zero;
@@ -751,7 +752,7 @@ template <int block_width_log2, int block_height_log2>
 void CflSubsampler420_WxH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   switch (max_luma_width) {
     case 8:
       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
@@ -968,7 +969,7 @@ inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
 
 template <int width, int height>
 void CflIntraPredictor_10bpp_SSE4_1(
-    void* const dest, ptrdiff_t stride,
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   constexpr int kCflLumaBufferStrideLog2_16i = 5;
@@ -1018,7 +1019,8 @@ void CflIntraPredictor_10bpp_SSE4_1(
 template <int block_height_log2, bool is_inside>
 void CflSubsampler444_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   static_assert(block_height_log2 <= 4, "");
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
@@ -1079,7 +1081,7 @@ template <int block_height_log2>
 void CflSubsampler444_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_cast<void>(max_luma_width);
   static_cast<void>(max_luma_height);
   static_assert(block_height_log2 <= 4, "");
@@ -1099,7 +1101,8 @@ void CflSubsampler444_4xH_SSE4_1(
 template <int block_height_log2, bool is_inside>
 void CflSubsampler444_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
@@ -1158,7 +1161,7 @@ template <int block_height_log2>
 void CflSubsampler444_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_cast<void>(max_luma_width);
   static_cast<void>(max_luma_height);
   static_assert(block_height_log2 <= 5, "");
@@ -1182,7 +1185,7 @@ template <int block_width_log2, int block_height_log2, bool is_inside>
 void CflSubsampler444_WxH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
   const int block_width = 1 << block_width_log2;
@@ -1278,7 +1281,7 @@ template <int block_width_log2, int block_height_log2>
 void CflSubsampler444_WxH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   static_assert(block_width_log2 == 4 || block_width_log2 == 5,
                 "This function will only work for block_width 16 and 32.");
   static_assert(block_height_log2 <= 5, "");
@@ -1300,7 +1303,7 @@ template <int block_height_log2>
 void CflSubsampler420_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int /*max_luma_width*/, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const auto* src = static_cast<const uint16_t*>(source);
   const ptrdiff_t src_stride = stride / sizeof(src[0]);
@@ -1371,7 +1374,8 @@ void CflSubsampler420_4xH_SSE4_1(
 template <int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   const int block_height = 1 << block_height_log2;
   const auto* src = static_cast<const uint16_t*>(source);
   const ptrdiff_t src_stride = stride / sizeof(src[0]);
@@ -1483,7 +1487,7 @@ template <int block_height_log2>
 void CflSubsampler420_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   if (max_luma_width == 8) {
     CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
                                                           source, stride);
@@ -1496,7 +1500,8 @@ void CflSubsampler420_8xH_SSE4_1(
 template <int block_width_log2, int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_WxH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+    ptrdiff_t stride) {
   const auto* src = static_cast<const uint16_t*>(source);
   const ptrdiff_t src_stride = stride / sizeof(src[0]);
   const __m128i zero = _mm_setzero_si128();
@@ -1615,7 +1620,7 @@ template <int block_width_log2, int block_height_log2>
 void CflSubsampler420_WxH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
   switch (max_luma_width) {
     case 8:
       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
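Note on the recurring change: every hunk in this series adds the LIBGAV1_RESTRICT
qualifier to pointer parameters whose buffers the DSP contract guarantees never
overlap (the destination is always distinct from the source, top row, and left
column). A minimal sketch of the idea, assuming the usual per-compiler dispatch;
the exact header and spelling in libgav1 may differ:

  #include <cstdint>

  #if defined(_MSC_VER)
  #define LIBGAV1_RESTRICT __restrict
  #else
  #define LIBGAV1_RESTRICT __restrict__
  #endif

  // With the qualifier, the compiler may assume |dst| and |src| never alias,
  // so it can keep values in registers and vectorize the loop without
  // emitting runtime overlap checks.
  void CopyRow(uint8_t* LIBGAV1_RESTRICT dst,
               const uint8_t* LIBGAV1_RESTRICT src, int width) {
    for (int x = 0; x < width; ++x) dst[x] = src[x];
  }

Calling such a function with overlapping buffers would be undefined behavior,
which is why only parameters the DSP interface already documents as distinct
gain the annotation.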
diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc
index 022af8d..a43a5cf 100644
--- a/src/dsp/x86/intrapred_filter_sse4.cc
+++ b/src/dsp/x86/intrapred_filter_sse4.cc
@@ -64,10 +64,10 @@ constexpr int kDuplicateFirstHalf = 0x44;
 // at zero to preserve the sum.
 // |pixels| contains p0-p7 in order as shown above.
 // |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
-inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
-                             const __m128i& pixels, const __m128i& taps_0_1,
-                             const __m128i& taps_2_3, const __m128i& taps_4_5,
-                             const __m128i& taps_6_7) {
+inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst,
+                             const ptrdiff_t stride, const __m128i& pixels,
+                             const __m128i& taps_0_1, const __m128i& taps_2_3,
+                             const __m128i& taps_4_5, const __m128i& taps_6_7) {
   const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
   const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
   // |output_half| contains 8 partial sums for f0-f7.
@@ -93,10 +93,10 @@ inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
 // for successive blocks. This implementation takes advantage of the fact
 // that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
 // using shifts to arrange things to fit reusable shuffle vectors.
-inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
-                      const uint8_t* const top_ptr,
-                      const uint8_t* const left_ptr, FilterIntraPredictor pred,
-                      const int height) {
+inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride,
+                      const uint8_t* LIBGAV1_RESTRICT const top_ptr,
+                      const uint8_t* LIBGAV1_RESTRICT const left_ptr,
+                      FilterIntraPredictor pred, const int height) {
   // Two filter kernels per vector.
   const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
   const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
@@ -271,9 +271,10 @@ inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
   }
 }
 
-void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column,
+void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                                 ptrdiff_t stride,
+                                 const void* LIBGAV1_RESTRICT const top_row,
+                                 const void* LIBGAV1_RESTRICT const left_column,
                                  FilterIntraPredictor pred, const int width,
                                  const int height) {
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
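For orientation, a scalar sketch of the 4x2 filter-intra step that
Filter4x2_SSE4_1 vectorizes with _mm_maddubs_epi16. Per the AV1 spec, the eight
outputs f0..f7 of a 4x2 sub-block are signed dot products over the neighboring
pixels, rounded by a shift of 4 and clipped to 8 bits; |taps| and the helper
name below are illustrative stand-ins, not libgav1's exact code:

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>

  void Filter4x2Scalar(uint8_t* dst, ptrdiff_t stride,
                       const uint8_t pixels[8], const int8_t taps[8][8]) {
    for (int i = 0; i < 8; ++i) {  // f0..f3 fill row 0, f4..f7 fill row 1
      int sum = 0;
      for (int j = 0; j < 8; ++j) sum += taps[i][j] * pixels[j];
      const int rounded = (sum + 8) >> 4;  // rounding right shift by 4
      dst[(i >> 2) * stride + (i & 3)] =
          static_cast<uint8_t>(std::clamp(rounded, 0, 255));
    }
  }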
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
index de9f551..b53ee8c 100644
--- a/src/dsp/x86/intrapred_smooth_sse4.cc
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -38,23 +38,12 @@ namespace {
 // to have visibility of the values. This helps reduce loads and in the
 // creation of the inverse weights.
 constexpr uint8_t kSmoothWeights[] = {
-    // block dimension = 4
-    255, 149, 85, 64,
-    // block dimension = 8
-    255, 197, 146, 105, 73, 50, 37, 32,
-    // block dimension = 16
-    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-    // block dimension = 32
-    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-    // block dimension = 64
-    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
-    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
-    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+#include "src/dsp/smooth_weights.inc"
+};
 
 template <int y_mask>
-inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
+inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest,
+                                      const __m128i& left,
                                       const __m128i& weights,
                                       const __m128i& scaled_top_right,
                                       const __m128i& round) {
@@ -77,7 +66,8 @@ inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
   return _mm_add_epi16(scaled_corner, weighted_px);
 }
 
-inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
+inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest,
+                                       const __m128i& pixels,
                                        const __m128i& weights,
                                        const __m128i& scaled_corner,
                                        const __m128i& round) {
@@ -91,13 +81,11 @@ inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
 // For Horizontal, pixels1 and pixels2 are the same repeated value. For
 // Vertical, weights1 and weights2 are the same, and scaled_corner1 and
 // scaled_corner2 are the same.
-inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
-                                        const __m128i& pixels2,
-                                        const __m128i& weights1,
-                                        const __m128i& weights2,
-                                        const __m128i& scaled_corner1,
-                                        const __m128i& scaled_corner2,
-                                        const __m128i& round) {
+inline void WriteSmoothDirectionalSum16(
+    uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1,
+    const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2,
+    const __m128i& scaled_corner1, const __m128i& scaled_corner2,
+    const __m128i& round) {
   const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
   const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
   const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
@@ -109,8 +97,9 @@ inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
 }
 
 template <int y_mask>
-inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
-                                const __m128i& left, const __m128i& weights_x,
+inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
+                                const __m128i& top, const __m128i& left,
+                                const __m128i& weights_x,
                                 const __m128i& weights_y,
                                 const __m128i& scaled_bottom_left,
                                 const __m128i& scaled_top_right,
@@ -135,7 +124,8 @@ inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
 // pixels[0]: above and below_pred interleave vector
 // pixels[1]: left vector
 // pixels[2]: right_pred vector
-inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
+inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+                              const uint8_t* LIBGAV1_RESTRICT left,
                               const int height, __m128i* pixels) {
   if (height == 4) {
     pixels[1] = Load4(left);
@@ -156,8 +146,9 @@ inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
 // weight_h[2]: same as [0], second half for height = 16 only
 // weight_h[3]: same as [1], second half for height = 16 only
 // weight_w[0]: weights_w and scale - weights_w interleave vector
-inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
-                               __m128i* weight_h, __m128i* weight_w) {
+inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
+                               const int height, __m128i* weight_h,
+                               __m128i* weight_w) {
   const __m128i scale = _mm_set1_epi16(256);
   const __m128i x_weights = Load4(weight_array);
   weight_h[0] = _mm_cvtepu8_epi16(x_weights);
@@ -179,7 +170,8 @@ inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
 }
 
 inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
-                               const __m128i* weight_x, uint8_t* dst,
+                               const __m128i* weight_x,
+                               uint8_t* LIBGAV1_RESTRICT dst,
                                const ptrdiff_t stride,
                                const bool use_second_half) {
   const __m128i round = _mm_set1_epi32(256);
@@ -215,8 +207,9 @@ inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
 
 // The interleaving approach has some overhead that causes it to underperform in
 // the 4x4 case.
-void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                      const void* top_row, const void* left_column) {
+void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
   const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
   const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
   const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
@@ -247,8 +240,9 @@ void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
                             scaled_bottom_left, scaled_top_right, scale);
 }
 
-void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                      const void* top_row, const void* left_column) {
+void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   __m128i weights_x[1];
@@ -260,8 +254,10 @@ void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
 }
 
-void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                       const void* top_row, const void* left_column) {
+void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   __m128i weights_x[1];
@@ -283,7 +279,8 @@ void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
 // pixels[5]: above and below_pred interleave vector, second half
 // pixels[6]: left vector + 16
 // pixels[7]: right_pred vector
-inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
+inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
+                              const uint8_t* LIBGAV1_RESTRICT left,
                               const int height, __m128i* pixels) {
   const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
   __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
@@ -317,8 +314,9 @@ inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
 // weight_h[7]: same as [1], offset 24
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
-                               __m128i* weight_w, __m128i* weight_h) {
+inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
+                               const int height, __m128i* weight_w,
+                               __m128i* weight_h) {
   const int offset = (height < 8) ? 0 : 4;
   __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
   weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
@@ -360,7 +358,8 @@ inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
 
 inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
                                const __m128i* weights_y, const int height,
-                               uint8_t* dst, const ptrdiff_t stride,
+                               uint8_t* LIBGAV1_RESTRICT dst,
+                               const ptrdiff_t stride,
                                const bool use_second_half) {
   const __m128i round = _mm_set1_epi32(256);
   const __m128i mask_increment = _mm_set1_epi16(0x0202);
@@ -405,8 +404,9 @@ inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
   }
 }
 
-void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                      const void* top_row, const void* left_column) {
+void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   __m128i pixels[4];
@@ -419,8 +419,9 @@ void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
 }
 
-void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                      const void* top_row, const void* left_column) {
+void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
 
@@ -434,8 +435,10 @@ void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
 }
 
-void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                       const void* top_row, const void* left_column) {
+void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   __m128i pixels[4];
@@ -450,8 +453,10 @@ void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
 }
 
-void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                       const void* top_row, const void* left_column) {
+void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   __m128i pixels[8];
@@ -473,8 +478,9 @@ void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
 }
 
 template <int width, int height>
-void SmoothWxH(void* const dest, const ptrdiff_t stride,
-               const void* const top_row, const void* const left_column) {
+void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+               const void* LIBGAV1_RESTRICT const top_row,
+               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
@@ -532,8 +538,10 @@ void SmoothWxH(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
-                                const void* top_row, const void* left_column) {
+void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
+                                const ptrdiff_t stride,
+                                const void* LIBGAV1_RESTRICT top_row,
+                                const void* LIBGAV1_RESTRICT left_column) {
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
@@ -553,9 +561,10 @@ void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
   WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
 }
 
-void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothHorizontal4x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi32(top[3]);
   const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
@@ -585,9 +594,10 @@ void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
 }
 
-void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column) {
+void SmoothHorizontal4x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi32(top[3]);
   const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
@@ -637,9 +647,10 @@ void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
 }
 
-void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothHorizontal8x4_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[7]);
   const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
@@ -666,9 +677,10 @@ void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
 }
 
-void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothHorizontal8x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[7]);
   const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -686,9 +698,10 @@ void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column) {
+void SmoothHorizontal8x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[7]);
   const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
@@ -714,9 +727,10 @@ void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column) {
+void SmoothHorizontal8x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[7]);
   const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
@@ -756,9 +770,10 @@ void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column) {
+void SmoothHorizontal16x4_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[15]);
   const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
@@ -795,9 +810,10 @@ void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
                               scaled_top_right1, scaled_top_right2, scale);
 }
 
-void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column) {
+void SmoothHorizontal16x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[15]);
   const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -822,9 +838,10 @@ void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal16x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[15]);
   const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -858,9 +875,10 @@ void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal16x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[15]);
   const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -910,9 +928,10 @@ void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal16x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[15]);
   const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -940,9 +959,10 @@ void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column) {
+void SmoothHorizontal32x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[31]);
   const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -978,9 +998,10 @@ void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal32x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[31]);
   const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -1027,9 +1048,10 @@ void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal32x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[31]);
   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
@@ -1096,9 +1118,10 @@ void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal32x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[31]);
   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
@@ -1137,9 +1160,10 @@ void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal64x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[63]);
   const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -1212,9 +1236,10 @@ void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal64x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[63]);
   const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -1315,9 +1340,10 @@ void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                  const void* const top_row,
-                                  const void* const left_column) {
+void SmoothHorizontal64x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const __m128i top_right = _mm_set1_epi16(top[63]);
   const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
@@ -1378,7 +1404,8 @@ void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
+inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+                                      const uint8_t* LIBGAV1_RESTRICT left,
                                       const int height, __m128i* pixels) {
   __m128i top = Load4(above);
   const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
@@ -1390,7 +1417,8 @@ inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
 // (256-w) counterparts. This is precomputed by the compiler when the weights
 // table is visible to this module. Removing this visibility can cut speed by up
 // to half in both 4xH and 8xH transforms.
-inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
+inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT
+                                           weight_array,
                                        const int height, __m128i* weights) {
   const __m128i inverter = _mm_set1_epi16(256);
 
@@ -1413,7 +1441,8 @@ inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
 }
 
 inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
-                                   const int height, uint8_t* dst,
+                                   const int height,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
                                    const ptrdiff_t stride) {
   const __m128i pred_round = _mm_set1_epi32(128);
   const __m128i mask_increment = _mm_set1_epi16(0x0202);
@@ -1438,9 +1467,10 @@ inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
   }
 }
 
-void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                              const void* const top_row,
-                              const void* const left_column) {
+void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint8_t*>(left_column);
   const auto* const above = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1453,9 +1483,10 @@ void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
 }
 
-void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                              const void* const top_row,
-                              const void* const left_column) {
+void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint8_t*>(left_column);
   const auto* const above = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1468,9 +1499,10 @@ void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
 }
 
-void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left = static_cast<const uint8_t*>(left_column);
   const auto* const above = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1485,9 +1517,10 @@ void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
 }
 
-void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                              const void* const top_row,
-                              const void* const left_column) {
+void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
   const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
@@ -1520,9 +1553,10 @@ void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
   WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
 }
 
-void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                              const void* const top_row,
-                              const void* const left_column) {
+void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
   const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
@@ -1544,9 +1578,10 @@ void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
   const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -1583,9 +1618,10 @@ void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const __m128i zero = _mm_setzero_si128();
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
@@ -1649,9 +1685,10 @@ void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
@@ -1694,9 +1731,10 @@ void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
                               scale);
 }
 
-void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
@@ -1722,9 +1760,10 @@ void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical16x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
@@ -1766,9 +1805,10 @@ void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical16x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
@@ -1839,9 +1879,10 @@ void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical16x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
@@ -1887,9 +1928,10 @@ void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
-                               const void* const top_row,
-                               const void* const left_column) {
+void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1922,9 +1964,10 @@ void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical32x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
   auto* dst = static_cast<uint8_t*>(dest);
@@ -1975,9 +2018,10 @@ void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical32x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2063,9 +2107,10 @@ void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical32x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2120,9 +2165,10 @@ void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical64x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2192,9 +2238,10 @@ void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical64x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2311,9 +2358,10 @@ void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
   }
 }
 
-void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
-                                const void* const top_row,
-                                const void* const left_column) {
+void SmoothVertical64x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
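Besides the LIBGAV1_RESTRICT annotations, this file also drops its private copy
of the smooth-weights table in favor of the shared src/dsp/smooth_weights.inc,
so all backends read identical weights while the table stays visible to the
compiler for constant folding. For reference, a scalar version of the SMOOTH
predictor these kernels accelerate, following the AV1 spec formula (the helper
name and parameters are illustrative; weights_w and weights_h correspond to
kSmoothWeights + width - 4 and kSmoothWeights + height - 4 in the code above):

  #include <cstddef>
  #include <cstdint>

  void SmoothScalar(uint8_t* dst, ptrdiff_t stride, const uint8_t* top,
                    const uint8_t* left, const uint8_t* weights_w,
                    const uint8_t* weights_h, int width, int height) {
    const int bottom_left = left[height - 1];  // anchors the vertical blend
    const int top_right = top[width - 1];      // anchors the horizontal blend
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        const int sum = weights_h[y] * top[x] +
                        (256 - weights_h[y]) * bottom_left +
                        weights_w[x] * left[y] +
                        (256 - weights_w[x]) * top_right;
        dst[x] = static_cast<uint8_t>((sum + 256) >> 9);  // Round2(sum, 9)
      }
      dst += stride;
    }
  }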
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
index 063929d..0c6a2f3 100644
--- a/src/dsp/x86/intrapred_sse4.cc
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -90,11 +90,11 @@ struct DirectionalPredFuncs_SSE4_1 {
 
 template <int width_log2, int height_log2, DcSumFunc top_sumfn,
           DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
-                        shiftk, dc_mult>::DcTop(void* const dest,
-                                                ptrdiff_t stride,
-                                                const void* const top_row,
-                                                const void* /*left_column*/) {
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* /*left_column*/) {
   const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
   const __m128i sum = top_sumfn(top_row);
   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
@@ -103,11 +103,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
 
 template <int width_log2, int height_log2, DcSumFunc top_sumfn,
           DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
-                        shiftk,
-                        dc_mult>::DcLeft(void* const dest, ptrdiff_t stride,
-                                         const void* /*top_row*/,
-                                         const void* const left_column) {
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* /*top_row*/,
+                     const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
   const __m128i sum = left_sumfn(left_column);
   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
@@ -116,10 +116,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
 
 template <int width_log2, int height_log2, DcSumFunc top_sumfn,
           DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
-                        shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride,
-                                             const void* const top_row,
-                                             const void* const left_column) {
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                 const void* LIBGAV1_RESTRICT const top_row,
+                 const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i rounder =
       _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
   const __m128i sum_top = top_sumfn(top_row);
@@ -141,8 +142,8 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
 
 template <ColumnStoreFunc col_storefn>
 void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
-    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
-    const void* const left_column) {
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
   col_storefn(dest, stride, left_column);
 }
 
@@ -384,8 +385,9 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
 // ColStoreN<height> copies each of the |height| values in |column| across its
 // corresponding row in dest.
 template <WriteDuplicateFunc writefn>
-inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
-                             const void* const column) {
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
   const __m128i col_data = Load4(column);
   const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
   const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
@@ -393,8 +395,9 @@ inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
 }
 
 template <WriteDuplicateFunc writefn>
-inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
-                             const void* const column) {
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
   const ptrdiff_t stride4 = stride << 2;
   const __m128i col_data = LoadLo8(column);
   const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
@@ -407,8 +410,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
 }
 
 template <WriteDuplicateFunc writefn>
-inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
-                              const void* const column) {
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
   const ptrdiff_t stride4 = stride << 2;
   const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
   const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
@@ -428,8 +432,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
 }
 
 template <WriteDuplicateFunc writefn>
-inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
-                              const void* const column) {
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
   const ptrdiff_t stride4 = stride << 2;
   auto* dst = static_cast<uint8_t*>(dest);
   for (int y = 0; y < 32; y += 16) {
@@ -457,8 +462,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
 }
 
 template <WriteDuplicateFunc writefn>
-inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
-                              const void* const column) {
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
   const ptrdiff_t stride4 = stride << 2;
   auto* dst = static_cast<uint8_t*>(dest);
   for (int y = 0; y < 64; y += 16) {
@@ -574,7 +580,7 @@ struct DirDefs {
 };
 
 template <int y_mask>
-inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                             const __m128i& left, const __m128i& top_lefts,
                             const __m128i& top_dists, const __m128i& left_dists,
                             const __m128i& top_left_diffs) {
@@ -614,7 +620,7 @@ inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
 // could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
 // for the blends.
 template <int y_mask>
-inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                             const __m128i& left, const __m128i& top_lefts,
                             const __m128i& top_dists, const __m128i& left_dists,
                             const __m128i& top_left_diffs) {
@@ -658,7 +664,7 @@ inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
 // |left_dists| is provided alongside its spread out version because it doesn't
 // change between calls and interacts with both kinds of packing.
 template <int y_mask>
-inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                              const __m128i& left, const __m128i& top_lefts,
                              const __m128i& top_dists,
                              const __m128i& left_dists,
@@ -712,8 +718,9 @@ inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
 }
 
-void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
-                     const void* const top_row, const void* const left_column) {
+void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
   const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
 
@@ -742,8 +749,9 @@ void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
                         top_left_diff);
 }
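
 // [Editor's sketch, not part of this patch] Scalar Paeth reference per the
 // AV1 algorithm: predict with whichever of left, top and top-left is closest
 // to base = top + left - top_left. The |top_dists|/|left_dists| vectors in
 // the SIMD code above are these distances; PaethScalarSketch is a
 // hypothetical name.
 inline uint8_t PaethScalarSketch(uint8_t left, uint8_t top,
                                  uint8_t top_left) {
   const int base = left + top - top_left;
   const int left_dist = std::abs(base - left);    // == |top - top_left|
   const int top_dist = std::abs(base - top);      // == |left - top_left|
   const int top_left_dist = std::abs(base - top_left);
   if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
   if (top_dist <= top_left_dist) return top;
   return top_left;
 }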
 
-void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
-                     const void* const top_row, const void* const left_column) {
+void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = LoadLo8(left_column);
   const __m128i left_lo = _mm_cvtepu8_epi32(left);
   const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
@@ -787,9 +795,9 @@ void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
                         top_left_diff);
 }
 
-void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
-                      const void* const top_row,
-                      const void* const left_column) {
+void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = LoadUnaligned16(left_column);
   const __m128i left_0 = _mm_cvtepu8_epi32(left);
   const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
@@ -862,8 +870,9 @@ void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
                         top_left_diff);
 }
 
-void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
-                     const void* const top_row, const void* const left_column) {
+void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -891,8 +900,9 @@ void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
                               top_left_diff);
 }
 
-void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
-                     const void* const top_row, const void* const left_column) {
+void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* LIBGAV1_RESTRICT const top_row,
+                     const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -932,9 +942,9 @@ void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
                               top_left_diff);
 }
 
-void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
-                      const void* const top_row,
-                      const void* const left_column) {
+void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = LoadUnaligned16(left_column);
   const __m128i left_lo = _mm_cvtepu8_epi16(left);
   const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
@@ -1001,18 +1011,18 @@ void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
                               left_dists, top_left_diff);
 }
 
-void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride,
-                      const void* const top_row,
-                      const void* const left_column) {
+void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
   auto* const dst = static_cast<uint8_t*>(dest);
   Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
   Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
 }
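
 // [Editor's sketch, not part of this patch] Paeth8x32_SSE4_1 above
 // illustrates the stacking pattern used for tall blocks: predict two
 // half-height blocks, advancing |dest| by half_height rows and
 // |left_column| by half_height entries while reusing the same top row.
 // StackVerticallySketch is a hypothetical generalization.
 template <typename PredFn>
 inline void StackVerticallySketch(PredFn pred, uint8_t* dst, ptrdiff_t stride,
                                   const uint8_t* top_row,
                                   const uint8_t* left_column,
                                   int half_height) {
   pred(dst, stride, top_row, left_column);
   pred(dst + stride * half_height, stride, top_row,
        left_column + half_height);
 }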
 
-void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
-                      const void* const top_row,
-                      const void* const left_column) {
+void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT const top_row,
+                      const void* LIBGAV1_RESTRICT const left_column) {
   const __m128i left = Load4(left_column);
   const __m128i top = LoadUnaligned16(top_row);
   const __m128i top_lo = _mm_cvtepu8_epi16(top);
@@ -1057,7 +1067,7 @@ void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
 
 // Inlined for calling with offsets in larger transform