From 8c05aae086902386c38318d3fe436e3c9b45e4d5 Mon Sep 17 00:00:00 2001
From: Attila Uygun <auygun@opera.com>
Date: Thu, 10 Nov 2022 21:45:51 +0100
Subject: [PATCH] Update SincResampler

---
 src/base/sinc_resampler.cc | 124 ++++++++++++++++++++++++++++++-------
 src/base/sinc_resampler.h  |  66 ++++++++++++--------
 2 files changed, 141 insertions(+), 49 deletions(-)

diff --git a/src/base/sinc_resampler.cc b/src/base/sinc_resampler.cc
index 1f2985b..223afbe 100644
--- a/src/base/sinc_resampler.cc
+++ b/src/base/sinc_resampler.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
@@ -82,15 +82,19 @@
 #include "base/log.h"
 
 #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
-#include <xmmintrin.h>
-#define CONVOLVE_FUNC Convolve_SSE
+#include <immintrin.h>
+// Including these headers directly should generally be avoided. Since
+// Chrome is compiled with -msse3 (the minimal requirement), we include the
+// headers directly to make the intrinsics available.
+#include <avx2intrin.h>
+#include <avxintrin.h>
+#include <fmaintrin.h>
 #elif defined(_M_ARM64) || defined(__aarch64__)
 #include <arm_neon.h>
-#define CONVOLVE_FUNC Convolve_NEON
-#else
-#define CONVOLVE_FUNC Convolve_C
 #endif
 
+namespace base {
+
 namespace {
 
 constexpr double kPiDouble = 3.14159265358979323846;
@@ -109,7 +113,7 @@ class ScopedSubnormalFloatDisabler {
   ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete;
 
   ~ScopedSubnormalFloatDisabler() {
-#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
     _mm_setcsr(orig_state_);
 #endif
   }
@@ -145,27 +149,52 @@ int CalculateChunkSize(int block_size_, double io_ratio) {
 
 }  // namespace
 
-namespace base {
+// If we know the minimum architecture at compile time, avoid CPU detection.
+void SincResampler::InitializeCPUSpecificFeatures() {
+#if defined(_M_ARM64) || defined(__aarch64__)
+  convolve_proc_ = Convolve_NEON;
+#elif defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
+#if 0  // TODO
+  // Using AVX2 instead of SSE2 when AVX2/FMA3 supported.
+  if (cpu.has_avx2() && cpu.has_fma3())
+    convolve_proc_ = Convolve_AVX2;
+  else if (cpu.has_sse2())
+    convolve_proc_ = Convolve_SSE;
+  else
+    convolve_proc_ = Convolve_C;
+#endif
+  convolve_proc_ = Convolve_SSE;
+#else
+  // Unknown architecture.
+  convolve_proc_ = Convolve_C;
+#endif
+}
 
 SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
     : io_sample_rate_ratio_(io_sample_rate_ratio),
       request_frames_(request_frames),
       input_buffer_size_(request_frames_ + kKernelSize),
-      // Create input buffers with a 16-byte alignment for SSE optimizations.
+      // Create input buffers with a 32-byte alignment for SIMD optimizations.
       kernel_storage_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
       kernel_pre_sinc_storage_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
       kernel_window_storage_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
       input_buffer_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * input_buffer_size_))),
+          base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))),
       r1_(input_buffer_.get()),
       r2_(input_buffer_.get() + kKernelSize / 2) {
-  DCHECK(request_frames_ > 0);
+  CHECK(request_frames > kKernelSize * 3 / 2)
+      << "request_frames must be greater than 1.5 kernels to allow sufficient "
+         "data for resampling";
+  // This means that after the first call to Flush we will have
+  // block_size_ > kKernelSize and r2_ < r3_.
+
+  InitializeCPUSpecificFeatures();
+  DCHECK(convolve_proc_);
+  CHECK(request_frames_ > 0);
   Flush();
-  DCHECK(block_size_ > kKernelSize)
-      << "block_size must be greater than kKernelSize!";
 
   memset(kernel_storage_.get(), 0,
          sizeof(*kernel_storage_.get()) * kKernelStorageSize);
@@ -213,14 +242,14 @@ void SincResampler::InitializeKernel() {
     for (int i = 0; i < kKernelSize; ++i) {
       const int idx = i + offset_idx * kKernelSize;
       const float pre_sinc =
-          kPiFloat * (i - kKernelSize / 2 - subsample_offset);
+          base::kPiFloat * (i - kKernelSize / 2 - subsample_offset);
       kernel_pre_sinc_storage_[idx] = pre_sinc;
 
       // Compute Blackman window, matching the offset of the sinc().
       const float x = (i - subsample_offset) / kKernelSize;
       const float window =
-          static_cast<float>(kA0 - kA1 * cos(2.0 * kPiDouble * x) +
-                             kA2 * cos(4.0 * kPiDouble * x));
+          static_cast<float>(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) +
+                             kA2 * cos(4.0 * base::kPiDouble * x));
       kernel_window_storage_[idx] = window;
 
       // Compute the sinc with offset, then window the sinc() function and store
@@ -286,10 +315,10 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
         const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
         const float* k2 = k1 + kKernelSize;
 
-        // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage.  Should always
-        // be true so long as kKernelSize is a multiple of 16.
-        DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F));
-        DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F));
+        // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage.  Should always
+        // be true so long as kKernelSize is a multiple of 32.
+        DCHECK(0u == reinterpret_cast<uintptr_t>(k1) & 0x1F);
+        DCHECK(0u == reinterpret_cast<uintptr_t>(k2) & 0x1F);
 
         // Initialize input pointer based on quantized |virtual_source_idx_|.
         const float* input_ptr = r1_ + source_idx;
@@ -298,7 +327,7 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
         const double kernel_interpolation_factor =
             virtual_offset_idx - offset_idx;
         *destination++ =
-            CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
+            convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
 
         // Advance the virtual index.
         virtual_source_idx_ += io_sample_rate_ratio_;
@@ -413,6 +442,53 @@ float SincResampler::Convolve_SSE(const float* input_ptr,
 
   return result;
 }
+
+__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
+    const float* input_ptr,
+    const float* k1,
+    const float* k2,
+    double kernel_interpolation_factor) {
+  __m256 m_input;
+  __m256 m_sums1 = _mm256_setzero_ps();
+  __m256 m_sums2 = _mm256_setzero_ps();
+
+  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
+  // these loops has not been tested or benchmarked.
+  bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
+  if (!aligned_input) {
+    for (size_t i = 0; i < kKernelSize; i += 8) {
+      m_input = _mm256_loadu_ps(input_ptr + i);
+      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
+      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
+    }
+  } else {
+    for (size_t i = 0; i < kKernelSize; i += 8) {
+      m_input = _mm256_load_ps(input_ptr + i);
+      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
+      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
+                                 _mm256_extractf128_ps(m_sums1, 1));
+  __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
+                                 _mm256_extractf128_ps(m_sums2, 1));
+  m128_sums1 = _mm_mul_ps(
+      m128_sums1,
+      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
+  m128_sums2 = _mm_mul_ps(
+      m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
+  m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
+
+  // Sum components together.
+  float result;
+  m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
+  _mm_store_ss(&result, _mm_add_ss(m128_sums2,
+                                   _mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
+
+  return result;
+}
 #elif defined(_M_ARM64) || defined(__aarch64__)
 float SincResampler::Convolve_NEON(const float* input_ptr,
                                    const float* k1,
diff --git a/src/base/sinc_resampler.h b/src/base/sinc_resampler.h
index da7f1a9..cef5e5e 100644
--- a/src/base/sinc_resampler.h
+++ b/src/base/sinc_resampler.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -15,37 +15,41 @@ namespace base {
 // SincResampler is a high-quality single-channel sample-rate converter.
 class SincResampler {
  public:
-  enum {
-    // The kernel size can be adjusted for quality (higher is better) at the
-    // expense of performance.  Must be a multiple of 32.
-    // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
-    kKernelSize = 32,
+  // The kernel size can be adjusted for quality (higher is better) at the
+  // expense of performance.  Must be a multiple of 32.
+  // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
+  static constexpr int kKernelSize = 32;
 
-    // Default request size.  Affects how often and for how much SincResampler
-    // calls back for input.  Must be greater than kKernelSize.
-    kDefaultRequestSize = 512,
+  // Default request size.  Affects how often and for how much SincResampler
+  // calls back for input.  Must be greater than kKernelSize.
+  static constexpr int kDefaultRequestSize = 512;
 
-    // The kernel offset count is used for interpolation and is the number of
-    // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
-    // at the expense of allocating more memory.
-    kKernelOffsetCount = 32,
-    kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
-  };
+  // The kernel offset count is used for interpolation and is the number of
+  // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
+  // at the expense of allocating more memory.
+  static constexpr int kKernelOffsetCount = 32;
+  static constexpr int kKernelStorageSize =
+      kKernelSize * (kKernelOffsetCount + 1);
 
   // Callback type for providing more data into the resampler.  Expects |frames|
   // of data to be rendered into |destination|; zero padded if not enough frames
   // are available to satisfy the request.
   typedef std::function<void(int frames, float* destination)> ReadCB;
 
-  // Constructs a SincResampler.  |io_sample_rate_ratio| is the ratio
+  // Constructs a SincResampler with the specified |read_cb|, which is used to
+  // acquire audio data for resampling.  |io_sample_rate_ratio| is the ratio
   // of input / output sample rates.  |request_frames| controls the size in
   // frames of the buffer requested by each |read_cb| call.  The value must be
-  // greater than kKernelSize.  Specify kDefaultRequestSize if there are no
+  // greater than 1.5*kKernelSize.  Specify kDefaultRequestSize if there are no
   // request size constraints.
   SincResampler(double io_sample_rate_ratio, int request_frames);
+
+  SincResampler(const SincResampler&) = delete;
+  SincResampler& operator=(const SincResampler&) = delete;
+
   ~SincResampler();
 
-  // Resample |frames| of data from |read_cb| into |destination|.
+  // Resample |frames| of data from |read_cb_| into |destination|.
   void Resample(int frames, float* destination, ReadCB read_cb);
 
   // The maximum size in frames that guarantees Resample() will only make a
@@ -97,6 +101,10 @@ class SincResampler {
                             const float* k1,
                             const float* k2,
                             double kernel_interpolation_factor);
+  static float Convolve_AVX2(const float* input_ptr,
+                             const float* k1,
+                             const float* k2,
+                             double kernel_interpolation_factor);
 #elif defined(_M_ARM64) || defined(__aarch64__)
   static float Convolve_NEON(const float* input_ptr,
                              const float* k1,
@@ -104,6 +112,10 @@ class SincResampler {
                              double kernel_interpolation_factor);
 #endif
 
+  // Selects runtime specific CPU features like SSE.  Must be called before
+  // using SincResampler.
+  void InitializeCPUSpecificFeatures();
+
   // The ratio of input / output sample rates.
   double io_sample_rate_ratio_;
 
@@ -130,12 +142,19 @@ class SincResampler {
   // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
   // The kernel offsets are sub-sample shifts of a windowed sinc shifted from
   // 0.0 to 1.0 sample.
-  base::AlignedMemPtr<float[]> kernel_storage_;
-  base::AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
-  base::AlignedMemPtr<float[]> kernel_window_storage_;
+  AlignedMemPtr<float[]> kernel_storage_;
+  AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
+  AlignedMemPtr<float[]> kernel_window_storage_;
 
   // Data from the source is copied into this buffer for each processing pass.
-  base::AlignedMemPtr<float[]> input_buffer_;
+  AlignedMemPtr<float[]> input_buffer_;
+
+  // Stores the runtime selection of which Convolve function to use.
+  using ConvolveProc = float (*)(const float*,
+                                 const float*,
+                                 const float*,
+                                 double);
+  ConvolveProc convolve_proc_;
 
   // Pointers to the various regions inside |input_buffer_|.  See the diagram at
   // the top of the .cc file for more information.
@@ -144,9 +163,6 @@ class SincResampler {
   float* const r2_;
   float* r3_;
   float* r4_;
-
-  SincResampler(SincResampler const&) = delete;
-  SincResampler& operator=(SincResampler const&) = delete;
 };
 
 }  // namespace base