Update SincResampler

2022-11-10 21:45:51 +01:00 · 2022-11-10 21:45:51 +01:00 · 8c05aae086
parent f723513521
commit 8c05aae086
2 changed files with 141 additions and 49 deletions
--- a/src/base/sinc_resampler.cc
+++ b/src/base/sinc_resampler.cc
@ -1,4 +1,4 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
@ -82,15 +82,19 @@
 #include "base/log.h"

 #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
-#include <xmmintrin.h>
-#define CONVOLVE_FUNC Convolve_SSE
+#include <immintrin.h>
+// Including these headers directly should generally be avoided. Since
+// Chrome is compiled with -msse3 (the minimal requirement), we include the
+// headers directly to make the intrinsics available.
+#include <avx2intrin.h>
+#include <avxintrin.h>
+#include <fmaintrin.h>
 #elif defined(_M_ARM64) || defined(__aarch64__)
 #include <arm_neon.h>
-#define CONVOLVE_FUNC Convolve_NEON
-#else
-#define CONVOLVE_FUNC Convolve_C
 #endif

+namespace base {
+
 namespace {

 constexpr double kPiDouble = 3.14159265358979323846;
@ -109,7 +113,7 @@ class ScopedSubnormalFloatDisabler {
  ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete;

  ~ScopedSubnormalFloatDisabler() {
-#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
    _mm_setcsr(orig_state_);
 #endif
  }
@ -145,27 +149,52 @@ int CalculateChunkSize(int block_size_, double io_ratio) {

 }  // namespace

-namespace base {
+// If we know the minimum architecture at compile time, avoid CPU detection.
+void SincResampler::InitializeCPUSpecificFeatures() {
+#if defined(_M_ARM64) || defined(__aarch64__)
+  convolve_proc_ = Convolve_NEON;
+#elif defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
+#if 0  // TODO
+  // Using AVX2 instead of SSE2 when AVX2/FMA3 supported.
+  if (cpu.has_avx2() && cpu.has_fma3())
+    convolve_proc_ = Convolve_AVX2;
+  else if (cpu.has_sse2())
+    convolve_proc_ = Convolve_SSE;
+  else
+    convolve_proc_ = Convolve_C;
+#endif
+  convolve_proc_ = Convolve_SSE;
+#else
+  // Unknown architecture.
+  convolve_proc_ = Convolve_C;
+#endif
+}

 SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
    : io_sample_rate_ratio_(io_sample_rate_ratio),
      request_frames_(request_frames),
      input_buffer_size_(request_frames_ + kKernelSize),
-      // Create input buffers with a 16-byte alignment for SSE optimizations.
+      // Create input buffers with a 32-byte alignment for SIMD optimizations.
      kernel_storage_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
      kernel_pre_sinc_storage_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
      kernel_window_storage_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
      input_buffer_(static_cast<float*>(
-          base::AlignedAlloc<16>(sizeof(float) * input_buffer_size_))),
+          base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))),
      r1_(input_buffer_.get()),
      r2_(input_buffer_.get() + kKernelSize / 2) {
-  DCHECK(request_frames_ > 0);
+  CHECK(request_frames > kKernelSize * 3 / 2)
+      << "request_frames must be greater than 1.5 kernels to allow sufficient "
+         "data for resampling";
+  // This means that after the first call to Flush we will have
+  // block_size_ > kKernelSize and r2_ < r3_.
+
+  InitializeCPUSpecificFeatures();
+  DCHECK(convolve_proc_);
+  CHECK(request_frames_ > 0);
  Flush();
-  DCHECK(block_size_ > kKernelSize)
-      << "block_size must be greater than kKernelSize!";

  memset(kernel_storage_.get(), 0,
         sizeof(*kernel_storage_.get()) * kKernelStorageSize);
@ -213,14 +242,14 @@ void SincResampler::InitializeKernel() {
    for (int i = 0; i < kKernelSize; ++i) {
      const int idx = i + offset_idx * kKernelSize;
      const float pre_sinc =
-          kPiFloat * (i - kKernelSize / 2 - subsample_offset);
+          base::kPiFloat * (i - kKernelSize / 2 - subsample_offset);
      kernel_pre_sinc_storage_[idx] = pre_sinc;

      // Compute Blackman window, matching the offset of the sinc().
      const float x = (i - subsample_offset) / kKernelSize;
      const float window =
-          static_cast<float>(kA0 - kA1 * cos(2.0 * kPiDouble * x) +
-                             kA2 * cos(4.0 * kPiDouble * x));
+          static_cast<float>(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) +
+                             kA2 * cos(4.0 * base::kPiDouble * x));
      kernel_window_storage_[idx] = window;

      // Compute the sinc with offset, then window the sinc() function and store
@ -286,10 +315,10 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
        const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
        const float* k2 = k1 + kKernelSize;

-        // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage.  Should always
-        // be true so long as kKernelSize is a multiple of 16.
-        DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F));
-        DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F));
+        // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage.  Should always
+        // be true so long as kKernelSize is a multiple of 32.
+        DCHECK(0u == reinterpret_cast<uintptr_t>(k1) & 0x1F);
+        DCHECK(0u == reinterpret_cast<uintptr_t>(k2) & 0x1F);

        // Initialize input pointer based on quantized |virtual_source_idx_|.
        const float* input_ptr = r1_ + source_idx;
@ -298,7 +327,7 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
        const double kernel_interpolation_factor =
            virtual_offset_idx - offset_idx;
        *destination++ =
-            CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
+            convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);

        // Advance the virtual index.
        virtual_source_idx_ += io_sample_rate_ratio_;
@ -413,6 +442,53 @@ float SincResampler::Convolve_SSE(const float* input_ptr,

  return result;
 }
+
+__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
+    const float* input_ptr,
+    const float* k1,
+    const float* k2,
+    double kernel_interpolation_factor) {
+  __m256 m_input;
+  __m256 m_sums1 = _mm256_setzero_ps();
+  __m256 m_sums2 = _mm256_setzero_ps();
+
+  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
+  // these loops has not been tested or benchmarked.
+  bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
+  if (!aligned_input) {
+    for (size_t i = 0; i < kKernelSize; i += 8) {
+      m_input = _mm256_loadu_ps(input_ptr + i);
+      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
+      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
+    }
+  } else {
+    for (size_t i = 0; i < kKernelSize; i += 8) {
+      m_input = _mm256_load_ps(input_ptr + i);
+      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
+      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
+                                 _mm256_extractf128_ps(m_sums1, 1));
+  __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
+                                 _mm256_extractf128_ps(m_sums2, 1));
+  m128_sums1 = _mm_mul_ps(
+      m128_sums1,
+      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
+  m128_sums2 = _mm_mul_ps(
+      m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
+  m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
+
+  // Sum components together.
+  float result;
+  m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
+  _mm_store_ss(&result, _mm_add_ss(m128_sums2,
+                                   _mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
+
+  return result;
+}
 #elif defined(_M_ARM64) || defined(__aarch64__)
 float SincResampler::Convolve_NEON(const float* input_ptr,
                                   const float* k1,
--- a/src/base/sinc_resampler.h
+++ b/src/base/sinc_resampler.h
@ -1,4 +1,4 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

@ -15,37 +15,41 @@ namespace base {
 // SincResampler is a high-quality single-channel sample-rate converter.
 class SincResampler {
 public:
-  enum {
-    // The kernel size can be adjusted for quality (higher is better) at the
-    // expense of performance.  Must be a multiple of 32.
-    // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
-    kKernelSize = 32,
+  // The kernel size can be adjusted for quality (higher is better) at the
+  // expense of performance.  Must be a multiple of 32.
+  // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
+  static constexpr int kKernelSize = 32;

-    // Default request size.  Affects how often and for how much SincResampler
-    // calls back for input.  Must be greater than kKernelSize.
-    kDefaultRequestSize = 512,
+  // Default request size.  Affects how often and for how much SincResampler
+  // calls back for input.  Must be greater than kKernelSize.
+  static constexpr int kDefaultRequestSize = 512;

-    // The kernel offset count is used for interpolation and is the number of
-    // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
-    // at the expense of allocating more memory.
-    kKernelOffsetCount = 32,
-    kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
-  };
+  // The kernel offset count is used for interpolation and is the number of
+  // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
+  // at the expense of allocating more memory.
+  static constexpr int kKernelOffsetCount = 32;
+  static constexpr int kKernelStorageSize =
+      kKernelSize * (kKernelOffsetCount + 1);

  // Callback type for providing more data into the resampler.  Expects |frames|
  // of data to be rendered into |destination|; zero padded if not enough frames
  // are available to satisfy the request.
  typedef std::function<void(int frames, float* destination)> ReadCB;

-  // Constructs a SincResampler.  |io_sample_rate_ratio| is the ratio
+  // Constructs a SincResampler with the specified |read_cb|, which is used to
+  // acquire audio data for resampling.  |io_sample_rate_ratio| is the ratio
  // of input / output sample rates.  |request_frames| controls the size in
  // frames of the buffer requested by each |read_cb| call.  The value must be
-  // greater than kKernelSize.  Specify kDefaultRequestSize if there are no
+  // greater than 1.5*kKernelSize.  Specify kDefaultRequestSize if there are no
  // request size constraints.
  SincResampler(double io_sample_rate_ratio, int request_frames);
+
+  SincResampler(const SincResampler&) = delete;
+  SincResampler& operator=(const SincResampler&) = delete;
+
  ~SincResampler();

-  // Resample |frames| of data from |read_cb| into |destination|.
+  // Resample |frames| of data from |read_cb_| into |destination|.
  void Resample(int frames, float* destination, ReadCB read_cb);

  // The maximum size in frames that guarantees Resample() will only make a
@ -97,6 +101,10 @@ class SincResampler {
                            const float* k1,
                            const float* k2,
                            double kernel_interpolation_factor);
+  static float Convolve_AVX2(const float* input_ptr,
+                             const float* k1,
+                             const float* k2,
+                             double kernel_interpolation_factor);
 #elif defined(_M_ARM64) || defined(__aarch64__)
  static float Convolve_NEON(const float* input_ptr,
                             const float* k1,
@ -104,6 +112,10 @@ class SincResampler {
                             double kernel_interpolation_factor);
 #endif

+  // Selects runtime specific CPU features like SSE.  Must be called before
+  // using SincResampler.
+  void InitializeCPUSpecificFeatures();
+
  // The ratio of input / output sample rates.
  double io_sample_rate_ratio_;

@ -130,12 +142,19 @@ class SincResampler {
  // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
  // The kernel offsets are sub-sample shifts of a windowed sinc shifted from
  // 0.0 to 1.0 sample.
-  base::AlignedMemPtr<float[]> kernel_storage_;
-  base::AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
-  base::AlignedMemPtr<float[]> kernel_window_storage_;
+  AlignedMemPtr<float[]> kernel_storage_;
+  AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
+  AlignedMemPtr<float[]> kernel_window_storage_;

  // Data from the source is copied into this buffer for each processing pass.
-  base::AlignedMemPtr<float[]> input_buffer_;
+  AlignedMemPtr<float[]> input_buffer_;
+
+  // Stores the runtime selection of which Convolve function to use.
+  using ConvolveProc = float (*)(const float*,
+                                 const float*,
+                                 const float*,
+                                 double);
+  ConvolveProc convolve_proc_;

  // Pointers to the various regions inside |input_buffer_|.  See the diagram at
  // the top of the .cc file for more information.
@ -144,9 +163,6 @@ class SincResampler {
  float* const r2_;
  float* r3_;
  float* r4_;
-
-  SincResampler(SincResampler const&) = delete;
-  SincResampler& operator=(SincResampler const&) = delete;
 };

 }  // namespace base