From 8c05aae086902386c38318d3fe436e3c9b45e4d5 Mon Sep 17 00:00:00 2001 From: Attila Uygun Date: Thu, 10 Nov 2022 21:45:51 +0100 Subject: [PATCH] Update SincResampler --- src/base/sinc_resampler.cc | 124 ++++++++++++++++++++++++++++++------- src/base/sinc_resampler.h | 66 ++++++++++++-------- 2 files changed, 141 insertions(+), 49 deletions(-) diff --git a/src/base/sinc_resampler.cc b/src/base/sinc_resampler.cc index 1f2985b..223afbe 100644 --- a/src/base/sinc_resampler.cc +++ b/src/base/sinc_resampler.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright 2012 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // @@ -82,15 +82,19 @@ #include "base/log.h" #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__) -#include -#define CONVOLVE_FUNC Convolve_SSE +#include +// Including these headers directly should generally be avoided. Since +// Chrome is compiled with -msse3 (the minimal requirement), we include the +// headers directly to make the intrinsics available. +#include +#include +#include #elif defined(_M_ARM64) || defined(__aarch64__) #include -#define CONVOLVE_FUNC Convolve_NEON -#else -#define CONVOLVE_FUNC Convolve_C #endif +namespace base { + namespace { constexpr double kPiDouble = 3.14159265358979323846; @@ -109,7 +113,7 @@ class ScopedSubnormalFloatDisabler { ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete; ~ScopedSubnormalFloatDisabler() { -#if defined(ARCH_CPU_X86_FAMILY) +#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__) _mm_setcsr(orig_state_); #endif } @@ -145,27 +149,52 @@ int CalculateChunkSize(int block_size_, double io_ratio) { } // namespace -namespace base { +// If we know the minimum architecture at compile time, avoid CPU detection. +void SincResampler::InitializeCPUSpecificFeatures() { +#if defined(_M_ARM64) || defined(__aarch64__) + convolve_proc_ = Convolve_NEON; +#elif defined(_M_X64) || defined(__x86_64__) || defined(__i386__) +#if 0 // TODO + // Using AVX2 instead of SSE2 when AVX2/FMA3 supported. + if (cpu.has_avx2() && cpu.has_fma3()) + convolve_proc_ = Convolve_AVX2; + else if (cpu.has_sse2()) + convolve_proc_ = Convolve_SSE; + else + convolve_proc_ = Convolve_C; +#endif + convolve_proc_ = Convolve_SSE; +#else + // Unknown architecture. + convolve_proc_ = Convolve_C; +#endif +} SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames) : io_sample_rate_ratio_(io_sample_rate_ratio), request_frames_(request_frames), input_buffer_size_(request_frames_ + kKernelSize), - // Create input buffers with a 16-byte alignment for SSE optimizations. + // Create input buffers with a 32-byte alignment for SIMD optimizations. kernel_storage_(static_cast( - base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))), + base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))), kernel_pre_sinc_storage_(static_cast( - base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))), + base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))), kernel_window_storage_(static_cast( - base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))), + base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))), input_buffer_(static_cast( - base::AlignedAlloc<16>(sizeof(float) * input_buffer_size_))), + base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))), r1_(input_buffer_.get()), r2_(input_buffer_.get() + kKernelSize / 2) { - DCHECK(request_frames_ > 0); + CHECK(request_frames > kKernelSize * 3 / 2) + << "request_frames must be greater than 1.5 kernels to allow sufficient " + "data for resampling"; + // This means that after the first call to Flush we will have + // block_size_ > kKernelSize and r2_ < r3_. + + InitializeCPUSpecificFeatures(); + DCHECK(convolve_proc_); + CHECK(request_frames_ > 0); Flush(); - DCHECK(block_size_ > kKernelSize) - << "block_size must be greater than kKernelSize!"; memset(kernel_storage_.get(), 0, sizeof(*kernel_storage_.get()) * kKernelStorageSize); @@ -213,14 +242,14 @@ void SincResampler::InitializeKernel() { for (int i = 0; i < kKernelSize; ++i) { const int idx = i + offset_idx * kKernelSize; const float pre_sinc = - kPiFloat * (i - kKernelSize / 2 - subsample_offset); + base::kPiFloat * (i - kKernelSize / 2 - subsample_offset); kernel_pre_sinc_storage_[idx] = pre_sinc; // Compute Blackman window, matching the offset of the sinc(). const float x = (i - subsample_offset) / kKernelSize; const float window = - static_cast(kA0 - kA1 * cos(2.0 * kPiDouble * x) + - kA2 * cos(4.0 * kPiDouble * x)); + static_cast(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) + + kA2 * cos(4.0 * base::kPiDouble * x)); kernel_window_storage_[idx] = window; // Compute the sinc with offset, then window the sinc() function and store @@ -286,10 +315,10 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) { const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; const float* k2 = k1 + kKernelSize; - // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always - // be true so long as kKernelSize is a multiple of 16. - DCHECK(0u == (reinterpret_cast(k1) & 0x0F)); - DCHECK(0u == (reinterpret_cast(k2) & 0x0F)); + // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always + // be true so long as kKernelSize is a multiple of 32. + DCHECK(0u == reinterpret_cast(k1) & 0x1F); + DCHECK(0u == reinterpret_cast(k2) & 0x1F); // Initialize input pointer based on quantized |virtual_source_idx_|. const float* input_ptr = r1_ + source_idx; @@ -298,7 +327,7 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) { const double kernel_interpolation_factor = virtual_offset_idx - offset_idx; *destination++ = - CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); + convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. virtual_source_idx_ += io_sample_rate_ratio_; @@ -413,6 +442,53 @@ float SincResampler::Convolve_SSE(const float* input_ptr, return result; } + +__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2( + const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m256 m_input; + __m256 m_sums1 = _mm256_setzero_ps(); + __m256 m_sums2 = _mm256_setzero_ps(); + + // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling + // these loops has not been tested or benchmarked. + bool aligned_input = (reinterpret_cast(input_ptr) & 0x1F) == 0; + if (!aligned_input) { + for (size_t i = 0; i < kKernelSize; i += 8) { + m_input = _mm256_loadu_ps(input_ptr + i); + m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); + m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); + } + } else { + for (size_t i = 0; i < kKernelSize; i += 8) { + m_input = _mm256_load_ps(input_ptr + i); + m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); + m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); + } + } + + // Linearly interpolate the two "convolutions". + __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0), + _mm256_extractf128_ps(m_sums1, 1)); + __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0), + _mm256_extractf128_ps(m_sums2, 1)); + m128_sums1 = _mm_mul_ps( + m128_sums1, + _mm_set_ps1(static_cast(1.0 - kernel_interpolation_factor))); + m128_sums2 = _mm_mul_ps( + m128_sums2, _mm_set_ps1(static_cast(kernel_interpolation_factor))); + m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2); + + // Sum components together. + float result; + m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1); + _mm_store_ss(&result, _mm_add_ss(m128_sums2, + _mm_shuffle_ps(m128_sums2, m128_sums2, 1))); + + return result; +} #elif defined(_M_ARM64) || defined(__aarch64__) float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, diff --git a/src/base/sinc_resampler.h b/src/base/sinc_resampler.h index da7f1a9..cef5e5e 100644 --- a/src/base/sinc_resampler.h +++ b/src/base/sinc_resampler.h @@ -1,4 +1,4 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright 2012 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -15,37 +15,41 @@ namespace base { // SincResampler is a high-quality single-channel sample-rate converter. class SincResampler { public: - enum { - // The kernel size can be adjusted for quality (higher is better) at the - // expense of performance. Must be a multiple of 32. - // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. - kKernelSize = 32, + // The kernel size can be adjusted for quality (higher is better) at the + // expense of performance. Must be a multiple of 32. + // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. + static constexpr int kKernelSize = 32; - // Default request size. Affects how often and for how much SincResampler - // calls back for input. Must be greater than kKernelSize. - kDefaultRequestSize = 512, + // Default request size. Affects how often and for how much SincResampler + // calls back for input. Must be greater than kKernelSize. + static constexpr int kDefaultRequestSize = 512; - // The kernel offset count is used for interpolation and is the number of - // sub-sample kernel shifts. Can be adjusted for quality (higher is better) - // at the expense of allocating more memory. - kKernelOffsetCount = 32, - kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), - }; + // The kernel offset count is used for interpolation and is the number of + // sub-sample kernel shifts. Can be adjusted for quality (higher is better) + // at the expense of allocating more memory. + static constexpr int kKernelOffsetCount = 32; + static constexpr int kKernelStorageSize = + kKernelSize * (kKernelOffsetCount + 1); // Callback type for providing more data into the resampler. Expects |frames| // of data to be rendered into |destination|; zero padded if not enough frames // are available to satisfy the request. typedef std::function ReadCB; - // Constructs a SincResampler. |io_sample_rate_ratio| is the ratio + // Constructs a SincResampler with the specified |read_cb|, which is used to + // acquire audio data for resampling. |io_sample_rate_ratio| is the ratio // of input / output sample rates. |request_frames| controls the size in // frames of the buffer requested by each |read_cb| call. The value must be - // greater than kKernelSize. Specify kDefaultRequestSize if there are no + // greater than 1.5*kKernelSize. Specify kDefaultRequestSize if there are no // request size constraints. SincResampler(double io_sample_rate_ratio, int request_frames); + + SincResampler(const SincResampler&) = delete; + SincResampler& operator=(const SincResampler&) = delete; + ~SincResampler(); - // Resample |frames| of data from |read_cb| into |destination|. + // Resample |frames| of data from |read_cb_| into |destination|. void Resample(int frames, float* destination, ReadCB read_cb); // The maximum size in frames that guarantees Resample() will only make a @@ -97,6 +101,10 @@ class SincResampler { const float* k1, const float* k2, double kernel_interpolation_factor); + static float Convolve_AVX2(const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor); #elif defined(_M_ARM64) || defined(__aarch64__) static float Convolve_NEON(const float* input_ptr, const float* k1, @@ -104,6 +112,10 @@ class SincResampler { double kernel_interpolation_factor); #endif + // Selects runtime specific CPU features like SSE. Must be called before + // using SincResampler. + void InitializeCPUSpecificFeatures(); + // The ratio of input / output sample rates. double io_sample_rate_ratio_; @@ -130,12 +142,19 @@ class SincResampler { // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize. // The kernel offsets are sub-sample shifts of a windowed sinc shifted from // 0.0 to 1.0 sample. - base::AlignedMemPtr kernel_storage_; - base::AlignedMemPtr kernel_pre_sinc_storage_; - base::AlignedMemPtr kernel_window_storage_; + AlignedMemPtr kernel_storage_; + AlignedMemPtr kernel_pre_sinc_storage_; + AlignedMemPtr kernel_window_storage_; // Data from the source is copied into this buffer for each processing pass. - base::AlignedMemPtr input_buffer_; + AlignedMemPtr input_buffer_; + + // Stores the runtime selection of which Convolve function to use. + using ConvolveProc = float (*)(const float*, + const float*, + const float*, + double); + ConvolveProc convolve_proc_; // Pointers to the various regions inside |input_buffer_|. See the diagram at // the top of the .cc file for more information. @@ -144,9 +163,6 @@ class SincResampler { float* const r2_; float* r3_; float* r4_; - - SincResampler(SincResampler const&) = delete; - SincResampler& operator=(SincResampler const&) = delete; }; } // namespace base