Update SincResampler

This commit is contained in:
Attila Uygun 2022-11-10 21:45:51 +01:00
parent f723513521
commit 8c05aae086
2 changed files with 141 additions and 49 deletions

View File

@ -1,4 +1,4 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
// //
@ -82,15 +82,19 @@
#include "base/log.h" #include "base/log.h"
#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__) #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
#include <xmmintrin.h> #include <immintrin.h>
#define CONVOLVE_FUNC Convolve_SSE // Including these headers directly should generally be avoided. Since
// Chrome is compiled with -msse3 (the minimal requirement), we include the
// headers directly to make the intrinsics available.
#include <avx2intrin.h>
#include <avxintrin.h>
#include <fmaintrin.h>
#elif defined(_M_ARM64) || defined(__aarch64__) #elif defined(_M_ARM64) || defined(__aarch64__)
#include <arm_neon.h> #include <arm_neon.h>
#define CONVOLVE_FUNC Convolve_NEON
#else
#define CONVOLVE_FUNC Convolve_C
#endif #endif
namespace base {
namespace { namespace {
constexpr double kPiDouble = 3.14159265358979323846; constexpr double kPiDouble = 3.14159265358979323846;
@ -109,7 +113,7 @@ class ScopedSubnormalFloatDisabler {
ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete; ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete;
~ScopedSubnormalFloatDisabler() { ~ScopedSubnormalFloatDisabler() {
#if defined(ARCH_CPU_X86_FAMILY) #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
_mm_setcsr(orig_state_); _mm_setcsr(orig_state_);
#endif #endif
} }
@ -145,27 +149,52 @@ int CalculateChunkSize(int block_size_, double io_ratio) {
} // namespace } // namespace
namespace base { // If we know the minimum architecture at compile time, avoid CPU detection.
// Picks the convolution routine stored in |convolve_proc_|. The choice is made
// entirely from the architecture macros the translation unit is compiled with;
// nothing is probed at runtime.
void SincResampler::InitializeCPUSpecificFeatures() {
#if defined(_M_ARM64) || defined(__aarch64__)
  // AArch64 build: use the NEON routine.
  convolve_proc_ = &SincResampler::Convolve_NEON;
#elif defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
  // TODO: Add runtime CPU detection. The intended order of preference is
  // Convolve_AVX2 when a `cpu` feature query reports has_avx2() and
  // has_fma3(), then Convolve_SSE when it reports has_sse2(), and Convolve_C
  // as the last resort. Until that exists the SSE routine is selected
  // unconditionally on x86.
  convolve_proc_ = &SincResampler::Convolve_SSE;
#else
  // Neither ARM64 nor x86: fall back to the plain C implementation.
  convolve_proc_ = &SincResampler::Convolve_C;
#endif
}
SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames) SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
: io_sample_rate_ratio_(io_sample_rate_ratio), : io_sample_rate_ratio_(io_sample_rate_ratio),
request_frames_(request_frames), request_frames_(request_frames),
input_buffer_size_(request_frames_ + kKernelSize), input_buffer_size_(request_frames_ + kKernelSize),
// Create input buffers with a 16-byte alignment for SSE optimizations. // Create input buffers with a 32-byte alignment for SIMD optimizations.
kernel_storage_(static_cast<float*>( kernel_storage_(static_cast<float*>(
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))), base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
kernel_pre_sinc_storage_(static_cast<float*>( kernel_pre_sinc_storage_(static_cast<float*>(
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))), base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
kernel_window_storage_(static_cast<float*>( kernel_window_storage_(static_cast<float*>(
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))), base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
input_buffer_(static_cast<float*>( input_buffer_(static_cast<float*>(
base::AlignedAlloc<16>(sizeof(float) * input_buffer_size_))), base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))),
r1_(input_buffer_.get()), r1_(input_buffer_.get()),
r2_(input_buffer_.get() + kKernelSize / 2) { r2_(input_buffer_.get() + kKernelSize / 2) {
DCHECK(request_frames_ > 0); CHECK(request_frames > kKernelSize * 3 / 2)
<< "request_frames must be greater than 1.5 kernels to allow sufficient "
"data for resampling";
// This means that after the first call to Flush we will have
// block_size_ > kKernelSize and r2_ < r3_.
InitializeCPUSpecificFeatures();
DCHECK(convolve_proc_);
CHECK(request_frames_ > 0);
Flush(); Flush();
DCHECK(block_size_ > kKernelSize)
<< "block_size must be greater than kKernelSize!";
memset(kernel_storage_.get(), 0, memset(kernel_storage_.get(), 0,
sizeof(*kernel_storage_.get()) * kKernelStorageSize); sizeof(*kernel_storage_.get()) * kKernelStorageSize);
@ -213,14 +242,14 @@ void SincResampler::InitializeKernel() {
for (int i = 0; i < kKernelSize; ++i) { for (int i = 0; i < kKernelSize; ++i) {
const int idx = i + offset_idx * kKernelSize; const int idx = i + offset_idx * kKernelSize;
const float pre_sinc = const float pre_sinc =
kPiFloat * (i - kKernelSize / 2 - subsample_offset); base::kPiFloat * (i - kKernelSize / 2 - subsample_offset);
kernel_pre_sinc_storage_[idx] = pre_sinc; kernel_pre_sinc_storage_[idx] = pre_sinc;
// Compute Blackman window, matching the offset of the sinc(). // Compute Blackman window, matching the offset of the sinc().
const float x = (i - subsample_offset) / kKernelSize; const float x = (i - subsample_offset) / kKernelSize;
const float window = const float window =
static_cast<float>(kA0 - kA1 * cos(2.0 * kPiDouble * x) + static_cast<float>(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) +
kA2 * cos(4.0 * kPiDouble * x)); kA2 * cos(4.0 * base::kPiDouble * x));
kernel_window_storage_[idx] = window; kernel_window_storage_[idx] = window;
// Compute the sinc with offset, then window the sinc() function and store // Compute the sinc with offset, then window the sinc() function and store
@ -286,10 +315,10 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
const float* k2 = k1 + kKernelSize; const float* k2 = k1 + kKernelSize;
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always
// be true so long as kKernelSize is a multiple of 16. // be true so long as kKernelSize is a multiple of 32.
// `==` binds tighter than `&`, so the mask expression must be parenthesized;
// otherwise the check evaluates `(0u == ptr) & mask` instead of testing alignment.
DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F)); DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x1F));
DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F)); DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x1F));
// Initialize input pointer based on quantized |virtual_source_idx_|. // Initialize input pointer based on quantized |virtual_source_idx_|.
const float* input_ptr = r1_ + source_idx; const float* input_ptr = r1_ + source_idx;
@ -298,7 +327,7 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
const double kernel_interpolation_factor = const double kernel_interpolation_factor =
virtual_offset_idx - offset_idx; virtual_offset_idx - offset_idx;
*destination++ = *destination++ =
CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
// Advance the virtual index. // Advance the virtual index.
virtual_source_idx_ += io_sample_rate_ratio_; virtual_source_idx_ += io_sample_rate_ratio_;
@ -413,6 +442,53 @@ float SincResampler::Convolve_SSE(const float* input_ptr,
return result; return result;
} }
// AVX2/FMA3 convolution. Multiplies kKernelSize samples starting at
// |input_ptr| against both kernels |k1| and |k2|, accumulating one sum per
// kernel, then blends the two sums linearly: a |kernel_interpolation_factor|
// of 0 yields the |k1| result and 1 yields the |k2| result. The kernels are
// read with aligned 256-bit loads, so |k1| and |k2| must be 32-byte aligned;
// |input_ptr| is handled for either alignment.
__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
    const float* input_ptr,
    const float* k1,
    const float* k2,
    double kernel_interpolation_factor) {
  __m256 acc_k1 = _mm256_setzero_ps();
  __m256 acc_k2 = _mm256_setzero_ps();

  // Only the input load differs between the two loops: an aligned load when
  // |input_ptr| sits on a 32-byte boundary, an unaligned load otherwise.
  // Unrolling these loops has not been tested or benchmarked.
  const bool input_is_aligned =
      (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
  if (input_is_aligned) {
    for (int i = 0; i < kKernelSize; i += 8) {
      const __m256 samples = _mm256_load_ps(input_ptr + i);
      acc_k1 = _mm256_fmadd_ps(samples, _mm256_load_ps(k1 + i), acc_k1);
      acc_k2 = _mm256_fmadd_ps(samples, _mm256_load_ps(k2 + i), acc_k2);
    }
  } else {
    for (int i = 0; i < kKernelSize; i += 8) {
      const __m256 samples = _mm256_loadu_ps(input_ptr + i);
      acc_k1 = _mm256_fmadd_ps(samples, _mm256_load_ps(k1 + i), acc_k1);
      acc_k2 = _mm256_fmadd_ps(samples, _mm256_load_ps(k2 + i), acc_k2);
    }
  }

  // Fold each 8-lane accumulator down to 4 lanes (low half + high half).
  const __m128 folded_k1 = _mm_add_ps(_mm256_extractf128_ps(acc_k1, 0),
                                      _mm256_extractf128_ps(acc_k1, 1));
  const __m128 folded_k2 = _mm_add_ps(_mm256_extractf128_ps(acc_k2, 0),
                                      _mm256_extractf128_ps(acc_k2, 1));

  // Linearly interpolate the two "convolutions".
  const __m128 weight_k1 =
      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor));
  const __m128 weight_k2 =
      _mm_set_ps1(static_cast<float>(kernel_interpolation_factor));
  const __m128 blended = _mm_add_ps(_mm_mul_ps(folded_k1, weight_k1),
                                    _mm_mul_ps(folded_k2, weight_k2));

  // Horizontal sum of the four remaining lanes: add the upper pair onto the
  // lower pair, then add lane 1 onto lane 0.
  const __m128 pair_sums =
      _mm_add_ps(_mm_movehl_ps(blended, blended), blended);
  float result;
  _mm_store_ss(&result,
               _mm_add_ss(pair_sums, _mm_shuffle_ps(pair_sums, pair_sums, 1)));
  return result;
}
#elif defined(_M_ARM64) || defined(__aarch64__) #elif defined(_M_ARM64) || defined(__aarch64__)
float SincResampler::Convolve_NEON(const float* input_ptr, float SincResampler::Convolve_NEON(const float* input_ptr,
const float* k1, const float* k1,

View File

@ -1,4 +1,4 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
@ -15,37 +15,41 @@ namespace base {
// SincResampler is a high-quality single-channel sample-rate converter. // SincResampler is a high-quality single-channel sample-rate converter.
class SincResampler { class SincResampler {
public: public:
enum { // The kernel size can be adjusted for quality (higher is better) at the
// The kernel size can be adjusted for quality (higher is better) at the // expense of performance. Must be a multiple of 32.
// expense of performance. Must be a multiple of 32. // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+. static constexpr int kKernelSize = 32;
kKernelSize = 32,
// Default request size. Affects how often and for how much SincResampler // Default request size. Affects how often and for how much SincResampler
// calls back for input. Must be greater than 1.5 * kKernelSize.
kDefaultRequestSize = 512, static constexpr int kDefaultRequestSize = 512;
// The kernel offset count is used for interpolation and is the number of // The kernel offset count is used for interpolation and is the number of
// sub-sample kernel shifts. Can be adjusted for quality (higher is better) // sub-sample kernel shifts. Can be adjusted for quality (higher is better)
// at the expense of allocating more memory. // at the expense of allocating more memory.
kKernelOffsetCount = 32, static constexpr int kKernelOffsetCount = 32;
kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), static constexpr int kKernelStorageSize =
}; kKernelSize * (kKernelOffsetCount + 1);
// Callback type for providing more data into the resampler. Expects |frames| // Callback type for providing more data into the resampler. Expects |frames|
// of data to be rendered into |destination|; zero padded if not enough frames // of data to be rendered into |destination|; zero padded if not enough frames
// are available to satisfy the request. // are available to satisfy the request.
typedef std::function<void(int frames, float* destination)> ReadCB; typedef std::function<void(int frames, float* destination)> ReadCB;
// Constructs a SincResampler. |io_sample_rate_ratio| is the ratio // Constructs a SincResampler. Audio data for resampling is acquired through
// the |read_cb| passed to Resample(). |io_sample_rate_ratio| is the ratio
// of input / output sample rates. |request_frames| controls the size in // of input / output sample rates. |request_frames| controls the size in
// frames of the buffer requested by each |read_cb| call. The value must be // frames of the buffer requested by each |read_cb| call. The value must be
// greater than kKernelSize. Specify kDefaultRequestSize if there are no // greater than 1.5*kKernelSize. Specify kDefaultRequestSize if there are no
// request size constraints. // request size constraints.
SincResampler(double io_sample_rate_ratio, int request_frames); SincResampler(double io_sample_rate_ratio, int request_frames);
SincResampler(const SincResampler&) = delete;
SincResampler& operator=(const SincResampler&) = delete;
~SincResampler(); ~SincResampler();
// Resample |frames| of data from |read_cb| into |destination|. // Resample |frames| of data from |read_cb| into |destination|.
void Resample(int frames, float* destination, ReadCB read_cb); void Resample(int frames, float* destination, ReadCB read_cb);
// The maximum size in frames that guarantees Resample() will only make a // The maximum size in frames that guarantees Resample() will only make a
@ -97,6 +101,10 @@ class SincResampler {
const float* k1, const float* k1,
const float* k2, const float* k2,
double kernel_interpolation_factor); double kernel_interpolation_factor);
static float Convolve_AVX2(const float* input_ptr,
const float* k1,
const float* k2,
double kernel_interpolation_factor);
#elif defined(_M_ARM64) || defined(__aarch64__) #elif defined(_M_ARM64) || defined(__aarch64__)
static float Convolve_NEON(const float* input_ptr, static float Convolve_NEON(const float* input_ptr,
const float* k1, const float* k1,
@ -104,6 +112,10 @@ class SincResampler {
double kernel_interpolation_factor); double kernel_interpolation_factor);
#endif #endif
// Selects runtime specific CPU features like SSE. Must be called before
// using SincResampler.
void InitializeCPUSpecificFeatures();
// The ratio of input / output sample rates. // The ratio of input / output sample rates.
double io_sample_rate_ratio_; double io_sample_rate_ratio_;
@ -130,12 +142,19 @@ class SincResampler {
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize. // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from // The kernel offsets are sub-sample shifts of a windowed sinc shifted from
// 0.0 to 1.0 sample. // 0.0 to 1.0 sample.
base::AlignedMemPtr<float[]> kernel_storage_; AlignedMemPtr<float[]> kernel_storage_;
base::AlignedMemPtr<float[]> kernel_pre_sinc_storage_; AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
base::AlignedMemPtr<float[]> kernel_window_storage_; AlignedMemPtr<float[]> kernel_window_storage_;
// Data from the source is copied into this buffer for each processing pass. // Data from the source is copied into this buffer for each processing pass.
base::AlignedMemPtr<float[]> input_buffer_; AlignedMemPtr<float[]> input_buffer_;
// Stores the runtime selection of which Convolve function to use.
using ConvolveProc = float (*)(const float*,
const float*,
const float*,
double);
ConvolveProc convolve_proc_;
// Pointers to the various regions inside |input_buffer_|. See the diagram at // Pointers to the various regions inside |input_buffer_|. See the diagram at
// the top of the .cc file for more information. // the top of the .cc file for more information.
@ -144,9 +163,6 @@ class SincResampler {
float* const r2_; float* const r2_;
float* r3_; float* r3_;
float* r4_; float* r4_;
SincResampler(SincResampler const&) = delete;
SincResampler& operator=(SincResampler const&) = delete;
}; };
} // namespace base } // namespace base