From 771f8a3d78aad1e74eeedf20ed5b1c2876a9bf97 Mon Sep 17 00:00:00 2001 From: Attila Uygun Date: Wed, 3 May 2023 20:21:26 +0200 Subject: [PATCH] Update SincResampler --- src/base/sinc_resampler.cc | 150 ++++++++++++++++++++++--------------- src/base/sinc_resampler.h | 67 +++++++++++------ 2 files changed, 131 insertions(+), 86 deletions(-) diff --git a/src/base/sinc_resampler.cc b/src/base/sinc_resampler.cc index 223afbe..140dc90 100644 --- a/src/base/sinc_resampler.cc +++ b/src/base/sinc_resampler.cc @@ -11,7 +11,7 @@ // <---------------------------------------------------------> // r0_ (during first load) // -// kKernelSize / 2 kKernelSize / 2 kKernelSize / 2 kKernelSize / 2 +// kernel_size_ / 2 kernel_size_ / 2 kernel_size_ / 2 kernel_size_ / 2 // <---------------> <---------------> <---------------> <---------------> // r1_ r2_ r3_ r4_ // @@ -22,8 +22,8 @@ // <------------------ ... -----------------> // r0_ (during second load) // -// On the second request r0_ slides to the right by kKernelSize / 2 and r3_, r4_ -// and block_size_ are reinitialized via step (3) in the algorithm below. +// On the second request r0_ slides to the right by kernel_size_ / 2 and r3_, +// r4_ and block_size_ are reinitialized via step (3) in the algorithm below. // // These new regions remain constant until a Flush() occurs. While complicated, // this allows us to reduce jitter by always requesting the same amount from the @@ -31,26 +31,27 @@ // // The algorithm: // -// 1) Allocate input_buffer of size: request_frames_ + kKernelSize; this ensures +// 1) Allocate input_buffer of size: request_frames_ + kernel_size_; this +// ensures // there's enough room to read request_frames_ from the callback into region // r0_ (which will move between the first and subsequent passes). // // 2) Let r1_, r2_ each represent half the kernel centered around r0_: // -// r0_ = input_buffer_ + kKernelSize / 2 +// r0_ = input_buffer_ + kernel_size_ / 2 // r1_ = input_buffer_ // r2_ = r0_ // -// r0_ is always request_frames_ in size. r1_, r2_ are kKernelSize / 2 in +// r0_ is always request_frames_ in size. r1_, r2_ are kernel_size_ / 2 in // size. r1_ must be zero initialized to avoid convolution with garbage (see // step (5) for why). // // 3) Let r3_, r4_ each represent half the kernel right aligned with the end of // r0_ and choose block_size_ as the distance in frames between r4_ and r2_: // -// r3_ = r0_ + request_frames_ - kKernelSize -// r4_ = r0_ + request_frames_ - kKernelSize / 2 -// block_size_ = r4_ - r2_ = request_frames_ - kKernelSize / 2 +// r3_ = r0_ + request_frames_ - kernel_size_ +// r4_ = r0_ + request_frames_ - kernel_size_ / 2 +// block_size_ = r4_ - r2_ = request_frames_ - kernel_size_ / 2 // // 4) Consume request_frames_ frames into r0_. // @@ -62,9 +63,9 @@ // // 7) If we're on the second load, in order to avoid overwriting the frames we // just wrapped from r4_ we need to slide r0_ to the right by the size of -// r4_, which is kKernelSize / 2: +// r4_, which is kernel_size_ / 2: // -// r0_ = r0_ + kKernelSize / 2 = input_buffer_ + kKernelSize +// r0_ = r0_ + kernel_size_ / 2 = input_buffer_ + kernel_size_ // // r3_, r4_, and block_size_ then need to be reinitialized, so goto (3). // @@ -127,7 +128,9 @@ class ScopedSubnormalFloatDisabler { #endif }; -double SincScaleFactor(double io_ratio) { +} // namespace + +static double SincScaleFactor(double io_ratio, int kernel_size) { // |sinc_scale_factor| is basically the normalized cutoff frequency of the // low-pass filter. 
 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;
@@ -136,19 +139,17 @@ double SincScaleFactor(double io_ratio) {
   // windowing it the transition from pass to stop does not happen right away.
   // So we should adjust the low pass filter cutoff slightly downward to avoid
   // some aliasing at the very high-end.
-  // TODO(crogers): this value is empirical and to be more exact should vary
-  // depending on kKernelSize.
-  sinc_scale_factor *= 0.9;
+  // Note: these values are derived empirically.
+  if (kernel_size == SincResampler::kMaxKernelSize) {
+    sinc_scale_factor *= 0.92;
+  } else {
+    DCHECK(kernel_size == SincResampler::kMinKernelSize);
+    sinc_scale_factor *= 0.90;
+  }
   return sinc_scale_factor;
 }
 
-int CalculateChunkSize(int block_size_, double io_ratio) {
-  return block_size_ / io_ratio;
-}
-
-}  // namespace
-
 // If we know the minimum architecture at compile time, avoid CPU detection.
 void SincResampler::InitializeCPUSpecificFeatures() {
 #if defined(_M_ARM64) || defined(__aarch64__)
@@ -170,26 +171,39 @@ void SincResampler::InitializeCPUSpecificFeatures() {
 #endif
 }
 
+static int CalculateChunkSize(int block_size_, double io_ratio) {
+  return block_size_ / io_ratio;
+}
+
+// static
+int SincResampler::KernelSizeFromRequestFrames(int request_frames) {
+  // We want `request_frames` to be more than 1.5 times the kernel size.
+  constexpr int kSmallKernelLimit = kMaxKernelSize * 3 / 2;
+  return request_frames <= kSmallKernelLimit ? kMinKernelSize : kMaxKernelSize;
+}
+
 SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
-    : io_sample_rate_ratio_(io_sample_rate_ratio),
+    : kernel_size_(KernelSizeFromRequestFrames(request_frames)),
+      kernel_storage_size_(kernel_size_ * (kKernelOffsetCount + 1)),
+      io_sample_rate_ratio_(io_sample_rate_ratio),
       request_frames_(request_frames),
-      input_buffer_size_(request_frames_ + kKernelSize),
+      input_buffer_size_(request_frames_ + kernel_size_),
       // Create input buffers with a 32-byte alignment for SIMD optimizations.
       kernel_storage_(static_cast<float*>(
-          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kernel_storage_size_))),
       kernel_pre_sinc_storage_(static_cast<float*>(
-          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kernel_storage_size_))),
       kernel_window_storage_(static_cast<float*>(
-          base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
+          base::AlignedAlloc<32>(sizeof(float) * kernel_storage_size_))),
       input_buffer_(static_cast<float*>(
          base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))),
       r1_(input_buffer_.get()),
-      r2_(input_buffer_.get() + kKernelSize / 2) {
-  CHECK(request_frames > kKernelSize * 3 / 2)
+      r2_(input_buffer_.get() + kernel_size_ / 2) {
+  CHECK(request_frames > kernel_size_ * 3 / 2)
       << "request_frames must be greater than 1.5 kernels to allow sufficient "
         "data for resampling";
   // This means that after the first call to Flush we will have
-  // block_size_ > kKernelSize and r2_ < r3_.
+  // block_size_ > kernel_size_ and r2_ < r3_.
InitializeCPUSpecificFeatures(); DCHECK(convolve_proc_); @@ -197,11 +211,11 @@ SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames) Flush(); memset(kernel_storage_.get(), 0, - sizeof(*kernel_storage_.get()) * kKernelStorageSize); + sizeof(*kernel_storage_.get()) * kernel_storage_size_); memset(kernel_pre_sinc_storage_.get(), 0, - sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize); + sizeof(*kernel_pre_sinc_storage_.get()) * kernel_storage_size_); memset(kernel_window_storage_.get(), 0, - sizeof(*kernel_window_storage_.get()) * kKernelStorageSize); + sizeof(*kernel_window_storage_.get()) * kernel_storage_size_); InitializeKernel(); } @@ -210,10 +224,10 @@ SincResampler::~SincResampler() = default; void SincResampler::UpdateRegions(bool second_load) { // Setup various region pointers in the buffer (see diagram above). If we're - // on the second load we need to slide r0_ to the right by kKernelSize / 2. - r0_ = input_buffer_.get() + (second_load ? kKernelSize : kKernelSize / 2); - r3_ = r0_ + request_frames_ - kKernelSize; - r4_ = r0_ + request_frames_ - kKernelSize / 2; + // on the second load we need to slide r0_ to the right by kernel_size_ / 2. + r0_ = input_buffer_.get() + (second_load ? kernel_size_ : kernel_size_ / 2); + r3_ = r0_ + request_frames_ - kernel_size_; + r4_ = r0_ + request_frames_ - kernel_size_ / 2; block_size_ = r4_ - r2_; chunk_size_ = CalculateChunkSize(block_size_, io_sample_rate_ratio_); @@ -234,19 +248,20 @@ void SincResampler::InitializeKernel() { // Generates a set of windowed sinc() kernels. // We generate a range of sub-sample offsets from 0.0 to 1.0. - const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); + const double sinc_scale_factor = + SincScaleFactor(io_sample_rate_ratio_, kernel_size_); for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { const float subsample_offset = static_cast(offset_idx) / kKernelOffsetCount; - for (int i = 0; i < kKernelSize; ++i) { - const int idx = i + offset_idx * kKernelSize; + for (int i = 0; i < kernel_size_; ++i) { + const int idx = i + offset_idx * kernel_size_; const float pre_sinc = - base::kPiFloat * (i - kKernelSize / 2 - subsample_offset); + base::kPiFloat * (i - kernel_size_ / 2 - subsample_offset); kernel_pre_sinc_storage_[idx] = pre_sinc; // Compute Blackman window, matching the offset of the sinc(). - const float x = (i - subsample_offset) / kKernelSize; + const float x = (i - subsample_offset) / kernel_size_; const float window = static_cast(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) + kA2 * cos(4.0 * base::kPiDouble * x)); @@ -272,10 +287,11 @@ void SincResampler::SetRatio(double io_sample_rate_ratio) { // Optimize reinitialization by reusing values which are independent of // |sinc_scale_factor|. Provides a 3x speedup. - const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); + const double sinc_scale_factor = + SincScaleFactor(io_sample_rate_ratio_, kernel_size_); for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { - for (int i = 0; i < kKernelSize; ++i) { - const int idx = i + offset_idx * kKernelSize; + for (int i = 0; i < kernel_size_; ++i) { + const int idx = i + offset_idx * kernel_size_; const float window = kernel_window_storage_[idx]; const float pre_sinc = kernel_pre_sinc_storage_[idx]; @@ -312,13 +328,13 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) { // We'll compute "convolutions" for the two kernels which straddle // |virtual_source_idx_|. 
-    const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
-    const float* k2 = k1 + kKernelSize;
+    const float* k1 = kernel_storage_.get() + offset_idx * kernel_size_;
+    const float* k2 = k1 + kernel_size_;
 
     // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always
     // be true so long as kKernelSize is a multiple of 32.
-    DCHECK(0u == reinterpret_cast<uintptr_t>(k1) & 0x1F);
-    DCHECK(0u == reinterpret_cast<uintptr_t>(k2) & 0x1F);
+    DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x1F));
+    DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x1F));
 
     // Initialize input pointer based on quantized |virtual_source_idx_|.
     const float* input_ptr = r1_ + source_idx;
@@ -326,13 +342,14 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
 
     // Figure out how much to weight each kernel's "convolution".
     const double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
-    *destination++ =
-        convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
+    *destination++ = convolve_proc_(kernel_size_, input_ptr, k1, k2,
+                                    kernel_interpolation_factor);
 
     // Advance the virtual index.
     virtual_source_idx_ += io_sample_rate_ratio_;
-    if (!--remaining_frames)
+    if (!--remaining_frames) {
       return;
+    }
   }
 }
 
@@ -342,11 +359,12 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
 
     // Step (3) -- Copy r3_, r4_ to r1_, r2_.
     // This wraps the last input frames back to the start of the buffer.
-    memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize);
+    memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kernel_size_);
 
     // Step (4) -- Reinitialize regions if necessary.
-    if (r0_ == r2_)
+    if (r0_ == r2_) {
       UpdateRegions(true);
+    }
 
     // Step (5) -- Refresh the buffer with more input.
     read_cb(request_frames_, r0_);
@@ -381,7 +399,12 @@ double SincResampler::BufferedFrames() const {
   return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0;
 }
 
-float SincResampler::Convolve_C(const float* input_ptr,
+int SincResampler::KernelSize() const {
+  return kernel_size_;
+}
+
+float SincResampler::Convolve_C(const int kernel_size,
+                                const float* input_ptr,
                                 const float* k1,
                                 const float* k2,
                                 double kernel_interpolation_factor) {
@@ -390,7 +413,7 @@ float SincResampler::Convolve_C(const float* input_ptr,
   // Generate a single output sample. Unrolling this loop hurt performance in
   // local testing.
-  int n = kKernelSize;
+  int n = kernel_size;
   while (n--) {
     sum1 += *input_ptr * *k1++;
     sum2 += *input_ptr++ * *k2++;
   }
@@ -402,7 +425,8 @@ float SincResampler::Convolve_C(const float* input_ptr,
 }
 
 #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
-float SincResampler::Convolve_SSE(const float* input_ptr,
+float SincResampler::Convolve_SSE(const int kernel_size,
+                                  const float* input_ptr,
                                   const float* k1,
                                   const float* k2,
                                   double kernel_interpolation_factor) {
@@ -413,13 +437,13 @@ float SincResampler::Convolve_SSE(const float* input_ptr,
   // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
   // these loops hurt performance in local testing.
   if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
-    for (int i = 0; i < kKernelSize; i += 4) {
+    for (int i = 0; i < kernel_size; i += 4) {
       m_input = _mm_loadu_ps(input_ptr + i);
       m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
       m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
     }
   } else {
-    for (int i = 0; i < kKernelSize; i += 4) {
+    for (int i = 0; i < kernel_size; i += 4) {
       m_input = _mm_load_ps(input_ptr + i);
       m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
       m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
@@ -444,6 +468,7 @@ float SincResampler::Convolve_SSE(const float* input_ptr,
 }
 
 __attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
+    const int kernel_size,
     const float* input_ptr,
     const float* k1,
     const float* k2,
@@ -456,13 +481,13 @@ __attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
   // these loops has not been tested or benchmarked.
   bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
   if (!aligned_input) {
-    for (size_t i = 0; i < kKernelSize; i += 8) {
+    for (size_t i = 0; i < static_cast<size_t>(kernel_size); i += 8) {
       m_input = _mm256_loadu_ps(input_ptr + i);
       m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
       m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
     }
   } else {
-    for (size_t i = 0; i < kKernelSize; i += 8) {
+    for (size_t i = 0; i < static_cast<size_t>(kernel_size); i += 8) {
       m_input = _mm256_load_ps(input_ptr + i);
       m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
       m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
@@ -490,7 +515,8 @@ __attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
   return result;
 }
 #elif defined(_M_ARM64) || defined(__aarch64__)
-float SincResampler::Convolve_NEON(const float* input_ptr,
+float SincResampler::Convolve_NEON(const int kernel_size,
+                                   const float* input_ptr,
                                    const float* k1,
                                    const float* k2,
                                    double kernel_interpolation_factor) {
@@ -498,7 +524,7 @@ float SincResampler::Convolve_NEON(const float* input_ptr,
   float32x4_t m_sums1 = vmovq_n_f32(0);
   float32x4_t m_sums2 = vmovq_n_f32(0);
 
-  const float* upper = input_ptr + kKernelSize;
+  const float* upper = input_ptr + kernel_size;
   for (; input_ptr < upper;) {
     m_input = vld1q_f32(input_ptr);
     input_ptr += 4;
diff --git a/src/base/sinc_resampler.h b/src/base/sinc_resampler.h
index cef5e5e..b94f398 100644
--- a/src/base/sinc_resampler.h
+++ b/src/base/sinc_resampler.h
@@ -16,32 +16,40 @@ namespace base {
 class SincResampler {
  public:
   // The kernel size can be adjusted for quality (higher is better) at the
-  // expense of performance. Must be a multiple of 32.
-  // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
-  static constexpr int kKernelSize = 32;
+  // expense of performance. Must be a multiple of 32. We aim for 64 for
+  // perceptible audio quality (see crbug.com/1407622), but fall back to 32 in
+  // cases where `request_frames_` is too small (e.g. 10ms of 8kHz audio).
+  // Use SincResampler::KernelSize() to check which size is being used.
+  static constexpr int kMaxKernelSize = 64;
+  static constexpr int kMinKernelSize = 32;
 
   // Default request size. Affects how often and for how much SincResampler
-  // calls back for input. Must be greater than kKernelSize.
+  // calls back for input. Must be greater than 1.5 * `kernel_size_`.
   static constexpr int kDefaultRequestSize = 512;
 
+  // A smaller request size, which still allows higher quality resampling by
+  // guaranteeing we will use kMaxKernelSize.
+  static constexpr int kSmallRequestSize = kMaxKernelSize * 2;
+
   // The kernel offset count is used for interpolation and is the number of
   // sub-sample kernel shifts. Can be adjusted for quality (higher is better)
   // at the expense of allocating more memory.
   static constexpr int kKernelOffsetCount = 32;
-  static constexpr int kKernelStorageSize =
-      kKernelSize * (kKernelOffsetCount + 1);
 
   // Callback type for providing more data into the resampler. Expects |frames|
   // of data to be rendered into |destination|; zero padded if not enough frames
   // are available to satisfy the request.
   typedef std::function<void(int frames, float* destination)> ReadCB;
 
+  // Returns the kernel size which will be used for a given `request_frames`.
+  static int KernelSizeFromRequestFrames(int request_frames);
+
   // Constructs a SincResampler with the specified |read_cb|, which is used to
   // acquire audio data for resampling. |io_sample_rate_ratio| is the ratio
   // of input / output sample rates. |request_frames| controls the size in
   // frames of the buffer requested by each |read_cb| call. The value must be
-  // greater than 1.5*kKernelSize. Specify kDefaultRequestSize if there are no
-  // request size constraints.
+  // greater than 1.5*`kernel_size_`. Specify kDefaultRequestSize if there are
+  // no request size constraints.
   SincResampler(double io_sample_rate_ratio, int request_frames);
 
   SincResampler(const SincResampler&) = delete;
@@ -52,10 +60,10 @@ class SincResampler {
   // Resample |frames| of data from |read_cb_| into |destination|.
   void Resample(int frames, float* destination, ReadCB read_cb);
 
-  // The maximum size in frames that guarantees Resample() will only make a
-  // single call to |read_cb_| for more data. Note: If PrimeWithSilence() is
+  // The maximum size in output frames that guarantees Resample() will only make
+  // a single call to |read_cb_| for more data. Note: If PrimeWithSilence() is
   // not called, chunk size will grow after the first two Resample() calls by
-  // kKernelSize / (2 * io_sample_rate_ratio). See the .cc file for details.
+  // `kernel_size_` / (2 * io_sample_rate_ratio). See the .cc file for details.
   int ChunkSize() const { return chunk_size_; }
 
   // Returns the max number of frames that could be requested (via multiple
@@ -77,13 +85,19 @@ class SincResampler {
   // Resample() is in progress.
   void SetRatio(double io_sample_rate_ratio);
 
-  float* get_kernel_for_testing() { return kernel_storage_.get(); }
-
   // Return number of input frames consumed by a callback but not yet processed.
   // Since input/output ratio can be fractional, so can this value.
   // Zero before first call to Resample().
   double BufferedFrames() const;
 
+  // Return the actual kernel size used by the resampler. Should be
+  // kMaxKernelSize most of the time, but varies based on `request_frames_`.
+  int KernelSize() const;
+
+  float* get_kernel_for_testing() { return kernel_storage_.get(); }
+
+  int kernel_storage_size_for_testing() { return kernel_storage_size_; }
+
  private:
   void InitializeKernel();
   void UpdateRegions(bool second_load);
@@ -92,21 +106,25 @@ class SincResampler {
   // linearly interpolated using |kernel_interpolation_factor|. On x86, the
   // underlying implementation is chosen at run time based on SSE support. On
   // ARM, NEON support is chosen at compile time based on compilation flags.
- static float Convolve_C(const float* input_ptr, + static float Convolve_C(const int kernel_size, + const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); #if defined(_M_X64) || defined(__x86_64__) || defined(__i386__) - static float Convolve_SSE(const float* input_ptr, + static float Convolve_SSE(const int kernel_size, + const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); - static float Convolve_AVX2(const float* input_ptr, + static float Convolve_AVX2(const int kernel_size, + const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); #elif defined(_M_ARM64) || defined(__aarch64__) - static float Convolve_NEON(const float* input_ptr, + static float Convolve_NEON(const int kernel_size, + const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); @@ -116,6 +134,9 @@ class SincResampler { // using SincResampler. void InitializeCPUSpecificFeatures(); + const int kernel_size_; + const int kernel_storage_size_; + // The ratio of input / output sample rates. double io_sample_rate_ratio_; @@ -139,9 +160,9 @@ class SincResampler { // The size (in samples) of the internal buffer used by the resampler. const int input_buffer_size_; - // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize. - // The kernel offsets are sub-sample shifts of a windowed sinc shifted from - // 0.0 to 1.0 sample. + // Contains kKernelOffsetCount kernels back-to-back, each of size + // `kernel_size_`. The kernel offsets are sub-sample shifts of a windowed sinc + // shifted from 0.0 to 1.0 sample. AlignedMemPtr kernel_storage_; AlignedMemPtr kernel_pre_sinc_storage_; AlignedMemPtr kernel_window_storage_; @@ -150,10 +171,8 @@ class SincResampler { AlignedMemPtr input_buffer_; // Stores the runtime selection of which Convolve function to use. - using ConvolveProc = float (*)(const float*, - const float*, - const float*, - double); + using ConvolveProc = + float (*)(const int, const float*, const float*, const float*, double); ConvolveProc convolve_proc_; // Pointers to the various regions inside |input_buffer_|. See the diagram at
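
Reviewer note (not part of the patch): a minimal usage sketch of the API after this change. Only the SincResampler calls come from the patch; the include path, the 48 kHz -> 44.1 kHz ratio, and the sine-wave source below are illustrative assumptions.

  #include <cmath>
  #include <cstdio>
  #include <vector>

  #include "base/sinc_resampler.h"  // Assumed include path for src/base/sinc_resampler.h.

  int main() {
    // Ratio of input to output sample rates, e.g. 48 kHz input -> 44.1 kHz output.
    const double io_ratio = 48000.0 / 44100.0;

    // kDefaultRequestSize (512) is well above kMaxKernelSize * 3 / 2 (96), so
    // KernelSizeFromRequestFrames() selects the 64-tap kernel; a request of 96
    // frames or fewer would fall back to the 32-tap kernel.
    base::SincResampler resampler(io_ratio,
                                  base::SincResampler::kDefaultRequestSize);
    std::printf("kernel size: %d\n", resampler.KernelSize());  // Expect 64.

    // Resampling at most ChunkSize() frames guarantees a single read callback.
    std::vector<float> output(resampler.ChunkSize());
    double phase = 0.0;
    resampler.Resample(
        static_cast<int>(output.size()), output.data(),
        [&phase](int frames, float* destination) {
          // Fill exactly |frames| samples; a real source would zero-pad once
          // it runs out of data.
          for (int i = 0; i < frames; ++i) {
            destination[i] = static_cast<float>(std::sin(phase));
            phase += 2.0 * 3.14159265358979 * 440.0 / 48000.0;
          }
        });
    return 0;
  }

Callers that need a small request while still getting the 64-tap kernel can pass kSmallRequestSize (128), which also satisfies the constructor's request_frames > 1.5 * kernel_size check.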