mirror of https://github.com/auygun/kaliber.git
Update SincResampler
This commit is contained in:
parent
f723513521
commit
8c05aae086
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
// Copyright 2012 The Chromium Authors
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file.
|
// found in the LICENSE file.
|
||||||
//
|
//
|
||||||
|
@ -82,15 +82,19 @@
|
||||||
#include "base/log.h"
|
#include "base/log.h"
|
||||||
|
|
||||||
#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
||||||
#include <xmmintrin.h>
|
#include <immintrin.h>
|
||||||
#define CONVOLVE_FUNC Convolve_SSE
|
// Including these headers directly should generally be avoided. Since
|
||||||
|
// Chrome is compiled with -msse3 (the minimal requirement), we include the
|
||||||
|
// headers directly to make the intrinsics available.
|
||||||
|
#include <avx2intrin.h>
|
||||||
|
#include <avxintrin.h>
|
||||||
|
#include <fmaintrin.h>
|
||||||
#elif defined(_M_ARM64) || defined(__aarch64__)
|
#elif defined(_M_ARM64) || defined(__aarch64__)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#define CONVOLVE_FUNC Convolve_NEON
|
|
||||||
#else
|
|
||||||
#define CONVOLVE_FUNC Convolve_C
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
namespace base {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
constexpr double kPiDouble = 3.14159265358979323846;
|
constexpr double kPiDouble = 3.14159265358979323846;
|
||||||
|
@ -109,7 +113,7 @@ class ScopedSubnormalFloatDisabler {
|
||||||
ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete;
|
ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete;
|
||||||
|
|
||||||
~ScopedSubnormalFloatDisabler() {
|
~ScopedSubnormalFloatDisabler() {
|
||||||
#if defined(ARCH_CPU_X86_FAMILY)
|
#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
||||||
_mm_setcsr(orig_state_);
|
_mm_setcsr(orig_state_);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -145,27 +149,52 @@ int CalculateChunkSize(int block_size_, double io_ratio) {
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
namespace base {
|
// If we know the minimum architecture at compile time, avoid CPU detection.
|
||||||
|
void SincResampler::InitializeCPUSpecificFeatures() {
|
||||||
|
#if defined(_M_ARM64) || defined(__aarch64__)
|
||||||
|
convolve_proc_ = Convolve_NEON;
|
||||||
|
#elif defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
||||||
|
#if 0 // TODO
|
||||||
|
// Using AVX2 instead of SSE2 when AVX2/FMA3 supported.
|
||||||
|
if (cpu.has_avx2() && cpu.has_fma3())
|
||||||
|
convolve_proc_ = Convolve_AVX2;
|
||||||
|
else if (cpu.has_sse2())
|
||||||
|
convolve_proc_ = Convolve_SSE;
|
||||||
|
else
|
||||||
|
convolve_proc_ = Convolve_C;
|
||||||
|
#endif
|
||||||
|
convolve_proc_ = Convolve_SSE;
|
||||||
|
#else
|
||||||
|
// Unknown architecture.
|
||||||
|
convolve_proc_ = Convolve_C;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
|
SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
|
||||||
: io_sample_rate_ratio_(io_sample_rate_ratio),
|
: io_sample_rate_ratio_(io_sample_rate_ratio),
|
||||||
request_frames_(request_frames),
|
request_frames_(request_frames),
|
||||||
input_buffer_size_(request_frames_ + kKernelSize),
|
input_buffer_size_(request_frames_ + kKernelSize),
|
||||||
// Create input buffers with a 16-byte alignment for SSE optimizations.
|
// Create input buffers with a 32-byte alignment for SIMD optimizations.
|
||||||
kernel_storage_(static_cast<float*>(
|
kernel_storage_(static_cast<float*>(
|
||||||
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
|
base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
|
||||||
kernel_pre_sinc_storage_(static_cast<float*>(
|
kernel_pre_sinc_storage_(static_cast<float*>(
|
||||||
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
|
base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
|
||||||
kernel_window_storage_(static_cast<float*>(
|
kernel_window_storage_(static_cast<float*>(
|
||||||
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
|
base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
|
||||||
input_buffer_(static_cast<float*>(
|
input_buffer_(static_cast<float*>(
|
||||||
base::AlignedAlloc<16>(sizeof(float) * input_buffer_size_))),
|
base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))),
|
||||||
r1_(input_buffer_.get()),
|
r1_(input_buffer_.get()),
|
||||||
r2_(input_buffer_.get() + kKernelSize / 2) {
|
r2_(input_buffer_.get() + kKernelSize / 2) {
|
||||||
DCHECK(request_frames_ > 0);
|
CHECK(request_frames > kKernelSize * 3 / 2)
|
||||||
|
<< "request_frames must be greater than 1.5 kernels to allow sufficient "
|
||||||
|
"data for resampling";
|
||||||
|
// This means that after the first call to Flush we will have
|
||||||
|
// block_size_ > kKernelSize and r2_ < r3_.
|
||||||
|
|
||||||
|
InitializeCPUSpecificFeatures();
|
||||||
|
DCHECK(convolve_proc_);
|
||||||
|
CHECK(request_frames_ > 0);
|
||||||
Flush();
|
Flush();
|
||||||
DCHECK(block_size_ > kKernelSize)
|
|
||||||
<< "block_size must be greater than kKernelSize!";
|
|
||||||
|
|
||||||
memset(kernel_storage_.get(), 0,
|
memset(kernel_storage_.get(), 0,
|
||||||
sizeof(*kernel_storage_.get()) * kKernelStorageSize);
|
sizeof(*kernel_storage_.get()) * kKernelStorageSize);
|
||||||
|
@ -213,14 +242,14 @@ void SincResampler::InitializeKernel() {
|
||||||
for (int i = 0; i < kKernelSize; ++i) {
|
for (int i = 0; i < kKernelSize; ++i) {
|
||||||
const int idx = i + offset_idx * kKernelSize;
|
const int idx = i + offset_idx * kKernelSize;
|
||||||
const float pre_sinc =
|
const float pre_sinc =
|
||||||
kPiFloat * (i - kKernelSize / 2 - subsample_offset);
|
base::kPiFloat * (i - kKernelSize / 2 - subsample_offset);
|
||||||
kernel_pre_sinc_storage_[idx] = pre_sinc;
|
kernel_pre_sinc_storage_[idx] = pre_sinc;
|
||||||
|
|
||||||
// Compute Blackman window, matching the offset of the sinc().
|
// Compute Blackman window, matching the offset of the sinc().
|
||||||
const float x = (i - subsample_offset) / kKernelSize;
|
const float x = (i - subsample_offset) / kKernelSize;
|
||||||
const float window =
|
const float window =
|
||||||
static_cast<float>(kA0 - kA1 * cos(2.0 * kPiDouble * x) +
|
static_cast<float>(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) +
|
||||||
kA2 * cos(4.0 * kPiDouble * x));
|
kA2 * cos(4.0 * base::kPiDouble * x));
|
||||||
kernel_window_storage_[idx] = window;
|
kernel_window_storage_[idx] = window;
|
||||||
|
|
||||||
// Compute the sinc with offset, then window the sinc() function and store
|
// Compute the sinc with offset, then window the sinc() function and store
|
||||||
|
@ -286,10 +315,10 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
|
||||||
const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
|
const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
|
||||||
const float* k2 = k1 + kKernelSize;
|
const float* k2 = k1 + kKernelSize;
|
||||||
|
|
||||||
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always
|
// Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always
|
||||||
// be true so long as kKernelSize is a multiple of 16.
|
// be true so long as kKernelSize is a multiple of 32.
|
||||||
DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F));
|
DCHECK(0u == reinterpret_cast<uintptr_t>(k1) & 0x1F);
|
||||||
DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F));
|
DCHECK(0u == reinterpret_cast<uintptr_t>(k2) & 0x1F);
|
||||||
|
|
||||||
// Initialize input pointer based on quantized |virtual_source_idx_|.
|
// Initialize input pointer based on quantized |virtual_source_idx_|.
|
||||||
const float* input_ptr = r1_ + source_idx;
|
const float* input_ptr = r1_ + source_idx;
|
||||||
|
@ -298,7 +327,7 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
|
||||||
const double kernel_interpolation_factor =
|
const double kernel_interpolation_factor =
|
||||||
virtual_offset_idx - offset_idx;
|
virtual_offset_idx - offset_idx;
|
||||||
*destination++ =
|
*destination++ =
|
||||||
CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
|
convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
|
||||||
|
|
||||||
// Advance the virtual index.
|
// Advance the virtual index.
|
||||||
virtual_source_idx_ += io_sample_rate_ratio_;
|
virtual_source_idx_ += io_sample_rate_ratio_;
|
||||||
|
@ -413,6 +442,53 @@ float SincResampler::Convolve_SSE(const float* input_ptr,
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
|
||||||
|
const float* input_ptr,
|
||||||
|
const float* k1,
|
||||||
|
const float* k2,
|
||||||
|
double kernel_interpolation_factor) {
|
||||||
|
__m256 m_input;
|
||||||
|
__m256 m_sums1 = _mm256_setzero_ps();
|
||||||
|
__m256 m_sums2 = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
// Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
|
||||||
|
// these loops has not been tested or benchmarked.
|
||||||
|
bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
|
||||||
|
if (!aligned_input) {
|
||||||
|
for (size_t i = 0; i < kKernelSize; i += 8) {
|
||||||
|
m_input = _mm256_loadu_ps(input_ptr + i);
|
||||||
|
m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
|
||||||
|
m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < kKernelSize; i += 8) {
|
||||||
|
m_input = _mm256_load_ps(input_ptr + i);
|
||||||
|
m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
|
||||||
|
m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linearly interpolate the two "convolutions".
|
||||||
|
__m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
|
||||||
|
_mm256_extractf128_ps(m_sums1, 1));
|
||||||
|
__m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
|
||||||
|
_mm256_extractf128_ps(m_sums2, 1));
|
||||||
|
m128_sums1 = _mm_mul_ps(
|
||||||
|
m128_sums1,
|
||||||
|
_mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
|
||||||
|
m128_sums2 = _mm_mul_ps(
|
||||||
|
m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
|
||||||
|
m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
|
||||||
|
|
||||||
|
// Sum components together.
|
||||||
|
float result;
|
||||||
|
m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
|
||||||
|
_mm_store_ss(&result, _mm_add_ss(m128_sums2,
|
||||||
|
_mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
#elif defined(_M_ARM64) || defined(__aarch64__)
|
#elif defined(_M_ARM64) || defined(__aarch64__)
|
||||||
float SincResampler::Convolve_NEON(const float* input_ptr,
|
float SincResampler::Convolve_NEON(const float* input_ptr,
|
||||||
const float* k1,
|
const float* k1,
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
// Copyright 2012 The Chromium Authors
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file.
|
// found in the LICENSE file.
|
||||||
|
|
||||||
|
@ -15,37 +15,41 @@ namespace base {
|
||||||
// SincResampler is a high-quality single-channel sample-rate converter.
|
// SincResampler is a high-quality single-channel sample-rate converter.
|
||||||
class SincResampler {
|
class SincResampler {
|
||||||
public:
|
public:
|
||||||
enum {
|
// The kernel size can be adjusted for quality (higher is better) at the
|
||||||
// The kernel size can be adjusted for quality (higher is better) at the
|
// expense of performance. Must be a multiple of 32.
|
||||||
// expense of performance. Must be a multiple of 32.
|
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
|
||||||
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
|
static constexpr int kKernelSize = 32;
|
||||||
kKernelSize = 32,
|
|
||||||
|
|
||||||
// Default request size. Affects how often and for how much SincResampler
|
// Default request size. Affects how often and for how much SincResampler
|
||||||
// calls back for input. Must be greater than kKernelSize.
|
// calls back for input. Must be greater than kKernelSize.
|
||||||
kDefaultRequestSize = 512,
|
static constexpr int kDefaultRequestSize = 512;
|
||||||
|
|
||||||
// The kernel offset count is used for interpolation and is the number of
|
// The kernel offset count is used for interpolation and is the number of
|
||||||
// sub-sample kernel shifts. Can be adjusted for quality (higher is better)
|
// sub-sample kernel shifts. Can be adjusted for quality (higher is better)
|
||||||
// at the expense of allocating more memory.
|
// at the expense of allocating more memory.
|
||||||
kKernelOffsetCount = 32,
|
static constexpr int kKernelOffsetCount = 32;
|
||||||
kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
|
static constexpr int kKernelStorageSize =
|
||||||
};
|
kKernelSize * (kKernelOffsetCount + 1);
|
||||||
|
|
||||||
// Callback type for providing more data into the resampler. Expects |frames|
|
// Callback type for providing more data into the resampler. Expects |frames|
|
||||||
// of data to be rendered into |destination|; zero padded if not enough frames
|
// of data to be rendered into |destination|; zero padded if not enough frames
|
||||||
// are available to satisfy the request.
|
// are available to satisfy the request.
|
||||||
typedef std::function<void(int frames, float* destination)> ReadCB;
|
typedef std::function<void(int frames, float* destination)> ReadCB;
|
||||||
|
|
||||||
// Constructs a SincResampler. |io_sample_rate_ratio| is the ratio
|
// Constructs a SincResampler with the specified |read_cb|, which is used to
|
||||||
|
// acquire audio data for resampling. |io_sample_rate_ratio| is the ratio
|
||||||
// of input / output sample rates. |request_frames| controls the size in
|
// of input / output sample rates. |request_frames| controls the size in
|
||||||
// frames of the buffer requested by each |read_cb| call. The value must be
|
// frames of the buffer requested by each |read_cb| call. The value must be
|
||||||
// greater than kKernelSize. Specify kDefaultRequestSize if there are no
|
// greater than 1.5*kKernelSize. Specify kDefaultRequestSize if there are no
|
||||||
// request size constraints.
|
// request size constraints.
|
||||||
SincResampler(double io_sample_rate_ratio, int request_frames);
|
SincResampler(double io_sample_rate_ratio, int request_frames);
|
||||||
|
|
||||||
|
SincResampler(const SincResampler&) = delete;
|
||||||
|
SincResampler& operator=(const SincResampler&) = delete;
|
||||||
|
|
||||||
~SincResampler();
|
~SincResampler();
|
||||||
|
|
||||||
// Resample |frames| of data from |read_cb| into |destination|.
|
// Resample |frames| of data from |read_cb_| into |destination|.
|
||||||
void Resample(int frames, float* destination, ReadCB read_cb);
|
void Resample(int frames, float* destination, ReadCB read_cb);
|
||||||
|
|
||||||
// The maximum size in frames that guarantees Resample() will only make a
|
// The maximum size in frames that guarantees Resample() will only make a
|
||||||
|
@ -97,6 +101,10 @@ class SincResampler {
|
||||||
const float* k1,
|
const float* k1,
|
||||||
const float* k2,
|
const float* k2,
|
||||||
double kernel_interpolation_factor);
|
double kernel_interpolation_factor);
|
||||||
|
static float Convolve_AVX2(const float* input_ptr,
|
||||||
|
const float* k1,
|
||||||
|
const float* k2,
|
||||||
|
double kernel_interpolation_factor);
|
||||||
#elif defined(_M_ARM64) || defined(__aarch64__)
|
#elif defined(_M_ARM64) || defined(__aarch64__)
|
||||||
static float Convolve_NEON(const float* input_ptr,
|
static float Convolve_NEON(const float* input_ptr,
|
||||||
const float* k1,
|
const float* k1,
|
||||||
|
@ -104,6 +112,10 @@ class SincResampler {
|
||||||
double kernel_interpolation_factor);
|
double kernel_interpolation_factor);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Selects runtime specific CPU features like SSE. Must be called before
|
||||||
|
// using SincResampler.
|
||||||
|
void InitializeCPUSpecificFeatures();
|
||||||
|
|
||||||
// The ratio of input / output sample rates.
|
// The ratio of input / output sample rates.
|
||||||
double io_sample_rate_ratio_;
|
double io_sample_rate_ratio_;
|
||||||
|
|
||||||
|
@ -130,12 +142,19 @@ class SincResampler {
|
||||||
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
|
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
|
||||||
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from
|
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from
|
||||||
// 0.0 to 1.0 sample.
|
// 0.0 to 1.0 sample.
|
||||||
base::AlignedMemPtr<float[]> kernel_storage_;
|
AlignedMemPtr<float[]> kernel_storage_;
|
||||||
base::AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
|
AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
|
||||||
base::AlignedMemPtr<float[]> kernel_window_storage_;
|
AlignedMemPtr<float[]> kernel_window_storage_;
|
||||||
|
|
||||||
// Data from the source is copied into this buffer for each processing pass.
|
// Data from the source is copied into this buffer for each processing pass.
|
||||||
base::AlignedMemPtr<float[]> input_buffer_;
|
AlignedMemPtr<float[]> input_buffer_;
|
||||||
|
|
||||||
|
// Stores the runtime selection of which Convolve function to use.
|
||||||
|
using ConvolveProc = float (*)(const float*,
|
||||||
|
const float*,
|
||||||
|
const float*,
|
||||||
|
double);
|
||||||
|
ConvolveProc convolve_proc_;
|
||||||
|
|
||||||
// Pointers to the various regions inside |input_buffer_|. See the diagram at
|
// Pointers to the various regions inside |input_buffer_|. See the diagram at
|
||||||
// the top of the .cc file for more information.
|
// the top of the .cc file for more information.
|
||||||
|
@ -144,9 +163,6 @@ class SincResampler {
|
||||||
float* const r2_;
|
float* const r2_;
|
||||||
float* r3_;
|
float* r3_;
|
||||||
float* r4_;
|
float* r4_;
|
||||||
|
|
||||||
SincResampler(SincResampler const&) = delete;
|
|
||||||
SincResampler& operator=(SincResampler const&) = delete;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace base
|
} // namespace base
|
||||||
|
|
Loading…
Reference in New Issue