mirror of https://github.com/auygun/kaliber.git
Update SincResampler
This commit is contained in:
parent
f723513521
commit
8c05aae086
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
||||
// Copyright 2012 The Chromium Authors
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
//
|
||||
|
@ -82,15 +82,19 @@
|
|||
#include "base/log.h"
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
||||
#include <xmmintrin.h>
|
||||
#define CONVOLVE_FUNC Convolve_SSE
|
||||
#include <immintrin.h>
|
||||
// Including these headers directly should generally be avoided. Since
|
||||
// Chrome is compiled with -msse3 (the minimal requirement), we include the
|
||||
// headers directly to make the intrinsics available.
|
||||
#include <avx2intrin.h>
|
||||
#include <avxintrin.h>
|
||||
#include <fmaintrin.h>
|
||||
#elif defined(_M_ARM64) || defined(__aarch64__)
|
||||
#include <arm_neon.h>
|
||||
#define CONVOLVE_FUNC Convolve_NEON
|
||||
#else
|
||||
#define CONVOLVE_FUNC Convolve_C
|
||||
#endif
|
||||
|
||||
namespace base {
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr double kPiDouble = 3.14159265358979323846;
|
||||
|
@ -109,7 +113,7 @@ class ScopedSubnormalFloatDisabler {
|
|||
ScopedSubnormalFloatDisabler(const ScopedSubnormalFloatDisabler&) = delete;
|
||||
|
||||
~ScopedSubnormalFloatDisabler() {
|
||||
#if defined(ARCH_CPU_X86_FAMILY)
|
||||
#if defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
||||
_mm_setcsr(orig_state_);
|
||||
#endif
|
||||
}
|
||||
|
@ -145,27 +149,52 @@ int CalculateChunkSize(int block_size_, double io_ratio) {
|
|||
|
||||
} // namespace
|
||||
|
||||
namespace base {
|
||||
// If we know the minimum architecture at compile time, avoid CPU detection.
// Stores the convolution routine to use in |convolve_proc_|. The choice is
// made purely by the preprocessor checks below; nothing is probed at runtime.
|
||||
void SincResampler::InitializeCPUSpecificFeatures() {
|
||||
#if defined(_M_ARM64) || defined(__aarch64__)
|
||||
// 64-bit ARM builds use the NEON routine.
convolve_proc_ = Convolve_NEON;
|
||||
#elif defined(_M_X64) || defined(__x86_64__) || defined(__i386__)
|
||||
// The runtime AVX2/FMA3/SSE2 selection below is compiled out (|cpu| is not
// declared in this function), so Convolve_AVX2 is never chosen here.
#if 0 // TODO
|
||||
// Using AVX2 instead of SSE2 when AVX2/FMA3 supported.
|
||||
if (cpu.has_avx2() && cpu.has_fma3())
|
||||
convolve_proc_ = Convolve_AVX2;
|
||||
else if (cpu.has_sse2())
|
||||
convolve_proc_ = Convolve_SSE;
|
||||
else
|
||||
convolve_proc_ = Convolve_C;
|
||||
#endif
|
||||
// With detection disabled, x86 builds unconditionally use the SSE routine.
convolve_proc_ = Convolve_SSE;
|
||||
#else
|
||||
// Unknown architecture.
|
||||
convolve_proc_ = Convolve_C;
|
||||
#endif
|
||||
}
|
||||
|
||||
SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames)
|
||||
: io_sample_rate_ratio_(io_sample_rate_ratio),
|
||||
request_frames_(request_frames),
|
||||
input_buffer_size_(request_frames_ + kKernelSize),
|
||||
// Create input buffers with a 16-byte alignment for SSE optimizations.
|
||||
// Create input buffers with a 32-byte alignment for SIMD optimizations.
|
||||
kernel_storage_(static_cast<float*>(
|
||||
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
|
||||
base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
|
||||
kernel_pre_sinc_storage_(static_cast<float*>(
|
||||
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
|
||||
base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
|
||||
kernel_window_storage_(static_cast<float*>(
|
||||
base::AlignedAlloc<16>(sizeof(float) * kKernelStorageSize))),
|
||||
base::AlignedAlloc<32>(sizeof(float) * kKernelStorageSize))),
|
||||
input_buffer_(static_cast<float*>(
|
||||
base::AlignedAlloc<16>(sizeof(float) * input_buffer_size_))),
|
||||
base::AlignedAlloc<32>(sizeof(float) * input_buffer_size_))),
|
||||
r1_(input_buffer_.get()),
|
||||
r2_(input_buffer_.get() + kKernelSize / 2) {
|
||||
DCHECK(request_frames_ > 0);
|
||||
CHECK(request_frames > kKernelSize * 3 / 2)
|
||||
<< "request_frames must be greater than 1.5 kernels to allow sufficient "
|
||||
"data for resampling";
|
||||
// This means that after the first call to Flush we will have
|
||||
// block_size_ > kKernelSize and r2_ < r3_.
|
||||
|
||||
InitializeCPUSpecificFeatures();
|
||||
DCHECK(convolve_proc_);
|
||||
CHECK(request_frames_ > 0);
|
||||
Flush();
|
||||
DCHECK(block_size_ > kKernelSize)
|
||||
<< "block_size must be greater than kKernelSize!";
|
||||
|
||||
memset(kernel_storage_.get(), 0,
|
||||
sizeof(*kernel_storage_.get()) * kKernelStorageSize);
|
||||
|
@ -213,14 +242,14 @@ void SincResampler::InitializeKernel() {
|
|||
for (int i = 0; i < kKernelSize; ++i) {
|
||||
const int idx = i + offset_idx * kKernelSize;
|
||||
const float pre_sinc =
|
||||
kPiFloat * (i - kKernelSize / 2 - subsample_offset);
|
||||
base::kPiFloat * (i - kKernelSize / 2 - subsample_offset);
|
||||
kernel_pre_sinc_storage_[idx] = pre_sinc;
|
||||
|
||||
// Compute Blackman window, matching the offset of the sinc().
|
||||
const float x = (i - subsample_offset) / kKernelSize;
|
||||
const float window =
|
||||
static_cast<float>(kA0 - kA1 * cos(2.0 * kPiDouble * x) +
|
||||
kA2 * cos(4.0 * kPiDouble * x));
|
||||
static_cast<float>(kA0 - kA1 * cos(2.0 * base::kPiDouble * x) +
|
||||
kA2 * cos(4.0 * base::kPiDouble * x));
|
||||
kernel_window_storage_[idx] = window;
|
||||
|
||||
// Compute the sinc with offset, then window the sinc() function and store
|
||||
|
@ -286,10 +315,10 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
|
|||
const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
|
||||
const float* k2 = k1 + kKernelSize;
|
||||
|
||||
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always
|
||||
// be true so long as kKernelSize is a multiple of 16.
|
||||
DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F));
|
||||
DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F));
|
||||
// Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always
|
||||
// be true so long as kKernelSize is a multiple of 32.
|
||||
// The mask must be parenthesized: `==` binds tighter than `&`, so without
// the parentheses the pointer would be compared to 0 first and the check
// would fail for every non-null pointer.
DCHECK(0u == (reinterpret_cast<uintptr_t>(k1) & 0x1F));
|
||||
DCHECK(0u == (reinterpret_cast<uintptr_t>(k2) & 0x1F));
|
||||
|
||||
// Initialize input pointer based on quantized |virtual_source_idx_|.
|
||||
const float* input_ptr = r1_ + source_idx;
|
||||
|
@ -298,7 +327,7 @@ void SincResampler::Resample(int frames, float* destination, ReadCB read_cb) {
|
|||
const double kernel_interpolation_factor =
|
||||
virtual_offset_idx - offset_idx;
|
||||
*destination++ =
|
||||
CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
|
||||
convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
|
||||
|
||||
// Advance the virtual index.
|
||||
virtual_source_idx_ += io_sample_rate_ratio_;
|
||||
|
@ -413,6 +442,53 @@ float SincResampler::Convolve_SSE(const float* input_ptr,
|
|||
|
||||
return result;
|
||||
}
|
||||
|
||||
// AVX2/FMA variant of the convolution. Multiplies kKernelSize floats starting
// at |input_ptr| against the two kernels |k1| and |k2|, eight floats per
// iteration, then blends the two partial results with weights
// (1 - |kernel_interpolation_factor|) and |kernel_interpolation_factor| and
// returns the sum of all lanes.
// |k1| and |k2| are always read with _mm256_load_ps, so they must be 32-byte
// aligned; |input_ptr| may be unaligned. The loops step by 8, so kKernelSize
// is assumed to be a multiple of 8.
// The target attribute enables AVX2/FMA code generation for this one function
// (GCC/Clang syntax).
__attribute__((target("avx2,fma"))) float SincResampler::Convolve_AVX2(
|
||||
const float* input_ptr,
|
||||
const float* k1,
|
||||
const float* k2,
|
||||
double kernel_interpolation_factor) {
|
||||
__m256 m_input;
|
||||
// Eight running partial sums per kernel.
__m256 m_sums1 = _mm256_setzero_ps();
|
||||
__m256 m_sums2 = _mm256_setzero_ps();
|
||||
|
||||
// Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
|
||||
// these loops has not been tested or benchmarked.
|
||||
bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
|
||||
if (!aligned_input) {
|
||||
for (size_t i = 0; i < kKernelSize; i += 8) {
|
||||
m_input = _mm256_loadu_ps(input_ptr + i);
|
||||
m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
|
||||
m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
|
||||
}
|
||||
} else {
|
||||
// Same accumulation as above, but with an aligned load of the input.
for (size_t i = 0; i < kKernelSize; i += 8) {
|
||||
m_input = _mm256_load_ps(input_ptr + i);
|
||||
m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
|
||||
m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
|
||||
}
|
||||
}
|
||||
|
||||
// Linearly interpolate the two "convolutions".
|
||||
// First fold the upper 128-bit half of each accumulator onto the lower half.
__m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
|
||||
_mm256_extractf128_ps(m_sums1, 1));
|
||||
__m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
|
||||
_mm256_extractf128_ps(m_sums2, 1));
|
||||
// Weight the |k1| sums by (1 - factor) and the |k2| sums by factor.
m128_sums1 = _mm_mul_ps(
|
||||
m128_sums1,
|
||||
_mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
|
||||
m128_sums2 = _mm_mul_ps(
|
||||
m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
|
||||
m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
|
||||
|
||||
// Sum components together.
|
||||
float result;
|
||||
// Add the high pair of lanes onto the low pair, then add lane 1 to lane 0.
m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
|
||||
_mm_store_ss(&result, _mm_add_ss(m128_sums2,
|
||||
_mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
|
||||
|
||||
return result;
|
||||
}
|
||||
#elif defined(_M_ARM64) || defined(__aarch64__)
|
||||
float SincResampler::Convolve_NEON(const float* input_ptr,
|
||||
const float* k1,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
||||
// Copyright 2012 The Chromium Authors
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
|
@ -15,37 +15,41 @@ namespace base {
|
|||
// SincResampler is a high-quality single-channel sample-rate converter.
|
||||
class SincResampler {
|
||||
public:
|
||||
enum {
|
||||
// The kernel size can be adjusted for quality (higher is better) at the
|
||||
// expense of performance. Must be a multiple of 32.
|
||||
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
|
||||
kKernelSize = 32,
|
||||
static constexpr int kKernelSize = 32;
|
||||
|
||||
// Default request size. Affects how often and for how much SincResampler
|
||||
// calls back for input. Must be greater than kKernelSize.
|
||||
kDefaultRequestSize = 512,
|
||||
static constexpr int kDefaultRequestSize = 512;
|
||||
|
||||
// The kernel offset count is used for interpolation and is the number of
|
||||
// sub-sample kernel shifts. Can be adjusted for quality (higher is better)
|
||||
// at the expense of allocating more memory.
|
||||
kKernelOffsetCount = 32,
|
||||
kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
|
||||
};
|
||||
static constexpr int kKernelOffsetCount = 32;
|
||||
static constexpr int kKernelStorageSize =
|
||||
kKernelSize * (kKernelOffsetCount + 1);
|
||||
|
||||
// Callback type for providing more data into the resampler. Expects |frames|
|
||||
// of data to be rendered into |destination|; zero padded if not enough frames
|
||||
// are available to satisfy the request.
|
||||
typedef std::function<void(int frames, float* destination)> ReadCB;
|
||||
|
||||
// Constructs a SincResampler. |io_sample_rate_ratio| is the ratio
|
||||
// Constructs a SincResampler. Audio data for resampling is acquired through
|
||||
// the |read_cb| passed to Resample(). |io_sample_rate_ratio| is the ratio
|
||||
// of input / output sample rates. |request_frames| controls the size in
|
||||
// frames of the buffer requested by each |read_cb| call. The value must be
|
||||
// greater than kKernelSize. Specify kDefaultRequestSize if there are no
|
||||
// greater than 1.5*kKernelSize. Specify kDefaultRequestSize if there are no
|
||||
// request size constraints.
|
||||
SincResampler(double io_sample_rate_ratio, int request_frames);
|
||||
|
||||
SincResampler(const SincResampler&) = delete;
|
||||
SincResampler& operator=(const SincResampler&) = delete;
|
||||
|
||||
~SincResampler();
|
||||
|
||||
// Resample |frames| of data from |read_cb| into |destination|.
|
||||
// Resample |frames| of data from |read_cb| into |destination|.
|
||||
void Resample(int frames, float* destination, ReadCB read_cb);
|
||||
|
||||
// The maximum size in frames that guarantees Resample() will only make a
|
||||
|
@ -97,6 +101,10 @@ class SincResampler {
|
|||
const float* k1,
|
||||
const float* k2,
|
||||
double kernel_interpolation_factor);
|
||||
static float Convolve_AVX2(const float* input_ptr,
|
||||
const float* k1,
|
||||
const float* k2,
|
||||
double kernel_interpolation_factor);
|
||||
#elif defined(_M_ARM64) || defined(__aarch64__)
|
||||
static float Convolve_NEON(const float* input_ptr,
|
||||
const float* k1,
|
||||
|
@ -104,6 +112,10 @@ class SincResampler {
|
|||
double kernel_interpolation_factor);
|
||||
#endif
|
||||
|
||||
// Selects runtime specific CPU features like SSE. Must be called before
|
||||
// using SincResampler.
|
||||
void InitializeCPUSpecificFeatures();
|
||||
|
||||
// The ratio of input / output sample rates.
|
||||
double io_sample_rate_ratio_;
|
||||
|
||||
|
@ -130,12 +142,19 @@ class SincResampler {
|
|||
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
|
||||
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from
|
||||
// 0.0 to 1.0 sample.
|
||||
base::AlignedMemPtr<float[]> kernel_storage_;
|
||||
base::AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
|
||||
base::AlignedMemPtr<float[]> kernel_window_storage_;
|
||||
AlignedMemPtr<float[]> kernel_storage_;
|
||||
AlignedMemPtr<float[]> kernel_pre_sinc_storage_;
|
||||
AlignedMemPtr<float[]> kernel_window_storage_;
|
||||
|
||||
// Data from the source is copied into this buffer for each processing pass.
|
||||
base::AlignedMemPtr<float[]> input_buffer_;
|
||||
AlignedMemPtr<float[]> input_buffer_;
|
||||
|
||||
// Stores the runtime selection of which Convolve function to use.
|
||||
using ConvolveProc = float (*)(const float*,
|
||||
const float*,
|
||||
const float*,
|
||||
double);
|
||||
ConvolveProc convolve_proc_;
|
||||
|
||||
// Pointers to the various regions inside |input_buffer_|. See the diagram at
|
||||
// the top of the .cc file for more information.
|
||||
|
@ -144,9 +163,6 @@ class SincResampler {
|
|||
float* const r2_;
|
||||
float* r3_;
|
||||
float* r4_;
|
||||
|
||||
SincResampler(SincResampler const&) = delete;
|
||||
SincResampler& operator=(SincResampler const&) = delete;
|
||||
};
|
||||
|
||||
} // namespace base
|
||||
|
|
Loading…
Reference in New Issue