diff options
Diffstat (limited to 'utils/uhjdecoder.cpp')
-rw-r--r-- | utils/uhjdecoder.cpp | 515 |
1 files changed, 515 insertions, 0 deletions
diff --git a/utils/uhjdecoder.cpp b/utils/uhjdecoder.cpp new file mode 100644 index 00000000..3de2e40a --- /dev/null +++ b/utils/uhjdecoder.cpp @@ -0,0 +1,515 @@ +/* + * 2-channel UHJ Decoder + * + * Copyright (c) Chris Robinson <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config.h" + +#ifdef HAVE_SSE_INTRINSICS +#include <xmmintrin.h> +#elif defined(HAVE_NEON) +#include <arm_neon.h> +#endif + +#include <array> +#include <complex> +#include <cstring> +#include <memory> +#include <stddef.h> +#include <string> +#include <utility> +#include <vector> + +#include "albit.h" +#include "albyte.h" +#include "alcomplex.h" +#include "almalloc.h" +#include "alspan.h" +#include "opthelpers.h" + +#include "sndfile.h" + +#include "win_main_utf8.h" + + +struct FileDeleter { + void operator()(FILE *file) { fclose(file); } +}; +using FilePtr = std::unique_ptr<FILE,FileDeleter>; + +struct SndFileDeleter { + void operator()(SNDFILE *sndfile) { sf_close(sndfile); } +}; +using SndFilePtr = std::unique_ptr<SNDFILE,SndFileDeleter>; + + +using ubyte = unsigned char; +using ushort = unsigned short; +using uint = unsigned int; +using complex_d = std::complex<double>; + +using byte4 = std::array<al::byte,4>; + + +constexpr ubyte SUBTYPE_BFORMAT_FLOAT[]{ + 0x03, 0x00, 0x00, 0x00, 0x21, 0x07, 0xd3, 0x11, 0x86, 0x44, 0xc8, 0xc1, + 0xca, 0x00, 0x00, 0x00 +}; + +void fwrite16le(ushort val, FILE *f) +{ + ubyte data[2]{ static_cast<ubyte>(val&0xff), static_cast<ubyte>((val>>8)&0xff) }; + fwrite(data, 1, 2, f); +} + +void fwrite32le(uint val, FILE *f) +{ + ubyte data[4]{ static_cast<ubyte>(val&0xff), static_cast<ubyte>((val>>8)&0xff), + static_cast<ubyte>((val>>16)&0xff), static_cast<ubyte>((val>>24)&0xff) }; + fwrite(data, 1, 4, f); +} + +template<al::endian = al::endian::native> +byte4 f32AsLEBytes(const float &value) = delete; + +template<> +byte4 f32AsLEBytes<al::endian::little>(const float &value) +{ + byte4 ret{}; + std::memcpy(ret.data(), &value, 4); + return ret; +} +template<> +byte4 f32AsLEBytes<al::endian::big>(const float &value) +{ + byte4 ret{}; + std::memcpy(ret.data(), &value, 4); + std::swap(ret[0], ret[3]); + std::swap(ret[1], ret[2]); + return ret; +} + + +constexpr uint BufferLineSize{1024}; + +using FloatBufferLine = std::array<float,BufferLineSize>; +using FloatBufferSpan = al::span<float,BufferLineSize>; + + +struct UhjDecoder { + constexpr static size_t sFilterSize{128}; + + alignas(16) std::array<float,BufferLineSize+sFilterSize> mS{}; + alignas(16) std::array<float,BufferLineSize+sFilterSize> mD{}; + + /* History for the FIR filter. */ + alignas(16) std::array<float,sFilterSize-1> mDTHistory{}; + alignas(16) std::array<float,sFilterSize-1> mSHistory{}; + + alignas(16) std::array<float,BufferLineSize + sFilterSize*2> mTemp{}; + + void decode2(const float *RESTRICT InSamples, FloatBufferLine *OutSamples, + const size_t SamplesToDo); + + DEF_NEWDEL(UhjDecoder) +}; + +/* Same basic filter design as in core/uhjfilter.cpp. */ +template<size_t FilterSize> +struct PhaseShifterT { + static_assert((FilterSize&(FilterSize-1)) == 0, "FilterSize needs to be power-of-two"); + + alignas(16) std::array<float,FilterSize> Coeffs{}; + + PhaseShifterT() + { + constexpr size_t fft_size{FilterSize * 2}; + constexpr size_t half_size{fft_size / 2}; + + auto fftBuffer = std::make_unique<complex_d[]>(fft_size); + std::fill_n(fftBuffer.get(), fft_size, complex_d{}); + fftBuffer[half_size] = 1.0; + + forward_fft({fftBuffer.get(), fft_size}); + for(size_t i{0};i < half_size+1;++i) + fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()}; + for(size_t i{half_size+1};i < fft_size;++i) + fftBuffer[i] = std::conj(fftBuffer[fft_size - i]); + inverse_fft({fftBuffer.get(), fft_size}); + + auto fftiter = fftBuffer.get() + half_size + (FilterSize-1); + for(float &coeff : Coeffs) + { + coeff = static_cast<float>(fftiter->real() / double{fft_size}); + fftiter -= 2; + } + } +}; +const PhaseShifterT<UhjDecoder::sFilterSize> PShift{}; + +/* Mostly the same as in core/uhjfilter.cpp, except this overwrites the output + * instead of adding to it. + */ +void allpass_process(al::span<float> dst, const float *RESTRICT src) +{ +#ifdef HAVE_SSE_INTRINSICS + if(size_t todo{dst.size()>>1}) + { + auto *out = reinterpret_cast<__m64*>(dst.data()); + do { + __m128 r04{_mm_setzero_ps()}; + __m128 r14{_mm_setzero_ps()}; + for(size_t j{0};j < PShift.Coeffs.size();j+=4) + { + const __m128 coeffs{_mm_load_ps(&PShift.Coeffs[j])}; + const __m128 s0{_mm_loadu_ps(&src[j*2])}; + const __m128 s1{_mm_loadu_ps(&src[j*2 + 4])}; + + __m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))}; + r04 = _mm_add_ps(r04, _mm_mul_ps(s, coeffs)); + + s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1)); + r14 = _mm_add_ps(r14, _mm_mul_ps(s, coeffs)); + } + src += 2; + + __m128 r4{_mm_add_ps(_mm_unpackhi_ps(r04, r14), _mm_unpacklo_ps(r04, r14))}; + r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + + _mm_storel_pi(out, r4); + ++out; + } while(--todo); + } + if((dst.size()&1)) + { + __m128 r4{_mm_setzero_ps()}; + for(size_t j{0};j < PShift.Coeffs.size();j+=4) + { + const __m128 coeffs{_mm_load_ps(&PShift.Coeffs[j])}; + const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])}; + r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs)); + } + r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); + r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + + dst.back() = _mm_cvtss_f32(r4); + } + +#elif defined(HAVE_NEON) + + size_t pos{0}; + if(size_t todo{dst.size()>>1}) + { + auto shuffle_2020 = [](float32x4_t a, float32x4_t b) + { + float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))}; + ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1); + ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2); + ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3); + return ret; + }; + auto shuffle_3131 = [](float32x4_t a, float32x4_t b) + { + float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))}; + ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1); + ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2); + ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3); + return ret; + }; + auto unpacklo = [](float32x4_t a, float32x4_t b) + { + float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))}; + return vcombine_f32(result.val[0], result.val[1]); + }; + auto unpackhi = [](float32x4_t a, float32x4_t b) + { + float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))}; + return vcombine_f32(result.val[0], result.val[1]); + }; + do { + float32x4_t r04{vdupq_n_f32(0.0f)}; + float32x4_t r14{vdupq_n_f32(0.0f)}; + for(size_t j{0};j < PShift.Coeffs.size();j+=4) + { + const float32x4_t coeffs{vld1q_f32(&PShift.Coeffs[j])}; + const float32x4_t s0{vld1q_f32(&src[j*2])}; + const float32x4_t s1{vld1q_f32(&src[j*2 + 4])}; + + r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs); + r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs); + } + src += 2; + + float32x4_t r4{vaddq_f32(unpackhi(r04, r14), unpacklo(r04, r14))}; + float32x2_t r2{vadd_f32(vget_low_f32(r4), vget_high_f32(r4))}; + + vst1_f32(&dst[pos], r2); + pos += 2; + } while(--todo); + } + if((dst.size()&1)) + { + auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d) + { + float32x4_t ret{vmovq_n_f32(a)}; + ret = vsetq_lane_f32(b, ret, 1); + ret = vsetq_lane_f32(c, ret, 2); + ret = vsetq_lane_f32(d, ret, 3); + return ret; + }; + float32x4_t r4{vdupq_n_f32(0.0f)}; + for(size_t j{0};j < PShift.Coeffs.size();j+=4) + { + const float32x4_t coeffs{vld1q_f32(&PShift.Coeffs[j])}; + const float32x4_t s{load4(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])}; + r4 = vmlaq_f32(r4, s, coeffs); + } + r4 = vaddq_f32(r4, vrev64q_f32(r4)); + dst[pos] = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0); + } + +#else + + for(float &output : dst) + { + float ret{0.0f}; + for(size_t j{0};j < PShift.Coeffs.size();++j) + ret += src[j*2] * PShift.Coeffs[j]; + + output = ret; + ++src; + } +#endif +} + + +/* There is a difference with decoding 2-channel UHJ compared to 3-channel, due + * to 2-channel having lost some of the original signal (which can be recovered + * with 3-channel). The B-Format signal reconstructed from 2-channel UHJ should + * not be run through a normal B-Format decoder, as it needs different shelf + * filters (none? if I understand right, it should do an energy-optimized + * decode only). + * + * 2-channel UHJ decoding is done as: + * + * S = (Left + Right)/2.0 + * D = (Left - Right)/2.0 + * + * W = 0.982*S + j*0.164*D + * X = 0.419*S - j*0.828*D + * Y = 0.763*D + j*0.385*S + * + * where j is a +90 degree phase shift. + */ +void UhjDecoder::decode2(const float *RESTRICT InSamples, FloatBufferLine *OutSamples, + const size_t SamplesToDo) +{ + ASSUME(SamplesToDo > 0); + + float *woutput{OutSamples[0].data()}; + float *xoutput{OutSamples[1].data()}; + float *youtput{OutSamples[2].data()}; + + /* Add a delay to the input mid/side channels, to align it with the + * all-passed signal. + */ + + /* S = (Left + Right)/2.0 */ + for(size_t i{0};i < SamplesToDo;++i) + mS[sFilterSize+i] = (InSamples[i*2 + 0] + InSamples[i*2 + 1]) * 0.5f; + + /* D = (Left - Right)/2.0 */ + for(size_t i{0};i < SamplesToDo;++i) + mD[sFilterSize+i] = (InSamples[i*2 + 0] - InSamples[i*2 + 1]) * 0.5f; + + /* Precompute j*D and store in xoutput. */ + auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin()); + std::copy_n(mD.cbegin(), SamplesToDo+sFilterSize, tmpiter); + std::copy_n(mTemp.cbegin()+SamplesToDo, mDTHistory.size(), mDTHistory.begin()); + allpass_process({xoutput, SamplesToDo}, mTemp.data()); + + for(size_t i{0};i < SamplesToDo;++i) + { + /* W = 0.982*S + j*0.164*D */ + woutput[i] = 0.982f*mS[i] + 0.164f*xoutput[i]; + /* X = 0.419*S - j*0.828*D */ + xoutput[i] = 0.419f*mS[i] - 0.828f*xoutput[i]; + } + + /* Precompute j*S and store in youtput. */ + tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin()); + std::copy_n(mS.cbegin(), SamplesToDo+sFilterSize, tmpiter); + std::copy_n(mTemp.cbegin()+SamplesToDo, mSHistory.size(), mSHistory.begin()); + allpass_process({youtput, SamplesToDo}, mTemp.data()); + + for(size_t i{0};i < SamplesToDo;++i) + { + /* Y = 0.763*D + j*0.385*S */ + youtput[i] = 0.763f*mD[i] + 0.385f*youtput[i]; + } + + std::copy(mS.begin()+SamplesToDo, mS.begin()+SamplesToDo+sFilterSize, mS.begin()); + std::copy(mD.begin()+SamplesToDo, mD.begin()+SamplesToDo+sFilterSize, mD.begin()); +} + + +int main(int argc, char **argv) +{ + if(argc < 2 || std::strcmp(argv[1], "-h") == 0 || std::strcmp(argv[1], "--help") == 0) + { + printf("Usage: %s <filename.wav>\n", argv[0]); + return 1; + } + + size_t num_files{0}, num_decoded{0}; + for(int fidx{1};fidx < argc;++fidx) + { + ++num_files; + SF_INFO ininfo{}; + SndFilePtr infile{sf_open(argv[fidx], SFM_READ, &ininfo)}; + if(!infile) + { + fprintf(stderr, "Failed to open %s\n", argv[fidx]); + continue; + } + if(ininfo.channels != 2) + { + fprintf(stderr, "%s is not a stereo file\n", argv[fidx]); + continue; + } + printf("Converting %s...\n", argv[fidx]); + + std::string outname{argv[fidx]}; + auto lastslash = outname.find_last_of('/'); + if(lastslash != std::string::npos) + outname.erase(0, lastslash+1); + auto lastdot = outname.find_last_of('.'); + if(lastdot != std::string::npos) + outname.resize(lastdot+1); + outname += "amb"; + + FilePtr outfile{fopen(outname.c_str(), "wb")}; + if(!outfile) + { + fprintf(stderr, "Failed to create %s\n", outname.c_str()); + continue; + } + + fputs("RIFF", outfile.get()); + fwrite32le(0xFFFFFFFF, outfile.get()); // 'RIFF' header len; filled in at close + + fputs("WAVE", outfile.get()); + + fputs("fmt ", outfile.get()); + fwrite32le(40, outfile.get()); // 'fmt ' header len; 40 bytes for EXTENSIBLE + + // 16-bit val, format type id (extensible: 0xFFFE) + fwrite16le(0xFFFE, outfile.get()); + // 16-bit val, channel count + fwrite16le(static_cast<ushort>(3), outfile.get()); + // 32-bit val, frequency + fwrite32le(static_cast<uint>(ininfo.samplerate), outfile.get()); + // 32-bit val, bytes per second + fwrite32le(static_cast<uint>(ininfo.samplerate) * sizeof(float) * 3, outfile.get()); + // 16-bit val, frame size + fwrite16le(static_cast<ushort>(sizeof(float) * 3), outfile.get()); + // 16-bit val, bits per sample + fwrite16le(static_cast<ushort>(sizeof(float) * 8), outfile.get()); + // 16-bit val, extra byte count + fwrite16le(22, outfile.get()); + // 16-bit val, valid bits per sample + fwrite16le(static_cast<ushort>(sizeof(float) * 8), outfile.get()); + // 32-bit val, channel mask + fwrite32le(0, outfile.get()); + // 16 byte GUID, sub-type format + fwrite(SUBTYPE_BFORMAT_FLOAT, 1, 16, outfile.get()); + + fputs("data", outfile.get()); + fwrite32le(0xFFFFFFFF, outfile.get()); // 'data' header len; filled in at close + if(ferror(outfile.get())) + { + fprintf(stderr, "Error writing wave file header: %s (%d)\n", strerror(errno), errno); + continue; + } + + auto DataStart = ftell(outfile.get()); + + auto decoder = std::make_unique<UhjDecoder>(); + auto inmem = std::make_unique<float[]>(BufferLineSize*static_cast<uint>(ininfo.channels)); + auto decmem = std::make_unique<std::array<float,BufferLineSize>[]>(3); + auto outmem = std::make_unique<byte4[]>(BufferLineSize*3); + + /* The all-pass filter has a lead-in of 127 samples, and a lead-out of + * 128 samples. So after reading the last samples from the input, an + * additional 255 samples of silence need to be fed through the decoder + * for it to finish. + */ + sf_count_t LeadOut{UhjDecoder::sFilterSize*2 - 1}; + while(LeadOut > 0) + { + sf_count_t sgot{sf_readf_float(infile.get(), inmem.get(), BufferLineSize)}; + sgot = std::max<sf_count_t>(sgot, 0); + if(sgot < BufferLineSize) + { + const sf_count_t remaining{std::min(BufferLineSize - sgot, LeadOut)}; + std::fill_n(inmem.get() + sgot*2, remaining*2, 0.0f); + sgot += remaining; + LeadOut -= remaining; + } + + auto got = static_cast<size_t>(sgot); + decoder->decode2(inmem.get(), decmem.get(), got); + for(size_t i{0};i < got;++i) + { + outmem[i*3 + 0] = f32AsLEBytes(decmem[0][i]); + outmem[i*3 + 1] = f32AsLEBytes(decmem[1][i]); + outmem[i*3 + 2] = f32AsLEBytes(decmem[2][i]); + } + + size_t wrote{fwrite(outmem.get(), sizeof(byte4)*3, got, outfile.get())}; + if(wrote < got) + { + fprintf(stderr, "Error writing wave data: %s (%d)\n", strerror(errno), errno); + break; + } + } + + auto DataEnd = ftell(outfile.get()); + if(DataEnd > DataStart) + { + long dataLen{DataEnd - DataStart}; + if(fseek(outfile.get(), 4, SEEK_SET) == 0) + fwrite32le(static_cast<uint>(DataEnd-8), outfile.get()); // 'WAVE' header len + if(fseek(outfile.get(), DataStart-4, SEEK_SET) == 0) + fwrite32le(static_cast<uint>(dataLen), outfile.get()); // 'data' header len + } + fflush(outfile.get()); + ++num_decoded; + } + if(num_decoded == 0) + fprintf(stderr, "Failed to decode any input files\n"); + else if(num_decoded < num_files) + fprintf(stderr, "Decoded %zu of %zu files\n", num_decoded, num_files); + else + printf("Decoded %zu file%s\n", num_decoded, (num_decoded==1)?"":"s"); + return 0; +} |