winamp/Src/external_dependencies/openmpt-trunk/include/r8brain/CDSPBlockConvolver.h

//$ nobt
//$ nocpp

/**
 * @file CDSPBlockConvolver.h
 *
 * @brief Single-block overlap-save convolution processor class.
 *
 * This file includes single-block overlap-save convolution processor class.
 *
 * r8brain-free-src Copyright (c) 2013-2022 Aleksey Vaneev
 * See the "LICENSE" file for license.
 */

#ifndef R8B_CDSPBLOCKCONVOLVER_INCLUDED
#define R8B_CDSPBLOCKCONVOLVER_INCLUDED

#include "CDSPFIRFilter.h"
#include "CDSPProcessor.h"

namespace r8b {

/**
 * @brief Single-block overlap-save convolution processing class.
 *
 * Class that implements single-block overlap-save convolution processing. The
 * length of a single FFT block used depends on the length of the filter
 * kernel.
 *
 * The rationale behind "single-block" processing is that increasing the FFT
 * block length by 2 is more efficient than performing convolution at the same
 * FFT block length but using two blocks.
 *
 * This class also implements a built-in resampling by any whole-number
 * factor, which simplifies the overall resampling objects topology.
 */

class CDSPBlockConvolver : public CDSPProcessor
{
public:
	/**
	 * Constructor initializes internal variables and constants of *this
	 * object.
	 *
	 * @param aFilter Pre-calculated filter data. Reference to this object is
	 * inhertied by *this object, and the object will be released when *this
	 * object is destroyed. If upsampling is used, filter's gain should be
	 * equal to the upsampling factor.
	 * @param aUpFactor The upsampling factor, positive value. E.g. value of 2
	 * means 2x upsampling should be performed over the input data.
	 * @param aDownFactor The downsampling factor, positive value. E.g. value
	 * of 2 means 2x downsampling should be performed over the output data.
	 * @param PrevLatency Latency, in samples (any value >=0), which was left
	 * in the output signal by a previous process. This value is usually
	 * non-zero if the minimum-phase filters are in use. This value is always
	 * zero if the linear-phase filters are in use.
	 * @param aDoConsumeLatency "True" if the output latency should be
	 * consumed. Does not apply to the fractional part of the latency (if such
	 * part is available).
	 */

	CDSPBlockConvolver( CDSPFIRFilter& aFilter, const int aUpFactor,
		const int aDownFactor, const double PrevLatency = 0.0,
		const bool aDoConsumeLatency = true )
		: Filter( &aFilter )
		, UpFactor( aUpFactor )
		, DownFactor( aDownFactor )
		, DoConsumeLatency( aDoConsumeLatency )
		, BlockLen2( 2 << Filter -> getBlockLenBits() )
	{
		R8BASSERT( UpFactor > 0 );
		R8BASSERT( DownFactor > 0 );
		R8BASSERT( PrevLatency >= 0.0 );

		int fftinBits;
		UpShift = getBitOccupancy( UpFactor ) - 1;

		if(( 1 << UpShift ) == UpFactor )
		{
			fftinBits = Filter -> getBlockLenBits() + 1 - UpShift;
			PrevInputLen = ( Filter -> getKernelLen() - 1 + UpFactor - 1 ) /
				UpFactor;

			InputLen = BlockLen2 - PrevInputLen * UpFactor;
		}
		else
		{
			UpShift = -1;
			fftinBits = Filter -> getBlockLenBits() + 1;
			PrevInputLen = Filter -> getKernelLen() - 1;
			InputLen = BlockLen2 - PrevInputLen;
		}

		OutOffset = ( Filter -> isZeroPhase() ? Filter -> getLatency() : 0 );
		LatencyFrac = Filter -> getLatencyFrac() + PrevLatency * UpFactor;
		Latency = (int) LatencyFrac;
		const int InLatency = Latency + Filter -> getLatency() - OutOffset;
		LatencyFrac -= Latency;
		LatencyFrac /= DownFactor;

		Latency += InputLen + Filter -> getLatency();

		int fftoutBits;
		InputDelay = 0;
		DownSkipInit = 0;
		DownShift = getBitOccupancy( DownFactor ) - 1;

		if(( 1 << DownShift ) == DownFactor )
		{
			fftoutBits = Filter -> getBlockLenBits() + 1 - DownShift;

			if( DownFactor > 1 )
			{
				if( UpShift > 0 )
				{
					// This case never happens in practice due to mutual
					// exclusion of "power of 2" DownFactor and UpFactor
					// values.

					R8BASSERT( UpShift == 0 );
				}
				else
				{
					// Make sure InputLen is divisible by DownFactor.

					const int ilc = InputLen & ( DownFactor - 1 );
					PrevInputLen += ilc;
					InputLen -= ilc;
					Latency -= ilc;

					// Correct InputDelay for input and filter's latency.

					const int lc = InLatency & ( DownFactor - 1 );

					if( lc > 0 )
					{
						InputDelay = DownFactor - lc;
					}

					if( !DoConsumeLatency )
					{
						Latency /= DownFactor;
					}
				}
			}
		}
		else
		{
			fftoutBits = Filter -> getBlockLenBits() + 1;
			DownShift = -1;

			if( !DoConsumeLatency && DownFactor > 1 )
			{
				DownSkipInit = Latency % DownFactor;
				Latency /= DownFactor;
			}
		}

		R8BASSERT( Latency >= 0 );

		fftin = new CDSPRealFFTKeeper( fftinBits );

		if( fftoutBits == fftinBits )
		{
			fftout = fftin;
		}
		else
		{
			ffto2 = new CDSPRealFFTKeeper( fftoutBits );
			fftout = ffto2;
		}

		WorkBlocks.alloc( BlockLen2 * 2 + PrevInputLen );
		CurInput = &WorkBlocks[ 0 ];
		CurOutput = &WorkBlocks[ BlockLen2 ]; // CurInput and
			// CurOutput are address-aligned.
		PrevInput = &WorkBlocks[ BlockLen2 * 2 ];

		clear();

		R8BCONSOLE( "CDSPBlockConvolver: flt_len=%i in_len=%i io=%i/%i "
			"fft=%i/%i latency=%i\n", Filter -> getKernelLen(), InputLen,
			UpFactor, DownFactor, (*fftin) -> getLen(), (*fftout) -> getLen(),
			getLatency() );
	}

	virtual ~CDSPBlockConvolver()
	{
		Filter -> unref();
	}

	virtual int getLatency() const
	{
		return( DoConsumeLatency ? 0 : Latency );
	}

	virtual double getLatencyFrac() const
	{
		return( LatencyFrac );
	}

	virtual int getMaxOutLen( const int MaxInLen ) const
	{
		R8BASSERT( MaxInLen >= 0 );

		return(( MaxInLen * UpFactor + DownFactor - 1 ) / DownFactor );
	}

	virtual void clear()
	{
		memset( &PrevInput[ 0 ], 0, PrevInputLen * sizeof( PrevInput[ 0 ]));

		if( DoConsumeLatency )
		{
			LatencyLeft = Latency;
		}
		else
		{
			LatencyLeft = 0;

			if( DownShift > 0 )
			{
				memset( &CurOutput[ 0 ], 0, ( BlockLen2 >> DownShift ) *
					sizeof( CurOutput[ 0 ]));
			}
			else
			{
				memset( &CurOutput[ BlockLen2 - OutOffset ], 0, OutOffset *
					sizeof( CurOutput[ 0 ]));

				memset( &CurOutput[ 0 ], 0, ( InputLen - OutOffset ) *
					sizeof( CurOutput[ 0 ]));
			}
		}

		memset( CurInput, 0, InputDelay * sizeof( CurInput[ 0 ]));

		InDataLeft = InputLen - InputDelay;
		UpSkip = 0;
		DownSkip = DownSkipInit;
	}

	virtual int process( double* ip, int l0, double*& op0 )
	{
		R8BASSERT( l0 >= 0 );
		R8BASSERT( UpFactor / DownFactor <= 1 || ip != op0 || l0 == 0 );

		double* op = op0;
		int l = l0 * UpFactor;
		l0 = 0;

		while( l > 0 )
		{
			const int Offs = InputLen - InDataLeft;

			if( l < InDataLeft )
			{
				InDataLeft -= l;

				if( UpShift >= 0 )
				{
					memcpy( &CurInput[ Offs >> UpShift ], ip,
						( l >> UpShift ) * sizeof( CurInput[ 0 ]));
				}
				else
				{
					copyUpsample( ip, &CurInput[ Offs ], l );
				}

				copyToOutput( Offs - OutOffset, op, l, l0 );
				break;
			}

			const int b = InDataLeft;
			l -= b;
			InDataLeft = InputLen;
			int ilu;

			if( UpShift >= 0 )
			{
				const int bu = b >> UpShift;
				memcpy( &CurInput[ Offs >> UpShift ], ip,
					bu * sizeof( CurInput[ 0 ]));

				ip += bu;
				ilu = InputLen >> UpShift;
			}
			else
			{
				copyUpsample( ip, &CurInput[ Offs ], b );
				ilu = InputLen;
			}

			const size_t pil = PrevInputLen * sizeof( CurInput[ 0 ]);
			memcpy( &CurInput[ ilu ], PrevInput, pil );
			memcpy( PrevInput, &CurInput[ ilu - PrevInputLen ], pil );

			(*fftin) -> forward( CurInput );

			if( UpShift > 0 )
			{
				#if R8B_FLOATFFT
					mirrorInputSpectrum( (float*) CurInput );
				#else // R8B_FLOATFFT
					mirrorInputSpectrum( CurInput );
				#endif // R8B_FLOATFFT
			}

			if( Filter -> isZeroPhase() )
			{
				(*fftout) -> multiplyBlocksZP( Filter -> getKernelBlock(),
					CurInput );
			}
			else
			{
				(*fftout) -> multiplyBlocks( Filter -> getKernelBlock(),
					CurInput );
			}

			if( DownShift > 0 )
			{
				const int z = BlockLen2 >> DownShift;

				#if R8B_FLOATFFT
					float* const kb = (float*) Filter -> getKernelBlock();
					float* const p = (float*) CurInput;
				#else // R8B_FLOATFFT
					const double* const kb = Filter -> getKernelBlock();
					double* const p = CurInput;
				#endif // R8B_FLOATFFT

				p[ 1 ] = kb[ z ] * p[ z ] - kb[ z + 1 ] * p[ z + 1 ];
			}

			(*fftout) -> inverse( CurInput );

			copyToOutput( Offs - OutOffset, op, b, l0 );

			double* const tmp = CurInput;
			CurInput = CurOutput;
			CurOutput = tmp;
		}

		return( l0 );
	}

private:
	CDSPFIRFilter* Filter; ///< Filter in use.
		///<
	CPtrKeeper< CDSPRealFFTKeeper* > fftin; ///< FFT object 1, used to produce
		///< the input spectrum (can embed the "power of 2" upsampling).
		///<
	CPtrKeeper< CDSPRealFFTKeeper* > ffto2; ///< FFT object 2 (can be NULL).
		///<
	CDSPRealFFTKeeper* fftout; ///< FFT object used to produce the output
		///< signal (can embed the "power of 2" downsampling), may point to
		///< either "fftin" or "ffto2".
		///<
	int UpFactor; ///< Upsampling factor.
		///<
	int DownFactor; ///< Downsampling factor.
		///<
	bool DoConsumeLatency; ///< "True" if the output latency should be
		///< consumed. Does not apply to the fractional part of the latency
		///< (if such part is available).
		///<
	int BlockLen2; ///< Equals block length * 2.
		///<
	int OutOffset; ///< Output offset, depends on filter's introduced latency.
		///<
	int PrevInputLen; ///< The length of previous input data saved, used for
		///< overlap.
		///<
	int InputLen; ///< The number of input samples that should be accumulated
		///< before the input block is processed.
		///<
	int Latency; ///< Processing latency, in samples.
		///<
	double LatencyFrac; ///< Fractional latency, in samples, that is left in
		///< the output signal.
		///<
	int UpShift; ///< "Power of 2" upsampling shift. Equals -1 if UpFactor is
		///< not a "power of 2" value. Equals 0 if UpFactor equals 1.
		///<
	int DownShift; ///< "Power of 2" downsampling shift. Equals -1 if
		///< DownFactor is not a "power of 2". Equals 0 if DownFactor equals
		///< 1.
		///<
	int InputDelay; ///< Additional input delay, in samples. Used to make the
		///< output delay divisible by DownShift. Used only if UpShift <= 0
		///< and DownShift > 0.
		///<
	CFixedBuffer< double > WorkBlocks; ///< Previous input data, input and
		///< output data blocks, overall capacity = BlockLen2 * 2 +
		///< PrevInputLen. Used in the flip-flop manner.
		///<
	double* PrevInput; ///< Previous input data buffer, capacity = BlockLen.
		///<
	double* CurInput; ///< Input data buffer, capacity = BlockLen2.
		///<
	double* CurOutput; ///< Output data buffer, capacity = BlockLen2.
		///<
	int InDataLeft; ///< Samples left before processing input and output FFT
		///< blocks. Initialized to InputLen on clear.
		///<
	int LatencyLeft; ///< Latency in samples left to skip.
		///<
	int UpSkip; ///< The current upsampling sample skip (value in the range
		///< 0 to UpFactor - 1).
		///<
	int DownSkip; ///< The current downsampling sample skip (value in the
		///< range 0 to DownFactor - 1). Not used if DownShift > 0.
		///<
	int DownSkipInit; ///< The initial DownSkip value after clear().
		///<

	/**
	 * Function copies samples from the input buffer to the output buffer
	 * while inserting zeros inbetween them to perform the whole-numbered
	 * upsampling.
	 *
	 * @param[in,out] ip0 Input buffer. Will be advanced on function's return.
	 * @param[out] op Output buffer.
	 * @param l0 The number of samples to fill in the output buffer, including
	 * both input samples and interpolation (zero) samples.
	 */

	void copyUpsample( double*& ip0, double* op, int l0 )
	{
		int b = min( UpSkip, l0 );

		if( b != 0 )
		{
			UpSkip -= b;
			l0 -= b;

			*op = 0.0;
			op++;

			while( --b != 0 )
			{
				*op = 0.0;
				op++;
			}
		}

		double* ip = ip0;
		const int upf = UpFactor;
		int l = l0 / upf;
		int lz = l0 - l * upf;

		if( upf == 3 )
		{
			while( l != 0 )
			{
				op[ 0 ] = *ip;
				op[ 1 ] = 0.0;
				op[ 2 ] = 0.0;
				ip++;
				op += upf;
				l--;
			}
		}
		else
		if( upf == 5 )
		{
			while( l != 0 )
			{
				op[ 0 ] = *ip;
				op[ 1 ] = 0.0;
				op[ 2 ] = 0.0;
				op[ 3 ] = 0.0;
				op[ 4 ] = 0.0;
				ip++;
				op += upf;
				l--;
			}
		}
		else
		{
			const size_t zc = ( upf - 1 ) * sizeof( op[ 0 ]);

			while( l != 0 )
			{
				*op = *ip;
				ip++;

				memset( op + 1, 0, zc );
				op += upf;
				l--;
			}
		}

		if( lz != 0 )
		{
			*op = *ip;
			ip++;
			op++;

			UpSkip = upf - lz;

			while( --lz != 0 )
			{
				*op = 0.0;
				op++;
			}
		}

		ip0 = ip;
	}

	/**
	 * Function copies sample data from the CurOutput buffer to the specified
	 * output buffer and advances its position. If necessary, this function
	 * "consumes" latency and performs downsampling.
	 *
	 * @param Offs CurOutput buffer offset, can be negative.
	 * @param[out] op0 Output buffer pointer, will be advanced.
	 * @param b The number of output samples available, including those which
	 * are discarded during whole-number downsampling.
	 * @param l0 The overall output sample count, will be increased.
	 */

	void copyToOutput( int Offs, double*& op0, int b, int& l0 )
	{
		if( Offs < 0 )
		{
			if( Offs + b <= 0 )
			{
				Offs += BlockLen2;
			}
			else
			{
				copyToOutput( Offs + BlockLen2, op0, -Offs, l0 );
				b += Offs;
				Offs = 0;
			}
		}

		if( LatencyLeft != 0 )
		{
			if( LatencyLeft >= b )
			{
				LatencyLeft -= b;
				return;
			}

			Offs += LatencyLeft;
			b -= LatencyLeft;
			LatencyLeft = 0;
		}

		const int df = DownFactor;

		if( DownShift > 0 )
		{
			int Skip = Offs & ( df - 1 );

			if( Skip > 0 )
			{
				Skip = df - Skip;
				b -= Skip;
				Offs += Skip;
			}

			if( b > 0 )
			{
				b = ( b + df - 1 ) >> DownShift;
				memcpy( op0, &CurOutput[ Offs >> DownShift ],
					b * sizeof( op0[ 0 ]));

				op0 += b;
				l0 += b;
			}
		}
		else
		{
			if( df > 1 )
			{
				const double* ip = &CurOutput[ Offs + DownSkip ];
				int l = ( b + df - 1 - DownSkip ) / df;
				DownSkip += l * df - b;

				double* op = op0;
				l0 += l;
				op0 += l;

				while( l > 0 )
				{
					*op = *ip;
					ip += df;
					op++;
					l--;
				}
			}
			else
			{
				memcpy( op0, &CurOutput[ Offs ], b * sizeof( op0[ 0 ]));
				op0 += b;
				l0 += b;
			}
		}
	}

	/**
	 * Function performs input spectrum mirroring which is used to perform a
	 * fast "power of 2" upsampling. Such mirroring is equivalent to insertion
	 * of zeros into the input signal.
	 *
	 * @param p Spectrum data block to mirror.
	 * @tparam T Buffer's element type.
	 */

	template< typename T >
	void mirrorInputSpectrum( T* const p )
	{
		const int bl1 = BlockLen2 >> UpShift;
		const int bl2 = bl1 + bl1;
		int i;

		for( i = bl1 + 2; i < bl2; i += 2 )
		{
			p[ i ] = p[ bl2 - i ];
			p[ i + 1 ] = -p[ bl2 - i + 1 ];
		}

		p[ bl1 ] = p[ 1 ];
		p[ bl1 + 1 ] = 0.0;
		p[ 1 ] = p[ 0 ];

		for( i = 1; i < UpShift; i++ )
		{
			const int z = bl1 << i;
			memcpy( &p[ z ], p, z * sizeof( p[ 0 ]));
			p[ z + 1 ] = 0.0;
		}
	}
};

} // namespace r8b

#endif // R8B_CDSPBLOCKCONVOLVER_INCLUDED
Initial community commit 2024-09-24 12:54:57 +00:00			`//$ nobt`
			`//$ nocpp`

			`/**`
			`* @file CDSPBlockConvolver.h`
			`*`
			`* @brief Single-block overlap-save convolution processor class.`
			`*`
			`* This file includes single-block overlap-save convolution processor class.`
			`*`
			`* r8brain-free-src Copyright (c) 2013-2022 Aleksey Vaneev`
			`* See the "LICENSE" file for license.`
			`*/`

			`#ifndef R8B_CDSPBLOCKCONVOLVER_INCLUDED`
			`#define R8B_CDSPBLOCKCONVOLVER_INCLUDED`

			`#include "CDSPFIRFilter.h"`
			`#include "CDSPProcessor.h"`

			`namespace r8b {`

			`/**`
			`* @brief Single-block overlap-save convolution processing class.`
			`*`
			`* Class that implements single-block overlap-save convolution processing. The`
			`* length of a single FFT block used depends on the length of the filter`
			`* kernel.`
			`*`
			`* The rationale behind "single-block" processing is that increasing the FFT`
			`* block length by 2 is more efficient than performing convolution at the same`
			`* FFT block length but using two blocks.`
			`*`
			`* This class also implements a built-in resampling by any whole-number`
			`* factor, which simplifies the overall resampling objects topology.`
			`*/`

			`class CDSPBlockConvolver : public CDSPProcessor`
			`{`
			`public:`
			`/**`
			`* Constructor initializes internal variables and constants of *this`
			`* object.`
			`*`
			`* @param aFilter Pre-calculated filter data. Reference to this object is`
			`* inhertied by this object, and the object will be released when this`
			`* object is destroyed. If upsampling is used, filter's gain should be`
			`* equal to the upsampling factor.`
			`* @param aUpFactor The upsampling factor, positive value. E.g. value of 2`
			`* means 2x upsampling should be performed over the input data.`
			`* @param aDownFactor The downsampling factor, positive value. E.g. value`
			`* of 2 means 2x downsampling should be performed over the output data.`
			`* @param PrevLatency Latency, in samples (any value >=0), which was left`
			`* in the output signal by a previous process. This value is usually`
			`* non-zero if the minimum-phase filters are in use. This value is always`
			`* zero if the linear-phase filters are in use.`
			`* @param aDoConsumeLatency "True" if the output latency should be`
			`* consumed. Does not apply to the fractional part of the latency (if such`
			`* part is available).`
			`*/`

			`CDSPBlockConvolver( CDSPFIRFilter& aFilter, const int aUpFactor,`
			`const int aDownFactor, const double PrevLatency = 0.0,`
			`const bool aDoConsumeLatency = true )`
			`: Filter( &aFilter )`
			`, UpFactor( aUpFactor )`
			`, DownFactor( aDownFactor )`
			`, DoConsumeLatency( aDoConsumeLatency )`
			`, BlockLen2( 2 << Filter -> getBlockLenBits() )`
			`{`
			`R8BASSERT( UpFactor > 0 );`
			`R8BASSERT( DownFactor > 0 );`
			`R8BASSERT( PrevLatency >= 0.0 );`

			`int fftinBits;`
			`UpShift = getBitOccupancy( UpFactor ) - 1;`

			`if(( 1 << UpShift ) == UpFactor )`
			`{`
			`fftinBits = Filter -> getBlockLenBits() + 1 - UpShift;`
			`PrevInputLen = ( Filter -> getKernelLen() - 1 + UpFactor - 1 ) /`
			`UpFactor;`

			`InputLen = BlockLen2 - PrevInputLen * UpFactor;`
			`}`
			`else`
			`{`
			`UpShift = -1;`
			`fftinBits = Filter -> getBlockLenBits() + 1;`
			`PrevInputLen = Filter -> getKernelLen() - 1;`
			`InputLen = BlockLen2 - PrevInputLen;`
			`}`

			`OutOffset = ( Filter -> isZeroPhase() ? Filter -> getLatency() : 0 );`
			`LatencyFrac = Filter -> getLatencyFrac() + PrevLatency * UpFactor;`
			`Latency = (int) LatencyFrac;`
			`const int InLatency = Latency + Filter -> getLatency() - OutOffset;`
			`LatencyFrac -= Latency;`
			`LatencyFrac /= DownFactor;`

			`Latency += InputLen + Filter -> getLatency();`

			`int fftoutBits;`
			`InputDelay = 0;`
			`DownSkipInit = 0;`
			`DownShift = getBitOccupancy( DownFactor ) - 1;`

			`if(( 1 << DownShift ) == DownFactor )`
			`{`
			`fftoutBits = Filter -> getBlockLenBits() + 1 - DownShift;`

			`if( DownFactor > 1 )`
			`{`
			`if( UpShift > 0 )`
			`{`
			`// This case never happens in practice due to mutual`
			`// exclusion of "power of 2" DownFactor and UpFactor`
			`// values.`

			`R8BASSERT( UpShift == 0 );`
			`}`
			`else`
			`{`
			`// Make sure InputLen is divisible by DownFactor.`

			`const int ilc = InputLen & ( DownFactor - 1 );`
			`PrevInputLen += ilc;`
			`InputLen -= ilc;`
			`Latency -= ilc;`

			`// Correct InputDelay for input and filter's latency.`

			`const int lc = InLatency & ( DownFactor - 1 );`

			`if( lc > 0 )`
			`{`
			`InputDelay = DownFactor - lc;`
			`}`

			`if( !DoConsumeLatency )`
			`{`
			`Latency /= DownFactor;`
			`}`
			`}`
			`}`
			`}`
			`else`
			`{`
			`fftoutBits = Filter -> getBlockLenBits() + 1;`
			`DownShift = -1;`

			`if( !DoConsumeLatency && DownFactor > 1 )`
			`{`
			`DownSkipInit = Latency % DownFactor;`
			`Latency /= DownFactor;`
			`}`
			`}`

			`R8BASSERT( Latency >= 0 );`

			`fftin = new CDSPRealFFTKeeper( fftinBits );`

			`if( fftoutBits == fftinBits )`
			`{`
			`fftout = fftin;`
			`}`
			`else`
			`{`
			`ffto2 = new CDSPRealFFTKeeper( fftoutBits );`
			`fftout = ffto2;`
			`}`

			`WorkBlocks.alloc( BlockLen2 * 2 + PrevInputLen );`
			`CurInput = &WorkBlocks[ 0 ];`
			`CurOutput = &WorkBlocks[ BlockLen2 ]; // CurInput and`
			`// CurOutput are address-aligned.`
			`PrevInput = &WorkBlocks[ BlockLen2 * 2 ];`

			`clear();`

			`R8BCONSOLE( "CDSPBlockConvolver: flt_len=%i in_len=%i io=%i/%i "`
			`"fft=%i/%i latency=%i\n", Filter -> getKernelLen(), InputLen,`
			`UpFactor, DownFactor, (fftin) -> getLen(), (fftout) -> getLen(),`
			`getLatency() );`
			`}`

			`virtual ~CDSPBlockConvolver()`
			`{`
			`Filter -> unref();`
			`}`

			`virtual int getLatency() const`
			`{`
			`return( DoConsumeLatency ? 0 : Latency );`
			`}`

			`virtual double getLatencyFrac() const`
			`{`
			`return( LatencyFrac );`
			`}`

			`virtual int getMaxOutLen( const int MaxInLen ) const`
			`{`
			`R8BASSERT( MaxInLen >= 0 );`

			`return(( MaxInLen * UpFactor + DownFactor - 1 ) / DownFactor );`
			`}`

			`virtual void clear()`
			`{`
			`memset( &PrevInput[ 0 ], 0, PrevInputLen * sizeof( PrevInput[ 0 ]));`

			`if( DoConsumeLatency )`
			`{`
			`LatencyLeft = Latency;`
			`}`
			`else`
			`{`
			`LatencyLeft = 0;`

			`if( DownShift > 0 )`
			`{`
			`memset( &CurOutput[ 0 ], 0, ( BlockLen2 >> DownShift ) *`
			`sizeof( CurOutput[ 0 ]));`
			`}`
			`else`
			`{`
			`memset( &CurOutput[ BlockLen2 - OutOffset ], 0, OutOffset *`
			`sizeof( CurOutput[ 0 ]));`

			`memset( &CurOutput[ 0 ], 0, ( InputLen - OutOffset ) *`
			`sizeof( CurOutput[ 0 ]));`
			`}`
			`}`

			`memset( CurInput, 0, InputDelay * sizeof( CurInput[ 0 ]));`

			`InDataLeft = InputLen - InputDelay;`
			`UpSkip = 0;`
			`DownSkip = DownSkipInit;`
			`}`

			`virtual int process( double* ip, int l0, double*& op0 )`
			`{`
			`R8BASSERT( l0 >= 0 );`
			`R8BASSERT( UpFactor / DownFactor <= 1 \|\| ip != op0 \|\| l0 == 0 );`

			`double* op = op0;`
			`int l = l0 * UpFactor;`
			`l0 = 0;`

			`while( l > 0 )`
			`{`
			`const int Offs = InputLen - InDataLeft;`

			`if( l < InDataLeft )`
			`{`
			`InDataLeft -= l;`

			`if( UpShift >= 0 )`
			`{`
			`memcpy( &CurInput[ Offs >> UpShift ], ip,`
			`( l >> UpShift ) * sizeof( CurInput[ 0 ]));`
			`}`
			`else`
			`{`
			`copyUpsample( ip, &CurInput[ Offs ], l );`
			`}`

			`copyToOutput( Offs - OutOffset, op, l, l0 );`
			`break;`
			`}`

			`const int b = InDataLeft;`
			`l -= b;`
			`InDataLeft = InputLen;`
			`int ilu;`

			`if( UpShift >= 0 )`
			`{`
			`const int bu = b >> UpShift;`
			`memcpy( &CurInput[ Offs >> UpShift ], ip,`
			`bu * sizeof( CurInput[ 0 ]));`

			`ip += bu;`
			`ilu = InputLen >> UpShift;`
			`}`
			`else`
			`{`
			`copyUpsample( ip, &CurInput[ Offs ], b );`
			`ilu = InputLen;`
			`}`

			`const size_t pil = PrevInputLen * sizeof( CurInput[ 0 ]);`
			`memcpy( &CurInput[ ilu ], PrevInput, pil );`
			`memcpy( PrevInput, &CurInput[ ilu - PrevInputLen ], pil );`

			`(*fftin) -> forward( CurInput );`

			`if( UpShift > 0 )`
			`{`
			`#if R8B_FLOATFFT`
			`mirrorInputSpectrum( (float*) CurInput );`
			`#else // R8B_FLOATFFT`
			`mirrorInputSpectrum( CurInput );`
			`#endif // R8B_FLOATFFT`
			`}`

			`if( Filter -> isZeroPhase() )`
			`{`
			`(*fftout) -> multiplyBlocksZP( Filter -> getKernelBlock(),`
			`CurInput );`
			`}`
			`else`
			`{`
			`(*fftout) -> multiplyBlocks( Filter -> getKernelBlock(),`
			`CurInput );`
			`}`

			`if( DownShift > 0 )`
			`{`
			`const int z = BlockLen2 >> DownShift;`

			`#if R8B_FLOATFFT`
			`float* const kb = (float*) Filter -> getKernelBlock();`
			`float* const p = (float*) CurInput;`
			`#else // R8B_FLOATFFT`
			`const double* const kb = Filter -> getKernelBlock();`
			`double* const p = CurInput;`
			`#endif // R8B_FLOATFFT`

			`p[ 1 ] = kb[ z ] * p[ z ] - kb[ z + 1 ] * p[ z + 1 ];`
			`}`

			`(*fftout) -> inverse( CurInput );`

			`copyToOutput( Offs - OutOffset, op, b, l0 );`

			`double* const tmp = CurInput;`
			`CurInput = CurOutput;`
			`CurOutput = tmp;`
			`}`

			`return( l0 );`
			`}`

			`private:`
			`CDSPFIRFilter* Filter; ///< Filter in use.`
			`///<`
			`CPtrKeeper< CDSPRealFFTKeeper* > fftin; ///< FFT object 1, used to produce`
			`///< the input spectrum (can embed the "power of 2" upsampling).`
			`///<`
			`CPtrKeeper< CDSPRealFFTKeeper* > ffto2; ///< FFT object 2 (can be NULL).`
			`///<`
			`CDSPRealFFTKeeper* fftout; ///< FFT object used to produce the output`
			`///< signal (can embed the "power of 2" downsampling), may point to`
			`///< either "fftin" or "ffto2".`
			`///<`
			`int UpFactor; ///< Upsampling factor.`
			`///<`
			`int DownFactor; ///< Downsampling factor.`
			`///<`
			`bool DoConsumeLatency; ///< "True" if the output latency should be`
			`///< consumed. Does not apply to the fractional part of the latency`
			`///< (if such part is available).`
			`///<`
			`int BlockLen2; ///< Equals block length * 2.`
			`///<`
			`int OutOffset; ///< Output offset, depends on filter's introduced latency.`
			`///<`
			`int PrevInputLen; ///< The length of previous input data saved, used for`
			`///< overlap.`
			`///<`
			`int InputLen; ///< The number of input samples that should be accumulated`
			`///< before the input block is processed.`
			`///<`
			`int Latency; ///< Processing latency, in samples.`
			`///<`
			`double LatencyFrac; ///< Fractional latency, in samples, that is left in`
			`///< the output signal.`
			`///<`
			`int UpShift; ///< "Power of 2" upsampling shift. Equals -1 if UpFactor is`
			`///< not a "power of 2" value. Equals 0 if UpFactor equals 1.`
			`///<`
			`int DownShift; ///< "Power of 2" downsampling shift. Equals -1 if`
			`///< DownFactor is not a "power of 2". Equals 0 if DownFactor equals`
			`///< 1.`
			`///<`
			`int InputDelay; ///< Additional input delay, in samples. Used to make the`
			`///< output delay divisible by DownShift. Used only if UpShift <= 0`
			`///< and DownShift > 0.`
			`///<`
			`CFixedBuffer< double > WorkBlocks; ///< Previous input data, input and`
			`///< output data blocks, overall capacity = BlockLen2 * 2 +`
			`///< PrevInputLen. Used in the flip-flop manner.`
			`///<`
			`double* PrevInput; ///< Previous input data buffer, capacity = BlockLen.`
			`///<`
			`double* CurInput; ///< Input data buffer, capacity = BlockLen2.`
			`///<`
			`double* CurOutput; ///< Output data buffer, capacity = BlockLen2.`
			`///<`
			`int InDataLeft; ///< Samples left before processing input and output FFT`
			`///< blocks. Initialized to InputLen on clear.`
			`///<`
			`int LatencyLeft; ///< Latency in samples left to skip.`
			`///<`
			`int UpSkip; ///< The current upsampling sample skip (value in the range`
			`///< 0 to UpFactor - 1).`
			`///<`
			`int DownSkip; ///< The current downsampling sample skip (value in the`
			`///< range 0 to DownFactor - 1). Not used if DownShift > 0.`
			`///<`
			`int DownSkipInit; ///< The initial DownSkip value after clear().`
			`///<`

			`/**`
			`* Function copies samples from the input buffer to the output buffer`
			`* while inserting zeros inbetween them to perform the whole-numbered`
			`* upsampling.`
			`*`
			`* @param[in,out] ip0 Input buffer. Will be advanced on function's return.`
			`* @param[out] op Output buffer.`
			`* @param l0 The number of samples to fill in the output buffer, including`
			`* both input samples and interpolation (zero) samples.`
			`*/`

			`void copyUpsample( double& ip0, double op, int l0 )`
			`{`
			`int b = min( UpSkip, l0 );`

			`if( b != 0 )`
			`{`
			`UpSkip -= b;`
			`l0 -= b;`

			`*op = 0.0;`
			`op++;`

			`while( --b != 0 )`
			`{`
			`*op = 0.0;`
			`op++;`
			`}`
			`}`

			`double* ip = ip0;`
			`const int upf = UpFactor;`
			`int l = l0 / upf;`
			`int lz = l0 - l * upf;`

			`if( upf == 3 )`
			`{`
			`while( l != 0 )`
			`{`
			`op[ 0 ] = *ip;`
			`op[ 1 ] = 0.0;`
			`op[ 2 ] = 0.0;`
			`ip++;`
			`op += upf;`
			`l--;`
			`}`
			`}`
			`else`
			`if( upf == 5 )`
			`{`
			`while( l != 0 )`
			`{`
			`op[ 0 ] = *ip;`
			`op[ 1 ] = 0.0;`
			`op[ 2 ] = 0.0;`
			`op[ 3 ] = 0.0;`
			`op[ 4 ] = 0.0;`
			`ip++;`
			`op += upf;`
			`l--;`
			`}`
			`}`
			`else`
			`{`
			`const size_t zc = ( upf - 1 ) * sizeof( op[ 0 ]);`

			`while( l != 0 )`
			`{`
			`op = ip;`
			`ip++;`

			`memset( op + 1, 0, zc );`
			`op += upf;`
			`l--;`
			`}`
			`}`

			`if( lz != 0 )`
			`{`
			`op = ip;`
			`ip++;`
			`op++;`

			`UpSkip = upf - lz;`

			`while( --lz != 0 )`
			`{`
			`*op = 0.0;`
			`op++;`
			`}`
			`}`

			`ip0 = ip;`
			`}`

			`/**`
			`* Function copies sample data from the CurOutput buffer to the specified`
			`* output buffer and advances its position. If necessary, this function`
			`* "consumes" latency and performs downsampling.`
			`*`
			`* @param Offs CurOutput buffer offset, can be negative.`
			`* @param[out] op0 Output buffer pointer, will be advanced.`
			`* @param b The number of output samples available, including those which`
			`* are discarded during whole-number downsampling.`
			`* @param l0 The overall output sample count, will be increased.`
			`*/`

			`void copyToOutput( int Offs, double*& op0, int b, int& l0 )`
			`{`
			`if( Offs < 0 )`
			`{`
			`if( Offs + b <= 0 )`
			`{`
			`Offs += BlockLen2;`
			`}`
			`else`
			`{`
			`copyToOutput( Offs + BlockLen2, op0, -Offs, l0 );`
			`b += Offs;`
			`Offs = 0;`
			`}`
			`}`

			`if( LatencyLeft != 0 )`
			`{`
			`if( LatencyLeft >= b )`
			`{`
			`LatencyLeft -= b;`
			`return;`
			`}`

			`Offs += LatencyLeft;`
			`b -= LatencyLeft;`
			`LatencyLeft = 0;`
			`}`

			`const int df = DownFactor;`

			`if( DownShift > 0 )`
			`{`
			`int Skip = Offs & ( df - 1 );`

			`if( Skip > 0 )`
			`{`
			`Skip = df - Skip;`
			`b -= Skip;`
			`Offs += Skip;`
			`}`

			`if( b > 0 )`
			`{`
			`b = ( b + df - 1 ) >> DownShift;`
			`memcpy( op0, &CurOutput[ Offs >> DownShift ],`
			`b * sizeof( op0[ 0 ]));`

			`op0 += b;`
			`l0 += b;`
			`}`
			`}`
			`else`
			`{`
			`if( df > 1 )`
			`{`
			`const double* ip = &CurOutput[ Offs + DownSkip ];`
			`int l = ( b + df - 1 - DownSkip ) / df;`
			`DownSkip += l * df - b;`

			`double* op = op0;`
			`l0 += l;`
			`op0 += l;`

			`while( l > 0 )`
			`{`
			`op = ip;`
			`ip += df;`
			`op++;`
			`l--;`
			`}`
			`}`
			`else`
			`{`
			`memcpy( op0, &CurOutput[ Offs ], b * sizeof( op0[ 0 ]));`
			`op0 += b;`
			`l0 += b;`
			`}`
			`}`
			`}`

			`/**`
			`* Function performs input spectrum mirroring which is used to perform a`
			`* fast "power of 2" upsampling. Such mirroring is equivalent to insertion`
			`* of zeros into the input signal.`
			`*`
			`* @param p Spectrum data block to mirror.`
			`* @tparam T Buffer's element type.`
			`*/`

			`template< typename T >`
			`void mirrorInputSpectrum( T* const p )`
			`{`
			`const int bl1 = BlockLen2 >> UpShift;`
			`const int bl2 = bl1 + bl1;`
			`int i;`

			`for( i = bl1 + 2; i < bl2; i += 2 )`
			`{`
			`p[ i ] = p[ bl2 - i ];`
			`p[ i + 1 ] = -p[ bl2 - i + 1 ];`
			`}`

			`p[ bl1 ] = p[ 1 ];`
			`p[ bl1 + 1 ] = 0.0;`
			`p[ 1 ] = p[ 0 ];`

			`for( i = 1; i < UpShift; i++ )`
			`{`
			`const int z = bl1 << i;`
			`memcpy( &p[ z ], p, z * sizeof( p[ 0 ]));`
			`p[ z + 1 ] = 0.0;`
			`}`
			`}`
			`};`

			`} // namespace r8b`

			`#endif // R8B_CDSPBLOCKCONVOLVER_INCLUDED`