winamp/Src/external_dependencies/openmpt-trunk/include/r8brain/CDSPHBUpsampler.inc
2024-09-24 14:54:57 +02:00

712 lines
24 KiB
PHP
Vendored

// Auto-generated by `genhbc`, do not edit!
#if defined( R8B_SSE2 )
// convolve1 (SSE2 build): single-coefficient case, evaluated in scalar code.
// Stores flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]) to op[ 1 ].
// NOTE(review): R8BHBC1/R8BHBC2 are defined outside this file; presumably
// they supply the function header and surrounding loop -- confirm there.
R8BHBC1( convolve1 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2
R8BHBC1( convolve2 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve3 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
op[ 1 ] += flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2
R8BHBC1( convolve4 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve5 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
op[ 1 ] += flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2
R8BHBC1( convolve6 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve7 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
op[ 1 ] += flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2
R8BHBC1( convolve8 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve9 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
op[ 1 ] += flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2
R8BHBC1( convolve10 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve11 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
op[ 1 ] += flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2
R8BHBC1( convolve12 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
R8BHBC1( convolve13 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
op[ 1 ] += flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2
R8BHBC1( convolve14 )
__m128d v1, v2, m1, s1;
v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = m1;
__m128d v3, v4, m3, s3;
v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = m3;
v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
s3 = _mm_add_pd( s3, m3 );
v2 = _mm_loadu_pd( rp - 13 ); v1 = _mm_loadu_pd( rp + 13 );
m1 = _mm_mul_pd( _mm_load_pd( flt + 12 ),
_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
s1 = _mm_add_pd( s1, m1 );
s1 = _mm_add_pd( s1, s3 );
_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2
#elif defined( R8B_NEON )
// convolve1 (NEON build): single-coefficient case, evaluated in scalar code.
// Stores flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]) to op[ 1 ].
// NOTE(review): R8BHBC1/R8BHBC2 are defined outside this file; presumably
// they supply the function header and surrounding loop -- confirm there.
R8BHBC1( convolve1 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2
// convolve2 (NEON): op[ 1 ] = sum for i = 0..1 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Both taps are multiply-accumulated
// onto a zero vector, then the two lanes are added with vaddvq_f64.
R8BHBC1( convolve2 )
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	const float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	op[ 1 ] = vaddvq_f64( sumA );
R8BHBC2
// convolve3 (NEON): op[ 1 ] = sum for i = 0..2 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Taps 0-1 are multiply-accumulated
// onto a zero vector; tap 2 is added in scalar code to the lane sum.
R8BHBC1( convolve3 )
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	const float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	op[ 1 ] = vaddvq_f64( sumA ) + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2
// convolve4 (NEON): op[ 1 ] = sum for i = 0..3 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Two taps per vector, each
// multiply-accumulated onto a zero vector; the two sums are added and the
// lanes reduced with vaddvq_f64.
R8BHBC1( convolve4 )
	// Taps 0-1.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	const float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	const float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB );
R8BHBC2
// convolve5 (NEON): op[ 1 ] = sum for i = 0..4 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Taps 0-3 are handled two per vector;
// tap 4 is added in scalar code to the lane sum.
R8BHBC1( convolve5 )
	// Taps 0-1.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	const float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	const float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB ) + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2
// convolve6 (NEON): op[ 1 ] = sum for i = 0..5 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Two taps per vector, accumulated
// alternately into sumA and sumB, which are added before the lane sum.
R8BHBC1( convolve6 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 form sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	const float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB );
R8BHBC2
// convolve7 (NEON): op[ 1 ] = sum for i = 0..6 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Taps 0-5 are handled two per vector
// in two alternating accumulators; tap 6 is added in scalar code.
R8BHBC1( convolve7 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 form sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	const float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB ) + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2
// convolve8 (NEON): op[ 1 ] = sum for i = 0..7 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Two taps per vector, accumulated
// alternately into sumA and sumB, which are added before the lane sum.
R8BHBC1( convolve8 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB );
R8BHBC2
// convolve9 (NEON): op[ 1 ] = sum for i = 0..8 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Taps 0-7 are handled two per vector
// in two alternating accumulators; tap 8 is added in scalar code.
R8BHBC1( convolve9 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB ) + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2
// convolve10 (NEON): op[ 1 ] = sum for i = 0..9 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Two taps per vector, accumulated
// alternately into sumA and sumB, which are added before the lane sum.
R8BHBC1( convolve10 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	// Taps 8-9.
	const float64x2_t back9 = vld1q_f64( rp - 9 );
	const float64x2_t fwd9 = vld1q_f64( rp + 9 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 8 ),
		vaddq_f64( fwd9, vextq_f64( back9, back9, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB );
R8BHBC2
// convolve11 (NEON): op[ 1 ] = sum for i = 0..10 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Taps 0-9 are handled two per vector
// in two alternating accumulators; tap 10 is added in scalar code.
R8BHBC1( convolve11 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	// Taps 8-9.
	const float64x2_t back9 = vld1q_f64( rp - 9 );
	const float64x2_t fwd9 = vld1q_f64( rp + 9 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 8 ),
		vaddq_f64( fwd9, vextq_f64( back9, back9, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB ) + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2
// convolve12 (NEON): op[ 1 ] = sum for i = 0..11 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Two taps per vector, accumulated
// alternately into sumA and sumB, which are added before the lane sum.
R8BHBC1( convolve12 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	// Taps 8-9.
	const float64x2_t back9 = vld1q_f64( rp - 9 );
	const float64x2_t fwd9 = vld1q_f64( rp + 9 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 8 ),
		vaddq_f64( fwd9, vextq_f64( back9, back9, 1 )));

	// Taps 10-11.
	const float64x2_t back11 = vld1q_f64( rp - 11 );
	const float64x2_t fwd11 = vld1q_f64( rp + 11 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 10 ),
		vaddq_f64( fwd11, vextq_f64( back11, back11, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB );
R8BHBC2
// convolve13 (NEON): op[ 1 ] = sum for i = 0..12 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Taps 0-11 are handled two per vector
// in two alternating accumulators; tap 12 is added in scalar code.
R8BHBC1( convolve13 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	// Taps 8-9.
	const float64x2_t back9 = vld1q_f64( rp - 9 );
	const float64x2_t fwd9 = vld1q_f64( rp + 9 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 8 ),
		vaddq_f64( fwd9, vextq_f64( back9, back9, 1 )));

	// Taps 10-11.
	const float64x2_t back11 = vld1q_f64( rp - 11 );
	const float64x2_t fwd11 = vld1q_f64( rp + 11 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 10 ),
		vaddq_f64( fwd11, vextq_f64( back11, back11, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB ) + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2
// convolve14 (NEON): op[ 1 ] = sum for i = 0..13 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]). Two taps per vector, accumulated
// alternately into sumA and sumB, which are added before the lane sum.
R8BHBC1( convolve14 )
	// Taps 0-1 start sumA from zero.
	const float64x2_t back1 = vld1q_f64( rp - 1 );
	const float64x2_t fwd1 = vld1q_f64( rp + 1 );
	float64x2_t sumA = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt ),
		vaddq_f64( fwd1, vextq_f64( back1, back1, 1 )));

	// Taps 2-3 start sumB from zero.
	const float64x2_t back3 = vld1q_f64( rp - 3 );
	const float64x2_t fwd3 = vld1q_f64( rp + 3 );
	float64x2_t sumB = vmlaq_f64( vdupq_n_f64( 0.0 ),
		vld1q_f64( flt + 2 ),
		vaddq_f64( fwd3, vextq_f64( back3, back3, 1 )));

	// Taps 4-5.
	const float64x2_t back5 = vld1q_f64( rp - 5 );
	const float64x2_t fwd5 = vld1q_f64( rp + 5 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 4 ),
		vaddq_f64( fwd5, vextq_f64( back5, back5, 1 )));

	// Taps 6-7.
	const float64x2_t back7 = vld1q_f64( rp - 7 );
	const float64x2_t fwd7 = vld1q_f64( rp + 7 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 6 ),
		vaddq_f64( fwd7, vextq_f64( back7, back7, 1 )));

	// Taps 8-9.
	const float64x2_t back9 = vld1q_f64( rp - 9 );
	const float64x2_t fwd9 = vld1q_f64( rp + 9 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 8 ),
		vaddq_f64( fwd9, vextq_f64( back9, back9, 1 )));

	// Taps 10-11.
	const float64x2_t back11 = vld1q_f64( rp - 11 );
	const float64x2_t fwd11 = vld1q_f64( rp + 11 );
	sumB = vmlaq_f64( sumB, vld1q_f64( flt + 10 ),
		vaddq_f64( fwd11, vextq_f64( back11, back11, 1 )));

	// Taps 12-13.
	const float64x2_t back13 = vld1q_f64( rp - 13 );
	const float64x2_t fwd13 = vld1q_f64( rp + 13 );
	sumA = vmlaq_f64( sumA, vld1q_f64( flt + 12 ),
		vaddq_f64( fwd13, vextq_f64( back13, back13, 1 )));

	const float64x2_t sumAB = vaddq_f64( sumA, sumB );
	op[ 1 ] = vaddvq_f64( sumAB );
R8BHBC2
#else // SIMD
// convolve1 (non-SIMD build): single-coefficient case.
// Stores flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]) to op[ 1 ].
// NOTE(review): R8BHBC1/R8BHBC2 are defined outside this file; presumably
// they supply the function header and surrounding loop -- confirm there.
R8BHBC1( convolve1 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2
// convolve2 (non-SIMD build): op[ 1 ] = sum for i = 0..1 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve2 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ]);
R8BHBC2
// convolve3 (non-SIMD build): op[ 1 ] = sum for i = 0..2 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve3 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2
// convolve4 (non-SIMD build): op[ 1 ] = sum for i = 0..3 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve4 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ]);
R8BHBC2
// convolve5 (non-SIMD build): op[ 1 ] = sum for i = 0..4 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve5 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2
// convolve6 (non-SIMD build): op[ 1 ] = sum for i = 0..5 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve6 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ]);
R8BHBC2
// convolve7 (non-SIMD build): op[ 1 ] = sum for i = 0..6 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve7 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2
// convolve8 (non-SIMD build): op[ 1 ] = sum for i = 0..7 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve8 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ]);
R8BHBC2
// convolve9 (non-SIMD build): op[ 1 ] = sum for i = 0..8 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve9 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2
// convolve10 (non-SIMD build): op[ 1 ] = sum for i = 0..9 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve10 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ]);
R8BHBC2
// convolve11 (non-SIMD build): op[ 1 ] = sum for i = 0..10 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve11 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2
// convolve12 (non-SIMD build): op[ 1 ] = sum for i = 0..11 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve12 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ]);
R8BHBC2
// convolve13 (non-SIMD build): op[ 1 ] = sum for i = 0..12 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve13 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2
// convolve14 (non-SIMD build): op[ 1 ] = sum for i = 0..13 of
// flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]), written as one expression so the
// terms are added left to right.
R8BHBC1( convolve14 )
op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ])
+ flt[ 13 ] * ( rp[ 14 ] + rp[ -13 ]);
R8BHBC2
#endif // SIMD