winamp/Src/libvpShared/corelibs/cdxv/preproc/preproc.c
2024-09-24 14:54:57 +02:00

694 lines
17 KiB
C

/****************************************************************************
*
* Module Title : preproc.c
*
* Description : Simple pre-processor.
*
****************************************************************************/
/****************************************************************************
* Header Files
****************************************************************************/
#include <stddef.h>
#include "memory.h"
#include "preproc.h"
/****************************************************************************
* Macros
****************************************************************************/
/* Number of history slots kept per pixel (or pixel group) by the temporal filters. */
#define FRAMECOUNT 7
/* Round an address or size up to the next multiple of 32.
 * The argument is fully parenthesised, and the arithmetic is carried out in
 * size_t so that the upper bits of a 64-bit value survive the mask (the
 * previous 0xFFFFFFE0 constant was only 32 bits wide). */
#define ROUNDUP32(X) ( ( ( (size_t) (X) ) + 31 ) & ~( (size_t) 31 ) )
/****************************************************************************
* Imports
****************************************************************************/
extern void GetProcessorFlags (int *MmxEnabled, int *XmmEnabled, int *WmtEnabled );
/****************************************************************************
* Exported Global Variables
****************************************************************************/
void (*tempFilter)( PreProcInstance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength );
#ifndef MAPCA
/****************************************************************************
*
* ROUTINE : spatialFilter_wmt
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int width : Width of images.
* int height : Height of images.
* int pitch : Stride of images.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness-adjusted spatial blur (SSE2 version).
* Each interior pixel is replaced by a weighted average of
* its 3x3 neighbourhood; a neighbour's weight is
* 16 - 3 * ((neighbour - pixel)^2 >> strength), saturating
* at zero. The first/last row and the first/last column are
* copied from the source.
*
* SPECIAL NOTES : Pixels are processed eight at a time, so the last group of
* a row can extend past column width-1 (see the review note
* in the body).
* The original note here said the destination frame can be
* the same as the source frame. NOTE(review): neighbours are
* read from s after earlier rows have been written to d, so
* confirm that in-place use is really intended.
*
****************************************************************************/
void spatialFilter_wmt
(
PreProcInstance *ppi,
unsigned char *s,
unsigned char *d,
int width,
int height,
int pitch,
int strength
)
{
int i;
int row = 1;
// Byte offsets from a pixel to each of its nine 3x3 neighbours (itself included).
int PixelOffsets[] =
{
-pitch-1, -pitch, -pitch+1,
-1, 0, +1,
pitch-1, pitch, pitch+1
};
// NOTE(review): frameptr is only written (from the asm block below) and never read afterwards.
unsigned char *frameptr = ppi->frameBuffer;
__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
__declspec(align(16)) unsigned short sixteens[]= {16,16,16,16,16,16,16,16};
// The top row has no row above it, so it is copied unfiltered.
memcpy ( d, s, width );
d += pitch;
s += pitch;
do
{
// NOTE: By doing it this way I am ensuring that pixels will always be unaligned!!!
int col = 1;
// Left and right border pixels are copied unfiltered.
d[0] = s[0];
d[width - 1] = s[width - 1];
do
{
// Per-pixel weight totals and weighted sums for the eight pixels of this group.
__declspec(align(16)) unsigned short counts[8];
__declspec(align(16)) unsigned short sums[8];
_asm
{
mov esi, s // get the source line
add esi, col // add the column offset
pxor xmm1,xmm1 // accumulator (weighted sums)
pxor xmm2,xmm2 // count (sum of weights)
pxor xmm7,xmm7 // 0s for use with unpack
movq xmm3, QWORD PTR [esi] // get 8 centre pixels
punpcklbw xmm3, xmm7 // unpack to shorts
xor eax, eax // neighbor iterator
NextNeighbor:
mov ecx, [PixelOffsets+eax*4] // ecx = byte offset of neighbor number eax
movq xmm4, QWORD PTR [esi + ecx] // get the 8 neighbor pixels at that offset
punpcklbw xmm4, xmm7 // xmm4 unpacked neighbor values
movdqa xmm6, xmm4 // save the neighbor pixel values
psubsw xmm4, xmm3 // neighbor - centre
pmullw xmm4, xmm4 // square the differences
movd xmm5, strength
psrlw xmm4, xmm5 // shift right by strength
pmullw xmm4, threes // 3 * modifier
movdqa xmm5, sixteens // 16s
psubusw xmm5, xmm4 // weight = 16 - modifier, saturating at 0
movdqa xmm4, xmm5 // save the weights
pmullw xmm4, xmm6 // weight * neighbor value
paddusw xmm1, xmm4 // add to accumulator
paddusw xmm2, xmm5 // add to count
inc eax // next neighbor
cmp eax,9 // there are nine neighbors
jne NextNeighbor
movdqa counts, xmm2
psrlw xmm2,1 // divide count by 2 for rounding
paddusw xmm1,xmm2 // rounding added in
mov frameptr,esi // stores the source address; the value is not used later
movdqa sums, xmm1
}
// Divide each sum by its weight total using the reciprocal table
// (a multiply followed by >> 16 in place of an integer division).
for ( i=0; i<8; i++ )
{
int blurvalue = sums[i] * ppi->fixedDivide[counts[i]];
blurvalue >>= 16;
d[col+i] = blurvalue;
}
// NOTE(review): a full group of 8 is always written, so when (width-2) is not a
// multiple of 8 the last group writes d[width-1] and beyond - confirm callers allow this.
col += 8;
} while ( col<width-1 );
d += pitch;
s += pitch;
++row;
} while ( row<height-1 );
// The bottom row has no row below it, so it is copied unfiltered.
memcpy ( d, s, width );
// NOTE(review): only XMM registers are used above, so this emms looks unnecessary - confirm.
__asm emms
}
#endif
/****************************************************************************
*
* ROUTINE : tempFilter_c
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int bytes : Number of bytes to filter.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness-adjusted temporal blur (portable C
* version). ppi->frameBuffer holds FRAMECOUNT history bytes
* for every pixel. On the first frame every history slot is
* seeded with the source pixel and the source is copied to
* the destination. On later frames the source pixel replaces
* slot (frame % FRAMECOUNT) and the output is the weighted
* average of all slots, where a slot's weight is
* 16 - min(16, 3 * ((pixel - slot)^2 >> strength)).
*
* SPECIAL NOTES : Destination frame can be same as source frame.
* A call with bytes <= 0 does nothing and does not advance
* the frame counter.
*
****************************************************************************/
void tempFilter_c
(
    PreProcInstance *ppi,
    unsigned char *s,
    unsigned char *d,
    int bytes,
    int strength
)
{
    unsigned char *history = ppi->frameBuffer;
    int pos;

    /* Nothing to filter; the old do/while loops would still have touched byte 0. */
    if ( bytes <= 0 )
        return;

    if ( ppi->frame == 0 )
    {
        /* First frame: there is no history yet, so fill every slot with the
           current pixel and pass the frame through unchanged. */
        for ( pos = 0; pos < bytes; pos++ )
        {
            int slot;

            for ( slot = 0; slot < FRAMECOUNT; slot++ )
                history[slot] = s[pos];

            history += FRAMECOUNT;
            d[pos] = s[pos];
        }
    }
    else
    {
        /* Slot that the current frame overwrites (oldest entry in the ring). */
        const int newest = ppi->frame % FRAMECOUNT;

        for ( pos = 0; pos < bytes; pos++ )
        {
            const int current = s[pos];
            int weightedSum = 0;
            int weightTotal = 0;
            int slot;

            history[newest] = s[pos];

            for ( slot = 0; slot < FRAMECOUNT; slot++ )
            {
                const int past = history[slot];
                int weight = current - past;

                /* Squared difference, scaled down by strength, times 3, capped at 16. */
                weight *= weight;
                weight >>= strength;
                weight *= 3;
                if ( weight > 16 )
                    weight = 16;

                /* Identical values get weight 16, very different ones weight 0. */
                weight = 16 - weight;

                weightedSum += weight * past;
                weightTotal += weight;
            }

            history += FRAMECOUNT;

            /* The slot just written equals the current pixel, so weightTotal is
               at least 16 and the table lookup below never uses index 0. */
            weightedSum += ( weightTotal >> 1 );           /* round to nearest */
            weightedSum *= ppi->fixedDivide[weightTotal];  /* 0x10000 / weightTotal */
            weightedSum >>= 16;

            d[pos] = weightedSum;
        }
    }

    ++ppi->frame;
}
#ifndef MAPCA
/****************************************************************************
*
* ROUTINE : tempFilter_wmt
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int bytes : Number of bytes to filter.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness-adjusted temporal blur (SSE2 version).
* Pixels are handled in groups of eight. For each group
* ppi->frameBuffer holds FRAMECOUNT consecutive 8-byte
* history slots. On the first frame every slot is seeded
* with the source pixels; afterwards the source replaces
* slot (frame % FRAMECOUNT) and the output is the weighted
* average of all slots.
*
* SPECIAL NOTES : Destination frame can be same as source frame.
* Whole groups of 8 bytes are always read and written, so
* when bytes is not a multiple of 8 the last group extends
* past the end of the requested range.
*
****************************************************************************/
void tempFilter_wmt
(
PreProcInstance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
)
{
int byte = 0;
// Cursor into the history buffer; advances by FRAMECOUNT*8 bytes per pixel group.
unsigned char * frameptr = ppi->frameBuffer;
__declspec(align(16)) unsigned short threes[] ={ 3, 3, 3, 3, 3, 3, 3, 3};
__declspec(align(16)) unsigned short sixteens[]={16,16,16,16,16,16,16,16};
if ( ppi->frame == 0 )
{
// First frame: no history yet, so seed every slot with the source pixels
// and pass the frame through unchanged.
do
{
int i;
int frame = 0;
do
{
for ( i=0; i<8; i++ )
{
*frameptr = s[byte+i];
++frameptr;
}
++frame;
} while ( frame < FRAMECOUNT );
for ( i=0; i<8; i++ )
d[byte+i] = s[byte+i];
byte += 8;
} while ( byte < bytes );
}
else
{
int i;
// Index of the history slot that the current frame overwrites.
int offset2 = (ppi->frame % FRAMECOUNT);
do
{
// Per-pixel weight totals and weighted sums for the eight pixels of this group.
__declspec(align(16)) unsigned short counts[8];
__declspec(align(16)) unsigned short sums[8];
// NOTE(review): accumulator, count and frame are not used in this branch.
int accumulator = 0;
int count = 0;
int frame = 0;
_asm
{
mov eax,offset2
mov edi,s // source pixels
pxor xmm1,xmm1 // accumulator (weighted sums)
pxor xmm7,xmm7 // 0s for use with unpack
mov esi,frameptr // history buffer cursor for this group
pxor xmm2,xmm2 // count (sum of weights)
movq xmm3, QWORD PTR [edi] // get 8 source pixels
movq QWORD PTR [esi+8*eax],xmm3 // store them in history slot offset2
punpcklbw xmm3, xmm2 // xmm3 source pixels as shorts (xmm2 is zero here)
mov ecx, FRAMECOUNT
NextFrame:
movq xmm4, QWORD PTR [esi] // get frame buffer values
punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
movdqa xmm6, xmm4 // save the pixel values
psubsw xmm4, xmm3 // history - source
pmullw xmm4, xmm4 // square the differences
movd xmm5, strength
psrlw xmm4, xmm5 // shift right by strength
pmullw xmm4, threes // 3 * modifier
movdqa xmm5, sixteens // 16s
psubusw xmm5, xmm4 // weight = 16 - modifier, saturating at 0
movdqa xmm4, xmm5 // save the weights
pmullw xmm4, xmm6 // weight * history value
paddusw xmm1, xmm4 // add to accumulator
paddusw xmm2, xmm5 // add to count
add esi, 8 // next history slot
dec ecx // one fewer slot to go
jnz NextFrame
movdqa counts, xmm2
psrlw xmm2,1 // divide count by 2 for rounding
paddusw xmm1,xmm2 // rounding added in
mov frameptr,esi // cursor now points at the next group's history
movdqa sums, xmm1
}
// Divide each sum by its weight total using the reciprocal table
// (a multiply followed by >> 16 in place of an integer division).
for ( i=0; i<8; i++ )
{
int blurvalue = sums[i] * ppi->fixedDivide[counts[i]];
blurvalue >>= 16;
d[i] = blurvalue;
}
// This branch advances the s and d pointers themselves rather than indexing by byte.
s += 8;
d += 8;
byte += 8;
} while ( byte < bytes );
}
++ppi->frame;
// NOTE(review): only XMM registers are used above, so this emms looks unnecessary - confirm.
__asm emms
}
/****************************************************************************
*
* ROUTINE : tempFilter_mmx
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int bytes : Number of bytes to filter.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness-adjusted temporal blur (MMX version).
* Pixels are handled in groups of four. For each group
* ppi->frameBuffer holds FRAMECOUNT consecutive 4-byte
* history slots. On the first frame every slot is seeded
* with the source pixels; afterwards the source replaces
* slot (frame % FRAMECOUNT) and the output is the weighted
* average of all slots.
*
* SPECIAL NOTES : Destination frame can be same as source frame.
* Whole groups of 4 bytes are always read and written, so
* when bytes is not a multiple of 4 the last group extends
* past the end of the requested range.
*
****************************************************************************/
void tempFilter_mmx
(
PreProcInstance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
)
{
int byte = 0;
// Cursor into the history buffer; advances by FRAMECOUNT*4 bytes per pixel group.
unsigned char *frameptr = ppi->frameBuffer;
__declspec(align(16)) unsigned short threes[] ={ 3, 3, 3, 3};
__declspec(align(16)) unsigned short sixteens[]={16,16,16,16};
if ( ppi->frame == 0 )
{
// First frame: no history yet, so seed every slot with the source pixels
// and pass the frame through unchanged.
do
{
int i;
int frame = 0;
do
{
for ( i=0; i<4; i++ )
{
*frameptr = s[byte+i];
++frameptr;
}
++frame;
} while ( frame < FRAMECOUNT );
for ( i=0; i<4; i++ )
d[byte+i] = s[byte+i];
byte += 4;
} while ( byte < bytes );
}
else
{
int i;
// Index of the history slot that the current frame overwrites.
int offset2 = (ppi->frame % FRAMECOUNT);
do
{
// Weight totals and weighted sums; only the first four entries of each
// array are written (movq stores 8 bytes) and read by this function.
__declspec(align(16)) unsigned short counts[8];
__declspec(align(16)) unsigned short sums[8];
// NOTE(review): accumulator, count and frame are not used in this branch.
int accumulator = 0;
int count = 0;
int frame = 0;
_asm
{
mov eax,offset2
mov edi,s // source pixels
pxor mm1,mm1 // accumulator (weighted sums)
pxor mm7,mm7 // 0s for use with unpack
mov esi,frameptr // history buffer cursor for this group
pxor mm2,mm2 // count (sum of weights)
movd mm3, DWORD PTR [edi] // get 4 source pixels
movd DWORD PTR [esi+4*eax],mm3 // store them in history slot offset2
punpcklbw mm3, mm2 // mm3 source pixels as shorts (mm2 is zero here)
mov ecx, FRAMECOUNT
NextFrame:
movd mm4, DWORD PTR [esi] // get frame buffer values
punpcklbw mm4, mm7 // mm4 frame buffer pixels
movq mm6, mm4 // save the pixel values
psubsw mm4, mm3 // history - source
pmullw mm4, mm4 // square the differences
movd mm5, strength
psrlw mm4, mm5 // shift right by strength
pmullw mm4, threes // 3 * modifier
movq mm5, sixteens // 16s
psubusw mm5, mm4 // weight = 16 - modifier, saturating at 0
movq mm4, mm5 // save the weights
pmullw mm4, mm6 // weight * history value
paddusw mm1, mm4 // add to accumulator
paddusw mm2, mm5 // add to count
add esi, 4 // next history slot
dec ecx // one fewer slot to go
jnz NextFrame
movq counts, mm2
psrlw mm2,1 // divide count by 2 for rounding
paddusw mm1,mm2 // rounding added in
mov frameptr,esi // cursor now points at the next group's history
movq sums, mm1
}
// Divide each sum by its weight total using the reciprocal table
// (a multiply followed by >> 16 in place of an integer division).
for ( i=0; i<4; i++ )
{
int blurvalue = sums[i] * ppi->fixedDivide[counts[i]];
blurvalue >>= 16;
d[i] = blurvalue;
}
// This branch advances the s and d pointers themselves rather than indexing by byte.
s += 4;
d += 4;
byte += 4;
} while ( byte < bytes );
}
++ppi->frame;
// Clear the MMX state after using the mm registers.
__asm emms
}
#endif
/****************************************************************************
*
* ROUTINE : DeletePreProc
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Releases the buffers owned by a pre-processing instance
* and resets the related pointers to zero.
*
* SPECIAL NOTES : Safe to call repeatedly: pointers that are already zero
* are not passed to duck_free.
*
****************************************************************************/
void DeletePreProc ( PreProcInstance *ppi )
{
    /* Temporal history buffer: free the raw allocation, then forget both
       the raw pointer and the aligned pointer derived from it. */
    if ( ppi->frameBufferAlloc != 0 )
    {
        duck_free ( ppi->frameBufferAlloc );
        ppi->frameBufferAlloc = 0;
    }
    ppi->frameBuffer = 0;

    /* Reciprocal table: same treatment. */
    if ( ppi->fixedDivideAlloc != 0 )
    {
        duck_free ( ppi->fixedDivideAlloc );
        ppi->fixedDivideAlloc = 0;
    }
    ppi->fixedDivide = 0;
}
/****************************************************************************
*
* ROUTINE : InitPreProc
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
* int FrameSize : Number of bytes in one frame.
*
* OUTPUTS : None.
*
* RETURNS : int: 1 if successful, 0 if failed.
*
* FUNCTION : Initializes a pre-processor instance: selects the temporal
* filter implementation, releases any buffers the instance
* already owns, then allocates the 32-byte aligned history
* buffer and the reciprocal table used by the filters.
*
* SPECIAL NOTES : Any non-zero buffer pointers in *ppi are freed first, so
* the structure must be zero-filled before the first call.
* On failure the instance is left with no buffers.
*
****************************************************************************/
int InitPreProc ( PreProcInstance *ppi, int FrameSize )
{
    enum
    {
        ALIGN_SLACK    = 32,   /* spare bytes so the pointer can be rounded up to 32 */
        PIXEL_GROUP    = 8,    /* the widest filter handles 8 pixels per step */
        DIVIDE_ENTRIES = 255   /* reciprocal table covers divisors 0..254 */
    };
    size_t paddedFrame;
    int i;
    int MmxEnabled = 0;
    int XmmEnabled = 0;
    int WmtEnabled = 0;

    if ( !ppi )
        return 0;

#ifndef MAPCA
    /* Choose the fastest temporal filter the processor supports. */
    GetProcessorFlags ( &MmxEnabled, &XmmEnabled, &WmtEnabled );

    if ( WmtEnabled )
        tempFilter = tempFilter_wmt;
    else if ( MmxEnabled )
        tempFilter = tempFilter_mmx;
    else
#endif
        tempFilter = tempFilter_c;

    /* Drop whatever a previous initialisation left behind. */
    DeletePreProc ( ppi );

    /* Reject sizes that cannot be represented instead of letting the byte
       count wrap around before it reaches the allocator. */
    if ( FrameSize < 0 )
        return 0;

    /* The SIMD filters write history in whole pixel groups, so size the
       buffer for the frame rounded up to a full group. */
    paddedFrame = ( (size_t) FrameSize + ( PIXEL_GROUP - 1 ) ) & ~( (size_t) ( PIXEL_GROUP - 1 ) );

    if ( paddedFrame > ( (size_t) -1 - ALIGN_SLACK ) / FRAMECOUNT )
        return 0;

    /* History buffer: FRAMECOUNT bytes for every pixel of the frame. */
    ppi->frameBufferAlloc = duck_malloc ( ALIGN_SLACK + paddedFrame * FRAMECOUNT, DMEM_GENERAL );

    if ( !ppi->frameBufferAlloc )
    {
        DeletePreProc ( ppi );
        return 0;
    }

    ppi->frameBuffer = (unsigned char *) ROUNDUP32 ( ppi->frameBufferAlloc );

    /* Reciprocal table: fixedDivide[i] = 0x10000 / i, so the filters can
       replace a division by i with a multiply and a 16-bit shift. */
    ppi->fixedDivideAlloc = duck_malloc ( ALIGN_SLACK + DIVIDE_ENTRIES * sizeof(unsigned int), DMEM_GENERAL );

    if ( !ppi->fixedDivideAlloc )
    {
        DeletePreProc ( ppi );
        return 0;
    }

    ppi->fixedDivide = (unsigned int *) ROUNDUP32 ( ppi->fixedDivideAlloc );

    /* Entry 0 has no reciprocal; give it a defined value rather than
       leaving it uninitialised. */
    ppi->fixedDivide[0] = 0;

    for ( i = 1; i < DIVIDE_ENTRIES; i++ )
        ppi->fixedDivide[i] = 0x10000 / i;

    return 1;
}
/****************************************************************************
*
* ROUTINE : spatialFilter_c
*
* INPUTS : PreProcInstance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int width : Width of images.
* int height : Height of images.
* int pitch : Stride of images.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness-adjusted spatial blur (portable C
* version). Each interior pixel is replaced by a weighted
* average of its 3x3 neighbourhood, where a neighbour's
* weight is 16 - min(16, 3 * ((pixel - neighbour)^2 >> strength)).
* The first/last row and first/last column are copied from
* the source unchanged.
*
* SPECIAL NOTES : Images narrower or shorter than 3 pixels have no interior
* and are copied row by row; non-positive dimensions are
* ignored. Border rows are copied with memcpy, so s and d
* must not overlap.
*
****************************************************************************/
void spatialFilter_c
(
    PreProcInstance *ppi,
    unsigned char *s,
    unsigned char *d,
    int width,
    int height,
    int pitch,
    int strength
)
{
    int offsets[9];
    int row;

    if ( width <= 0 || height <= 0 )
        return;

    /* No interior pixels: the old do/while loops would have filtered a row
       or column that lies outside the image. Copy the frame instead. */
    if ( width < 3 || height < 3 )
    {
        for ( row = 0; row < height; row++ )
        {
            memcpy ( d, s, width );
            d += pitch;
            s += pitch;
        }
        return;
    }

    /* Byte offsets from a pixel to its nine 3x3 neighbours (itself included). */
    offsets[0] = -pitch - 1;
    offsets[1] = -pitch;
    offsets[2] = -pitch + 1;
    offsets[3] = -1;
    offsets[4] = 0;
    offsets[5] = 1;
    offsets[6] = pitch - 1;
    offsets[7] = pitch;
    offsets[8] = pitch + 1;

    /* Top row has no row above it: copy unfiltered. */
    memcpy ( d, s, width );
    d += pitch;
    s += pitch;

    for ( row = 1; row < height - 1; row++ )
    {
        int col;

        /* Left and right border pixels are copied unfiltered. */
        d[0] = s[0];
        d[width - 1] = s[width - 1];

        for ( col = 1; col < width - 1; col++ )
        {
            const int centre = s[col];
            int weightedSum = 0;
            int weightTotal = 0;
            int n;

            for ( n = 0; n < 9; n++ )
            {
                const int neighbour = s[col + offsets[n]];
                int weight = centre - neighbour;

                /* Squared difference, scaled down by strength, times 3, capped at 16. */
                weight *= weight;
                weight >>= strength;
                weight *= 3;
                if ( weight > 16 )
                    weight = 16;

                /* Identical values get weight 16, very different ones weight 0. */
                weight = 16 - weight;

                weightedSum += weight * neighbour;
                weightTotal += weight;
            }

            /* The centre pixel always contributes weight 16, so weightTotal
               lies in 16..144 and never selects table entry 0. */
            weightedSum += ( weightTotal >> 1 );           /* round to nearest */
            weightedSum *= ppi->fixedDivide[weightTotal];  /* 0x10000 / weightTotal */
            weightedSum >>= 16;

            d[col] = weightedSum;
        }

        d += pitch;
        s += pitch;
    }

    /* Bottom row has no row below it: copy unfiltered. */
    memcpy ( d, s, width );
}