
/*
 * bltPictureMMX.c --
 *
 * This module implements image processing procedures for the BLT toolkit.
 *
 *	Copyright 1997-2004 George A Howlett.
 *
 *	Permission is hereby granted, free of charge, to any person
 *	obtaining a copy of this software and associated documentation
 *	files (the "Software"), to deal in the Software without
 *	restriction, including without limitation the rights to use,
 *	copy, modify, merge, publish, distribute, sublicense, and/or
 *	sell copies of the Software, and to permit persons to whom the
 *	Software is furnished to do so, subject to the following
 *	conditions:
 *
 *	The above copyright notice and this permission notice shall be
 *	included in all copies or substantial portions of the
 *	Software.
 *
 *	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
 *	KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 *	WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 *	PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
 *	OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 *	OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 *	OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 *	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "bltInt.h"
#include "bltPicture.h"
#include "bltPictureInt.h"

/* File-scope flag with external linkage, initially 0.  NOTE(review):
 * presumably set non-zero by callers once MMX support has been detected
 * and consulted to choose between the MMX and plain C routines -- confirm
 * against the users of this variable. */
int bltUseMMX = 0;

#ifdef HAVE_X86_ASM

/*
 * Blt_SelectPixelsMMX --
 *
 *	Writes a selection mask into destPtr.  For each pixel of srcPtr the
 *	corresponding destination pixel is set to all ones (0xFFFFFFFF)
 *	when every one of its four 8-bit components lies in the inclusive,
 *	unsigned range given by the matching components of lowerPtr and
 *	upperPtr; otherwise it is set to 0.
 *
 *	Loop bounds are taken from srcPtr only.  Pixels are handled in
 *	pairs (64-bit loads and stores), so an odd srcPtr->width touches
 *	one pixel past the row end in both pictures.  NOTE(review):
 *	presumably rows are padded to an even pixel count and destPtr is
 *	at least as large as srcPtr -- confirm with the Picture allocator.
 *
 *	NOTE(review): mm4, mm5 and mm6 are loaded by the first asm
 *	statement and consumed by the per-pair asm statements.  None of
 *	the statements declare register or "memory" clobbers, so the
 *	compiler is not told about this state or about the stores made
 *	through dp.
 */
void
Blt_SelectPixelsMMX(
    Picture *destPtr,
    Picture *srcPtr, 
    Pix32 *lowerPtr,
    Pix32 *upperPtr)
{
    Pix32 *srcRowPtr, *destRowPtr;
    int y;

    /* One-time setup: broadcast the bounds into both 32-bit halves of
     * mm4 (lower) and mm5 (upper) and clear mm6. */
    asm volatile (
	/* Put lower and upper pixels in registers. */
	"movd %0, %%mm4	        # mm4 = L\n\t"
	"movd %1, %%mm5	        # mm5 = H\n\t"
	"pxor %%mm6, %%mm6	# mm6 = 0\n\t"
	"punpckldq %%mm4, %%mm4 # mm4 = L,L\n\t" 
	"punpckldq %%mm5, %%mm5 # mm5 = H,H\n\t" :
	/* output registers */ :
	/* input registers */
	"r" (lowerPtr->color), "r" (upperPtr->color));

    destRowPtr = destPtr->bits, srcRowPtr = srcPtr->bits;
    for (y = 0; y < srcPtr->height; y++) {
	Pix32 *dp, *sp, *send;

	dp = destRowPtr;
	/* Two pixels (8 bytes) per iteration. */
	for(sp = srcRowPtr, send = sp + srcPtr->width; sp < send; sp += 2) {
	    asm volatile (
		/* Compare two pixels at a time */
		"movq (%1), %%mm3	# mm3 = S1,S2\n\t"
		"movq %%mm4, %%mm0	# mm0 = L,L\n\t"

		/* We want to test (S >= L) && (S <= H). Since the
		 * operands are all unsigned, pcmp* ops are out. 
		 * Instead use saturated, unsigned subtraction.
		 * ((L psub S) == 0) is the same as (S >= L) */

		"psubusb %%mm3, %%mm0	# mm0 = L - S\n\t"
		"movq %%mm3, %%mm1	# mm1 = S\n\t"
		"psubusb %%mm5, %%mm1	# mm1 = S - H\n\t"

		/* "or" the two results and compare 32-bit values to 0
		 * (inverting the logic). */

		/* A 32-bit lane of mm0 is zero only when all four bytes
		 * passed both tests; pcmpeqd against mm6 (zero) turns
		 * that into an all-ones lane, anything else into 0. */
		"por %%mm1, %%mm0	# mm0 = (S >= L)|(H >= S)\n\t"
 		"pcmpeqd %%mm6, %%mm0	# invert logic\n\t"
		"movq %%mm0, (%0)	# dp = new value\n" :
		/* output registers */
		"+r" (dp) :
		/* input registers */
		"r" (sp));
	    dp += 2;
	}
	srcRowPtr  += srcPtr->pixelsPerRow;
	destRowPtr += destPtr->pixelsPerRow;
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}

/*
 * Blt_ApplyPictureToPictureMMX --
 *
 *	Combines srcPtr into destPtr in place, byte by byte (each 8-bit
 *	component independently), over the region common to both pictures
 *	(minimum of the widths and of the heights):
 *
 *	    ADD   dest = dest + src   (unsigned, saturating)
 *	    SUB   dest = dest - src   (unsigned, saturating)
 *	    RSUB  dest = src - dest   (unsigned, saturating)
 *	    AND   dest = dest & src
 *	    OR    dest = dest | src
 *	    NAND  dest = ~(dest & src)
 *	    NOR   dest = ~(dest | src)
 *	    XOR   dest = dest ^ src
 *	    MIN   dest = min(dest, src)   (unsigned bytes)
 *	    MAX   dest = max(dest, src)   (unsigned bytes)
 *
 *	The switch has no default case, so any other value of op leaves
 *	the destination untouched.
 *
 *	Pixels are processed two at a time with 64-bit loads and stores,
 *	so an odd common width touches one pixel past it in both rows.
 *	NOTE(review): presumably rows are padded to an even pixel count --
 *	confirm with the Picture allocator.
 *
 *	NOTE(review): mm6 (zero) and mm7 (all ones) are set once up front
 *	and relied upon by the later asm statements; no register or
 *	"memory" clobbers are declared on any of them.
 */
void
Blt_ApplyPictureToPictureMMX(
    Picture *destPtr, 
    Picture *srcPtr, 
    Blt_PictureArithOps op)
{
    Pix32 *srcRowPtr, *destRowPtr;
    int width, height;
    int y;

    /* If the picture sizes are different use the smaller dimension. */
    width = MIN(srcPtr->width, destPtr->width);
    height = MIN(srcPtr->height, destPtr->height);
    
    /* Note that the constant of all ones is built in mm7 (the trailing
     * assembler comment below names mm5). */
    asm volatile (
        /* Generate constants needed below. */
	"pxor %mm6, %mm6	# mm6 = 0\n\t"
	"pcmpeqw %mm7, %mm7	# mm5 = -1 \n");

    srcRowPtr = srcPtr->bits;
    destRowPtr = destPtr->bits;
    for (y = 0; y < height; y++) {
	Pix32 *sp, *dp, *dend;

	sp = srcRowPtr;
	dp = destRowPtr, dend = dp + width;
	/* In every case below %0 is dp (destination) and %1 is sp
	 * (source); each iteration handles two pixels. */
	switch(op) {
	case PICTURE_ARITH_ADD:
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "paddusb (%1), %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_SUB:
	    while (dp < dend) {
		asm volatile (
		     "movq (%0), %%mm0\n\t" 
		     "psubusb (%1), %%mm0\n\t" 
		     "movq %%mm0, (%0)" : 
		     /* output registers */
		     "+r" (dp) : 
		     /* input registers */
		     "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_RSUB:
	    /* Reverse subtract: the source is loaded first and the
	     * destination is subtracted from it. */
	    while (dp < dend) {
		asm volatile (
		     "movq (%1), %%mm1\n\t" 
		     "psubusb (%0), %%mm1\n\t" 
		     "movq %%mm1, (%0)" : 
		     /* output registers */
		     "+r" (dp) : 
		     /* input registers */
		     "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_AND:
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "pand (%1), %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_OR:
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "por (%1), %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_NAND:
	    /* AND, then complement by XOR with mm7 (all ones). */
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "pand (%1), %%mm0\n\t" 
		    "pxor %%mm7, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_NOR:
	    /* OR, then complement by XOR with mm7 (all ones). */
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "por (%1), %%mm0\n\t" 
		    "pxor %%mm7, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_XOR:
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "pxor (%1), %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_MIN:
	    /* A = dest, B = src.  The saturated difference A - B is zero
	     * exactly where A <= B, giving a byte mask in mm2.  The
	     * result is (mask & A) | (~mask & B), i.e. the smaller
	     * byte. */
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0		# mm0 = A\n\t" 
		    "movq (%1), %%mm1		# mm1 = B\n\t" 
		    "movq %%mm0, %%mm2		# mm2 = A\n\t" 
		    "psubusb %%mm1, %%mm2	# mm2 = A - B\n\t"
		    "pcmpeqb %%mm6, %%mm2	# mm2 = 0s A>B 1s A<=B\n\t"
		    "pand %%mm2, %%mm0		# mm2 = mask & A\n\t" 
		    "pxor %%mm7, %%mm2		# mm2 = ~mask\n\t" 
		    "pand %%mm2, %%mm1		# mm0 = ~mask & B\n\t" 
		    "por %%mm1, %%mm0		# mm0 = R1 | R2\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;

	case PICTURE_ARITH_MAX:
	    /* Same mask as MIN, with the roles swapped: the result is
	     * (mask & B) | (~mask & A), i.e. the larger byte. */
	    while (dp < dend) {
		asm volatile (
		    "movq (%0), %%mm0		# mm0 = A\n\t" 
		    "movq (%1), %%mm1		# mm1 = B\n\t" 
		    "movq %%mm0, %%mm2		# mm2 = A\n\t" 
		    "psubusb %%mm1, %%mm2	# mm2 = A - B\n\t"
		    "pcmpeqb %%mm6, %%mm2	# mm2 = 0s A>B 1s A<=B\n\t"
		    "pand %%mm2, %%mm1		# mm1 = mask & B\n\t" 
		    "pxor %%mm7, %%mm2		# mm2 = ~mask\n\t" 
		    "pand %%mm2, %%mm0		# mm0 = ~mask & A\n\t" 
		    "por %%mm1, %%mm0		# mm3 = R1 | R2\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (dp) : 
		    /* input registers */
		    "r" (sp));
		sp += 2, dp += 2;
	    }
	    break;
	}
	destRowPtr += destPtr->pixelsPerRow;
	srcRowPtr += srcPtr->pixelsPerRow;
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}


/*
 * Blt_ApplyScalarToPictureMMX --
 *
 *	Combines every pixel of srcPtr, in place, with the single color
 *	*colorPtr.  Each 8-bit component is treated independently:
 *
 *	    ADD   pixel = pixel + scalar   (unsigned, saturating)
 *	    SUB   pixel = pixel - scalar   (unsigned, saturating)
 *	    RSUB  pixel = scalar - pixel   (unsigned, saturating)
 *	    AND   pixel = pixel & scalar
 *	    OR    pixel = pixel | scalar
 *	    NAND  pixel = ~(pixel & scalar)
 *	    NOR   pixel = ~(pixel | scalar)
 *	    XOR   pixel = pixel ^ scalar
 *	    MIN   pixel = min(pixel, scalar)   (unsigned bytes)
 *	    MAX   pixel = max(pixel, scalar)   (unsigned bytes)
 *
 *	The switch has no default case, so any other value of op leaves
 *	the picture untouched.
 *
 *	Pixels are processed two at a time with 64-bit loads and stores,
 *	so an odd srcPtr->width touches one pixel past the row end.
 *	NOTE(review): presumably rows are padded to an even pixel count --
 *	confirm with the Picture allocator.
 *
 *	NOTE(review): mm4, mm6 and mm7 are set once up front and relied
 *	upon by the later asm statements; no register or "memory" clobbers
 *	are declared on any of them.
 */
void
Blt_ApplyScalarToPictureMMX(
    Picture *srcPtr, 
    Pix32 *colorPtr,
    Blt_PictureArithOps op)
{
    Pix32 *srcRowPtr;
    unsigned long value;
    int y;
    
    /*
     * mm7 = -1
     * mm6 = 0x0
     * mm4 = scalar,scalar
     */
    value = (unsigned long)colorPtr->color;
    asm volatile (
        /* Generate constants needed below. */
	"pxor %%mm6, %%mm6	  # mm6 = 0\n\t"
	"pcmpeqw %%mm7, %%mm7	  # mm5 = -1 \n\t"
	/* Put the scalar into hi/lo 32-bit words.*/
	"movd %0, %%mm4		  # mm4 = scalar\n\t"
	"punpckldq %%mm4, %%mm4   # mm2 = S,S\n" :
	/* output registers */ :
	/* input registers */
	"r" (value));

    srcRowPtr = srcPtr->bits;
    for (y = 0; y < srcPtr->height; y++) {
	Pix32 *sp, *send;

	sp = srcRowPtr;
	send = sp + srcPtr->width;
	/* In every case below %0 is sp; each iteration rewrites two
	 * pixels in place. */
	switch(op) {
	case PICTURE_ARITH_ADD:
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "paddusb %%mm4, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_SUB:
	    while (sp < send) {
		asm volatile (
		     "movq (%0), %%mm0\n\t" 
		     "psubusb %%mm4, %%mm0\n\t" 
		     "movq %%mm0, (%0)" : 
		     /* output registers */
		     "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_RSUB:
	    /* Reverse subtract: a copy of the scalar has the pixel
	     * subtracted from it. */
	    while (sp < send) {
		asm volatile (
		     "movq (%0), %%mm0\n\t" 
		     "movq %%mm4, %%mm1\n\t"
		     "psubusb %%mm0, %%mm1\n\t" 
		     "movq %%mm1, (%0)" : 
		     /* output registers */
		     "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_AND:
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "pand %%mm4, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_OR:
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "por %%mm4, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_NAND:
	    /* AND, then complement by XOR with mm7 (all ones). */
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "pand %%mm4, %%mm0\n\t" 
		    "pxor %%mm7, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_NOR:
	    /* OR, then complement by XOR with mm7 (all ones). */
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "por %%mm4, %%mm0\n\t" 
		    "pxor %%mm7, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_XOR:
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0\n\t" 
		    "pxor %%mm4, %%mm0\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_MIN:
	    /* The saturated difference C - S is zero exactly where
	     * C <= S, giving a byte mask in mm1.  The pixel is kept
	     * where the mask is set and replaced by the scalar
	     * elsewhere. */
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0     # mm0 = Color\n\t" 
		    "movq %%mm0, %%mm1    # mm1 = Color\n\t" 
		    "psubusb %%mm4, %%mm1 # mm1 = C - S\n\t"
		    "pcmpeqb %%mm6, %%mm1 # mm2 = mask: 0s C>S 1s C<=S\n\t"
		    "pand %%mm1, %%mm0    # mm0 = mask & C\n\t" 
		    "pxor %%mm7, %%mm1    # mm1 = ~mask\n\t" 
		    "pand %%mm4, %%mm1    # mm1 = S & ~mask\n\t" 
		    "por %%mm1, %%mm0     # mm0 = (S&~mask)|(mask&C)\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;

	case PICTURE_ARITH_MAX:
	    /* The saturated difference S - C is zero exactly where
	     * S <= C, giving a byte mask in mm1.  The pixel is kept
	     * where the mask is set and replaced by the scalar
	     * elsewhere. */
	    while (sp < send) {
		asm volatile (
		    "movq (%0), %%mm0     # mm0 = Color\n\t" 
		    "movq %%mm4, %%mm1    # mm1 = Scalar\n\t" 
		    "psubusb %%mm0, %%mm1 # mm1 = S - C\n\t"
		    "pcmpeqb %%mm6, %%mm1 # mm1 = mask: 0s S>C 1s S<=C\n\t"
		    "pand %%mm1, %%mm0    # mm0 = mask & C\n\t" 
		    "pxor %%mm7, %%mm1    # mm1 = ~mask\n\t" 
		    "pand %%mm4, %%mm1    # mm1 = S & ~mask\n\t" 
		    "por %%mm1, %%mm0     # mm0 = (S&~mask)|(mask&C)\n\t" 
		    "movq %%mm0, (%0)" : 
		    /* output registers */
		    "+r" (sp));
		sp += 2;
	    }
	    break;
	}
	srcRowPtr += srcPtr->pixelsPerRow;
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}

/*
 * Blt_ZoomVerticallyMMX --
 *
 *	Resamples srcPtr into destPtr along the vertical axis.  The
 *	per-row filter contributions are obtained from
 *	Blt_ComputeWeights(srcPtr->height, destPtr->height, ...), which
 *	returns the size in bytes of one Sample record; destPtr->height
 *	such records are walked for every column x in
 *	[0, srcPtr->width).  For each record the source pixels starting
 *	at row splPtr->start are multiplied by the 16-bit weights stored
 *	every 4 bytes in [splPtr->weights, splPtr->wend), accumulated per
 *	component, biased, shifted down to 8 bits and stored with
 *	unsigned saturation.
 *
 *	The sample array is released with Blt_Free before returning.
 *
 *	The running weight and source pointers are handed to the asm as
 *	early-clobber in/out operands ("+&r") because the asm loop
 *	advances them; GCC does not permit modifying input-only operands.
 *	A "memory" clobber tells the compiler that the asm reads the
 *	weights/pixels and stores the result through dp.
 *
 *	NOTE(review): the weight loop is a do/while -- it assumes every
 *	Sample has at least one weight (weights != wend).
 *
 *	NOTE(review): the setup code builds mm2 with pcmpeqw (all ones)
 *	followed by psubw of mm6, which is zero, so the words stay
 *	0xFFFF; after the shift by 4 the "BIAS" words are 0xFFF0 (-16),
 *	not +16 as the trailing assembler comments say.  Left as is
 *	because changing it alters every resampled pixel -- confirm the
 *	intended rounding before touching it.
 */
void
Blt_ZoomVerticallyMMX(
    Picture *destPtr,
    Picture *srcPtr, 
    PictureFilter *filterPtr)
{
    Sample *samples, *send;
    int x;
    int bytesPerSample;		/* Size of sample. */
    long bytesPerRow;

    /* Pre-calculate filter contributions for each row. */
    bytesPerSample = Blt_ComputeWeights(srcPtr->height, destPtr->height, 
	filterPtr, &samples);
    bytesPerRow = sizeof(Pix32) * srcPtr->pixelsPerRow;
    send = (Sample *)((char *)samples + (destPtr->height * bytesPerSample));

    asm volatile (
        /* Generate constants needed below. */
	"pxor %mm6, %mm6	# mm6 = 0\n\t"
	"pcmpeqw %mm2, %mm2	# mm2 = -1 \n\t"
	"psubw %mm6, %mm2	# mm2 = 1,1,1,1\n\t"
	"psllw $4, %mm2	        # mm2 = BIAS\n");

    /* Apply filter to each row. */
    for (x = 0; x < srcPtr->width; x++) {
	Pix32 *dp, *srcColumnPtr;
	Sample *splPtr;

	srcColumnPtr = srcPtr->bits + x;
	dp = destPtr->bits + x;
	for (splPtr = samples; splPtr < send; 
	     splPtr = (Sample *)((char *)splPtr + bytesPerSample)) {
	    Pix32 *sp;		/* Current source pixel; advanced by the
				 * asm one row per weight. */
	    const void *wp;	/* Current weight; advanced by the asm
				 * 4 bytes per weight. */

	    sp = srcColumnPtr + (splPtr->start * srcPtr->pixelsPerRow);
	    wp = splPtr->weights;
	    /*
	     * Operands:
	     *   %0 = dp           destination pixel (not modified)
	     *   %1 = wp           running weight pointer (modified)
	     *   %2 = sp           running source pointer (modified)
	     *   %3 = splPtr->wend end of this sample's weights
	     *   %4 = bytesPerRow  byte stride between source rows
	     */
	    asm volatile (
		/* Clear the accumulator mm5. */
                 "pxor %%mm5, %%mm5	    #  mm5 = 0\n\n" 
                 ".Lasm%=:\n\t" 
		 /* Load the weighting factor into mm1. */
		 "movd (%1), %%mm1	    #  mm1 = 0,0,0,W\n\t"
		 /* Load the source pixel into mm0. */
                 "movd (%2), %%mm0          #  mm0 = S\n\t" 
		 /* Replicate the low 16 bits of the weight into all
		  * four words of mm1. */
		 "punpcklwd %%mm1, %%mm1    #  mm1 = 0,0,W,W\n\t"
		 "punpcklwd %%mm1, %%mm1    #  mm1 = W,W,W,W\n\t"
		 /* Unpack the pixel components into 16-bit words.*/
                 "punpcklbw %%mm6, %%mm0    #  mm0 = Sa,Sb,Sg,Sr\n\t" 
		 /* Scale the 8-bit components to 15 bits: (S * 257) >> 1 */
                 "movq %%mm0, %%mm3         #  mm3 = S8\n\t" 
                 "psllw $8, %%mm3           #  mm3 = S8 * 256\n\t" 
                 "paddw %%mm3, %%mm0        #  mm0 = S16\n\t" 
                 "psrlw $1, %%mm0           #  mm0 = S15\n\t" 
		 /* Multiply each pixel component by the weight.  Note
		  * that the lower 16-bits of the product are
		  * truncated (bad) creating round-off error in the
		  * sum. */
                 "pmulhw %%mm1, %%mm0       #  mm0 = S15 * W14\n\t" 
                 /* Accumulate upper 16-bit results of product in mm5. */
                 "paddsw %%mm0, %%mm5        #  mm5 = prod + mm5\n\t" 
                 /* Move the pointers to the next weight and pixel */
                 "add $4, %1                #  wp++\n\t" 
                 "add %4, %2                #  sp += row stride\n\t" 
                 "cmp %3, %1                #  wend == wp\n\t" 
                 "jnz .Lasm%=\n\t" 
                 /* end loop */
                 /* Add the bias constant (mm2) to the pixel sum */
                 "paddw %%mm2, %%mm5        # mm5 = A13 + BIAS\n\t" 
                 /* Shift off fractional part */
                 "psraw $5, %%mm5           # mm5 = A8\n\t" 
		 /* Pack 16-bit components into lower 4 bytes. */
                 "packuswb  %%mm5, %%mm5    # Pack 4 low-order bytes.\n\t" 
		 /* Save the word (pixel) in the destination. */
                 "movd %%mm5,(%0)           # dp = word\n" :
  		 /* in/out registers */ 
		 "+r" (dp),
		 "+&r" (wp),
		 "+&r" (sp) :
		 /* input registers */
		 "r" (splPtr->wend), 
		 "r" (bytesPerRow) :
		 /* clobbers */
		 "memory");
#ifdef notdef
	    if (dp->Alpha != 0xFF) {
		fprintf(stdout, "mmx v-alpha=0x%x\n", dp->Alpha);
	    }
#endif
	    dp += destPtr->pixelsPerRow;

	}
    }
    asm volatile ("emms");
    /* Free the memory allocated for filter weights. */
    Blt_Free(samples);
}

/*
 * Blt_ZoomHorizontallyMMX --
 *
 *	Resamples srcPtr into destPtr along the horizontal axis.  The
 *	per-column filter contributions are obtained from
 *	Blt_ComputeWeights(srcPtr->width, destPtr->width, ...), which
 *	returns the size in bytes of one Sample record; destPtr->width
 *	such records are walked for every row y in
 *	[0, srcPtr->height).  For each record the source pixels starting
 *	at column splPtr->start are multiplied by the 16-bit weights
 *	stored every 4 bytes in [splPtr->weights, splPtr->wend),
 *	accumulated per component, biased, shifted down to 8 bits and
 *	stored with unsigned saturation.
 *
 *	The sample array is released with Blt_Free before returning.
 *
 *	The running weight and source pointers are handed to the asm as
 *	early-clobber in/out operands ("+&r") because the asm loop
 *	advances them; GCC does not permit modifying input-only operands.
 *	A "memory" clobber tells the compiler that the asm reads the
 *	weights/pixels and stores the result through dp.
 *
 *	NOTE(review): the weight loop is a do/while -- it assumes every
 *	Sample has at least one weight (weights != wend).
 *
 *	NOTE(review): the setup code builds mm2 with pcmpeqw (all ones)
 *	followed by psubw of mm3, which is zero, so the words stay
 *	0xFFFF; after the shift by 4 the "BIAS" words are 0xFFF0 (-16),
 *	not +16 as the trailing assembler comments say.  Left as is
 *	because changing it alters every resampled pixel -- confirm the
 *	intended rounding before touching it.
 */
void
Blt_ZoomHorizontallyMMX(
    Picture *destPtr,
    Picture *srcPtr, 
    PictureFilter *filterPtr)
{
    Sample *samples, *send;
    int y;
    Pix32 *srcRowPtr, *destRowPtr;
    int bytesPerSample;		/* Size of sample. */

    /* Pre-calculate filter contributions for each column. */
    bytesPerSample = Blt_ComputeWeights(srcPtr->width, destPtr->width, 
	filterPtr, &samples);
    send = (Sample *)((char *)samples + (destPtr->width * bytesPerSample));

    /* Apply filter to each column. */
    srcRowPtr = srcPtr->bits;
    destRowPtr = destPtr->bits;

    asm volatile (
	"pxor %mm6, %mm6	# mm6 = 0\n\t"
	"pxor %mm3, %mm3	# mm3 = 0\n\t"
	"pcmpeqw %mm2, %mm2	# mm2 = -1\n\t"
	"psubw %mm3, %mm2	# mm2 = 1,1,1,1\n\t"
	"psllw $4, %mm2	        # mm2 = BIAS\n");

    for (y = 0; y < srcPtr->height; y++) {
	Pix32 *dp;
	Sample *splPtr;

	dp = destRowPtr;
	for (splPtr = samples; splPtr < send; 
	     splPtr = (Sample *)((char *)splPtr + bytesPerSample)) {
	    Pix32 *sp;		/* Current source pixel; advanced by the
				 * asm one pixel per weight. */
	    const void *wp;	/* Current weight; advanced by the asm
				 * 4 bytes per weight. */

	    sp = srcRowPtr + splPtr->start;
	    wp = splPtr->weights;
	    /*
	     * Operands:
	     *   %0 = dp           destination pixel (not modified)
	     *   %1 = wp           running weight pointer (modified)
	     *   %2 = sp           running source pointer (modified)
	     *   %3 = splPtr->wend end of this sample's weights
	     */
	    asm volatile (
		/* Clear the accumulator mm5. */
                 "pxor %%mm5, %%mm5        #  mm5 = 0\n\n" 
                 ".Lasm%=:\n\t" 
		 /* Load the weighting factor into mm1. */
                 "movd (%1), %%mm1         #  mm1 = W\n\t" 
		 /* Get the source RGBA pixel. */
                 "movd (%2), %%mm0         #  mm0 = sp\n\t" 
		 /* Replicate the low 16 bits of the weight into all
		  * four words of mm1. */
		 "punpcklwd %%mm1, %%mm1   #  mm1 = 0,0,W,W\n\t"
		 "punpcklwd %%mm1, %%mm1   #  mm1 = W,W,W,W\n\t"
		 /* Unpack the pixel into mm0. */
                 "punpcklbw %%mm6, %%mm0   #  mm0 = Sa,Sr,Sg,Sb\n\t" 
		 /* Scale the 8-bit components to 15 bits: (S * 257) >> 1 */
                 "movq %%mm0, %%mm3        #  mm3 = S8\n\t" 
                 "psllw $8, %%mm3          #  mm3 = S8 * 256\n\t" 
                 "paddw %%mm3, %%mm0       #  mm0 = S16\n\t" 
                 "psrlw $1, %%mm0          #  mm0 = S15\n\t" 
		 /* Multiply each pixel component by the weight.  Note
		  * that the lower 16-bits of the product are
		  * truncated (bad) creating round-off error in the
		  * sum. */
                 "pmulhw %%mm1, %%mm0      #  mm0 = S15 * W14\n\t" 
                 /* Add the 16-bit components to mm5. */
                 "paddsw %%mm0, %%mm5      #  mm5 = A13 + mm5\n\t" 
                 /* Move the pointers to the next weight and pixel */
                 "add $4, %1		   #  wp++\n\t" 
                 "add $4, %2               #  sp++\n\t" 
                 "cmp %3, %1               #  wend == wp\n\t" 
                 "jnz .Lasm%=\n\t" 
                 /* end loop */
                 /* Add the bias constant (mm2) to the pixel sum. */
                 "paddw %%mm2, %%mm5       # mm5 = A13 + BIAS\n\t" 
                 /* Shift off fractional portion. */
                 "psraw $5, %%mm5          # mm5 = A8\n\t" 
		 /* Pack 16-bit components into lower 4 bytes. */
                 "packuswb %%mm5, %%mm5    # Pack A8 into low 4 bytes.\n\t" 
		 /* Store the word (pixel) in the destination. */
                 "movd %%mm5,(%0)	   # dp = word\n" :
  		 /* in/out registers */ 
		 "+r" (dp),
		 "+&r" (wp),
		 "+&r" (sp) :
		 /* input registers */
		 "r" (splPtr->wend) :
		 /* clobbers */
		 "memory");
#ifdef notdef
	    if (dp->Alpha != 0xFF) {
		fprintf(stdout, "mmx h-alpha=0x%x\n", dp->Alpha);
	    }
#endif
	    dp++;
	}
	srcRowPtr += srcPtr->pixelsPerRow;
	destRowPtr += destPtr->pixelsPerRow;
    }
    asm volatile ("emms");
    /* Free the memory allocated for horizontal filter weights. */
    Blt_Free(samples);
}


/*
 * Blt_TentVerticallyMMX --
 *
 *	Applies a 3-tap vertical filter to every column of srcPtr and
 *	stores the result in destPtr.  Each output component is
 *
 *	    (above + 2 * center + below) >> 2
 *
 *	computed on 16-bit words (no rounding term is added before the
 *	shift).  At the first row the "above" pixel is the center pixel
 *	itself, and at the last row the "below" pixel is the center pixel
 *	itself.
 *
 *	Loop bounds come from srcPtr (width, height, pixelsPerRow);
 *	destPtr supplies only its bits and pixelsPerRow.
 *
 *	NOTE(review): the code assumes srcPtr->height >= 2.  The first
 *	asm statement unconditionally loads the pixel one row below the
 *	top, and the trailing asm statement stores one row below the
 *	first output; with a single-row picture both accesses fall
 *	outside the rows described by height.
 *
 *	NOTE(review): mm1/mm2 carry the center/previous pixels and mm6 the
 *	zero constant from one asm statement to the next; no register or
 *	"memory" clobbers are declared.
 */
void
Blt_TentVerticallyMMX(
    Picture *destPtr,
    Picture *srcPtr) 
{
    Pix32 *srcColumnPtr, *destColumnPtr;
    int x;
    size_t nPixels;
    
    asm volatile (
	/* Establish constants used below. */
	"pxor %mm6, %mm6	# mm6 = 0\n");

    /* Number of pixel slots spanned by all rows; srcColumnPtr + nPixels
     * is one row past the last pixel of the current column. */
    nPixels = srcPtr->height * srcPtr->pixelsPerRow; 
    srcColumnPtr = srcPtr->bits;
    destColumnPtr = destPtr->bits;
    for (x = 0; x < srcPtr->width; x++) {
	Pix32 *dp, *rp, *rend;

	/* 
	 * mm0 = scratch / filtered result
	 * mm1 = unpacked center pixel 
	 * mm2 = unpacked left pixel  (here: the pixel above)
	 * mm3 = unpacked right pixel (here: the pixel below)
	 * mm4 = 
	 * mm5 = 
	 * mm6 = 0
	 * mm7 = 
	 */
	dp = destColumnPtr;
	rp = srcColumnPtr + srcPtr->pixelsPerRow;
	/* First row: the missing neighbour above is replaced by the
	 * center pixel. */
	asm volatile (
	    "movd (%2), %%mm1         # mm1 = cp\n\t"
	    "movd (%1), %%mm3         # mm3 = rp\n\t"
	    "punpcklbw %%mm6, %%mm1   # mm1 = S8\n\t"
	    "movq %%mm1, %%mm2        # mm2 = lp = S8\n\t"
	    "punpcklbw %%mm6, %%mm3   # mm3 = S8\n\t"
	    "movq  %%mm1, %%mm0       # mm0 = cp\n\t"
	    "psllw $1, %%mm0          # mm0 = cp << 1\n\t"
	    "paddw %%mm2, %%mm0       # mm0 = lp + (cp << 1)\n\t"
	    "paddw %%mm3, %%mm0       # mm0 = lp + (cp << 1) + rp\n\t"
	    "psraw $2, %%mm0          # mm0 = (lp + (cp << 1) + rp) >> 2\n\t"
	    "packuswb %%mm0, %%mm0    # Pack into low 4 bytes.\n\t"  
	    "movd %%mm0,(%0)	      # dp = word\n\t"
	    "movq %%mm3, %%mm1	      # cp = rp\n" :
	    /* output registers */ 
	    "+r" (dp), "+r" (rp) :
	    /* input registers */
	    "r" (srcColumnPtr));
	dp += destPtr->pixelsPerRow;
	rp += srcPtr->pixelsPerRow;

	/* Interior rows: slide the three-pixel window down one row per
	 * iteration. */
	for (rend = srcColumnPtr + nPixels; rp < rend; /*empty*/) {
	    asm volatile (
	        "movd (%1), %%mm3         #  mm3 = rp\n\t" 
	        "punpcklbw %%mm6, %%mm3   #  mm3 = S8\n\t"
		"movq  %%mm1, %%mm0       #  mm0 = cp\n\t"
		"psllw $1, %%mm0          #  mm0 = cp << 1\n\t"
		"paddw %%mm2, %%mm0       #  mm0 = lp + (cp << 1)\n\t"
		"paddw %%mm3, %%mm0       #  mm0 = lp + (cp << 1) + rp\n\t"
		"psraw $2, %%mm0          #  mm0 = (lp + (cp<<1) + rp) >> 2\n\t"
		"packuswb %%mm0, %%mm0    #  Pack into low 4 bytes.\n\t"  
		"movd %%mm0,(%0)	  #  dp = word\n\t" 
  	        "movq %%mm1, %%mm2        #  lp = cp\n\t"
  	        "movq %%mm3, %%mm1        #  cp = rp\n" :
		/* output registers */ 
		"+r" (dp), "+r" (rp));
	    dp += destPtr->pixelsPerRow;
	    rp += srcPtr->pixelsPerRow;
	}	
	/* Last row: the missing neighbour below is replaced by the
	 * center pixel. */
	asm volatile (
	    "movq %%mm1, %%mm3        #  rp = cp\n\t"
	    "movq %%mm1, %%mm0        #  mm0 = cp\n\t"
	    "psllw $1, %%mm0          #  mm0 = cp << 1\n\t"
	    "paddw %%mm2, %%mm0       #  mm0 = lp + (cp << 1)\n\t"
	    "paddw %%mm3, %%mm0       #  mm0 = lp + (cp << 1) + rp\n\t"
	    "psraw $2, %%mm0          #  mm0 = (lp + (cp << 1) + rp) >> 2\n\t"
	    "packuswb %%mm0, %%mm0    #  Pack into low 4 bytes.\n\t"  
	    "movd %%mm0,(%0)	      #  dp = word\n" : 
	    /* output registers */ 
	    "+r" (dp));

	srcColumnPtr++, destColumnPtr++;
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}

/*
 * Blt_TentHorizontallyMMX --
 *
 *	Applies a 3-tap horizontal filter to every row of srcPtr and
 *	stores the result in destPtr.  Each output component is
 *
 *	    (left + 2 * center + right) >> 2
 *
 *	computed on 16-bit words (no rounding term is added before the
 *	shift).  At the first column the "left" pixel is the center pixel
 *	itself, and at the last column the "right" pixel is the center
 *	pixel itself.
 *
 *	Loop bounds come from srcPtr (width, height); destPtr supplies
 *	only its bits and pixelsPerRow.
 *
 *	NOTE(review): the code assumes srcPtr->width >= 2.  The first asm
 *	statement unconditionally loads the second pixel of the row, and
 *	the trailing asm statement stores to the second output pixel;
 *	with a one-pixel-wide picture both accesses fall outside the
 *	columns described by width.
 *
 *	NOTE(review): mm1/mm2 carry the center/left pixels and mm6 the
 *	zero constant from one asm statement to the next; no register or
 *	"memory" clobbers are declared.
 */
void
Blt_TentHorizontallyMMX(
    Picture *destPtr,
    Picture *srcPtr) 
{
    Pix32 *srcRowPtr, *destRowPtr;
    int y;

    asm volatile (
	/* Establish constants used below. */
	"pxor %mm6, %mm6	# mm6 = 0\n");

    srcRowPtr = srcPtr->bits;
    destRowPtr = destPtr->bits;
    for (y = 0; y < srcPtr->height; y++) {
	Pix32 *dp;
	Pix32 *rp, *rend;
	
	/* 
	 * mm0 = scratch / filtered result
	 * mm1 = unpacked center pixel 
	 * mm2 = unpacked left pixel  
	 * mm3 = unpacked right pixel 
	 * mm4 = 
	 * mm5 = 
	 * mm6 = 0
	 * mm7 = 
	 */
	dp = destRowPtr;
	rp = srcRowPtr + 1;
	/* First column: the missing left neighbour is replaced by the
	 * center pixel. */
	asm volatile (
	    "movd (%2), %%mm1         #  mm1 = cp\n\t"
	    "movq %%mm1, %%mm2        #  mm2 = lp\n\t"
	    "movd (%1), %%mm3         #  mm3 = rp\n\t"
	    "punpcklbw %%mm6, %%mm1   #  mm1 = S8\n\t"
	    "punpcklbw %%mm6, %%mm2   #  mm2 = S8\n\t"
	    "punpcklbw %%mm6, %%mm3   #  mm3 = S8\n\t"
	    "movq  %%mm1, %%mm0       #  mm0 = cp\n\t"
	    "psllw $1, %%mm0          #  mm0 = cp << 1\n\t"
	    "paddw %%mm2, %%mm0       #  mm0 = lp + (cp << 1)\n\t"
	    "paddw %%mm3, %%mm0       #  mm0 = lp + (cp << 1) + rp\n\t"
	    "psraw $2, %%mm0          #  mm0 = (lp + (cp << 1) + rp) >> 2\n\t"
	    "packuswb %%mm0, %%mm0    #  Pack into low 4 bytes.\n\t"  
	    "movd %%mm0,(%0)	      #  dp = word\n\t"
	    "movq %%mm3, %%mm1	      #  cp = rp\n" :
	    /* output registers */ 
	    "+r" (dp), "+r" (rp) :
	    /* input registers */
	    "r" (srcRowPtr));
	dp++, rp++;

	/* Interior columns: slide the three-pixel window right one pixel
	 * per iteration. */
	for (rend = srcRowPtr + srcPtr->width; rp < rend; /*empty*/) {
	    asm volatile (
		"movd (%1), %%mm3         # mm3 = rp\n\t" 
		"punpcklbw %%mm6, %%mm3   # mm3 = S8\n\t"
		"movq  %%mm1, %%mm0       #  mm0 = cp\n\t"
		"psllw $1, %%mm0          #  mm0 = cp << 1\n\t"
		"paddw %%mm2, %%mm0       #  mm0 = lp + (cp << 1)\n\t"
		"paddw %%mm3, %%mm0       #  mm0 = lp + (cp << 1) + rp\n\t"
		"psraw $2, %%mm0          #  mm0 = (lp + (cp<<1) + rp) >> 2\n\t"
		"packuswb %%mm0, %%mm0    #  Pack into low 4 bytes.\n\t"  
		"movd %%mm0,(%0)	  #  dp = word\n\t" 
  	        "movq %%mm1, %%mm2        #  lp = cp\n\t"
  	        "movq %%mm3, %%mm1        #  cp = rp\n" :
		/* output registers */ 
		"+r" (dp), "+r" (rp));
	    dp++, rp++;
	}

	/* Last column: the missing right neighbour is replaced by the
	 * center pixel. */
	asm volatile (
	    "movq %%mm1, %%mm3        #  rp = cp\n\t"
	    "movq %%mm1, %%mm0        #  mm0 = cp\n\t"
	    "psllw $1, %%mm0          #  mm0 = cp << 1\n\t"
	    "paddw %%mm2, %%mm0       #  mm0 = lp + (cp << 1)\n\t"
	    "paddw %%mm3, %%mm0       #  mm0 = lp + (cp << 1) + rp\n\t"
	    "psraw $2, %%mm0          #  mm0 = (lp + (cp << 1) + rp) >> 2\n\t"
	    "packuswb %%mm0, %%mm0    #  Pack into low 4 bytes.\n\t"  
	    "movd %%mm0,(%0)	      #  dp = word\n" : 
	    /* output registers */ 
	    "+r" (dp));

	srcRowPtr += srcPtr->pixelsPerRow;
	destRowPtr += destPtr->pixelsPerRow;
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}

/*
 * Blt_BlendPictureAreaMMX --
 *
 *	Composites a fgWidth x fgHeight region of fgPtr, whose origin is
 *	(fgX, fgY), over the region of bgPtr whose origin is (bgX, bgY).
 *	The background is modified in place.  Per pixel:
 *
 *	    fg alpha == 0xFF   background = foreground (plain copy)
 *	    fg alpha == 0x00   background left unchanged
 *	    otherwise          background = fg + (bg * beta) / 255
 *
 *	where beta = 255 - fg alpha, the division is rounded to nearest,
 *	and the sum is packed with unsigned saturation.  All four
 *	components, alpha included, are treated the same way.
 *	NOTE(review): the foreground is added unscaled, so it is
 *	presumably expected to hold alpha-premultiplied colors -- confirm
 *	with the callers.
 *
 *	No clipping is done here; the caller must pass regions that lie
 *	inside both pictures.
 *
 *	The rounding constant is built as 1 << 7 = 128 per word
 *	(pcmpeqw gives 0xFFFF, psrlw by 15 gives 1).  Subtracting the
 *	zero register from 0xFFFF instead leaves 0xFFFF, which after the
 *	shift is -128 and makes the divide-by-255 come out one too low
 *	for most inputs.
 *
 *	The per-pixel asm carries a "memory" clobber because it loads
 *	from sp/dp and stores through dp, none of which is visible to the
 *	compiler through the register operands alone.
 */
void
Blt_BlendPictureAreaMMX(
    Picture *bgPtr,		/* (in/out) Background picture. Composite
				 * overwrites region in background. */
    Picture *fgPtr,		/* Foreground picture. */
    int fgX, int fgY,		/* Origin of foreground region in source. */
    int fgWidth, int fgHeight,	/* Dimension of region to be blended. */
    int bgX, int bgY)		/* Origin of background region in
				 * destination. */
{
    Pix32 *srcRowPtr, *destRowPtr;
    int y;

    destRowPtr = bgPtr->bits + ((bgY * bgPtr->pixelsPerRow) + bgX);
    srcRowPtr = fgPtr->bits + ((fgY * fgPtr->pixelsPerRow) + fgX);

    asm volatile (
        /* Generate constants needed below. */
	"pxor %mm6, %mm6	# mm6 = 0\n\t"
	"pcmpeqw %mm5, %mm5	# mm5 = -1 \n\t"
	"psrlw $15, %mm5	# mm5 = 1,1,1,1\n\t"
	"psllw $7, %mm5	        # mm5 = ROUND = 128\n");

    for (y = 0; y < fgHeight; y++) {
	Pix32 *sp, *dp;
	int x;

	sp = srcRowPtr, dp = destRowPtr;
	for (x = 0; x < fgWidth; x++) {

	    /* Blend the foreground and background together. */
	    if (sp->Alpha == 0xFF) {
		*dp = *sp;
	    } else if (sp->Alpha != 0x00) {
		unsigned long beta;
		
		beta = sp->Alpha ^ 0xFF; /* beta = 1 - alpha */

		/*
		 * Small wins:  
		 *
		 * We can compute 
		 *      dest = (fg * alpha) + (beta * bg);
		 * for all RGBA components at once. 
		 *
		 * Packing unsigned with saturation performs the
		 * necessary clamping without the branch misprediction
		 * penalty.
		 *
		 * FIXME: 
		 *     Check if it's faster to do the blend calculation
		 *     all the time (even when alpha is 0 or
		 *     255). There's a good probability that the
		 *     majority of pixels are opaque (interior) or
		 *     completely transparent (exterior).  Only the
		 *     edge pixels would require blending.
		 */
  	        asm volatile (
		    /* 
		     * mm0 = dp
		     * mm1 = sp
		     * mm2 = beta = 1 - alpha
		     * mm3 = temp
		     * mm4 = 
		     * mm5 = ROUND = 128,128,128,128
		     * mm6 = 0
		     * mm7 = 
		     *
		     * With t = D*B + 128, (t + (t >> 8)) >> 8 is the
		     * product divided by 255, rounded to nearest.
		     */
		    "movd (%0), %%mm0         #  mm0 = dp\n\t" 
		    "movd (%1), %%mm1         #  mm1 = sp\n\t" 
 		    "movd %2, %%mm2           #  mm2 = beta\n\t" 
		    "punpcklbw %%mm6, %%mm0   #  mm0 = Da,Dr,Dg,Db\n\t" 
		    "punpcklbw %%mm6, %%mm1   #  mm1 = Sa,Sr,Sg,Sb\n\t" 
 		    "punpcklwd %%mm2, %%mm2   #  mm2 = 0,0,B,B\n\t"
		    "punpcklwd %%mm2, %%mm2   #  mm2 = B,B,B,B\n\t"
		    "pmullw %%mm0, %%mm2      #  mm2 = D*B\n\t" 
		    "paddw %%mm5, %%mm2       #  mm2 = (D*B)+ROUND\n\t"
		    "movq %%mm2, %%mm3	      #  mm3 = P16\n\t" 
		    "psrlw $8, %%mm3          #  mm3 = P16 / 256\n\t"
		    "paddw %%mm2, %%mm3       #  mm3 = (P16 / 256) + P16\n\t" 
		    "psrlw $8, %%mm3          #  mm3 = P8 ~= P16 / 257\n\t"
		    "paddw %%mm1, %%mm3       #  mm3 = S + P\n\t"
		    "packuswb %%mm3, %%mm3    #  Pack 4 low bytes.\n\t" 
		    "movd %%mm3, (%0)         #  *dp = word\n" :
		    /* output registers */
		    "+r" (dp) :
		    /* input registers */
		    "r" (sp), 
		    "r" (beta) :
		    /* clobbers */
		    "memory");
	    }
	    sp++, dp++;
	}
	srcRowPtr += fgPtr->pixelsPerRow;
	destRowPtr += bgPtr->pixelsPerRow;
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}

/*
 * Blt_FadePictureMMX --
 *
 *	Combines a fgWidth x fgHeight region of fgPtr, whose origin is
 *	(fgX, fgY), with the region of bgPtr whose origin is (bgX, bgY),
 *	using one alpha value for the whole region.  The background is
 *	modified in place:
 *
 *	    alpha == 0xFF   background = foreground (plain copy)
 *	    alpha == 0x00   background left unchanged
 *	    otherwise       background = fg + (bg * beta) / 255
 *
 *	where beta = alpha ^ 0xFF (255 - alpha for alpha in 0..255), the
 *	division is rounded to nearest, and the sum is packed with
 *	unsigned saturation.  All four components are treated the same
 *	way.  NOTE(review): the foreground is added unscaled, so it is
 *	presumably expected to be pre-scaled by alpha -- confirm with the
 *	callers.
 *
 *	No clipping is done here; the caller must pass regions that lie
 *	inside both pictures.
 *
 *	The foreground operand of the per-pixel asm must be sp.  Passing
 *	dp there makes the routine add the background to its own scaled
 *	copy and ignore the foreground entirely.
 *
 *	The rounding constant is built as 1 << 7 = 128 per word
 *	(pcmpeqw gives 0xFFFF, psrlw by 15 gives 1).  Subtracting the
 *	zero register from 0xFFFF instead leaves 0xFFFF, which after the
 *	shift is -128 and makes the divide-by-255 come out one too low
 *	for most inputs.
 *
 *	The per-pixel asm carries a "memory" clobber because it loads
 *	from sp/dp and stores through dp.
 */
void
Blt_FadePictureMMX(
    Picture *bgPtr, 
    Picture *fgPtr, 
    int fgX, int fgY,
    int fgWidth, int fgHeight,
    int bgX, int bgY,
    int alpha)
{
    Pix32 *srcRowPtr, *destRowPtr;
    int beta;

    beta = alpha ^ 0xFF;
    destRowPtr = bgPtr->bits + ((bgY * bgPtr->pixelsPerRow) + bgX);
    srcRowPtr = fgPtr->bits + ((fgY * fgPtr->pixelsPerRow) + fgX);
    if (alpha == 0xFF) {
	int y;

	/* Fully opaque: straight copy, no arithmetic needed. */
	for (y = 0; y < fgHeight; y++) {
	    Pix32 *sp, *dp;
	    int x;
	
	    sp = srcRowPtr, dp = destRowPtr;
	    for (x = 0; x < fgWidth; x++) {
		*dp++ = *sp++;
	    }
	    srcRowPtr += fgPtr->pixelsPerRow;
	    destRowPtr += bgPtr->pixelsPerRow;
	}
    } else if (alpha != 0x00) {
	int y;

	asm volatile (
	    /* Generate constants needed below. */
	    "pxor %mm6, %mm6	# mm6 = 0\n\t"
	    "pcmpeqw %mm5, %mm5	# mm5 = -1 \n\t"
	    "psrlw $15, %mm5	# mm5 = 1,1,1,1\n\t"
	    "psllw $7, %mm5	        # mm5 = ROUND = 128\n");

	for (y = 0; y < fgHeight; y++) {
	    Pix32 *sp, *dp;
	    int x;
	    
	    sp = srcRowPtr, dp = destRowPtr;
	    for (x = 0; x < fgWidth; x++) {
  	        asm volatile (
		    /* 
		     * mm0 = dp
		     * mm1 = sp
		     * mm2 = beta = 1 - alpha
		     * mm3 = temp
		     * mm4 = 
		     * mm5 = ROUND = 128,128,128,128
		     * mm6 = 0
		     * mm7 = 
		     *
		     * With t = D*beta + 128, (t + (t >> 8)) >> 8 is the
		     * product divided by 255, rounded to nearest.
		     */
		    "movd (%0), %%mm0         #  mm0 = dp\n\t" 
		    "movd (%1), %%mm1         #  mm1 = sp\n\t" 
 		    "movd %2, %%mm2           #  mm2 = beta\n\t" 
		    "punpcklbw %%mm6, %%mm0   #  mm0 = Da,Dr,Dg,Db\n\t" 
		    "punpcklbw %%mm6, %%mm1   #  mm1 = Sa,Sr,Sg,Sb\n\t" 
 		    "punpcklwd %%mm2, %%mm2   #  mm2 = 0,0,beta,beta\n\t"
		    "punpcklwd %%mm2, %%mm2   #  mm2 = beta,beta,beta,beta\n\t"
		    "pmullw %%mm0, %%mm2      #  mm2 = prod = D*beta\n\t" 
		    "paddw %%mm5, %%mm2       #  mm2 = t = (D*beta)+ROUND\n\t"
		    "movq %%mm2, %%mm3	      #  mm3 = t\n\t" 
		    "psrlw $8, %%mm3          #  mm3 = t >> 8\n\t"
		    "paddw %%mm2, %%mm3       #  mm3 = t + (t>>8)\n\t" 
		    "psrlw $8, %%mm3          #  mm3 = ((t+(t>>8)) >> 8)\n\t"
		    "paddw %%mm1, %%mm3       #  mm3 = S + ((t+(t>>8))>>8)\n\t"
		    "packuswb %%mm3, %%mm3    #  Pack 4 low bytes.\n\t" 
		    "movd %%mm3,(%0)          #  *dp = word\n" :
		    /* output registers */
		    "+r" (dp) :
		    /* input registers */
		    "r" (sp), 
		    "r" (beta) :
		    /* clobbers */
		    "memory");
		sp++, dp++;
	    }
	    srcRowPtr += fgPtr->pixelsPerRow;
	    destRowPtr += bgPtr->pixelsPerRow;
	}
    }
    /* Leave MMX state so that x87 floating point can be used again. */
    asm volatile ("emms");
}

/*
 * Single-bit masks for CPU feature flags, grouped by vendor.
 * NOTE(review): the bit positions presumably match the feature words
 * returned by CPUID for each vendor -- confirm against the vendor
 * documentation before relying on them.
 *
 * The masks are unsigned: bit 31 does not fit in a signed int, and
 * shifting a signed 1 into the sign bit is undefined behavior.
 *
 * NOTE(review): "CPU_FEATURE_AND_3DNOW" looks like a misspelling of
 * "AMD"; the name is kept so existing users keep compiling.
 */
#define CPU_FEATURE_AMD_MMXEXT     (1U << 22)
#define CPU_FEATURE_AND_3DNOW      (1U << 31)
#define CPU_FEATURE_CENTAUR_3DNOW  (1U << 31)
#define CPU_FEATURE_CENTAUR_MMX    (1U << 23)
#define CPU_FEATURE_CENTAUR_MMXEXT (1U << 24)
#define CPU_FEATURE_CYRIX_MMX      (1U << 23)
#define CPU_FEATURE_CYRIX_MMXEXT   (1U << 24)
#define CPU_FEATURE_INTEL_MMX      (1U << 23)
#define CPU_FEATURE_INTEL_XMM      (1U << 25)
#define CPU_FEATURE_INTEL_XMM2     (1U << 26)

/*
 * Blt_CheckForMMX --
 *
 *	Probes the processor with the CPUID instruction to find out which
 *	multimedia instruction sets are available, and records the answer
 *	in the global bltUseMMX.
 *
 * Results:
 *	1 if MMX instructions are supported,
 *	3 if Cyrix MMX and Extended MMX instructions are supported,
 *	5 if AMD MMX and 3DNow! instructions are supported,
 *	0 if the hardware supports none of these (or has no CPUID).
 *
 * Side effects:
 *	bltUseMMX is set to the returned value.
 */
int
Blt_CheckForMMX(void)
{
    int result;

    result = 0;			
    asm volatile (
#if (SIZEOF_VOID_P == 4)
	/* cpuid overwrites ebx, which is not listed as a clobber on 32-bit
	 * builds, so it has to be preserved by hand.  Park it in esi
	 * instead of pushing it: the result operand %0 is a memory
	 * operand, and if the compiler addresses it relative to the stack
	 * pointer a pending push would make every store to %0 land one
	 * slot off. */
	"movl %%ebx, %%esi\n\t"
#endif
	/* See if ID instruction is supported. Save a copy of
	 * EFLAGS in eax and ecx */
	"pushf\n\t"
#if (SIZEOF_VOID_P == 8)
	"pop %%rax\n\t" 
#else
	"pop %%eax\n\t" 
#endif
	"mov %%eax, %%ecx\n\t"
	/* Toggle the CPUID bit in one copy and store to the EFLAGS
	 * reg */
	"xorl $0x200000, %%eax\n\t"
#if (SIZEOF_VOID_P == 8)
	"push %%rax\n\t"
#else
	"push %%eax\n\t"
#endif
	"popf\n\t"
	/* Get the (hopefully modified) EFLAGS */
	"pushf\n\t"
#if (SIZEOF_VOID_P == 8)
	"pop %%rax\n\t"
#else
	"pop %%eax\n\t"
#endif
	/* Compare the result with the previous. */
	"xor %%eax, %%ecx\n\t"
	"testl $0x200000, %%ecx\n\t"
	"jz .LNotSupported1%=\n\t"		/* CPUID not supported */

	/* Get standard CPUID information, and go to a specific vendor
	 * section */
	"movl $0, %%eax\n\t"
	"cpuid\n\t"

	/* Check for "GenuineIntel" */
	"cmpl $0x756e6547, %%ebx\n\t"
	"jne .LCheckAMD%=\n\t"
	"cmpl $0x49656e69, %%edx\n\t"
	"jne .LCheckAMD%=\n\t"
	"cmpl $0x6c65746e, %%ecx\n\t"
	"jne .LCheckAMD%=\n\t"
	"jmp .LFoundIntel%=\n\n"
	
	/* Check for "AuthenticAMD" */
	".LCheckAMD%=:\n\t"
	"cmpl $0x68747541, %%ebx\n\t"
	"jne .LCheckCyrix%=\n\t"
	"cmpl $0x69746e65, %%edx\n\t"
	"jne .LCheckCyrix%=\n\t"
	"cmpl $0x444d4163, %%ecx\n\t"
	"jne .LCheckCyrix%=\n\t"
	"jmp .LFoundAMD%=\n\n"
	
	/* Check for Cyrix ("CyrixInstead") */
	".LCheckCyrix%=:\n\t"
	"cmpl $0x69727943, %%ebx\n\t"
	"jne .LNotSupported2%=\n\t"
	"cmpl $0x736e4978, %%edx\n\t"
	"jne .LNotSupported3%=\n\t"
	"cmpl $0x64616574, %%ecx\n\t"
	"jne .LNotSupported4%=\n\t"

	/* Drop through to Cyrix... */
	
	/* Cyrix Section */
	/* See if extended CPUID level 80000001 is supported */
	/* The value of CPUID/80000001 for the 6x86MX is undefined
	   according to the Cyrix CPU Detection Guide (Preliminary
	   Rev. 1.01 table 1), so we'll check the value of eax for
	   CPUID/0 to see if standard CPUID level 2 is supported.
	   According to the table, the only CPU which supports level
	   2 is also the only one which supports extended CPUID levels.
	*/
	"cmpl $0x2, %%eax\n\t"
	"jne .LCheckMMX%=\n\t"	/* Use standard CPUID instead */
	
	/* Extended CPUID supported (in theory), so get extended
	   features.  The feature flags are returned in edx (eax holds
	   the processor signature), so edx is the register to test. */
	"movl $0x80000001, %%eax\n\t"
	"cpuid\n\t"
	"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
	"jz .LNotSupported5%=\n\t"      /* MMX not supported */
	"testl $0x01000000, %%edx\n\t"	/* Test for Ext'd MMX */
	"jnz .LEMMXSupported%=\n\t"
	"movl $1, %0\n\n\t"		/* MMX Supported */
	"jmp .LDone%=\n\n"
	".LEMMXSupported%=:\n\t"
	"movl $3, %0\n\n\t"		/* EMMX and MMX Supported */
	"jmp .LDone%=\n\t"
	
	
	/* AMD Section */
	".LFoundAMD%=:\n\t"
	
	/* See if extended CPUID is supported.  eax comes back holding the
	 * highest extended leaf; leaf 0x80000001 exists only if that value
	 * is above 0x80000000.  The comparison must be unsigned: a signed
	 * test treats every value below 0x80000000 as "greater" and would
	 * wrongly take the extended path. */
	"movl $0x80000000, %%eax\n\t"
	"cpuid\n\t"
	"cmpl $0x80000000, %%eax\n\t"
	"jbe .LCheckMMX%=\n\t"	/* Use standard CPUID instead */
	
	/* Extended CPUID supported, so get extended features */
	"movl $0x80000001, %%eax\n\t"
	"cpuid\n\t"
	"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
	"jz .LNotSupported6%=\n\t"      /* MMX not supported */
	"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
	"jnz .L3DNowSupported%=\n\t"
	"movl $1, %0\n\n\t"		/* MMX Supported */
	"jmp .LDone%=\n\n"
	".L3DNowSupported%=:\n\t"
	"movl $5, %0\n\n\t"		/* 3DNow! and MMX Supported */
	"jmp .LDone%=\n\t"
	
	
	/* Intel Section */
	".LFoundIntel%=:\n\t"
	
	/* Check for MMX using the standard feature leaf. */
	".LCheckMMX%=:\n\t"
	"movl $1, %%eax\n\t"
	"cpuid\n\t"
	"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
	"jz .LNotSupported7%=\n\t" /* MMX Not supported */
	"movl $1, %0\n\n\t"	/* MMX Supported */
	"jmp .LDone%=\n\t"
	
	/* Nothing supported.  Every failure path has its own label (handy
	 * when debugging); they all fall through to the same store of 0. */
	"\n.LNotSupported1%=:\n\t"
	"\n.LNotSupported2%=:\n\t"
	"\n.LNotSupported3%=:\n\t"
	"\n.LNotSupported4%=:\n\t"
	"\n.LNotSupported5%=:\n\t"
	"\n.LNotSupported6%=:\n\t"
	"\n.LNotSupported7%=:\n\t"
	"movl $0, %0\n\n\t"
	".LDone%=:\n\t"
#if (SIZEOF_VOID_P == 4)
	"movl %%esi, %%ebx\n\t"		/* Restore the saved ebx. */
#endif
	: "=m" (result)
	: /* no input */
#if (SIZEOF_VOID_P == 8)
	: "eax", "ebx", "ecx", "edx", "cc"
#else
	: "eax", "ecx", "edx", "esi", "cc"
#endif
	);
    bltUseMMX = result;
#ifdef notdef
    fprintf(stderr, "MMX=%d\n", bltUseMMX);
#endif
    return result;
}

/*
 * Blt_ConvolvePictureVerticallyMMX --
 *
 *	Resamples srcPtr along the vertical axis into destPtr using the
 *	weights Blt_ComputeWeights produces for filterPtr (source height
 *	to destination height).  Each destination pixel is the weighted
 *	sum of a run of source pixels in the same column, computed with
 *	MMX on all four 8-bit components at once.
 *
 * Parameters:
 *	destPtr		Picture receiving the result.
 *	srcPtr		Picture being sampled; columns 0..width-1 are
 *			processed.
 *	filterPtr	Filter handed to Blt_ComputeWeights.
 *
 * Side effects:
 *	Pixels of destPtr are overwritten.  The sample table is released
 *	with Blt_Free before returning.  MMX state is cleared with emms.
 */
void
Blt_ConvolvePictureVerticallyMMX(
    Picture *destPtr,
    Picture *srcPtr, 
    PictureFilter *filterPtr)
{
    Sample *samples, *send;
    int x;
    int bytesPerSample;		/* Size of sample. */
    long bytesPerRow;

    /* Pre-calculate filter contributions for each row. */
    bytesPerSample = Blt_ComputeWeights(srcPtr->height, destPtr->height, 
	filterPtr, &samples);
    bytesPerRow = sizeof(Pix32) * srcPtr->pixelsPerRow;
    /* One variable-sized sample record per destination row. */
    send = (Sample *)((char *)samples + (destPtr->height * bytesPerSample));

    /* Set up mm6 (zero) and mm2 (rounding bias).  They are relied upon
     * by the per-pixel asm statement below, i.e. they must survive
     * between asm statements.
     *
     * NOTE(review): psubw here computes mm2 - mm6 = (-1) - 0, so each
     * word of mm2 ends up as -16 (0xFFF0) after the shift, not the +16
     * the embedded comments describe.  Left unchanged because fixing it
     * alters pixel output -- confirm the intended rounding first. */
    asm volatile (
        /* Generate constants needed below. */
	"pxor %mm6, %mm6	# mm6 = 0\n\t"
	"pcmpeqw %mm2, %mm2	# mm2 = -1 \n\t"
	"psubw %mm6, %mm2	# mm2 = 1,1,1,1\n\t"
	"psllw $4, %mm2	        # mm2 = BIAS\n");

    /* Apply filter to each column of the source. */
    for (x = 0; x < srcPtr->width; x++) {
	Pix32 *dp, *srcColumnPtr;
	Sample *splPtr;

	srcColumnPtr = srcPtr->bits + x;
	dp = destPtr->bits + x;
	for (splPtr = samples; splPtr < send; 
	     splPtr = (Sample *)((char *)splPtr + bytesPerSample)) {
	    Pix32 *sp;
	    const void *wp, *wend;

	    /* First source pixel contributing to this destination row. */
	    sp = srcColumnPtr + (splPtr->start * srcPtr->pixelsPerRow);
	    /* The asm loop advances the weight and pixel pointers, so it
	     * gets private copies bound as read-write operands.  GCC
	     * requires that input-only operands are left unmodified. */
	    wp = splPtr->weights;
	    wend = splPtr->wend;
	    asm volatile (
		/* Clear the accumulator mm5. */
                 "pxor %%mm5, %%mm5	    #  mm5 = 0\n\n" 
                 ".Lasm%=:\n\t" 
		 /* Load the weighting factor into mm1. */
		 "movd (%[wp]), %%mm1	    #  mm1 = 0,0,0,W\n\t"
		 /* Load the source pixel into mm0. */
                 "movd (%[sp]), %%mm0       #  mm0 = S\n\t" 
		 /* Replicate the low 16 bits of the weight into all four
		  * words of mm1. */
		 "punpcklwd %%mm1, %%mm1    #  mm1 = 0,0,W,W\n\t"
		 "punpcklwd %%mm1, %%mm1    #  mm1 = W,W,W,W\n\t"
		 /* Unpack the pixel components into 16-bit words.*/
                 "punpcklbw %%mm6, %%mm0    #  mm0 = Sa,Sb,Sg,Sr\n\t" 
		 /* Scale the 8-bit components to 15 bits: (S * 257) >> 1 */
                 "movq %%mm0, %%mm3         #  mm3 = S8\n\t" 
                 "psllw $8, %%mm3           #  mm3 = S8 * 256\n\t" 
                 "paddw %%mm3, %%mm0        #  mm0 = S16\n\t" 
                 "psrlw $1, %%mm0           #  mm0 = S15\n\t" 
		 /* Multiply each pixel component by the weight.  Note
		  * that the lower 16-bits of the product are
		  * truncated (bad) creating round-off error in the
		  * sum. */
                 "pmulhw %%mm1, %%mm0       #  mm0 = S15 * W14\n\t" 
                 /* Accumulate upper 16-bit results of product in mm5. */
                 "paddsw %%mm0, %%mm5       #  mm5 = prod + mm5\n\t" 
                 /* Move the pointers to the next weight and the pixel
		  * one row down. */
                 "add $4, %[wp]             #  wp++\n\t" 
                 "add %[stride], %[sp]      #  sp += row\n\t" 
                 "cmp %[wend], %[wp]        #  wend == wp\n\t" 
                 "jnz .Lasm%=\n\t" 
                 /* end loop */
                 /* Add the bias held in mm2 to the pixel sum */
                 "paddw %%mm2, %%mm5        # mm5 = A13 + BIAS\n\t" 
                 /* Shift off fractional part */
                 "psraw $5, %%mm5           # mm5 = A8\n\t" 
		 /* Pack 16-bit components into lower 4 bytes. */
                 "packuswb  %%mm5, %%mm5    # Pack 4 low-order bytes.\n\t" 
		 /* Save the word (pixel) in the destination. */
                 "movd %%mm5,(%[dp])        # dp = word\n" :
  		 /* read-write registers (advanced by the loop) */ 
		 [wp] "+r" (wp), 
		 [sp] "+r" (sp) :
		 /* input registers */
		 [dp] "r" (dp), 
		 [wend] "r" (wend), 
		 [stride] "r" (bytesPerRow) :
		 /* The statement reads and writes memory through its
		  * pointer operands and changes the flags. */
		 "memory", "cc");
#ifdef notdef
	    if (dp->Alpha != 0xFF) {
		fprintf(stdout, "mmx v-alpha=0x%x\n", dp->Alpha);
	    }
#endif
	    dp += destPtr->pixelsPerRow;

	}
    }
    asm volatile ("emms");
    /* Free the memory allocated for filter weights. */
    Blt_Free(samples);
}

/*
 * Blt_ConvolvePictureHorizontallyMMX --
 *
 *	Resamples srcPtr along the horizontal axis into destPtr using the
 *	weights Blt_ComputeWeights produces for filterPtr (source width to
 *	destination width).  Each destination pixel is the weighted sum of
 *	a run of adjacent source pixels in the same row, computed with MMX
 *	on all four 8-bit components at once.
 *
 * Parameters:
 *	destPtr		Picture receiving the result.
 *	srcPtr		Picture being sampled; rows 0..height-1 are
 *			processed.
 *	filterPtr	Filter handed to Blt_ComputeWeights.
 *
 * Side effects:
 *	Pixels of destPtr are overwritten.  The sample table is released
 *	with Blt_Free before returning.  MMX state is cleared with emms.
 */
void
Blt_ConvolvePictureHorizontallyMMX(
    Picture *destPtr,
    Picture *srcPtr, 
    PictureFilter *filterPtr)
{
    Sample *samples, *send;
    int y;
    Pix32 *srcRowPtr, *destRowPtr;
    int bytesPerSample;		/* Size of sample. */

    /* Pre-calculate filter contributions for each column. */
    bytesPerSample = Blt_ComputeWeights(srcPtr->width, destPtr->width, 
	filterPtr, &samples);
    /* One variable-sized sample record per destination column. */
    send = (Sample *)((char *)samples + (destPtr->width * bytesPerSample));

    /* Apply filter to each column. */
    srcRowPtr = srcPtr->bits;
    destRowPtr = destPtr->bits;

    /* Set up mm6 (zero) and mm2 (rounding bias).  They are relied upon
     * by the per-pixel asm statement below, i.e. they must survive
     * between asm statements.
     *
     * NOTE(review): psubw here computes mm2 - mm3 = (-1) - 0, so each
     * word of mm2 ends up as -16 (0xFFF0) after the shift, not the +16
     * the embedded comments describe.  Left unchanged because fixing it
     * alters pixel output -- confirm the intended rounding first. */
    asm volatile (
	"pxor %mm6, %mm6	# mm6 = 0\n\t"
	"pxor %mm3, %mm3	# mm3 = 0\n\t"
	"pcmpeqw %mm2, %mm2	# mm2 = -1\n\t"
	"psubw %mm3, %mm2	# mm2 = 1,1,1,1\n\t"
	"psllw $4, %mm2	        # mm2 = BIAS\n");

    for (y = 0; y < srcPtr->height; y++) {
	Pix32 *dp;
	Sample *splPtr;

	dp = destRowPtr;
	for (splPtr = samples; splPtr < send; 
	     splPtr = (Sample *)((char *)splPtr + bytesPerSample)) {
	    Pix32 *sp;
	    const void *wp, *wend;

	    /* First source pixel contributing to this destination column. */
	    sp = srcRowPtr + splPtr->start;
	    /* The asm loop advances the weight and pixel pointers, so it
	     * gets private copies bound as read-write operands.  GCC
	     * requires that input-only operands are left unmodified. */
	    wp = splPtr->weights;
	    wend = splPtr->wend;
	    asm volatile (
		/* Clear the accumulator mm5. */
                 "pxor %%mm5, %%mm5        #  mm5 = 0\n\n" 
                 ".Lasm%=:\n\t" 
		 /* Load the weighting factor into mm1. */
                 "movd (%[wp]), %%mm1      #  mm1 = W\n\t" 
		 /* Get the source RGBA pixel. */
                 "movd (%[sp]), %%mm0      #  mm0 = sp\n\t" 
		 /* Replicate the low 16 bits of the weight into all four
		  * words of mm1. */
		 "punpcklwd %%mm1, %%mm1    #  mm1 = 0,0,W,W\n\t"
		 "punpcklwd %%mm1, %%mm1    #  mm1 = W,W,W,W\n\t"
		 /* Unpack the pixel into mm0. */
                 "punpcklbw %%mm6, %%mm0   #  mm0 = Sa,Sr,Sg,Sb\n\t" 
		 /* Scale the 8-bit components to 15 bits: (S * 257) >> 1 */
                 "movq %%mm0, %%mm3        #  mm3 = S8\n\t" 
                 "psllw $8, %%mm3          #  mm3 = S8 * 256\n\t" 
                 "paddw %%mm3, %%mm0       #  mm0 = S16\n\t" 
                 "psrlw $1, %%mm0          #  mm0 = S15\n\t" 
		 /* Multiply each pixel component by the weight.  Note
		  * that the lower 16-bits of the product are
		  * truncated (bad) creating round-off error in the
		  * sum. */
                 "pmulhw %%mm1, %%mm0      #  mm0 = S15 * W14\n\t" 
                 /* Add the 16-bit components to mm5. */
                 "paddsw %%mm0, %%mm5      #  mm5 = A13 + mm5\n\t" 
                 /* Move the pointers to the next weight and pixel */
                 "add $4, %[wp]            #  wp++\n\t" 
                 "add $4, %[sp]            #  sp++\n\t" 
                 "cmp %[wend], %[wp]       #  wend == wp\n\t" 
                 "jnz .Lasm%=\n\t" 
                 /* end loop */
                 /* Add the bias held in mm2 to the pixel sum. */
                 "paddw %%mm2, %%mm5       # mm5 = A13 + BIAS\n\t" 
                 /* Shift off fractional portion. */
                 "psraw $5, %%mm5          # mm5 = A8\n\t" 
		 /* Pack 16-bit components into lower 4 bytes. */
                 "packuswb %%mm5, %%mm5    # Pack A8 into low 4 bytes.\n\t" 
		 /* Store the word (pixel) in the destination. */
                 "movd %%mm5,(%[dp])       # dp = word\n" :
  		 /* read-write registers (advanced by the loop) */ 
		 [wp] "+r" (wp), 
		 [sp] "+r" (sp) :
		 /* input registers */
		 [dp] "r" (dp), 
		 [wend] "r" (wend) :
		 /* The statement reads and writes memory through its
		  * pointer operands and changes the flags. */
		 "memory", "cc");
#ifdef notdef
	    if (dp->Alpha != 0xFF) {
		fprintf(stdout, "mmx h-alpha=0x%x\n", dp->Alpha);
	    }
#endif
	    dp++;
	}
	srcRowPtr += srcPtr->pixelsPerRow;
	destRowPtr += destPtr->pixelsPerRow;
    }
    asm volatile ("emms");
    /* Free the memory allocated for horizontal filter weights. */
    Blt_Free(samples);
}

#endif /* HAVE_X86_ASM */
