$NetBSD: patch-ah,v 1.2 2006/08/09 17:55:51 mrg Exp $

--- video/output.c.orig	2001-05-19 20:05:26.000000000 -0700
+++ video/output.c	2006-08-09 10:39:35.000000000 -0700
@@ -4,23 +4,29 @@
 
 #define CLIP(x)  ((x) >= 0 ? ((x) < 255 ? (x) : 255) : 0)
 
+#if defined(__GNUC__) && __GNUC__ > 3
+#define LMUSED	__attribute__((__used__))
+#else
+#define LMUSED
+#endif
+
 static long long mpeg3_MMX_0 = 0L;
-static unsigned long  mpeg3_MMX_10w[]         = {0x00100010, 0x00100010};                     /*dd    00010 0010h, 000100010h */
-static unsigned long  mpeg3_MMX_80w[]         = {0x00800080, 0x00800080};                     /*dd    00080 0080h, 000800080h */
+static unsigned long  mpeg3_MMX_10w[]         LMUSED = {0x00100010, 0x00100010};                     /*dd    00010 0010h, 000100010h */
+static unsigned long  mpeg3_MMX_80w[]         LMUSED = {0x00800080, 0x00800080};                     /*dd    00080 0080h, 000800080h */
 
-static unsigned long  mpeg3_MMX_00FFw[]       = {0x00ff00ff, 0x00ff00ff};                     /*dd    000FF 00FFh, 000FF00FFh */
+static unsigned long  mpeg3_MMX_00FFw[]       LMUSED = {0x00ff00ff, 0x00ff00ff};                     /*dd    000FF 00FFh, 000FF00FFh */
 
-static unsigned short mpeg3_MMX_Ublucoeff[]   = {0x81, 0x81, 0x81, 0x81};                     /*dd    00081 0081h, 000810081h */
-static unsigned short mpeg3_MMX_Vredcoeff[]   = {0x66, 0x66, 0x66, 0x66};                     /*dd    00066 0066h, 000660066h */
+static unsigned short mpeg3_MMX_Ublucoeff[]   LMUSED = {0x81, 0x81, 0x81, 0x81};                     /*dd    00081 0081h, 000810081h */
+static unsigned short mpeg3_MMX_Vredcoeff[]   LMUSED = {0x66, 0x66, 0x66, 0x66};                     /*dd    00066 0066h, 000660066h */
 
-static unsigned short mpeg3_MMX_Ugrncoeff[]   = {0xffe8, 0xffe8, 0xffe8, 0xffe8};             /*dd    0FFE7 FFE7h, 0FFE7FFE7h */
-static unsigned short mpeg3_MMX_Vgrncoeff[]   = {0xffcd, 0xffcd, 0xffcd, 0xffcd};             /*dd    0FFCC FFCCh, 0FFCCFFCCh */
+static unsigned short mpeg3_MMX_Ugrncoeff[]   LMUSED = {0xffe8, 0xffe8, 0xffe8, 0xffe8};             /*dd    0FFE7 FFE7h, 0FFE7FFE7h */
+static unsigned short mpeg3_MMX_Vgrncoeff[]   LMUSED = {0xffcd, 0xffcd, 0xffcd, 0xffcd};             /*dd    0FFCC FFCCh, 0FFCCFFCCh */
 
-static unsigned short mpeg3_MMX_Ycoeff[]      = {0x4a, 0x4a, 0x4a, 0x4a};                     /*dd    0004A 004Ah, 0004A004Ah */
+static unsigned short mpeg3_MMX_Ycoeff[]      LMUSED = {0x4a, 0x4a, 0x4a, 0x4a};                     /*dd    0004A 004Ah, 0004A004Ah */
 
-static unsigned short mpeg3_MMX_redmask[]     = {0xf800, 0xf800, 0xf800, 0xf800};             /*dd    07c00 7c00h, 07c007c00h */
+static unsigned short mpeg3_MMX_redmask[]     LMUSED = {0xf800, 0xf800, 0xf800, 0xf800};             /*dd    07c00 7c00h, 07c007c00h */
 
-static unsigned short mpeg3_MMX_grnmask[]     = {0x7e0, 0x7e0, 0x7e0, 0x7e0};                 /*dd    003e0 03e0h, 003e003e0h */
+static unsigned short mpeg3_MMX_grnmask[]     LMUSED = {0x7e0, 0x7e0, 0x7e0, 0x7e0};                 /*dd    003e0 03e0h, 003e003e0h */
 
 static unsigned char mpeg3_601_to_rgb[256];
 
@@ -207,50 +213,50 @@
 		);
 }
 
-static unsigned long long  mpeg3_MMX_U_80 = 0x0000008000800000;
-static unsigned long long  mpeg3_MMX_V_80 = 0x0000000000800080;
-static long long  mpeg3_MMX_U_COEF        = 0x00000058ffd30000;
-static long long  mpeg3_MMX_V_COEF        = 0x00000000ffea006f;
-static long long  mpeg3_MMX_601_Y_COEF    = 0x0000004800480048;
-static long long  mpeg3_MMX_601_Y_DIFF    = 0x0000000000000010;
+static unsigned long long  mpeg3_MMX_U_80 LMUSED = 0x0000008000800000LL;
+static unsigned long long  mpeg3_MMX_V_80 LMUSED = 0x0000000000800080LL;
+static long long  mpeg3_MMX_U_COEF        LMUSED = 0x00000058ffd30000LL;
+static long long  mpeg3_MMX_V_COEF        LMUSED = 0x00000000ffea006fLL;
+static long long  mpeg3_MMX_601_Y_COEF    LMUSED = 0x0000004800480048LL;
+static long long  mpeg3_MMX_601_Y_DIFF    LMUSED = 0x0000000000000010LL;
 
 inline void mpeg3_bgra32_mmx(unsigned long y, 
 		unsigned long u, 
 		unsigned long v, 
 		unsigned long *output)
 {
-asm("
-/* Output will be 0x00rrggbb with the 00 trailing so this can also be used */
-/* for bgr24. */
-	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */
-	movd (%1), %%mm1;          /* Load u    0x00000000000000cr */
-	movq %%mm0, %%mm3;         /* Copy y to temp */
-	psllq $16, %%mm1;          /* Shift u   0x0000000000cr0000 */
-	movd (%2), %%mm2;          /* Load v    0x00000000000000cb */
-	psllq $16, %%mm3;          /* Shift y */
-	movq %%mm1, %%mm4;         /* Copy u to temp */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */
-	psllq $16, %%mm4;          /* Shift u */
-	movq %%mm2, %%mm5;         /* Copy v to temp */
-	psllq $16, %%mm3;          /* Shift y  */
-	por %%mm4, %%mm1;          /* Overlay new u byte 0x000000cr00cr0000 */
-	psllq $16, %%mm5;          /* Shift v  */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */
-	por %%mm5, %%mm2;          /* Overlay new v byte 0x0000000000cb00cb */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x000000uu00uu0000 mm2: 0x0000000000vv00vv */
- 	psubw mpeg3_MMX_U_80, %%mm1;    /* Subtract 128 from u 0x000000uu00uu0000 */
- 	pmullw mpeg3_MMX_U_COEF, %%mm1; /* Multiply u coeffs 0x0000uuuuuuuu0000 */
- 	psllw $6, %%mm0;                /* Shift y coeffs 0x0000yyy0yyy0yyy0 */
- 	psubw mpeg3_MMX_V_80, %%mm2;    /* Subtract 128 from v 0x0000000000cb00cb */
- 	pmullw mpeg3_MMX_V_COEF, %%mm2; /* Multiply v coeffs 0x0000crcrcrcrcrcr */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x0000uuuuuuuu0000 mm2: 0x00000000vvvvvvvv */
-	paddsw %%mm1, %%mm0;        /* Add u to result */
-	paddsw %%mm2, %%mm0;        /* Add v to result 0x0000rrrrggggbbbb */
-	psraw $6, %%mm0;           /* Demote precision */
-	packuswb %%mm0, %%mm0;     /* Pack into ARGB 0x0000000000rrggbb */
-	movd %%mm0, (%3);          /* Store output */
+asm("\n\
+/* Output will be 0x00rrggbb with the 00 trailing so this can also be used */\n\
+/* for bgr24. */\n\
+	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */\n\
+	movd (%1), %%mm1;          /* Load u    0x00000000000000cr */\n\
+	movq %%mm0, %%mm3;         /* Copy y to temp */\n\
+	psllq $16, %%mm1;          /* Shift u   0x0000000000cr0000 */\n\
+	movd (%2), %%mm2;          /* Load v    0x00000000000000cb */\n\
+	psllq $16, %%mm3;          /* Shift y */\n\
+	movq %%mm1, %%mm4;         /* Copy u to temp */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */\n\
+	psllq $16, %%mm4;          /* Shift u */\n\
+	movq %%mm2, %%mm5;         /* Copy v to temp */\n\
+	psllq $16, %%mm3;          /* Shift y  */\n\
+	por %%mm4, %%mm1;          /* Overlay new u byte 0x000000cr00cr0000 */\n\
+	psllq $16, %%mm5;          /* Shift v  */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */\n\
+	por %%mm5, %%mm2;          /* Overlay new v byte 0x0000000000cb00cb */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x000000uu00uu0000 mm2: 0x0000000000vv00vv */\n\
+ 	psubw mpeg3_MMX_U_80, %%mm1;    /* Subtract 128 from u 0x000000uu00uu0000 */\n\
+ 	pmullw mpeg3_MMX_U_COEF, %%mm1; /* Multiply u coeffs 0x0000uuuuuuuu0000 */\n\
+ 	psllw $6, %%mm0;                /* Shift y coeffs 0x0000yyy0yyy0yyy0 */\n\
+ 	psubw mpeg3_MMX_V_80, %%mm2;    /* Subtract 128 from v 0x0000000000cb00cb */\n\
+ 	pmullw mpeg3_MMX_V_COEF, %%mm2; /* Multiply v coeffs 0x0000crcrcrcrcrcr */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x0000uuuuuuuu0000 mm2: 0x00000000vvvvvvvv */\n\
+	paddsw %%mm1, %%mm0;        /* Add u to result */\n\
+	paddsw %%mm2, %%mm0;        /* Add v to result 0x0000rrrrggggbbbb */\n\
+	psraw $6, %%mm0;           /* Demote precision */\n\
+	packuswb %%mm0, %%mm0;     /* Pack into ARGB 0x0000000000rrggbb */\n\
+	movd %%mm0, (%3);          /* Store output */\n\
 	"
 :
 : "r" (&y), "r" (&u), "r" (&v), "r" (output));
@@ -261,86 +267,86 @@
 		unsigned long v, 
 		unsigned long *output)
 {
-asm("
-/* Output will be 0x00rrggbb with the 00 trailing so this can also be used */
-/* for bgr24. */
-	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */
-	psubsw mpeg3_MMX_601_Y_DIFF, %%mm0;      /* Subtract 16 from y */
-	movd (%1), %%mm1;          /* Load u    0x00000000000000cr */
-	movq %%mm0, %%mm3;         /* Copy y to temp */
-	psllq $16, %%mm1;          /* Shift u   0x0000000000cr0000 */
-	movd (%2), %%mm2;          /* Load v    0x00000000000000cb */
-	psllq $16, %%mm3;          /* Shift y */
-	movq %%mm1, %%mm4;         /* Copy u to temp */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */
-	psllq $16, %%mm4;          /* Shift u */
-	movq %%mm2, %%mm5;         /* Copy v to temp */
-	psllq $16, %%mm3;          /* Shift y  */
-	por %%mm4, %%mm1;          /* Overlay new u byte 0x000000cr00cr0000 */
-	psllq $16, %%mm5;          /* Shift v  */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */
-	por %%mm5, %%mm2;          /* Overlay new v byte 0x0000000000cb00cb */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x000000uu00uu0000 mm2: 0x0000000000vv00vv */
-	pmullw mpeg3_MMX_601_Y_COEF, %%mm0; /* Scale and shift y coeffs */
-	psubw mpeg3_MMX_U_80, %%mm1;     /* Subtract 128 from u 0x000000uu00uu0000 */
- 	pmullw mpeg3_MMX_U_COEF, %%mm1;  /* Multiply u coeffs 0x0000uuuuuuuu0000 */
-	psubw mpeg3_MMX_V_80, %%mm2;     /* Subtract 128 from v 0x0000000000cb00cb */
- 	pmullw mpeg3_MMX_V_COEF, %%mm2;  /* Multiply v coeffs 0x0000crcrcrcrcrcr */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x0000uuuuuuuu0000 mm2: 0x00000000vvvvvvvv */
-	paddsw %%mm1, %%mm0;        /* Add u to result */
-	paddsw %%mm2, %%mm0;        /* Add v to result 0x0000rrrrggggbbbb */
-	psraw $6, %%mm0;           /* Demote precision */
-	packuswb %%mm0, %%mm0;     /* Pack into ARGB 0x0000000000rrggbb */
-	movd %%mm0, (%3);          /* Store output */
+asm("\n\
+/* Output will be 0x00rrggbb with the 00 trailing so this can also be used */\n\
+/* for bgr24. */\n\
+	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */\n\
+	psubsw mpeg3_MMX_601_Y_DIFF, %%mm0;      /* Subtract 16 from y */\n\
+	movd (%1), %%mm1;          /* Load u    0x00000000000000cr */\n\
+	movq %%mm0, %%mm3;         /* Copy y to temp */\n\
+	psllq $16, %%mm1;          /* Shift u   0x0000000000cr0000 */\n\
+	movd (%2), %%mm2;          /* Load v    0x00000000000000cb */\n\
+	psllq $16, %%mm3;          /* Shift y */\n\
+	movq %%mm1, %%mm4;         /* Copy u to temp */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */\n\
+	psllq $16, %%mm4;          /* Shift u */\n\
+	movq %%mm2, %%mm5;         /* Copy v to temp */\n\
+	psllq $16, %%mm3;          /* Shift y  */\n\
+	por %%mm4, %%mm1;          /* Overlay new u byte 0x000000cr00cr0000 */\n\
+	psllq $16, %%mm5;          /* Shift v  */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */\n\
+	por %%mm5, %%mm2;          /* Overlay new v byte 0x0000000000cb00cb */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x000000uu00uu0000 mm2: 0x0000000000vv00vv */\n\
+	pmullw mpeg3_MMX_601_Y_COEF, %%mm0; /* Scale and shift y coeffs */\n\
+	psubw mpeg3_MMX_U_80, %%mm1;     /* Subtract 128 from u 0x000000uu00uu0000 */\n\
+ 	pmullw mpeg3_MMX_U_COEF, %%mm1;  /* Multiply u coeffs 0x0000uuuuuuuu0000 */\n\
+	psubw mpeg3_MMX_V_80, %%mm2;     /* Subtract 128 from v 0x0000000000cb00cb */\n\
+ 	pmullw mpeg3_MMX_V_COEF, %%mm2;  /* Multiply v coeffs 0x0000crcrcrcrcrcr */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x0000uuuuuuuu0000 mm2: 0x00000000vvvvvvvv */\n\
+	paddsw %%mm1, %%mm0;        /* Add u to result */\n\
+	paddsw %%mm2, %%mm0;        /* Add v to result 0x0000rrrrggggbbbb */\n\
+	psraw $6, %%mm0;           /* Demote precision */\n\
+	packuswb %%mm0, %%mm0;     /* Pack into ARGB 0x0000000000rrggbb */\n\
+	movd %%mm0, (%3);          /* Store output */\n\
 	"
 :
 : "r" (&y), "r" (&u), "r" (&v), "r" (output));
 }
 
-static unsigned long long  mpeg3_MMX_U_80_RGB    = 0x0000000000800080;
-static unsigned long long  mpeg3_MMX_V_80_RGB    = 0x0000008000800000;
-static long long  mpeg3_MMX_U_COEF_RGB    = 0x00000000ffd30058;
-static long long  mpeg3_MMX_V_COEF_RGB    = 0x0000006fffea0000;
+static unsigned long long  mpeg3_MMX_U_80_RGB    LMUSED = 0x0000000000800080;
+static unsigned long long  mpeg3_MMX_V_80_RGB    LMUSED = 0x0000008000800000ULL;
+static long long  mpeg3_MMX_U_COEF_RGB    LMUSED = 0x00000000ffd30058;
+static long long  mpeg3_MMX_V_COEF_RGB    LMUSED = 0x0000006fffea0000ULL;
 
 inline void mpeg3_rgba32_mmx(unsigned long y, 
 		unsigned long u, 
 		unsigned long v, 
 		unsigned long *output)
 {
-asm("
-/* Output will be 0x00bbggrr with the 00 trailing so this can also be used */
-/* for rgb24. */
-	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */
-	movd (%1), %%mm1;          /* Load v    0x00000000000000vv */
-	movq %%mm0, %%mm3;         /* Copy y to temp */
-	psllq $16, %%mm1;          /* Shift v   0x0000000000vv0000 */
-	movd (%2), %%mm2;          /* Load u    0x00000000000000uu */
-	psllq $16, %%mm3;          /* Shift y */
-	movq %%mm1, %%mm4;         /* Copy v to temp */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */
-	psllq $16, %%mm4;          /* Shift v */
-	movq %%mm2, %%mm5;         /* Copy u to temp */
-	psllq $16, %%mm3;          /* Shift y  */
-	por %%mm4, %%mm1;          /* Overlay new v byte 0x000000vv00vv0000 */
-	psllq $16, %%mm5;          /* Shift u  */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */
-	por %%mm5, %%mm2;          /* Overlay new u byte 0x0000000000uu00uu */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x000000vv00vv0000 mm2: 0x0000000000uu00uu */
- 	psubw mpeg3_MMX_V_80_RGB, %%mm1;    /* Subtract 128 from v 0x000000vv00vv0000 */
- 	pmullw mpeg3_MMX_V_COEF_RGB, %%mm1; /* Multiply v coeffs 0x0000vvvvvvvv0000 */
- 	psllw $6, %%mm0;                /* Shift y coeffs 0x0000yyy0yyy0yyy0 */
- 	psubw mpeg3_MMX_U_80_RGB, %%mm2;    /* Subtract 128 from u 0x0000000000uu00uu */
- 	pmullw mpeg3_MMX_U_COEF_RGB, %%mm2; /* Multiply u coeffs 0x0000uuuuuuuuuuuu */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x0000vvvvvvvv0000 mm2: 0x00000000uuuuuuuu */
-	paddsw %%mm1, %%mm0;        /* Add v to result */
-	paddsw %%mm2, %%mm0;        /* Add u to result 0x0000bbbbggggrrrr */
-	psraw $6, %%mm0;           /* Demote precision */
-	packuswb %%mm0, %%mm0;     /* Pack into RGBA 0x0000000000bbggrr */
-	movd %%mm0, (%3);          /* Store output */
+asm("\n\
+/* Output will be 0x00bbggrr with the 00 trailing so this can also be used */\n\
+/* for rgb24. */\n\
+	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */\n\
+	movd (%1), %%mm1;          /* Load v    0x00000000000000vv */\n\
+	movq %%mm0, %%mm3;         /* Copy y to temp */\n\
+	psllq $16, %%mm1;          /* Shift v   0x0000000000vv0000 */\n\
+	movd (%2), %%mm2;          /* Load u    0x00000000000000uu */\n\
+	psllq $16, %%mm3;          /* Shift y */\n\
+	movq %%mm1, %%mm4;         /* Copy v to temp */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */\n\
+	psllq $16, %%mm4;          /* Shift v */\n\
+	movq %%mm2, %%mm5;         /* Copy u to temp */\n\
+	psllq $16, %%mm3;          /* Shift y  */\n\
+	por %%mm4, %%mm1;          /* Overlay new v byte 0x000000vv00vv0000 */\n\
+	psllq $16, %%mm5;          /* Shift u  */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */\n\
+	por %%mm5, %%mm2;          /* Overlay new u byte 0x0000000000uu00uu */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x000000vv00vv0000 mm2: 0x0000000000uu00uu */\n\
+ 	psubw mpeg3_MMX_V_80_RGB, %%mm1;    /* Subtract 128 from v 0x000000vv00vv0000 */\n\
+ 	pmullw mpeg3_MMX_V_COEF_RGB, %%mm1; /* Multiply v coeffs 0x0000vvvvvvvv0000 */\n\
+ 	psllw $6, %%mm0;                /* Shift y coeffs 0x0000yyy0yyy0yyy0 */\n\
+ 	psubw mpeg3_MMX_U_80_RGB, %%mm2;    /* Subtract 128 from u 0x0000000000uu00uu */\n\
+ 	pmullw mpeg3_MMX_U_COEF_RGB, %%mm2; /* Multiply u coeffs 0x0000uuuuuuuuuuuu */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x0000vvvvvvvv0000 mm2: 0x00000000uuuuuuuu */\n\
+	paddsw %%mm1, %%mm0;        /* Add v to result */\n\
+	paddsw %%mm2, %%mm0;        /* Add u to result 0x0000bbbbggggrrrr */\n\
+	psraw $6, %%mm0;           /* Demote precision */\n\
+	packuswb %%mm0, %%mm0;     /* Pack into RGBA 0x0000000000bbggrr */\n\
+	movd %%mm0, (%3);          /* Store output */\n\
 	"
 :
 : "r" (&y), "r" (&v), "r" (&u), "r" (output));
@@ -351,39 +357,39 @@
 		unsigned long v, 
 		unsigned long *output)
 {
-asm("
-/* Output will be 0x00bbggrr with the 00 trailing so this can also be used */
-/* for rgb24. */
-	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */
-	psubsw mpeg3_MMX_601_Y_DIFF, %%mm0;      /* Subtract 16 from y */
-	movd (%1), %%mm1;          /* Load v    0x00000000000000vv */
-	movq %%mm0, %%mm3;         /* Copy y to temp */
-	psllq $16, %%mm1;          /* Shift v   0x0000000000vv0000 */
-	movd (%2), %%mm2;          /* Load u    0x00000000000000uu */
-	psllq $16, %%mm3;          /* Shift y */
-	movq %%mm1, %%mm4;         /* Copy v to temp */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */
-	psllq $16, %%mm4;          /* Shift v */
-	movq %%mm2, %%mm5;         /* Copy u to temp */
-	psllq $16, %%mm3;          /* Shift y  */
-	por %%mm4, %%mm1;          /* Overlay new v byte 0x000000vv00vv0000 */
-	psllq $16, %%mm5;          /* Shift u  */
-	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */
-	por %%mm5, %%mm2;          /* Overlay new u byte 0x0000000000uu00uu */
-
-/* mm0: 0x000000yy00yy00yy     mm1: 0x000000vv00vv0000     mm2: 0x0000000000uu00uu */
-	pmullw mpeg3_MMX_601_Y_COEF, %%mm0;     /* Scale y coeffs */
- 	psubw mpeg3_MMX_V_80_RGB, %%mm1;    /* Subtract 128 from v 0x000000vv00vv0000 */
- 	pmullw mpeg3_MMX_V_COEF_RGB, %%mm1; /* Multiply v coeffs 0x0000vvvvvvvv0000 */
- 	psubw mpeg3_MMX_U_80_RGB, %%mm2;    /* Subtract 128 from u 0x0000000000uu00uu */
- 	pmullw mpeg3_MMX_U_COEF_RGB, %%mm2; /* Multiply u coeffs 0x0000uuuuuuuuuuuu */
-
-/* mm0: 0x000000yy00yy00yy mm1: 0x0000vvvvvvvv0000 mm2: 0x00000000uuuuuuuu */
-	paddsw %%mm1, %%mm0;        /* Add v to result */
-	paddsw %%mm2, %%mm0;        /* Add u to result 0x0000bbbbggggrrrr */
-	psraw $6, %%mm0;           /* Demote precision */
-	packuswb %%mm0, %%mm0;     /* Pack into RGBA 0x0000000000bbggrr */
-	movd %%mm0, (%3);          /* Store output */
+asm("\n\
+/* Output will be 0x00bbggrr with the 00 trailing so this can also be used */\n\
+/* for rgb24. */\n\
+	movd (%0), %%mm0;          /* Load y   0x00000000000000yy */\n\
+	psubsw mpeg3_MMX_601_Y_DIFF, %%mm0;      /* Subtract 16 from y */\n\
+	movd (%1), %%mm1;          /* Load v    0x00000000000000vv */\n\
+	movq %%mm0, %%mm3;         /* Copy y to temp */\n\
+	psllq $16, %%mm1;          /* Shift v   0x0000000000vv0000 */\n\
+	movd (%2), %%mm2;          /* Load u    0x00000000000000uu */\n\
+	psllq $16, %%mm3;          /* Shift y */\n\
+	movq %%mm1, %%mm4;         /* Copy v to temp */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x0000000000yy00yy */\n\
+	psllq $16, %%mm4;          /* Shift v */\n\
+	movq %%mm2, %%mm5;         /* Copy u to temp */\n\
+	psllq $16, %%mm3;          /* Shift y  */\n\
+	por %%mm4, %%mm1;          /* Overlay new v byte 0x000000vv00vv0000 */\n\
+	psllq $16, %%mm5;          /* Shift u  */\n\
+	por %%mm3, %%mm0;          /* Overlay new y byte 0x000000yy00yy00yy */\n\
+	por %%mm5, %%mm2;          /* Overlay new u byte 0x0000000000uu00uu */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy     mm1: 0x000000vv00vv0000     mm2: 0x0000000000uu00uu */\n\
+	pmullw mpeg3_MMX_601_Y_COEF, %%mm0;     /* Scale y coeffs */\n\
+ 	psubw mpeg3_MMX_V_80_RGB, %%mm1;    /* Subtract 128 from v 0x000000vv00vv0000 */\n\
+ 	pmullw mpeg3_MMX_V_COEF_RGB, %%mm1; /* Multiply v coeffs 0x0000vvvvvvvv0000 */\n\
+ 	psubw mpeg3_MMX_U_80_RGB, %%mm2;    /* Subtract 128 from u 0x0000000000uu00uu */\n\
+ 	pmullw mpeg3_MMX_U_COEF_RGB, %%mm2; /* Multiply u coeffs 0x0000uuuuuuuuuuuu */\n\
+\n\
+/* mm0: 0x000000yy00yy00yy mm1: 0x0000vvvvvvvv0000 mm2: 0x00000000uuuuuuuu */\n\
+	paddsw %%mm1, %%mm0;        /* Add v to result */\n\
+	paddsw %%mm2, %%mm0;        /* Add u to result 0x0000bbbbggggrrrr */\n\
+	psraw $6, %%mm0;           /* Demote precision */\n\
+	packuswb %%mm0, %%mm0;     /* Pack into RGBA 0x0000000000bbggrr */\n\
+	movd %%mm0, (%3);          /* Store output */\n\
 	"
 :
 : "r" (&y), "r" (&v), "r" (&u), "r" (output));
@@ -482,11 +488,14 @@
 	*data++ = CLIP(r_l); \
 	*data++ = 0;
 
-#define STORE_PIXEL_RGB565 \
-	*((unsigned short*)data)++ = \
-		((CLIP(r_l) & 0xf8) << 8) | \
-		((CLIP(g_l) & 0xfc) << 3) | \
-		((CLIP(b_l) & 0xf8) >> 3);
+#define STORE_PIXEL_RGB565 { \
+		unsigned short *sdata = (unsigned short *)data; \
+		*((unsigned short*)sdata)++ = \
+			((CLIP(r_l) & 0xf8) << 8) | \
+			((CLIP(g_l) & 0xfc) << 3) | \
+			((CLIP(b_l) & 0xf8) >> 3); \
+		data = (unsigned char *)sdata; \
+	}
 
 #define STORE_PIXEL_RGB888 \
 	*data++ = CLIP(r_l); \
