| /* |
| * |
| * GStreamer |
| * Copyright (c) 2001 Tom Barry. All rights reserved. |
| * Copyright (C) 2008,2010 Sebastian Dröge <slomo@collabora.co.uk> |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Library General Public |
| * License as published by the Free Software Foundation; either |
| * version 2 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Library General Public License for more details. |
| * |
| * You should have received a copy of the GNU Library General Public |
| * License along with this library; if not, write to the |
| * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, |
| * Boston, MA 02110-1301, USA. |
| */ |
| |
| |
| /* |
| * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry. |
| * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 |
| */ |
| |
| |
| #include "x86-64_macros.inc" |
| |
| static void |
| FUNCT_NAME_YUY2 (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width) |
| { |
| |
| // in tight loop some vars are accessed faster in local storage |
| gint64 YMask = 0x00ff00ff00ff00ffull; // to keep only luma |
| gint64 UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma |
| gint64 ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma |
| gint64 QW256 = 0x0100010001000100ull; // 4 256's |
| gint64 MaxComb; |
| gint64 MotionThreshold; |
| gint64 MotionSense; |
| gint64 i; |
| glong LoopCtr; |
| glong oldbx = 0; |
| |
| gint64 QW256B; |
| gint64 LastAvg = 0; //interp value from left qword |
| |
| // FIXME: Use C implementation if the width is not a multiple of 4 |
| // Do something more optimal later |
| if (width % 4 != 0) |
| C_FUNCT_YUY2 (self, L1, L2, L3, L2P, Dest, width); |
| |
| // Set up our two parms that are actually evaluated for each pixel |
| i = self->max_comb; |
| MaxComb = |
| i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; |
| |
| i = self->motion_threshold; // scale to range of 0-257 |
| MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; |
| |
| i = self->motion_sense; // scale to range of 0-257 |
| MotionSense = i << 48 | i << 32 | i << 16 | i; |
| |
| i = 0xffffffff - 256; |
| QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. |
| |
| LoopCtr = width / 8 - 1; // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop |
| |
| // For ease of reading, the comments below assume that we're operating on an odd |
| // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. |
| __asm__ __volatile__ ( |
| // save ebx (-fPIC) |
| MOVX " %%" XBX ", %[oldbx]\n\t" |
| MOVX " %[L1], %%" XAX "\n\t" |
| LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR |
| MOVX " %[L3], %%" XCX "\n\t" |
| SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset |
| MOVX " %[L2P], %%" XDX "\n\t" |
| MOVX " %[L2], %%" XSI "\n\t" |
| MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even |
| |
| ".align 8\n\t" |
| "1:\n\t" |
| "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value |
| "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel |
| "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel |
| "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row |
| "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp |
| |
| // pavgb mm6, mm3 // use macro below |
| V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") |
| |
| // DJR - Diagonal Jaggie Reduction |
| // In the event that we are going to use an average (Bob) pixel we do not want a jagged |
| // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the |
| // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. |
| |
| "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row |
| "movq %%mm6, %[LastAvg]\n\t" // save for next pass |
| "psrlq $48, %%mm4\n\t" // right justify 1 pixel |
| "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel |
| "psllq $16, %%mm7\n\t" // left justify 3 pixels |
| "por %%mm7, %%mm4\n\t" // and combine |
| "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1 |
| // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below |
| |
| V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]") |
| "psllq $48, %%mm5\n\t" // left just 1 pixel |
| "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel |
| "psrlq $16, %%mm7\n\t" // right just 3 pixels |
| "por %%mm7, %%mm5\n\t" // combine |
| // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro |
| V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX |
| // pavgb mm6, mm4 // avg of center and surround interp vals, use macro |
| V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") |
| |
| // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. |
| #ifndef IS_MMX |
| // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent |
| V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") |
| // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent |
| V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") |
| #endif |
| |
| // get abs value of possible L2 comb |
| "movq %%mm6, %%mm4\n\t" // work copy of interp val |
| "movq %%mm2, %%mm7\n\t" // L2 |
| "psubusb %%mm4, %%mm7\n\t" // L2 - avg |
| "movq %%mm4, %%mm5\n\t" // avg |
| "psubusb %%mm2, %%mm5\n\t" // avg - L2 |
| "por %%mm7, %%mm5\n\t" // abs(avg-L2) |
| |
| // get abs value of possible L2P comb |
| "movq %%mm0, %%mm7\n\t" // L2P |
| "psubusb %%mm4, %%mm7\n\t" // L2P - avg |
| "psubusb %%mm0, %%mm4\n\t" // avg - L2P |
| "por %%mm7, %%mm4\n\t" // abs(avg-L2P) |
| |
| // use L2 or L2P depending upon which makes smaller comb |
| "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero |
| "psubusb %%mm5, %%mm5\n\t" // 0 |
| "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 |
| "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 |
| |
| // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 |
| "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 |
| "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 |
| "por %%mm5, %%mm4\n\t" // may the best win |
| |
| // Inventory: at this point we have the following values: |
| // mm0 = L2P (or L2) |
| // mm1 = L1 |
| // mm2 = L2 (or L2P) |
| // mm3 = L3 |
| // mm4 = the best of L2,L2P weave pixel, base upon comb |
| // mm6 = the avg interpolated value, if we need to use it |
| // Let's measure movement, as how much the weave pixel has changed |
| |
| "movq %%mm2, %%mm7\n\t" |
| "psubusb %%mm0, %%mm2\n\t" |
| "psubusb %%mm7, %%mm0\n\t" |
| "por %%mm2, %%mm0\n\t" // abs value of change, used later |
| |
| // Now lets clip our chosen value to be not outside of the range |
| // of the high/low range L1-L3 by more than MaxComb. |
| // This allows some comb but limits the damages and also allows more |
| // detail than a boring oversmoothed clip. |
| |
| "movq %%mm1, %%mm2\n\t" // copy L1 |
| // pmaxub mm2, mm3 // use macro |
| V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) |
| "movq %%mm1, %%mm5\n\t" // copy L1 |
| // pminub mm5, mm3 // now = Min(L1,L3), use macro |
| V_PMINUB ("%%mm5", "%%mm3", "%%mm7") |
| |
| // allow the value to be above the high or below the low by amt of MaxComb |
| "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff |
| "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff |
| // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro |
| V_PMAXUB ("%%mm4", "%%mm5") |
| // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped |
| V_PMINUB ("%%mm4", "%%mm2", "%%mm7") |
| |
| // Blend weave pixel with bob pixel, depending on motion val in mm0 |
| "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change |
| "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits |
| "movq %[QW256], %%mm7\n\t" |
| #ifdef IS_MMXEXT |
| "pminsw %%mm7, %%mm0\n\t" // max = 256 |
| #else |
| "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. |
| "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) |
| #endif |
| "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg |
| "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing |
| "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value |
| "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion |
| "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value |
| "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion |
| "paddusw %%mm6, %%mm4\n\t" // combine |
| "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg |
| // chroma comes from weave pixel |
| "pand %[UVMask], %%mm2\n\t" // keep chroma |
| "por %%mm4, %%mm2\n\t" // and combine |
| V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro |
| // bump ptrs and loop |
| LEAX " 8(%%" XAX "), %%" XAX "\n\t" |
| LEAX " 8(%%" XBX "), %%" XBX "\n\t" |
| LEAX " 8(%%" XDX "), %%" XDX "\n\t" |
| LEAX " 8(%%" XDI "), %%" XDI "\n\t" |
| LEAX " 8(%%" XSI "), %%" XSI "\n\t" |
| DECX " %[LoopCtr]\n\t" |
| |
| "jg 1b\n\t" // loop if not to last line |
| // note P-III default assumes backward branches taken |
| "jl 1f\n\t" // done |
| MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1 |
| "jmp 1b\n\t" |
| |
| "1:\n\t" |
| MOVX " %[oldbx], %%" XBX "\n\t" |
| "emms\n\t": /* no outputs */ |
| |
| :[LastAvg] "m" (LastAvg), |
| [L1] "m" (L1), |
| [L3] "m" (L3), |
| [L2P] "m" (L2P), |
| [L2] "m" (L2), |
| [Dest] "m" (Dest), |
| [ShiftMask] "m" (ShiftMask), |
| [MaxComb] "m" (MaxComb), |
| [MotionThreshold] "m" (MotionThreshold), |
| [MotionSense] "m" (MotionSense), |
| [QW256B] "m" (QW256B), |
| [YMask] "m" (YMask), |
| [UVMask] "m" (UVMask), |
| [LoopCtr] "m" (LoopCtr), |
| [QW256] "m" (QW256), |
| [oldbx] "m" (oldbx) |
| : XAX, XCX, XDX, XSI, XDI, |
| "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", |
| #ifdef __MMX__ |
| "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", |
| #endif |
| "memory", "cc"); |
| } |
| |
| static void |
| FUNCT_NAME_UYVY (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width) |
| { |
| |
| // in tight loop some vars are accessed faster in local storage |
| gint64 YMask = 0xff00ff00ff00ff00ull; // to keep only luma |
| gint64 UVMask = 0x00ff00ff00ff00ffull; // to keep only chroma |
| gint64 ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma |
| gint64 QW256 = 0x0100010001000100ull; // 4 256's |
| gint64 MaxComb; |
| gint64 MotionThreshold; |
| gint64 MotionSense; |
| gint64 i; |
| glong LoopCtr; |
| glong oldbx = 0; |
| |
| gint64 QW256B; |
| gint64 LastAvg = 0; //interp value from left qword |
| |
| // FIXME: Use C implementation if the width is not a multiple of 4 |
| // Do something more optimal later |
| if (width % 4 != 0) |
| C_FUNCT_UYVY (self, L1, L2, L3, L2P, Dest, width); |
| |
| // Set up our two parms that are actually evaluated for each pixel |
| i = self->max_comb; |
| MaxComb = |
| i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; |
| |
| i = self->motion_threshold; // scale to range of 0-257 |
| MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; |
| |
| i = self->motion_sense; // scale to range of 0-257 |
| MotionSense = i << 48 | i << 32 | i << 16 | i; |
| |
| i = 0xffffffff - 256; |
| QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. |
| |
| LoopCtr = width / 8 - 1; // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop |
| |
| // For ease of reading, the comments below assume that we're operating on an odd |
| // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. |
| __asm__ __volatile__ ( |
| // save ebx (-fPIC) |
| MOVX " %%" XBX ", %[oldbx]\n\t" |
| MOVX " %[L1], %%" XAX "\n\t" |
| LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR |
| MOVX " %[L3], %%" XCX "\n\t" |
| SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset |
| MOVX " %[L2P], %%" XDX "\n\t" |
| MOVX " %[L2], %%" XSI "\n\t" |
| MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even |
| |
| ".align 8\n\t" |
| "1:\n\t" |
| "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value |
| "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel |
| "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel |
| "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row |
| "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp |
| |
| // pavgb mm6, mm3 // use macro below |
| V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") |
| |
| // DJR - Diagonal Jaggie Reduction |
| // In the event that we are going to use an average (Bob) pixel we do not want a jagged |
| // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the |
| // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. |
| |
| "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row |
| "movq %%mm6, %[LastAvg]\n\t" // save for next pass |
| "psrlq $48, %%mm4\n\t" // right justify 1 pixel |
| "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel |
| "psllq $16, %%mm7\n\t" // left justify 3 pixels |
| "por %%mm7, %%mm4\n\t" // and combine |
| "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1 |
| // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below |
| |
| V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]") |
| "psllq $48, %%mm5\n\t" // left just 1 pixel |
| "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel |
| "psrlq $16, %%mm7\n\t" // right just 3 pixels |
| "por %%mm7, %%mm5\n\t" // combine |
| // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro |
| V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX |
| // pavgb mm6, mm4 // avg of center and surround interp vals, use macro |
| V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") |
| |
| // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. |
| #ifndef IS_MMX |
| // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent |
| V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") |
| // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent |
| V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") |
| #endif |
| |
| // get abs value of possible L2 comb |
| "movq %%mm6, %%mm4\n\t" // work copy of interp val |
| "movq %%mm2, %%mm7\n\t" // L2 |
| "psubusb %%mm4, %%mm7\n\t" // L2 - avg |
| "movq %%mm4, %%mm5\n\t" // avg |
| "psubusb %%mm2, %%mm5\n\t" // avg - L2 |
| "por %%mm7, %%mm5\n\t" // abs(avg-L2) |
| |
| // get abs value of possible L2P comb |
| "movq %%mm0, %%mm7\n\t" // L2P |
| "psubusb %%mm4, %%mm7\n\t" // L2P - avg |
| "psubusb %%mm0, %%mm4\n\t" // avg - L2P |
| "por %%mm7, %%mm4\n\t" // abs(avg-L2P) |
| |
| // use L2 or L2P depending upon which makes smaller comb |
| "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero |
| "psubusb %%mm5, %%mm5\n\t" // 0 |
| "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 |
| "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 |
| |
| // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 |
| "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 |
| "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 |
| "por %%mm5, %%mm4\n\t" // may the best win |
| |
| // Inventory: at this point we have the following values: |
| // mm0 = L2P (or L2) |
| // mm1 = L1 |
| // mm2 = L2 (or L2P) |
| // mm3 = L3 |
| // mm4 = the best of L2,L2P weave pixel, base upon comb |
| // mm6 = the avg interpolated value, if we need to use it |
| // Let's measure movement, as how much the weave pixel has changed |
| |
| "movq %%mm2, %%mm7\n\t" |
| "psubusb %%mm0, %%mm2\n\t" |
| "psubusb %%mm7, %%mm0\n\t" |
| "por %%mm2, %%mm0\n\t" // abs value of change, used later |
| |
| // Now lets clip our chosen value to be not outside of the range |
| // of the high/low range L1-L3 by more than MaxComb. |
| // This allows some comb but limits the damages and also allows more |
| // detail than a boring oversmoothed clip. |
| |
| "movq %%mm1, %%mm2\n\t" // copy L1 |
| // pmaxub mm2, mm3 // use macro |
| V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) |
| "movq %%mm1, %%mm5\n\t" // copy L1 |
| // pminub mm5, mm3 // now = Min(L1,L3), use macro |
| V_PMINUB ("%%mm5", "%%mm3", "%%mm7") |
| |
| // allow the value to be above the high or below the low by amt of MaxComb |
| "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff |
| "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff |
| // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro |
| V_PMAXUB ("%%mm4", "%%mm5") |
| // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped |
| V_PMINUB ("%%mm4", "%%mm2", "%%mm7") |
| |
| // Blend weave pixel with bob pixel, depending on motion val in mm0 |
| "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change |
| "psrlw $8, %%mm0\n\t" // div by 256 to get weighted avg |
| "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits |
| "movq %[QW256], %%mm7\n\t" |
| #ifdef IS_MMXEXT |
| "pminsw %%mm7, %%mm0\n\t" // max = 256 |
| #else |
| "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. |
| "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) |
| #endif |
| "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg |
| "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing |
| "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value |
| "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg |
| "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion |
| "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value |
| "psrlw $8, %%mm6\n\t" // div by 256 to get weighted avg |
| "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion |
| "paddusw %%mm6, %%mm4\n\t" // combine |
| "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value |
| // chroma comes from weave pixel |
| "pand %[UVMask], %%mm2\n\t" // keep chroma |
| "por %%mm4, %%mm2\n\t" // and combine |
| V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro |
| // bump ptrs and loop |
| LEAX " 8(%%" XAX "), %%" XAX "\n\t" |
| LEAX " 8(%%" XBX "), %%" XBX "\n\t" |
| LEAX " 8(%%" XDX "), %%" XDX "\n\t" |
| LEAX " 8(%%" XDI "), %%" XDI "\n\t" |
| LEAX " 8(%%" XSI "), %%" XSI "\n\t" |
| DECX " %[LoopCtr]\n\t" |
| |
| "jg 1b\n\t" // loop if not to last line |
| // note P-III default assumes backward branches taken |
| "jl 1f\n\t" // done |
| MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1 |
| "jmp 1b\n\t" |
| |
| "1:\n\t" |
| MOVX " %[oldbx], %%" XBX "\n\t" |
| "emms\n\t": /* no outputs */ |
| |
| :[LastAvg] "m" (LastAvg), |
| [L1] "m" (L1), |
| [L3] "m" (L3), |
| [L2P] "m" (L2P), |
| [L2] "m" (L2), |
| [Dest] "m" (Dest), |
| [ShiftMask] "m" (ShiftMask), |
| [MaxComb] "m" (MaxComb), |
| [MotionThreshold] "m" (MotionThreshold), |
| [MotionSense] "m" (MotionSense), |
| [QW256B] "m" (QW256B), |
| [YMask] "m" (YMask), |
| [UVMask] "m" (UVMask), |
| [LoopCtr] "m" (LoopCtr), |
| [QW256] "m" (QW256), |
| [oldbx] "m" (oldbx) |
| : XAX, XCX, XDX, XSI, XDI, |
| "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", |
| #ifdef __MMX__ |
| "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", |
| #endif |
| "memory", "cc"); |
| } |
| |