| #include <string.h> |
| #include <math.h> |
| |
| // Define a few macros for CPU dependent instructions. |
| // I suspect I don't really understand how the C macro preprocessor works but |
| // this seems to get the job done. // TRB 7/01 |
| |
| // BEFORE USING THESE YOU MUST SET: |
| |
| // #define SIMD_TYPE MMXEXT (or MMX or 3DNOW) |
| |
| // Macros for the packed unsigned byte average (pavgb instruction), selected by SIMD_TYPE. |
| // V_PAVGB(mmr1, mmr2, mmr work register, smask): result is left in mmr1; mmr2 may = mmrw if you can trash it. Only the MMX form uses mmrw and smask. |
| |
| #define V_PAVGB_MMX(mmr1, mmr2, mmrw, smask) \ |
| "movq "mmr2", "mmrw"\n\t" /* mmrw = copy of mmr2 */ \ |
| "pand "smask", "mmrw"\n\t" /* mask the copy; smask presumably clears bit 0 of every byte so the word shift stays per-byte - TODO confirm at its definition */ \ |
| "psrlw $1, "mmrw"\n\t" /* shift the masked copy right by 1 (word-wise) */ \ |
| "pand "smask", "mmr1"\n\t" /* same masking for mmr1 */ \ |
| "psrlw $1, "mmr1"\n\t" /* and the same shift */ \ |
| "paddusb "mmrw", "mmr1"\n\t" /* mmr1 = sum of the two shifted values; NOTE(review): low bits are dropped first, so rounding can differ from a native pavgb */ |
| #define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t" /* native pavgb; mmrw and smask are ignored */ |
| #define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t" /* native pavgusb; mmrw and smask are ignored */ |
| #define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE) /* entry point: hands SIMD_TYPE to the dispatch chain */ |
| #define V_PAVGB2(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) /* extra level: an operand of ## is not macro-expanded, so SIMD_TYPE must be expanded here first */ |
| #define V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB_##simd_type(mmr1, mmr2, mmrw, smask) /* paste the expanded value to pick V_PAVGB_MMX / _MMXEXT / _3DNOW */ |
| |
| // Macros for the packed unsigned byte maximum (pmaxub instruction): V_PMAXUB(mmr1, mmr2) leaves max(mmr1, mmr2) per byte in mmr1; mmr2 is only read. |
| #define V_PMAXUB_MMX(mmr1, mmr2) \ |
| "psubusb "mmr2", "mmr1"\n\t" /* mmr1 = mmr1 - mmr2, saturated at 0 */ \ |
| "paddusb "mmr2", "mmr1"\n\t" /* adding mmr2 back yields max(mmr1, mmr2) per byte */ |
| #define V_PMAXUB_MMXEXT(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t" /* native instruction */ |
| #define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // no dedicated 3DNOW form here; reuse the MMX emulation |
| #define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SIMD_TYPE) /* entry point: hands SIMD_TYPE to the dispatch chain */ |
| #define V_PMAXUB2(mmr1, mmr2, simd_type) V_PMAXUB3(mmr1, mmr2, simd_type) /* extra level so SIMD_TYPE is macro-expanded before the ## paste */ |
| #define V_PMAXUB3(mmr1, mmr2, simd_type) V_PMAXUB_##simd_type(mmr1, mmr2) /* paste to pick V_PMAXUB_MMX / _MMXEXT / _3DNOW */ |
| |
| // Macros for the packed unsigned byte minimum (pminub instruction): the result min(mmr1, mmr2) per byte is left in mmr1. |
| // V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw: the MMX form overwrites mmrw with all ones before it reads mmr2. |
| #define V_PMINUB_MMX(mmr1, mmr2, mmrw) \ |
| "pcmpeqb "mmrw", "mmrw"\n\t" /* mmrw = all ones (0xFF in every byte) */ \ |
| "psubusb "mmr2", "mmrw"\n\t" /* mmrw = 255 - mmr2 */ \ |
| "paddusb "mmrw", "mmr1"\n\t" /* mmr1 = mmr1 + (255 - mmr2), saturated at 255 */ \ |
| "psubusb "mmrw", "mmr1"\n\t" /* removing (255 - mmr2) again leaves min(mmr1, mmr2) per byte */ |
| #define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t" /* native instruction; mmrw is ignored */ |
| #define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // no dedicated 3DNOW form here; reuse the MMX emulation |
| #define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE) /* entry point: hands SIMD_TYPE to the dispatch chain */ |
| #define V_PMINUB2(mmr1, mmr2, mmrw, simd_type) V_PMINUB3(mmr1, mmr2, mmrw, simd_type) /* extra level so SIMD_TYPE is macro-expanded before the ## paste */ |
| #define V_PMINUB3(mmr1, mmr2, mmrw, simd_type) V_PMINUB_##simd_type(mmr1, mmr2, mmrw) /* paste to pick V_PMINUB_MMX / _MMXEXT / _3DNOW */ |
| |
| // Macros for the movntq instruction (8-byte store), selected by SIMD_TYPE. |
| // V_MOVNTQ(mmr1, mmr2): mmr1 is the destination and mmr2 the source (the string is emitted in AT&T order: source first). |
| #define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" /* plain movq */ |
| #define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" /* plain movq, same as the MMX form */ |
| #define V_MOVNTQ_MMXEXT(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t" /* real movntq (non-temporal store) */ |
| #define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE) /* entry point: hands SIMD_TYPE to the dispatch chain */ |
| #define V_MOVNTQ2(mmr1, mmr2, simd_type) V_MOVNTQ3(mmr1, mmr2, simd_type) /* extra level so SIMD_TYPE is macro-expanded before the ## paste */ |
| #define V_MOVNTQ3(mmr1, mmr2, simd_type) V_MOVNTQ_##simd_type(mmr1, mmr2) /* paste to pick V_MOVNTQ_MMX / _MMXEXT / _3DNOW */ |
| |
| // end of macros |
| |
| #ifdef IS_SSE2 |
| |
| #define MERGE4PIXavg(PADDR1, PADDR2) /* SSE2: weight = abs diff of the bytes at PADDR1 and PADDR2; where it is <= the running weight in xmm7, put their average in xmm5 and the weight in xmm7. Clobbers xmm0-xmm3. */ \ |
| "movdqu "PADDR1", %%xmm0\n\t" /* xmm0 = 16 bytes at PADDR1 (pixel1) */ \ |
| "movdqu "PADDR2", %%xmm1\n\t" /* xmm1 = 16 bytes at PADDR2 (pixel2) */ \ |
| "movdqa %%xmm0, %%xmm2\n\t" /* another copy of our pixel1 value */ \ |
| "movdqa %%xmm1, %%xmm3\n\t" /* another copy of our pixel2 value */ \ |
| "psubusb %%xmm1, %%xmm2\n\t" /* xmm2 = pixel1 - pixel2, saturated at 0 */ \ |
| "psubusb %%xmm0, %%xmm3\n\t" /* xmm3 = pixel2 - pixel1, saturated at 0 */ \ |
| "por %%xmm3, %%xmm2\n\t" /* xmm2 = abs(pixel1 - pixel2) per byte: the new weights */ \ |
| "pavgb %%xmm1, %%xmm0\n\t" /* avg of 2 pixels */ \ |
| "movdqa %%xmm2, %%xmm3\n\t" /* another copy of our new weights */ \ |
| "pxor %%xmm1, %%xmm1\n\t" /* xmm1 = 0 */ \ |
| "psubusb %%xmm7, %%xmm3\n\t" /* nonzero where old weights (xmm7) lower, else 0 */ \ |
| "pcmpeqb %%xmm1, %%xmm3\n\t" /* now ff where new better (new <= old, ties go to new), else 00 */ \ |
| "pcmpeqb %%xmm3, %%xmm1\n\t" /* here ff where old better, else 00 */ \ |
| "pand %%xmm3, %%xmm0\n\t" /* keep only better new pixels */ \ |
| "pand %%xmm3, %%xmm2\n\t" /* and weights */ \ |
| "pand %%xmm1, %%xmm5\n\t" /* keep only better old pixels */ \ |
| "pand %%xmm1, %%xmm7\n\t" /* and their old weights */ \ |
| "por %%xmm0, %%xmm5\n\t" /* and merge new & old vals */ \ |
| "por %%xmm2, %%xmm7\n\t" /* and merge new & old weights */ |
| |
| #define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) /* SSE2: like MERGE4PIXavg, but pixel1 = avg(PADDR1A, PADDR1B) and pixel2 = avg(PADDR2A, PADDR2B). Updates xmm5 (pixels) and xmm7 (weights); clobbers xmm0-xmm3. */ \ |
| "movdqu "PADDR1A", %%xmm0\n\t" /* xmm0 = 16 bytes at PADDR1A */ \ |
| "movdqu "PADDR2A", %%xmm1\n\t" /* xmm1 = 16 bytes at PADDR2A */ \ |
| "movdqu "PADDR1B", %%xmm2\n\t" /* xmm2 = 16 bytes at PADDR1B */ \ |
| "movdqu "PADDR2B", %%xmm3\n\t" /* xmm3 = 16 bytes at PADDR2B */ \ |
| "pavgb %%xmm2, %%xmm0\n\t" /* xmm0 = pixel1 = avg of the 1A and 1B bytes */ \ |
| "pavgb %%xmm3, %%xmm1\n\t" /* xmm1 = pixel2 = avg of the 2A and 2B bytes */ \ |
| "movdqa %%xmm0, %%xmm2\n\t" /* another copy of our pixel1 value */ \ |
| "movdqa %%xmm1, %%xmm3\n\t" /* another copy of our pixel2 value */ \ |
| "psubusb %%xmm1, %%xmm2\n\t" /* xmm2 = pixel1 - pixel2, saturated at 0 */ \ |
| "psubusb %%xmm0, %%xmm3\n\t" /* xmm3 = pixel2 - pixel1, saturated at 0 */ \ |
| "por %%xmm3, %%xmm2\n\t" /* xmm2 = abs(pixel1 - pixel2) per byte: the new weights */ \ |
| "pavgb %%xmm1, %%xmm0\n\t" /* avg of 2 pixels */ \ |
| "movdqa %%xmm2, %%xmm3\n\t" /* another copy of our new weights */ \ |
| "pxor %%xmm1, %%xmm1\n\t" /* xmm1 = 0 */ \ |
| "psubusb %%xmm7, %%xmm3\n\t" /* nonzero where old weights (xmm7) lower, else 0 */ \ |
| "pcmpeqb %%xmm1, %%xmm3\n\t" /* now ff where new better (new <= old, ties go to new), else 00 */ \ |
| "pcmpeqb %%xmm3, %%xmm1\n\t" /* here ff where old better, else 00 */ \ |
| "pand %%xmm3, %%xmm0\n\t" /* keep only better new pixels */ \ |
| "pand %%xmm3, %%xmm2\n\t" /* and weights */ \ |
| "pand %%xmm1, %%xmm5\n\t" /* keep only better old pixels */ \ |
| "pand %%xmm1, %%xmm7\n\t" /* and their old weights */ \ |
| "por %%xmm0, %%xmm5\n\t" /* and merge new & old vals */ \ |
| "por %%xmm2, %%xmm7\n\t" /* and merge new & old weights */ |
| |
| #define RESET_CHROMA "por "_UVMask", %%xmm7\n\t" /* OR _UVMask into the running weights in xmm7; presumably forces the chroma bytes to the worst weight so any later merge replaces them - TODO confirm against the _UVMask definition */ |
| |
| #else // ifdef IS_SSE2 |
| |
| #define MERGE4PIXavg(PADDR1, PADDR2) /* MMX: weight = abs diff of the bytes at PADDR1 and PADDR2; where it is <= the running weight in mm7, put their average in mm5 and the weight in mm7. Clobbers mm0-mm3. */ \ |
| "movq "PADDR1", %%mm0\n\t" /* our 4 pixels (8 bytes at PADDR1, pixel1) */ \ |
| "movq "PADDR2", %%mm1\n\t" /* our pixel2 value (8 bytes at PADDR2) */ \ |
| "movq %%mm0, %%mm2\n\t" /* another copy of our pixel1 value */ \ |
| "movq %%mm1, %%mm3\n\t" /* another copy of our pixel2 value */ \ |
| "psubusb %%mm1, %%mm2\n\t" /* mm2 = pixel1 - pixel2, saturated at 0 */ \ |
| "psubusb %%mm0, %%mm3\n\t" /* mm3 = pixel2 - pixel1, saturated at 0 */ \ |
| "por %%mm3, %%mm2\n\t" /* mm2 = abs(pixel1 - pixel2) per byte: the new weights */ \ |
| V_PAVGB ("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* avg of 2 pixels into mm0; mm3 is free scratch here (reloaded on the next line) */ \ |
| "movq %%mm2, %%mm3\n\t" /* another copy of our new weights */ \ |
| "pxor %%mm1, %%mm1\n\t" /* mm1 = 0 */ \ |
| "psubusb %%mm7, %%mm3\n\t" /* nonzero where old weights (mm7) lower, else 0 */ \ |
| "pcmpeqb %%mm1, %%mm3\n\t" /* now ff where new better (new <= old, ties go to new), else 00 */ \ |
| "pcmpeqb %%mm3, %%mm1\n\t" /* here ff where old better, else 00 */ \ |
| "pand %%mm3, %%mm0\n\t" /* keep only better new pixels */ \ |
| "pand %%mm3, %%mm2\n\t" /* and weights */ \ |
| "pand %%mm1, %%mm5\n\t" /* keep only better old pixels */ \ |
| "pand %%mm1, %%mm7\n\t" /* and their old weights */ \ |
| "por %%mm0, %%mm5\n\t" /* and merge new & old vals */ \ |
| "por %%mm2, %%mm7\n\t" /* and merge new & old weights */ |
| |
| #define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) /* MMX: like MERGE4PIXavg, but pixel1 = avg(PADDR1A, PADDR1B) and pixel2 = avg(PADDR2A, PADDR2B). Updates mm5 (pixels) and mm7 (weights); clobbers mm0-mm3. */ \ |
| "movq "PADDR1A", %%mm0\n\t" /* mm0 = 8 bytes at PADDR1A */ \ |
| "movq "PADDR2A", %%mm1\n\t" /* mm1 = 8 bytes at PADDR2A */ \ |
| "movq "PADDR1B", %%mm2\n\t" /* mm2 = 8 bytes at PADDR1B */ \ |
| "movq "PADDR2B", %%mm3\n\t" /* mm3 = 8 bytes at PADDR2B */ \ |
| V_PAVGB("%%mm0", "%%mm2", "%%mm2", _ShiftMask) /* mm0 = pixel1 = avg of 1A and 1B; mm2 doubles as the work register and may be trashed */ \ |
| V_PAVGB("%%mm1", "%%mm3", "%%mm3", _ShiftMask) /* mm1 = pixel2 = avg of 2A and 2B; mm3 doubles as the work register and may be trashed */ \ |
| "movq %%mm0, %%mm2\n\t" /* another copy of our pixel1 value */ \ |
| "movq %%mm1, %%mm3\n\t" /* another copy of our pixel2 value */ \ |
| "psubusb %%mm1, %%mm2\n\t" /* mm2 = pixel1 - pixel2, saturated at 0 */ \ |
| "psubusb %%mm0, %%mm3\n\t" /* mm3 = pixel2 - pixel1, saturated at 0 */ \ |
| "por %%mm3, %%mm2\n\t" /* mm2 = abs(pixel1 - pixel2) per byte: the new weights */ \ |
| V_PAVGB("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* avg of 2 pixels into mm0; mm3 is free scratch here (reloaded on the next line) */ \ |
| "movq %%mm2, %%mm3\n\t" /* another copy of our new weights */ \ |
| "pxor %%mm1, %%mm1\n\t" /* mm1 = 0 */ \ |
| "psubusb %%mm7, %%mm3\n\t" /* nonzero where old weights (mm7) lower, else 0 */ \ |
| "pcmpeqb %%mm1, %%mm3\n\t" /* now ff where new better (new <= old, ties go to new), else 00 */ \ |
| "pcmpeqb %%mm3, %%mm1\n\t" /* here ff where old better, else 00 */ \ |
| "pand %%mm3, %%mm0\n\t" /* keep only better new pixels */ \ |
| "pand %%mm3, %%mm2\n\t" /* and weights */ \ |
| "pand %%mm1, %%mm5\n\t" /* keep only better old pixels */ \ |
| "pand %%mm1, %%mm7\n\t" /* and their old weights */ \ |
| "por %%mm0, %%mm5\n\t" /* and merge new & old vals */ \ |
| "por %%mm2, %%mm7\n\t" /* and merge new & old weights */ |
| |
| #define RESET_CHROMA "por "_UVMask", %%mm7\n\t" /* OR _UVMask into the running weights in mm7; presumably forces the chroma bytes to the worst weight so any later merge replaces them - TODO confirm against the _UVMask definition */ |
| |
| #endif |
| |
| |