| /* |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, write to the Free Software |
| Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| |
| This file is a modified version of RTjpeg 0.1.2, (C) Justin Schoeman 1998 |
| |
| (991101) Wim Taymans : added MMX dct and idct from Intel's site. |
| */ |
| |
| |
| /* |
| |
| Main Routines |
| |
| This file contains most of the initialisation and control functions |
| |
| (C) Justin Schoeman 1998 |
| |
| */ |
| |
| #include <config.h> |
| #include <sys/types.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
/*
 * Fixed-width integer aliases used by the codec.
 *
 * The 32-bit aliases are based on int rather than long: long is 64 bits
 * wide on LP64 platforms, which would double the size of every 32-bit
 * table entry while the table storage in this file is laid out for
 * 4-byte entries.
 */
typedef unsigned char      __u8;
typedef signed char        __s8;
typedef unsigned short     __u16;
typedef signed short       __s16;
typedef unsigned int       __u32;
typedef signed int         __s32;
typedef unsigned long long __u64;
| |
| /*#define MMX_TRACE */ |
| |
| |
| #ifdef HAVE_LIBMMX |
| #include "mmx.h" |
| #endif |
| |
/*
 * Scan-order table: RTjpeg_ZZ[n] is the index inside the 8x8 block
 * (0..63) of the n-th coefficient visited by RTjpeg_b2s / RTjpeg_s2b.
 * The entries form a permutation of 0..63 that walks the block along
 * its anti-diagonals.
 */
static const unsigned char RTjpeg_ZZ[64] = {
     0,  8,  1,  2,  9, 16, 24, 17,
    10,  3,  4, 11, 18, 25, 32, 40,
    33, 26, 19, 12,  5,  6, 13, 20,
    27, 34, 41, 48, 56, 49, 42, 35,
    28, 21, 14,  7, 15, 22, 29, 36,
    43, 50, 57, 58, 51, 44, 37, 30,
    23, 31, 38, 45, 52, 59, 60, 53,
    46, 39, 47, 54, 61, 62, 55, 63
};
| |
| static const __u64 RTjpeg_aan_tab[64]={ |
| 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, |
| 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, |
| 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, |
| 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, |
| 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, |
| 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, |
| 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, |
| 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, |
| }; |
| |
| #ifndef HAVE_LIBMMX |
| static __s32 RTjpeg_ws[64+31]; |
| #endif |
| __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32]; |
| |
| __s16 *RTjpeg_block; |
| __s32 *RTjpeg_lqt; |
| __s32 *RTjpeg_cqt; |
| __u32 *RTjpeg_liqt; |
| __u32 *RTjpeg_ciqt; |
| |
| unsigned char RTjpeg_lb8; |
| unsigned char RTjpeg_cb8; |
| int RTjpeg_width, RTjpeg_height; |
| int RTjpeg_Ywidth, RTjpeg_Cwidth; |
| int RTjpeg_Ysize, RTjpeg_Csize; |
| |
| __s16 *RTjpeg_old=NULL; |
| |
| #ifdef HAVE_LIBMMX |
| mmx_t RTjpeg_lmask; |
| mmx_t RTjpeg_cmask; |
| #else |
| __u16 RTjpeg_lmask; |
| __u16 RTjpeg_cmask; |
| #endif |
| int RTjpeg_mtest=0; |
| |
/*
 * Base luminance quantiser step sizes, one per coefficient of the 8x8
 * block in row-major order.  The values match the example luminance
 * table of the JPEG standard (Annex K).
 */
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
    /* row 0 */  16,  11,  10,  16,  24,  40,  51,  61,
    /* row 1 */  12,  12,  14,  19,  26,  58,  60,  55,
    /* row 2 */  14,  13,  16,  24,  40,  57,  69,  56,
    /* row 3 */  14,  17,  22,  29,  51,  87,  80,  62,
    /* row 4 */  18,  22,  37,  56,  68, 109, 103,  77,
    /* row 5 */  24,  35,  55,  64,  81, 104, 113,  92,
    /* row 6 */  49,  64,  78,  87, 103, 121, 120, 101,
    /* row 7 */  72,  92,  95,  98, 112, 100, 103,  99
};
| |
/*
 * Base chrominance quantiser step sizes, one per coefficient of the 8x8
 * block in row-major order.  The values match the example chrominance
 * table of the JPEG standard (Annex K).
 */
static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
    /* row 0 */ 17, 18, 24, 47, 99, 99, 99, 99,
    /* row 1 */ 18, 21, 26, 66, 99, 99, 99, 99,
    /* row 2 */ 24, 26, 56, 99, 99, 99, 99, 99,
    /* row 3 */ 47, 66, 99, 99, 99, 99, 99, 99,
    /* row 4 */ 99, 99, 99, 99, 99, 99, 99, 99,
    /* row 5 */ 99, 99, 99, 99, 99, 99, 99, 99,
    /* row 6 */ 99, 99, 99, 99, 99, 99, 99, 99,
    /* row 7 */ 99, 99, 99, 99, 99, 99, 99, 99
};
| |
| int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) |
| { |
| register int ci, co=1, tmp; |
| register __s16 ZZvalue; |
| |
| (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); |
| |
| for(ci=1; ci<=bt8; ci++) |
| { |
| ZZvalue = data[RTjpeg_ZZ[ci]]; |
| |
| if(ZZvalue>0) |
| { |
| strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; |
| } |
| else |
| { |
| strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; |
| } |
| } |
| |
| for(; ci<64; ci++) |
| { |
| ZZvalue = data[RTjpeg_ZZ[ci]]; |
| |
| if(ZZvalue>0) |
| { |
| strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue; |
| } |
| else if(ZZvalue<0) |
| { |
| strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue; |
| } |
| else /* compress zeros */ |
| { |
| tmp=ci; |
| do |
| { |
| ci++; |
| } |
| while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0)); |
| |
| strm[co++]=(__s8)(63+(ci-tmp)); |
| ci--; |
| } |
| } |
| return (int)co; |
| } |
| |
| int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) |
| { |
| int ci=1, co=1, tmp; |
| register int i; |
| |
| i=RTjpeg_ZZ[0]; |
| data[i]=((__u8)strm[0])*qtbl[i]; |
| |
| for(co=1; co<=bt8; co++) |
| { |
| i=RTjpeg_ZZ[co]; |
| data[i]=strm[ci++]*qtbl[i]; |
| } |
| |
| for(; co<64; co++) |
| { |
| if(strm[ci]>63) |
| { |
| tmp=co+strm[ci]-63; |
| for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0; |
| co--; |
| } else |
| { |
| i=RTjpeg_ZZ[co]; |
| data[i]=strm[ci]*qtbl[i]; |
| } |
| ci++; |
| } |
| return (int)ci; |
| } |
| |
| #if defined(HAVE_LIBMMX) |
| void RTjpeg_quant_init(void) |
| { |
| int i; |
| __s16 *qtbl; |
| |
| qtbl=(__s16 *)RTjpeg_lqt; |
| for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i]; |
| |
| qtbl=(__s16 *)RTjpeg_cqt; |
| for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i]; |
| } |
| |
| static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL; |
| static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL; |
| |
| void RTjpeg_quant(__s16 *block, __s32 *qtbl) |
| { |
| int i; |
| mmx_t *bl, *ql; |
| |
| ql=(mmx_t *)qtbl; |
| bl=(mmx_t *)block; |
| |
| movq_m2r(RTjpeg_ones, mm6); |
| movq_m2r(RTjpeg_half, mm7); |
| |
| for(i=16; i; i--) |
| { |
| movq_m2r(*(ql++), mm0); /* quant vals (4) */ |
| movq_m2r(*bl, mm2); /* block vals (4) */ |
| movq_r2r(mm0, mm1); |
| movq_r2r(mm2, mm3); |
| |
| punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */ |
| punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */ |
| |
| punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */ |
| punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */ |
| |
| pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */ |
| pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */ |
| |
| psrad_i2r(16, mm0); |
| psrad_i2r(16, mm1); |
| |
| packssdw_r2r(mm1, mm0); |
| |
| movq_r2m(mm0, *(bl++)); |
| |
| } |
| } |
| #else |
/*
 * Non-MMX build: the C quantiser reads the tables as they are, so no
 * preparation is required.  Kept so callers need not care which build
 * they are linked against.
 */
void RTjpeg_quant_init(void)
{
    return;
}
| |
| void RTjpeg_quant(__s16 *block, __s32 *qtbl) |
| { |
| int i; |
| |
| for(i=0; i<64; i++) |
| block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16); |
| } |
| #endif |
| |
| /* |
| * Perform the forward DCT on one block of samples. |
| */ |
#ifdef HAVE_LIBMMX
/*
 * DCT multipliers for the MMX code, one copy in each of the four words.
 * They are the constants 0.707106781, 0.382683433, 0.541196100 and
 * 1.306562965 scaled by 2^14; the MMX code shifts its operand left by 2
 * before pmulhw (which keeps the high 16 bits), giving a net >> 14.
 */
static mmx_t RTjpeg_C4   =(mmx_t)(long long)0x2D412D412D412D41LL;
static mmx_t RTjpeg_C6   =(mmx_t)(long long)0x187E187E187E187ELL;
static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;

#else

/* The same multipliers for the C code, scaled by 2^8 (FIX(x) = x * 256,
 * rounded). */
#define FIX_0_382683433  ((__s32)   98)		/* FIX(0.382683433) */
#define FIX_0_541196100  ((__s32)  139)		/* FIX(0.541196100) */
#define FIX_0_707106781  ((__s32)  181)		/* FIX(0.707106781) */
#define FIX_1_306562965  ((__s32)  334)		/* FIX(1.306562965) */

/* Round and drop 8 (DESCALE10) or 16 (DESCALE20) fractional bits.  The
 * whole expansion is parenthesised so the cast cannot bind to anything
 * that follows the macro call. */
#define DESCALE10(x) ((__s16)(((x)+128) >> 8))
#define DESCALE20(x) ((__s16)(((x)+32768) >> 16))

/* Fixed-point product; the result keeps the scale of its two factors. */
#define D_MULTIPLY(var,cst) ((__s32) ((var) * (cst)))
#endif
| |
| void RTjpeg_dct_init(void) |
| { |
| int i; |
| |
| for(i=0; i<64; i++) |
| { |
| RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]); |
| RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]); |
| } |
| } |
| |
| void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) |
| { |
| #ifndef HAVE_LIBMMX |
| __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| __s32 tmp10, tmp11, tmp12, tmp13; |
| __s32 z1, z2, z3, z4, z5, z11, z13; |
| __u8 *idataptr; |
| __s16 *odataptr; |
| __s32 *wsptr; |
| int ctr; |
| |
| idataptr = idata; |
| wsptr = RTjpeg_ws; |
| for (ctr = 7; ctr >= 0; ctr--) { |
| tmp0 = idataptr[0] + idataptr[7]; |
| tmp7 = idataptr[0] - idataptr[7]; |
| tmp1 = idataptr[1] + idataptr[6]; |
| tmp6 = idataptr[1] - idataptr[6]; |
| tmp2 = idataptr[2] + idataptr[5]; |
| tmp5 = idataptr[2] - idataptr[5]; |
| tmp3 = idataptr[3] + idataptr[4]; |
| tmp4 = idataptr[3] - idataptr[4]; |
| |
| tmp10 = (tmp0 + tmp3); /* phase 2 */ |
| tmp13 = tmp0 - tmp3; |
| tmp11 = (tmp1 + tmp2); |
| tmp12 = tmp1 - tmp2; |
| |
| wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */ |
| wsptr[4] = (tmp10 - tmp11)<<8; |
| |
| z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ |
| wsptr[2] = (tmp13<<8) + z1; /* phase 5 */ |
| wsptr[6] = (tmp13<<8) - z1; |
| |
| tmp10 = tmp4 + tmp5; /* phase 2 */ |
| tmp11 = tmp5 + tmp6; |
| tmp12 = tmp6 + tmp7; |
| |
| z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ |
| z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ |
| z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ |
| z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ |
| |
| z11 = (tmp7<<8) + z3; /* phase 5 */ |
| z13 = (tmp7<<8) - z3; |
| |
| wsptr[5] = z13 + z2; /* phase 6 */ |
| wsptr[3] = z13 - z2; |
| wsptr[1] = z11 + z4; |
| wsptr[7] = z11 - z4; |
| |
| idataptr += rskip<<3; /* advance pointer to next row */ |
| wsptr += 8; |
| } |
| |
| wsptr = RTjpeg_ws; |
| odataptr=odata; |
| for (ctr = 7; ctr >= 0; ctr--) { |
| tmp0 = wsptr[0] + wsptr[56]; |
| tmp7 = wsptr[0] - wsptr[56]; |
| tmp1 = wsptr[8] + wsptr[48]; |
| tmp6 = wsptr[8] - wsptr[48]; |
| tmp2 = wsptr[16] + wsptr[40]; |
| tmp5 = wsptr[16] - wsptr[40]; |
| tmp3 = wsptr[24] + wsptr[32]; |
| tmp4 = wsptr[24] - wsptr[32]; |
| |
| tmp10 = tmp0 + tmp3; /* phase 2 */ |
| tmp13 = tmp0 - tmp3; |
| tmp11 = tmp1 + tmp2; |
| tmp12 = tmp1 - tmp2; |
| |
| odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */ |
| odataptr[32] = DESCALE10(tmp10 - tmp11); |
| |
| z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ |
| odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */ |
| odataptr[48] = DESCALE20((tmp13<<8) - z1); |
| |
| tmp10 = tmp4 + tmp5; /* phase 2 */ |
| tmp11 = tmp5 + tmp6; |
| tmp12 = tmp6 + tmp7; |
| |
| z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ |
| z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ |
| z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ |
| z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ |
| |
| z11 = (tmp7<<8) + z3; /* phase 5 */ |
| z13 = (tmp7<<8) - z3; |
| |
| odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */ |
| odataptr[24] = DESCALE20(z13 - z2); |
| odataptr[8] = DESCALE20(z11 + z4); |
| odataptr[56] = DESCALE20(z11 - z4); |
| |
| odataptr++; /* advance pointer to next column */ |
| wsptr++; |
| } |
| #else |
| mmx_t tmp6, tmp7; |
| register mmx_t *dataptr = (mmx_t *)odata; |
| mmx_t *idata2 = (mmx_t *)idata; |
| |
| /* first copy the input 8 bit to the destination 16 bits */ |
| |
| movq_m2r(RTjpeg_zero, mm2); |
| |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+1)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+2)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+3)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+4)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+5)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+6)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+7)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+8)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+9)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+10)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+11)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+12)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+13)); |
| |
| idata2 += rskip; |
| |
| movq_m2r(*idata2, mm0); |
| movq_r2r(mm0, mm1); |
| |
| punpcklbw_r2r(mm2, mm0); |
| movq_r2m(mm0, *(dataptr+14)); |
| |
| punpckhbw_r2r(mm2, mm1); |
| movq_r2m(mm1, *(dataptr+15)); |
| |
| /* Start Transpose to do calculations on rows */ |
| |
| movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into m5 */ |
| |
| movq_m2r(*(dataptr+13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */ |
| movq_r2r(mm7, mm5); |
| |
| punpcklwd_m2r(*(dataptr+11), mm7); /* m11:m01|m10:m00 - interleave first and second lines */ |
| movq_r2r(mm6, mm2); |
| |
| punpcklwd_m2r(*(dataptr+15), mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */ |
| movq_r2r(mm7, mm1); |
| |
| movq_m2r(*(dataptr+11), mm3); /* m13:m13|m11:m10 - second line */ |
| punpckldq_r2r(mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */ |
| |
| movq_m2r(*(dataptr+15), mm0); /* m13:m13|m11:m10 - fourth line */ |
| punpckhdq_r2r(mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ |
| |
| movq_r2m(mm7,*(dataptr+9)); /* write result 1 */ |
| punpckhwd_r2r(mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */ |
| |
| movq_r2m(mm1,*(dataptr+11)); /* write result 2 */ |
| punpckhwd_r2r(mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */ |
| |
| movq_r2r(mm5, mm1); |
| punpckldq_r2r(mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */ |
| |
| movq_m2r(*(dataptr+1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */ |
| punpckhdq_r2r(mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */ |
| |
| movq_r2m(mm5,*(dataptr+13)); /* write result 3 */ |
| |
| /* last 4x4 done */ |
| |
| movq_r2m(mm1, *(dataptr+15)); /* write result 4, last 4x4 */ |
| |
| movq_m2r(*(dataptr+5), mm2); /* m23:m22|m21:m20 - third line */ |
| movq_r2r(mm0, mm6); |
| |
| punpcklwd_m2r(*(dataptr+3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ |
| movq_r2r(mm2, mm7); |
| |
| punpcklwd_m2r(*(dataptr+7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */ |
| movq_r2r(mm0, mm4); |
| |
| |
| movq_m2r(*(dataptr+8), mm1); /* n03:n02|n01:n00 - first line */ |
| punpckldq_r2r(mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */ |
| |
| movq_m2r(*(dataptr+12), mm3); /* n23:n22|n21:n20 - third line */ |
| punpckhdq_r2r(mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */ |
| |
| punpckhwd_m2r(*(dataptr+3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */ |
| movq_r2r(mm1, mm2); /* copy first line */ |
| |
| punpckhwd_m2r(*(dataptr+7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */ |
| movq_r2r(mm6, mm5); /* copy first intermediate result */ |
| |
| movq_r2m(mm0, *(dataptr+8)); /* write result 1 */ |
| punpckhdq_r2r(mm7, mm5); /* m33:m23|m13:m03 - produce third result */ |
| |
| punpcklwd_m2r(*(dataptr+10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */ |
| movq_r2r(mm3, mm0); /* copy third line */ |
| |
| punpckhwd_m2r(*(dataptr+10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */ |
| |
| movq_r2m(mm4, *(dataptr+10)); /* write result 2 out */ |
| punpckldq_r2r(mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */ |
| |
| punpcklwd_m2r(*(dataptr+14), mm3); /* n31:n21|n30:n20 - interleave third and fourth lines */ |
| movq_r2r(mm1, mm4); |
| |
| movq_r2m(mm6, *(dataptr+12)); /* write result 3 out */ |
| punpckldq_r2r(mm3, mm1); /* n30:n20|n10:n00 - produce first result */ |
| |
| punpckhwd_m2r(*(dataptr+14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */ |
| movq_r2r(mm2, mm6); |
| |
| movq_r2m(mm5, *(dataptr+14)); /* write result 4 out */ |
| punpckhdq_r2r(mm3, mm4); /* n31:n21|n11:n01- produce second result */ |
| |
| movq_r2m(mm1, *(dataptr+1)); /* write result 5 out - (first result for other 4 x 4 block) */ |
| punpckldq_r2r(mm0, mm2); /* n32:n22|n12:n02- produce third result */ |
| |
| movq_r2m(mm4, *(dataptr+3)); /* write result 6 out */ |
| punpckhdq_r2r(mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */ |
| |
| movq_r2m(mm2, *(dataptr+5)); /* write result 7 out*/ |
| |
| movq_m2r(*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */ |
| |
| movq_r2m(mm6, *(dataptr+7)); /* write result 8 out */ |
| |
| |
| /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */ |
| |
| movq_m2r(*(dataptr+4), mm7); /* m23:m22|m21:m20 - third line */ |
| movq_r2r(mm0, mm2); |
| |
| punpcklwd_m2r(*(dataptr+2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ |
| movq_r2r(mm7, mm4); |
| |
| punpcklwd_m2r(*(dataptr+6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */ |
| movq_r2r(mm0, mm1); |
| |
| movq_m2r(*(dataptr+2), mm6); /* m13:m12|m11:m10 - second line */ |
| punpckldq_r2r(mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */ |
| |
| movq_m2r(*(dataptr+6), mm5); /* m33:m32|m31:m30 - fourth line */ |
| punpckhdq_r2r(mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ |
| |
| movq_r2r(mm0, mm7); /* write result 1 */ |
| punpckhwd_r2r(mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */ |
| |
| psubw_m2r(*(dataptr+14), mm7); /* tmp07=x0-x7: Stage 1 */ |
| movq_r2r(mm1, mm6); /* write result 2 */ |
| |
| paddw_m2r(*(dataptr+14), mm0); /* tmp00=x0+x7: Stage 1 */ |
| punpckhwd_r2r(mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */ |
| |
| paddw_m2r(*(dataptr+12), mm1); /* tmp01=x1+x6: Stage 1 */ |
| movq_r2r(mm2, mm3); /* copy first intermediate result */ |
| |
| psubw_m2r(*(dataptr+12), mm6); /* tmp06=x1-x6: Stage 1 */ |
| punpckldq_r2r(mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */ |
| |
| movq_r2m(mm7, tmp7); |
| movq_r2r(mm2, mm5); /* write result 3 */ |
| |
| movq_r2m(mm6, tmp6); |
| punpckhdq_r2r(mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */ |
| |
| paddw_m2r(*(dataptr+10), mm2); /* tmp02=x2+5: Stage 1 */ |
| movq_r2r(mm3, mm4); /* write result 4 */ |
| |
| /************************************************************************************************ |
| End of Transpose |
| ************************************************************************************************/ |
| |
| |
| paddw_m2r(*(dataptr+8), mm3); /* tmp03=x3+x4: stage 1 */ |
| movq_r2r(mm0, mm7); |
| |
| psubw_m2r(*(dataptr+8), mm4); /* tmp04=x3-x4: stage 1 */ |
| movq_r2r(mm1, mm6); |
| |
| paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03: even 2 */ |
| psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03: even 2 */ |
| |
| psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02: even 2 */ |
| paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02: even 2 */ |
| |
| psubw_m2r(*(dataptr+10), mm5); /* tmp05=x2-x5: stage 1 */ |
| paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ |
| |
| /* stage 3 */ |
| |
| movq_m2r(tmp6, mm2); |
| movq_r2r(mm0, mm3); |
| |
| psllw_i2r(2, mm6); /* m8 * 2^2 */ |
| paddw_r2r(mm1, mm0); |
| |
| pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ |
| psubw_r2r(mm1, mm3); |
| |
| movq_r2m(mm0, *dataptr); |
| movq_r2r(mm7, mm0); |
| |
| /* Odd part */ |
| movq_r2m(mm3, *(dataptr+8)); |
| paddw_r2r(mm5, mm4); /* tmp10 */ |
| |
| movq_m2r(tmp7, mm3); |
| paddw_r2r(mm6, mm0); /* tmp32 */ |
| |
| paddw_r2r(mm2, mm5); /* tmp11 */ |
| psubw_r2r(mm6, mm7); /* tmp33 */ |
| |
| movq_r2m(mm0, *(dataptr+4)); |
| paddw_r2r(mm3, mm2); /* tmp12 */ |
| |
| /* stage 4 */ |
| |
| movq_r2m(mm7, *(dataptr+12)); |
| movq_r2r(mm4, mm1); /* copy of tmp10 */ |
| |
| psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ |
| psllw_i2r(2, mm4); /* m8 * 2^2 */ |
| |
| movq_m2r(RTjpeg_C2mC6, mm0); |
| psllw_i2r(2, mm1); |
| |
| pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ |
| psllw_i2r(2, mm2); |
| |
| pmulhw_r2r(mm0, mm4); /* z5 */ |
| |
| /* stage 5 */ |
| |
| pmulhw_m2r(RTjpeg_C2pC6, mm2); |
| psllw_i2r(2, mm5); |
| |
| pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ |
| movq_r2r(mm3, mm0); /* copy tmp7 */ |
| |
| movq_m2r(*(dataptr+1), mm7); |
| paddw_r2r(mm1, mm4); /* z2 */ |
| |
| paddw_r2r(mm1, mm2); /* z4 */ |
| |
| paddw_r2r(mm5, mm0); /* z11 */ |
| psubw_r2r(mm5, mm3); /* z13 */ |
| |
| /* stage 6 */ |
| |
| movq_r2r(mm3, mm5); /* copy z13 */ |
| psubw_r2r(mm4, mm3); /* y3=z13 - z2 */ |
| |
| paddw_r2r(mm4, mm5); /* y5=z13 + z2 */ |
| movq_r2r(mm0, mm6); /* copy z11 */ |
| |
| movq_r2m(mm3, *(dataptr+6)); /*save y3 */ |
| psubw_r2r(mm2, mm0); /* y7=z11 - z4 */ |
| |
| movq_r2m(mm5, *(dataptr+10)); /*save y5 */ |
| paddw_r2r(mm2, mm6); /* y1=z11 + z4 */ |
| |
| movq_r2m(mm0, *(dataptr+14)); /*save y7 */ |
| |
| /************************************************ |
| * End of 1st 4 rows |
| ************************************************/ |
| |
| movq_m2r(*(dataptr+3), mm1); /* load x1: stage 1 */ |
| movq_r2r(mm7, mm0); /* copy x0 */ |
| |
| movq_r2m(mm6, *(dataptr+2)); /*save y1 */ |
| |
| movq_m2r(*(dataptr+5), mm2); /* load x2: stage 1 */ |
| movq_r2r(mm1, mm6); /* copy x1 */ |
| |
| paddw_m2r(*(dataptr+15), mm0); /* tmp00 = x0 + x7 */ |
| |
| movq_m2r(*(dataptr+7), mm3); /* load x3 : stage 1 */ |
| movq_r2r(mm2, mm5); /* copy x2 */ |
| |
| psubw_m2r(*(dataptr+15), mm7); /* tmp07 = x0 - x7 */ |
| movq_r2r(mm3, mm4); /* copy x3 */ |
| |
| paddw_m2r(*(dataptr+13), mm1); /* tmp01 = x1 + x6 */ |
| |
| movq_r2m(mm7, tmp7); /* save tmp07 */ |
| movq_r2r(mm0, mm7); /* copy tmp00 */ |
| |
| psubw_m2r(*(dataptr+13), mm6); /* tmp06 = x1 - x6 */ |
| |
| /* stage 2, Even Part */ |
| |
| paddw_m2r(*(dataptr+9), mm3); /* tmp03 = x3 + x4 */ |
| |
| movq_r2m(mm6, tmp6); /* save tmp07 */ |
| movq_r2r(mm1, mm6); /* copy tmp01 */ |
| |
| paddw_m2r(*(dataptr+11), mm2); /* tmp02 = x2 + x5 */ |
| paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 */ |
| |
| psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 */ |
| |
| psubw_m2r(*(dataptr+9), mm4); /* tmp04 = x3 - x4 */ |
| psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 */ |
| |
| paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 */ |
| |
| psubw_m2r(*(dataptr+11), mm5); /* tmp05 = x2 - x5 */ |
| paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ |
| |
| /* stage 3, Even and stage 4 & 5 even */ |
| |
| movq_m2r(tmp6, mm2); /* load tmp6 */ |
| movq_r2r(mm0, mm3); /* copy tmp10 */ |
| |
| psllw_i2r(2, mm6); /* shift z1 */ |
| paddw_r2r(mm1, mm0); /* y0=tmp10 + tmp11 */ |
| |
| pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ |
| psubw_r2r(mm1, mm3); /* y4=tmp10 - tmp11 */ |
| |
| movq_r2m(mm0, *(dataptr+1)); /*save y0 */ |
| movq_r2r(mm7, mm0); /* copy tmp13 */ |
| |
| /* odd part */ |
| |
| movq_r2m(mm3, *(dataptr+9)); /*save y4 */ |
| paddw_r2r(mm5, mm4); /* tmp10 = tmp4 + tmp5 */ |
| |
| movq_m2r(tmp7, mm3); /* load tmp7 */ |
| paddw_r2r(mm6, mm0); /* tmp32 = tmp13 + z1 */ |
| |
| paddw_r2r(mm2, mm5); /* tmp11 = tmp5 + tmp6 */ |
| psubw_r2r(mm6, mm7); /* tmp33 = tmp13 - z1 */ |
| |
| movq_r2m(mm0, *(dataptr+5)); /*save y2 */ |
| paddw_r2r(mm3, mm2); /* tmp12 = tmp6 + tmp7 */ |
| |
| /* stage 4 */ |
| |
| movq_r2m(mm7, *(dataptr+13)); /*save y6 */ |
| movq_r2r(mm4, mm1); /* copy tmp10 */ |
| |
| psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ |
| psllw_i2r(2, mm4); /* shift tmp10 */ |
| |
| movq_m2r(RTjpeg_C2mC6, mm0); /* load C2mC6 */ |
| psllw_i2r(2, mm1); /* shift (tmp10-tmp12) */ |
| |
| pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ |
| psllw_i2r(2, mm5); /* prepare for multiply */ |
| |
| pmulhw_r2r(mm0, mm4); /* multiply by converted real */ |
| |
| /* stage 5 */ |
| |
| pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ |
| psllw_i2r(2, mm2); /* prepare for multiply */ |
| |
| pmulhw_m2r(RTjpeg_C2pC6, mm2); /* multiply */ |
| movq_r2r(mm3, mm0); /* copy tmp7 */ |
| |
| movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */ |
| paddw_r2r(mm1, mm4); /* z2 */ |
| |
| paddw_r2r(mm5, mm0); /* z11 */ |
| psubw_r2r(mm5, mm3); /* z13 */ |
| |
| /* stage 6 */ |
| |
| movq_r2r(mm3, mm5); /* copy z13 */ |
| paddw_r2r(mm1, mm2); /* z4 */ |
| |
| movq_r2r(mm0, mm6); /* copy z11 */ |
| psubw_r2r(mm4, mm5); /* y3 */ |
| |
| paddw_r2r(mm2, mm6); /* y1 */ |
| paddw_r2r(mm4, mm3); /* y5 */ |
| |
| movq_r2m(mm5, *(dataptr+7)); /*save y3 */ |
| |
| movq_r2m(mm6, *(dataptr+3)); /*save y1 */ |
| psubw_r2r(mm2, mm0); /* y7 */ |
| |
| /************************************************************************************************ |
| Start of Transpose |
| ************************************************************************************************/ |
| |
| movq_m2r(*(dataptr+13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */ |
| movq_r2r(mm7, mm5); /* copy first line */ |
| |
| punpcklwd_r2r(mm3, mm7); /* m11:m01|m10:m00 - interleave first and second lines */ |
| movq_r2r(mm6, mm2); /* copy third line */ |
| |
| punpcklwd_r2r(mm0, mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */ |
| movq_r2r(mm7, mm1); /* copy first intermediate result */ |
| |
| punpckldq_r2r(mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */ |
| |
| punpckhdq_r2r(mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ |
| |
| movq_r2m(mm7, *(dataptr+9)); /* write result 1 */ |
| punpckhwd_r2r(mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */ |
| |
| movq_r2m(mm1, *(dataptr+11)); /* write result 2 */ |
| punpckhwd_r2r(mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */ |
| |
| movq_r2r(mm5, mm1); /* copy first intermediate result */ |
| punpckldq_r2r(mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */ |
| |
| movq_m2r(*(dataptr+1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */ |
| punpckhdq_r2r(mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */ |
| |
| movq_r2m(mm5, *(dataptr+13)); /* write result 3 */ |
| |
| /****** last 4x4 done */ |
| |
| movq_r2m(mm1, *(dataptr+15)); /* write result 4, last 4x4 */ |
| |
| movq_m2r(*(dataptr+5), mm2); /* m23:m22|m21:m20 - third line */ |
| movq_r2r(mm0, mm6); /* copy first line */ |
| |
| punpcklwd_m2r(*(dataptr+3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ |
| movq_r2r(mm2, mm7); /* copy third line */ |
| |
| punpcklwd_m2r(*(dataptr+7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */ |
| movq_r2r(mm0, mm4); /* copy first intermediate result */ |
| |
| |
| |
| movq_m2r(*(dataptr+8), mm1); /* n03:n02|n01:n00 - first line */ |
| punpckldq_r2r(mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */ |
| |
| movq_m2r(*(dataptr+12), mm3); /* n23:n22|n21:n20 - third line */ |
| punpckhdq_r2r(mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */ |
| |
| punpckhwd_m2r(*(dataptr+3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */ |
| movq_r2r(mm1, mm2); /* copy first line */ |
| |
| punpckhwd_m2r(*(dataptr+7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */ |
| movq_r2r(mm6, mm5); /* copy first intermediate result */ |
| |
| movq_r2m(mm0, *(dataptr+8)); /* write result 1 */ |
| punpckhdq_r2r(mm7, mm5); /* m33:m23|m13:m03 - produce third result */ |
| |
| punpcklwd_m2r(*(dataptr+10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */ |
| movq_r2r(mm3, mm0); /* copy third line */ |
| |
| punpckhwd_m2r(*(dataptr+10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */ |
| |
| movq_r2m(mm4, *(dataptr+10)); /* write result 2 out */ |
| punpckldq_r2r(mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */ |
| |
| punpcklwd_m2r(*(dataptr+14), mm3); /* n33:n23|n32:n22 - interleave third and fourth lines */ |
| movq_r2r(mm1, mm4); /* copy second intermediate result */ |
| |
| movq_r2m(mm6, *(dataptr+12)); /* write result 3 out */ |
| punpckldq_r2r(mm3, mm1); /* */ |
| |
| punpckhwd_m2r(*(dataptr+14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */ |
| movq_r2r(mm2, mm6); /* copy second intermediate result */ |
| |
| movq_r2m(mm5, *(dataptr+14)); /* write result 4 out */ |
| punpckhdq_r2r(mm3, mm4); /* n31:n21|n11:n01- produce second result */ |
| |
| movq_r2m(mm1, *(dataptr+1)); /* write result 5 out - (first result for other 4 x 4 block) */ |
| punpckldq_r2r(mm0, mm2); /* n32:n22|n12:n02- produce third result */ |
| |
| movq_r2m(mm4, *(dataptr+3)); /* write result 6 out */ |
| punpckhdq_r2r(mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */ |
| |
| movq_r2m(mm2, *(dataptr+5)); /* write result 7 out */ |
| |
| movq_m2r(*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */ |
| |
| movq_r2m(mm6, *(dataptr+7)); /* write result 8 out */ |
| |
| /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */ |
| |
| movq_m2r(*(dataptr+4), mm7); /* m23:m22|m21:m20 - third line */ |
| movq_r2r(mm0, mm2); /* copy first line */ |
| |
| punpcklwd_m2r(*(dataptr+2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ |
| movq_r2r(mm7, mm4); /* copy third line */ |
| |
| punpcklwd_m2r(*(dataptr+6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */ |
| movq_r2r(mm0, mm1); /* copy first intermediate result */ |
| |
| movq_m2r(*(dataptr+2), mm6); /* m13:m12|m11:m10 - second line */ |
| punpckldq_r2r(mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */ |
| |
| movq_m2r(*(dataptr+6), mm5); /* m33:m32|m31:m30 - fourth line */ |
| punpckhdq_r2r(mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ |
| |
| movq_r2r(mm0, mm7); /* write result 1 */ |
| punpckhwd_r2r(mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */ |
| |
| psubw_m2r(*(dataptr+14), mm7); /* tmp07=x0-x7: Stage 1 */ |
| movq_r2r(mm1, mm6); /* write result 2 */ |
| |
| paddw_m2r(*(dataptr+14), mm0); /* tmp00=x0+x7: Stage 1 */ |
| punpckhwd_r2r(mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */ |
| |
| paddw_m2r(*(dataptr+12), mm1); /* tmp01=x1+x6: Stage 1 */ |
| movq_r2r(mm2, mm3); /* copy first intermediate result */ |
| |
| psubw_m2r(*(dataptr+12), mm6); /* tmp06=x1-x6: Stage 1 */ |
| punpckldq_r2r(mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */ |
| |
| movq_r2m(mm7, tmp7); /* save tmp07 */ |
| movq_r2r(mm2, mm5); /* write result 3 */ |
| |
| movq_r2m(mm6, tmp6); /* save tmp06 */ |
| |
| punpckhdq_r2r(mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */ |
| |
| paddw_m2r(*(dataptr+10), mm2); /* tmp02=x2+x5: stage 1 */ |
| movq_r2r(mm3, mm4); /* write result 4 */ |
| |
| /************************************************************************************************ |
| End of Transpose 2 |
| ************************************************************************************************/ |
| |
| paddw_m2r(*(dataptr+8), mm3); /* tmp03=x3+x4: stage 1 */ |
| movq_r2r(mm0, mm7); |
| |
| psubw_m2r(*(dataptr+8), mm4); /* tmp04=x3-x4: stage 1 */ |
| movq_r2r(mm1, mm6); |
| |
| paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03: even 2 */ |
| psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03: even 2 */ |
| |
| psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02: even 2 */ |
| paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02: even 2 */ |
| |
| psubw_m2r(*(dataptr+10), mm5); /* tmp05=x2-x5: stage 1 */ |
| paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ |
| |
| /* stage 3 */ |
| |
| movq_m2r(tmp6, mm2); |
| movq_r2r(mm0, mm3); |
| |
| psllw_i2r(2, mm6); /* m8 * 2^2 */ |
| paddw_r2r(mm1, mm0); |
| |
| pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ |
| psubw_r2r(mm1, mm3); |
| |
| movq_r2m(mm0, *dataptr); |
| movq_r2r(mm7, mm0); |
| |
| /* Odd part */ |
| movq_r2m(mm3, *(dataptr+8)); |
| paddw_r2r(mm5, mm4); /* tmp10 */ |
| |
| movq_m2r(tmp7, mm3); |
| paddw_r2r(mm6, mm0); /* tmp32 */ |
| |
| paddw_r2r(mm2, mm5); /* tmp11 */ |
| psubw_r2r(mm6, mm7); /* tmp33 */ |
| |
| movq_r2m(mm0, *(dataptr+4)); |
| paddw_r2r(mm3, mm2); /* tmp12 */ |
| |
| /* stage 4 */ |
| movq_r2m(mm7, *(dataptr+12)); |
| movq_r2r(mm4, mm1); /* copy of tmp10 */ |
| |
| psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ |
| psllw_i2r(2, mm4); /* m8 * 2^2 */ |
| |
| movq_m2r(RTjpeg_C2mC6, mm0); |
| psllw_i2r(2, mm1); |
| |
| pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ |
| psllw_i2r(2, mm2); |
| |
| pmulhw_r2r(mm0, mm4); /* z5 */ |
| |
| /* stage 5 */ |
| |
| pmulhw_m2r(RTjpeg_C2pC6, mm2); |
| psllw_i2r(2, mm5); |
| |
| pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ |
| movq_r2r(mm3, mm0); /* copy tmp7 */ |
| |
| movq_m2r(*(dataptr+1), mm7); |
| paddw_r2r(mm1, mm4); /* z2 */ |
| |
| paddw_r2r(mm1, mm2); /* z4 */ |
| |
| paddw_r2r(mm5, mm0); /* z11 */ |
| psubw_r2r(mm5, mm3); /* z13 */ |
| |
| /* stage 6 */ |
| |
| movq_r2r(mm3, mm5); /* copy z13 */ |
| psubw_r2r(mm4, mm3); /* y3=z13 - z2 */ |
| |
| paddw_r2r(mm4, mm5); /* y5=z13 + z2 */ |
| movq_r2r(mm0, mm6); /* copy z11 */ |
| |
| movq_r2m(mm3, *(dataptr+6)); /*save y3 */ |
| psubw_r2r(mm2, mm0); /* y7=z11 - z4 */ |
| |
| movq_r2m(mm5, *(dataptr+10)); /*save y5 */ |
| paddw_r2r(mm2, mm6); /* y1=z11 + z4 */ |
| |
| movq_r2m(mm0, *(dataptr+14)); /*save y7 */ |
| |
| /************************************************ |
| * End of 1st 4 rows |
| ************************************************/ |
| |
| movq_m2r(*(dataptr+3), mm1); /* load x1 : stage 1 */ |
| movq_r2r(mm7, mm0); /* copy x0 */ |
| |
| movq_r2m(mm6, *(dataptr+2)); /*save y1 */ |
| |
| movq_m2r(*(dataptr+5), mm2); /* load x2 : stage 1 */ |
| movq_r2r(mm1, mm6); /* copy x1 */ |
| |
| paddw_m2r(*(dataptr+15), mm0); /* tmp00 = x0 + x7 */ |
| |
| movq_m2r(*(dataptr+7), mm3); /* load x3 : stage 1 */ |
| movq_r2r(mm2, mm5); /* copy x2 */ |
| |
| psubw_m2r(*(dataptr+15), mm7); /* tmp07 = x0 - x7 */ |
| movq_r2r(mm3, mm4); /* copy x3 */ |
| |
| paddw_m2r(*(dataptr+13), mm1); /* tmp01 = x1 + x6 */ |
| |
| movq_r2m(mm7, tmp7); /* save tmp07 */ |
| movq_r2r(mm0, mm7); /* copy tmp00 */ |
| |
| psubw_m2r(*(dataptr+13), mm6); /* tmp06 = x1 - x6 */ |
| |
| /* stage 2, Even Part */ |
| |
| paddw_m2r(*(dataptr+9), mm3); /* tmp03 = x3 + x4 */ |
| |
| movq_r2m(mm6, tmp6); /* save tmp06 */ |
| movq_r2r(mm1, mm6); /* copy tmp01 */ |
| |
| paddw_m2r(*(dataptr+11), mm2); /* tmp02 = x2 + x5 */ |
| paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 */ |
| |
| psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 */ |
| |
| psubw_m2r(*(dataptr+9), mm4); /* tmp04 = x3 - x4 */ |
| psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 */ |
| |
| paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 */ |
| |
| psubw_m2r(*(dataptr+11), mm5); /* tmp05 = x2 - x5 */ |
| paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ |
| |
| /* stage 3, Even and stage 4 & 5 even */ |
| |
| movq_m2r(tmp6, mm2); /* load tmp6 */ |
| movq_r2r(mm0, mm3); /* copy tmp10 */ |
| |
| psllw_i2r(2, mm6); /* shift z1 */ |
| paddw_r2r(mm1, mm0); /* y0=tmp10 + tmp11 */ |
| |
| pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ |
| psubw_r2r(mm1, mm3); /* y4=tmp10 - tmp11 */ |
| |
| movq_r2m(mm0, *(dataptr+1)); /*save y0 */ |
| movq_r2r(mm7, mm0); /* copy tmp13 */ |
| |
| /* odd part */ |
| |
| movq_r2m(mm3, *(dataptr+9)); /*save y4 */ |
| paddw_r2r(mm5, mm4); /* tmp10 = tmp4 + tmp5 */ |
| |
| movq_m2r(tmp7, mm3); /* load tmp7 */ |
| paddw_r2r(mm6, mm0); /* tmp32 = tmp13 + z1 */ |
| |
| paddw_r2r(mm2, mm5); /* tmp11 = tmp5 + tmp6 */ |
| psubw_r2r(mm6, mm7); /* tmp33 = tmp13 - z1 */ |
| |
| movq_r2m(mm0, *(dataptr+5)); /*save y2 */ |
| paddw_r2r(mm3, mm2); /* tmp12 = tmp6 + tmp7 */ |
| |
| /* stage 4 */ |
| |
| movq_r2m(mm7, *(dataptr+13)); /*save y6 */ |
| movq_r2r(mm4, mm1); /* copy tmp10 */ |
| |
| psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ |
| psllw_i2r(2, mm4); /* shift tmp10 */ |
| |
| movq_m2r(RTjpeg_C2mC6, mm0); /* load C2mC6 */ |
| psllw_i2r(2, mm1); /* shift (tmp10-tmp12) */ |
| |
| pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ |
| psllw_i2r(2, mm5); /* prepare for multiply */ |
| |
| pmulhw_r2r(mm0, mm4); /* multiply by converted real */ |
| |
| /* stage 5 */ |
| |
| pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ |
| psllw_i2r(2, mm2); /* prepare for multiply */ |
| |
| pmulhw_m2r(RTjpeg_C2pC6, mm2); /* multiply */ |
| movq_r2r(mm3, mm0); /* copy tmp7 */ |
| |
| movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */ |
| paddw_r2r(mm1, mm4); /* z2 */ |
| |
| paddw_r2r(mm5, mm0); /* z11 */ |
| psubw_r2r(mm5, mm3); /* z13 */ |
| |
| /* stage 6 */ |
| |
| movq_r2r(mm3, mm5); /* copy z13 */ |
| paddw_r2r(mm1, mm2); /* z4 */ |
| |
| movq_r2r(mm0, mm6); /* copy z11 */ |
| psubw_r2r(mm4, mm5); /* y3 */ |
| |
| paddw_r2r(mm2, mm6); /* y1 */ |
| paddw_r2r(mm4, mm3); /* y5 */ |
| |
| movq_r2m(mm5, *(dataptr+7)); /*save y3 */ |
| psubw_r2r(mm2, mm0); /* y7=z11 - z4 */ |
| |
| movq_r2m(mm3, *(dataptr+11)); /*save y5 */ |
| |
| movq_r2m(mm6, *(dataptr+3)); /*save y1 */ |
| |
| movq_r2m(mm0, *(dataptr+15)); /*save y7 */ |
| |
| |
| #endif |
| } |
| |
| /* |
|  * Fixed-point constants for the scalar inverse DCT. Each real factor is |
|  * multiplied by 2^8 and rounded to an integer, which matches the ">> 8" |
|  * performed by MULTIPLY() below. |
|  */ |
| #define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */ |
| #define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */ |
| #define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */ |
| #define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */ |
| |
| /* |
|  * DESCALE(x): add 4, shift right by 3 (rounded divide by 8) and convert the |
|  * result to __s16. The expansion is fully parenthesized so that it behaves |
|  * as a single operand wherever it is used. |
|  */ |
| #define DESCALE(x) ((__s16)(((x) + 4) >> 3)) |
| |
| /* clip yuv to 16..235 (should be 16..240 for cr/cb but ...) */ |
| |
| /* |
|  * RL(x): clamp x into the range 16..235. |
|  * The whole conditional is wrapped in parentheses; without them an operator |
|  * next to the macro call (e.g. "RL(v) + 1") would bind to only part of the |
|  * expansion. |
|  * NOTE: x is evaluated more than once - do not pass an argument with side |
|  * effects. |
|  */ |
| #define RL(x) (((x) > 235) ? 235 : (((x) < 16) ? 16 : (x))) |
| |
| /* |
|  * MULTIPLY(var, c): multiply var by a constant scaled by 2^8 (see FIX_* |
|  * above), add 128 for rounding and shift the scale back out. |
|  * The second parameter was previously spelled "const", which is a C keyword; |
|  * it is renamed here, the expansion itself is unchanged. |
|  */ |
| #define MULTIPLY(var, c) (((__s32) ((var) * (c)) + 128) >> 8) |
| |
| /* |
|  * Fold the RTjpeg_aan_tab scale factors into the two dequantisation tables |
|  * RTjpeg_liqt and RTjpeg_ciqt (presumably the luma and chroma tables - |
|  * confirm against their definitions). |
|  * |
|  * RTjpeg_aan_tab holds factors with 32 fractional bits (its first entry is |
|  * 2^32), so each product is formed in 64 bits and shifted right by 32. |
|  * The tables are rewritten in place, so calling this again without |
|  * reloading them applies the scaling a second time. |
|  */ |
| void RTjpeg_idct_init(void) |
| { |
|   int idx = 0; |
| |
|   while (idx < 64) |
|   { |
|     /* scale factor shared by both tables at this position */ |
|     const __u64 aan = RTjpeg_aan_tab[idx]; |
| |
|     RTjpeg_liqt[idx] = ((__u64)RTjpeg_liqt[idx] * aan) >> 32; |
|     RTjpeg_ciqt[idx] = ((__u64)RTjpeg_ciqt[idx] * aan) >> 32; |
|     ++idx; |
|   } |
| } |
| |
| void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip) |
| { |
| #ifdef HAVE_LIBMMX |
| |
| static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL; |
| static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; |
| static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL; |
| static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL; |
| static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; |
| |
| mmx_t workspace[64]; |
| mmx_t *wsptr = workspace; |
| register mmx_t *dataptr = (mmx_t *)odata; |
| mmx_t *idata = (mmx_t *)data; |
| |
| rskip = rskip>>3; |
| /* |
| * Perform inverse DCT on one block of coefficients. |
| */ |
| |
| /* Odd part */ |
| |
| movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */ |
| |
| movq_m2r(*(idata+6), mm0); /* load idata[DCTSIZE*3] */ |
| |
| movq_m2r(*(idata+2), mm3); /* load idata[DCTSIZE*1] */ |
| |
| movq_r2r(mm1, mm2); /* copy tmp6 : phase 6 */ |
| |
| movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */ |
| |
| paddw_r2r(mm0, mm1); /* z13 = tmp6 + tmp5; */ |
| |
| psubw_r2r(mm0, mm2); /* z10 = tmp6 - tmp5 */ |
| |
| psllw_i2r(2, mm2); /* shift z10 */ |
| movq_r2r(mm2, mm0); /* copy z10 */ |
| |
| pmulhw_m2r(fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */ |
| movq_r2r(mm3, mm5); /* copy tmp4 */ |
| |
| pmulhw_m2r(fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */ |
| paddw_r2r(mm4, mm3); /* z11 = tmp4 + tmp7; */ |
| |
| movq_r2r(mm3, mm6); /* copy z11 : phase 5 */ |
| psubw_r2r(mm4, mm5); /* z12 = tmp4 - tmp7; */ |
| |
| psubw_r2r(mm1, mm6); /* z11-z13 */ |
| psllw_i2r(2, mm5); /* shift z12 */ |
| |
| movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */ |
| movq_r2r(mm5, mm7); /* copy z12 */ |
| |
| pmulhw_m2r(fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6): even part */ |
| paddw_r2r(mm1, mm3); /* tmp7 = z11 + z13; */ |
| |
| /*ok */ |
| |
| /* Even part */ |
| pmulhw_m2r(fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */ |
| psllw_i2r(2, mm6); |
| |
| movq_m2r(*(idata+4), mm1); /* load idata[DCTSIZE*2] */ |
| |
| paddw_r2r(mm5, mm0); /* tmp10 */ |
| |
| paddw_r2r(mm7, mm2); /* tmp12 */ |
| |
| pmulhw_m2r(fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */ |
| psubw_r2r(mm3, mm2); /* tmp6 = tmp12 - tmp7 */ |
| |
| movq_r2r(mm1, mm5); /* copy tmp1 */ |
| paddw_r2r(mm4, mm1); /* tmp13= tmp1 + tmp3; phases 5-3 */ |
| |
| psubw_r2r(mm4, mm5); /* tmp1-tmp3 */ |
| psubw_r2r(mm2, mm6); /* tmp5 = tmp11 - tmp6; */ |
| |
| movq_r2m(mm1, *(wsptr)); /* save tmp13 in workspace */ |
| psllw_i2r(2, mm5); /* shift tmp1-tmp3 */ |
| |
| movq_m2r(*(idata), mm7); /* load idata[DCTSIZE*0] */ |
| |
| pmulhw_m2r(fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */ |
| paddw_r2r(mm6, mm0); /* tmp4 = tmp10 + tmp5; */ |
| |
| movq_m2r(*(idata+8), mm4); /* load idata[DCTSIZE*4] */ |
| |
| psubw_r2r(mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */ |
| |
| movq_r2m(mm0, *(wsptr+4)); /* save tmp4 in workspace */ |
| movq_r2r(mm7, mm1); /* copy tmp0 : phase 3 */ |
| |
| movq_r2m(mm5, *(wsptr+2)); /* save tmp12 in workspace */ |
| psubw_r2r(mm4, mm1); /* tmp11 = tmp0 - tmp2; */ |
| |
| paddw_r2r(mm4, mm7); /* tmp10 = tmp0 + tmp2; */ |
| movq_r2r(mm1, mm5); /* copy tmp11 */ |
| |
| paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */ |
| movq_r2r(mm7, mm4); /* copy tmp10 : phase 2 */ |
| |
| paddw_m2r(*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */ |
| |
| psubw_m2r(*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */ |
| movq_r2r(mm7, mm0); /* copy tmp0 */ |
| |
| psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */ |
| paddw_r2r(mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */ |
| |
| psubw_r2r(mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */ |
| |
| movq_r2m(mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */ |
| movq_r2r(mm1, mm3); /* copy tmp1 */ |
| |
| movq_r2m(mm0, *(wsptr+14)); /* wsptr[DCTSIZE*7] */ |
| paddw_r2r(mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */ |
| |
| psubw_r2r(mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */ |
| |
| movq_r2m(mm1, *(wsptr+2)); /* wsptr[DCTSIZE*1] */ |
| movq_r2r(mm4, mm1); /* copy tmp3 */ |
| |
| movq_r2m(mm3, *(wsptr+12)); /* wsptr[DCTSIZE*6] */ |
| |
| paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */ |
| |
| psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */ |
| |
| movq_r2m(mm4, *(wsptr+8)); |
| movq_r2r(mm5, mm7); /* copy tmp2 */ |
| |
| paddw_r2r(mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */ |
| |
| movq_r2m(mm1, *(wsptr+6)); |
| psubw_r2r(mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */ |
| |
| movq_r2m(mm5, *(wsptr+4)); |
| |
| movq_r2m(mm7, *(wsptr+10)); |
| |
| /*ok */ |
| |
| |
| /*****************************************************************/ |
| |
| idata++; |
| wsptr++; |
| |
| /*****************************************************************/ |
| |
| movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */ |
| |
| movq_m2r(*(idata+6), mm0); /* load idata[DCTSIZE*3] */ |
| |
| movq_m2r(*(idata+2), mm3); /* load idata[DCTSIZE*1] */ |
| movq_r2r(mm1, mm2); /* copy tmp6 : phase 6 */ |
| |
| movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */ |
| paddw_r2r(mm0, mm1); /* z13 = tmp6 + tmp5; */ |
| |
| psubw_r2r(mm0, mm2); /* z10 = tmp6 - tmp5 */ |
| |
| psllw_i2r(2, mm2); /* shift z10 */ |
| movq_r2r(mm2, mm0); /* copy z10 */ |
| |
| pmulhw_m2r(fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */ |
| movq_r2r(mm3, mm5); /* copy tmp4 */ |
| |
| pmulhw_m2r(fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */ |
| paddw_r2r(mm4, mm3); /* z11 = tmp4 + tmp7; */ |
| |
| movq_r2r(mm3, mm6); /* copy z11 : phase 5 */ |
| psubw_r2r(mm4, mm5); /* z12 = tmp4 - tmp7; */ |
| |
| psubw_r2r(mm1, mm6); /* z11-z13 */ |
| psllw_i2r(2, mm5); /* shift z12 */ |
| |
| movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */ |
| movq_r2r(mm5, mm7); /* copy z12 */ |
| |
| pmulhw_m2r(fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6) even part */ |
| paddw_r2r(mm1, mm3); /* tmp7 = z11 + z13; */ |
| |
| /*ok */ |
| |
| /* Even part */ |
| pmulhw_m2r(fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */ |
| psllw_i2r(2, mm6); |
| |
| movq_m2r(*(idata+4), mm1); /* load idata[DCTSIZE*2] */ |
| |
| paddw_r2r(mm5, mm0); /* tmp10 */ |
| |
| paddw_r2r(mm7, mm2); /* tmp12 */ |
| |
| pmulhw_m2r(fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */ |
| psubw_r2r(mm3, mm2); /* tmp6 = tmp12 - tmp7 */ |
| |
| movq_r2r(mm1, mm5); /* copy tmp1 */ |
| paddw_r2r(mm4, mm1); /* tmp13= tmp1 + tmp3; phases 5-3 */ |
| |
| psubw_r2r(mm4, mm5); /* tmp1-tmp3 */ |
| psubw_r2r(mm2, mm6); /* tmp5 = tmp11 - tmp6; */ |
| |
| movq_r2m(mm1, *(wsptr)); /* save tmp13 in workspace */ |
| psllw_i2r(2, mm5); /* shift tmp1-tmp3 */ |
| |
| movq_m2r(*(idata), mm7); /* load idata[DCTSIZE*0] */ |
| paddw_r2r(mm6, mm0); /* tmp4 = tmp10 + tmp5; */ |
| |
| pmulhw_m2r(fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */ |
| |
| movq_m2r(*(idata+8), mm4); /* load idata[DCTSIZE*4] */ |
| |
| psubw_r2r(mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */ |
| |
| movq_r2m(mm0, *(wsptr+4)); /* save tmp4 in workspace */ |
| movq_r2r(mm7, mm1); /* copy tmp0: phase 3 */ |
| |
| movq_r2m(mm5, *(wsptr+2)); /* save tmp12 in workspace */ |
| psubw_r2r(mm4, mm1); /* tmp11 = tmp0 - tmp2; */ |
| |
| paddw_r2r(mm4, mm7); /* tmp10 = tmp0 + tmp2; */ |
| movq_r2r(mm1, mm5); /* copy tmp11 */ |
| |
| paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */ |
| movq_r2r(mm7, mm4); /* copy tmp10: phase 2 */ |
| |
| paddw_m2r(*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */ |
| |
| psubw_m2r(*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */ |
| movq_r2r(mm7, mm0); /* copy tmp0 */ |
| |
| psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */ |
| paddw_r2r(mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */ |
| |
| psubw_r2r(mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */ |
| |
| movq_r2m(mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */ |
| movq_r2r(mm1, mm3); /* copy tmp1 */ |
| |
| movq_r2m(mm0, *(wsptr+14)); /* wsptr[DCTSIZE*7] */ |
| paddw_r2r(mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */ |
| |
| psubw_r2r(mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */ |
| |
| movq_r2m(mm1, *(wsptr+2)); /* wsptr[DCTSIZE*1] */ |
| movq_r2r(mm4, mm1); /* copy tmp3 */ |
| |
| movq_r2m(mm3, *(wsptr+12)); /* wsptr[DCTSIZE*6] */ |
| |
| paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */ |
| |
| psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */ |
| |
| movq_r2m(mm4, *(wsptr+8)); |
| movq_r2r(mm5, mm7); /* copy tmp2 */ |
| |
| paddw_r2r(mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */ |
| |
| movq_r2m(mm1, *(wsptr+6)); |
| psubw_r2r(mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */ |
| |
| movq_r2m(mm5, *(wsptr+4)); |
| |
| movq_r2m(mm7, *(wsptr+10)); |
| |
| /*****************************************************************/ |
| |
| /* Pass 2: process rows from work array, store into output array. */ |
| /* Note that we must descale the results by a factor of 8 == 2**3, */ |
| /* and also undo the PASS1_BITS scaling. */ |
| |
| /*****************************************************************/ |
| /* Even part */ |
| |
| wsptr--; |
| |
| /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */ |
| /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */ |
| /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */ |
| /* tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */ |
| movq_m2r(*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */ |
| |
| movq_m2r(*(wsptr+1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */ |
| movq_r2r(mm0, mm2); |
| |
| movq_m2r(*(wsptr+2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */ |
| paddw_r2r(mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */ |
| |
| movq_m2r(*(wsptr+3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */ |
| psubw_r2r(mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */ |
| |
| movq_r2r(mm0, mm6); |
| movq_r2r(mm3, mm5); |
| |
| paddw_r2r(mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */ |
| movq_r2r(mm2, mm1); |
| |
| psubw_r2r(mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */ |
| punpcklwd_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */ |
| |
| movq_m2r(*(wsptr+7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */ |
| punpckhwd_r2r(mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */ |
| |
| movq_m2r(*(wsptr+4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */ |
| punpckldq_r2r(mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ |
| |
| punpcklwd_r2r(mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */ |
| movq_r2r(mm3, mm4); |
| |
| movq_m2r(*(wsptr+6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */ |
| punpckhwd_r2r(mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */ |
| |
| movq_m2r(*(wsptr+5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */ |
| punpckldq_r2r(mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ |
| |
| |
| paddw_r2r(mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */ |
| movq_r2r(mm6, mm2); |
| |
| psubw_r2r(mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */ |
| paddw_r2r(mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */ |
| |
| movq_r2r(mm3, mm5); |
| punpcklwd_r2r(mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */ |
| |
| psubw_r2r(mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */ |
| punpckhwd_r2r(mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */ |
| |
| movq_r2r(mm4, mm7); |
| punpckldq_r2r(mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */ |
| |
| punpcklwd_r2r(mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */ |
| |
| punpckhwd_r2r(mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */ |
| |
| punpckldq_r2r(mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */ |
| movq_r2r(mm1, mm6); |
| |
| /*ok */ |
| |
| /* mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ |
| /* mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ |
| |
| |
| movq_r2r(mm0, mm2); |
| punpckhdq_r2r(mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */ |
| |
| punpckldq_r2r(mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */ |
| psllw_i2r(2, mm6); |
| |
| pmulhw_m2r(fix_141, mm6); |
| punpckldq_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */ |
| |
| punpckhdq_r2r(mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */ |
| movq_r2r(mm0, mm7); |
| |
| /* tmp0 = tmp10 + tmp13; */ |
| /* tmp3 = tmp10 - tmp13; */ |
| paddw_r2r(mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */ |
| psubw_r2r(mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */ |
| |
| /* tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */ |
| psubw_r2r(mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */ |
| /* tmp1 = tmp11 + tmp12; */ |
| /* tmp2 = tmp11 - tmp12; */ |
| movq_r2r(mm1, mm5); |
| |
| /*OK */ |
| |
| /* Odd part */ |
| |
| /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */ |
| /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */ |
| /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */ |
| /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */ |
| movq_m2r(*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */ |
| paddw_r2r(mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */ |
| |
| movq_m2r(*(wsptr+1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */ |
| psubw_r2r(mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */ |
| |
| movq_r2r(mm3, mm6); |
| punpckldq_r2r(mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */ |
| |
| punpckhdq_r2r(mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */ |
| movq_r2r(mm3, mm2); |
| |
| /*Save tmp0 and tmp1 in wsptr */ |
| movq_r2m(mm0, *(wsptr)); /* save tmp0 */ |
| paddw_r2r(mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */ |
| |
| |
| /*Continue with z10 --- z13 */ |
| movq_m2r(*(wsptr+2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */ |
| psubw_r2r(mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */ |
| |
| movq_m2r(*(wsptr+3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */ |
| movq_r2r(mm6, mm4); |
| |
| movq_r2m(mm1, *(wsptr+1)); /* save tmp1 */ |
| punpckldq_r2r(mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */ |
| |
| punpckhdq_r2r(mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */ |
| movq_r2r(mm6, mm1); |
| |
| /*Save tmp2 and tmp3 in wsptr */ |
| paddw_r2r(mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */ |
| movq_r2r(mm2, mm4); |
| |
| /*Continue with z10 --- z13 */ |
| movq_r2m(mm5, *(wsptr+2)); /* save tmp2 */ |
| punpcklwd_r2r(mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */ |
| |
| psubw_r2r(mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */ |
| punpckhwd_r2r(mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */ |
| |
| movq_r2r(mm3, mm0); |
| punpcklwd_r2r(mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */ |
| |
| movq_r2m(mm7, *(wsptr+3)); /* save tmp3 */ |
| punpckhwd_r2r(mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */ |
| |
| movq_m2r(*(wsptr+4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */ |
| punpckhdq_r2r(mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */ |
| |
| movq_m2r(*(wsptr+5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */ |
| punpckhdq_r2r(mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */ |
| |
| movq_m2r(*(wsptr+6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */ |
| movq_r2r(mm6, mm4); |
| |
| punpckldq_r2r(mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */ |
| movq_r2r(mm1, mm5); |
| |
| punpckhdq_r2r(mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */ |
| movq_r2r(mm6, mm2); |
| |
| movq_m2r(*(wsptr+7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */ |
| paddw_r2r(mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */ |
| |
| psubw_r2r(mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */ |
| punpckldq_r2r(mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */ |
| |
| punpckhdq_r2r(mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */ |
| movq_r2r(mm1, mm7); |
| |
| paddw_r2r(mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */ |
| psubw_r2r(mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */ |
| |
| movq_r2r(mm6, mm5); |
| punpcklwd_r2r(mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */ |
| |
| punpckhwd_r2r(mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */ |
| movq_r2r(mm2, mm4); |
| |
| punpcklwd_r2r(mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */ |
| |
| punpckhwd_r2r(mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */ |
| |
| punpckhdq_r2r(mm6, mm4); /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */ |
| |
| punpckhdq_r2r(mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */ |
| movq_r2r(mm0, mm5); |
| |
| punpckldq_r2r(mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */ |
| |
| punpckhdq_r2r(mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */ |
| movq_r2r(mm3, mm4); |
| |
| punpckhdq_r2r(mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */ |
| movq_r2r(mm5, mm1); |
| |
| punpckldq_r2r(mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */ |
| /* tmp7 = z11 + z13; : phase 5 */ |
| /* tmp8 = z11 - z13; : phase 5 */ |
| psubw_r2r(mm4, mm1); /* tmp8 */ |
| |
| paddw_r2r(mm4, mm5); /* tmp7 */ |
| /* tmp21 = MULTIPLY(tmp8, FIX_1_414213562); 2*c4 */ |
| psllw_i2r(2, mm1); |
| |
| psllw_i2r(2, mm0); |
| |
| pmulhw_m2r(fix_141, mm1); /* tmp21 */ |
| /* tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) 2*(c2-c6) */ |
| /* + MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */ |
| psllw_i2r(2, mm3); |
| movq_r2r(mm0, mm7); |
| |
| pmulhw_m2r(fix_n184, mm7); |
| movq_r2r(mm3, mm6); |
| |
| movq_m2r(*(wsptr), mm2); /* tmp0,final1 */ |
| |
| pmulhw_m2r(fix_108n184, mm6); |
| /* tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */ |
| /* + MULTIPLY(z12, FIX_1_847759065); 2*c2 */ |
| movq_r2r(mm2, mm4); /* final1 */ |
| |
| pmulhw_m2r(fix_184n261, mm0); |
| paddw_r2r(mm5, mm2); /* tmp0+tmp7,final1 */ |
| |
| pmulhw_m2r(fix_184, mm3); |
| psubw_r2r(mm5, mm4); /* tmp0-tmp7,final1 */ |
| |
| /* tmp6 = tmp22 - tmp7; phase 2 */ |
| psraw_i2r(3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */ |
| |
| paddw_r2r(mm6, mm7); /* tmp20 */ |
| psraw_i2r(3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */ |
| |
| paddw_r2r(mm0, mm3); /* tmp22 */ |
| |
| /* tmp5 = tmp21 - tmp6; */ |
| psubw_r2r(mm5, mm3); /* tmp6 */ |
| |
| /* tmp4 = tmp20 + tmp5; */ |
| movq_m2r(*(wsptr+1), mm0); /* tmp1,final2 */ |
| psubw_r2r(mm3, mm1); /* tmp5 */ |
| |
| movq_r2r(mm0, mm6); /* final2 */ |
| paddw_r2r(mm3, mm0); /* tmp1+tmp6,final2 */ |
| |
| /* Final output stage: scale down by a factor of 8 and range-limit */ |
| |
| |
| /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final1 */ |
| |
| |
| /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final2 */ |
| psubw_r2r(mm3, mm6); /* tmp1-tmp6,final2 */ |
| psraw_i2r(3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */ |
| |
| psraw_i2r(3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */ |
| |
| packuswb_r2r(mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */ |
| |
| movq_m2r(*(wsptr+2), mm5); /* tmp2,final3 */ |
| packuswb_r2r(mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */ |
| |
| /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final3 */ |
| paddw_r2r(mm1, mm7); /* tmp4 */ |
| movq_r2r(mm5, mm3); |
| |
| paddw_r2r(mm1, mm5); /* tmp2+tmp5 */ |
| psubw_r2r(mm1, mm3); /* tmp2-tmp5 */ |
| |
| psraw_i2r(3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */ |
| |
| movq_m2r(*(wsptr+3), mm4); /* tmp3,final4 */ |
| psraw_i2r(3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */ |
| |
| |
| |
| /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final4 */ |
| movq_r2r(mm4, mm6); |
| paddw_r2r(mm7, mm4); /* tmp3+tmp4 */ |
| |
| psubw_r2r(mm7, mm6); /* tmp3-tmp4 */ |
| psraw_i2r(3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */ |
| |
| /* mov ecx, [dataptr] */ |
| |
| psraw_i2r(3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */ |
| |
| packuswb_r2r(mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */ |
| |
| packuswb_r2r(mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */ |
| movq_r2r(mm2, mm4); |
| |
| movq_r2r(mm5, mm7); |
| punpcklbw_r2r(mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */ |
| |
| punpckhbw_r2r(mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */ |
| movq_r2r(mm2, mm1); |
| |
| punpcklbw_r2r(mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */ |
| |
| /* add dataptr, 4 */ |
| |
| punpckhbw_r2r(mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */ |
| |
| punpcklwd_r2r(mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */ |
| |
| /* add ecx, output_col */ |
| |
| movq_r2r(mm7, mm6); |
| punpckhwd_r2r(mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */ |
| |
| movq_r2r(mm2, mm0); |
| punpcklwd_r2r(mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */ |
| |
| /* mov idata, [dataptr] */ |
| |
| punpckldq_r2r(mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */ |
| |
| /* add dataptr, 4 */ |
| |
| movq_r2r(mm1, mm3); |
| |
| /* add idata, output_col */ |
| |
| punpckhwd_r2r(mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */ |
| |
| movq_r2m(mm2, *(dataptr)); |
| |
| punpckhdq_r2r(mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */ |
| |
| dataptr += rskip; |
| movq_r2m(mm0, *(dataptr)); |
| |
| punpckldq_r2r(mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */ |
| punpckhdq_r2r(mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */ |
| |
| dataptr += rskip; |
| movq_r2m(mm1, *(dataptr)); |
| |
| dataptr += rskip; |
| movq_r2m(mm3, *(dataptr)); |
| |
| /*******************************************************************/ |
| |
| wsptr += 8; |
| |
| /*******************************************************************/ |
| |
| /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */ |
| /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */ |
| /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */ |
| /* tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */ |
| movq_m2r(*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */ |
| |
| movq_m2r(*(wsptr+1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */ |
| movq_r2r(mm0, mm2); |
| |
| movq_m2r(*(wsptr+2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */ |
| paddw_r2r(mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */ |
| |
| movq_m2r(*(wsptr+3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */ |
| psubw_r2r(mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */ |
| |
| movq_r2r(mm0, mm6); |
| movq_r2r(mm3, mm5); |
| |
| paddw_r2r(mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */ |
| movq_r2r(mm2, mm1); |
| |
| psubw_r2r(mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */ |
| punpcklwd_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */ |
| |
| movq_m2r(*(wsptr+7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */ |
| punpckhwd_r2r(mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */ |
| |
| movq_m2r(*(wsptr+4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */ |
| punpckldq_r2r(mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ |
| |
| punpcklwd_r2r(mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */ |
| movq_r2r(mm3, mm4); |
| |
| movq_m2r(*(wsptr+6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */ |
| punpckhwd_r2r(mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */ |
| |
| movq_m2r(*(wsptr+5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */ |
| punpckldq_r2r(mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ |
| |
| paddw_r2r(mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */ |
| movq_r2r(mm6, mm2); |
| |
| psubw_r2r(mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */ |
| paddw_r2r(mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */ |
| |
| movq_r2r(mm3, mm5); |
| punpcklwd_r2r(mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */ |
| |
| psubw_r2r(mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */ |
| punpckhwd_r2r(mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */ |
| |
| movq_r2r(mm4, mm7); |
| punpckldq_r2r(mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */ |
| |
| punpcklwd_r2r(mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */ |
| |
| punpckhwd_r2r(mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */ |
| |
| punpckldq_r2r(mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */ |
| movq_r2r(mm1, mm6); |
| |
| /*OK */ |
| |
| /* mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ |
| /* mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ |
| |
| movq_r2r(mm0, mm2); |
| punpckhdq_r2r(mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */ |
| |
| punpckldq_r2r(mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */ |
| psllw_i2r(2, mm6); |
| |
| pmulhw_m2r(fix_141, mm6); |
| punpckldq_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */ |
| |
| punpckhdq_r2r(mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */ |
| movq_r2r(mm0, mm7); |
| |
| /* tmp0 = tmp10 + tmp13; */ |
| /* tmp3 = tmp10 - tmp13; */ |
| paddw_r2r(mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */ |
| psubw_r2r(mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */ |
| |
| /* tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */ |
| psubw_r2r(mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */ |
| /* tmp1 = tmp11 + tmp12; */ |
| /* tmp2 = tmp11 - tmp12; */ |
| movq_r2r(mm1, mm5); |
| |
| /*OK */ |
| |
| |
| /* Odd part */ |
| |
| /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */ |
| /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */ |
| /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */ |
| /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */ |
| movq_m2r(*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */ |
| paddw_r2r(mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */ |
| |
| movq_m2r(*(wsptr+1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */ |
| psubw_r2r(mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */ |
| |
| movq_r2r(mm3, mm6); |
| punpckldq_r2r(mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */ |
| |
| punpckhdq_r2r(mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */ |
| movq_r2r(mm3, mm2); |
| |
| /*Save tmp0 and tmp1 in wsptr */ |
| movq_r2m(mm0, *(wsptr)); /* save tmp0 */ |
| paddw_r2r(mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */ |
| |
| |
| /*Continue with z10 --- z13 */ |
| movq_m2r(*(wsptr+2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */ |
| psubw_r2r(mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */ |
| |
| movq_m2r(*(wsptr+3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */ |
| movq_r2r(mm6, mm4); |
| |
| movq_r2m(mm1, *(wsptr+1)); /* save tmp1 */ |
| punpckldq_r2r(mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */ |
| |
| punpckhdq_r2r(mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */ |
| movq_r2r(mm6, mm1); |
| |
| /*Save tmp2 and tmp3 in wsptr */ |
| paddw_r2r(mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */ |
| movq_r2r(mm2, mm4); |
| |
| /*Continue with z10 --- z13 */ |
| movq_r2m(mm5, *(wsptr+2)); /* save tmp2 */ |
| punpcklwd_r2r(mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */ |
| |
| psubw_r2r(mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */ |
| punpckhwd_r2r(mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */ |
| |
| movq_r2r(mm3, mm0); |
| punpcklwd_r2r(mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */ |
| |
| movq_r2m(mm7, *(wsptr+3)); /* save tmp3 */ |
| punpckhwd_r2r(mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */ |
| |
| movq_m2r(*(wsptr+4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */ |
| punpckhdq_r2r(mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */ |
| |
| movq_m2r(*(wsptr+5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */ |
| punpckhdq_r2r(mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */ |
| |
| movq_m2r(*(wsptr+6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */ |
| movq_r2r(mm6, mm4); |
| |
| punpckldq_r2r(mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */ |
| movq_r2r(mm1, mm5); |
| |
| punpckhdq_r2r(mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */ |
| movq_r2r(mm6, mm2); |
| |
| movq_m2r(*(wsptr+7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */ |
| paddw_r2r(mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */ |
| |
| psubw_r2r(mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */ |
| punpckldq_r2r(mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */ |
| |
| punpckhdq_r2r(mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */ |
| movq_r2r(mm1, mm7); |
| |
| paddw_r2r(mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */ |
| psubw_r2r(mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */ |
| |
| movq_r2r(mm6, mm5); |
| punpcklwd_r2r(mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */ |
| |
| punpckhwd_r2r(mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */ |
| movq_r2r(mm2, mm4); |
| |
| punpcklwd_r2r(mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */ |
| |
| punpckhwd_r2r(mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */ |
| |
| punpckhdq_r2r(mm6, mm4); /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */ |
| |
| punpckhdq_r2r(mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */ |
| movq_r2r(mm0, mm5); |
| |
| punpckldq_r2r(mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */ |
| |
| punpckhdq_r2r(mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */ |
| movq_r2r(mm3, mm4); |
| |
| punpckhdq_r2r(mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */ |
| movq_r2r(mm5, mm1); |
| |
| punpckldq_r2r(mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */ |
| /* tmp7 = z11 + z13; : phase 5 */ |
| /* tmp8 = z11 - z13; : phase 5 */ |
| psubw_r2r(mm4, mm1); /* tmp8 */ |
| |
| paddw_r2r(mm4, mm5); /* tmp7 */ |
| /* tmp21 = MULTIPLY(tmp8, FIX_1_414213562); 2*c4 */ |
| psllw_i2r(2, mm1); |
| |
| psllw_i2r(2, mm0); |
| |
| pmulhw_m2r(fix_141, mm1); /* tmp21 */ |
| /* tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) : 2*(c2-c6) */ |
| /* + MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */ |
| psllw_i2r(2, mm3); |
| movq_r2r(mm0, mm7); |
| |
| pmulhw_m2r(fix_n184, mm7); |
| movq_r2r(mm3, mm6); |
| |
| movq_m2r(*(wsptr), mm2); /* tmp0,final1 */ |
| |
| pmulhw_m2r(fix_108n184, mm6); |
| /* tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */ |
| /* + MULTIPLY(z12, FIX_1_847759065); : 2*c2 */ |
| movq_r2r(mm2, mm4); /* final1 */ |
| |
| pmulhw_m2r(fix_184n261, mm0); |
| paddw_r2r(mm5, mm2); /* tmp0+tmp7,final1 */ |
| |
| pmulhw_m2r(fix_184, mm3); |
| psubw_r2r(mm5, mm4); /* tmp0-tmp7,final1 */ |
| |
| /* tmp6 = tmp22 - tmp7; phase 2 */ |
| psraw_i2r(3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */ |
| |
| paddw_r2r(mm6, mm7); /* tmp20 */ |
| psraw_i2r(3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */ |
| |
| paddw_r2r(mm0, mm3); /* tmp22 */ |
| |
| /* tmp5 = tmp21 - tmp6; */ |
| psubw_r2r(mm5, mm3); /* tmp6 */ |
| |
| /* tmp4 = tmp20 + tmp5; */ |
| movq_m2r(*(wsptr+1), mm0); /* tmp1,final2 */ |
| psubw_r2r(mm3, mm1); /* tmp5 */ |
| |
| movq_r2r(mm0, mm6); /* final2 */ |
| paddw_r2r(mm3, mm0); /* tmp1+tmp6,final2 */ |
| |
| /* Final output stage: scale down by a factor of 8 and range-limit */ |
| |
| /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final1 */ |
| |
| |
| /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final2 */ |
| psubw_r2r(mm3, mm6); /* tmp1-tmp6,final2 */ |
| psraw_i2r(3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */ |
| |
| psraw_i2r(3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */ |
| |
| packuswb_r2r(mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */ |
| |
| movq_m2r(*(wsptr+2), mm5); /* tmp2,final3 */ |
| packuswb_r2r(mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */ |
| |
| /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final3 */ |
| paddw_r2r(mm1, mm7); /* tmp4 */ |
| movq_r2r(mm5, mm3); |
| |
| paddw_r2r(mm1, mm5); /* tmp2+tmp5 */ |
| psubw_r2r(mm1, mm3); /* tmp2-tmp5 */ |
| |
| psraw_i2r(3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */ |
| |
| movq_m2r(*(wsptr+3), mm4); /* tmp3,final4 */ |
| psraw_i2r(3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */ |
| |
| |
| |
| /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; */ |
| /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */ |
| /* & RANGE_MASK]; final4 */ |
| movq_r2r(mm4, mm6); |
| paddw_r2r(mm7, mm4); /* tmp3+tmp4 */ |
| |
| psubw_r2r(mm7, mm6); /* tmp3-tmp4 */ |
| psraw_i2r(3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */ |
| |
| psraw_i2r(3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */ |
| |
| /* |
| movq_r2m(mm4, *dummy); |
| fprintf(stderr, "3-4 %016llx\n", dummy); |
| movq_r2m(mm4, *dummy); |
| fprintf(stderr, "3+4 %016llx\n", dummy); |
| */ |
| |
| |
| packuswb_r2r(mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */ |
| |
| packuswb_r2r(mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */ |
| movq_r2r(mm2, mm4); |
| |
| movq_r2r(mm5, mm7); |
| punpcklbw_r2r(mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */ |
| |
| punpckhbw_r2r(mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */ |
| movq_r2r(mm2, mm1); |
| |
| punpcklbw_r2r(mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */ |
| |
| punpckhbw_r2r(mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */ |
| |
| punpcklwd_r2r(mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */ |
| |
| movq_r2r(mm7, mm6); |
| punpckhwd_r2r(mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */ |
| |
| movq_r2r(mm2, mm0); |
| punpcklwd_r2r(mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */ |
| |
| punpckldq_r2r(mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */ |
| |
| movq_r2r(mm1, mm3); |
| |
| punpckhwd_r2r(mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */ |
| |
| dataptr += rskip; |
| movq_r2m(mm2, *(dataptr)); |
| |
| punpckhdq_r2r(mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */ |
| |
| dataptr += rskip; |
| movq_r2m(mm0, *(dataptr)); |
| |
| punpckldq_r2r(mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */ |
| |
| punpckhdq_r2r(mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */ |
| |
| dataptr += rskip; |
| movq_r2m(mm1, *(dataptr)); |
| |
| dataptr += rskip; |
| movq_r2m(mm3, *(dataptr)); |
| |
| #else |
| __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| __s32 tmp10, tmp11, tmp12, tmp13; |
| __s32 z5, z10, z11, z12, z13; |
| __s16 *inptr; |
| __s32 *wsptr; |
| __u8 *outptr; |
| int ctr; |
| __s32 dcval; |
| __s32 workspace[64]; |
| |
| inptr = data; |
| wsptr = workspace; |
| for (ctr = 8; ctr > 0; ctr--) { |
| |
| if ((inptr[8] | inptr[16] | inptr[24] | |
| inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) { |
| dcval = inptr[0]; |
| wsptr[0] = dcval; |
| wsptr[8] = dcval; |
| wsptr[16] = dcval; |
| wsptr[24] = dcval; |
| wsptr[32] = dcval; |
| wsptr[40] = dcval; |
| wsptr[48] = dcval; |
| wsptr[56] = dcval; |
| |
| inptr++; |
| wsptr++; |
| continue; |
| } |
| |
| tmp0 = inptr[0]; |
| tmp1 = inptr[16]; |
| tmp2 = inptr[32]; |
| tmp3 = inptr[48]; |
| |
| tmp10 = tmp0 + tmp2; |
| tmp11 = tmp0 - tmp2; |
| |
| tmp13 = tmp1 + tmp3; |
| tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; |
| |
| tmp0 = tmp10 + tmp13; |
| tmp3 = tmp10 - tmp13; |
| tmp1 = tmp11 + tmp12; |
| tmp2 = tmp11 - tmp12; |
| |
| tmp4 = inptr[8]; |
| tmp5 = inptr[24]; |
| tmp6 = inptr[40]; |
| tmp7 = inptr[56]; |
| |
| z13 = tmp6 + tmp5; |
| z10 = tmp6 - tmp5; |
| z11 = tmp4 + tmp7; |
| z12 = tmp4 - tmp7; |
| |
| tmp7 = z11 + z13; |
| tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); |
| |
| z5 = MULTIPLY(z10 + z12, FIX_1_847759065); |
| tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; |
| tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; |
| |
| tmp6 = tmp12 - tmp7; |
| tmp5 = tmp11 - tmp6; |
| tmp4 = tmp10 + tmp5; |
| |
| wsptr[0] = (__s32) (tmp0 + tmp7); |
| wsptr[56] = (__s32) (tmp0 - tmp7); |
| wsptr[8] = (__s32) (tmp1 + tmp6); |
| wsptr[48] = (__s32) (tmp1 - tmp6); |
| wsptr[16] = (__s32) (tmp2 + tmp5); |
| wsptr[40] = (__s32) (tmp2 - tmp5); |
| wsptr[32] = (__s32) (tmp3 + tmp4); |
| wsptr[24] = (__s32) (tmp3 - tmp4); |
| |
| inptr++; |
| wsptr++; |
| } |
| |
| wsptr = workspace; |
| for (ctr = 0; ctr < 8; ctr++) { |
| outptr = &(odata[ctr*rskip]); |
| |
| tmp10 = wsptr[0] + wsptr[4]; |
| tmp11 = wsptr[0] - wsptr[4]; |
| |
| tmp13 = wsptr[2] + wsptr[6]; |
| tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13; |
| |
| tmp0 = tmp10 + tmp13; |
| tmp3 = tmp10 - tmp13; |
| tmp1 = tmp11 + tmp12; |
| tmp2 = tmp11 - tmp12; |
| |
| z13 = wsptr[5] + wsptr[3]; |
| z10 = wsptr[5] - wsptr[3]; |
| z11 = wsptr[1] + wsptr[7]; |
| z12 = wsptr[1] - wsptr[7]; |
| |
| tmp7 = z11 + z13; |
| tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); |
| |
| z5 = MULTIPLY(z10 + z12, FIX_1_847759065); |
| tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; |
| tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; |
| |
| tmp6 = tmp12 - tmp7; |
| tmp5 = tmp11 - tmp6; |
| tmp4 = tmp10 + tmp5; |
| |
| outptr[0] = RL(DESCALE(tmp0 + tmp7)); |
| outptr[7] = RL(DESCALE(tmp0 - tmp7)); |
| outptr[1] = RL(DESCALE(tmp1 + tmp6)); |
| outptr[6] = RL(DESCALE(tmp1 - tmp6)); |
| outptr[2] = RL(DESCALE(tmp2 + tmp5)); |
| outptr[5] = RL(DESCALE(tmp2 - tmp5)); |
| outptr[4] = RL(DESCALE(tmp3 + tmp4)); |
| outptr[3] = RL(DESCALE(tmp3 - tmp4)); |
| |
| wsptr += 8; |
| } |
| #endif |
| } |
| /* |
| |
| Main Routines |
| |
| This file contains most of the initialisation and control functions |
| |
| (C) Justin Schoeman 1998 |
| |
| */ |
| |
| /* |
| |
| Private function |
| |
| Initialise all the cache-aliged data blocks |
| |
| */ |
| |
| void RTjpeg_init_data(void) |
| { |
| unsigned long dptr; |
| |
| dptr=(unsigned long)&(RTjpeg_alldata[0]); |
| dptr+=32; |
| dptr=dptr>>5; |
| dptr=dptr<<5; /* cache align data */ |
| |
| RTjpeg_block=(__s16 *)dptr; |
| dptr+=sizeof(__s16)*64; |
| RTjpeg_lqt=(__s32 *)dptr; |
| dptr+=sizeof(__s32)*64; |
| RTjpeg_cqt=(__s32 *)dptr; |
| dptr+=sizeof(__s32)*64; |
| RTjpeg_liqt=(__u32 *)dptr; |
| dptr+=sizeof(__u32)*64; |
| RTjpeg_ciqt=(__u32 *)dptr; |
| } |
| |
| /* |
| |
| External Function |
| |
| Re-set quality factor |
| |
| Input: buf -> pointer to 128 ints for quant values store to pass back to |
| init_decompress. |
| Q -> quality factor (192=best, 32=worst) |
| */ |
| |
| void RTjpeg_init_Q(__u8 Q) |
| { |
| int i; |
| __u64 qual; |
| |
| qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ |
| |
| for(i=0; i<64; i++) |
| { |
| RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); |
| if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; |
| RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); |
| if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; |
| RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); |
| RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); |
| RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; |
| RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; |
| } |
| |
| RTjpeg_lb8=0; |
| while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); |
| RTjpeg_lb8--; |
| RTjpeg_cb8=0; |
| while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); |
| RTjpeg_cb8--; |
| |
| RTjpeg_dct_init(); |
| RTjpeg_idct_init(); |
| RTjpeg_quant_init(); |
| } |
| |
| /* |
| |
| External Function |
| |
| Initialise compression. |
| |
| Input: buf -> pointer to 128 ints for quant values store to pass back to |
| init_decompress. |
| width -> width of image |
| height -> height of image |
| Q -> quality factor (192=best, 32=worst) |
| |
| */ |
| |
| void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q) |
| { |
| int i; |
| __u64 qual; |
| |
| RTjpeg_init_data(); |
| |
| RTjpeg_width=width; |
| RTjpeg_height=height; |
| RTjpeg_Ywidth = RTjpeg_width>>3; |
| RTjpeg_Ysize=width * height; |
| RTjpeg_Cwidth = RTjpeg_width>>4; |
| RTjpeg_Csize= (width>>1) * height; |
| |
| qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ |
| |
| for(i=0; i<64; i++) |
| { |
| RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); |
| if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; |
| RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); |
| if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; |
| RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); |
| RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); |
| RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; |
| RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; |
| } |
| |
| RTjpeg_lb8=0; |
| while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); |
| RTjpeg_lb8--; |
| RTjpeg_cb8=0; |
| while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); |
| RTjpeg_cb8--; |
| |
| RTjpeg_dct_init(); |
| RTjpeg_quant_init(); |
| |
| for(i=0; i<64; i++) |
| buf[i]=RTjpeg_liqt[i]; |
| for(i=0; i<64; i++) |
| buf[64+i]=RTjpeg_ciqt[i]; |
| } |
| |
| void RTjpeg_init_decompress(__u32 *buf, int width, int height) |
| { |
| int i; |
| |
| RTjpeg_init_data(); |
| |
| RTjpeg_width=width; |
| RTjpeg_height=height; |
| RTjpeg_Ywidth = RTjpeg_width>>3; |
| RTjpeg_Ysize=width * height; |
| RTjpeg_Cwidth = RTjpeg_width>>4; |
| RTjpeg_Csize= (width>>1) * height; |
| |
| for(i=0; i<64; i++) |
| { |
| RTjpeg_liqt[i]=buf[i]; |
| RTjpeg_ciqt[i]=buf[i+64]; |
| } |
| |
| RTjpeg_lb8=0; |
| while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); |
| RTjpeg_lb8--; |
| RTjpeg_cb8=0; |
| while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); |
| RTjpeg_cb8--; |
| |
| RTjpeg_idct_init(); |
| |
| /* RTjpeg_color_init(); */ |
| } |
| |
| int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp) |
| { |
| __s8 * sb; |
| register __s8 * bp1 = bp + (RTjpeg_width<<3); |
| register __s8 * bp2 = bp + RTjpeg_Ysize; |
| register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); |
| register int i, j, k; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| sb=sp; |
| /* Y */ |
| for(i=RTjpeg_height>>1; i; i-=8) |
| { |
| for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) |
| { |
| RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| |
| RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| |
| RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| |
| RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| |
| RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); |
| |
| RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); |
| |
| } |
| bp+=RTjpeg_width<<4; |
| bp1+=RTjpeg_width<<4; |
| bp2+=RTjpeg_width<<2; |
| bp3+=RTjpeg_width<<2; |
| |
| } |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| return (sp-sb); |
| } |
| |
| int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp) |
| { |
| __s8 * sb; |
| register __s8 * bp2 = bp + RTjpeg_Ysize; |
| register __s8 * bp3 = bp2 + RTjpeg_Csize; |
| register int i, j, k; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| sb=sp; |
| /* Y */ |
| for(i=RTjpeg_height; i; i-=8) |
| { |
| for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) |
| { |
| RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| |
| RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| |
| RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); |
| |
| RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); |
| |
| } |
| bp+=RTjpeg_width<<3; |
| bp2+=RTjpeg_width<<2; |
| bp3+=RTjpeg_width<<2; |
| |
| } |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| return (sp-sb); |
| } |
| |
| int RTjpeg_compress8(__s8 *sp, unsigned char *bp) |
| { |
| __s8 * sb; |
| int i, j; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| |
| sb=sp; |
| /* Y */ |
| for(i=0; i<RTjpeg_height; i+=8) |
| { |
| for(j=0; j<RTjpeg_width; j+=8) |
| { |
| RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| } |
| bp+=RTjpeg_width; |
| } |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| return (sp-sb); |
| } |
| |
| void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp) |
| { |
| register __s8 * bp2 = bp + RTjpeg_Ysize; |
| register __s8 * bp3 = bp2 + (RTjpeg_Csize); |
| int i, j,k; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| |
| /* Y */ |
| for(i=RTjpeg_height; i; i-=8) |
| { |
| for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) { |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); |
| RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); |
| RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); |
| } |
| } |
| bp+=RTjpeg_width<<3; |
| bp2+=RTjpeg_width<<2; |
| bp3+=RTjpeg_width<<2; |
| } |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| } |
| |
| void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp) |
| { |
| register __s8 * bp1 = bp + (RTjpeg_width<<3); |
| register __s8 * bp2 = bp + RTjpeg_Ysize; |
| register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); |
| int i, j,k; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| |
| /* Y */ |
| for(i=RTjpeg_height>>1; i; i-=8) |
| { |
| for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) { |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); |
| RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1); |
| } |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); |
| RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); |
| } |
| } |
| bp+=RTjpeg_width<<4; |
| bp1+=RTjpeg_width<<4; |
| bp2+=RTjpeg_width<<2; |
| bp3+=RTjpeg_width<<2; |
| } |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| } |
| |
| void RTjpeg_decompress8(__s8 *sp, __u8 *bp) |
| { |
| int i, j; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| |
| /* Y */ |
| for(i=0; i<RTjpeg_height; i+=8) |
| { |
| for(j=0; j<RTjpeg_width; j+=8) |
| if(*sp==-1)sp++; |
| else |
| { |
| sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); |
| RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width); |
| } |
| bp+=RTjpeg_width<<3; |
| } |
| } |
| |
| /* |
| External Function |
| |
| Initialise additional data structures for motion compensation |
| |
| */ |
| |
| void RTjpeg_init_mcompress(void) |
| { |
| unsigned long tmp; |
| |
| if(!RTjpeg_old) |
| { |
| RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32); |
| tmp=(unsigned long)RTjpeg_old; |
| tmp+=32; |
| tmp=tmp>>5; |
| RTjpeg_old=(__s16 *)(tmp<<5); |
| } |
| if (!RTjpeg_old) |
| { |
| fprintf(stderr, "RTjpeg: Could not allocate memory\n"); |
| exit(-1); |
| } |
| bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height))); |
| } |
| |
| #ifdef HAVE_LIBMMX |
| |
| int RTjpeg_bcomp(__s16 *old, mmx_t *mask) |
| { |
| int i; |
| mmx_t *mold=(mmx_t *)old; |
| mmx_t *mblock=(mmx_t *)RTjpeg_block; |
| mmx_t result; |
| static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL; |
| |
| movq_m2r(*mask, mm7); |
| movq_m2r(neg, mm6); |
| pxor_r2r(mm5, mm5); |
| |
| for(i=0; i<8; i++) |
| { |
| movq_m2r(*(mblock++), mm0); |
| movq_m2r(*(mblock++), mm2); |
| movq_m2r(*(mold++), mm1); |
| movq_m2r(*(mold++), mm3); |
| psubsw_r2r(mm1, mm0); |
| psubsw_r2r(mm3, mm2); |
| movq_r2r(mm0, mm1); |
| movq_r2r(mm2, mm3); |
| pcmpgtw_r2r(mm7, mm0); |
| pcmpgtw_r2r(mm7, mm2); |
| pxor_r2r(mm6, mm1); |
| pxor_r2r(mm6, mm3); |
| pcmpgtw_r2r(mm7, mm1); |
| pcmpgtw_r2r(mm7, mm3); |
| por_r2r(mm0, mm5); |
| por_r2r(mm2, mm5); |
| por_r2r(mm1, mm5); |
| por_r2r(mm3, mm5); |
| } |
| movq_r2m(mm5, result); |
| |
| if(result.q) |
| { |
| if(!RTjpeg_mtest) |
| for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; |
| return 0; |
| } |
| /* printf("."); */ |
| return 1; |
| } |
| |
| #else |
| int RTjpeg_bcomp(__s16 *old, __u16 *mask) |
| { |
| int i; |
| |
| for(i=0; i<64; i++) |
| if(abs(old[i]-RTjpeg_block[i])>*mask) |
| { |
| if(!RTjpeg_mtest) |
| for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; |
| return 0; |
| } |
| return 1; |
| } |
| #endif |
| |
| void RTjpeg_set_test(int i) |
| { |
| RTjpeg_mtest=i; |
| } |
| |
| int RTjpeg_mcompress(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) |
| { |
| __s8 * sb; |
| __s16 *block; |
| register __s8 * bp2; |
| register __s8 * bp3; |
| register int i, j, k; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); |
| RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask); |
| #else |
| RTjpeg_lmask=lmask; |
| RTjpeg_cmask=cmask; |
| #endif |
| |
| bp = bp - RTjpeg_width*0; |
| bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0; |
| bp3 = bp2 + RTjpeg_Csize; |
| |
| sb=sp; |
| block=RTjpeg_old; |
| /* Y */ |
| for(i=RTjpeg_height; i; i-=8) |
| { |
| for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) |
| { |
| RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| if(RTjpeg_bcomp(block, &RTjpeg_lmask)) |
| { |
| *((__u8 *)sp++)=255; |
| } |
| else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| block+=64; |
| |
| RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| if(RTjpeg_bcomp(block, &RTjpeg_lmask)) |
| { |
| *((__u8 *)sp++)=255; |
| } |
| else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| block+=64; |
| |
| RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); |
| if(RTjpeg_bcomp(block, &RTjpeg_cmask)) |
| { |
| *((__u8 *)sp++)=255; |
| } |
| else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); |
| block+=64; |
| |
| RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); |
| if(RTjpeg_bcomp(block, &RTjpeg_cmask)) |
| { |
| *((__u8 *)sp++)=255; |
| } |
| else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); |
| block+=64; |
| |
| } |
| bp+=RTjpeg_width<<3; |
| bp2+=RTjpeg_width<<2; |
| bp3+=RTjpeg_width<<2; |
| } |
| /*printf ("%d\n", block - RTjpeg_old); */ |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| return (sp-sb); |
| } |
| |
| int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask) |
| { |
| __s8 * sb; |
| __s16 *block; |
| int i, j; |
| |
| #ifdef HAVE_LIBMMX |
| emms(); |
| RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); |
| #else |
| RTjpeg_lmask=lmask; |
| #endif |
| |
| |
| sb=sp; |
| block=RTjpeg_old; |
| /* Y */ |
| for(i=0; i<RTjpeg_height; i+=8) |
| { |
| for(j=0; j<RTjpeg_width; j+=8) |
| { |
| RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width); |
| RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); |
| if(RTjpeg_bcomp(block, &RTjpeg_lmask)) |
| { |
| *((__u8 *)sp++)=255; |
| /* printf("* %d ", sp[-1]); */ |
| } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); |
| block+=64; |
| } |
| bp+=RTjpeg_width<<3; |
| } |
| #ifdef HAVE_LIBMMX |
| emms(); |
| #endif |
| return (sp-sb); |
| } |
| |
/* Placeholder: the colour converters in this file need no set-up. */
void RTjpeg_color_init(void)
{
    /* intentionally empty */
}
| |
/*
 * Colour conversion coefficients scaled by 65536; the converters shift
 * their sums right by 16 after applying them.
 *
 * NOTE(review): KcrR has the same value as Ky (about 1.164), whereas the
 * usual BT.601 red/Cr factor is about 1.596 - confirm this is intended.
 */
#define KcrR    76284   /* second chroma plane -> first term of (y + crR) */
#define KcrG    53281   /* second chroma plane -> subtracted in the middle channel */
#define KcbG    25625   /* first chroma plane  -> subtracted in the middle channel */
#define KcbB    132252  /* first chroma plane  -> added in (y + cbB) */
#define Ky      76284   /* applied to (luma - 16) */
| |
| void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb) |
| { |
| int tmp; |
| int i, j; |
| __s32 y, crR, crG, cbG, cbB; |
| __u8 *bufcr, *bufcb, *bufy, *bufoute; |
| int yskip; |
| |
| yskip=RTjpeg_width; |
| |
| bufcb=&buf[RTjpeg_width*RTjpeg_height]; |
| bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2]; |
| bufy=&buf[0]; |
| bufoute=rgb; |
| |
| for(i=0; i<(RTjpeg_height); i++) |
| { |
| for(j=0; j<RTjpeg_width; j+=2) |
| { |
| crR=(*bufcr-128)*KcrR; |
| crG=(*(bufcr++)-128)*KcrG; |
| cbG=(*bufcb-128)*KcbG; |
| cbB=(*(bufcb++)-128)*KcbB; |
| |
| y=(bufy[j]-16)*Ky; |
| |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+1]-16)*Ky; |
| |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| } |
| bufy+=yskip; |
| } |
| } |
| |
| |
| void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb) |
| { |
| int tmp; |
| int i, j; |
| __s32 y, crR, crG, cbG, cbB; |
| __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; |
| int oskip, yskip; |
| |
| oskip=RTjpeg_width*3; |
| yskip=RTjpeg_width; |
| |
| bufcb=&buf[RTjpeg_width*RTjpeg_height]; |
| bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; |
| bufy=&buf[0]; |
| bufoute=rgb; |
| bufouto=rgb+oskip; |
| |
| for(i=0; i<(RTjpeg_height>>1); i++) |
| { |
| for(j=0; j<RTjpeg_width; j+=2) |
| { |
| crR=(*bufcr-128)*KcrR; |
| crG=(*(bufcr++)-128)*KcrG; |
| cbG=(*bufcb-128)*KcbG; |
| cbB=(*(bufcb++)-128)*KcbB; |
| |
| y=(bufy[j]-16)*Ky; |
| |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+1]-16)*Ky; |
| |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+yskip]-16)*Ky; |
| |
| tmp=(y+crR)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+cbB)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+1+yskip]-16)*Ky; |
| |
| tmp=(y+crR)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+cbB)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| } |
| bufoute+=oskip; |
| bufouto+=oskip; |
| bufy+=yskip<<1; |
| } |
| } |
| |
| |
| void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb) |
| { |
| int tmp; |
| int i, j; |
| __s32 y, crR, crG, cbG, cbB; |
| __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; |
| int oskip, yskip; |
| |
| oskip=RTjpeg_width*4; |
| yskip=RTjpeg_width; |
| |
| bufcb=&buf[RTjpeg_width*RTjpeg_height]; |
| bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2]; |
| bufy=&buf[0]; |
| bufoute=rgb; |
| bufouto=rgb+oskip; |
| |
| for(i=0; i<(RTjpeg_height>>1); i++) |
| { |
| for(j=0; j<RTjpeg_width; j+=2) |
| { |
| crR=(*bufcr-128)*KcrR; |
| crG=(*(bufcr++)-128)*KcrG; |
| cbG=(*bufcb-128)*KcbG; |
| cbB=(*(bufcb++)-128)*KcbB; |
| |
| y=(bufy[j]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| bufoute++; |
| |
| y=(bufy[j+1]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| bufoute++; |
| |
| y=(bufy[j+yskip]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| bufouto++; |
| |
| y=(bufy[j+1+yskip]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| bufouto++; |
| |
| } |
| bufoute+=oskip; |
| bufouto+=oskip; |
| bufy+=yskip<<1; |
| } |
| } |
| |
| void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb) |
| { |
| int tmp; |
| int i, j; |
| __s32 y, crR, crG, cbG, cbB; |
| __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; |
| int oskip, yskip; |
| |
| oskip=RTjpeg_width*3; |
| yskip=RTjpeg_width; |
| |
| bufcb=&buf[RTjpeg_width*RTjpeg_height]; |
| bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; |
| bufy=&buf[0]; |
| bufoute=rgb; |
| bufouto=rgb+oskip; |
| |
| for(i=0; i<(RTjpeg_height>>1); i++) |
| { |
| for(j=0; j<RTjpeg_width; j+=2) |
| { |
| crR=(*bufcr-128)*KcrR; |
| crG=(*(bufcr++)-128)*KcrG; |
| cbG=(*bufcb-128)*KcbG; |
| cbB=(*(bufcb++)-128)*KcbB; |
| |
| y=(bufy[j]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+1]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+yskip]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| y=(bufy[j+1+yskip]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); |
| |
| } |
| bufoute+=oskip; |
| bufouto+=oskip; |
| bufy+=yskip<<1; |
| } |
| } |
| |
| void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb) |
| { |
| int tmp; |
| int i, j; |
| __s32 y, crR, crG, cbG, cbB; |
| __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; |
| int oskip, yskip; |
| unsigned char r, g, b; |
| |
| oskip=RTjpeg_width*2; |
| yskip=RTjpeg_width; |
| |
| bufcb=&buf[RTjpeg_width*RTjpeg_height]; |
| bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; |
| bufy=&buf[0]; |
| bufoute=rgb; |
| bufouto=rgb+oskip; |
| |
| for(i=0; i<(RTjpeg_height>>1); i++) |
| { |
| for(j=0; j<RTjpeg_width; j+=2) |
| { |
| crR=(*bufcr-128)*KcrR; |
| crG=(*(bufcr++)-128)*KcrG; |
| cbG=(*bufcb-128)*KcbG; |
| cbB=(*(bufcb++)-128)*KcbB; |
| |
| y=(bufy[j]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| b=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| g=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| r=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(int)((int)b >> 3); |
| tmp|=(int)(((int)g >> 2) << 5); |
| tmp|=(int)(((int)r >> 3) << 11); |
| *(bufoute++)=tmp&0xff; |
| *(bufoute++)=tmp>>8; |
| |
| |
| y=(bufy[j+1]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| b=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| g=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| r=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(int)((int)b >> 3); |
| tmp|=(int)(((int)g >> 2) << 5); |
| tmp|=(int)(((int)r >> 3) << 11); |
| *(bufoute++)=tmp&0xff; |
| *(bufoute++)=tmp>>8; |
| |
| y=(bufy[j+yskip]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| b=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| g=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| r=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(int)((int)b >> 3); |
| tmp|=(int)(((int)g >> 2) << 5); |
| tmp|=(int)(((int)r >> 3) << 11); |
| *(bufouto++)=tmp&0xff; |
| *(bufouto++)=tmp>>8; |
| |
| y=(bufy[j+1+yskip]-16)*Ky; |
| |
| tmp=(y+cbB)>>16; |
| b=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y-crG-cbG)>>16; |
| g=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(y+crR)>>16; |
| r=(tmp>255)?255:((tmp<0)?0:tmp); |
| tmp=(int)((int)b >> 3); |
| tmp|=(int)(((int)g >> 2) << 5); |
| tmp|=(int)(((int)r >> 3) << 11); |
| *(bufouto++)=tmp&0xff; |
| *(bufouto++)=tmp>>8; |
| |
| } |
| bufoute+=oskip; |
| bufouto+=oskip; |
| bufy+=yskip<<1; |
| } |
| } |
| |
| void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb) |
| { |
| bcopy(buf, rgb, RTjpeg_width*RTjpeg_height); |
| } |
| |
| void RTjpeg_double32(__u32 *buf) |
| { |
| int i, j; |
| |
| __u32 *iptr, *optr1, *optr2; |
| |
| iptr=buf+(RTjpeg_width*RTjpeg_height)-1; |
| optr1=buf+(RTjpeg_width*RTjpeg_height*4)-1; |
| optr2=optr1-(2*RTjpeg_width); |
| |
| for(i=0; i<RTjpeg_height; i++) |
| { |
| for(j=0; j<RTjpeg_width; j++) |
| { |
| *(optr1--)=*iptr; |
| *(optr1--)=*iptr; |
| *(optr2--)=*iptr; |
| *(optr2--)=*(iptr--); |
| } |
| optr2=optr2-2*RTjpeg_width; |
| optr1=optr1-2*RTjpeg_width; |
| } |
| } |
| |
| void RTjpeg_double24(__u8 *buf) |
| { |
| } |
| |
| void RTjpeg_double16(__u16 *buf) |
| { |
| int i, j; |
| |
| __u16 *iptr, *optr1, *optr2; |
| |
| iptr=buf+(RTjpeg_width*RTjpeg_height)-1; |
| optr1=buf+(RTjpeg_width*RTjpeg_height*4)-1; |
| optr2=optr1-(2*RTjpeg_width); |
| |
| for(i=0; i<RTjpeg_height; i++) |
| { |
| for(j=0; j<RTjpeg_width; j++) |
| { |
| *(optr1--)=*iptr; |
| *(optr1--)=*iptr; |
| *(optr2--)=*iptr; |
| *(optr2--)=*(iptr--); |
| } |
| optr2=optr2-2*RTjpeg_width; |
| optr1=optr1-2*RTjpeg_width; |
| } |
| } |
| |
| void RTjpeg_double8(__u8 *buf) |
| { |
| int i, j; |
| |
| __u8 *iptr, *optr1, *optr2; |
| |
| iptr=buf+(RTjpeg_width*RTjpeg_height)-1; |
| optr1=buf+(RTjpeg_width*RTjpeg_height*4)-1; |
| optr2=optr1-(2*RTjpeg_width); |
| |
| for(i=0; i<RTjpeg_height; i++) |
| { |
| for(j=0; j<RTjpeg_width; j++) |
| { |
| *(optr1--)=*iptr; |
| *(optr1--)=*iptr; |
| *(optr2--)=*iptr; |
| *(optr2--)=*(iptr--); |
| } |
| optr2=optr2-2*RTjpeg_width; |
| optr1=optr1-2*RTjpeg_width; |
| } |
| } |
| |