Back to index

lightning-sunbird  0.9+nobinonly
jidctfst.c
Go to the documentation of this file.
00001 /*
00002  * jidctfst.c
00003  *
00004  * Copyright (C) 1994-1998, Thomas G. Lane.
00005  * This file is part of the Independent JPEG Group's software.
00006  * For conditions of distribution and use, see the accompanying README file.
00007  *
00008  * This file contains a fast, not so accurate integer implementation of the
00009  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
00010  * must also perform dequantization of the input coefficients.
00011  *
00012  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
00013  * on each row (or vice versa, but it's more convenient to emit a row at
00014  * a time).  Direct algorithms are also available, but they are much more
00015  * complex and seem not to be any faster when reduced to code.
00016  *
00017  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
00018  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
00019  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
00020  * JPEG textbook (see REFERENCES section in file README).  The following code
00021  * is based directly on figure 4-8 in P&M.
00022  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
00023  * possible to arrange the computation so that many of the multiplies are
00024  * simple scalings of the final outputs.  These multiplies can then be
00025  * folded into the multiplications or divisions by the JPEG quantization
00026  * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
00027  * to be done in the DCT itself.
00028  * The primary disadvantage of this method is that with fixed-point math,
00029  * accuracy is lost due to imprecise representation of the scaled
00030  * quantization values.  The smaller the quantization table entry, the less
00031  * precise the scaled value, so this implementation does worse with high-
00032  * quality-setting files than with low-quality ones.
00033  */
00034 
00035 #define JPEG_INTERNALS
00036 #include "jinclude.h"
00037 #include "jpeglib.h"
00038 #include "jdct.h"           /* Private declarations for DCT subsystem */
00039 
00040 
00041 #ifdef DCT_IFAST_SUPPORTED
00042 
00043 
00044 /*
00045  * This module is specialized to the case DCTSIZE = 8.
00046  */
00047 
00048 #if DCTSIZE != 8
00049   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
00050 #endif
00051 
00052 
00053 /* Scaling decisions are generally the same as in the LL&M algorithm;
00054  * see jidctint.c for more details.  However, we choose to descale
00055  * (right shift) multiplication products as soon as they are formed,
00056  * rather than carrying additional fractional bits into subsequent additions.
00057  * This compromises accuracy slightly, but it lets us save a few shifts.
00058  * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
00059  * everywhere except in the multiplications proper; this saves a good deal
00060  * of work on 16-bit-int machines.
00061  *
00062  * The dequantized coefficients are not integers because the AA&N scaling
00063  * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
00064  * so that the first and second IDCT rounds have the same input scaling.
00065  * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
00066  * avoid a descaling shift; this compromises accuracy rather drastically
00067  * for small quantization table entries, but it saves a lot of shifts.
00068  * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
00069  * so we use a much larger scaling factor to preserve accuracy.
00070  *
00071  * A final compromise is to represent the multiplicative constants to only
00072  * 8 fractional bits, rather than 13.  This saves some shifting work on some
00073  * machines, and may also reduce the cost of multiplication (since there
00074  * are fewer one-bits in the constants).
00075  */
00076 
/* CONST_BITS is the shift count MULTIPLY() uses to descale products with the
 * FIX_* constants; PASS1_BITS is the extra scaling removed again (together
 * with the factor of 8) by IDESCALE(..., PASS1_BITS+3) in the second pass.
 * Both branches use the same CONST_BITS; only PASS1_BITS differs.
 */
00077 #if BITS_IN_JSAMPLE == 8
00078 #define CONST_BITS  8
00079 #define PASS1_BITS  2
00080 #else
00081 #define CONST_BITS  8
00082 #define PASS1_BITS  1              /* lose a little precision to avoid overflow */
00083 #endif
00084 
00085 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
00086  * causing a lot of useless floating-point operations at run time.
00087  * To get around this we use the following pre-calculated constants.
00088  * If you change CONST_BITS you may want to add appropriate values.
00089  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
00090  */
00091 
/* For CONST_BITS == 8 the literals below are the named values times 2^8,
 * rounded to the nearest integer (1.082392200 * 256 = 277.09 -> 277,
 * 2.613125930 * 256 = 668.96 -> 669).  For any other CONST_BITS the values
 * are produced by the FIX() macro instead.
 */
00092 #if CONST_BITS == 8
00093 #define FIX_1_082392200  ((INT32)  277)          /* FIX(1.082392200) */
00094 #define FIX_1_414213562  ((INT32)  362)          /* FIX(1.414213562) */
00095 #define FIX_1_847759065  ((INT32)  473)          /* FIX(1.847759065) */
00096 #define FIX_2_613125930  ((INT32)  669)          /* FIX(2.613125930) */
00097 #else
00098 #define FIX_1_082392200  FIX(1.082392200)
00099 #define FIX_1_414213562  FIX(1.414213562)
00100 #define FIX_1_847759065  FIX(1.847759065)
00101 #define FIX_2_613125930  FIX(2.613125930)
00102 #endif
00103 
00104 
00105 /* We can gain a little more speed, with a further compromise in accuracy,
00106  * by omitting the addition in a descaling shift.  This yields an incorrectly
00107  * rounded result half the time...
00108  */
00109 
/* Unless USE_ACCURATE_ROUNDING is defined, any earlier DESCALE definition is
 * dropped and replaced by a bare RIGHT_SHIFT with no rounding term.
 */
00110 #ifndef USE_ACCURATE_ROUNDING
00111 #undef DESCALE
00112 #define DESCALE(x,n)  RIGHT_SHIFT(x, n)
00113 #endif
00114 
00115 
00116 /* Multiply a DCTELEM variable by an INT32 constant, and immediately
00117  * descale to yield a DCTELEM result.
       *
       * The product is shifted right by CONST_BITS through DESCALE and then cast
       * to DCTELEM.  Both arguments are parenthesised, so expressions such as
       * "z10 + z12" or "- FIX_2_613125930" may be passed directly.
00118  */
00119 
00120 #define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
00121 
00122 
00123 /* Dequantize a coefficient by multiplying it by the multiplier-table
00124  * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
00125  * multiplication will do.  For 12-bit data, the multiplier table is
00126  * declared INT32, so a 32-bit multiply will be used.
00127  */
00128 
/* 8-bit samples: cast the coefficient to IFAST_MULT_TYPE and multiply; no
 * shift is applied.
 * Other sample sizes: multiply and descale by IFAST_SCALE_BITS-PASS1_BITS,
 * which leaves the result scaled up by PASS1_BITS.
 */
00129 #if BITS_IN_JSAMPLE == 8
00130 #define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
00131 #else
00132 #define DEQUANTIZE(coef,quantval)  \
00133        DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
00134 #endif
00135 
00136 
00137 /* Like DESCALE, but applies to a DCTELEM and produces an int.
00138  * We assume that int right shift is unsigned if INT32 right shift is.
00139  */
00140 
/* When RIGHT_SHIFT_IS_UNSIGNED is defined, IRIGHT_SHIFT copies its argument
 * into ishift_temp and, for negative values, ORs the shifted result with an
 * all-ones mask shifted left by DCTELEMBITS-shft to restore the sign bits.
 * ishift_temp is the variable declared by ISHIFT_TEMPS, so a function using
 * IRIGHT_SHIFT/IDESCALE must place ISHIFT_TEMPS among its declarations.
 * Otherwise ISHIFT_TEMPS expands to nothing and the native >> is used.
 */
00141 #ifdef RIGHT_SHIFT_IS_UNSIGNED
00142 #define ISHIFT_TEMPS DCTELEM ishift_temp;
00143 #if BITS_IN_JSAMPLE == 8
00144 #define DCTELEMBITS  16            /* DCTELEM may be 16 or 32 bits */
00145 #else
00146 #define DCTELEMBITS  32            /* DCTELEM must be 32 bits */
00147 #endif
00148 #define IRIGHT_SHIFT(x,shft)  \
00149     ((ishift_temp = (x)) < 0 ? \
00150      (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
00151      (ishift_temp >> (shft)))
00152 #else
00153 #define ISHIFT_TEMPS
00154 #define IRIGHT_SHIFT(x,shft)       ((x) >> (shft))
00155 #endif
00156 
/* IDESCALE shifts right by n and casts to int.  With USE_ACCURATE_ROUNDING
 * it first adds 1 << (n-1), i.e. half of the divisor, so the result is
 * rounded; without it the shift is applied to the raw value.
 */
00157 #ifdef USE_ACCURATE_ROUNDING
00158 #define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
00159 #else
00160 #define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
00161 #endif
00162 
/* Forward declarations.
 * When HAVE_MMX_INTEL_MNEMONICS is defined, two implementations with the
 * same parameter list are declared: jpeg_idct_ifast_mmx and
 * jpeg_idct_ifast_orig.  The entry point jpeg_idct_ifast is declared in
 * every configuration.
 */
00163 #ifdef HAVE_MMX_INTEL_MNEMONICS
00164 __inline GLOBAL(void)
00165 jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00166                JCOEFPTR coef_block,
00167                JSAMPARRAY output_buf, JDIMENSION output_col);
00168 __inline GLOBAL(void)
00169 jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00170                JCOEFPTR coef_block,
00171                JSAMPARRAY output_buf, JDIMENSION output_col);
00172 #endif
00173 
00174 GLOBAL(void)
00175 jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
00176                JCOEFPTR coef_block,
00177                JSAMPARRAY output_buf, JDIMENSION output_col);
00178 
00179 
00180 #ifdef HAVE_MMX_INTEL_MNEMONICS
/*
 * MMX build of the entry point: a run-time dispatcher.
 * Calls jpeg_idct_ifast_mmx when MMXAvailable tests true and
 * jpeg_idct_ifast_orig otherwise, forwarding all five arguments unchanged.
 * NOTE(review): MMXAvailable is not declared in this part of the file;
 * presumably a flag set by CPU detection elsewhere -- confirm.
 */
00181 GLOBAL(void)
00182 jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00183                JCOEFPTR coef_block,
00184                JSAMPARRAY output_buf, JDIMENSION output_col)
00185 {
00186 if (MMXAvailable)
00187        jpeg_idct_ifast_mmx(cinfo, compptr, coef_block, output_buf, output_col);
00188 else
00189        jpeg_idct_ifast_orig(cinfo, compptr, coef_block, output_buf, output_col);
00190 }
00191 #else
00192 
00193 /*
00194  * Perform dequantization and inverse DCT on one block of coefficients.
       *
       * Plain-C entry point, compiled when HAVE_MMX_INTEL_MNEMONICS is not
       * defined.
       *
       * cinfo       - used only to obtain the range-limit table through
       *               IDCT_range_limit().
       * compptr     - supplies dct_table, read as an array of IFAST_MULT_TYPE
       *               multipliers.
       * coef_block  - input coefficients, read one column per iteration.
       * output_buf  - output rows; row ctr is written starting at
       *               output_buf[ctr] + output_col.
       * output_col  - offset added to every output row pointer.
       *
       * Pass 1 transforms DCTSIZE columns into the local int workspace; pass 2
       * transforms the workspace row by row and stores DCTSIZE samples per row.
00195  */
00196 
00197 GLOBAL (void)
00198 jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00199                JCOEFPTR coef_block,
00200                JSAMPARRAY output_buf, JDIMENSION output_col)
00201 {
00202   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
00203   DCTELEM tmp10, tmp11, tmp12, tmp13;
00204   DCTELEM z5, z10, z11, z12, z13;
00205   JCOEFPTR inptr;
00206   IFAST_MULT_TYPE * quantptr;
00207   int * wsptr;
00208   JSAMPROW outptr;
00209   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
00210   int ctr;
00211   int workspace[DCTSIZE2];  /* buffers data between passes */
00212   SHIFT_TEMPS               /* for DESCALE */
00213   ISHIFT_TEMPS                     /* for IDESCALE */
00214 
00215   /* Pass 1: process columns from input, store into work array. */
00216 
00217   inptr = coef_block;
00218   quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
00219   wsptr = workspace;
00220   for (ctr = DCTSIZE; ctr > 0; ctr--) {
00221     /* Due to quantization, we will usually find that many of the input
00222      * coefficients are zero, especially the AC terms.  We can exploit this
00223      * by short-circuiting the IDCT calculation for any column in which all
00224      * the AC terms are zero.  In that case each output is equal to the
00225      * DC coefficient (with scale factor as needed).
00226      * With typical images and quantization tables, half or more of the
00227      * column DCT calculations can be simplified this way.
00228      */
00229     
00230     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
00231        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
00232        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
00233        inptr[DCTSIZE*7] == 0) {
00234       /* AC terms all zero */
00235       int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00236 
00237       wsptr[DCTSIZE*0] = dcval;
00238       wsptr[DCTSIZE*1] = dcval;
00239       wsptr[DCTSIZE*2] = dcval;
00240       wsptr[DCTSIZE*3] = dcval;
00241       wsptr[DCTSIZE*4] = dcval;
00242       wsptr[DCTSIZE*5] = dcval;
00243       wsptr[DCTSIZE*6] = dcval;
00244       wsptr[DCTSIZE*7] = dcval;
00245       
00246       inptr++;                     /* advance pointers to next column */
00247       quantptr++;
00248       wsptr++;
00249       continue;
00250     }
00251     
00252     /* Even part */
00253 
            /* tmp0..tmp3 first hold the dequantized even-row inputs (rows 0, 2, 4, 6)
             * and are overwritten with the even-part results at "phase 2" below, so
             * the statement order in this section matters.
             */
00254     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00255     tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
00256     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
00257     tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
00258 
00259     tmp10 = tmp0 + tmp2;    /* phase 3 */
00260     tmp11 = tmp0 - tmp2;
00261 
00262     tmp13 = tmp1 + tmp3;    /* phases 5-3 */
00263     tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
00264 
00265     tmp0 = tmp10 + tmp13;   /* phase 2 */
00266     tmp3 = tmp10 - tmp13;
00267     tmp1 = tmp11 + tmp12;
00268     tmp2 = tmp11 - tmp12;
00269     
00270     /* Odd part */
00271 
            /* tmp4..tmp7 hold the dequantized odd-row inputs (rows 1, 3, 5, 7);
             * tmp10..tmp12 are reused here as odd-part scratch values.
             */
00272     tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
00273     tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
00274     tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
00275     tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
00276 
00277     z13 = tmp6 + tmp5;             /* phase 6 */
00278     z10 = tmp6 - tmp5;
00279     z11 = tmp4 + tmp7;
00280     z12 = tmp4 - tmp7;
00281 
00282     tmp7 = z11 + z13;              /* phase 5 */
00283     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
00284 
00285     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
00286     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
00287     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
00288 
00289     tmp6 = tmp12 - tmp7;    /* phase 2 */
00290     tmp5 = tmp11 - tmp6;
00291     tmp4 = tmp10 + tmp5;
00292 
            /* Each even-part value is combined with one odd-part value: the sum goes
             * to row k and the difference to row 7-k (rows 4 and 3 for tmp3/tmp4).
             */
00293     wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
00294     wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
00295     wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
00296     wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
00297     wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
00298     wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
00299     wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
00300     wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
00301 
00302     inptr++;                /* advance pointers to next column */
00303     quantptr++;
00304     wsptr++;
00305   }
00306   
00307   /* Pass 2: process rows from work array, store into output array. */
00308   /* Note that we must descale the results by a factor of 8 == 2**3, */
00309   /* and also undo the PASS1_BITS scaling. */
00310 
00311   wsptr = workspace;
00312   for (ctr = 0; ctr < DCTSIZE; ctr++) {
00313     outptr = output_buf[ctr] + output_col;
00314     /* Rows of zeroes can be exploited in the same way as we did with columns.
00315      * However, the column calculation has created many nonzero AC terms, so
00316      * the simplification applies less often (typically 5% to 10% of the time).
00317      * On machines with very fast multiplication, it's possible that the
00318      * test takes more time than it's worth.  In that case this section
00319      * may be commented out.
00320      */
00321     
00322 #ifndef NO_ZERO_ROW_TEST
00323     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
00324        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
00325       /* AC terms all zero */
00326       JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
00327                               & RANGE_MASK];
00328       
00329       outptr[0] = dcval;
00330       outptr[1] = dcval;
00331       outptr[2] = dcval;
00332       outptr[3] = dcval;
00333       outptr[4] = dcval;
00334       outptr[5] = dcval;
00335       outptr[6] = dcval;
00336       outptr[7] = dcval;
00337 
00338       wsptr += DCTSIZE;            /* advance pointer to next row */
00339       continue;
00340     }
00341 #endif
00342     
00343     /* Even part */
00344 
            /* Same computation as pass 1, but the inputs are the workspace values
             * (cast to DCTELEM), so no dequantization is performed here.
             */
00345     tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
00346     tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
00347 
00348     tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
00349     tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
00350            - tmp13;
00351 
00352     tmp0 = tmp10 + tmp13;
00353     tmp3 = tmp10 - tmp13;
00354     tmp1 = tmp11 + tmp12;
00355     tmp2 = tmp11 - tmp12;
00356 
00357     /* Odd part */
00358 
00359     z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
00360     z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
00361     z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
00362     z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
00363 
00364     tmp7 = z11 + z13;              /* phase 5 */
00365     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
00366 
00367     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
00368     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
00369     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
00370 
00371     tmp6 = tmp12 - tmp7;    /* phase 2 */
00372     tmp5 = tmp11 - tmp6;
00373     tmp4 = tmp10 + tmp5;
00374 
00375     /* Final output stage: scale down by a factor of 8 and range-limit */
00376 
            /* Each result is descaled by PASS1_BITS+3, masked with RANGE_MASK and
             * then used as an index into range_limit to obtain the stored sample.
             */
00377     outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
00378                          & RANGE_MASK];
00379     outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
00380                          & RANGE_MASK];
00381     outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
00382                          & RANGE_MASK];
00383     outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
00384                          & RANGE_MASK];
00385     outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
00386                          & RANGE_MASK];
00387     outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
00388                          & RANGE_MASK];
00389     outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
00390                          & RANGE_MASK];
00391     outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
00392                          & RANGE_MASK];
00393 
00394     wsptr += DCTSIZE;              /* advance pointer to next row */
00395   }
00396 }
00397 
00398 #endif
00399 
00400 #ifdef HAVE_MMX_INTEL_MNEMONICS
00401 
00402 
/*
 * Plain-C dequantization and inverse DCT on one block of coefficients, used
 * in MMX builds as the branch the jpeg_idct_ifast dispatcher takes when
 * MMXAvailable tests false.
 *
 * cinfo       - used only to obtain the range-limit table through
 *               IDCT_range_limit().
 * compptr     - supplies dct_table, read as an array of IFAST_MULT_TYPE
 *               multipliers.
 * coef_block  - input coefficients, read one column per iteration.
 * output_buf  - output rows; row ctr is written starting at
 *               output_buf[ctr] + output_col.
 * output_col  - offset added to every output row pointer.
 *
 * The arithmetic matches the non-MMX jpeg_idct_ifast; the visible difference
 * is that the all-zero-AC tests OR the terms together and compare once
 * instead of using a chain of && comparisons.
 *
 * NOTE(review): this definition is spelled "_inline" while the forward
 * declaration earlier in the file uses "__inline" -- confirm both spellings
 * are accepted by every compiler this branch is built with.
 */
00403 _inline GLOBAL(void)
00404 jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00405                JCOEFPTR coef_block,
00406                JSAMPARRAY output_buf, JDIMENSION output_col)
00407 {
00408   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
00409   DCTELEM tmp10, tmp11, tmp12, tmp13;
00410   DCTELEM z5, z10, z11, z12, z13;
00411   JCOEFPTR inptr;
00412   IFAST_MULT_TYPE * quantptr;
00413   int * wsptr;
00414   JSAMPROW outptr;
00415   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
00416   int ctr;
00417   int workspace[DCTSIZE2];  /* buffers data between passes */
00418   SHIFT_TEMPS               /* for DESCALE */
00419   ISHIFT_TEMPS                     /* for IDESCALE */
00420 
00421   /* Pass 1: process columns from input, store into work array. */
00422 
00423   inptr = coef_block;
00424   quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
00425   wsptr = workspace;
00426   for (ctr = DCTSIZE; ctr > 0; ctr--) {
00427     /* Due to quantization, we will usually find that many of the input
00428      * coefficients are zero, especially the AC terms.  We can exploit this
00429      * by short-circuiting the IDCT calculation for any column in which all
00430      * the AC terms are zero.  In that case each output is equal to the
00431      * DC coefficient (with scale factor as needed).
00432      * With typical images and quantization tables, half or more of the
00433      * column DCT calculations can be simplified this way.
00434      */
00435     
            /* The OR of the seven AC terms is zero only if every term is zero. */
00436     if ((inptr[DCTSIZE*1] | inptr[DCTSIZE*2] | inptr[DCTSIZE*3] |
00437         inptr[DCTSIZE*4] | inptr[DCTSIZE*5] | inptr[DCTSIZE*6] |
00438         inptr[DCTSIZE*7]) == 0) {
00439       /* AC terms all zero */
00440       int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00441 
00442       wsptr[DCTSIZE*0] = dcval;
00443       wsptr[DCTSIZE*1] = dcval;
00444       wsptr[DCTSIZE*2] = dcval;
00445       wsptr[DCTSIZE*3] = dcval;
00446       wsptr[DCTSIZE*4] = dcval;
00447       wsptr[DCTSIZE*5] = dcval;
00448       wsptr[DCTSIZE*6] = dcval;
00449       wsptr[DCTSIZE*7] = dcval;
00450       
00451       inptr++;                     /* advance pointers to next column */
00452       quantptr++;
00453       wsptr++;
00454       continue;
00455     }
00456     
00457     /* Even part */
00458 
            /* tmp0..tmp3 first hold the dequantized even-row inputs (rows 0, 2, 4, 6)
             * and are overwritten with the even-part results at "phase 2" below, so
             * the statement order in this section matters.
             */
00459     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00460     tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
00461     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
00462     tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
00463 
00464     tmp10 = tmp0 + tmp2;    /* phase 3 */
00465     tmp11 = tmp0 - tmp2;
00466 
00467     tmp13 = tmp1 + tmp3;    /* phases 5-3 */
00468     tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
00469 
00470     tmp0 = tmp10 + tmp13;   /* phase 2 */
00471     tmp3 = tmp10 - tmp13;
00472     tmp1 = tmp11 + tmp12;
00473     tmp2 = tmp11 - tmp12;
00474     
00475     /* Odd part */
00476 
            /* tmp4..tmp7 hold the dequantized odd-row inputs (rows 1, 3, 5, 7);
             * tmp10..tmp12 are reused here as odd-part scratch values.
             */
00477     tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
00478     tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
00479     tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
00480     tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
00481 
00482     z13 = tmp6 + tmp5;             /* phase 6 */
00483     z10 = tmp6 - tmp5;
00484     z11 = tmp4 + tmp7;
00485     z12 = tmp4 - tmp7;
00486 
00487     tmp7 = z11 + z13;              /* phase 5 */
00488     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
00489 
00490     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
00491     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
00492     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
00493 
00494     tmp6 = tmp12 - tmp7;    /* phase 2 */
00495     tmp5 = tmp11 - tmp6;
00496     tmp4 = tmp10 + tmp5;
00497 
            /* Each even-part value is combined with one odd-part value: the sum goes
             * to row k and the difference to row 7-k (rows 4 and 3 for tmp3/tmp4).
             */
00498     wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
00499     wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
00500     wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
00501     wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
00502     wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
00503     wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
00504     wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
00505     wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
00506 
00507     inptr++;                /* advance pointers to next column */
00508     quantptr++;
00509     wsptr++;
00510   }
00511   
00512   /* Pass 2: process rows from work array, store into output array. */
00513   /* Note that we must descale the results by a factor of 8 == 2**3, */
00514   /* and also undo the PASS1_BITS scaling. */
00515 
00516   wsptr = workspace;
00517   for (ctr = 0; ctr < DCTSIZE; ctr++) {
00518     outptr = output_buf[ctr] + output_col;
00519     /* Rows of zeroes can be exploited in the same way as we did with columns.
00520      * However, the column calculation has created many nonzero AC terms, so
00521      * the simplification applies less often (typically 5% to 10% of the time).
00522      * On machines with very fast multiplication, it's possible that the
00523      * test takes more time than it's worth.  In that case this section
00524      * may be commented out.
00525      */
00526     
00527 #ifndef NO_ZERO_ROW_TEST
00528     if ((wsptr[1] | wsptr[2] | wsptr[3] | wsptr[4] | wsptr[5] | wsptr[6] |
00529         wsptr[7]) == 0) {
00530       /* AC terms all zero */
00531       JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
00532                               & RANGE_MASK];
00533       
00534       outptr[0] = dcval;
00535       outptr[1] = dcval;
00536       outptr[2] = dcval;
00537       outptr[3] = dcval;
00538       outptr[4] = dcval;
00539       outptr[5] = dcval;
00540       outptr[6] = dcval;
00541       outptr[7] = dcval;
00542 
00543       wsptr += DCTSIZE;            /* advance pointer to next row */
00544       continue;
00545     }
00546 #endif
00547     
00548     /* Even part */
00549 
            /* Same computation as pass 1, but the inputs are the workspace values
             * (cast to DCTELEM), so no dequantization is performed here.
             */
00550     tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
00551     tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
00552 
00553     tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
00554     tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
00555            - tmp13;
00556 
00557     tmp0 = tmp10 + tmp13;
00558     tmp3 = tmp10 - tmp13;
00559     tmp1 = tmp11 + tmp12;
00560     tmp2 = tmp11 - tmp12;
00561 
00562     /* Odd part */
00563 
00564     z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
00565     z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
00566     z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
00567     z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
00568 
00569     tmp7 = z11 + z13;              /* phase 5 */
00570     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
00571 
00572     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
00573     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
00574     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
00575 
00576     tmp6 = tmp12 - tmp7;    /* phase 2 */
00577     tmp5 = tmp11 - tmp6;
00578     tmp4 = tmp10 + tmp5;
00579 
00580     /* Final output stage: scale down by a factor of 8 and range-limit */
00581 
            /* Each result is descaled by PASS1_BITS+3, masked with RANGE_MASK and
             * then used as an index into range_limit to obtain the stored sample.
             */
00582     outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
00583                          & RANGE_MASK];
00584     outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
00585                          & RANGE_MASK];
00586     outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
00587                          & RANGE_MASK];
00588     outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
00589                          & RANGE_MASK];
00590     outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
00591                          & RANGE_MASK];
00592     outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
00593                          & RANGE_MASK];
00594     outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
00595                          & RANGE_MASK];
00596     outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
00597                          & RANGE_MASK];
00598 
00599     wsptr += DCTSIZE;              /* advance pointer to next row */
00600   }
00601 }
00602 
00603 
/* Packed multipliers for the MMX IDCT: each 64-bit value repeats one signed
 * 16-bit word four times.  The fix_* words are the named factor times 2^14
 * (0x5a82 = 23170 ~ 1.414213562 * 16384).  The assembly shifts its operand
 * left by 2 before pmulhw, so the high word of the product is
 * operand * factor.
 *
 * fix_184n261 (1.847759065 - 2.613125930) and fix_108n184
 * (1.082392200 - 1.847759065) are both about -0.76537, which is why they
 * share the word 0xcf04 (-12540).
 *
 * fix_n184 must be the negation of fix_184 (0x7641 = 30273), i.e. 0x89bf.
 * The previous word 0x896f is -30353, about -1.8526 rather than -1.8478,
 * so the -FIX_1_847759065 product was roughly 0.26% too large in magnitude.
 */
00604        static   __int64 fix_141           = 0x5a825a825a825a82;   /*  1.414213562 */
00605        static   __int64 fix_184n261       = 0xcf04cf04cf04cf04;   /*  1.847759065 - 2.613125930 */
00606        static   __int64 fix_184           = 0x7641764176417641;   /*  1.847759065 */
00607        static   __int64 fix_n184          = 0x89bf89bf89bf89bf;   /* -1.847759065 */
00608        static   __int64 fix_108n184       = 0xcf04cf04cf04cf04;   /*  1.082392200 - 1.847759065 */
00609        static   __int64 const_0x0080      = 0x0080008000800080;   /* 0x0080 in each word */
00610 
00611 
00612 __inline GLOBAL(void)
00613 jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00614                JCOEFPTR inptr,
00615                JSAMPARRAY outptr, JDIMENSION output_col)
00616 {
00617 
00618   int16 workspace[DCTSIZE2 + 4];   /* buffers data between passes */
00619   int16 *wsptr=workspace;
00620   int16 *quantptr=compptr->dct_table;
00621 
00622   __asm{ 
00623     
00624        mov           edi, quantptr
00625        mov           ebx, inptr
00626        mov           esi, wsptr
00627        add           esi, 0x07            ;align wsptr to qword
00628        and           esi, 0xfffffff8      ;align wsptr to qword
00629 
00630        mov           eax, esi
00631 
00632     /* Odd part */
00633 
00634 
00635        movq          mm1, [ebx + 8*10]           ;load inptr[DCTSIZE*5]
00636 
00637        pmullw        mm1, [edi + 8*10]           ;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
00638 
00639        movq          mm0, [ebx + 8*6]            ;load inptr[DCTSIZE*3]
00640 
00641        pmullw        mm0, [edi + 8*6]            ;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
00642 
00643        movq          mm3, [ebx + 8*2]            ;load inptr[DCTSIZE*1]
00644        movq   mm2, mm1                                  ;copy tmp6    /* phase 6 */
00645 
00646        pmullw        mm3, [edi + 8*2]            ;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
00647 
00648        movq          mm4, [ebx + 8*14]           ;load inptr[DCTSIZE*1]
00649        paddw  mm1, mm0                                  ;z13 = tmp6 + tmp5;
00650 
00651        pmullw        mm4, [edi + 8*14]        ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
00652        psubw  mm2, mm0                                  ;z10 = tmp6 - tmp5   
00653 
00654        psllw         mm2, 2                      ;shift z10
00655        movq          mm0, mm2                    ;copy z10
00656 
00657        pmulhw        mm2, fix_184n261     ;MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
00658        movq          mm5, mm3                           ;copy tmp4
00659 
00660        pmulhw        mm0, fix_n184        ;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
00661        paddw         mm3, mm4                           ;z11 = tmp4 + tmp7;
00662 
00663        movq          mm6, mm3                           ;copy z11                   /* phase 5 */
00664        psubw         mm5, mm4                           ;z12 = tmp4 - tmp7;
00665 
00666        psubw         mm6, mm1                           ;z11-z13
00667        psllw         mm5, 2                      ;shift z12
00668 
00669        movq          mm4, [ebx + 8*12]           ;load inptr[DCTSIZE*6], even part
00670        movq          mm7, mm5                    ;copy z12
00671 
00672        pmulhw        mm5, fix_108n184     ;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
00673        paddw         mm3, mm1                           ;tmp7 = z11 + z13;   
00674 
00675 
00676     /* Even part */
00677        pmulhw        mm7, fix_184         ;MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
00678        psllw         mm6, 2
00679 
00680        movq          mm1, [ebx + 8*4]            ;load inptr[DCTSIZE*2]
00681 
00682        pmullw        mm1, [edi + 8*4]            ;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
00683        paddw         mm0, mm5                    ;tmp10
00684 
00685        pmullw        mm4, [edi + 8*12]           ;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
00686        paddw         mm2, mm7                    ;tmp12
00687 
00688        pmulhw        mm6, fix_141                ;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
00689        psubw         mm2, mm3             ;tmp6 = tmp12 - tmp7
00690 
00691        movq          mm5, mm1                           ;copy tmp1
00692        paddw         mm1, mm4                           ;tmp13= tmp1 + tmp3; /* phases 5-3 */
00693 
00694        psubw         mm5, mm4                           ;tmp1-tmp3
00695        psubw         mm6, mm2             ;tmp5 = tmp11 - tmp6;
00696 
00697        movq          [esi+8*0], mm1                     ;save tmp13 in workspace
00698        psllw         mm5, 2                             ;shift tmp1-tmp3
00699     
00700        movq          mm7, [ebx + 8*0]            ;load inptr[DCTSIZE*0]
00701 
00702        pmulhw        mm5, fix_141                ;MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
00703        paddw         mm0, mm6             ;tmp4 = tmp10 + tmp5;
00704 
00705        pmullw        mm7, [edi + 8*0]            ;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00706 
00707        movq          mm4, [ebx + 8*8]            ;load inptr[DCTSIZE*4]
00708        
00709        pmullw        mm4, [edi + 8*8]            ;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
00710        psubw         mm5, mm1                           ;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
00711 
00712        movq          [esi+8*4], mm0              ;save tmp4 in workspace
00713        movq          mm1, mm7                    ;copy tmp0    /* phase 3 */
00714 
00715        movq          [esi+8*2], mm5              ;save tmp12 in workspace
00716        psubw         mm1, mm4                    ;tmp11 = tmp0 - tmp2; 
00717 
00718        paddw         mm7, mm4                    ;tmp10 = tmp0 + tmp2;
00719     movq             mm5, mm1             ;copy tmp11
00720        
00721        paddw         mm1, [esi+8*2]       ;tmp1 = tmp11 + tmp12;
00722        movq          mm4, mm7             ;copy tmp10          /* phase 2 */
00723 
00724        paddw         mm7, [esi+8*0]       ;tmp0 = tmp10 + tmp13;      
00725 
00726        psubw         mm4, [esi+8*0]       ;tmp3 = tmp10 - tmp13;
00727        movq          mm0, mm7             ;copy tmp0
00728 
00729        psubw         mm5, [esi+8*2]       ;tmp2 = tmp11 - tmp12;
00730        paddw         mm7, mm3             ;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
00731        
00732        psubw         mm0, mm3                    ;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
00733 
00734        movq          [esi + 8*0], mm7     ;wsptr[DCTSIZE*0]
00735        movq          mm3, mm1                    ;copy tmp1
00736 
00737        movq          [esi + 8*14], mm0    ;wsptr[DCTSIZE*7]
00738        paddw         mm1, mm2                    ;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
00739 
00740        psubw         mm3, mm2                    ;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
00741 
00742        movq          [esi + 8*2], mm1     ;wsptr[DCTSIZE*1]
00743        movq          mm1, mm4                    ;copy tmp3
00744 
00745        movq          [esi + 8*12], mm3    ;wsptr[DCTSIZE*6]
00746 
00747        paddw         mm4, [esi+8*4]              ;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
00748 
00749        psubw         mm1, [esi+8*4]              ;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
00750 
00751        movq          [esi + 8*8], mm4            ;store wsptr[DCTSIZE*4] (tmp3 + tmp4)
00752        movq          mm7, mm5                    ;copy tmp2
00753 
00754        paddw         mm5, mm6                    ;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
00755 
00756        movq          [esi+8*6], mm1              ;store wsptr[DCTSIZE*3] (tmp3 - tmp4)
00757        psubw         mm7, mm6                    ;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
00758 
00759        movq          [esi + 8*4], mm5            ;store wsptr[DCTSIZE*2]; overwrites the slot that held tmp4, which was last read above
00760 
00761        movq          [esi + 8*10], mm7           ;store wsptr[DCTSIZE*5]
00762 
00763 
00764 
00765 /*****************************************************************/
00766        add           edi, 8                      ;edi (the quantptr operand in the comments): step 8 bytes = four 16-bit entries
00767        add           ebx, 8                      ;ebx (the inptr operand in the comments): step 8 bytes = four 16-bit entries
00768        add           esi, 8                      ;esi (workspace): step 8 bytes so the stores below land in the next four 16-bit lanes of each row
00769 
00770 /*****************************************************************/
00771 
00772        ;Pass 1 column IDCT for the four 16-bit lanes now addressed by ebx/edi/esi:
00773        ;dequantize the inptr rows (ebx) with the quantptr rows (edi), run the butterflies,
00774        ;and store the eight results to the workspace rows at esi (rows are 16 bytes apart).
00775        movq          mm1, [ebx + 8*10]           ;load inptr[DCTSIZE*5]
00776 
00777        pmullw        mm1, [edi + 8*10]           ;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
00778 
00779        movq          mm0, [ebx + 8*6]            ;load inptr[DCTSIZE*3]
00780 
00781        pmullw        mm0, [edi + 8*6]            ;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
00782 
00783        movq          mm3, [ebx + 8*2]            ;load inptr[DCTSIZE*1]
00784        movq   mm2, mm1                                  ;copy tmp6    /* phase 6 */
00785 
00786        pmullw        mm3, [edi + 8*2]            ;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
00787 
00788        movq          mm4, [ebx + 8*14]           ;load inptr[DCTSIZE*7]
00789        paddw  mm1, mm0                                  ;z13 = tmp6 + tmp5;
00790 
00791        pmullw        mm4, [edi + 8*14]        ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
00792        psubw  mm2, mm0                                  ;z10 = tmp6 - tmp5   
00793 
00794        psllw         mm2, 2                      ;shift z10
00795        movq          mm0, mm2                    ;copy z10
00796 
00797        pmulhw        mm2, fix_184n261     ;MULTIPLY(z10, (FIX_1_847759065 - FIX_2_613125930)); /* -2*(c2+c6) */
00798        movq          mm5, mm3                           ;copy tmp4
00799 
00800        pmulhw        mm0, fix_n184        ;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
00801        paddw         mm3, mm4                           ;z11 = tmp4 + tmp7;
00802 
00803        movq          mm6, mm3                           ;copy z11                   /* phase 5 */
00804        psubw         mm5, mm4                           ;z12 = tmp4 - tmp7;
00805 
00806        psubw         mm6, mm1                           ;z11-z13
00807        psllw         mm5, 2                      ;shift z12
00808 
00809        movq          mm4, [ebx + 8*12]           ;load inptr[DCTSIZE*6], even part
00810        movq          mm7, mm5                    ;copy z12
00811 
00812        pmulhw        mm5, fix_108n184     ;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
00813        paddw         mm3, mm1                           ;tmp7 = z11 + z13;   
00814 
00815 
00816     /* Even part */
00817        pmulhw        mm7, fix_184         ;MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ (odd-part product, interleaved with the even-part loads)
00818        psllw         mm6, 2                      ;shift z11-z13
00819 
00820        movq          mm1, [ebx + 8*4]            ;load inptr[DCTSIZE*2]
00821 
00822        pmullw        mm1, [edi + 8*4]            ;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
00823        paddw         mm0, mm5                    ;tmp10 = z10*(-FIX_1_847759065) + z12*(FIX_1_08-FIX_1_84)
00824 
00825        pmullw        mm4, [edi + 8*12]           ;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
00826        paddw         mm2, mm7                    ;tmp12 = z10*(FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065
00827 
00828        pmulhw        mm6, fix_141                ;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
00829        psubw         mm2, mm3             ;tmp6 = tmp12 - tmp7
00830 
00831        movq          mm5, mm1                           ;copy tmp1
00832        paddw         mm1, mm4                           ;tmp13= tmp1 + tmp3; /* phases 5-3 */
00833 
00834        psubw         mm5, mm4                           ;tmp1-tmp3
00835        psubw         mm6, mm2             ;tmp5 = tmp11 - tmp6;
00836 
00837        movq          [esi+8*0], mm1                     ;save tmp13 in workspace
00838        psllw         mm5, 2                             ;shift tmp1-tmp3
00839     
00840        movq          mm7, [ebx + 8*0]            ;load inptr[DCTSIZE*0]
00841        paddw         mm0, mm6             ;tmp4 = tmp10 + tmp5;
00842 
00843        pmulhw        mm5, fix_141                ;MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
00844 
00845        pmullw        mm7, [edi + 8*0]            ;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00846 
00847        movq          mm4, [ebx + 8*8]            ;load inptr[DCTSIZE*4]
00848        
00849        pmullw        mm4, [edi + 8*8]            ;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
00850        psubw         mm5, mm1                           ;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
00851 
00852        movq          [esi+8*4], mm0              ;save tmp4 in workspace
00853        movq          mm1, mm7                    ;copy tmp0    /* phase 3 */
00854 
00855        movq          [esi+8*2], mm5              ;save tmp12 in workspace
00856        psubw         mm1, mm4                    ;tmp11 = tmp0 - tmp2; 
00857 
00858        paddw         mm7, mm4                    ;tmp10 = tmp0 + tmp2;
00859     movq             mm5, mm1             ;copy tmp11
00860        
00861        paddw         mm1, [esi+8*2]       ;tmp1 = tmp11 + tmp12;
00862        movq          mm4, mm7             ;copy tmp10          /* phase 2 */
00863 
00864        paddw         mm7, [esi+8*0]       ;tmp0 = tmp10 + tmp13;      
00865 
00866        psubw         mm4, [esi+8*0]       ;tmp3 = tmp10 - tmp13;
00867        movq          mm0, mm7             ;copy tmp0
00868 
00869        psubw         mm5, [esi+8*2]       ;tmp2 = tmp11 - tmp12;
00870        paddw         mm7, mm3             ;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
00871        
00872        psubw         mm0, mm3                    ;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
00873 
00874        movq          [esi + 8*0], mm7     ;wsptr[DCTSIZE*0]
00875        movq          mm3, mm1                    ;copy tmp1
00876 
00877        movq          [esi + 8*14], mm0    ;wsptr[DCTSIZE*7]
00878        paddw         mm1, mm2                    ;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
00879 
00880        psubw         mm3, mm2                    ;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
00881 
00882        movq          [esi + 8*2], mm1     ;wsptr[DCTSIZE*1]
00883        movq          mm1, mm4                    ;copy tmp3
00884 
00885        movq          [esi + 8*12], mm3    ;wsptr[DCTSIZE*6]
00886 
00887        paddw         mm4, [esi+8*4]              ;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
00888 
00889        psubw         mm1, [esi+8*4]              ;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
00890 
00891        movq          [esi + 8*8], mm4            ;store wsptr[DCTSIZE*4]
00892        movq          mm7, mm5                    ;copy tmp2
00893 
00894        paddw         mm5, mm6                    ;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
00895 
00896        movq          [esi+8*6], mm1              ;store wsptr[DCTSIZE*3]
00897        psubw         mm7, mm6                    ;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
00898 
00899        movq          [esi + 8*4], mm5            ;store wsptr[DCTSIZE*2]; overwrites the slot that held tmp4, which was last read above
00900 
00901        movq          [esi + 8*10], mm7           ;store wsptr[DCTSIZE*5]
00902 
00903 
00904 
00905 
00906 /*****************************************************************/
00907 
00908   /* Pass 2: process rows from work array, store into output array. */
00909   /* Note that we must descale the results by a factor of 8 == 2**3, */
00910   /* and also undo the PASS1_BITS scaling. */
00911 
00912 /*****************************************************************/
00913     /* Even part */
00914 
00915        mov                  esi, eax             ;esi = eax; presumably the workspace start saved in eax before this excerpt -- TODO confirm
00916        mov                  eax, outptr          ;eax = outptr; used below as a cursor: row pointers are read with mov reg,[eax] and eax is stepped by 4
00917 
00918 //    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
00919 //    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
00920 //    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
00921 //    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
00922        movq          mm0, [esi+8*0]              ;wsptr[0,0],[0,1],[0,2],[0,3]
00923 
00924        movq          mm1, [esi+8*1]              ;wsptr[0,4],[0,5],[0,6],[0,7]
00925        movq          mm2, mm0
00926        
00927        movq          mm3, [esi+8*2]              ;wsptr[1,0],[1,1],[1,2],[1,3]
00928        paddw         mm0, mm1                    ;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
00929 
00930        movq          mm4, [esi+8*3]              ;wsptr[1,4],[1,5],[1,6],[1,7]
00931        psubw         mm2, mm1                    ;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
00932 
00933        movq          mm6, mm0
00934        movq          mm5, mm3
00935        
00936        paddw         mm3, mm4                    ;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
00937        movq          mm1, mm2
00938 
00939        psubw         mm5, mm4                    ;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
00940        punpcklwd     mm0, mm3                    ;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
00941 
00942        movq          mm7, [esi+8*7]              ;wsptr[3,4],[3,5],[3,6],[3,7]
00943        punpckhwd     mm6, mm3                    ;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
00944 
00945        movq          mm3, [esi+8*4]              ;wsptr[2,0],[2,1],[2,2],[2,3]
00946        punpckldq     mm0, mm6      ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
00947 
00948        punpcklwd     mm1, mm5                    ;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
00949        movq          mm4, mm3
00950 
00951        movq          mm6, [esi+8*6]              ;wsptr[3,0],[3,1],[3,2],[3,3]
00952        punpckhwd     mm2, mm5                    ;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
00953 
00954        movq          mm5, [esi+8*5]              ;wsptr[2,4],[2,5],[2,6],[2,7]
00955        punpckldq     mm1, mm2      ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
00956 
00957        
00958        paddw         mm3, mm5                    ;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
00959        movq          mm2, mm6
00960 
00961        psubw         mm4, mm5                    ;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
00962        paddw         mm6, mm7                    ;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
00963 
00964        movq          mm5, mm3
00965        punpcklwd     mm3, mm6                    ;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
00966        
00967        psubw         mm2, mm7                    ;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
00968        punpckhwd     mm5, mm6                    ;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
00969 
00970        movq          mm7, mm4
00971        punpckldq     mm3, mm5      ;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
00972 
00973        punpcklwd     mm4, mm2                    ;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
00974 
00975        punpckhwd     mm7, mm2                    ;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
00976 
00977        punpckldq     mm4, mm7      ;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
00978        movq          mm6, mm1
00979 
00980 //     mm0 =  ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
00981 //     mm1 =  ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
00982 
00983 
00984        movq          mm2, mm0
00985        punpckhdq     mm6, mm4      ;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
00986 
00987        punpckldq     mm1, mm4      ;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
00988        psllw         mm6, 2
00989 
00990        pmulhw        mm6, fix_141
00991        punpckldq     mm0, mm3      ;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
00992 
00993        punpckhdq     mm2, mm3      ;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
00994        movq          mm7, mm0
00995 
00996 //    tmp0 = tmp10 + tmp13;
00997 //    tmp3 = tmp10 - tmp13;
00998        paddw         mm0, mm2      ;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
00999        psubw         mm7, mm2      ;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
01000 
01001 //    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
01002        psubw         mm6, mm2      ;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
01003 //    tmp1 = tmp11 + tmp12;
01004 //    tmp2 = tmp11 - tmp12;
01005        movq          mm5, mm1
01006 
01007 
01008 
01009     /* Odd part */
01010 
01011 //    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
01012 //    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
01013 //    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
01014 //    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
01015        movq          mm3, [esi+8*0]              ;wsptr[0,0],[0,1],[0,2],[0,3]
01016        paddw         mm1, mm6      ;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
01017 
01018        movq          mm4, [esi+8*1]              ;wsptr[0,4],[0,5],[0,6],[0,7]
01019        psubw         mm5, mm6      ;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
01020 
01021        movq          mm6, mm3
01022        punpckldq     mm3, mm4                    ;wsptr[0,0],[0,1],[0,4],[0,5]
01023 
01024        punpckhdq     mm4, mm6                    ;wsptr[0,6],[0,7],[0,2],[0,3]
01025        movq          mm2, mm3
01026 
01027 //Save tmp0 and tmp1 in wsptr
01028        movq          [esi+8*0], mm0              ;save tmp0
01029        paddw         mm2, mm4                    ;wsptr[xxx],[0,z11],[xxx],[0,z13]
01030 
01031        
01032 //Continue with z10 --- z13
01033        movq          mm6, [esi+8*2]              ;wsptr[1,0],[1,1],[1,2],[1,3]
01034        psubw         mm3, mm4                    ;wsptr[xxx],[0,z12],[xxx],[0,z10]
01035 
01036        movq          mm0, [esi+8*3]              ;wsptr[1,4],[1,5],[1,6],[1,7]
01037        movq          mm4, mm6
01038 
01039        movq          [esi+8*1], mm1              ;save tmp1
01040        punpckldq     mm6, mm0                    ;wsptr[1,0],[1,1],[1,4],[1,5]
01041 
01042        punpckhdq     mm0, mm4                    ;wsptr[1,6],[1,7],[1,2],[1,3]
01043        movq          mm1, mm6
01044        
01045 //Save tmp2 and tmp3 in wsptr
01046        paddw         mm6, mm0             ;wsptr[xxx],[1,z11],[xxx],[1,z13]
01047        movq          mm4, mm2
01048        
01049 //Continue with z10 --- z13
01050        movq          [esi+8*2], mm5              ;save tmp2
01051        punpcklwd     mm2, mm6             ;wsptr[xxx],[xxx],[0,z11],[1,z11]
01052 
01053        psubw         mm1, mm0             ;wsptr[xxx],[1,z12],[xxx],[1,z10]
01054        punpckhwd     mm4, mm6             ;wsptr[xxx],[xxx],[0,z13],[1,z13]
01055 
01056        movq          mm0, mm3
01057        punpcklwd     mm3, mm1             ;wsptr[xxx],[xxx],[0,z12],[1,z12]
01058 
01059        movq          [esi+8*3], mm7              ;save tmp3
01060        punpckhwd     mm0, mm1             ;wsptr[xxx],[xxx],[0,z10],[1,z10]
01061 
01062        movq          mm6, [esi+8*4]              ;wsptr[2,0],[2,1],[2,2],[2,3]
01063        punpckhdq     mm0, mm2             ;wsptr[0,z10],[1,z10],[0,z11],[1,z11]
01064 
01065        movq          mm7, [esi+8*5]              ;wsptr[2,4],[2,5],[2,6],[2,7]
01066        punpckhdq     mm3, mm4             ;wsptr[0,z12],[1,z12],[0,z13],[1,z13]
01067 
01068        movq          mm1, [esi+8*6]              ;wsptr[3,0],[3,1],[3,2],[3,3]
01069        movq          mm4, mm6
01070 
01071        punpckldq     mm6, mm7                    ;wsptr[2,0],[2,1],[2,4],[2,5]
01072        movq          mm5, mm1
01073 
01074        punpckhdq     mm7, mm4                    ;wsptr[2,6],[2,7],[2,2],[2,3]
01075        movq          mm2, mm6
01076        
01077        movq          mm4, [esi+8*7]              ;wsptr[3,4],[3,5],[3,6],[3,7]
01078        paddw         mm6, mm7             ;wsptr[xxx],[2,z11],[xxx],[2,z13]
01079 
01080        psubw         mm2, mm7             ;wsptr[xxx],[2,z12],[xxx],[2,z10]
01081        punpckldq     mm1, mm4                    ;wsptr[3,0],[3,1],[3,4],[3,5]
01082 
01083        punpckhdq     mm4, mm5                    ;wsptr[3,6],[3,7],[3,2],[3,3]
01084        movq          mm7, mm1
01085 
01086        paddw         mm1, mm4             ;wsptr[xxx],[3,z11],[xxx],[3,z13]
01087        psubw         mm7, mm4             ;wsptr[xxx],[3,z12],[xxx],[3,z10]
01088 
01089        movq          mm5, mm6
01090        punpcklwd     mm6, mm1             ;wsptr[xxx],[xxx],[2,z11],[3,z11]
01091 
01092        punpckhwd     mm5, mm1             ;wsptr[xxx],[xxx],[2,z13],[3,z13]
01093        movq          mm4, mm2
01094 
01095        punpcklwd     mm2, mm7             ;wsptr[xxx],[xxx],[2,z12],[3,z12]
01096 
01097        punpckhwd     mm4, mm7             ;wsptr[xxx],[xxx],[2,z10],[3,z10]
01098 
01099        punpckhdq     mm4, mm6             ;wsptr[2,z10],[3,z10],[2,z11],[3,z11]
01100 
01101        punpckhdq     mm2, mm5             ;wsptr[2,z12],[3,z12],[2,z13],[3,z13]
01102        movq          mm5, mm0
01103 
01104        punpckldq     mm0, mm4             ;wsptr[0,z10],[1,z10],[2,z10],[3,z10]
01105 
01106        punpckhdq     mm5, mm4             ;wsptr[0,z11],[1,z11],[2,z11],[3,z11]
01107        movq          mm4, mm3
01108 
01109        punpckhdq     mm4, mm2             ;wsptr[0,z13],[1,z13],[2,z13],[3,z13]
01110        movq          mm1, mm5
01111 
01112        punpckldq     mm3, mm2             ;wsptr[0,z12],[1,z12],[2,z12],[3,z12]
01113 //    tmp7 = z11 + z13;            /* phase 5 */
01114 //    tmp8 = z11 - z13;            /* phase 5 */
01115        psubw         mm1, mm4             ;tmp8 = z11 - z13
01116 
01117        paddw         mm5, mm4             ;tmp7 = z11 + z13
01118 //    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
01119        psllw         mm1, 2               ;shift tmp8 ahead of pmulhw, which keeps only the high 16 bits of each product
01120 
01121        psllw         mm0, 2               ;shift z10
01122 
01123        pmulhw        mm1, fix_141  ;tmp21 = MULTIPLY(tmp8, FIX_1_414213562)
01124 //    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
01125 //                   + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
01126        psllw         mm3, 2               ;shift z12
01127        movq          mm7, mm0             ;copy shifted z10
01128 
01129        pmulhw        mm7, fix_n184        ;MULTIPLY(z10, -FIX_1_847759065), second term of tmp20
01130        movq          mm6, mm3             ;copy shifted z12
01131 
01132        movq          mm2, [esi+8*0]       ;tmp0,final1
01133 
01134        pmulhw        mm6, fix_108n184     ;MULTIPLY(z12, (FIX_1_082392200 - FIX_1_847759065)), first term of tmp20
01135 //      tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
01136 //                   + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
01137        movq          mm4, mm2             ;final1
01138   
01139        pmulhw        mm0, fix_184n261     ;MULTIPLY(z10, (FIX_1_847759065 - FIX_2_613125930)), first term of tmp22
01140        paddw         mm2, mm5             ;tmp0+tmp7,final1
01141 
01142        pmulhw        mm3, fix_184         ;MULTIPLY(z12, FIX_1_847759065), second term of tmp22
01143        psubw         mm4, mm5             ;tmp0-tmp7,final1
01144 
01145 //    tmp6 = tmp22 - tmp7;  /* phase 2 */
01146        psraw         mm2, 5               ;outptr[0,0],[1,0],[2,0],[3,0],final1
01147 
01148        paddsw        mm2, const_0x0080    ;final1; saturating add of const_0x0080 (presumably the +128 level shift -- confirm against the constant's definition)
01149        paddw         mm7, mm6                    ;tmp20
01150        psraw         mm4, 5               ;outptr[0,7],[1,7],[2,7],[3,7],final1
01151 
01152        paddsw        mm4, const_0x0080    ;final1
01153        paddw         mm3, mm0                    ;tmp22
01154 
01155 //    tmp5 = tmp21 - tmp6;
01156        psubw         mm3, mm5             ;tmp6 = tmp22 - tmp7
01157 
01158 //    tmp4 = tmp20 + tmp5;
01159        movq          mm0, [esi+8*1]              ;tmp1,final2
01160        psubw         mm1, mm3             ;tmp5 = tmp21 - tmp6
01161 
01162        movq          mm6, mm0                    ;final2
01163        paddw         mm0, mm3             ;tmp1+tmp6,final2
01164 
01165     /* Final output stage: scale down by a factor of 8 and range-limit */
01166 
01167 
01168 //    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
01169 //                       & RANGE_MASK];
01170 //    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
01171 //                       & RANGE_MASK];   final1
01172 
01173 
01174 //    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
01175 //                       & RANGE_MASK];
01176 //    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
01177 //                       & RANGE_MASK];   final2
01178        psubw         mm6, mm3             ;tmp1-tmp6,final2
01179        psraw         mm0, 5               ;outptr[0,1],[1,1],[2,1],[3,1]
01180 
01181        paddsw        mm0, const_0x0080
01182        psraw         mm6, 5               ;outptr[0,6],[1,6],[2,6],[3,6]
01183        
01184        paddsw        mm6, const_0x0080           ;need to check this value
01185        packuswb      mm0, mm4      ;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
01186        
01187        movq          mm5, [esi+8*2]              ;tmp2,final3
01188        packuswb      mm2, mm6      ;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
01189 
01190 //    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
01191 //                       & RANGE_MASK];
01192 //    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
01193 //                       & RANGE_MASK];   final3
01194        paddw         mm7, mm1             ;tmp4
01195        movq          mm3, mm5
01196 
01197        paddw         mm5, mm1             ;tmp2+tmp5
01198        psubw         mm3, mm1             ;tmp2-tmp5
01199 
01200        psraw         mm5, 5               ;outptr[0,2],[1,2],[2,2],[3,2]
01201 
01202        paddsw        mm5, const_0x0080
01203        movq          mm4, [esi+8*3]              ;tmp3,final4
01204        psraw         mm3, 5               ;outptr[0,5],[1,5],[2,5],[3,5]
01205 
01206        paddsw        mm3, const_0x0080
01207 
01208 
01209 //    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
01210 //                       & RANGE_MASK];
01211 //    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
01212 //                       & RANGE_MASK];   final4
01213        movq          mm6, mm4             ;copy tmp3
01214        paddw         mm4, mm7             ;tmp3+tmp4
01215 
01216        psubw         mm6, mm7             ;tmp3-tmp4
01217        psraw         mm4, 5               ;outptr[0,4],[1,4],[2,4],[3,4]
01218        mov                  ecx, [eax]           ;ecx = row pointer read from the outptr array entry at eax (first row of this group)
01219 
01220        paddsw        mm4, const_0x0080
01221        psraw         mm6, 5               ;outptr[0,3],[1,3],[2,3],[3,3]
01222 
01223        paddsw        mm6, const_0x0080
01224        packuswb      mm5, mm4      ;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
01225 
01226        packuswb      mm6, mm3      ;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
01227        movq          mm4, mm2             ;copy packed columns 0 and 6
01228 
01229        movq          mm7, mm5             ;copy packed columns 2 and 4
01230        punpcklbw     mm2, mm0      ;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
01231 
01232        punpckhbw     mm4, mm0      ;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
01233        movq          mm1, mm2             ;copy columns 0-1 of rows 0-3
01234 
01235        punpcklbw     mm5, mm6      ;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
01236        add                  eax, 4               ;step eax to the next row-pointer entry
01237 
01238        punpckhbw     mm7, mm6      ;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
01239 
01240        punpcklwd     mm2, mm5      ;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
01241        add                  ecx, output_col      ;ecx = destination for row 0: row pointer + output_col
01242 
01243        movq          mm6, mm7             ;copy columns 4-5 of rows 0-3
01244        punpckhwd     mm1, mm5      ;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
01245 
01246        movq          mm0, mm2             ;copy columns 0-3 of rows 0-1
01247        punpcklwd     mm6, mm4      ;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
01248 
01249        mov                  ebx, [eax]           ;ebx = row pointer for row 1
01250        punpckldq     mm2, mm6      ;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
01251 
01252        add                  eax, 4               ;step eax to the next row-pointer entry
01253        movq          mm3, mm1             ;copy columns 0-3 of rows 2-3
01254 
01255        add                  ebx, output_col      ;ebx = destination for row 1
01256        punpckhwd     mm7, mm4      ;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
01257        
01258        movq          [ecx], mm2           ;store the 8 output bytes of row 0
01259        punpckhdq     mm0, mm6      ;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
01260 
01261        mov                  ecx, [eax]           ;ecx = row pointer for row 2
01262        add                  eax, 4               ;step eax to the next row-pointer entry
01263        add                  ecx, output_col      ;ecx = destination for row 2
01264 
01265        movq          [ebx], mm0           ;store the 8 output bytes of row 1
01266        punpckldq     mm1, mm7      ;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
01267 
01268        mov                  ebx, [eax]           ;ebx = row pointer for row 3
01269 
01270        add                  ebx, output_col      ;ebx = destination for row 3
01271        punpckhdq     mm3, mm7      ;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
01272        movq          [ecx], mm1           ;store the 8 output bytes of row 2
01273 
01274 
01275        movq          [ebx], mm3           ;store the 8 output bytes of row 3
01276 
01277 
01278               
01279 /*******************************************************************/
01280        
01281 
01282        add                  esi, 64              ;advance esi past the four workspace rows just processed (offsets 8*0..8*7 = 64 bytes)
01283        add                  eax, 4               ;step eax past the fourth row-pointer entry to the next one
01284 
01285 /*******************************************************************/
01286 
01287 //    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
01288 //    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
01289 //    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
01290 //    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
01291        movq          mm0, [esi+8*0]              ;wsptr[0,0],[0,1],[0,2],[0,3]
01292 
01293        movq          mm1, [esi+8*1]              ;wsptr[0,4],[0,5],[0,6],[0,7]
01294        movq          mm2, mm0
01295        
01296        movq          mm3, [esi+8*2]              ;wsptr[1,0],[1,1],[1,2],[1,3]
01297        paddw         mm0, mm1                    ;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
01298 
01299        movq          mm4, [esi+8*3]              ;wsptr[1,4],[1,5],[1,6],[1,7]
01300        psubw         mm2, mm1                    ;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
01301 
01302        movq          mm6, mm0
01303        movq          mm5, mm3
01304        
01305        paddw         mm3, mm4                    ;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
01306        movq          mm1, mm2
01307 
01308        psubw         mm5, mm4                    ;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
01309        punpcklwd     mm0, mm3                    ;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
01310 
01311        movq          mm7, [esi+8*7]              ;wsptr[3,4],[3,5],[3,6],[3,7]
01312        punpckhwd     mm6, mm3                    ;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
01313 
01314        movq          mm3, [esi+8*4]              ;wsptr[2,0],[2,1],[2,2],[2,3]
01315        punpckldq     mm0, mm6      ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
01316 
01317        punpcklwd     mm1, mm5                    ;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
01318        movq          mm4, mm3
01319 
01320        movq          mm6, [esi+8*6]              ;wsptr[3,0],[3,1],[3,2],[3,3]
01321        punpckhwd     mm2, mm5                    ;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
01322 
01323        movq          mm5, [esi+8*5]              ;wsptr[2,4],[2,5],[2,6],[2,7]
01324        punpckldq     mm1, mm2      ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
01325 
01326        
01327        paddw         mm3, mm5                    ;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
01328        movq          mm2, mm6
01329 
01330        psubw         mm4, mm5                    ;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
01331        paddw         mm6, mm7                    ;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
01332 
01333        movq          mm5, mm3
01334        punpcklwd     mm3, mm6                    ;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
01335        
01336        psubw         mm2, mm7                    ;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
01337        punpckhwd     mm5, mm6                    ;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
01338 
01339        movq          mm7, mm4
01340        punpckldq     mm3, mm5      ;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
01341 
01342        punpcklwd     mm4, mm2                    ;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
01343 
01344        punpckhwd     mm7, mm2                    ;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
01345 
01346        punpckldq     mm4, mm7      ;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
01347        movq          mm6, mm1
01348 
01349 //     mm0 =  ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
01350 //     mm1 =  ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
01351 
01352 
01353        movq          mm2, mm0
01354        punpckhdq     mm6, mm4      ;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
01355 
01356        punpckldq     mm1, mm4      ;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
01357        psllw         mm6, 2
01358 
01359        pmulhw        mm6, fix_141
01360        punpckldq     mm0, mm3      ;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
01361 
01362        punpckhdq     mm2, mm3      ;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
01363        movq          mm7, mm0
01364 
01365 //    tmp0 = tmp10 + tmp13;
01366 //    tmp3 = tmp10 - tmp13;
01367        paddw         mm0, mm2      ;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
01368        psubw         mm7, mm2      ;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
01369 
01370 //    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
01371        psubw         mm6, mm2      ;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
01372 //    tmp1 = tmp11 + tmp12;
01373 //    tmp2 = tmp11 - tmp12;
01374        movq          mm5, mm1
01375 
01376 
01377 
01378     /* Odd part */
01379 
01380 //    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
01381 //    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
01382 //    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
01383 //    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
01384        movq          mm3, [esi+8*0]              ;wsptr[0,0],[0,1],[0,2],[0,3]
01385        paddw         mm1, mm6      ;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
01386 
01387        movq          mm4, [esi+8*1]              ;wsptr[0,4],[0,5],[0,6],[0,7]
01388        psubw         mm5, mm6      ;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
01389 
01390        movq          mm6, mm3
01391        punpckldq     mm3, mm4                    ;wsptr[0,0],[0,1],[0,4],[0,5]
01392 
01393        punpckhdq     mm4, mm6                    ;wsptr[0,6],[0,7],[0,2],[0,3]
01394        movq          mm2, mm3
01395 
01396 //Save tmp0 and tmp1 in wsptr
01397        movq          [esi+8*0], mm0              ;save tmp0
01398        paddw         mm2, mm4                    ;wsptr[xxx],[0,z11],[xxx],[0,z13]
01399 
01400        
01401 //Continue with z10 --- z13
01402        movq          mm6, [esi+8*2]              ;wsptr[1,0],[1,1],[1,2],[1,3]
01403        psubw         mm3, mm4                    ;wsptr[xxx],[0,z12],[xxx],[0,z10]
01404 
01405        movq          mm0, [esi+8*3]              ;wsptr[1,4],[1,5],[1,6],[1,7]
01406        movq          mm4, mm6
01407 
01408        movq          [esi+8*1], mm1              ;save tmp1
01409        punpckldq     mm6, mm0                    ;wsptr[1,0],[1,1],[1,4],[1,5]
01410 
01411        punpckhdq     mm0, mm4                    ;wsptr[1,6],[1,7],[1,2],[1,3]
01412        movq          mm1, mm6
01413        
01414 //Continue with z10 --- z13 (tmp2 and tmp3 are stored to wsptr further down)
01415        paddw         mm6, mm0             ;wsptr[xxx],[1,z11],[xxx],[1,z13]
01416        movq          mm4, mm2
01417        
01418 //Save tmp2 and tmp3 in wsptr, interleaved with the z10 --- z13 computation
01419        movq          [esi+8*2], mm5              ;save tmp2
01420        punpcklwd     mm2, mm6             ;wsptr[xxx],[xxx],[0,z11],[1,z11]
01421 
01422        psubw         mm1, mm0             ;wsptr[xxx],[1,z12],[xxx],[1,z10]
01423        punpckhwd     mm4, mm6             ;wsptr[xxx],[xxx],[0,z13],[1,z13]
01424 
01425        movq          mm0, mm3
01426        punpcklwd     mm3, mm1             ;wsptr[xxx],[xxx],[0,z12],[1,z12]
01427 
01428        movq          [esi+8*3], mm7              ;save tmp3
01429        punpckhwd     mm0, mm1             ;wsptr[xxx],[xxx],[0,z10],[1,z10]
01430 
01431        movq          mm6, [esi+8*4]              ;wsptr[2,0],[2,1],[2,2],[2,3]
01432        punpckhdq     mm0, mm2             ;wsptr[0,z10],[1,z10],[0,z11],[1,z11]
01433 
01434        movq          mm7, [esi+8*5]              ;wsptr[2,4],[2,5],[2,6],[2,7]
01435        punpckhdq     mm3, mm4             ;wsptr[0,z12],[1,z12],[0,z13],[1,z13]
01436 
01437        movq          mm1, [esi+8*6]              ;wsptr[3,0],[3,1],[3,2],[3,3]
01438        movq          mm4, mm6
01439 
01440        punpckldq     mm6, mm7                    ;wsptr[2,0],[2,1],[2,4],[2,5]
01441        movq          mm5, mm1
01442 
01443        punpckhdq     mm7, mm4                    ;wsptr[2,6],[2,7],[2,2],[2,3]
01444        movq          mm2, mm6
01445        
01446        movq          mm4, [esi+8*7]              ;wsptr[3,4],[3,5],[3,6],[3,7]
01447        paddw         mm6, mm7             ;wsptr[xxx],[2,z11],[xxx],[2,z13]
01448 
01449        psubw         mm2, mm7             ;wsptr[xxx],[2,z12],[xxx],[2,z10]
01450        punpckldq     mm1, mm4                    ;wsptr[3,0],[3,1],[3,4],[3,5]
01451 
01452        punpckhdq     mm4, mm5                    ;wsptr[3,6],[3,7],[3,2],[3,3]
01453        movq          mm7, mm1
01454 
01455        paddw         mm1, mm4             ;wsptr[xxx],[3,z11],[xxx],[3,z13]
01456        psubw         mm7, mm4             ;wsptr[xxx],[3,z12],[xxx],[3,z10]
01457 
01458        movq          mm5, mm6
01459        punpcklwd     mm6, mm1             ;wsptr[xxx],[xxx],[2,z11],[3,z11]
01460 
01461        punpckhwd     mm5, mm1             ;wsptr[xxx],[xxx],[2,z13],[3,z13]
01462        movq          mm4, mm2
01463 
01464        punpcklwd     mm2, mm7             ;wsptr[xxx],[xxx],[2,z12],[3,z12]
01465 
01466        punpckhwd     mm4, mm7             ;wsptr[xxx],[xxx],[2,z10],[3,z10]
01467 
01468        punpckhdq     mm4, mm6             ;wsptr[2,z10],[3,z10],[2,z11],[3,z11]
01469 
01470        punpckhdq     mm2, mm5             ;wsptr[2,z12],[3,z12],[2,z13],[3,z13]
01471        movq          mm5, mm0
01472 
01473        punpckldq     mm0, mm4             ;wsptr[0,z10],[1,z10],[2,z10],[3,z10]
01474 
01475        punpckhdq     mm5, mm4             ;wsptr[0,z11],[1,z11],[2,z11],[3,z11]
01476        movq          mm4, mm3
01477 
01478        punpckhdq     mm4, mm2             ;wsptr[0,z13],[1,z13],[2,z13],[3,z13]
01479        movq          mm1, mm5
01480 
01481        punpckldq     mm3, mm2             ;wsptr[0,z12],[1,z12],[2,z12],[3,z12]
01482 //    tmp7 = z11 + z13;            /* phase 5 */
01483 //    tmp8 = z11 - z13;            /* phase 5 */
01484        psubw         mm1, mm4             ;tmp8
01485 
01486        paddw         mm5, mm4             ;tmp7
01487 //    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
01488        psllw         mm1, 2
01489 
01490        psllw         mm0, 2
01491 
01492        pmulhw        mm1, fix_141  ;tmp21
01493 //    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
01494 //                   + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
01495        psllw         mm3, 2
01496        movq          mm7, mm0
01497 
01498        pmulhw        mm7, fix_n184
01499        movq          mm6, mm3
01500 
01501        movq          mm2, [esi+8*0]       ;tmp0,final1
01502 
01503        pmulhw        mm6, fix_108n184
01504 //      tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
01505 //                   + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
01506        movq          mm4, mm2             ;final1
01507   
01508        pmulhw        mm0, fix_184n261
01509        paddw         mm2, mm5             ;tmp0+tmp7,final1
01510 
01511        pmulhw        mm3, fix_184
01512        psubw         mm4, mm5             ;tmp0-tmp7,final1
01513 
01514 //    tmp6 = tmp22 - tmp7;  /* phase 2 */
01515        psraw         mm2, 5               ;outptr[0,0],[1,0],[2,0],[3,0],final1
01516 
01517        paddsw        mm2, const_0x0080    ;final1
01518        paddw         mm7, mm6                    ;tmp20
01519        psraw         mm4, 5               ;outptr[0,7],[1,7],[2,7],[3,7],final1
01520 
01521        paddsw        mm4, const_0x0080    ;final1
01522        paddw         mm3, mm0                    ;tmp22
01523 
01524 //    tmp5 = tmp21 - tmp6;
01525        psubw         mm3, mm5             ;tmp6
01526 
01527 //    tmp4 = tmp20 + tmp5;
01528        movq          mm0, [esi+8*1]              ;tmp1,final2
01529        psubw         mm1, mm3             ;tmp5
01530 
01531        movq          mm6, mm0                    ;final2
01532        paddw         mm0, mm3             ;tmp1+tmp6,final2
01533 
01534     /* Final output stage: scale down by a factor of 8 and range-limit */
01535 
01536 
01537 //    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
01538 //                       & RANGE_MASK];
01539 //    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
01540 //                       & RANGE_MASK];   final1
01541 
01542 
01543 //    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
01544 //                       & RANGE_MASK];
01545 //    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
01546 //                       & RANGE_MASK];   final2
01547        psubw         mm6, mm3             ;tmp1-tmp6,final2
01548        psraw         mm0, 5               ;outptr[0,1],[1,1],[2,1],[3,1]
01549 
01550        paddsw        mm0, const_0x0080
01551        psraw         mm6, 5               ;outptr[0,6],[1,6],[2,6],[3,6]
01552        
01553        paddsw        mm6, const_0x0080           ;need to check this value
01554        packuswb      mm0, mm4      ;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
01555        
01556        movq          mm5, [esi+8*2]              ;tmp2,final3
01557        packuswb      mm2, mm6      ;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
01558 
01559 //    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
01560 //                       & RANGE_MASK];
01561 //    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
01562 //                       & RANGE_MASK];   final3
01563        paddw         mm7, mm1             ;tmp4
01564        movq          mm3, mm5
01565 
01566        paddw         mm5, mm1             ;tmp2+tmp5
01567        psubw         mm3, mm1             ;tmp2-tmp5
01568 
01569        psraw         mm5, 5               ;outptr[0,2],[1,2],[2,2],[3,2]
01570 
01571        paddsw        mm5, const_0x0080
01572        movq          mm4, [esi+8*3]              ;tmp3,final4
01573        psraw         mm3, 5               ;outptr[0,5],[1,5],[2,5],[3,5]
01574 
01575        paddsw        mm3, const_0x0080
01576 
01577 
01578 //    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
01579 //                       & RANGE_MASK];
01580 //    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
01581 //                       & RANGE_MASK];   final4
01582        movq          mm6, mm4
01583        paddw         mm4, mm7             ;tmp3+tmp4
01584 
01585        psubw         mm6, mm7             ;tmp3-tmp4
01586        psraw         mm4, 5               ;outptr[0,4],[1,4],[2,4],[3,4]
01587        mov                  ecx, [eax]
01588 
01589        paddsw        mm4, const_0x0080
01590        psraw         mm6, 5               ;outptr[0,3],[1,3],[2,3],[3,3]
01591 
01592        paddsw        mm6, const_0x0080
01593        packuswb      mm5, mm4      ;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
01594 
01595        packuswb      mm6, mm3      ;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
01596        movq          mm4, mm2
01597 
01598        movq          mm7, mm5
01599        punpcklbw     mm2, mm0      ;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
01600 
01601        punpckhbw     mm4, mm0      ;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
01602        movq          mm1, mm2
01603 
01604        punpcklbw     mm5, mm6      ;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
01605        add                  eax, 4
01606 
01607        punpckhbw     mm7, mm6      ;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
01608 
01609        punpcklwd     mm2, mm5      ;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
01610        add                  ecx, output_col
01611 
01612        movq          mm6, mm7
01613        punpckhwd     mm1, mm5      ;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
01614 
01615        movq          mm0, mm2
01616        punpcklwd     mm6, mm4      ;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
01617 
01618        mov                  ebx, [eax]
01619        punpckldq     mm2, mm6      ;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
01620 
01621        add                  eax, 4
01622        movq          mm3, mm1
01623 
01624        add                  ebx, output_col 
01625        punpckhwd     mm7, mm4      ;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
01626        
01627        movq          [ecx], mm2
01628        punpckhdq     mm0, mm6      ;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
01629 
01630        mov                  ecx, [eax]
01631        add                  eax, 4
01632        add                  ecx, output_col
01633 
01634        movq          [ebx], mm0
01635        punpckldq     mm1, mm7      ;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
01636 
01637        mov                  ebx, [eax]
01638 
01639        add                  ebx, output_col
01640        punpckhdq     mm3, mm7      ;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
01641        movq          [ecx], mm1
01642 
01643        movq          [ebx], mm3
01644 
01645        emms
01646        }
01647 }
01648 #endif
01649 
01650 #endif /* DCT_IFAST_SUPPORTED */