Back to index

lightning-sunbird  0.9+nobinonly
jidctint.c
Go to the documentation of this file.
00001 /*
00002  * jidctint.c
00003  *
00004  * Copyright (C) 1991-1998, Thomas G. Lane.
00005  * This file is part of the Independent JPEG Group's software.
00006  * For conditions of distribution and use, see the accompanying README file.
00007  *
00008  * This file contains a slow-but-accurate integer implementation of the
00009  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
00010  * must also perform dequantization of the input coefficients.
00011  *
00012  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
00013  * on each row (or vice versa, but it's more convenient to emit a row at
00014  * a time).  Direct algorithms are also available, but they are much more
00015  * complex and seem not to be any faster when reduced to code.
00016  *
00017  * This implementation is based on an algorithm described in
00018  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
00019  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
00020  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
00021  * The primary algorithm described there uses 11 multiplies and 29 adds.
00022  * We use their alternate method with 12 multiplies and 32 adds.
00023  * The advantage of this method is that no data path contains more than one
00024  * multiplication; this allows a very simple and accurate implementation in
00025  * scaled fixed-point arithmetic, with a minimal number of shifts.
00026  */
00027 
00028 #define JPEG_INTERNALS
00029 #include "jinclude.h"
00030 #include "jpeglib.h"
00031 #include "jdct.h"           /* Private declarations for DCT subsystem */
00032 
00033 #ifdef DCT_ISLOW_SUPPORTED
00034 
00035 
00036 /*
00037  * This module is specialized to the case DCTSIZE = 8.
00038  */
00039 
00040 #if DCTSIZE != 8
00041   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
00042 #endif
00043 
00044 
00045 /*
00046  * The poop on this scaling stuff is as follows:
00047  *
00048  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
00049  * larger than the true IDCT outputs.  The final outputs are therefore
00050  * a factor of N larger than desired; since N=8 this can be cured by
00051  * a simple right shift at the end of the algorithm.  The advantage of
00052  * this arrangement is that we save two multiplications per 1-D IDCT,
00053  * because the y0 and y4 inputs need not be divided by sqrt(N).
00054  *
00055  * We have to do addition and subtraction of the integer inputs, which
00056  * is no problem, and multiplication by fractional constants, which is
00057  * a problem to do in integer arithmetic.  We multiply all the constants
00058  * by CONST_SCALE and convert them to integer constants (thus retaining
00059  * CONST_BITS bits of precision in the constants).  After doing a
00060  * multiplication we have to divide the product by CONST_SCALE, with proper
00061  * rounding, to produce the correct output.  This division can be done
00062  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
00063  * as long as possible so that partial sums can be added together with
00064  * full fractional precision.
00065  *
00066  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
00067  * they are represented to better-than-integral precision.  These outputs
00068  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
00069  * with the recommended scaling.  (To scale up 12-bit sample data further, an
00070  * intermediate INT32 array would be needed.)
00071  *
00072  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
00073  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
00074  * shows that the values given below are the most effective.
00075  */
00076 
00077 #if BITS_IN_JSAMPLE == 8
00078 #define CONST_BITS  13
00079 #define PASS1_BITS  2
00080 #else
00081 #define CONST_BITS  13
00082 #define PASS1_BITS  1              /* lose a little precision to avoid overflow */
00083 #endif
00084 
00085 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
00086  * causing a lot of useless floating-point operations at run time.
00087  * To get around this we use the following pre-calculated constants.
00088  * If you change CONST_BITS you may want to add appropriate values.
00089  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
00090  */
00091 
00092 #if CONST_BITS == 13
00093 #define FIX_0_298631336  ((INT32)  2446)  /* FIX(0.298631336) */
00094 #define FIX_0_390180644  ((INT32)  3196)  /* FIX(0.390180644) */
00095 #define FIX_0_541196100  ((INT32)  4433)  /* FIX(0.541196100) */
00096 #define FIX_0_765366865  ((INT32)  6270)  /* FIX(0.765366865) */
00097 #define FIX_0_899976223  ((INT32)  7373)  /* FIX(0.899976223) */
00098 #define FIX_1_175875602  ((INT32)  9633)  /* FIX(1.175875602) */
00099 #define FIX_1_501321110  ((INT32)  12299) /* FIX(1.501321110) */
00100 #define FIX_1_847759065  ((INT32)  15137) /* FIX(1.847759065) */
00101 #define FIX_1_961570560  ((INT32)  16069) /* FIX(1.961570560) */
00102 #define FIX_2_053119869  ((INT32)  16819) /* FIX(2.053119869) */
00103 #define FIX_2_562915447  ((INT32)  20995) /* FIX(2.562915447) */
00104 #define FIX_3_072711026  ((INT32)  25172) /* FIX(3.072711026) */
00105 #else
00106 #define FIX_0_298631336  FIX(0.298631336)
00107 #define FIX_0_390180644  FIX(0.390180644)
00108 #define FIX_0_541196100  FIX(0.541196100)
00109 #define FIX_0_765366865  FIX(0.765366865)
00110 #define FIX_0_899976223  FIX(0.899976223)
00111 #define FIX_1_175875602  FIX(1.175875602)
00112 #define FIX_1_501321110  FIX(1.501321110)
00113 #define FIX_1_847759065  FIX(1.847759065)
00114 #define FIX_1_961570560  FIX(1.961570560)
00115 #define FIX_2_053119869  FIX(2.053119869)
00116 #define FIX_2_562915447  FIX(2.562915447)
00117 #define FIX_3_072711026  FIX(3.072711026)
00118 #endif
00119 
00120 
00121 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
00122  * For 8-bit samples with the recommended scaling, all the variable
00123  * and constant values involved are no more than 16 bits wide, so a
00124  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
00125  * For 12-bit samples, a full 32-bit multiplication will be needed.
00126  */
00127 
00128 #if BITS_IN_JSAMPLE == 8
00129 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
00130 #else
00131 #define MULTIPLY(var,const)  ((var) * (const))
00132 #endif
00133 
00134 
00135 /* Dequantize a coefficient by multiplying it by the multiplier-table
00136  * entry; produce an int result.  In this module, both inputs and result
00137  * are 16 bits or less, so either int or short multiply will work.
00138  */
00139 
00140 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
00141 
00142 
00143 /*
00144  * Perform dequantization and inverse DCT on one block of coefficients.
00145  */
00146 
00147 GLOBAL(void)
00148 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
00149                JCOEFPTR coef_block,
00150                JSAMPARRAY output_buf, JDIMENSION output_col)
00151 {
00152   INT32 tmp0, tmp1, tmp2, tmp3;
00153   INT32 tmp10, tmp11, tmp12, tmp13;
00154   INT32 z1, z2, z3, z4, z5;
00155   JCOEFPTR inptr;
00156   ISLOW_MULT_TYPE * quantptr;
00157   int * wsptr;
00158   JSAMPROW outptr;
00159   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
00160   int ctr;
00161   int workspace[DCTSIZE2];  /* buffers data between passes */
00162   SHIFT_TEMPS
00163 
00164   /* Pass 1: process columns from input, store into work array. */
00165   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
00166   /* furthermore, we scale the results by 2**PASS1_BITS. */
00167 
00168   inptr = coef_block;
00169   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
00170   wsptr = workspace;
00171   for (ctr = DCTSIZE; ctr > 0; ctr--) {
00172     /* Due to quantization, we will usually find that many of the input
00173      * coefficients are zero, especially the AC terms.  We can exploit this
00174      * by short-circuiting the IDCT calculation for any column in which all
00175      * the AC terms are zero.  In that case each output is equal to the
00176      * DC coefficient (with scale factor as needed).
00177      * With typical images and quantization tables, half or more of the
00178      * column DCT calculations can be simplified this way.
00179      */
00180     
00181     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
00182        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
00183        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
00184        inptr[DCTSIZE*7] == 0) {
00185       /* AC terms all zero */
00186       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
00187       
00188       wsptr[DCTSIZE*0] = dcval;
00189       wsptr[DCTSIZE*1] = dcval;
00190       wsptr[DCTSIZE*2] = dcval;
00191       wsptr[DCTSIZE*3] = dcval;
00192       wsptr[DCTSIZE*4] = dcval;
00193       wsptr[DCTSIZE*5] = dcval;
00194       wsptr[DCTSIZE*6] = dcval;
00195       wsptr[DCTSIZE*7] = dcval;
00196       
00197       inptr++;                     /* advance pointers to next column */
00198       quantptr++;
00199       wsptr++;
00200       continue;
00201     }
00202     
00203     /* Even part: reverse the even part of the forward DCT. */
00204     /* The rotator is sqrt(2)*c(-6). */
00205     
00206     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
00207     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
00208     
00209     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
00210     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
00211     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
00212     
00213     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
00214     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
00215 
00216     tmp0 = (z2 + z3) << CONST_BITS;
00217     tmp1 = (z2 - z3) << CONST_BITS;
00218     
00219     tmp10 = tmp0 + tmp3;
00220     tmp13 = tmp0 - tmp3;
00221     tmp11 = tmp1 + tmp2;
00222     tmp12 = tmp1 - tmp2;
00223     
00224     /* Odd part per figure 8; the matrix is unitary and hence its
00225      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
00226      */
00227     
00228     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
00229     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
00230     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
00231     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
00232     
00233     z1 = tmp0 + tmp3;
00234     z2 = tmp1 + tmp2;
00235     z3 = tmp0 + tmp2;
00236     z4 = tmp1 + tmp3;
00237     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
00238     
00239     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
00240     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
00241     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
00242     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
00243     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
00244     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
00245     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
00246     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
00247     
00248     z3 += z5;
00249     z4 += z5;
00250     
00251     tmp0 += z1 + z3;
00252     tmp1 += z2 + z4;
00253     tmp2 += z2 + z3;
00254     tmp3 += z1 + z4;
00255     
00256     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
00257     
00258     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
00259     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
00260     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
00261     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
00262     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
00263     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
00264     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
00265     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
00266     
00267     inptr++;                /* advance pointers to next column */
00268     quantptr++;
00269     wsptr++;
00270   }
00271   
00272   /* Pass 2: process rows from work array, store into output array. */
00273   /* Note that we must descale the results by a factor of 8 == 2**3, */
00274   /* and also undo the PASS1_BITS scaling. */
00275 
00276   wsptr = workspace;
00277   for (ctr = 0; ctr < DCTSIZE; ctr++) {
00278     outptr = output_buf[ctr] + output_col;
00279     /* Rows of zeroes can be exploited in the same way as we did with columns.
00280      * However, the column calculation has created many nonzero AC terms, so
00281      * the simplification applies less often (typically 5% to 10% of the time).
00282      * On machines with very fast multiplication, it's possible that the
00283      * test takes more time than it's worth.  In that case this section
00284      * may be commented out.
00285      */
00286     
00287 #ifndef NO_ZERO_ROW_TEST
00288     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
00289        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
00290       /* AC terms all zero */
00291       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
00292                               & RANGE_MASK];
00293       
00294       outptr[0] = dcval;
00295       outptr[1] = dcval;
00296       outptr[2] = dcval;
00297       outptr[3] = dcval;
00298       outptr[4] = dcval;
00299       outptr[5] = dcval;
00300       outptr[6] = dcval;
00301       outptr[7] = dcval;
00302 
00303       wsptr += DCTSIZE;            /* advance pointer to next row */
00304       continue;
00305     }
00306 #endif
00307     
00308     /* Even part: reverse the even part of the forward DCT. */
00309     /* The rotator is sqrt(2)*c(-6). */
00310     
00311     z2 = (INT32) wsptr[2];
00312     z3 = (INT32) wsptr[6];
00313     
00314     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
00315     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
00316     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
00317     
00318     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
00319     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
00320     
00321     tmp10 = tmp0 + tmp3;
00322     tmp13 = tmp0 - tmp3;
00323     tmp11 = tmp1 + tmp2;
00324     tmp12 = tmp1 - tmp2;
00325     
00326     /* Odd part per figure 8; the matrix is unitary and hence its
00327      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
00328      */
00329     
00330     tmp0 = (INT32) wsptr[7];
00331     tmp1 = (INT32) wsptr[5];
00332     tmp2 = (INT32) wsptr[3];
00333     tmp3 = (INT32) wsptr[1];
00334     
00335     z1 = tmp0 + tmp3;
00336     z2 = tmp1 + tmp2;
00337     z3 = tmp0 + tmp2;
00338     z4 = tmp1 + tmp3;
00339     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
00340     
00341     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
00342     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
00343     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
00344     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
00345     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
00346     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
00347     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
00348     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
00349     
00350     z3 += z5;
00351     z4 += z5;
00352     
00353     tmp0 += z1 + z3;
00354     tmp1 += z2 + z4;
00355     tmp2 += z2 + z3;
00356     tmp3 += z1 + z4;
00357     
00358     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
00359     
00360     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
00361                                      CONST_BITS+PASS1_BITS+3)
00362                          & RANGE_MASK];
00363     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
00364                                      CONST_BITS+PASS1_BITS+3)
00365                          & RANGE_MASK];
00366     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
00367                                      CONST_BITS+PASS1_BITS+3)
00368                          & RANGE_MASK];
00369     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
00370                                      CONST_BITS+PASS1_BITS+3)
00371                          & RANGE_MASK];
00372     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
00373                                      CONST_BITS+PASS1_BITS+3)
00374                          & RANGE_MASK];
00375     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
00376                                      CONST_BITS+PASS1_BITS+3)
00377                          & RANGE_MASK];
00378     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
00379                                      CONST_BITS+PASS1_BITS+3)
00380                          & RANGE_MASK];
00381     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
00382                                      CONST_BITS+PASS1_BITS+3)
00383                          & RANGE_MASK];
00384     
00385     wsptr += DCTSIZE;              /* advance pointer to next row */
00386   }
00387 }
00388 
00389 
00390 #ifdef HAVE_SSE2_INTEL_MNEMONICS
00391 
00392 /*
00393 * Intel SSE2 optimized Inverse Discrete Cosine Transform
00394 *
00395 *
00396 * Copyright (c) 2001-2002 Intel Corporation
00397 * All Rights Reserved
00398 *
00399 *
00400 *  Authors:
00401 *      Danilov G.
00402 *
00403 *
00404 *-----------------------------------------------------------------------------
00405 *
00406 * References:
00407 *    K.R. Rao and P. Yip
00408 *       Discrete Cosine Transform.
00409 *       Algorithms, Advantages, Applications.
00410 *       Academic Press, Inc, London, 1990.
00411 *    JPEG Group's software.
00412 *       This implementation is based on Appendix A.2 of the book (R&Y) ...
00413 *
00414 *-----------------------------------------------------------------------------
00415 */
00416 
00417 typedef unsigned char   Ipp8u;
00418 typedef unsigned short  Ipp16u;
00419 typedef unsigned int    Ipp32u;
00420 
00421 typedef signed char    Ipp8s;
00422 typedef signed short   Ipp16s;
00423 typedef signed int     Ipp32s;
00424 
00425 #define BITS_INV_ACC  4                   
00426 #define SHIFT_INV_ROW  16 - BITS_INV_ACC
00427 #define SHIFT_INV_COL 1 + BITS_INV_ACC
00428 
00429 #define RND_INV_ROW  1024 * (6 - BITS_INV_ACC)   /* 1 << (SHIFT_INV_ROW-1)          */
00430 #define RND_INV_COL = 16 * (BITS_INV_ACC - 3)   /* 1 << (SHIFT_INV_COL-1)           */
00431 #define RND_INV_CORR = RND_INV_COL - 1          /* correction -1.0 and round */
00432 
00433 #define c_inv_corr_0 -1024 * (6 - BITS_INV_ACC) + 65536        /* -0.5 + (16.0 or 32.0)    */
00434 #define c_inv_corr_1 1877 * (6 - BITS_INV_ACC)                        /* 0.9167     */     
00435 #define c_inv_corr_2 1236 * (6 - BITS_INV_ACC)                        /* 0.6035     */                                 
00436 #define c_inv_corr_3 680  * (6 - BITS_INV_ACC)                        /* 0.3322     */
00437 #define c_inv_corr_4 0    * (6 - BITS_INV_ACC)                        /* 0.0        */     
00438 #define c_inv_corr_5 -569  * (6 - BITS_INV_ACC)                       /* -0.278     */
00439 #define c_inv_corr_6 -512  * (6 - BITS_INV_ACC)                       /* -0.25      */     
00440 #define c_inv_corr_7 -651  * (6 - BITS_INV_ACC)                       /* -0.3176    */     
00441 
00442 #define RND_INV_ROW_0 RND_INV_ROW + c_inv_corr_0
00443 #define RND_INV_ROW_1 RND_INV_ROW + c_inv_corr_1
00444 #define RND_INV_ROW_2 RND_INV_ROW + c_inv_corr_2
00445 #define RND_INV_ROW_3 RND_INV_ROW + c_inv_corr_3
00446 #define RND_INV_ROW_4 RND_INV_ROW + c_inv_corr_4
00447 #define RND_INV_ROW_5 RND_INV_ROW + c_inv_corr_5
00448 #define RND_INV_ROW_6 RND_INV_ROW + c_inv_corr_6
00449 #define RND_INV_ROW_7 RND_INV_ROW + c_inv_corr_7
00450 
00451 /* Table for rows 0,4 - constants are multiplied on cos_4_16 */
00452 
00453 __declspec(align(16)) short tab_i_04[] = { 
00454        16384, 21407, 16384, 8867,         
00455        -16384, 21407, 16384, -8867,       
00456        16384,  -8867,  16384, -21407,  
00457     16384,   8867, -16384, -21407,  
00458     22725,  19266,  19266,  -4520,  
00459     4520,  19266,  19266, -22725,   
00460     12873, -22725,   4520, -12873,  
00461     12873,   4520, -22725, -12873}; 
00462 
00463 /* Table for rows 1,7 - constants are multiplied on cos_1_16 */
00464 
00465 __declspec(align(16)) short tab_i_17[] = {
00466        22725,  29692,  22725,  12299,   
00467     -22725,  29692,  22725, -12299,  
00468     22725, -12299,  22725, -29692,   
00469     22725,  12299, -22725, -29692,   
00470     31521,  26722,  26722,  -6270,   
00471     6270,  26722,  26722, -31521,    
00472     17855, -31521,   6270, -17855,   
00473     17855,   6270, -31521, -17855};  
00474 
00475 /* Table for rows 2,6 - constants are multiplied on cos_2_16 */
00476 
00477 __declspec(align(16)) short tab_i_26[] = {
00478        21407,  27969,  21407,  11585,     
00479     -21407,  27969,  21407, -11585,       
00480     21407, -11585,  21407, -27969, 
00481     21407,  11585, -21407, -27969, 
00482     29692,  25172,  25172,  -5906, 
00483     5906,  25172,  25172, -29692,  
00484     16819, -29692,   5906, -16819, 
00485     16819,   5906, -29692, -16819};       
00486 
00487 /* Table for rows 3,5 - constants are multiplied on cos_3_16 */
00488 
00489 __declspec(align(16)) short tab_i_35[] = {
00490        19266,  25172,  19266,  10426,     
00491     -19266,  25172,  19266, -10426,       
00492     19266, -10426,  19266, -25172, 
00493     19266,  10426, -19266, -25172, 
00494     26722,  22654,  22654,  -5315, 
00495     5315,  22654,  22654, -26722,  
00496     15137, -26722,   5315, -15137, 
00497     15137,   5315, -26722, -15137};       
00498        
00499 __declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
00500        RND_INV_ROW_0,RND_INV_ROW_0};
00501 __declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
00502        RND_INV_ROW_1,RND_INV_ROW_1};
00503 __declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
00504        RND_INV_ROW_2,RND_INV_ROW_2};
00505 __declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
00506        RND_INV_ROW_3,RND_INV_ROW_3};
00507 __declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
00508        RND_INV_ROW_4,RND_INV_ROW_4};
00509 __declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
00510        RND_INV_ROW_5,RND_INV_ROW_5};
00511 __declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
00512        RND_INV_ROW_6,RND_INV_ROW_6};
00513 __declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
00514        RND_INV_ROW_7,RND_INV_ROW_7};
00515 
00516 __declspec(align(16)) short tg_1_16[] = {
00517        13036,  13036,  13036,  13036,     /* tg * (2<<16) + 0.5 */
00518        13036,  13036,  13036,  13036};
00519 __declspec(align(16)) short tg_2_16[] = {
00520        27146,  27146,  27146,  27146,     /* tg * (2<<16) + 0.5 */
00521        27146,  27146,  27146,  27146};
00522 __declspec(align(16)) short tg_3_16[] = {
00523        -21746, -21746, -21746, -21746,    /* tg * (2<<16) + 0.5 */
00524        -21746, -21746, -21746, -21746};
00525 __declspec(align(16)) short cos_4_16[] = {
00526        -19195, -19195, -19195, -19195,    /* cos * (2<<16) + 0.5 */
00527        -19195, -19195, -19195, -19195};
00528 
00529 /*
00530 * In this implementation the outputs of the iDCT-1D are multiplied
00531 *    for rows 0,4 - on cos_4_16,
00532 *    for rows 1,7 - on cos_1_16,
00533 *    for rows 2,6 - on cos_2_16,
00534 *    for rows 3,5 - on cos_3_16
00535 * and are shifted to the left for rise of accuracy
00536 *
00537 * For used constants
00538 *    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
00539 *
00540 *-----------------------------------------------------------------------------
00541 *
00542 * On the first stage the calculation is executed at once for two rows.
00543 * The permutation for each output row is done on second stage
00544 *    t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
00545 *
00546 *-----------------------------------------------------------------------------
00547 */
00548        
00549 #define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm { \
00550        __asm pshuflw  xmm1, xmm0, 10001000b                           \
00551     __asm pshuflw  xmm0, xmm0, 11011101b                       \
00552     __asm pshufhw  xmm1, xmm1, 10001000b                       \
00553        __asm pshufhw  xmm0, xmm0, 11011101b                           \
00554        __asm movdqa   xmm2, XMMWORD PTR [TABLE]                \
00555        __asm pmaddwd  xmm2, xmm1                                             \
00556        __asm movdqa   xmm3, XMMWORD PTR [TABLE + 32]           \
00557        __asm pmaddwd  xmm3, xmm0                               \
00558        __asm pmaddwd  xmm1, XMMWORD PTR [TABLE + 16]           \
00559        __asm pmaddwd  xmm0, XMMWORD PTR [TABLE + 48]           \
00560        __asm pshuflw  xmm5, xmm4, 10001000b                           \
00561        __asm pshuflw  xmm4, xmm4, 11011101b                    \
00562        __asm pshufhw  xmm5, xmm5, 10001000b                    \
00563        __asm pshufhw  xmm4, xmm4, 11011101b                    \
00564        __asm movdqa   xmm6, XMMWORD PTR [TABLE]                \
00565        __asm pmaddwd  xmm6, xmm5                               \
00566        __asm movdqa   xmm7, XMMWORD PTR [TABLE + 32]           \
00567        __asm pmaddwd  xmm7, xmm4                               \
00568        __asm pmaddwd  xmm5, XMMWORD PTR [TABLE + 16]           \
00569        __asm pmaddwd  xmm4, XMMWORD PTR [TABLE + 48]           \
00570        __asm pshufd   xmm1, xmm1, 01001110b                    \
00571        __asm pshufd   xmm0, xmm0, 01001110b                    \
00572        __asm paddd    xmm2, XMMWORD PTR [ROUND1]               \
00573        __asm paddd    xmm3, xmm0                                             \
00574        __asm paddd    xmm1, xmm2                                             \
00575        __asm pshufd   xmm5, xmm5, 01001110b                    \
00576        __asm pshufd   xmm4, xmm4, 01001110b                    \
00577        __asm movdqa   xmm2, xmm1                                      \
00578        __asm psubd    xmm2, xmm3                                      \
00579        __asm psrad    xmm2, SHIFT_INV_ROW                             \
00580        __asm paddd    xmm1, xmm3                                             \
00581        __asm psrad    xmm1, SHIFT_INV_ROW                      \
00582        __asm packssdw xmm1, xmm2                                             \
00583        __asm paddd    xmm6, XMMWORD PTR [ROUND2]               \
00584        __asm paddd    xmm7, xmm4                                             \
00585        __asm paddd    xmm5, xmm6                                             \
00586        __asm movdqa   xmm6, xmm5                               \
00587        __asm psubd    xmm6, xmm7                               \
00588        __asm psrad    xmm6, SHIFT_INV_ROW                      \
00589        __asm paddd    xmm5, xmm7                                             \
00590        __asm psrad    xmm5, SHIFT_INV_ROW                      \
00591        __asm packssdw xmm5, xmm6                                             \
00592        }
00593 
00594 /*
00595 *
00596 * The second stage - inverse DCTs of columns
00597 *
00598 * The inputs are multiplied
00599 *    for rows 0,4 - on cos_4_16,
00600 *    for rows 1,7 - on cos_1_16,
00601 *    for rows 2,6 - on cos_2_16,
00602 *    for rows 3,5 - on cos_3_16
00603 * and are shifted to the left for rise of accuracy
00604 */
00605 
00606 #define DCT_8_INV_COL_8R(INP, OUTP) __asm {             \
00607        __asm movdqa   xmm0, [INP + 5*16]                \
00608     __asm movdqa   xmm1, XMMWORD PTR tg_3_16     \
00609     __asm movdqa   xmm2, xmm0                    \
00610     __asm movdqa   xmm3, [INP + 3*16]            \
00611     __asm pmulhw   xmm0, xmm1                    \
00612     __asm movdqa   xmm4, [INP + 7*16]            \
00613     __asm pmulhw   xmm1, xmm3                    \
00614     __asm movdqa   xmm5, XMMWORD PTR tg_1_16     \
00615     __asm movdqa   xmm6, xmm4                    \
00616     __asm pmulhw   xmm4, xmm5                    \
00617     __asm paddsw   xmm0, xmm2                    \
00618     __asm pmulhw   xmm5, [INP + 1*16]            \
00619     __asm paddsw   xmm1, xmm3                    \
00620     __asm movdqa   xmm7, [INP + 6*16]            \
00621     __asm paddsw   xmm0, xmm3                                  \
00622     __asm movdqa   xmm3, XMMWORD PTR tg_2_16     \
00623     __asm psubsw   xmm2, xmm1                                  \
00624     __asm pmulhw   xmm7, xmm3                    \
00625     __asm movdqa   xmm1, xmm0                    \
00626     __asm pmulhw   xmm3, [INP + 2*16]            \
00627     __asm psubsw   xmm5, xmm6                                  \
00628     __asm paddsw   xmm4, [INP + 1*16]            \
00629     __asm paddsw   xmm0, xmm4                    \
00630     __asm psubsw   xmm4, xmm1                                  \
00631     __asm pshufhw  xmm0, xmm0, 00011011b         \
00632     __asm paddsw   xmm7, [INP + 2*16]            \
00633     __asm movdqa   xmm6, xmm5                                  \
00634     __asm psubsw   xmm3, [INP + 6*16]            \
00635     __asm psubsw   xmm5, xmm2                    \
00636     __asm paddsw   xmm6, xmm2                                  \
00637        __asm movdqa   [OUTP + 7*16], xmm0               \
00638     __asm movdqa   xmm1, xmm4                    \
00639     __asm movdqa   xmm2, XMMWORD PTR cos_4_16    \
00640     __asm paddsw   xmm4, xmm5                    \
00641     __asm movdqa   xmm0, XMMWORD PTR cos_4_16    \
00642     __asm pmulhw   xmm2, xmm4                                  \
00643     __asm pshufhw  xmm6, xmm6, 00011011b         \
00644     __asm movdqa   [OUTP + 3*16], xmm6                  \
00645     __asm psubsw   xmm1, xmm5                    \
00646     __asm movdqa   xmm6, [INP + 0*16]            \
00647     __asm pmulhw   xmm0, xmm1                                  \
00648     __asm movdqa   xmm5, [INP + 4*16]            \
00649     __asm paddsw   xmm4, xmm2                                  \
00650     __asm paddsw   xmm5, xmm6                           \
00651     __asm psubsw   xmm6, [INP + 4*16]            \
00652     __asm paddsw   xmm0, xmm1                                  \
00653     __asm pshufhw  xmm4, xmm4, 00011011b         \
00654     __asm movdqa   xmm2, xmm5                    \
00655     __asm paddsw   xmm5, xmm7                    \
00656     __asm movdqa   xmm1, xmm6                                  \
00657     __asm psubsw   xmm2, xmm7                                  \
00658     __asm movdqa   xmm7, [OUTP + 7*16]                  \
00659     __asm paddsw   xmm6, xmm3                    \
00660     __asm pshufhw  xmm5, xmm5, 00011011b         \
00661        __asm paddsw   xmm7, xmm5                               \
00662     __asm psubsw   xmm1, xmm3                                  \
00663     __asm pshufhw  xmm6, xmm6, 00011011b         \
00664        __asm movdqa   xmm3, xmm6                               \
00665     __asm paddsw   xmm6, xmm4                    \
00666     __asm pshufhw  xmm2, xmm2, 00011011b         \
00667     __asm psraw    xmm7, SHIFT_INV_COL           \
00668     __asm movdqa   [OUTP + 0*16], xmm7                  \
00669     __asm movdqa   xmm7, xmm1                    \
00670     __asm paddsw   xmm1, xmm0                                  \
00671     __asm psraw    xmm6, SHIFT_INV_COL                  \
00672     __asm movdqa   [OUTP + 1*16], xmm6                  \
00673     __asm pshufhw  xmm1, xmm1, 00011011b         \
00674        __asm movdqa   xmm6, [OUTP + 3*16]               \
00675     __asm psubsw   xmm7, xmm0                    \
00676     __asm psraw    xmm1, SHIFT_INV_COL           \
00677     __asm movdqa   [OUTP + 2*16], xmm1                  \
00678     __asm psubsw   xmm5, [OUTP + 7*16]                  \
00679     __asm paddsw   xmm6, xmm2                    \
00680     __asm psubsw   xmm2, [OUTP + 3*16]                  \
00681     __asm psubsw   xmm3, xmm4                    \
00682     __asm psraw    xmm7, SHIFT_INV_COL           \
00683     __asm pshufhw  xmm7, xmm7, 00011011b         \
00684     __asm movdqa   [OUTP + 5*16], xmm7                  \
00685     __asm psraw    xmm5, SHIFT_INV_COL                  \
00686     __asm movdqa   [OUTP + 7*16], xmm5                  \
00687     __asm psraw    xmm6, SHIFT_INV_COL                  \
00688     __asm movdqa   [OUTP + 3*16], xmm6                  \
00689     __asm psraw    xmm2, SHIFT_INV_COL                  \
00690     __asm movdqa   [OUTP + 4*16], xmm2                  \
00691     __asm psraw    xmm3, SHIFT_INV_COL                  \
00692     __asm movdqa   [OUTP + 6*16], xmm3                  \
00693        }
00694 
00695 /*
00696 *
00697 *  Name:      dct_8x8_inv_16s
00698 *  Purpose:   Inverse Discrete Cosine Transform 8x8 with
00699 *             2D buffer of short int data
00700 *  Context:
00701 *      void dct_8x8_inv_16s ( short *src, short *dst )
00702 *  Parameters:
00703 *      src  - Pointer to the source buffer
00704 *      dst  - Pointer to the destination buffer
00705 *
00706 */
00707 
00708 GLOBAL(void)
00709 dct_8x8_inv_16s ( short *src, short *dst ) {
00710        
00711        __asm {
00712 
00713               mov     ecx,  src
00714               mov     edx,  dst
00715 
00716               movdqa  xmm0, [ecx+0*16]
00717               movdqa  xmm4, [ecx+4*16]
00718               DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
00719               movdqa     [edx+0*16], xmm1 
00720               movdqa     [edx+4*16], xmm5 
00721 
00722               movdqa  xmm0, [ecx+1*16]
00723               movdqa  xmm4, [ecx+7*16]
00724               DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
00725               movdqa     [edx+1*16], xmm1 
00726               movdqa     [edx+7*16], xmm5 
00727 
00728               movdqa  xmm0, [ecx+3*16]
00729               movdqa  xmm4, [ecx+5*16]
00730               DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
00731               movdqa     [edx+3*16], xmm1 
00732               movdqa     [edx+5*16], xmm5 
00733 
00734               movdqa  xmm0, [ecx+2*16]
00735               movdqa  xmm4, [ecx+6*16]
00736               DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
00737               movdqa     [edx+2*16], xmm1
00738               movdqa     [edx+6*16], xmm5    
00739 
00740               DCT_8_INV_COL_8R(edx+0, edx+0);
00741        }
00742 }
00743 
00744 
00745 /*
00746 *  Name:
00747 *    ownpj_QuantInv_8x8_16s
00748 *
00749 *  Purpose:
00750 *    Dequantize 8x8 block of DCT coefficients
00751 *
00752 *  Context:
00753 *    void ownpj_QuantInv_8x8_16s
00754 *            Ipp16s*  pSrc,
00755 *            Ipp16s*  pDst,
00756 *      const Ipp16u*  pQTbl)*
00757 *
00758 */
00759 
00760 GLOBAL(void)
00761 ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
00762 {
00763        __asm {
00764 
00765               push        ebx
00766               push        ecx
00767               push        edx
00768               push        esi
00769               push        edi
00770 
00771               mov         esi, pSrc
00772               mov         edi, pDst
00773               mov         edx, pQTbl
00774               mov         ecx, 4
00775               mov         ebx, 32
00776 
00777        again:
00778 
00779               movq        mm0, QWORD PTR [esi+0]
00780               movq        mm1, QWORD PTR [esi+8]
00781               movq        mm2, QWORD PTR [esi+16]
00782               movq        mm3, QWORD PTR [esi+24]
00783 
00784               prefetcht0  [esi+ebx] ; fetch next cache line
00785 
00786               pmullw      mm0, QWORD PTR [edx+0]
00787               pmullw      mm1, QWORD PTR [edx+8]
00788               pmullw      mm2, QWORD PTR [edx+16]
00789               pmullw      mm3, QWORD PTR [edx+24]
00790 
00791               movq        QWORD PTR [edi+0], mm0
00792               movq        QWORD PTR [edi+8], mm1
00793               movq        QWORD PTR [edi+16], mm2
00794               movq        QWORD PTR [edi+24], mm3
00795 
00796               add         esi, ebx
00797               add         edi, ebx
00798               add         edx, ebx
00799               dec         ecx
00800               jnz         again
00801 
00802               emms
00803 
00804               pop         edi
00805               pop         esi
00806               pop         edx
00807               pop         ecx
00808               pop         ebx
00809        }
00810 }
00811 
00812 
00813 /*
00814 *  Name:
00815 *    ownpj_Add128_8x8_16s8u
00816 *
00817 *  Purpose:
00818 *    signed to unsigned conversion (level shift)
00819 *    for 8x8 block of DCT coefficients
00820 *
00821 *  Context:
00822 *    void ownpj_Add128_8x8_16s8u
00823 *      const Ipp16s* pSrc,
00824 *            Ipp8u*  pDst,
00825 *            int     DstStep);
00826 *
00827 */
00828 
00829 __declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080};
00830 
00831 GLOBAL(void)
00832 ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
00833 {
00834        __asm {
00835               push        eax
00836               push        ebx
00837               push        ecx
00838               push        edx
00839               push        esi
00840               push        edi
00841 
00842               mov         esi, pSrc
00843               mov         edi, pDst
00844               mov         edx, DstStep
00845               mov         ecx, 2
00846               mov         ebx, edx
00847               mov         eax, edx
00848               sal         ebx, 1
00849               add         eax, ebx
00850               movdqa      xmm7, XMMWORD PTR const_128
00851 
00852        again:
00853 
00854               movdqa      xmm0, XMMWORD PTR [esi+0]  ; line 0
00855               movdqa      xmm1, XMMWORD PTR [esi+16] ; line 1
00856               movdqa      xmm2, XMMWORD PTR [esi+32] ; line 2
00857               movdqa      xmm3, XMMWORD PTR [esi+48] ; line 3
00858 
00859               paddw     xmm0, xmm7
00860               paddw     xmm1, xmm7
00861               paddw     xmm2, xmm7
00862               paddw     xmm3, xmm7
00863 
00864               packuswb  xmm0, xmm1
00865               packuswb  xmm2, xmm3
00866 
00867               movq      QWORD PTR [edi], xmm0      ;0*DstStep
00868               movq      QWORD PTR [edi+ebx], xmm2  ;2*DstStep
00869 
00870               psrldq      xmm0, 8
00871               psrldq      xmm2, 8
00872 
00873               movq      QWORD PTR [edi+edx], xmm0  ;1*DstStep
00874               movq      QWORD PTR [edi+eax], xmm2  ;3*DstStep
00875 
00876               add         edi, ebx
00877               add         esi, 64
00878               add         edi, ebx
00879               dec         ecx
00880               jnz         again
00881 
00882               pop         edi
00883               pop         esi
00884               pop         edx
00885               pop         ecx
00886               pop         ebx
00887               pop         eax
00888        }
00889 }
00890 
00891 
00892 /* 
00893 *  Name:
00894 *    ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
00895 *
00896 *  Purpose:
00897 *    Inverse DCT transform, de-quantization and level shift
00898 *
00899 *  Parameters:
00900 *    pSrc               - pointer to source
00901 *    pDst               - pointer to output array
00902 *    DstStep            - line offset for output data
00903 *    pEncoderQuantTable - pointer to Quantization table
00904 *
00905 */
00906 
00907 GLOBAL(void)
00908 ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
00909   short * pSrc,
00910   unsigned char *  pDst,
00911   int     DstStep,
00912   const unsigned short * pQuantInvTable)
00913 {
00914 
00915        __declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)];
00916        Ipp16s * workbuf = (Ipp16s *)buf;  
00917 
00918        ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable);
00919        dct_8x8_inv_16s(workbuf,workbuf);
00920        ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep);
00921   
00922 } 
00923 
00924 GLOBAL(void)
00925 jpeg_idct_islow_sse2 (
00926        j_decompress_ptr cinfo, 
00927        jpeg_component_info * compptr,
00928        JCOEFPTR coef_block,
00929        JSAMPARRAY output_buf, 
00930        JDIMENSION output_col)
00931 {
00932        int                  ctr;
00933        JCOEFPTR      inptr;
00934        Ipp16u*              quantptr;
00935        Ipp8u*        wsptr;
00936        __declspec(align(16)) Ipp8u workspace[DCTSIZE2];        
00937        JSAMPROW      outptr;
00938 
00939        inptr = coef_block;
00940        quantptr = (Ipp16u*)compptr->dct_table;
00941        wsptr = workspace;
00942        
00943        ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr);
00944 
00945        for(ctr = 0; ctr < DCTSIZE; ctr++)
00946        {
00947               outptr = output_buf[ctr] + output_col;
00948 
00949               outptr[0] = wsptr[0];
00950               outptr[1] = wsptr[1];
00951               outptr[2] = wsptr[2];
00952               outptr[3] = wsptr[3];
00953               outptr[4] = wsptr[4];
00954               outptr[5] = wsptr[5];
00955               outptr[6] = wsptr[6];
00956               outptr[7] = wsptr[7];
00957 
00958               wsptr += DCTSIZE;
00959        }
00960 }
00961 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
00962 
00963 #endif /* DCT_ISLOW_SUPPORTED */