Copyright (c) 2006 Dean Gaudet <dean@arctic.org>  All Rights Reserved.

This will eventually be part of a larger patch under an appropriate license.


Index: gsrc/jidctfst_sse2.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ gsrc/jidctfst_sse2.c	2006-08-26 23:32:55.000000000 -0700
@@ -0,0 +1,287 @@
+/*
+ * jidctfst_sse2.c
+ *
+ * Copyright (C) 2006, Dean Gaudet.
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file is an SSE2 reimplementation of jidctfst.c.  See that file for
+ * a more thorough description of the implementation.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h"		/* Private declarations for DCT subsystem */
+
+#if defined(DCT_IFAST_SUPPORTED) && defined(__SSE2__)
+
+#include <stdint.h>
+#include <emmintrin.h>
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+#error "Sorry, this code only copes with 8x8 DCTs."
+#endif
+
+#if BITS_IN_JSAMPLE != 8
+#error "Sorry, this code only supports BITS_IN_JSAMPLE == 8"
+#endif
+
+
+/* Scaling decisions are generally the same as in the LL&M algorithm;
+ * see jidctint.c for more details.  However, we choose to descale
+ * (right shift) multiplication products as soon as they are formed,
+ * rather than carrying additional fractional bits into subsequent additions.
+ * This compromises accuracy slightly, but it lets us save a few shifts.
+ * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
+ * everywhere except in the multiplications proper; this saves a good deal
+ * of work on 16-bit-int machines.
+ *
+ * The dequantized coefficients are not integers because the AA&N scaling
+ * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
+ * so that the first and second IDCT rounds have the same input scaling.
+ * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
+ * avoid a descaling shift; this compromises accuracy rather drastically
+ * for small quantization table entries, but it saves a lot of shifts.
+ * 12-bit JSAMPLEs would need more than 16x16-bit multiplies, so this SSE2
+ * port supports only BITS_IN_JSAMPLE == 8 (enforced by the #error above).
+ *
+ * A final compromise is to represent the multiplicative constants to only
+ * 8 fractional bits, rather than 13.  This saves some shifting work on some
+ * machines, and may also reduce the cost of multiplication (since there
+ * are fewer one-bits in the constants).
+ */
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+
+// Union overlaying eight shorts with one __m128i: lets us build vector
+// constants with an ordinary initializer and read them back as a register.
+typedef union {
+  short s[8];	/* scalar view: filled in by the X() initializer below */
+  __m128i m;	/* vector view: what the IDCT code actually loads */
+} simd_const_t;
+#define X(v) { .s = {v, v, v, v, v, v, v, v}}	/* broadcast v to all 8 lanes */
+static const simd_const_t FIX_1_082392200 = X(FIX(1.082392200));	/* 2*(c2-c6) */
+static const simd_const_t FIX_1_414213562 = X(FIX(1.414213562));	/* 2*c4 */
+static const simd_const_t FIX_1_847759065 = X(FIX(1.847759065));	/* 2*c2 */
+static const simd_const_t FIX_n2_613125930 = X(-FIX(2.613125930));	/* -2*(c2+c6) */
+
+static const simd_const_t CENTERJSAMPLE_vec = X(CENTERJSAMPLE);	/* output re-centering bias */
+#undef X
+
+
+// Lane-wise fixed-point multiply: produce bits [CONST_BITS..CONST_BITS+15]
+// of each 32-bit product, i.e. (var*c) >> CONST_BITS truncated to 16 bits.
+static inline __m128i MULTIPLY(__m128i var, __m128i c)
+{
+  __m128i low_bits  = _mm_srli_epi16(_mm_mullo_epi16(var, c), CONST_BITS);
+  __m128i high_bits = _mm_slli_epi16(_mm_mulhi_epi16(var, c), 16-CONST_BITS);
+  return _mm_or_si128(low_bits, high_bits);	/* disjoint bit ranges: OR == ADD */
+}
+
+
+/* Dequantize one row of eight coefficients by multiplying each against its
+ * multiplier-table entry.  For the 8-bit samples this file supports, a
+ * 16x16->16 multiply suffices.  The coefficient row may be unaligned; the
+ * quantizer table is 16-byte aligned (see the jddctmgr.c change).
+ */
+
+static inline __m128i DEQUANTIZE(const JCOEF *inptr, const short *quantval)
+{
+  const __m128i coefs = _mm_loadu_si128((const __m128i *)inptr);
+  return _mm_mullo_epi16(coefs, _mm_load_si128((const __m128i *)quantval));
+}
+
+
+// s0..s7 are an input 8x8 matrix of packed shorts... calculate its transpose.
+// there's a commented example of a similar transpose when arranging the output
+// bytes at the end of jpeg_idct_ifast.
+#define transpose8x8(d0, d1, d2, d3, d4, d5, d6, d7, s0, s1, s2, s3, s4, s5, s6, s7) do { \
+    __m128i t0 = _mm_unpacklo_epi16(s0, s1); /* interleave low 4 words of rows 0,1 */ \
+    __m128i t1 = _mm_unpacklo_epi16(s2, s3); /* ... and of rows 2,3 */ \
+    __m128i t2 = _mm_unpacklo_epi32(t0, t1); \
+    __m128i t3 = _mm_unpackhi_epi32(t0, t1); \
+    __m128i t4 = _mm_unpacklo_epi16(s4, s5); /* same for rows 4..7 */ \
+    __m128i t5 = _mm_unpacklo_epi16(s6, s7); \
+    __m128i t6 = _mm_unpacklo_epi32(t4, t5); \
+    __m128i t7 = _mm_unpackhi_epi32(t4, t5); \
+    d0 = _mm_unpacklo_epi64(t2, t6); /* d0..d3 = source columns 0..3 */ \
+    d1 = _mm_unpackhi_epi64(t2, t6); \
+    d2 = _mm_unpacklo_epi64(t3, t7); \
+    d3 = _mm_unpackhi_epi64(t3, t7); \
+    t0 = _mm_unpackhi_epi16(s0, s1); /* repeat with the high 4 words of each row */ \
+    t1 = _mm_unpackhi_epi16(s2, s3); \
+    t2 = _mm_unpacklo_epi32(t0, t1); \
+    t3 = _mm_unpackhi_epi32(t0, t1); \
+    t4 = _mm_unpackhi_epi16(s4, s5); \
+    t5 = _mm_unpackhi_epi16(s6, s7); \
+    t6 = _mm_unpacklo_epi32(t4, t5); \
+    t7 = _mm_unpackhi_epi32(t4, t5); \
+    d4 = _mm_unpacklo_epi64(t2, t6); /* d4..d7 = source columns 4..7 */ \
+    d5 = _mm_unpackhi_epi64(t2, t6); \
+    d6 = _mm_unpacklo_epi64(t3, t7); \
+    d7 = _mm_unpackhi_epi64(t3, t7); \
+  } while (0)
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ */
+
+GLOBAL(void)
+jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp10, tmp11, tmp12, tmp13;
+  __m128i z5, z10, z11, z12, z13;
+  const short * quantptr;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;	/* each register = one row/column of 8 shorts */
+  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
+  /* Pass 1: process all 8 columns at once; results stay in registers x0..x7. */
+
+  quantptr = (const short *) compptr->dct_table;	/* 16-byte aligned; see jddctmgr.c change */
+
+  /* Even part */
+
+  tmp0 = DEQUANTIZE(&coef_block[DCTSIZE*0], &quantptr[DCTSIZE*0]);	/* even rows 0,2,4,6 */
+  tmp1 = DEQUANTIZE(&coef_block[DCTSIZE*2], &quantptr[DCTSIZE*2]);
+  tmp2 = DEQUANTIZE(&coef_block[DCTSIZE*4], &quantptr[DCTSIZE*4]);
+  tmp3 = DEQUANTIZE(&coef_block[DCTSIZE*6], &quantptr[DCTSIZE*6]);
+
+  tmp10 = _mm_add_epi16(tmp0, tmp2);	/* phase 3 */
+  tmp11 = _mm_sub_epi16(tmp0, tmp2);
+
+  tmp13 = _mm_add_epi16(tmp1, tmp3);	/* phases 5-3 */
+  tmp12 = _mm_sub_epi16(MULTIPLY(_mm_sub_epi16(tmp1, tmp3), FIX_1_414213562.m), tmp13); /* 2*c4 */
+
+  tmp0 = _mm_add_epi16(tmp10, tmp13);	/* phase 2 */
+  tmp3 = _mm_sub_epi16(tmp10, tmp13);
+  tmp1 = _mm_add_epi16(tmp11, tmp12);
+  tmp2 = _mm_sub_epi16(tmp11, tmp12);
+
+  /* Odd part */
+
+  tmp4 = DEQUANTIZE(&coef_block[DCTSIZE*1], &quantptr[DCTSIZE*1]);	/* odd rows 1,3,5,7 */
+  tmp5 = DEQUANTIZE(&coef_block[DCTSIZE*3], &quantptr[DCTSIZE*3]);
+  tmp6 = DEQUANTIZE(&coef_block[DCTSIZE*5], &quantptr[DCTSIZE*5]);
+  tmp7 = DEQUANTIZE(&coef_block[DCTSIZE*7], &quantptr[DCTSIZE*7]);
+
+  z13 = _mm_add_epi16(tmp6, tmp5);		/* phase 6 */
+  z10 = _mm_sub_epi16(tmp6, tmp5);
+  z11 = _mm_add_epi16(tmp4, tmp7);
+  z12 = _mm_sub_epi16(tmp4, tmp7);
+
+  tmp7 = _mm_add_epi16(z11, z13);		/* phase 5 */
+  tmp11 = MULTIPLY(_mm_sub_epi16(z11, z13), FIX_1_414213562.m); /* 2*c4 */
+
+  z5 = MULTIPLY(_mm_add_epi16(z10, z12), FIX_1_847759065.m); /* 2*c2 */
+  tmp10 = _mm_sub_epi16(MULTIPLY(z12, FIX_1_082392200.m), z5); /* 2*(c2-c6) */
+  tmp12 = _mm_add_epi16(MULTIPLY(z10, FIX_n2_613125930.m), z5); /* -2*(c2+c6) */
+
+  tmp6 = _mm_sub_epi16(tmp12, tmp7);	/* phase 2 */
+  tmp5 = _mm_sub_epi16(tmp11, tmp6);
+  tmp4 = _mm_add_epi16(tmp10, tmp5);
+
+  x0 = _mm_adds_epi16(tmp0, tmp7);	/* saturating add/sub: clamp rather than wrap */
+  x7 = _mm_subs_epi16(tmp0, tmp7);
+  x1 = _mm_adds_epi16(tmp1, tmp6);
+  x6 = _mm_subs_epi16(tmp1, tmp6);
+  x2 = _mm_adds_epi16(tmp2, tmp5);
+  x5 = _mm_subs_epi16(tmp2, tmp5);
+  x4 = _mm_adds_epi16(tmp3, tmp4);
+  x3 = _mm_subs_epi16(tmp3, tmp4);
+
+  /* Pass 2: process rows (the transposed columns of pass 1). */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  transpose8x8(y0, y1, y2, y3, y4, y5, y6, y7, x0, x1, x2, x3, x4, x5, x6, x7);
+
+  /* Even part */
+
+  tmp10 = _mm_add_epi16(y0, y4);
+  tmp11 = _mm_sub_epi16(y0, y4);
+
+  tmp13 = _mm_add_epi16(y2, y6);
+  tmp12 = _mm_sub_epi16(MULTIPLY(_mm_sub_epi16(y2, y6), FIX_1_414213562.m), tmp13);
+
+  tmp0 = _mm_add_epi16(tmp10, tmp13);
+  tmp3 = _mm_sub_epi16(tmp10, tmp13);
+  tmp1 = _mm_add_epi16(tmp11, tmp12);
+  tmp2 = _mm_sub_epi16(tmp11, tmp12);
+
+  /* Odd part */
+
+  z13 = _mm_add_epi16(y5, y3);
+  z10 = _mm_sub_epi16(y5, y3);
+  z11 = _mm_add_epi16(y1, y7);
+  z12 = _mm_sub_epi16(y1, y7);
+
+  tmp7 = _mm_add_epi16(z11, z13);		/* phase 5 */
+  tmp11 = MULTIPLY(_mm_sub_epi16(z11, z13), FIX_1_414213562.m); /* 2*c4 */
+
+  z5 = MULTIPLY(_mm_add_epi16(z10, z12), FIX_1_847759065.m); /* 2*c2 */
+  tmp10 = _mm_sub_epi16(MULTIPLY(z12, FIX_1_082392200.m), z5); /* 2*(c2-c6) */
+  tmp12 = _mm_add_epi16(MULTIPLY(z10, FIX_n2_613125930.m), z5); /* -2*(c2+c6) */
+
+  tmp6 = _mm_sub_epi16(tmp12, tmp7);	/* phase 2 */
+  tmp5 = _mm_sub_epi16(tmp11, tmp6);
+  tmp4 = _mm_add_epi16(tmp10, tmp5);
+
+  /* Final output stage: scale down by 8 and undo PASS1_BITS, then re-center
+     on CENTERJSAMPLE; _mm_packus_epi16 below supplies the 0..255 range limit */
+
+  x0 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp0, tmp7), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x7 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp0, tmp7), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x1 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp1, tmp6), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x6 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp1, tmp6), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x2 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp2, tmp5), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x5 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp2, tmp5), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x4 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp3, tmp4), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+  x3 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp3, tmp4), PASS1_BITS+3), CENTERJSAMPLE_vec.m);
+
+  // range_limit/saturation step (signed 16-bit -> unsigned 8-bit with clamping)
+  y0 = _mm_packus_epi16(x0, x4);        // 74 64 54 44 34 24 14 04 .. 70 60 50 40 30 20 10 00
+  y1 = _mm_packus_epi16(x1, x5);        // 75 65 55 45 35 25 15 05 .. 71 61 51 41 31 21 11 01
+  y2 = _mm_packus_epi16(x2, x6);        // 76 66 56 46 36 26 16 06 .. 72 62 52 42 32 22 12 02
+  y3 = _mm_packus_epi16(x3, x7);        // 77 67 57 47 37 27 17 07 .. 73 63 53 43 33 23 13 03
+
+  // we now have to perform another transpose to arrange the output bytes
+  x0 = _mm_unpacklo_epi8(y0, y1);       // 71 70 61 60 51 50 41 40 .. 31 30 21 20 11 10 01 00
+  x1 = _mm_unpacklo_epi8(y2, y3);       // 73 72 63 62 53 52 43 42 .. 33 32 23 22 13 12 03 02
+  x2 = _mm_unpacklo_epi16(x0, x1);      // 33 32 31 30 23 22 21 20 .. 13 12 11 10 03 02 01 00
+  x3 = _mm_unpackhi_epi16(x0, x1);      // 73 72 71 70 63 62 61 60 .. 53 52 51 50 43 42 41 40
+
+  x4 = _mm_unpackhi_epi8(y0, y1);       // 75 74 65 64 55 54 45 44 .. 35 34 25 24 15 14 05 04
+  x5 = _mm_unpackhi_epi8(y2, y3);       // 77 76 67 66 57 56 47 46 .. 37 36 27 26 17 16 07 06
+  x6 = _mm_unpacklo_epi16(x4, x5);      // 37 36 35 34 27 26 25 24 .. 17 16 15 14 07 06 05 04
+  x7 = _mm_unpackhi_epi16(x4, x5);      // 77 76 75 74 67 66 65 64 .. 57 56 55 54 47 46 45 44
+
+  y0 = _mm_unpacklo_epi32(x2, x6);      // 17 16 15 14 13 12 11 10 .. 07 06 05 04 03 02 01 00
+  y1 = _mm_unpackhi_epi32(x2, x6);      // rows 2,3 of the output block
+  y2 = _mm_unpacklo_epi32(x3, x7);      // rows 4,5
+  y3 = _mm_unpackhi_epi32(x3, x7);      // rows 6,7
+
+  // the low/high 8-byte stores are done by casting the registers to __m128d
+  // and using the movlpd/movhpd instructions... there's nothing technically
+  // wrong with this (it won't cause troubles regardless of the bit patterns
+  // in the registers), it's just kind of ugly.  (_mm_storel_epi64 could do
+  // the low halves, but the high halves would still need an extra shuffle.)
+  _mm_storel_pd((double *)(output_buf[0] + output_col), (__m128d)y0);
+  _mm_storeh_pd((double *)(output_buf[1] + output_col), (__m128d)y0);
+  _mm_storel_pd((double *)(output_buf[2] + output_col), (__m128d)y1);
+  _mm_storeh_pd((double *)(output_buf[3] + output_col), (__m128d)y1);
+  _mm_storel_pd((double *)(output_buf[4] + output_col), (__m128d)y2);
+  _mm_storeh_pd((double *)(output_buf[5] + output_col), (__m128d)y2);
+  _mm_storel_pd((double *)(output_buf[6] + output_col), (__m128d)y3);
+  _mm_storeh_pd((double *)(output_buf[7] + output_col), (__m128d)y3);
+}
+
+#endif /* DCT_IFAST_SUPPORTED && __SSE2__ */
Index: gsrc/makefile.cfg
===================================================================
--- gsrc.orig/makefile.cfg	2006-08-25 14:54:28.000000000 -0700
+++ gsrc/makefile.cfg	2006-08-26 23:32:55.000000000 -0700
@@ -79,7 +79,7 @@ LIBSOURCES= jcapimin.c jcapistd.c jccoef
         jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
         jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
         jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctfst_sse2.c jidctint.c jidctred.c jquant1.c \
         jquant2.c jutils.c jmemmgr.c
 # memmgr back ends: compile only one of these into a working library
 SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
@@ -121,7 +121,7 @@ CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O)
 DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \
         jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \
         jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \
-        jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \
+        jidctfst.$(O) jidctfst_sse2.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \
         jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O)
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
@@ -287,6 +287,7 @@ jfdctfst.$(O): jfdctfst.c jinclude.h jco
 jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jidctfst_sse2.$(O): jidctfst_sse2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
Index: gsrc/jddctmgr.c
===================================================================
--- gsrc.orig/jddctmgr.c	2006-08-26 23:32:43.000000000 -0700
+++ gsrc/jddctmgr.c	2006-08-26 23:32:55.000000000 -0700
@@ -15,6 +15,7 @@
  * dequantization multiplier table needed by the IDCT routine.
  */
 
+#include <stdint.h>
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
@@ -259,9 +260,11 @@ jinit_inverse_dct (j_decompress_ptr cinf
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate and pre-zero a multiplier table for each component */
-    compptr->dct_table =
+    /* Provide 16-byte alignment for SSE2 support. */
+    char *t =
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(multiplier_table));
+				  SIZEOF(multiplier_table) + 15);
+    compptr->dct_table = (void *)(((uintptr_t)t + 15) & ~(uintptr_t)15);
     MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
     idct->cur_method[ci] = -1;
Index: gsrc/jdct.h
===================================================================
--- gsrc.orig/jdct.h	2006-08-26 23:32:43.000000000 -0700
+++ gsrc/jdct.h	2006-08-26 23:32:55.000000000 -0700
@@ -55,7 +55,11 @@ typedef JMETHOD(void, float_DCT_method_p
 
 typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
 #if BITS_IN_JSAMPLE == 8
+#ifndef __SSE2__
 typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+#else
+typedef short IFAST_MULT_TYPE; /* SSE2 code requires short */
+#endif
 #define IFAST_SCALE_BITS  2	/* fractional bits in scale factors */
 #else
 typedef INT32 IFAST_MULT_TYPE;	/* need 32 bits for scaled quantizers */
Index: gsrc/jidctfst.c
===================================================================
--- gsrc.orig/jidctfst.c	2006-08-25 15:17:25.000000000 -0700
+++ gsrc/jidctfst.c	2006-08-26 23:34:26.000000000 -0700
@@ -37,7 +37,7 @@
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
 
-#ifdef DCT_IFAST_SUPPORTED
+#if defined(DCT_IFAST_SUPPORTED) && !defined(__SSE2__)
 
 
 /*
