2 files changed, 211 insertions, 4 deletions
diff --git a/gs/base/gxht_thresh.c b/gs/base/gxht_thresh.c
index 2f975b478..517fdfae7 100644
--- a/gs/base/gxht_thresh.c
+++ b/gs/base/gxht_thresh.c
@@ -174,7 +174,115 @@ threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
 }
 #endif
-/* SSE2 and non-SSE2 implememntation of thresholding a row  */
+/* SSE2 and non-SSE2 implementation of thresholding a row.  Subtractive case.
+   One contone row is compared against num_rows rows of threshold_strip.
+   Much of this code is replicated from the additive version; the two still
+   need to be combined in a way that costs no performance. */
+void
+gx_ht_threshold_row_bit_sub(byte *contone,  byte *threshold_strip,  int contone_stride,
+                  byte *halftone, int dithered_stride, int width,
+                  int num_rows, int offset_bits)
+{
+#ifndef HAVE_SSE2
+    int k, j;
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+    byte bit_init;
+    /* For the moment just do a very slow compare, one sample at a time,
+       until we get this working.  This could use some serious optimization */
+    width -= offset_bits; /* width now counts only the samples after the left remainder */
+    for (j = 0; j < num_rows; j++) {
+        byte h; /* output byte currently being assembled */
+        contone_ptr = contone; /* the same contone row is reused for every j */
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        /* First get the left remainder portion.  Put into MSBs of first byte */
+        bit_init = 0x80;
+        h = 0;
+        k = offset_bits;
+        if (k > 0) {
+            do {
+                if (*contone_ptr++ > *thresh_ptr++) { /* bit is set where contone exceeds threshold */
+                    h |=  bit_init;
+                }
+                bit_init >>= 1;
+                if (bit_init == 0) { /* eight bits collected: store the byte */
+                    bit_init = 0x80;
+                    *halftone_ptr++ = h;
+                    h = 0;
+                }
+                k--;
+            } while (k > 0);
+            bit_init = 0x80; /* restart at the MSB for the main portion */
+            *halftone_ptr++ = h;
+            h = 0;
+            if (offset_bits < 8)
+                *halftone_ptr++ = 0; /* zero pad so the remainder occupies two bytes */
+        }
+        /* Now get the rest, which will be 16 bit aligned. */
+        k = width;
+        if (k > 0) {
+            do {
+                if (*contone_ptr++ > *thresh_ptr++) {
+                    h |=  bit_init;
+                }
+                bit_init >>= 1;
+                if (bit_init == 0) {
+                    bit_init = 0x80;
+                    *halftone_ptr++ = h;
+                    h = 0;
+                }
+                k--;
+            } while (k > 0);
+            if (bit_init != 0x80) { /* flush a partially filled last byte */
+                *halftone_ptr++ = h;
+            }
+            if ((width & 15) < 8)
+                *halftone_ptr++ = 0; /* append one zero byte after the data */
+        }
+    }
+#else
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+    int num_tiles = (width - offset_bits + 15)>>4; /* groups of 16 after the remainder, rounded up */
+    int k, j;
+    for (j = 0; j < num_rows; j++) {
+        /* contone and thresh_ptr are 128 bit aligned.  We do need to do this in
+           two steps to ensure that we pack the bits in an aligned fashion
+           into halftone_ptr.  */
+        contone_ptr = contone;
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        if (offset_bits > 0) {
+            /* Since we allowed for 16 bits in our left remainder
+               we can go directly in to the destination.  threshold_16_SSE
+               requires 128 bit alignment.  contone_ptr and thresh_ptr
+               are set up so that after we move in by offset_bits elements
+               then we are 128 bit aligned.  */
+            threshold_16_SSE_unaligned(thresh_ptr, contone_ptr,
+                                       halftone_ptr); /* thresh fills the callee's contone_ptr slot, contone its thresh_ptr slot */
+            halftone_ptr += 2;
+            thresh_ptr += offset_bits;
+            contone_ptr += offset_bits;
+        }
+        /* Now we should have 128 bit aligned with our input data. Iterate
+           over sets of 16 going directly into our HT buffer.  Sources and
+           halftone_ptr buffers should be padded to allow 15 bit overrun */
+        for (k = 0; k < num_tiles; k++) {
+            threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr); /* thresh_ptr is passed ahead of contone_ptr here too */
+            thresh_ptr += 16;
+            contone_ptr += 16;
+            halftone_ptr += 2;
+        }
+    }
+#endif
+}
+/* SSE2 and non-SSE2 implementation of thresholding a row. Additive case  */
 void
 gx_ht_threshold_row_bit(byte *contone,  byte *threshold_strip,  int contone_stride,
                  byte *halftone, int dithered_stride, int width,
@@ -279,7 +387,99 @@ gx_ht_threshold_row_bit(byte *contone,  byte *threshold_strip,  int contone_stri
 #endif
 }
-/* This thresholds a buffer that is LAND_BITS wide by data_length tall. */
+/* This thresholds a buffer that is LAND_BITS wide by data_length tall.
+   Subtractive case: thresh_ptr is passed ahead of contone_ptr to the compare */
+void
+gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align,
+                    ht_landscape_info_t ht_landscape, byte *halftone,
+                    int data_length)
+{
+    __align16 byte contone[LAND_BITS]; /* one row of contone data, expanded by the widths */
+    int position_start, position, curr_position;
+    int *widths = &(ht_landscape.widths[0]);
+    int local_widths[LAND_BITS];
+    int num_contone = ht_landscape.num_contones;
+    int k, j, w, contone_out_posit;
+    byte *contone_ptr, *thresh_ptr, *halftone_ptr;
+#ifdef PACIFY_VALGRIND
+    int extra = 0; /* number of bytes of contone[] that the widths leave unfilled */
+#endif
+    /* Work through chunks of 16.  */
+    /* Data may have come in left to right or right to left. */
+    if (ht_landscape.index > 0) {
+        position = position_start = 0;
+    } else {
+        position = position_start = ht_landscape.curr_pos + 1;
+    }
+    thresh_ptr = thresh_align;
+    halftone_ptr = halftone;
+    /* Copy the widths to a local array, and truncate the last one (which may
+     * be the first one!) if required. */
+    k = 0; /* k accumulates the sum of the copied widths */
+    for (j = 0; j < num_contone; j++)
+        k += (local_widths[j] = widths[position_start+j]);
+    if (k > LAND_BITS) { /* clip the overshoot so the widths total LAND_BITS */
+        if (ht_landscape.index > 0) {
+            local_widths[num_contone-1] -= k-LAND_BITS;
+        } else {
+            local_widths[0] -= k-LAND_BITS;
+        }
+    }
+#ifdef PACIFY_VALGRIND
+    if (k < LAND_BITS) {
+        extra = LAND_BITS - k;
+    }
+#endif
+    for (k = data_length; k > 0; k--) { /* Loop on rows */
+        contone_ptr = &(contone_align[position]); /* Point us to our row start */
+        curr_position = 0; /* We use this in keeping track of widths */
+        contone_out_posit = 0; /* Our index out */
+        for (j = num_contone; j > 0; j--) {
+            byte c = *contone_ptr;
+            /* The microsoft compiler, cleverly spots that the following loop
+             * can be replaced by a memset. Unfortunately, it can't spot that
+             * the typical length values of the memset are so small that we'd
+             * be better off doing it the slow way. We therefore introduce a
+             * sneaky 'volatile' cast below that stops this optimisation. */
+            w = local_widths[curr_position];
+            do { /* NOTE(review): do/while with --w relies on w >= 1 here; confirm for clipped widths */
+                ((volatile byte *)contone)[contone_out_posit] = c;
+                contone_out_posit++;
+            } while (--w);
+#ifdef PACIFY_VALGRIND
+            if (extra)
+                memset(contone+contone_out_posit, 0, extra); /* zero the bytes just past what has been written */
+#endif
+            curr_position++; /* Move us to the next position in our width array */
+            contone_ptr++;   /* Move us to a new location in our contone buffer */
+        }
+        /* Now we have our left justified and expanded contone data for
+           LAND_BITS/16 sets of 16 bits. Go ahead and threshold these. */
+        contone_ptr = &contone[0];
+#if LAND_BITS > 16
+        j = LAND_BITS;
+        do {
+#endif
+#ifdef HAVE_SSE2
+            threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr);
+#else
+            threshold_16_bit(thresh_ptr, contone_ptr, halftone_ptr);
+#endif
+            thresh_ptr += 16;
+            position += 16; /* advances the contone_align index used for the next row */
+            halftone_ptr += 2;
+            contone_ptr += 16;
+#if LAND_BITS > 16
+            j -= 16;
+        } while (j > 0);
+#endif
+    }
+}
+/* This thresholds a buffer that is LAND_BITS wide by data_length tall.
+   Additive case.  Note I could likely do some code reduction between
+   the additive and subtractive cases */
 void
 gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
                    ht_landscape_info_t ht_landscape, byte *halftone,
@@ -718,7 +918,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun,
                if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE
                    && is_planar_dev) {
-                    gx_ht_threshold_row_bit(thresh_align, contone_align, contone_stride,
+                    gx_ht_threshold_row_bit_sub(contone_align, thresh_align, contone_stride,
                                      halftone, dithered_stride, dest_width, vdi,
                                      offset_bits);
                } else {
@@ -868,7 +1068,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun,
                    /* Apply the threshold operation */
                    if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE
                        && is_planar_dev) {
-                        gx_ht_threshold_landscape(thresh_align, contone_align,
+                        gx_ht_threshold_landscape_sub(contone_align, thresh_align,
                                            penum->ht_landscape, halftone, dest_height);
                    } else {
                        gx_ht_threshold_landscape(contone_align, thresh_align,
diff --git a/gs/base/gxht_thresh.h b/gs/base/gxht_thresh.h
index ef4a8ecc9..8a05eb4fa 100644
--- a/gs/base/gxht_thresh.h
+++ b/gs/base/gxht_thresh.h
@@ -29,9 +29,16 @@ void gx_ht_threshold_row_bit(byte *contone,  byte *threshold_strip,
                             int contone_stride, byte *halftone,
                             int dithered_stride, int width, int num_rows,
                             int offset_bits);
+void gx_ht_threshold_row_bit_sub(byte *contone,  byte *threshold_strip,
+                             int contone_stride, byte *halftone,
+                             int dithered_stride, int width, int num_rows,
+                             int offset_bits); /* subtractive-case variant of gx_ht_threshold_row_bit */
 void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
                    ht_landscape_info_t ht_landscape, byte *halftone,
                    int data_length);
+void gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align,
+                    ht_landscape_info_t ht_landscape, byte *halftone,
+                    int data_length); /* subtractive-case variant of gx_ht_threshold_landscape */
 int gxht_thresh_image_init(gx_image_enum *penum);
 int gxht_thresh_planes(gx_image_enum *penum, fixed xrun, int dest_width,
                       int dest_height, byte *thresh_align, gx_device * dev,

diff --git a/gs/base/gxht_thresh.c b/gs/base/gxht_thresh.c index 2f975b478..517fdfae7 100644 --- a/gs/base/gxht_thresh.c +++ b/gs/base/gxht_thresh.c
@@ -174,7 +174,115 @@ threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
174	}	174	}
175	#endif	175	#endif
176		176
177	/* SSE2 and non-SSE2 implememntation of thresholding a row */	177	/* SSE2 and non-SSE2 implememntation of thresholding a row. Subtractive case
		178	There is some code replication between the two of these (additive and subtractive)
		179	that I need to go back and determine how we can combine them without
		180	any performance loss. */
		181	void
		182	gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip, int contone_stride,
		183	byte *halftone, int dithered_stride, int width,
		184	int num_rows, int offset_bits)
		185	{
		186	#ifndef HAVE_SSE2
		187	int k, j;
		188	byte *contone_ptr;
		189	byte *thresh_ptr;
		190	byte *halftone_ptr;
		191	byte bit_init;
		192
		193	/* For the moment just do a very slow compare until we get
		194	get this working. This could use some serious optimization */
		195	width -= offset_bits;
		196	for (j = 0; j < num_rows; j++) {
		197	byte h;
		198	contone_ptr = contone;
		199	thresh_ptr = threshold_strip + contone_stride * j;
		200	halftone_ptr = halftone + dithered_stride * j;
		201	/* First get the left remainder portion. Put into MSBs of first byte */
		202	bit_init = 0x80;
		203	h = 0;
		204	k = offset_bits;
		205	if (k > 0) {
		206	do {
		207	if (*contone_ptr++ > *thresh_ptr++) {
		208	h |= bit_init;
		209	}
		210	bit_init >>= 1;
		211	if (bit_init == 0) {
		212	bit_init = 0x80;
		213	*halftone_ptr++ = h;
		214	h = 0;
		215	}
		216	k--;
		217	} while (k > 0);
		218	bit_init = 0x80;
		219	*halftone_ptr++ = h;
		220	h = 0;
		221	if (offset_bits < 8)
		222	*halftone_ptr++ = 0;
		223	}
		224	/* Now get the rest, which will be 16 bit aligned. */
		225	k = width;
		226	if (k > 0) {
		227	do {
		228	if (*contone_ptr++ > *thresh_ptr++) {
		229	h |= bit_init;
		230	}
		231	bit_init >>= 1;
		232	if (bit_init == 0) {
		233	bit_init = 0x80;
		234	*halftone_ptr++ = h;
		235	h = 0;
		236	}
		237	k--;
		238	} while (k > 0);
		239	if (bit_init != 0x80) {
		240	*halftone_ptr++ = h;
		241	}
		242	if ((width & 15) < 8)
		243	*halftone_ptr++ = 0;
		244	}
		245	}
		246	#else
		247	byte *contone_ptr;
		248	byte *thresh_ptr;
		249	byte *halftone_ptr;
		250	int num_tiles = (width - offset_bits + 15)>>4;
		251	int k, j;
		252
		253	for (j = 0; j < num_rows; j++) {
		254	/* contone and thresh_ptr are 128 bit aligned. We do need to do this in
		255	two steps to ensure that we pack the bits in an aligned fashion
		256	into halftone_ptr. */
		257	contone_ptr = contone;
		258	thresh_ptr = threshold_strip + contone_stride * j;
		259	halftone_ptr = halftone + dithered_stride * j;
		260	if (offset_bits > 0) {
		261	/* Since we allowed for 16 bits in our left remainder
		262	we can go directly in to the destination. threshold_16_SSE
		263	requires 128 bit alignment. contone_ptr and thresh_ptr
		264	are set up so that after we move in by offset_bits elements
		265	then we are 128 bit aligned. */
		266	threshold_16_SSE_unaligned(thresh_ptr, contone_ptr,
		267	halftone_ptr);
		268	halftone_ptr += 2;
		269	thresh_ptr += offset_bits;
		270	contone_ptr += offset_bits;
		271	}
		272	/* Now we should have 128 bit aligned with our input data. Iterate
		273	over sets of 16 going directly into our HT buffer. Sources and
		274	halftone_ptr buffers should be padded to allow 15 bit overrun */
		275	for (k = 0; k < num_tiles; k++) {
		276	threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr);
		277	thresh_ptr += 16;
		278	contone_ptr += 16;
		279	halftone_ptr += 2;
		280	}
		281	}
		282	#endif
		283	}
		284
		285	/* SSE2 and non-SSE2 implememntation of thresholding a row. additive case */
178	void	286	void
179	gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride,	287	gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride,
180	byte *halftone, int dithered_stride, int width,	288	byte *halftone, int dithered_stride, int width,
@@ -279,7 +387,99 @@ gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stri
279	#endif	387	#endif
280	}	388	}
281		389
282	/* This thresholds a buffer that is LAND_BITS wide by data_length tall. */	390	/* This thresholds a buffer that is LAND_BITS wide by data_length tall.
		391	Subtractive case */
		392	void
		393	gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align,
		394	ht_landscape_info_t ht_landscape, byte *halftone,
		395	int data_length)
		396	{
		397	__align16 byte contone[LAND_BITS];
		398	int position_start, position, curr_position;
		399	int *widths = &(ht_landscape.widths[0]);
		400	int local_widths[LAND_BITS];
		401	int num_contone = ht_landscape.num_contones;
		402	int k, j, w, contone_out_posit;
		403	byte *contone_ptr, *thresh_ptr, *halftone_ptr;
		404	#ifdef PACIFY_VALGRIND
		405	int extra = 0;
		406	#endif
		407
		408	/* Work through chunks of 16. */
		409	/* Data may have come in left to right or right to left. */
		410	if (ht_landscape.index > 0) {
		411	position = position_start = 0;
		412	} else {
		413	position = position_start = ht_landscape.curr_pos + 1;
		414	}
		415	thresh_ptr = thresh_align;
		416	halftone_ptr = halftone;
		417	/* Copy the widths to a local array, and truncate the last one (which may
		418	* be the first one!) if required. */
		419	k = 0;
		420	for (j = 0; j < num_contone; j++)
		421	k += (local_widths[j] = widths[position_start+j]);
		422	if (k > LAND_BITS) {
		423	if (ht_landscape.index > 0) {
		424	local_widths[num_contone-1] -= k-LAND_BITS;
		425	} else {
		426	local_widths[0] -= k-LAND_BITS;
		427	}
		428	}
		429	#ifdef PACIFY_VALGRIND
		430	if (k < LAND_BITS) {
		431	extra = LAND_BITS - k;
		432	}
		433	#endif
		434	for (k = data_length; k > 0; k--) { /* Loop on rows */
		435	contone_ptr = &(contone_align[position]); /* Point us to our row start */
		436	curr_position = 0; /* We use this in keeping track of widths */
		437	contone_out_posit = 0; /* Our index out */
		438	for (j = num_contone; j > 0; j--) {
		439	byte c = *contone_ptr;
		440	/* The microsoft compiler, cleverly spots that the following loop
		441	* can be replaced by a memset. Unfortunately, it can't spot that
		442	* the typical length values of the memset are so small that we'd
		443	* be better off doing it the slow way. We therefore introduce a
		444	* sneaky 'volatile' cast below that stops this optimisation. */
		445	w = local_widths[curr_position];
		446	do {
		447	((volatile byte *)contone)[contone_out_posit] = c;
		448	contone_out_posit++;
		449	} while (--w);
		450	#ifdef PACIFY_VALGRIND
		451	if (extra)
		452	memset(contone+contone_out_posit, 0, extra);
		453	#endif
		454	curr_position++; /* Move us to the next position in our width array */
		455	contone_ptr++; /* Move us to a new location in our contone buffer */
		456	}
		457	/* Now we have our left justified and expanded contone data for
		458	LAND_BITS/16 sets of 16 bits. Go ahead and threshold these. */
		459	contone_ptr = &contone[0];
		460	#if LAND_BITS > 16
		461	j = LAND_BITS;
		462	do {
		463	#endif
		464	#ifdef HAVE_SSE2
		465	threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr);
		466	#else
		467	threshold_16_bit(thresh_ptr, contone_ptr, halftone_ptr);
		468	#endif
		469	thresh_ptr += 16;
		470	position += 16;
		471	halftone_ptr += 2;
		472	contone_ptr += 16;
		473	#if LAND_BITS > 16
		474	j -= 16;
		475	} while (j > 0);
		476	#endif
		477	}
		478	}
		479
		480	/* This thresholds a buffer that is LAND_BITS wide by data_length tall.
		481	Additive case. Note I could likely do some code reduction between
		482	the additive and subtractive cases */
283	void	483	void
284	gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,	484	gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
285	ht_landscape_info_t ht_landscape, byte *halftone,	485	ht_landscape_info_t ht_landscape, byte *halftone,
@@ -718,7 +918,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun,
718		918
719	if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE	919	if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE
720	&& is_planar_dev) {	920	&& is_planar_dev) {
721	gx_ht_threshold_row_bit(thresh_align, contone_align, contone_stride,	921	gx_ht_threshold_row_bit_sub(contone_align, thresh_align, contone_stride,
722	halftone, dithered_stride, dest_width, vdi,	922	halftone, dithered_stride, dest_width, vdi,
723	offset_bits);	923	offset_bits);
724	} else {	924	} else {
@@ -868,7 +1068,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun,
868	/* Apply the threshold operation */	1068	/* Apply the threshold operation */
869	if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE	1069	if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE
870	&& is_planar_dev) {	1070	&& is_planar_dev) {
871	gx_ht_threshold_landscape(thresh_align, contone_align,	1071	gx_ht_threshold_landscape_sub(contone_align, thresh_align,
872	penum->ht_landscape, halftone, dest_height);	1072	penum->ht_landscape, halftone, dest_height);
873	} else {	1073	} else {
874	gx_ht_threshold_landscape(contone_align, thresh_align,	1074	gx_ht_threshold_landscape(contone_align, thresh_align,


diff --git a/gs/base/gxht_thresh.h b/gs/base/gxht_thresh.h index ef4a8ecc9..8a05eb4fa 100644 --- a/gs/base/gxht_thresh.h +++ b/gs/base/gxht_thresh.h
@@ -29,9 +29,16 @@ void gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip,
29	int contone_stride, byte *halftone,	29	int contone_stride, byte *halftone,
30	int dithered_stride, int width, int num_rows,	30	int dithered_stride, int width, int num_rows,
31	int offset_bits);	31	int offset_bits);
		32	void gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip,
		33	int contone_stride, byte *halftone,
		34	int dithered_stride, int width, int num_rows,
		35	int offset_bits);
32	void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,	36	void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
33	ht_landscape_info_t ht_landscape, byte *halftone,	37	ht_landscape_info_t ht_landscape, byte *halftone,
34	int data_length);	38	int data_length);
		39	void gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align,
		40	ht_landscape_info_t ht_landscape, byte *halftone,
		41	int data_length);
35	int gxht_thresh_image_init(gx_image_enum *penum);	42	int gxht_thresh_image_init(gx_image_enum *penum);
36	int gxht_thresh_planes(gx_image_enum *penum, fixed xrun, int dest_width,	43	int gxht_thresh_planes(gx_image_enum *penum, fixed xrun, int dest_width,
37	int dest_height, byte *thresh_align, gx_device * dev,	44	int dest_height, byte *thresh_align, gx_device * dev,