 gs/base/gxht_thresh.c | 208 +++++++++++++++++++++++++++++++++++++++++++++++--
 gs/base/gxht_thresh.h |   7 +
 2 files changed, 211 insertions(+), 4 deletions(-)
diff --git a/gs/base/gxht_thresh.c b/gs/base/gxht_thresh.c
index 2f975b478..517fdfae7 100644
--- a/gs/base/gxht_thresh.c
+++ b/gs/base/gxht_thresh.c
@@ -174,7 +174,115 @@ threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
 }
 #endif
 
-/* SSE2 and non-SSE2 implememntation of thresholding a row */
+/* SSE2 and non-SSE2 implementation of thresholding a row. Subtractive case.
+   There is some code replication between the two of these (additive and
+   subtractive) that we need to go back and determine how to combine
+   without any performance loss. */
+void
+gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip, int contone_stride,
+                            byte *halftone, int dithered_stride, int width,
+                            int num_rows, int offset_bits)
+{
+#ifndef HAVE_SSE2
+    int k, j;
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+    byte bit_init;
+
+    /* For the moment just do a very slow compare until we get
+       this working. This could use some serious optimization. */
+    width -= offset_bits;
+    for (j = 0; j < num_rows; j++) {
+        byte h;
+        contone_ptr = contone;
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        /* First get the left remainder portion. Put into MSBs of first byte */
+        bit_init = 0x80;
+        h = 0;
+        k = offset_bits;
+        if (k > 0) {
+            do {
+                if (*contone_ptr++ > *thresh_ptr++) {
+                    h |= bit_init;
+                }
+                bit_init >>= 1;
+                if (bit_init == 0) {
+                    bit_init = 0x80;
+                    *halftone_ptr++ = h;
+                    h = 0;
+                }
+                k--;
+            } while (k > 0);
+            bit_init = 0x80;
+            *halftone_ptr++ = h;
+            h = 0;
+            if (offset_bits < 8)
+                *halftone_ptr++ = 0;
+        }
+        /* Now get the rest, which will be 16 bit aligned. */
+        k = width;
+        if (k > 0) {
+            do {
+                if (*contone_ptr++ > *thresh_ptr++) {
+                    h |= bit_init;
+                }
+                bit_init >>= 1;
+                if (bit_init == 0) {
+                    bit_init = 0x80;
+                    *halftone_ptr++ = h;
+                    h = 0;
+                }
+                k--;
+            } while (k > 0);
+            if (bit_init != 0x80) {
+                *halftone_ptr++ = h;
+            }
+            if ((width & 15) < 8)
+                *halftone_ptr++ = 0;
+        }
+    }
+#else
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+    int num_tiles = (width - offset_bits + 15) >> 4;
+    int k, j;
+
+    for (j = 0; j < num_rows; j++) {
+        /* contone and thresh_ptr are 128 bit aligned. We do need to do this
+           in two steps to ensure that we pack the bits in an aligned fashion
+           into halftone_ptr. */
+        contone_ptr = contone;
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        if (offset_bits > 0) {
+            /* Since we allowed for 16 bits in our left remainder
+               we can go directly into the destination. threshold_16_SSE
+               requires 128 bit alignment. contone_ptr and thresh_ptr
+               are set up so that after we move in by offset_bits elements
+               we are 128 bit aligned. */
+            threshold_16_SSE_unaligned(thresh_ptr, contone_ptr,
+                                       halftone_ptr);
+            halftone_ptr += 2;
+            thresh_ptr += offset_bits;
+            contone_ptr += offset_bits;
+        }
+        /* Now we should be 128 bit aligned with our input data. Iterate
+           over sets of 16, going directly into our HT buffer. Source and
+           halftone_ptr buffers should be padded to allow a 15 bit overrun. */
+        for (k = 0; k < num_tiles; k++) {
+            threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr);
+            thresh_ptr += 16;
+            contone_ptr += 16;
+            halftone_ptr += 2;
+        }
+    }
+#endif
+}
+
+/* SSE2 and non-SSE2 implementation of thresholding a row. Additive case. */
 void
 gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride,
                         byte *halftone, int dithered_stride, int width,
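
To see the non-SSE2 bit-packing in isolation: a minimal sketch of a single-row version follows. threshold_row_scalar and its flat-buffer signature are illustrative stand-ins, not Ghostscript API; the offset_bits remainder handling, row strides, and 16-bit-boundary padding from the function above are deliberately omitted.

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative stand-in (not Ghostscript API): pack one row of
     * "contone > threshold" comparisons into MSB-first halftone bits,
     * one output byte per 8 input pixels. */
    static void
    threshold_row_scalar(const uint8_t *contone, const uint8_t *threshold,
                         uint8_t *out, size_t width)
    {
        uint8_t bit = 0x80;     /* next bit position, MSB first */
        uint8_t acc = 0;        /* output byte being assembled */
        size_t i;

        for (i = 0; i < width; i++) {
            if (contone[i] > threshold[i])
                acc |= bit;     /* pixel is "on" in the halftone */
            bit >>= 1;
            if (bit == 0) {     /* byte full: flush and restart */
                *out++ = acc;
                acc = 0;
                bit = 0x80;
            }
        }
        if (bit != 0x80)        /* flush any partial final byte */
            *out++ = acc;
    }

With width = 20, for example, this emits two full bytes and a partial third whose low four bits remain zero. Note the argument-order detail: the subtractive callers in gxht_thresh_planes below pass (contone_align, thresh_align) where the old additive call passed (thresh_align, contone_align), so the same comparison shape serves both polarities.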
@@ -279,7 +387,99 @@ gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride,
 #endif
 }
 
-/* This thresholds a buffer that is LAND_BITS wide by data_length tall. */
+/* This thresholds a buffer that is LAND_BITS wide by data_length tall.
+   Subtractive case. */
+void
+gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align,
+                              ht_landscape_info_t ht_landscape, byte *halftone,
+                              int data_length)
+{
+    __align16 byte contone[LAND_BITS];
+    int position_start, position, curr_position;
+    int *widths = &(ht_landscape.widths[0]);
+    int local_widths[LAND_BITS];
+    int num_contone = ht_landscape.num_contones;
+    int k, j, w, contone_out_posit;
+    byte *contone_ptr, *thresh_ptr, *halftone_ptr;
+#ifdef PACIFY_VALGRIND
+    int extra = 0;
+#endif
+
+    /* Work through chunks of 16. */
+    /* Data may have come in left to right or right to left. */
+    if (ht_landscape.index > 0) {
+        position = position_start = 0;
+    } else {
+        position = position_start = ht_landscape.curr_pos + 1;
+    }
+    thresh_ptr = thresh_align;
+    halftone_ptr = halftone;
+    /* Copy the widths to a local array, and truncate the last one (which may
+     * be the first one!) if required. */
+    k = 0;
+    for (j = 0; j < num_contone; j++)
+        k += (local_widths[j] = widths[position_start + j]);
+    if (k > LAND_BITS) {
+        if (ht_landscape.index > 0) {
+            local_widths[num_contone - 1] -= k - LAND_BITS;
+        } else {
+            local_widths[0] -= k - LAND_BITS;
+        }
+    }
+#ifdef PACIFY_VALGRIND
+    if (k < LAND_BITS) {
+        extra = LAND_BITS - k;
+    }
+#endif
+    for (k = data_length; k > 0; k--) { /* Loop on rows */
+        contone_ptr = &(contone_align[position]); /* Point us to our row start */
+        curr_position = 0; /* We use this in keeping track of widths */
+        contone_out_posit = 0; /* Our index out */
+        for (j = num_contone; j > 0; j--) {
+            byte c = *contone_ptr;
+            /* The Microsoft compiler cleverly spots that the following loop
+             * can be replaced by a memset. Unfortunately, it can't spot that
+             * the typical length values of the memset are so small that we'd
+             * be better off doing it the slow way. We therefore introduce a
+             * sneaky 'volatile' cast below that stops this optimisation. */
+            w = local_widths[curr_position];
+            do {
+                ((volatile byte *)contone)[contone_out_posit] = c;
+                contone_out_posit++;
+            } while (--w);
+#ifdef PACIFY_VALGRIND
+            if (extra)
+                memset(contone + contone_out_posit, 0, extra);
+#endif
+            curr_position++; /* Move us to the next position in our width array */
+            contone_ptr++;   /* Move us to a new location in our contone buffer */
+        }
+        /* Now we have our left-justified and expanded contone data for
+           LAND_BITS/16 sets of 16 bits. Go ahead and threshold these. */
+        contone_ptr = &contone[0];
+#if LAND_BITS > 16
+        j = LAND_BITS;
+        do {
+#endif
+#ifdef HAVE_SSE2
+            threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr);
+#else
+            threshold_16_bit(thresh_ptr, contone_ptr, halftone_ptr);
+#endif
+            thresh_ptr += 16;
+            position += 16;
+            halftone_ptr += 2;
+            contone_ptr += 16;
+#if LAND_BITS > 16
+            j -= 16;
+        } while (j > 0);
+#endif
+    }
+}
+
+/* This thresholds a buffer that is LAND_BITS wide by data_length tall.
+   Additive case. Note we could likely do some code reduction between
+   the additive and subtractive cases. */
 void
 gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
                           ht_landscape_info_t ht_landscape, byte *halftone,
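
The 'volatile' cast called out in the comment above is easy to miss. A minimal sketch of the pattern, with fill_short_run as a made-up name:

    /* Hypothetical illustration: without the volatile-qualified pointer, an
     * optimizer may rewrite this short fill loop as a call to memset(), whose
     * call overhead dominates for the very small run lengths typical here.
     * Writing through volatile forces the byte-at-a-time stores to stay. */
    static void
    fill_short_run(unsigned char *dst, unsigned char value, int count)
    {
        volatile unsigned char *p = dst;
        while (count-- > 0)
            *p++ = value;
    }

Whether the cast is still necessary is compiler-specific; the comment in the code targets the Microsoft compiler's loop-to-memset rewrite in particular.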
@@ -718,7 +918,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun,
 
     if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE
         && is_planar_dev) {
-        gx_ht_threshold_row_bit(thresh_align, contone_align, contone_stride,
+        gx_ht_threshold_row_bit_sub(contone_align, thresh_align, contone_stride,
                                 halftone, dithered_stride, dest_width, vdi,
                                 offset_bits);
     } else {
@@ -868,7 +1068,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun,
             /* Apply the threshold operation */
             if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE
                 && is_planar_dev) {
-                gx_ht_threshold_landscape(thresh_align, contone_align,
+                gx_ht_threshold_landscape_sub(contone_align, thresh_align,
                                           penum->ht_landscape, halftone, dest_height);
             } else {
                 gx_ht_threshold_landscape(contone_align, thresh_align,
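
As a quick sanity check of the tile arithmetic shared by the SSE2 branches above (the width and offset_bits values are arbitrary examples, not taken from the commit):

    #include <stdio.h>

    int main(void)
    {
        /* Example values only: a 100-pixel row whose first 4 pixels fall
         * before the 128-bit-aligned region. */
        int width = 100, offset_bits = 4;
        /* One unaligned 16-wide pass covers the remainder (2 output bytes),
         * then the remaining 96 pixels need ceil(96 / 16) = 6 aligned tiles. */
        int num_tiles = (width - offset_bits + 15) >> 4;

        printf("tiles: %d, output bytes: %d\n", num_tiles, 2 + 2 * num_tiles);
        return 0;   /* prints "tiles: 6, output bytes: 14" */
    }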
diff --git a/gs/base/gxht_thresh.h b/gs/base/gxht_thresh.h
index ef4a8ecc9..8a05eb4fa 100644
--- a/gs/base/gxht_thresh.h
+++ b/gs/base/gxht_thresh.h
@@ -29,9 +29,16 @@ void gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip,
                              int contone_stride, byte *halftone,
                              int dithered_stride, int width, int num_rows,
                              int offset_bits);
+void gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip,
+                                 int contone_stride, byte *halftone,
+                                 int dithered_stride, int width, int num_rows,
+                                 int offset_bits);
 void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
                                ht_landscape_info_t ht_landscape, byte *halftone,
                                int data_length);
+void gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align,
+                                   ht_landscape_info_t ht_landscape, byte *halftone,
+                                   int data_length);
 int gxht_thresh_image_init(gx_image_enum *penum);
 int gxht_thresh_planes(gx_image_enum *penum, fixed xrun, int dest_width,
                        int dest_height, byte *thresh_align, gx_device * dev,