diff options
-rw-r--r-- | gs/base/gxht_thresh.c | 208 | ||||
-rw-r--r-- | gs/base/gxht_thresh.h | 7 |
2 files changed, 211 insertions, 4 deletions
diff --git a/gs/base/gxht_thresh.c b/gs/base/gxht_thresh.c index 2f975b478..517fdfae7 100644 --- a/gs/base/gxht_thresh.c +++ b/gs/base/gxht_thresh.c | |||
@@ -174,7 +174,115 @@ threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data) | |||
174 | } | 174 | } |
175 | #endif | 175 | #endif |
176 | 176 | ||
177 | /* SSE2 and non-SSE2 implementation of thresholding a row */ | 177 | /* SSE2 and non-SSE2 implementation of thresholding a row. Subtractive case |
178 | There is some code replication between the two of these (additive and subtractive) | ||
179 | that I need to go back and determine how we can combine them without | ||
180 | any performance loss. */ | ||
181 | void | ||
182 | gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip, int contone_stride, | ||
183 | byte *halftone, int dithered_stride, int width, | ||
184 | int num_rows, int offset_bits) | ||
185 | { | ||
186 | #ifndef HAVE_SSE2 | ||
187 | int k, j; | ||
188 | byte *contone_ptr; | ||
189 | byte *thresh_ptr; | ||
190 | byte *halftone_ptr; | ||
191 | byte bit_init; | ||
192 | |||
193 | /* For the moment just do a very slow compare until we get | ||
194 | get this working. This could use some serious optimization */ | ||
195 | width -= offset_bits; | ||
196 | for (j = 0; j < num_rows; j++) { | ||
197 | byte h; | ||
198 | contone_ptr = contone; | ||
199 | thresh_ptr = threshold_strip + contone_stride * j; | ||
200 | halftone_ptr = halftone + dithered_stride * j; | ||
201 | /* First get the left remainder portion. Put into MSBs of first byte */ | ||
202 | bit_init = 0x80; | ||
203 | h = 0; | ||
204 | k = offset_bits; | ||
205 | if (k > 0) { | ||
206 | do { | ||
207 | if (*contone_ptr++ > *thresh_ptr++) { | ||
208 | h |= bit_init; | ||
209 | } | ||
210 | bit_init >>= 1; | ||
211 | if (bit_init == 0) { | ||
212 | bit_init = 0x80; | ||
213 | *halftone_ptr++ = h; | ||
214 | h = 0; | ||
215 | } | ||
216 | k--; | ||
217 | } while (k > 0); | ||
218 | bit_init = 0x80; | ||
219 | *halftone_ptr++ = h; | ||
220 | h = 0; | ||
221 | if (offset_bits < 8) | ||
222 | *halftone_ptr++ = 0; | ||
223 | } | ||
224 | /* Now get the rest, which will be 16 bit aligned. */ | ||
225 | k = width; | ||
226 | if (k > 0) { | ||
227 | do { | ||
228 | if (*contone_ptr++ > *thresh_ptr++) { | ||
229 | h |= bit_init; | ||
230 | } | ||
231 | bit_init >>= 1; | ||
232 | if (bit_init == 0) { | ||
233 | bit_init = 0x80; | ||
234 | *halftone_ptr++ = h; | ||
235 | h = 0; | ||
236 | } | ||
237 | k--; | ||
238 | } while (k > 0); | ||
239 | if (bit_init != 0x80) { | ||
240 | *halftone_ptr++ = h; | ||
241 | } | ||
242 | if ((width & 15) < 8) | ||
243 | *halftone_ptr++ = 0; | ||
244 | } | ||
245 | } | ||
246 | #else | ||
247 | byte *contone_ptr; | ||
248 | byte *thresh_ptr; | ||
249 | byte *halftone_ptr; | ||
250 | int num_tiles = (width - offset_bits + 15)>>4; | ||
251 | int k, j; | ||
252 | |||
253 | for (j = 0; j < num_rows; j++) { | ||
254 | /* contone and thresh_ptr are 128 bit aligned. We do need to do this in | ||
255 | two steps to ensure that we pack the bits in an aligned fashion | ||
256 | into halftone_ptr. */ | ||
257 | contone_ptr = contone; | ||
258 | thresh_ptr = threshold_strip + contone_stride * j; | ||
259 | halftone_ptr = halftone + dithered_stride * j; | ||
260 | if (offset_bits > 0) { | ||
261 | /* Since we allowed for 16 bits in our left remainder | ||
262 | we can go directly in to the destination. threshold_16_SSE | ||
263 | requires 128 bit alignment. contone_ptr and thresh_ptr | ||
264 | are set up so that after we move in by offset_bits elements | ||
265 | then we are 128 bit aligned. */ | ||
266 | threshold_16_SSE_unaligned(thresh_ptr, contone_ptr, | ||
267 | halftone_ptr); | ||
268 | halftone_ptr += 2; | ||
269 | thresh_ptr += offset_bits; | ||
270 | contone_ptr += offset_bits; | ||
271 | } | ||
272 | /* Now we should have 128 bit aligned with our input data. Iterate | ||
273 | over sets of 16 going directly into our HT buffer. Sources and | ||
274 | halftone_ptr buffers should be padded to allow 15 bit overrun */ | ||
275 | for (k = 0; k < num_tiles; k++) { | ||
276 | threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr); | ||
277 | thresh_ptr += 16; | ||
278 | contone_ptr += 16; | ||
279 | halftone_ptr += 2; | ||
280 | } | ||
281 | } | ||
282 | #endif | ||
283 | } | ||
284 | |||
285 | /* SSE2 and non-SSE2 implementation of thresholding a row. additive case */ | ||
178 | void | 286 | void |
179 | gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride, | 287 | gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride, |
180 | byte *halftone, int dithered_stride, int width, | 288 | byte *halftone, int dithered_stride, int width, |
@@ -279,7 +387,99 @@ gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stri | |||
279 | #endif | 387 | #endif |
280 | } | 388 | } |
281 | 389 | ||
282 | /* This thresholds a buffer that is LAND_BITS wide by data_length tall. */ | 390 | /* This thresholds a buffer that is LAND_BITS wide by data_length tall. |
391 | Subtractive case */ | ||
392 | void | ||
393 | gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align, | ||
394 | ht_landscape_info_t ht_landscape, byte *halftone, | ||
395 | int data_length) | ||
396 | { | ||
397 | __align16 byte contone[LAND_BITS]; | ||
398 | int position_start, position, curr_position; | ||
399 | int *widths = &(ht_landscape.widths[0]); | ||
400 | int local_widths[LAND_BITS]; | ||
401 | int num_contone = ht_landscape.num_contones; | ||
402 | int k, j, w, contone_out_posit; | ||
403 | byte *contone_ptr, *thresh_ptr, *halftone_ptr; | ||
404 | #ifdef PACIFY_VALGRIND | ||
405 | int extra = 0; | ||
406 | #endif | ||
407 | |||
408 | /* Work through chunks of 16. */ | ||
409 | /* Data may have come in left to right or right to left. */ | ||
410 | if (ht_landscape.index > 0) { | ||
411 | position = position_start = 0; | ||
412 | } else { | ||
413 | position = position_start = ht_landscape.curr_pos + 1; | ||
414 | } | ||
415 | thresh_ptr = thresh_align; | ||
416 | halftone_ptr = halftone; | ||
417 | /* Copy the widths to a local array, and truncate the last one (which may | ||
418 | * be the first one!) if required. */ | ||
419 | k = 0; | ||
420 | for (j = 0; j < num_contone; j++) | ||
421 | k += (local_widths[j] = widths[position_start+j]); | ||
422 | if (k > LAND_BITS) { | ||
423 | if (ht_landscape.index > 0) { | ||
424 | local_widths[num_contone-1] -= k-LAND_BITS; | ||
425 | } else { | ||
426 | local_widths[0] -= k-LAND_BITS; | ||
427 | } | ||
428 | } | ||
429 | #ifdef PACIFY_VALGRIND | ||
430 | if (k < LAND_BITS) { | ||
431 | extra = LAND_BITS - k; | ||
432 | } | ||
433 | #endif | ||
434 | for (k = data_length; k > 0; k--) { /* Loop on rows */ | ||
435 | contone_ptr = &(contone_align[position]); /* Point us to our row start */ | ||
436 | curr_position = 0; /* We use this in keeping track of widths */ | ||
437 | contone_out_posit = 0; /* Our index out */ | ||
438 | for (j = num_contone; j > 0; j--) { | ||
439 | byte c = *contone_ptr; | ||
440 | /* The microsoft compiler, cleverly spots that the following loop | ||
441 | * can be replaced by a memset. Unfortunately, it can't spot that | ||
442 | * the typical length values of the memset are so small that we'd | ||
443 | * be better off doing it the slow way. We therefore introduce a | ||
444 | * sneaky 'volatile' cast below that stops this optimisation. */ | ||
445 | w = local_widths[curr_position]; | ||
446 | do { | ||
447 | ((volatile byte *)contone)[contone_out_posit] = c; | ||
448 | contone_out_posit++; | ||
449 | } while (--w); | ||
450 | #ifdef PACIFY_VALGRIND | ||
451 | if (extra) | ||
452 | memset(contone+contone_out_posit, 0, extra); | ||
453 | #endif | ||
454 | curr_position++; /* Move us to the next position in our width array */ | ||
455 | contone_ptr++; /* Move us to a new location in our contone buffer */ | ||
456 | } | ||
457 | /* Now we have our left justified and expanded contone data for | ||
458 | LAND_BITS/16 sets of 16 bits. Go ahead and threshold these. */ | ||
459 | contone_ptr = &contone[0]; | ||
460 | #if LAND_BITS > 16 | ||
461 | j = LAND_BITS; | ||
462 | do { | ||
463 | #endif | ||
464 | #ifdef HAVE_SSE2 | ||
465 | threshold_16_SSE(thresh_ptr, contone_ptr, halftone_ptr); | ||
466 | #else | ||
467 | threshold_16_bit(thresh_ptr, contone_ptr, halftone_ptr); | ||
468 | #endif | ||
469 | thresh_ptr += 16; | ||
470 | position += 16; | ||
471 | halftone_ptr += 2; | ||
472 | contone_ptr += 16; | ||
473 | #if LAND_BITS > 16 | ||
474 | j -= 16; | ||
475 | } while (j > 0); | ||
476 | #endif | ||
477 | } | ||
478 | } | ||
479 | |||
480 | /* This thresholds a buffer that is LAND_BITS wide by data_length tall. | ||
481 | Additive case. Note I could likely do some code reduction between | ||
482 | the additive and subtractive cases */ | ||
283 | void | 483 | void |
284 | gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align, | 484 | gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align, |
285 | ht_landscape_info_t ht_landscape, byte *halftone, | 485 | ht_landscape_info_t ht_landscape, byte *halftone, |
@@ -718,7 +918,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun, | |||
718 | 918 | ||
719 | if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE | 919 | if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE |
720 | && is_planar_dev) { | 920 | && is_planar_dev) { |
721 | gx_ht_threshold_row_bit(thresh_align, contone_align, contone_stride, | 921 | gx_ht_threshold_row_bit_sub(contone_align, thresh_align, contone_stride, |
722 | halftone, dithered_stride, dest_width, vdi, | 922 | halftone, dithered_stride, dest_width, vdi, |
723 | offset_bits); | 923 | offset_bits); |
724 | } else { | 924 | } else { |
@@ -868,7 +1068,7 @@ gxht_thresh_planes(gx_image_enum *penum, fixed xrun, | |||
868 | /* Apply the threshold operation */ | 1068 | /* Apply the threshold operation */ |
869 | if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE | 1069 | if (dev->color_info.polarity == GX_CINFO_POLARITY_SUBTRACTIVE |
870 | && is_planar_dev) { | 1070 | && is_planar_dev) { |
871 | gx_ht_threshold_landscape(thresh_align, contone_align, | 1071 | gx_ht_threshold_landscape_sub(contone_align, thresh_align, |
872 | penum->ht_landscape, halftone, dest_height); | 1072 | penum->ht_landscape, halftone, dest_height); |
873 | } else { | 1073 | } else { |
874 | gx_ht_threshold_landscape(contone_align, thresh_align, | 1074 | gx_ht_threshold_landscape(contone_align, thresh_align, |
diff --git a/gs/base/gxht_thresh.h b/gs/base/gxht_thresh.h index ef4a8ecc9..8a05eb4fa 100644 --- a/gs/base/gxht_thresh.h +++ b/gs/base/gxht_thresh.h | |||
@@ -29,9 +29,16 @@ void gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, | |||
29 | int contone_stride, byte *halftone, | 29 | int contone_stride, byte *halftone, |
30 | int dithered_stride, int width, int num_rows, | 30 | int dithered_stride, int width, int num_rows, |
31 | int offset_bits); | 31 | int offset_bits); |
32 | void gx_ht_threshold_row_bit_sub(byte *contone, byte *threshold_strip, | ||
33 | int contone_stride, byte *halftone, | ||
34 | int dithered_stride, int width, int num_rows, | ||
35 | int offset_bits); | ||
32 | void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align, | 36 | void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align, |
33 | ht_landscape_info_t ht_landscape, byte *halftone, | 37 | ht_landscape_info_t ht_landscape, byte *halftone, |
34 | int data_length); | 38 | int data_length); |
39 | void gx_ht_threshold_landscape_sub(byte *contone_align, byte *thresh_align, | ||
40 | ht_landscape_info_t ht_landscape, byte *halftone, | ||
41 | int data_length); | ||
35 | int gxht_thresh_image_init(gx_image_enum *penum); | 42 | int gxht_thresh_image_init(gx_image_enum *penum); |
36 | int gxht_thresh_planes(gx_image_enum *penum, fixed xrun, int dest_width, | 43 | int gxht_thresh_planes(gx_image_enum *penum, fixed xrun, int dest_width, |
37 | int dest_height, byte *thresh_align, gx_device * dev, | 44 | int dest_height, byte *thresh_align, gx_device * dev, |