diff options
-rw-r--r-- | include/mupdf/fitz/font.h | 6 | ||||
-rw-r--r-- | include/mupdf/fitz/structured-text.h | 188 | ||||
-rw-r--r-- | include/mupdf/fitz/util.h | 8 | ||||
-rw-r--r-- | platform/java/mupdf_native.c | 2 | ||||
-rw-r--r-- | platform/win32/libmupdf.vcproj | 4 | ||||
-rw-r--r-- | platform/x11/pdfapp.c | 61 | ||||
-rw-r--r-- | platform/x11/pdfapp.h | 1 | ||||
-rw-r--r-- | source/fitz/font.c | 22 | ||||
-rw-r--r-- | source/fitz/stext-device.c | 1047 | ||||
-rw-r--r-- | source/fitz/stext-output.c | 386 | ||||
-rw-r--r-- | source/fitz/stext-paragraph.c | 1538 | ||||
-rw-r--r-- | source/fitz/stext-search.c | 137 | ||||
-rw-r--r-- | source/fitz/util.c | 109 | ||||
-rw-r--r-- | source/tools/mudraw.c | 10 | ||||
-rw-r--r-- | source/tools/murun.c | 16 |
15 files changed, 681 insertions, 2854 deletions
diff --git a/include/mupdf/fitz/font.h b/include/mupdf/fitz/font.h index a6e172a1d..ef4cd74d4 100644 --- a/include/mupdf/fitz/font.h +++ b/include/mupdf/fitz/font.h | |||
@@ -601,6 +601,12 @@ int fz_encode_character_with_fallback(fz_context *ctx, fz_font *font, int unicod | |||
601 | void fz_get_glyph_name(fz_context *ctx, fz_font *font, int glyph, char *buf, int size); | 601 | void fz_get_glyph_name(fz_context *ctx, fz_font *font, int glyph, char *buf, int size); |
602 | 602 | ||
603 | /* | 603 | /* |
604 | Get font ascender and descender values. | ||
605 | */ | ||
606 | float fz_font_ascender(fz_context *ctx, fz_font *font); | ||
607 | float fz_font_descender(fz_context *ctx, fz_font *font); | ||
608 | |||
609 | /* | ||
604 | Internal functions for our Harfbuzz integration | 610 | Internal functions for our Harfbuzz integration |
605 | to work around the lack of thread safety. | 611 | to work around the lack of thread safety. |
606 | */ | 612 | */ |
diff --git a/include/mupdf/fitz/structured-text.h b/include/mupdf/fitz/structured-text.h index 61ee30ad1..0f3364b3b 100644 --- a/include/mupdf/fitz/structured-text.h +++ b/include/mupdf/fitz/structured-text.h | |||
@@ -16,15 +16,9 @@ | |||
16 | (In development - Subject to change in future versions) | 16 | (In development - Subject to change in future versions) |
17 | */ | 17 | */ |
18 | 18 | ||
19 | typedef struct fz_stext_style_s fz_stext_style; | ||
20 | typedef struct fz_stext_char_s fz_stext_char; | 19 | typedef struct fz_stext_char_s fz_stext_char; |
21 | typedef struct fz_stext_span_s fz_stext_span; | ||
22 | typedef struct fz_stext_line_s fz_stext_line; | 20 | typedef struct fz_stext_line_s fz_stext_line; |
23 | typedef struct fz_stext_block_s fz_stext_block; | 21 | typedef struct fz_stext_block_s fz_stext_block; |
24 | typedef struct fz_image_block_s fz_image_block; | ||
25 | typedef struct fz_page_block_s fz_page_block; | ||
26 | |||
27 | typedef struct fz_stext_sheet_s fz_stext_sheet; | ||
28 | typedef struct fz_stext_page_s fz_stext_page; | 22 | typedef struct fz_stext_page_s fz_stext_page; |
29 | 23 | ||
30 | /* | 24 | /* |
@@ -52,150 +46,58 @@ enum | |||
52 | }; | 46 | }; |
53 | 47 | ||
54 | /* | 48 | /* |
55 | fz_stext_sheet: A text sheet contains a list of distinct text styles | 49 | A text page is a list of blocks, together with an overall bounding box. |
56 | used on a page (or a series of pages). | ||
57 | */ | ||
58 | struct fz_stext_sheet_s | ||
59 | { | ||
60 | int maxid; | ||
61 | fz_stext_style *style; | ||
62 | }; | ||
63 | |||
64 | /* | ||
65 | fz_stext_style: A text style contains details of a distinct text style | ||
66 | used on a page. | ||
67 | */ | ||
68 | struct fz_stext_style_s | ||
69 | { | ||
70 | fz_stext_style *next; | ||
71 | int id; | ||
72 | fz_font *font; | ||
73 | float size; | ||
74 | int wmode; | ||
75 | int script; | ||
76 | /* Ascender and Descender only have the conventional sense in | ||
77 | * horizontal mode; in vertical mode they are rotated too - they are | ||
78 | * the maximum and minimum bounds respectively. */ | ||
79 | float ascender; | ||
80 | float descender; | ||
81 | /* etc... */ | ||
82 | }; | ||
83 | |||
84 | /* | ||
85 | fz_stext_page: A text page is a list of page blocks, together with | ||
86 | an overall bounding box. | ||
87 | */ | 50 | */ |
88 | struct fz_stext_page_s | 51 | struct fz_stext_page_s |
89 | { | 52 | { |
53 | fz_pool *pool; | ||
90 | fz_rect mediabox; | 54 | fz_rect mediabox; |
91 | int len, cap; | 55 | fz_stext_block *first_block, *last_block; |
92 | fz_page_block *blocks; | ||
93 | fz_stext_page *next; | ||
94 | }; | ||
95 | |||
96 | /* | ||
97 | fz_page_block: A page block is a typed block pointer. | ||
98 | */ | ||
99 | struct fz_page_block_s | ||
100 | { | ||
101 | int type; | ||
102 | union | ||
103 | { | ||
104 | fz_stext_block *text; | ||
105 | fz_image_block *image; | ||
106 | } u; | ||
107 | }; | 56 | }; |
108 | 57 | ||
109 | enum | 58 | enum |
110 | { | 59 | { |
111 | FZ_PAGE_BLOCK_TEXT = 0, | 60 | FZ_STEXT_BLOCK_TEXT = 0, |
112 | FZ_PAGE_BLOCK_IMAGE = 1 | 61 | FZ_STEXT_BLOCK_IMAGE = 1 |
113 | }; | 62 | }; |
114 | 63 | ||
115 | /* | 64 | /* |
116 | fz_stext_block: A text block is a list of lines of text. In typical | 65 | A text block is a list of lines of text, or an image. |
117 | cases this may correspond to a paragraph or a column of text. A | ||
118 | collection of blocks makes up a page. | ||
119 | */ | 66 | */ |
120 | struct fz_stext_block_s | 67 | struct fz_stext_block_s |
121 | { | 68 | { |
69 | int type; | ||
122 | fz_rect bbox; | 70 | fz_rect bbox; |
123 | int len, cap; | 71 | union { |
124 | fz_stext_line *lines; | 72 | struct { fz_stext_line *first_line, *last_line; } t; |
125 | }; | 73 | struct { fz_matrix transform; fz_image *image; } i; |
126 | 74 | } u; | |
127 | /* | 75 | fz_stext_block *next; |
128 | fz_image_block: An image block is an image, together with the list of lines of text. In typical | ||
129 | cases this may correspond to a paragraph or a column of text. A | ||
130 | collection of blocks makes up a page. | ||
131 | */ | ||
132 | struct fz_image_block_s | ||
133 | { | ||
134 | fz_rect bbox; | ||
135 | fz_matrix mat; | ||
136 | fz_image *image; | ||
137 | fz_colorspace *cspace; | ||
138 | float colors[FZ_MAX_COLORS]; | ||
139 | }; | 76 | }; |
140 | 77 | ||
141 | /* | 78 | /* |
142 | fz_stext_line: A text line is a list of text spans, with the same | 79 | A text line is a list of characters that share a common baseline. |
143 | baseline. In typical cases this should correspond (as expected) to | ||
144 | complete lines of text. A collection of lines makes up a block. | ||
145 | */ | 80 | */ |
146 | struct fz_stext_line_s | 81 | struct fz_stext_line_s |
147 | { | 82 | { |
148 | fz_stext_span *first_span, *last_span; | ||
149 | |||
150 | /* Cached information */ | ||
151 | float distance; /* Perpendicular distance from previous line */ | ||
152 | fz_rect bbox; | ||
153 | void *region; /* Opaque value for matching line masks */ | ||
154 | }; | ||
155 | |||
156 | /* | ||
157 | fz_stext_span: A text span is a list of characters that share a common | ||
158 | baseline/transformation. In typical cases a single span may be enough | ||
159 | to represent a complete line. In cases where the text has big gaps in | ||
160 | it (perhaps as it crosses columns or tables), a line may be represented | ||
161 | by multiple spans. | ||
162 | */ | ||
163 | struct fz_stext_span_s | ||
164 | { | ||
165 | int len, cap; | ||
166 | fz_stext_char *text; | ||
167 | fz_point min; /* Device space */ | ||
168 | fz_point max; /* Device space */ | ||
169 | int wmode; /* 0 for horizontal, 1 for vertical */ | 83 | int wmode; /* 0 for horizontal, 1 for vertical */ |
170 | fz_matrix transform; /* e and f are always 0 here */ | 84 | fz_rect bbox; |
171 | /* Ascender_max and Descender_min only have the conventional sense in | 85 | fz_stext_char *first_char, *last_char; |
172 | * horizontal mode; in vertical mode they are rotated too - they are | 86 | fz_stext_line *next; |
173 | * the maximum and minimum bounds respectively. */ | ||
174 | float ascender_max; /* Document space */ | ||
175 | float descender_min; /* Document space */ | ||
176 | fz_rect bbox; /* Device space */ | ||
177 | |||
178 | /* Cached information */ | ||
179 | float base_offset; /* Perpendicular distance from baseline of line */ | ||
180 | float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */ | ||
181 | int column; /* If non zero, the column that it's in */ | ||
182 | float column_width; /* Percentage */ | ||
183 | int align; /* 0 = left, 1 = centre, 2 = right */ | ||
184 | float indent; /* The indent position for this column. */ | ||
185 | |||
186 | fz_stext_span *next; | ||
187 | }; | 87 | }; |
188 | 88 | ||
189 | /* | 89 | /* |
190 | fz_stext_char: A text char is a unicode character, the style in which | 90 | A text char is a unicode character, the style in which it appears, and |
191 | it appears, and the point at which it is positioned. Transform | 91 | the point at which it is positioned. |
192 | (and hence bbox) information is given by the enclosing span. | ||
193 | */ | 92 | */ |
194 | struct fz_stext_char_s | 93 | struct fz_stext_char_s |
195 | { | 94 | { |
196 | fz_point p; /* Device space */ | 95 | int c, rtl; |
197 | int c; | 96 | fz_point origin; |
198 | fz_stext_style *style; | 97 | fz_rect bbox; |
98 | float size; | ||
99 | fz_font *font; | ||
100 | fz_stext_char *next; | ||
199 | }; | 101 | }; |
200 | 102 | ||
201 | typedef struct fz_char_and_box_s fz_char_and_box; | 103 | typedef struct fz_char_and_box_s fz_char_and_box; |
@@ -212,43 +114,29 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex | |||
212 | 114 | ||
213 | /* | 115 | /* |
214 | fz_stext_char_bbox: Return the bbox of a text char. Calculated from | 116 | fz_stext_char_bbox: Return the bbox of a text char. Calculated from |
215 | the supplied enclosing span. | 117 | the supplied enclosing line. |
216 | |||
217 | bbox: A place to store the bbox | ||
218 | 118 | ||
219 | span: The enclosing span | 119 | bbox: A place to store the bbox. |
220 | 120 | ||
221 | idx: The index of the char within the span | 121 | line: The enclosing line. |
222 | 122 | ||
223 | Returns bbox (updated) | 123 | ch: The character. |
224 | 124 | ||
225 | Does not throw exceptions | 125 | Returns bbox (updated). |
226 | */ | 126 | */ |
227 | fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int idx); | 127 | fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch); |
228 | |||
229 | /* | ||
230 | fz_new_stext_sheet: Create an empty style sheet. | ||
231 | |||
232 | The style sheet is filled out by the text device, creating | ||
233 | one style for each unique font, color, size combination that | ||
234 | is used. | ||
235 | */ | ||
236 | fz_stext_sheet *fz_new_stext_sheet(fz_context *ctx); | ||
237 | void fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet); | ||
238 | 128 | ||
239 | /* | 129 | /* |
240 | fz_new_stext_page: Create an empty text page. | 130 | fz_new_stext_page: Create an empty text page. |
241 | 131 | ||
242 | The text page is filled out by the text device to contain the blocks, | 132 | The text page is filled out by the text device to contain the blocks |
243 | lines and spans of text on the page. | 133 | and lines of text on the page. |
244 | 134 | ||
245 | mediabox: optional mediabox information. | 135 | mediabox: optional mediabox information. |
246 | */ | 136 | */ |
247 | fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox); | 137 | fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox); |
248 | void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page); | 138 | void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page); |
249 | 139 | ||
250 | void fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page); | ||
251 | |||
252 | /* | 140 | /* |
253 | fz_print_stext_page_as_html: Output a page to a file in HTML (visual) format. | 141 | fz_print_stext_page_as_html: Output a page to a file in HTML (visual) format. |
254 | */ | 142 | */ |
@@ -314,14 +202,10 @@ fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts | |||
314 | /* | 202 | /* |
315 | fz_new_stext_device: Create a device to extract the text on a page. | 203 | fz_new_stext_device: Create a device to extract the text on a page. |
316 | 204 | ||
317 | Gather and sort the text on a page into spans of uniform style, | 205 | Gather the text on a page into blocks and lines. |
318 | arranged into lines and blocks by reading order. The reading order | ||
319 | is determined by various heuristics, so may not be accurate. | ||
320 | 206 | ||
321 | sheet: The text sheet to which styles should be added. This can | 207 | The reading order is taken from the order the text is drawn in the |
322 | either be a newly created (empty) text sheet, or one containing | 208 | source file, so may not be accurate. |
323 | styles from a previous text device. The same sheet cannot be used | ||
324 | in multiple threads simultaneously. | ||
325 | 209 | ||
326 | page: The text page to which content should be added. This will | 210 | page: The text page to which content should be added. This will |
327 | usually be a newly created (empty) text page, but it can be one | 211 | usually be a newly created (empty) text page, but it can be one |
@@ -330,6 +214,6 @@ fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts | |||
330 | 214 | ||
331 | options: Options to configure the stext device. | 215 | options: Options to configure the stext device. |
332 | */ | 216 | */ |
333 | fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options); | 217 | fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options); |
334 | 218 | ||
335 | #endif | 219 | #endif |
diff --git a/include/mupdf/fitz/util.h b/include/mupdf/fitz/util.h index d452b58a1..4b827cad1 100644 --- a/include/mupdf/fitz/util.h +++ b/include/mupdf/fitz/util.h | |||
@@ -36,11 +36,11 @@ fz_pixmap *fz_new_pixmap_from_page_contents(fz_context *ctx, fz_page *page, cons | |||
36 | fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_matrix *ctm, fz_colorspace *cs, int alpha); | 36 | fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_matrix *ctm, fz_colorspace *cs, int alpha); |
37 | 37 | ||
38 | /* | 38 | /* |
39 | fz_new_stext_page_from_page: Extract structured text from a page. The sheet must not be NULL. | 39 | fz_new_stext_page_from_page: Extract structured text from a page. |
40 | */ | 40 | */ |
41 | fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options); | 41 | fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options); |
42 | fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options); | 42 | fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options); |
43 | fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options); | 43 | fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options); |
44 | 44 | ||
45 | /* | 45 | /* |
46 | fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle. | 46 | fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle. |
diff --git a/platform/java/mupdf_native.c b/platform/java/mupdf_native.c index bed3358da..ce5e6fea3 100644 --- a/platform/java/mupdf_native.c +++ b/platform/java/mupdf_native.c | |||
@@ -5111,8 +5111,6 @@ FUN(Page_textAsHtml)(JNIEnv *env, jobject self) | |||
5111 | fz_run_page(ctx, page, dev, &ctm, NULL); | 5111 | fz_run_page(ctx, page, dev, &ctm, NULL); |
5112 | fz_close_device(ctx, dev); | 5112 | fz_close_device(ctx, dev); |
5113 | 5113 | ||
5114 | fz_analyze_text(ctx, sheet, text); | ||
5115 | |||
5116 | buf = fz_new_buffer(ctx, 256); | 5114 | buf = fz_new_buffer(ctx, 256); |
5117 | out = fz_new_output_with_buffer(ctx, buf); | 5115 | out = fz_new_output_with_buffer(ctx, buf); |
5118 | fz_write_printf(ctx, out, "<html>\n"); | 5116 | fz_write_printf(ctx, out, "<html>\n"); |
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj index fc195fcea..3add80edf 100644 --- a/platform/win32/libmupdf.vcproj +++ b/platform/win32/libmupdf.vcproj | |||
@@ -1869,10 +1869,6 @@ | |||
1869 | > | 1869 | > |
1870 | </File> | 1870 | </File> |
1871 | <File | 1871 | <File |
1872 | RelativePath="..\..\source\fitz\stext-paragraph.c" | ||
1873 | > | ||
1874 | </File> | ||
1875 | <File | ||
1876 | RelativePath="..\..\source\fitz\stext-search.c" | 1872 | RelativePath="..\..\source\fitz\stext-search.c" |
1877 | > | 1873 | > |
1878 | </File> | 1874 | </File> |
diff --git a/platform/x11/pdfapp.c b/platform/x11/pdfapp.c index 61366a44e..6b08c4aa4 100644 --- a/platform/x11/pdfapp.c +++ b/platform/x11/pdfapp.c | |||
@@ -470,9 +470,6 @@ void pdfapp_close(pdfapp_t *app) | |||
470 | fz_drop_stext_page(app->ctx, app->page_text); | 470 | fz_drop_stext_page(app->ctx, app->page_text); |
471 | app->page_text = NULL; | 471 | app->page_text = NULL; |
472 | 472 | ||
473 | fz_drop_stext_sheet(app->ctx, app->page_sheet); | ||
474 | app->page_sheet = NULL; | ||
475 | |||
476 | fz_drop_link(app->ctx, app->page_links); | 473 | fz_drop_link(app->ctx, app->page_links); |
477 | app->page_links = NULL; | 474 | app->page_links = NULL; |
478 | 475 | ||
@@ -655,14 +652,12 @@ static void pdfapp_loadpage(pdfapp_t *app, int no_cache) | |||
655 | fz_drop_display_list(app->ctx, app->page_list); | 652 | fz_drop_display_list(app->ctx, app->page_list); |
656 | fz_drop_display_list(app->ctx, app->annotations_list); | 653 | fz_drop_display_list(app->ctx, app->annotations_list); |
657 | fz_drop_stext_page(app->ctx, app->page_text); | 654 | fz_drop_stext_page(app->ctx, app->page_text); |
658 | fz_drop_stext_sheet(app->ctx, app->page_sheet); | ||
659 | fz_drop_link(app->ctx, app->page_links); | 655 | fz_drop_link(app->ctx, app->page_links); |
660 | fz_drop_page(app->ctx, app->page); | 656 | fz_drop_page(app->ctx, app->page); |
661 | 657 | ||
662 | app->page_list = NULL; | 658 | app->page_list = NULL; |
663 | app->annotations_list = NULL; | 659 | app->annotations_list = NULL; |
664 | app->page_text = NULL; | 660 | app->page_text = NULL; |
665 | app->page_sheet = NULL; | ||
666 | app->page_links = NULL; | 661 | app->page_links = NULL; |
667 | app->page = NULL; | 662 | app->page = NULL; |
668 | app->page_bbox.x0 = 0; | 663 | app->page_bbox.x0 = 0; |
@@ -875,12 +870,11 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai | |||
875 | app->hit_count = 0; | 870 | app->hit_count = 0; |
876 | 871 | ||
877 | /* Extract text */ | 872 | /* Extract text */ |
878 | app->page_sheet = fz_new_stext_sheet(app->ctx); | ||
879 | app->page_text = fz_new_stext_page(app->ctx, fz_bound_page(app->ctx, app->page, &mediabox)); | 873 | app->page_text = fz_new_stext_page(app->ctx, fz_bound_page(app->ctx, app->page, &mediabox)); |
880 | 874 | ||
881 | if (app->page_list || app->annotations_list) | 875 | if (app->page_list || app->annotations_list) |
882 | { | 876 | { |
883 | tdev = fz_new_stext_device(app->ctx, app->page_sheet, app->page_text, NULL); | 877 | tdev = fz_new_stext_device(app->ctx, app->page_text, NULL); |
884 | pdfapp_runpage(app, tdev, &fz_identity, &fz_infinite_rect, &cookie); | 878 | pdfapp_runpage(app, tdev, &fz_identity, &fz_infinite_rect, &cookie); |
885 | fz_close_device(app->ctx, tdev); | 879 | fz_close_device(app->ctx, tdev); |
886 | fz_drop_device(app->ctx, tdev); | 880 | fz_drop_device(app->ctx, tdev); |
@@ -1905,8 +1899,10 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen) | |||
1905 | fz_rect hitbox; | 1899 | fz_rect hitbox; |
1906 | fz_matrix ctm; | 1900 | fz_matrix ctm; |
1907 | fz_stext_page *page = app->page_text; | 1901 | fz_stext_page *page = app->page_text; |
1908 | int c, i, p, need_newline; | 1902 | int p, need_newline; |
1909 | int block_num; | 1903 | fz_stext_block *block; |
1904 | fz_stext_line *line; | ||
1905 | fz_stext_char *ch; | ||
1910 | 1906 | ||
1911 | int x0 = app->selr.x0; | 1907 | int x0 = app->selr.x0; |
1912 | int x1 = app->selr.x1; | 1908 | int x1 = app->selr.x1; |
@@ -1918,50 +1914,37 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen) | |||
1918 | p = 0; | 1914 | p = 0; |
1919 | need_newline = 0; | 1915 | need_newline = 0; |
1920 | 1916 | ||
1921 | for (block_num = 0; block_num < page->len; block_num++) | 1917 | for (block = page->first_block; block; block = block->next) |
1922 | { | 1918 | { |
1923 | fz_stext_line *line; | 1919 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
1924 | fz_stext_block *block; | ||
1925 | fz_stext_span *span; | ||
1926 | |||
1927 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
1928 | continue; | 1920 | continue; |
1929 | block = page->blocks[block_num].u.text; | ||
1930 | 1921 | ||
1931 | for (line = block->lines; line < block->lines + block->len; line++) | 1922 | for (line = block->u.t.first_line; line; line = line->next) |
1932 | { | 1923 | { |
1933 | int saw_text = 0; | 1924 | int saw_text = 0; |
1934 | 1925 | for (ch = line->first_char; ch; ch = ch->next) | |
1935 | for (span = line->first_span; span; span = span->next) | ||
1936 | { | 1926 | { |
1937 | for (i = 0; i < span->len; i++) | 1927 | int c = ch->c; |
1928 | fz_stext_char_bbox(app->ctx, &hitbox, line, ch); | ||
1929 | if (c < 32) | ||
1930 | c = 0xFFFD; | ||
1931 | if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) | ||
1938 | { | 1932 | { |
1939 | fz_stext_char_bbox(app->ctx, &hitbox, span, i); | 1933 | saw_text = 1; |
1940 | fz_transform_rect(&hitbox, &ctm); | 1934 | if (need_newline) |
1941 | c = span->text[i].c; | ||
1942 | if (c < 32) | ||
1943 | c = '?'; | ||
1944 | if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) | ||
1945 | { | 1935 | { |
1946 | saw_text = 1; | ||
1947 | |||
1948 | if (need_newline) | ||
1949 | { | ||
1950 | #if defined(_WIN32) || defined(_WIN64) | 1936 | #if defined(_WIN32) || defined(_WIN64) |
1951 | if (p < ucslen - 1) | 1937 | if (p < ucslen - 1) |
1952 | ucsbuf[p++] = '\r'; | 1938 | ucsbuf[p++] = '\r'; |
1953 | #endif | 1939 | #endif |
1954 | if (p < ucslen - 1) | ||
1955 | ucsbuf[p++] = '\n'; | ||
1956 | need_newline = 0; | ||
1957 | } | ||
1958 | |||
1959 | if (p < ucslen - 1) | 1940 | if (p < ucslen - 1) |
1960 | ucsbuf[p++] = c; | 1941 | ucsbuf[p++] = '\n'; |
1942 | need_newline = 0; | ||
1961 | } | 1943 | } |
1944 | if (p < ucslen - 1) | ||
1945 | ucsbuf[p++] = c; | ||
1962 | } | 1946 | } |
1963 | } | 1947 | } |
1964 | |||
1965 | if (saw_text) | 1948 | if (saw_text) |
1966 | need_newline = 1; | 1949 | need_newline = 1; |
1967 | } | 1950 | } |
diff --git a/platform/x11/pdfapp.h b/platform/x11/pdfapp.h index 28a834815..09d8f16ad 100644 --- a/platform/x11/pdfapp.h +++ b/platform/x11/pdfapp.h | |||
@@ -91,7 +91,6 @@ struct pdfapp_s | |||
91 | fz_display_list *page_list; | 91 | fz_display_list *page_list; |
92 | fz_display_list *annotations_list; | 92 | fz_display_list *annotations_list; |
93 | fz_stext_page *page_text; | 93 | fz_stext_page *page_text; |
94 | fz_stext_sheet *page_sheet; | ||
95 | fz_link *page_links; | 94 | fz_link *page_links; |
96 | int errored; | 95 | int errored; |
97 | int incomplete; | 96 | int incomplete; |
diff --git a/source/fitz/font.c b/source/fitz/font.c index eb7c8c351..dfe4ab24c 100644 --- a/source/fitz/font.c +++ b/source/fitz/font.c | |||
@@ -193,6 +193,28 @@ fz_set_font_bbox(fz_context *ctx, fz_font *font, float xmin, float ymin, float x | |||
193 | } | 193 | } |
194 | } | 194 | } |
195 | 195 | ||
196 | float fz_font_ascender(fz_context *ctx, fz_font *font) | ||
197 | { | ||
198 | if (font->t3procs) | ||
199 | return font->bbox.y1; | ||
200 | else | ||
201 | { | ||
202 | FT_Face face = font->ft_face; | ||
203 | return (float)face->ascender / face->units_per_EM; | ||
204 | } | ||
205 | } | ||
206 | |||
207 | float fz_font_descender(fz_context *ctx, fz_font *font) | ||
208 | { | ||
209 | if (font->t3procs) | ||
210 | return font->bbox.y0; | ||
211 | else | ||
212 | { | ||
213 | FT_Face face = font->ft_face; | ||
214 | return (float)face->descender / face->units_per_EM; | ||
215 | } | ||
216 | } | ||
217 | |||
196 | /* | 218 | /* |
197 | * Freetype hooks | 219 | * Freetype hooks |
198 | */ | 220 | */ |
diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c index 73fa309e8..166f5aa0b 100644 --- a/source/fitz/stext-device.c +++ b/source/fitz/stext-device.c | |||
@@ -4,36 +4,25 @@ | |||
4 | #include <math.h> | 4 | #include <math.h> |
5 | #include <float.h> | 5 | #include <float.h> |
6 | 6 | ||
7 | /* Extract text into an unsorted span soup. */ | 7 | #include <stdio.h> /* for debug printing */ |
8 | |||
9 | /* Extract text into blocks and lines. */ | ||
8 | 10 | ||
9 | #define LINE_DIST 0.9f | 11 | #define LINE_DIST 0.9f |
10 | #define SPACE_DIST 0.15f | 12 | #define SPACE_DIST 0.15f |
11 | #define SPACE_MAX_DIST 0.8f | 13 | #define SPACE_MAX_DIST 0.8f |
12 | #define PARAGRAPH_DIST 0.5f | 14 | #define PARAGRAPH_DIST 0.5f |
13 | 15 | ||
14 | #include <stdio.h> /* for debug printing */ | ||
15 | #undef DEBUG_SPANS | ||
16 | #undef DEBUG_INTERNALS | ||
17 | #undef DEBUG_LINE_HEIGHTS | ||
18 | #undef DEBUG_MASKS | ||
19 | #undef DEBUG_ALIGN | ||
20 | #undef DEBUG_INDENTS | ||
21 | |||
22 | #include <ft2build.h> | ||
23 | #include FT_FREETYPE_H | ||
24 | #include FT_ADVANCES_H | ||
25 | |||
26 | typedef struct fz_stext_device_s fz_stext_device; | 16 | typedef struct fz_stext_device_s fz_stext_device; |
27 | 17 | ||
28 | typedef struct span_soup_s span_soup; | ||
29 | |||
30 | struct fz_stext_device_s | 18 | struct fz_stext_device_s |
31 | { | 19 | { |
32 | fz_device super; | 20 | fz_device super; |
33 | fz_stext_sheet *sheet; | ||
34 | fz_stext_page *page; | 21 | fz_stext_page *page; |
35 | span_soup *spans; | 22 | fz_point pen, start; |
36 | fz_stext_span *cur_span; | 23 | fz_matrix trm; |
24 | int new_obj; | ||
25 | int curdir; | ||
37 | int lastchar; | 26 | int lastchar; |
38 | int flags; | 27 | int flags; |
39 | }; | 28 | }; |
@@ -42,553 +31,235 @@ const char *fz_stext_options_usage = | |||
42 | "Structured text output options:\n" | 31 | "Structured text output options:\n" |
43 | "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n" | 32 | "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n" |
44 | "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n" | 33 | "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n" |
34 | "\tpreserve-images: keep images in output\n" | ||
45 | "\n"; | 35 | "\n"; |
46 | 36 | ||
47 | static fz_rect * | 37 | fz_rect * |
48 | add_point_to_rect(fz_rect *a, const fz_point *p) | 38 | fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch) |
49 | { | 39 | { |
50 | if (p->x < a->x0) | 40 | *bbox = ch->bbox; |
51 | a->x0 = p->x; | 41 | return bbox; |
52 | if (p->x > a->x1) | ||
53 | a->x1 = p->x; | ||
54 | if (p->y < a->y0) | ||
55 | a->y0 = p->y; | ||
56 | if (p->y > a->y1) | ||
57 | a->y1 = p->y; | ||
58 | return a; | ||
59 | } | 42 | } |
60 | 43 | ||
61 | fz_rect * | 44 | fz_stext_page * |
62 | fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int i) | 45 | fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox) |
63 | { | 46 | { |
64 | fz_point a, d; | 47 | fz_pool *pool = fz_new_pool(ctx); |
65 | const fz_point *max; | 48 | fz_stext_page *page; |
66 | fz_stext_char *ch; | 49 | fz_try(ctx) |
67 | |||
68 | if (!span || i >= span->len) | ||
69 | { | ||
70 | *bbox = fz_empty_rect; | ||
71 | return bbox; | ||
72 | } | ||
73 | ch = &span->text[i]; | ||
74 | if (i == span->len-1) | ||
75 | max = &span->max; | ||
76 | else | ||
77 | max = &span->text[i+1].p; | ||
78 | if (span->wmode == 0) | ||
79 | { | 50 | { |
80 | a.x = 0; | 51 | page = fz_pool_alloc(ctx, pool, sizeof(*page)); |
81 | a.y = span->ascender_max; | 52 | page->pool = pool; |
82 | d.x = 0; | 53 | page->mediabox = *mediabox; |
83 | d.y = span->descender_min; | 54 | page->first_block = NULL; |
55 | page->last_block = NULL; | ||
84 | } | 56 | } |
85 | else | 57 | fz_catch(ctx) |
86 | { | 58 | { |
87 | a.x = span->ascender_max; | 59 | fz_drop_pool(ctx, pool); |
88 | a.y = 0; | 60 | fz_rethrow(ctx); |
89 | d.x = span->descender_min; | ||
90 | d.y = 0; | ||
91 | } | 61 | } |
92 | fz_transform_vector(&a, &span->transform); | 62 | return page; |
93 | fz_transform_vector(&d, &span->transform); | ||
94 | bbox->x0 = bbox->x1 = ch->p.x + a.x; | ||
95 | bbox->y0 = bbox->y1 = ch->p.y + a.y; | ||
96 | a.x += max->x; | ||
97 | a.y += max->y; | ||
98 | add_point_to_rect(bbox, &a); | ||
99 | a.x = ch->p.x + d.x; | ||
100 | a.y = ch->p.y + d.y; | ||
101 | add_point_to_rect(bbox, &a); | ||
102 | a.x = max->x + d.x; | ||
103 | a.y = max->y + d.y; | ||
104 | add_point_to_rect(bbox, &a); | ||
105 | return bbox; | ||
106 | } | 63 | } |
107 | 64 | ||
108 | static void | 65 | void |
109 | add_bbox_to_span(fz_stext_span *span) | 66 | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) |
110 | { | 67 | { |
111 | fz_point a, d; | 68 | if (page) |
112 | fz_rect *bbox = &span->bbox; | ||
113 | |||
114 | if (!span) | ||
115 | return; | ||
116 | if (span->wmode == 0) | ||
117 | { | ||
118 | a.x = 0; | ||
119 | a.y = span->ascender_max; | ||
120 | d.x = 0; | ||
121 | d.y = span->descender_min; | ||
122 | } | ||
123 | else | ||
124 | { | 69 | { |
125 | a.x = span->ascender_max; | 70 | fz_stext_block *block; |
126 | a.y = 0; | 71 | for (block = page->first_block; block; block = block->next) |
127 | d.x = span->descender_min; | 72 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
128 | d.y = 0; | 73 | fz_drop_image(ctx, block->u.i.image); |
74 | fz_drop_pool(ctx, page->pool); | ||
129 | } | 75 | } |
130 | fz_transform_vector(&a, &span->transform); | ||
131 | fz_transform_vector(&d, &span->transform); | ||
132 | bbox->x0 = bbox->x1 = span->min.x + a.x; | ||
133 | bbox->y0 = bbox->y1 = span->min.y + a.y; | ||
134 | a.x += span->max.x; | ||
135 | a.y += span->max.y; | ||
136 | add_point_to_rect(bbox, &a); | ||
137 | a.x = span->min.x + d.x; | ||
138 | a.y = span->min.y + d.y; | ||
139 | add_point_to_rect(bbox, &a); | ||
140 | a.x = span->max.x + d.x; | ||
141 | a.y = span->max.y + d.y; | ||
142 | add_point_to_rect(bbox, &a); | ||
143 | } | 76 | } |
144 | 77 | ||
145 | struct span_soup_s | 78 | static fz_stext_block * |
146 | { | 79 | add_block_to_page(fz_context *ctx, fz_stext_page *page) |
147 | int len, cap; | ||
148 | fz_stext_span **spans; | ||
149 | }; | ||
150 | |||
151 | static span_soup * | ||
152 | new_span_soup(fz_context *ctx) | ||
153 | { | 80 | { |
154 | span_soup *soup = fz_malloc_struct(ctx, span_soup); | 81 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
155 | soup->len = 0; | 82 | if (!page->first_block) |
156 | soup->cap = 0; | 83 | page->first_block = page->last_block = block; |
157 | soup->spans = NULL; | 84 | else |
158 | return soup; | 85 | { |
86 | page->last_block->next = block; | ||
87 | page->last_block = block; | ||
88 | } | ||
89 | return block; | ||
159 | } | 90 | } |
160 | 91 | ||
161 | static void | 92 | static fz_stext_block * |
162 | free_span_soup(fz_context *ctx, span_soup *soup) | 93 | add_text_block_to_page(fz_context *ctx, fz_stext_page *page) |
163 | { | 94 | { |
164 | int i; | 95 | fz_stext_block *block = add_block_to_page(ctx, page); |
165 | 96 | block->type = FZ_STEXT_BLOCK_TEXT; | |
166 | if (soup == NULL) | 97 | return block; |
167 | return; | ||
168 | for (i = 0; i < soup->len; i++) | ||
169 | { | ||
170 | fz_free(ctx, soup->spans[i]); | ||
171 | } | ||
172 | fz_free(ctx, soup->spans); | ||
173 | fz_free(ctx, soup); | ||
174 | } | 98 | } |
175 | 99 | ||
176 | static void | 100 | static fz_stext_block * |
177 | add_span_to_soup(fz_context *ctx, span_soup *soup, fz_stext_span *span) | 101 | add_image_block_to_page(fz_context *ctx, fz_stext_page *page, const fz_matrix *ctm, fz_image *image) |
178 | { | 102 | { |
179 | if (span == NULL) | 103 | fz_stext_block *block = add_block_to_page(ctx, page); |
180 | return; | 104 | block->type = FZ_STEXT_BLOCK_IMAGE; |
181 | if (soup->len == soup->cap) | 105 | block->u.i.transform = *ctm; |
182 | { | 106 | block->u.i.image = fz_keep_image(ctx, image); |
183 | int newcap = (soup->cap ? soup->cap * 2 : 16); | 107 | block->bbox.x0 = 0; |
184 | soup->spans = fz_resize_array(ctx, soup->spans, newcap, sizeof(*soup->spans)); | 108 | block->bbox.y0 = 0; |
185 | soup->cap = newcap; | 109 | block->bbox.x1 = 1; |
186 | } | 110 | block->bbox.y1 = 1; |
187 | add_bbox_to_span(span); | 111 | fz_transform_rect(&block->bbox, ctm); |
188 | soup->spans[soup->len++] = span; | 112 | return block; |
189 | } | 113 | } |
190 | 114 | ||
191 | static fz_stext_line * | 115 | static fz_stext_line * |
192 | push_span(fz_context *ctx, fz_stext_device *tdev, fz_stext_span *span, int new_line, float distance) | 116 | add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, int wmode) |
193 | { | 117 | { |
194 | fz_stext_line *line; | 118 | fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); |
195 | fz_stext_block *block; | 119 | if (!block->u.t.first_line) |
196 | fz_stext_page *page = tdev->page; | 120 | block->u.t.first_line = block->u.t.last_line = line; |
197 | int prev_not_text = 0; | ||
198 | |||
199 | if (page->len == 0 || page->blocks[page->len-1].type != FZ_PAGE_BLOCK_TEXT) | ||
200 | prev_not_text = 1; | ||
201 | |||
202 | if (new_line || prev_not_text) | ||
203 | { | ||
204 | float size = fz_matrix_expansion(&span->transform); | ||
205 | /* So, a new line. Part of the same block or not? */ | ||
206 | if (distance == 0 || distance > size * 1.5f || distance < -size * PARAGRAPH_DIST || page->len == 0 || prev_not_text) | ||
207 | { | ||
208 | /* New block */ | ||
209 | if (page->len == page->cap) | ||
210 | { | ||
211 | int newcap = (page->cap ? page->cap*2 : 4); | ||
212 | page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks)); | ||
213 | page->cap = newcap; | ||
214 | } | ||
215 | block = fz_malloc_struct(ctx, fz_stext_block); | ||
216 | page->blocks[page->len].type = FZ_PAGE_BLOCK_TEXT; | ||
217 | page->blocks[page->len].u.text = block; | ||
218 | block->cap = 0; | ||
219 | block->len = 0; | ||
220 | block->lines = 0; | ||
221 | block->bbox = fz_empty_rect; | ||
222 | page->len++; | ||
223 | distance = 0; | ||
224 | } | ||
225 | |||
226 | /* New line */ | ||
227 | block = page->blocks[page->len-1].u.text; | ||
228 | if (block->len == block->cap) | ||
229 | { | ||
230 | int newcap = (block->cap ? block->cap*2 : 4); | ||
231 | block->lines = fz_resize_array(ctx, block->lines, newcap, sizeof(*block->lines)); | ||
232 | block->cap = newcap; | ||
233 | } | ||
234 | block->lines[block->len].first_span = NULL; | ||
235 | block->lines[block->len].last_span = NULL; | ||
236 | block->lines[block->len].distance = distance; | ||
237 | block->lines[block->len].bbox = fz_empty_rect; | ||
238 | block->len++; | ||
239 | } | ||
240 | |||
241 | /* Find last line and append to it */ | ||
242 | block = page->blocks[page->len-1].u.text; | ||
243 | line = &block->lines[block->len-1]; | ||
244 | |||
245 | fz_union_rect(&block->lines[block->len-1].bbox, &span->bbox); | ||
246 | fz_union_rect(&block->bbox, &span->bbox); | ||
247 | span->base_offset = (new_line ? 0 : distance); | ||
248 | |||
249 | if (!line->first_span) | ||
250 | { | ||
251 | line->first_span = line->last_span = span; | ||
252 | span->next = NULL; | ||
253 | } | ||
254 | else | 121 | else |
255 | { | 122 | { |
256 | line->last_span->next = span; | 123 | block->u.t.last_line->next = line; |
257 | line->last_span = span; | 124 | block->u.t.last_line = line; |
258 | } | 125 | } |
259 | 126 | ||
127 | line->wmode = wmode; | ||
128 | |||
260 | return line; | 129 | return line; |
261 | } | 130 | } |
262 | 131 | ||
263 | #if defined(DEBUG_SPANS) || defined(DEBUG_ALIGN) || defined(DEBUG_INDENTS) | 132 | static float min4(float a, float b, float c, float d) |
264 | static void | ||
265 | dump_span(fz_stext_span *s) | ||
266 | { | 133 | { |
267 | int i; | 134 | return fz_min(fz_min(a, b), fz_min(c, d)); |
268 | for (i=0; i < s->len; i++) | ||
269 | { | ||
270 | printf("%c", s->text[i].c); | ||
271 | } | ||
272 | } | 135 | } |
273 | #endif | ||
274 | 136 | ||
275 | #ifdef DEBUG_ALIGN | 137 | static float max4(float a, float b, float c, float d) |
276 | static void | ||
277 | dump_line(fz_stext_line *line) | ||
278 | { | 138 | { |
279 | int i; | 139 | return fz_max(fz_max(a, b), fz_max(c, d)); |
280 | for (i=0; i < line->len; i++) | ||
281 | { | ||
282 | fz_stext_span *s = line->spans[i]; | ||
283 | if (s->spacing > 1) | ||
284 | printf(" "); | ||
285 | dump_span(s); | ||
286 | } | ||
287 | printf("\n"); | ||
288 | } | 140 | } |
289 | #endif | ||
290 | 141 | ||
291 | static void | 142 | static fz_stext_char * |
292 | strain_soup(fz_context *ctx, fz_stext_device *tdev) | 143 | add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, const fz_matrix *trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int rtl) |
293 | { | 144 | { |
294 | span_soup *soup = tdev->spans; | 145 | fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); |
295 | fz_stext_line *last_line = NULL; | 146 | fz_point a, d; |
296 | fz_stext_span *last_span = NULL; | ||
297 | int span_num; | ||
298 | |||
299 | if (soup == NULL) | ||
300 | return; | ||
301 | 147 | ||
302 | /* Really dumb implementation to match what we had before */ | 148 | if (!line->first_char) |
303 | for (span_num=0; span_num < soup->len; span_num++) | 149 | line->first_char = line->last_char = ch; |
150 | else | ||
304 | { | 151 | { |
305 | fz_stext_span *span = soup->spans[span_num]; | 152 | line->last_char->next = ch; |
306 | int new_line = 1; | 153 | line->last_char = ch; |
307 | float distance = 0; | ||
308 | float spacing = 0; | ||
309 | soup->spans[span_num] = NULL; | ||
310 | if (last_span) | ||
311 | { | ||
312 | /* If we have a last_span, we must have a last_line */ | ||
313 | /* Do span and last_line share the same baseline? */ | ||
314 | fz_point p, q, perp_r; | ||
315 | float dot; | ||
316 | float size = fz_matrix_expansion(&span->transform); | ||
317 | |||
318 | #ifdef DEBUG_SPANS | ||
319 | { | ||
320 | printf("Comparing: \""); | ||
321 | dump_span(last_span); | ||
322 | printf("\" and \""); | ||
323 | dump_span(span); | ||
324 | printf("\"\n"); | ||
325 | } | ||
326 | #endif | ||
327 | |||
328 | p.x = last_line->first_span->max.x - last_line->first_span->min.x; | ||
329 | p.y = last_line->first_span->max.y - last_line->first_span->min.y; | ||
330 | fz_normalize_vector(&p); | ||
331 | q.x = span->max.x - span->min.x; | ||
332 | q.y = span->max.y - span->min.y; | ||
333 | fz_normalize_vector(&q); | ||
334 | #ifdef DEBUG_SPANS | ||
335 | printf("last_span=%g %g -> %g %g = %g %g\n", last_span->min.x, last_span->min.y, last_span->max.x, last_span->max.y, p.x, p.y); | ||
336 | printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, q.x, q.y); | ||
337 | #endif | ||
338 | perp_r.y = last_line->first_span->min.x - span->min.x; | ||
339 | perp_r.x = -(last_line->first_span->min.y - span->min.y); | ||
340 | /* Check if p and q are parallel. If so, then this | ||
341 | * line is parallel with the last one. */ | ||
342 | dot = p.x * q.x + p.y * q.y; | ||
343 | if (fabsf(dot) > 0.9995f) | ||
344 | { | ||
345 | /* If we take the dot product of normalised(p) and | ||
346 | * perp(r), we get the perpendicular distance from | ||
347 | * one line to the next (assuming they are parallel). */ | ||
348 | distance = p.x * perp_r.x + p.y * perp_r.y; | ||
349 | /* We allow 'small' distances of baseline changes | ||
350 | * to cope with super/subscript. FIXME: We should | ||
351 | * gather subscript/superscript information here. */ | ||
352 | new_line = (fabsf(distance) > size * LINE_DIST); | ||
353 | } | ||
354 | else | ||
355 | { | ||
356 | new_line = 1; | ||
357 | distance = 0; | ||
358 | } | ||
359 | if (!new_line) | ||
360 | { | ||
361 | fz_point delta; | ||
362 | |||
363 | delta.x = span->min.x - last_span->max.x; | ||
364 | delta.y = span->min.y - last_span->max.y; | ||
365 | |||
366 | spacing = (p.x * delta.x + p.y * delta.y); | ||
367 | spacing = fabsf(spacing); | ||
368 | /* Only allow changes in baseline (subscript/superscript etc) | ||
369 | * when the spacing is small. */ | ||
370 | if (spacing * fabsf(distance) > size * LINE_DIST && fabsf(distance) > size * 0.1f) | ||
371 | { | ||
372 | new_line = 1; | ||
373 | distance = 0; | ||
374 | spacing = 0; | ||
375 | } | ||
376 | else | ||
377 | { | ||
378 | spacing /= size * SPACE_DIST; | ||
379 | /* Apply the same logic here as when we're adding chars to build spans. */ | ||
380 | if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) | ||
381 | spacing = 1; | ||
382 | } | ||
383 | } | ||
384 | #ifdef DEBUG_SPANS | ||
385 | printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, new_line, distance, size, spacing); | ||
386 | #endif | ||
387 | } | ||
388 | span->spacing = spacing; | ||
389 | last_line = push_span(ctx, tdev, span, new_line, distance); | ||
390 | last_span = span; | ||
391 | } | 154 | } |
392 | } | ||
393 | |||
394 | fz_stext_sheet * | ||
395 | fz_new_stext_sheet(fz_context *ctx) | ||
396 | { | ||
397 | fz_stext_sheet *sheet = fz_malloc(ctx, sizeof *sheet); | ||
398 | sheet->maxid = 0; | ||
399 | sheet->style = NULL; | ||
400 | return sheet; | ||
401 | } | ||
402 | 155 | ||
403 | void | 156 | ch->c = c; |
404 | fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet) | 157 | ch->rtl = rtl; |
405 | { | 158 | ch->origin = *p; |
406 | fz_stext_style *style; | 159 | ch->size = size; |
160 | ch->font = font; /* TODO: keep and drop */ | ||
407 | 161 | ||
408 | if (sheet == NULL) | 162 | if (line->wmode == 0) |
409 | return; | ||
410 | |||
411 | style = sheet->style; | ||
412 | while (style) | ||
413 | { | 163 | { |
414 | fz_stext_style *next = style->next; | 164 | a.x = 0; |
415 | fz_drop_font(ctx, style->font); | 165 | d.x = 0; |
416 | fz_free(ctx, style); | 166 | a.y = fz_font_ascender(ctx, font); |
417 | style = next; | 167 | d.y = fz_font_descender(ctx, font); |
418 | } | 168 | } |
419 | fz_free(ctx, sheet); | 169 | else |
420 | } | ||
421 | |||
422 | static fz_stext_style * | ||
423 | fz_lookup_stext_style_imp(fz_context *ctx, fz_stext_sheet *sheet, | ||
424 | float size, fz_font *font, int wmode, int script) | ||
425 | { | ||
426 | fz_stext_style *style; | ||
427 | |||
428 | for (style = sheet->style; style; style = style->next) | ||
429 | { | 170 | { |
430 | if (style->font == font && | 171 | fz_rect *bbox = fz_font_bbox(ctx, font); |
431 | style->size == size && | 172 | a.x = bbox->x1; |
432 | style->wmode == wmode && | 173 | d.x = bbox->x0; |
433 | style->script == script) /* FIXME: others */ | 174 | a.y = 0; |
434 | { | 175 | d.y = 0; |
435 | return style; | ||
436 | } | ||
437 | } | 176 | } |
177 | fz_transform_vector(&a, trm); | ||
178 | fz_transform_vector(&d, trm); | ||
438 | 179 | ||
439 | /* Better make a new one and add it to our list */ | 180 | ch->bbox.x0 = min4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x); |
440 | style = fz_malloc(ctx, sizeof *style); | 181 | ch->bbox.x1 = max4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x); |
441 | style->id = sheet->maxid++; | 182 | ch->bbox.y0 = min4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y); |
442 | style->font = fz_keep_font(ctx, font); | 183 | ch->bbox.y1 = max4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y); |
443 | style->size = size; | ||
444 | style->wmode = wmode; | ||
445 | style->script = script; | ||
446 | style->next = sheet->style; | ||
447 | sheet->style = style; | ||
448 | return style; | ||
449 | } | ||
450 | 184 | ||
451 | static fz_stext_style * | 185 | if (fz_is_empty_rect(&line->bbox)) |
452 | fz_lookup_stext_style(fz_context *ctx, fz_stext_sheet *sheet, fz_text_span *span, const fz_matrix *ctm, | 186 | line->bbox = ch->bbox; |
453 | fz_colorspace *colorspace, const float *color, float alpha, const fz_stroke_state *stroke) | 187 | else |
454 | { | ||
455 | float size = 1.0f; | ||
456 | fz_font *font = span ? span->font : NULL; | ||
457 | int wmode = span ? span->wmode : 0; | ||
458 | if (ctm && span) | ||
459 | { | 188 | { |
460 | fz_matrix tm = span->trm; | 189 | line->bbox.x0 = fz_min(line->bbox.x0, ch->bbox.x0); |
461 | fz_matrix trm; | 190 | line->bbox.y0 = fz_min(line->bbox.y0, ch->bbox.y0); |
462 | tm.e = 0; | 191 | line->bbox.x1 = fz_min(line->bbox.x1, ch->bbox.x1); |
463 | tm.f = 0; | 192 | line->bbox.y1 = fz_min(line->bbox.y1, ch->bbox.y1); |
464 | fz_concat(&trm, &tm, ctm); | ||
465 | size = fz_matrix_expansion(&trm); | ||
466 | } | 193 | } |
467 | return fz_lookup_stext_style_imp(ctx, sheet, size, font, wmode, 0); | ||
468 | } | ||
469 | 194 | ||
470 | fz_stext_page * | 195 | return ch; |
471 | fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox) | ||
472 | { | ||
473 | fz_stext_page *page = fz_malloc(ctx, sizeof(*page)); | ||
474 | page->mediabox = *mediabox; | ||
475 | page->len = 0; | ||
476 | page->cap = 0; | ||
477 | page->blocks = NULL; | ||
478 | page->next = NULL; | ||
479 | return page; | ||
480 | } | 196 | } |
481 | 197 | ||
482 | static void | 198 | static int |
483 | fz_drop_stext_line_contents(fz_context *ctx, fz_stext_line *line) | 199 | direction_from_bidi_class(int bidiclass, int curdir) |
484 | { | 200 | { |
485 | fz_stext_span *span, *next; | 201 | switch (bidiclass) |
486 | for (span = line->first_span; span; span=next) | ||
487 | { | 202 | { |
488 | next = span->next; | 203 | /* strong */ |
489 | fz_free(ctx, span->text); | 204 | case UCDN_BIDI_CLASS_L: return 1; |
490 | fz_free(ctx, span); | 205 | case UCDN_BIDI_CLASS_R: return -1; |
491 | } | 206 | case UCDN_BIDI_CLASS_AL: return -1; |
492 | } | ||
493 | 207 | ||
494 | static void | 208 | /* weak */ |
495 | fz_drop_stext_block(fz_context *ctx, fz_stext_block *block) | 209 | case UCDN_BIDI_CLASS_EN: |
496 | { | 210 | case UCDN_BIDI_CLASS_ES: |
497 | fz_stext_line *line; | 211 | case UCDN_BIDI_CLASS_ET: |
498 | if (block == NULL) | 212 | case UCDN_BIDI_CLASS_AN: |
499 | return; | 213 | case UCDN_BIDI_CLASS_CS: |
500 | for (line = block->lines; line < block->lines + block->len; line++) | 214 | case UCDN_BIDI_CLASS_NSM: |
501 | fz_drop_stext_line_contents(ctx, line); | 215 | case UCDN_BIDI_CLASS_BN: |
502 | fz_free(ctx, block->lines); | 216 | return curdir; |
503 | fz_free(ctx, block); | ||
504 | } | ||
505 | 217 | ||
506 | static void | 218 | /* neutral */ |
507 | fz_drop_image_block(fz_context *ctx, fz_image_block *block) | 219 | case UCDN_BIDI_CLASS_B: |
508 | { | 220 | case UCDN_BIDI_CLASS_S: |
509 | if (block == NULL) | 221 | case UCDN_BIDI_CLASS_WS: |
510 | return; | 222 | case UCDN_BIDI_CLASS_ON: |
511 | fz_drop_image(ctx, block->image); | 223 | return curdir; |
512 | fz_drop_colorspace(ctx, block->cspace); | ||
513 | fz_free(ctx, block); | ||
514 | } | ||
515 | 224 | ||
516 | void | 225 | /* embedding, override, pop ... we don't support them */ |
517 | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) | 226 | default: |
518 | { | 227 | return 0; |
519 | fz_page_block *block; | ||
520 | if (page == NULL) | ||
521 | return; | ||
522 | for (block = page->blocks; block < page->blocks + page->len; block++) | ||
523 | { | ||
524 | switch (block->type) | ||
525 | { | ||
526 | case FZ_PAGE_BLOCK_TEXT: | ||
527 | fz_drop_stext_block(ctx, block->u.text); | ||
528 | break; | ||
529 | case FZ_PAGE_BLOCK_IMAGE: | ||
530 | fz_drop_image_block(ctx, block->u.image); | ||
531 | break; | ||
532 | } | ||
533 | } | 228 | } |
534 | fz_free(ctx, page->blocks); | ||
535 | fz_free(ctx, page); | ||
536 | } | 229 | } |
537 | 230 | ||
538 | static fz_stext_span * | 231 | static int |
539 | fz_new_stext_span(fz_context *ctx, const fz_point *p, int wmode, const fz_matrix *trm) | 232 | sign_eq(float x, float y) |
540 | { | 233 | { |
541 | fz_stext_span *span = fz_malloc_struct(ctx, fz_stext_span); | 234 | return (x < 0 && y < 0) || (x > 0 && y > 0) || (x == 0 && y == 0); |
542 | span->ascender_max = 0; | ||
543 | span->descender_min = 0; | ||
544 | span->cap = 0; | ||
545 | span->len = 0; | ||
546 | span->min = *p; | ||
547 | span->max = *p; | ||
548 | span->wmode = wmode; | ||
549 | span->transform.a = trm->a; | ||
550 | span->transform.b = trm->b; | ||
551 | span->transform.c = trm->c; | ||
552 | span->transform.d = trm->d; | ||
553 | span->transform.e = 0; | ||
554 | span->transform.f = 0; | ||
555 | span->text = NULL; | ||
556 | span->next = NULL; | ||
557 | return span; | ||
558 | } | 235 | } |
559 | 236 | ||
560 | static void | 237 | static int |
561 | add_char_to_span(fz_context *ctx, fz_stext_span *span, int c, fz_point *p, fz_point *max, fz_stext_style *style) | 238 | mat_sign_eq(const fz_matrix *x, const fz_matrix *y) |
562 | { | 239 | { |
563 | if (span->len == span->cap) | 240 | return sign_eq(x->a, y->a) && sign_eq(x->b, y->b) && sign_eq(x->c, y->c) && sign_eq(x->d, y->d); |
564 | { | ||
565 | int newcap = (span->cap ? span->cap * 2 : 16); | ||
566 | span->text = fz_resize_array(ctx, span->text, newcap, sizeof(fz_stext_char)); | ||
567 | span->cap = newcap; | ||
568 | span->bbox = fz_empty_rect; | ||
569 | } | ||
570 | span->max = *max; | ||
571 | if (style->ascender > span->ascender_max) | ||
572 | span->ascender_max = style->ascender; | ||
573 | if (style->descender < span->descender_min) | ||
574 | span->descender_min = style->descender; | ||
575 | span->text[span->len].c = c; | ||
576 | span->text[span->len].p = *p; | ||
577 | span->text[span->len].style = style; | ||
578 | span->len++; | ||
579 | } | 241 | } |
580 | 242 | ||
581 | static void | 243 | static void |
582 | fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) | 244 | fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode) |
583 | { | 245 | { |
584 | int can_append = 1; | 246 | fz_stext_page *page = dev->page; |
247 | fz_stext_block *cur_block; | ||
248 | fz_stext_line *cur_line; | ||
249 | |||
250 | int new_para = 0; | ||
251 | int new_line = 1; | ||
585 | int add_space = 0; | 252 | int add_space = 0; |
586 | fz_point dir, ndir, p, q, r; | 253 | fz_point dir, ndir, p, q; |
587 | float size; | 254 | float size; |
588 | fz_point delta; | 255 | fz_point delta; |
589 | float spacing = 0; | 256 | float spacing = 0; |
590 | float base_offset = 0; | 257 | float base_offset = 0; |
258 | int rtl = 0; | ||
259 | |||
260 | dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir); | ||
591 | 261 | ||
262 | /* dir = direction vector for motion. ndir = normalised(dir) */ | ||
592 | if (wmode == 0) | 263 | if (wmode == 0) |
593 | { | 264 | { |
594 | dir.x = 1; | 265 | dir.x = 1; |
@@ -602,17 +273,16 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty | |||
602 | fz_transform_vector(&dir, trm); | 273 | fz_transform_vector(&dir, trm); |
603 | ndir = dir; | 274 | ndir = dir; |
604 | fz_normalize_vector(&ndir); | 275 | fz_normalize_vector(&ndir); |
605 | /* dir = direction vector for motion. ndir = normalised(dir) */ | ||
606 | 276 | ||
607 | size = fz_matrix_expansion(trm); | 277 | size = fz_matrix_expansion(trm); |
608 | 278 | ||
609 | /* We need to identify where glyphs 'start' (p) and 'stop' (q). | 279 | /* We need to identify where glyphs 'start' (p) and 'stop' (q). |
610 | * Each glyph holds it's 'start' position, and the next glyph in the | 280 | * Each glyph holds its 'start' position, and the next glyph in the |
611 | * span (or span->max if there is no next glyph) holds it's 'end' | 281 | * span (or span->max if there is no next glyph) holds its 'end' |
612 | * position. | 282 | * position. |
613 | * | 283 | * |
614 | * For both horizontal and vertical motion, trm->{e,f} gives the | 284 | * For both horizontal and vertical motion, trm->{e,f} gives the |
615 | * bottom left corner of the glyph. | 285 | * origin (usually the bottom left) of the glyph. |
616 | * | 286 | * |
617 | * In horizontal mode: | 287 | * In horizontal mode: |
618 | * + p is bottom left. | 288 | * + p is bottom left. |
@@ -636,37 +306,38 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty | |||
636 | q.y = trm->f; | 306 | q.y = trm->f; |
637 | } | 307 | } |
638 | 308 | ||
639 | if (glyph < 0) | 309 | /* Find current position to enter new text. */ |
310 | cur_block = page->last_block; | ||
311 | if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) | ||
312 | cur_block = NULL; | ||
313 | cur_line = cur_block ? cur_block->u.t.last_line : NULL; | ||
314 | |||
315 | if (cur_line && glyph < 0) | ||
640 | { | 316 | { |
641 | /* Don't reset 'pen' to start of no-glyph characters in cluster */ | 317 | /* Don't advance pen or break lines for no-glyph characters in a cluster */ |
642 | if (dev->cur_span) | 318 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, 0); |
643 | q = dev->cur_span->max; | 319 | dev->lastchar = c; |
644 | goto no_glyph; | 320 | return; |
645 | } | 321 | } |
646 | 322 | ||
647 | if (dev->cur_span == NULL || | 323 | if (cur_line == NULL || !mat_sign_eq(trm, &dev->trm) || cur_line->wmode != wmode) |
648 | trm->a != dev->cur_span->transform.a || trm->b != dev->cur_span->transform.b || | ||
649 | trm->c != dev->cur_span->transform.c || trm->d != dev->cur_span->transform.d || | ||
650 | dev->cur_span->wmode != wmode) | ||
651 | { | 324 | { |
652 | /* If the matrix has changed, or the wmode is different (or | 325 | /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), |
653 | * if we don't have a span at all), then we can't append. */ | 326 | * then we can't append to the current block/line. */ |
654 | #ifdef DEBUG_SPANS | 327 | new_para = 1; |
655 | printf("Transform/WMode changed\n"); | 328 | new_line = 1; |
656 | #endif | ||
657 | can_append = 0; | ||
658 | } | 329 | } |
659 | else | 330 | else |
660 | { | 331 | { |
661 | delta.x = q.x - dev->cur_span->max.x; | 332 | /* Detect fake bold where text is printed twice in the same place. */ |
662 | delta.y = q.y - dev->cur_span->max.y; | 333 | delta.x = q.x - dev->pen.x; |
334 | delta.y = q.y - dev->pen.y; | ||
663 | if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) | 335 | if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) |
664 | return; | 336 | return; |
665 | 337 | ||
666 | /* Calculate how far we've moved since the end of the current | 338 | /* Calculate how far we've moved since the last character. */ |
667 | * span. */ | 339 | delta.x = p.x - dev->pen.x; |
668 | delta.x = p.x - dev->cur_span->max.x; | 340 | delta.y = p.y - dev->pen.y; |
669 | delta.y = p.y - dev->cur_span->max.y; | ||
670 | 341 | ||
671 | /* The transform has not changed, so we know we're in the same | 342 | /* The transform has not changed, so we know we're in the same |
672 | * direction. Calculate 2 distances; how far off the previous | 343 | * direction. Calculate 2 distances; how far off the previous |
@@ -675,102 +346,129 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty | |||
675 | spacing = ndir.x * delta.x + ndir.y * delta.y; | 346 | spacing = ndir.x * delta.x + ndir.y * delta.y; |
676 | base_offset = -ndir.y * delta.x + ndir.x * delta.y; | 347 | base_offset = -ndir.y * delta.x + ndir.x * delta.y; |
677 | 348 | ||
678 | spacing /= size * SPACE_DIST; | 349 | /* Only a small amount off the baseline - we'll take this */ |
679 | if (fabsf(base_offset) < size * 0.1f) | 350 | if (fabsf(base_offset) < size * 0.8f) |
680 | { | 351 | { |
681 | /* Only a small amount off the baseline - we'll take this */ | 352 | /* LTR or neutral character */ |
682 | if (fabsf(spacing) < 1.0f) | 353 | if (dev->curdir >= 0) |
683 | { | 354 | { |
684 | /* Motion is in line, and small. */ | 355 | if (fabs(spacing) < size * SPACE_DIST) |
685 | } | 356 | { |
686 | else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) | 357 | /* Motion is in line, and small. */ |
687 | { | 358 | new_line = 0; |
688 | /* Motion is in line, but large enough | 359 | } |
689 | * to warrant us adding a space */ | 360 | else if (spacing >= size * SPACE_DIST && spacing < size * SPACE_MAX_DIST) |
690 | if (dev->lastchar != ' ' && wmode == 0) | 361 | { |
691 | add_space = 1; | 362 | /* Motion is in line, but large enough to warrant us adding a space. */ |
363 | if (dev->lastchar != ' ' && wmode == 0) | ||
364 | add_space = 1; | ||
365 | new_line = 0; | ||
366 | } | ||
367 | else | ||
368 | { | ||
369 | /* Motion is in line, but large enough to warrant splitting to a new line */ | ||
370 | new_line = 1; | ||
371 | } | ||
692 | } | 372 | } |
373 | |||
374 | /* RTL character -- disable space character and column detection heuristics */ | ||
693 | else | 375 | else |
694 | { | 376 | { |
695 | /* Motion is in line, but too large - split to a new span */ | 377 | new_line = 0; |
696 | can_append = 0; | 378 | if (spacing > size * SPACE_DIST || spacing < 0) |
379 | rtl = 0; /* backward (or big jump to 'right' side) means logical order */ | ||
380 | else | ||
381 | rtl = 1; /* visual order, we need to reverse in a post process pass */ | ||
697 | } | 382 | } |
698 | } | 383 | } |
384 | |||
385 | /* Enough for a new line, but not enough for a new paragraph */ | ||
386 | else if (fabsf(base_offset) < size * 1.3f) | ||
387 | { | ||
388 | /* Check indent to spot text-indent style paragraphs */ | ||
389 | if (wmode == 0 && cur_line && dev->new_obj) | ||
390 | if (fabsf(p.x - dev->start.x) > size * 0.5f) | ||
391 | new_para = 1; | ||
392 | new_line = 1; | ||
393 | } | ||
394 | |||
395 | /* Way off the baseline - open a new paragraph */ | ||
699 | else | 396 | else |
700 | { | 397 | { |
701 | can_append = 0; | 398 | new_para = 1; |
702 | #ifdef DEBUG_SPANS | 399 | new_line = 1; |
703 | spacing = 0; | ||
704 | #endif | ||
705 | } | 400 | } |
706 | } | 401 | } |
707 | 402 | ||
708 | #ifdef DEBUG_SPANS | 403 | /* Start a new block (but only at the beginning of a text object) */ |
709 | printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset); | 404 | if (new_para || !cur_block) |
710 | #endif | 405 | { |
406 | cur_block = add_text_block_to_page(ctx, page); | ||
407 | cur_line = cur_block->u.t.last_line; | ||
408 | } | ||
711 | 409 | ||
712 | /* Start a new span */ | 410 | /* Start a new line */ |
713 | if (!can_append) | 411 | if (new_line || !cur_line) |
714 | { | 412 | { |
715 | add_span_to_soup(ctx, dev->spans, dev->cur_span); | 413 | cur_line = add_line_to_block(ctx, page, cur_block, wmode); |
716 | dev->cur_span = NULL; | 414 | dev->start = p; |
717 | dev->cur_span = fz_new_stext_span(ctx, &p, wmode, trm); | ||
718 | dev->cur_span->spacing = 0; | ||
719 | } | 415 | } |
720 | 416 | ||
721 | /* Add synthetic space */ | 417 | /* Add synthetic space */ |
722 | if (add_space) | 418 | if (add_space) |
723 | { | 419 | add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, rtl); |
724 | /* We know we always have a cur_span here */ | ||
725 | r = dev->cur_span->max; | ||
726 | add_char_to_span(ctx, dev->cur_span, ' ', &r, &p, style); | ||
727 | } | ||
728 | 420 | ||
729 | no_glyph: | 421 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, rtl); |
730 | add_char_to_span(ctx, dev->cur_span, c, &p, &q, style); | ||
731 | dev->lastchar = c; | 422 | dev->lastchar = c; |
423 | dev->pen = q; | ||
424 | |||
425 | dev->new_obj = 0; | ||
426 | dev->trm = *trm; | ||
732 | } | 427 | } |
733 | 428 | ||
734 | static void | 429 | static void |
735 | fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) | 430 | fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode) |
736 | { | 431 | { |
737 | /* ignore when one unicode character maps to multiple glyphs */ | 432 | /* ignore when one unicode character maps to multiple glyphs */ |
738 | if (c == -1) | 433 | if (c == -1) |
739 | return; | 434 | return; |
740 | 435 | ||
741 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) | 436 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) |
437 | { | ||
742 | switch (c) | 438 | switch (c) |
743 | { | 439 | { |
744 | case 0xFB00: /* ff */ | 440 | case 0xFB00: /* ff */ |
745 | fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); | 441 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
746 | fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); | 442 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
747 | return; | 443 | return; |
748 | case 0xFB01: /* fi */ | 444 | case 0xFB01: /* fi */ |
749 | fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); | 445 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
750 | fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); | 446 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); |
751 | return; | 447 | return; |
752 | case 0xFB02: /* fl */ | 448 | case 0xFB02: /* fl */ |
753 | fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); | 449 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
754 | fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); | 450 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); |
755 | return; | 451 | return; |
756 | case 0xFB03: /* ffi */ | 452 | case 0xFB03: /* ffi */ |
757 | fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); | 453 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
758 | fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); | 454 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
759 | fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); | 455 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); |
760 | return; | 456 | return; |
761 | case 0xFB04: /* ffl */ | 457 | case 0xFB04: /* ffl */ |
762 | fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); | 458 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
763 | fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); | 459 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
764 | fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); | 460 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); |
765 | return; | 461 | return; |
766 | case 0xFB05: /* long st */ | 462 | case 0xFB05: /* long st */ |
767 | case 0xFB06: /* st */ | 463 | case 0xFB06: /* st */ |
768 | fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode); | 464 | fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode); |
769 | fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode); | 465 | fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode); |
770 | return; | 466 | return; |
771 | } | 467 | } |
468 | } | ||
772 | 469 | ||
773 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) | 470 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) |
471 | { | ||
774 | switch (c) | 472 | switch (c) |
775 | { | 473 | { |
776 | case 0x0009: /* tab */ | 474 | case 0x0009: /* tab */ |
@@ -794,56 +492,23 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, | |||
794 | case 0x3000: /* ideographic space */ | 492 | case 0x3000: /* ideographic space */ |
795 | c = ' '; | 493 | c = ' '; |
796 | } | 494 | } |
495 | } | ||
797 | 496 | ||
798 | fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode); | 497 | fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode); |
799 | } | 498 | } |
800 | 499 | ||
801 | static void | 500 | static void |
802 | fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm, fz_stext_style *style) | 501 | fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm) |
803 | { | 502 | { |
804 | fz_font *font = span->font; | 503 | fz_font *font = span->font; |
805 | FT_Face face = fz_font_ft_face(ctx, font); | ||
806 | fz_buffer **t3procs = fz_font_t3_procs(ctx, font); | ||
807 | fz_rect *bbox = fz_font_bbox(ctx, font); | ||
808 | fz_matrix tm = span->trm; | 504 | fz_matrix tm = span->trm; |
809 | fz_matrix trm; | 505 | fz_matrix trm; |
810 | float adv; | 506 | float adv; |
811 | float ascender = 1; | 507 | int i; |
812 | float descender = 0; | ||
813 | int i, err; | ||
814 | 508 | ||
815 | if (span->len == 0) | 509 | if (span->len == 0) |
816 | return; | 510 | return; |
817 | 511 | ||
818 | if (dev->spans == NULL) | ||
819 | dev->spans = new_span_soup(ctx); | ||
820 | |||
821 | if (style->wmode == 0) | ||
822 | { | ||
823 | if (face) | ||
824 | { | ||
825 | fz_lock(ctx, FZ_LOCK_FREETYPE); | ||
826 | err = FT_Set_Char_Size(face, 64, 64, 72, 72); | ||
827 | if (err) | ||
828 | fz_warn(ctx, "freetype set character size: %s", ft_error_string(err)); | ||
829 | ascender = (float)face->ascender / face->units_per_EM; | ||
830 | descender = (float)face->descender / face->units_per_EM; | ||
831 | fz_unlock(ctx, FZ_LOCK_FREETYPE); | ||
832 | } | ||
833 | else if (t3procs && !fz_is_empty_rect(bbox)) | ||
834 | { | ||
835 | ascender = bbox->y1; | ||
836 | descender = bbox->y0; | ||
837 | } | ||
838 | } | ||
839 | else | ||
840 | { | ||
841 | ascender = bbox->x1; | ||
842 | descender = bbox->x0; | ||
843 | } | ||
844 | style->ascender = ascender; | ||
845 | style->descender = descender; | ||
846 | |||
847 | tm.e = 0; | 512 | tm.e = 0; |
848 | tm.f = 0; | 513 | tm.f = 0; |
849 | fz_concat(&trm, &tm, ctm); | 514 | fz_concat(&trm, &tm, ctm); |
@@ -857,11 +522,11 @@ fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, cons | |||
857 | 522 | ||
858 | /* Calculate bounding box and new pen position based on font metrics */ | 523 | /* Calculate bounding box and new pen position based on font metrics */ |
859 | if (span->items[i].gid >= 0) | 524 | if (span->items[i].gid >= 0) |
860 | adv = fz_advance_glyph(ctx, font, span->items[i].gid, style->wmode); | 525 | adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode); |
861 | else | 526 | else |
862 | adv = 0; | 527 | adv = 0; |
863 | 528 | ||
864 | fz_add_stext_char(ctx, dev, style, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode); | 529 | fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode); |
865 | } | 530 | } |
866 | } | 531 | } |
867 | 532 | ||
@@ -870,13 +535,10 @@ fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, const f | |||
870 | fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) | 535 | fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) |
871 | { | 536 | { |
872 | fz_stext_device *tdev = (fz_stext_device*)dev; | 537 | fz_stext_device *tdev = (fz_stext_device*)dev; |
873 | fz_stext_style *style; | ||
874 | fz_text_span *span; | 538 | fz_text_span *span; |
539 | tdev->new_obj = 1; | ||
875 | for (span = text->head; span; span = span->next) | 540 | for (span = text->head; span; span = span->next) |
876 | { | 541 | fz_stext_extract(ctx, tdev, span, ctm); |
877 | style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, NULL); | ||
878 | fz_stext_extract(ctx, tdev, span, ctm, style); | ||
879 | } | ||
880 | } | 542 | } |
881 | 543 | ||
882 | static void | 544 | static void |
@@ -884,94 +546,61 @@ fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const | |||
884 | fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) | 546 | fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) |
885 | { | 547 | { |
886 | fz_stext_device *tdev = (fz_stext_device*)dev; | 548 | fz_stext_device *tdev = (fz_stext_device*)dev; |
887 | fz_stext_style *style; | ||
888 | fz_text_span *span; | 549 | fz_text_span *span; |
550 | tdev->new_obj = 1; | ||
889 | for (span = text->head; span; span = span->next) | 551 | for (span = text->head; span; span = span->next) |
890 | { | 552 | fz_stext_extract(ctx, tdev, span, ctm); |
891 | style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, stroke); | ||
892 | fz_stext_extract(ctx, tdev, span, ctm, style); | ||
893 | } | ||
894 | } | 553 | } |
895 | 554 | ||
896 | static void | 555 | static void |
897 | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor) | 556 | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor) |
898 | { | 557 | { |
899 | fz_stext_device *tdev = (fz_stext_device*)dev; | 558 | fz_stext_device *tdev = (fz_stext_device*)dev; |
900 | fz_stext_style *style; | ||
901 | fz_text_span *span; | 559 | fz_text_span *span; |
560 | tdev->new_obj = 1; | ||
902 | for (span = text->head; span; span = span->next) | 561 | for (span = text->head; span; span = span->next) |
903 | { | 562 | fz_stext_extract(ctx, tdev, span, ctm); |
904 | style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL); | ||
905 | fz_stext_extract(ctx, tdev, span, ctm, style); | ||
906 | } | ||
907 | } | 563 | } |
908 | 564 | ||
909 | static void | 565 | static void |
910 | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor) | 566 | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor) |
911 | { | 567 | { |
912 | fz_stext_device *tdev = (fz_stext_device*)dev; | 568 | fz_stext_device *tdev = (fz_stext_device*)dev; |
913 | fz_stext_style *style; | ||
914 | fz_text_span *span; | 569 | fz_text_span *span; |
570 | tdev->new_obj = 1; | ||
915 | for (span = text->head; span; span = span->next) | 571 | for (span = text->head; span; span = span->next) |
916 | { | 572 | fz_stext_extract(ctx, tdev, span, ctm); |
917 | style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, stroke); | ||
918 | fz_stext_extract(ctx, tdev, span, ctm, style); | ||
919 | } | ||
920 | } | 573 | } |
921 | 574 | ||
922 | static void | 575 | static void |
923 | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm) | 576 | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm) |
924 | { | 577 | { |
925 | fz_stext_device *tdev = (fz_stext_device*)dev; | 578 | fz_stext_device *tdev = (fz_stext_device*)dev; |
926 | fz_stext_style *style; | ||
927 | fz_text_span *span; | 579 | fz_text_span *span; |
580 | tdev->new_obj = 1; | ||
928 | for (span = text->head; span; span = span->next) | 581 | for (span = text->head; span; span = span->next) |
929 | { | 582 | fz_stext_extract(ctx, tdev, span, ctm); |
930 | style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL); | ||
931 | fz_stext_extract(ctx, tdev, span, ctm, style); | ||
932 | } | ||
933 | } | 583 | } |
934 | 584 | ||
585 | /* Images and shadings */ | ||
586 | |||
935 | static void | 587 | static void |
936 | fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, | 588 | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params) |
937 | fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params) | ||
938 | { | 589 | { |
939 | fz_stext_device *tdev = (fz_stext_device*)dev; | 590 | fz_stext_device *tdev = (fz_stext_device*)dev; |
940 | fz_stext_page *page = tdev->page; | ||
941 | fz_image_block *block; | ||
942 | 591 | ||
943 | /* If the alpha is less than 50% then it's probably a watermark or | 592 | /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ |
944 | * effect or something. Skip it */ | ||
945 | if (alpha < 0.5f) | 593 | if (alpha < 0.5f) |
946 | return; | 594 | return; |
947 | 595 | ||
948 | /* New block */ | 596 | add_image_block_to_page(ctx, tdev->page, ctm, img); |
949 | if (page->len == page->cap) | ||
950 | { | ||
951 | int newcap = (page->cap ? page->cap*2 : 4); | ||
952 | page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks)); | ||
953 | page->cap = newcap; | ||
954 | } | ||
955 | block = fz_malloc_struct(ctx, fz_image_block); | ||
956 | page->blocks[page->len].type = FZ_PAGE_BLOCK_IMAGE; | ||
957 | page->blocks[page->len].u.image = block; | ||
958 | block->image = fz_keep_image(ctx, img); | ||
959 | block->cspace = fz_keep_colorspace(ctx, cspace); | ||
960 | if (cspace) | ||
961 | memcpy(block->colors, color, sizeof(block->colors[0])*fz_colorspace_n(ctx, cspace)); | ||
962 | block->mat = *ctm; | ||
963 | block->bbox.x0 = 0; | ||
964 | block->bbox.y0 = 0; | ||
965 | block->bbox.x1 = 1; | ||
966 | block->bbox.y1 = 1; | ||
967 | fz_transform_rect(&block->bbox, ctm); | ||
968 | page->len++; | ||
969 | } | 597 | } |
970 | 598 | ||
971 | static void | 599 | static void |
972 | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params) | 600 | fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, |
601 | fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params) | ||
973 | { | 602 | { |
974 | fz_stext_fill_image_mask(ctx, dev, img, ctm, NULL, NULL, alpha, color_params); | 603 | fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); |
975 | } | 604 | } |
976 | 605 | ||
977 | static fz_image * | 606 | static fz_image * |
@@ -1025,103 +654,89 @@ fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, const fz_m | |||
1025 | fz_rethrow(ctx); | 654 | fz_rethrow(ctx); |
1026 | } | 655 | } |
1027 | 656 | ||
1028 | static int | 657 | /* RTL visual to logical order pass */ |
1029 | direction_from_bidi_class(int bidiclass, int curdir) | ||
1030 | { | ||
1031 | switch (bidiclass) | ||
1032 | { | ||
1033 | /* strong */ | ||
1034 | case UCDN_BIDI_CLASS_L: return 1; | ||
1035 | case UCDN_BIDI_CLASS_R: return -1; | ||
1036 | case UCDN_BIDI_CLASS_AL: return -1; | ||
1037 | |||
1038 | /* weak */ | ||
1039 | case UCDN_BIDI_CLASS_EN: | ||
1040 | case UCDN_BIDI_CLASS_ES: | ||
1041 | case UCDN_BIDI_CLASS_ET: | ||
1042 | case UCDN_BIDI_CLASS_AN: | ||
1043 | case UCDN_BIDI_CLASS_CS: | ||
1044 | case UCDN_BIDI_CLASS_NSM: | ||
1045 | case UCDN_BIDI_CLASS_BN: | ||
1046 | return curdir; | ||
1047 | |||
1048 | /* neutral */ | ||
1049 | case UCDN_BIDI_CLASS_B: | ||
1050 | case UCDN_BIDI_CLASS_S: | ||
1051 | case UCDN_BIDI_CLASS_WS: | ||
1052 | case UCDN_BIDI_CLASS_ON: | ||
1053 | return curdir; | ||
1054 | |||
1055 | /* embedding, override, pop ... we don't support them */ | ||
1056 | default: | ||
1057 | return 0; | ||
1058 | } | ||
1059 | } | ||
1060 | 658 | ||
1061 | static void | 659 | static void |
1062 | fz_bidi_reorder_run(fz_stext_span *span, int a, int b, int dir) | 660 | fz_bidi_reorder_run(fz_stext_char *a, fz_stext_char *b, int dir) |
1063 | { | 661 | { |
1064 | if (a < b && dir == -1) | 662 | if (a < b && dir == -1) |
1065 | { | 663 | { |
1066 | fz_stext_char c; | 664 | fz_stext_char tmp; |
1067 | int m = a + (b - a) / 2; | 665 | fz_stext_char *m = a + (b - a) / 2; |
1068 | while (a < m) | 666 | while (a < m) |
1069 | { | 667 | { |
1070 | b--; | 668 | b--; |
1071 | c = span->text[a]; | 669 | |
1072 | span->text[a] = span->text[b]; | 670 | tmp.c = a->c; |
1073 | span->text[b] = c; | 671 | tmp.origin = a->origin; |
672 | tmp.bbox = a->bbox; | ||
673 | tmp.size = a->size; | ||
674 | tmp.font = a->font; | ||
675 | |||
676 | a->c = b->c; | ||
677 | a->origin = b->origin; | ||
678 | a->bbox = b->bbox; | ||
679 | a->size = b->size; | ||
680 | a->font = b->font; | ||
681 | |||
682 | b->c = tmp.c; | ||
683 | b->origin = tmp.origin; | ||
684 | b->bbox = tmp.bbox; | ||
685 | b->size = tmp.size; | ||
686 | b->font = tmp.font; | ||
687 | |||
1074 | a++; | 688 | a++; |
1075 | } | 689 | } |
1076 | } | 690 | } |
1077 | } | 691 | } |
1078 | 692 | ||
1079 | static void | 693 | static void |
1080 | fz_bidi_reorder_span(fz_stext_span *span) | 694 | fz_bidi_reorder_line(fz_stext_line *line) |
1081 | { | 695 | { |
1082 | int a, b, dir, curdir; | 696 | fz_stext_char *a, *b; |
697 | int dir, curdir; | ||
1083 | 698 | ||
1084 | a = 0; | 699 | a = line->first_char; |
1085 | curdir = 1; | 700 | curdir = 0; |
1086 | for (b = 0; b < span->len; b++) | 701 | for (b = line->first_char; b; b = b->next) |
1087 | { | 702 | { |
1088 | dir = direction_from_bidi_class(ucdn_get_bidi_class(span->text[b].c), curdir); | 703 | dir = b->rtl; |
1089 | if (dir != curdir) | 704 | if (dir != curdir) |
1090 | { | 705 | { |
1091 | fz_bidi_reorder_run(span, a, b, curdir); | 706 | fz_bidi_reorder_run(a, b, curdir); |
1092 | curdir = dir; | 707 | curdir = dir; |
1093 | a = b; | 708 | a = b; |
1094 | } | 709 | } |
1095 | } | 710 | } |
1096 | fz_bidi_reorder_run(span, a, b, curdir); | 711 | fz_bidi_reorder_run(a, b, curdir); |
1097 | } | 712 | } |
1098 | 713 | ||
1099 | static void | 714 | static void |
1100 | fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page) | 715 | fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page) |
1101 | { | 716 | { |
1102 | fz_page_block *pageblock; | ||
1103 | fz_stext_block *block; | 717 | fz_stext_block *block; |
1104 | fz_stext_line *line; | 718 | fz_stext_line *line; |
1105 | fz_stext_span *span; | ||
1106 | 719 | ||
1107 | for (pageblock = page->blocks; pageblock < page->blocks + page->len; pageblock++) | 720 | for (block = page->first_block; block; block = block->next) |
1108 | if (pageblock->type == FZ_PAGE_BLOCK_TEXT) | 721 | if (block->type == FZ_STEXT_BLOCK_TEXT) |
1109 | for (block = pageblock->u.text, line = block->lines; line < block->lines + block->len; line++) | 722 | for (line = block->u.t.first_line; line; line = line->next) |
1110 | for (span = line->first_span; span; span = span->next) | 723 | fz_bidi_reorder_line(line); |
1111 | fz_bidi_reorder_span(span); | ||
1112 | } | 724 | } |
1113 | 725 | ||
1114 | static void | 726 | static void |
1115 | fz_stext_close_device(fz_context *ctx, fz_device *dev) | 727 | fz_stext_close_device(fz_context *ctx, fz_device *dev) |
1116 | { | 728 | { |
1117 | fz_stext_device *tdev = (fz_stext_device*)dev; | 729 | fz_stext_device *tdev = (fz_stext_device*)dev; |
730 | fz_stext_page *page = tdev->page; | ||
731 | fz_stext_block *block; | ||
732 | fz_stext_line *line; | ||
1118 | 733 | ||
1119 | add_span_to_soup(ctx, tdev->spans, tdev->cur_span); | 734 | for (block = page->first_block; block; block = block->next) |
1120 | tdev->cur_span = NULL; | 735 | if (block->type == FZ_STEXT_BLOCK_TEXT) |
1121 | 736 | for (line = block->u.t.first_line; line; line = line->next) | |
1122 | strain_soup(ctx, tdev); | 737 | fz_union_rect(&block->bbox, &line->bbox); |
1123 | 738 | ||
1124 | /* TODO: smart sorting of blocks in reading order */ | 739 | /* TODO: smart sorting of blocks and lines in reading order */ |
1125 | /* TODO: unicode NFC normalization */ | 740 | /* TODO: unicode NFC normalization */ |
1126 | 741 | ||
1127 | fz_bidi_reorder_stext_page(ctx, tdev->page); | 742 | fz_bidi_reorder_stext_page(ctx, tdev->page); |
@@ -1130,9 +745,6 @@ fz_stext_close_device(fz_context *ctx, fz_device *dev) | |||
1130 | static void | 745 | static void |
1131 | fz_stext_drop_device(fz_context *ctx, fz_device *dev) | 746 | fz_stext_drop_device(fz_context *ctx, fz_device *dev) |
1132 | { | 747 | { |
1133 | fz_stext_device *tdev = (fz_stext_device*)dev; | ||
1134 | free_span_soup(ctx, tdev->spans); | ||
1135 | tdev->spans = NULL; | ||
1136 | } | 748 | } |
1137 | 749 | ||
1138 | fz_stext_options * | 750 | fz_stext_options * |
@@ -1153,7 +765,7 @@ fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *stri | |||
1153 | } | 765 | } |
1154 | 766 | ||
1155 | fz_device * | 767 | fz_device * |
1156 | fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts) | 768 | fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) |
1157 | { | 769 | { |
1158 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); | 770 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); |
1159 | 771 | ||
@@ -1174,11 +786,12 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, | |||
1174 | dev->super.fill_image_mask = fz_stext_fill_image_mask; | 786 | dev->super.fill_image_mask = fz_stext_fill_image_mask; |
1175 | } | 787 | } |
1176 | 788 | ||
1177 | dev->sheet = sheet; | ||
1178 | dev->page = page; | 789 | dev->page = page; |
1179 | dev->spans = NULL; | 790 | dev->pen.x = 0; |
1180 | dev->cur_span = NULL; | 791 | dev->pen.y = 0; |
792 | dev->trm = fz_identity; | ||
1181 | dev->lastchar = ' '; | 793 | dev->lastchar = ' '; |
794 | dev->curdir = 1; | ||
1182 | 795 | ||
1183 | return (fz_device*)dev; | 796 | return (fz_device*)dev; |
1184 | } | 797 | } |
diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c index 63124aa7f..f5f724121 100644 --- a/source/fitz/stext-output.c +++ b/source/fitz/stext-output.c | |||
@@ -9,40 +9,28 @@ | |||
9 | /* HTML output (visual formatting with preserved layout) */ | 9 | /* HTML output (visual formatting with preserved layout) */ |
10 | 10 | ||
11 | static void | 11 | static void |
12 | fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style) | 12 | fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size) |
13 | { | 13 | { |
14 | int is_bold = fz_font_is_bold(ctx, style->font); | 14 | int is_bold = fz_font_is_bold(ctx, font); |
15 | int is_italic = fz_font_is_italic(ctx, style->font); | 15 | int is_italic = fz_font_is_italic(ctx, font); |
16 | int is_serif = fz_font_is_serif(ctx, style->font); | 16 | int is_serif = fz_font_is_serif(ctx, font); |
17 | int is_mono = fz_font_is_monospaced(ctx, style->font); | 17 | int is_mono = fz_font_is_monospaced(ctx, font); |
18 | int script = style->script; | ||
19 | 18 | ||
20 | fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", style->size); | 19 | fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", size); |
21 | if (is_mono) | 20 | if (is_mono) |
22 | fz_write_string(ctx, out, "<tt>"); | 21 | fz_write_string(ctx, out, "<tt>"); |
23 | if (is_bold) | 22 | if (is_bold) |
24 | fz_write_string(ctx, out, "<b>"); | 23 | fz_write_string(ctx, out, "<b>"); |
25 | if (is_italic) | 24 | if (is_italic) |
26 | fz_write_string(ctx, out, "<i>"); | 25 | fz_write_string(ctx, out, "<i>"); |
27 | |||
28 | while (script-- > 0) | ||
29 | fz_write_string(ctx, out, "<sup>"); | ||
30 | while (++script < 0) | ||
31 | fz_write_string(ctx, out, "<sub>"); | ||
32 | } | 26 | } |
33 | 27 | ||
34 | static void | 28 | static void |
35 | fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) | 29 | fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size) |
36 | { | 30 | { |
37 | int is_mono = fz_font_is_monospaced(ctx, style->font); | 31 | int is_mono = fz_font_is_monospaced(ctx, font); |
38 | int is_bold = fz_font_is_bold(ctx, style->font); | 32 | int is_bold = fz_font_is_bold(ctx,font); |
39 | int is_italic = fz_font_is_italic(ctx, style->font); | 33 | int is_italic = fz_font_is_italic(ctx, font); |
40 | int script = style->script; | ||
41 | |||
42 | while (script-- > 0) | ||
43 | fz_write_string(ctx, out, "</sup>"); | ||
44 | while (++script < 0) | ||
45 | fz_write_string(ctx, out, "</sub>"); | ||
46 | 34 | ||
47 | if (is_italic) | 35 | if (is_italic) |
48 | fz_write_string(ctx, out, "</i>"); | 36 | fz_write_string(ctx, out, "</i>"); |
@@ -54,7 +42,7 @@ fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) | |||
54 | } | 42 | } |
55 | 43 | ||
56 | static void | 44 | static void |
57 | fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block) | 45 | fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
58 | { | 46 | { |
59 | int x = block->bbox.x0; | 47 | int x = block->bbox.x0; |
60 | int y = block->bbox.y0; | 48 | int y = block->bbox.y0; |
@@ -62,90 +50,78 @@ fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *bl | |||
62 | int h = block->bbox.y1 - block->bbox.y0; | 50 | int h = block->bbox.y1 - block->bbox.y0; |
63 | 51 | ||
64 | fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h); | 52 | fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h); |
65 | fz_write_image_as_data_uri(ctx, out, block->image); | 53 | fz_write_image_as_data_uri(ctx, out, block->u.i.image); |
66 | fz_write_string(ctx, out, "\">\n"); | 54 | fz_write_string(ctx, out, "\">\n"); |
67 | } | 55 | } |
68 | 56 | ||
69 | void | 57 | void |
70 | fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) | 58 | fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) |
71 | { | 59 | { |
72 | fz_stext_style *style = NULL; | ||
73 | fz_stext_line *line; | 60 | fz_stext_line *line; |
74 | fz_stext_span *span; | ||
75 | fz_stext_char *ch; | 61 | fz_stext_char *ch; |
76 | int x, y; | 62 | int x, y; |
77 | 63 | ||
78 | style = NULL; | 64 | fz_font *font = NULL; |
65 | float size = 0; | ||
79 | 66 | ||
80 | for (line = block->lines; line < block->lines + block->len; ++line) | 67 | for (line = block->u.t.first_line; line; line = line->next) |
81 | { | 68 | { |
82 | for (span = line->first_span; span; span = span->next) | 69 | x = line->bbox.x0; |
70 | y = line->bbox.y0; | ||
71 | |||
72 | fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x); | ||
73 | font = NULL; | ||
74 | |||
75 | for (ch = line->first_char; ch; ch = ch->next) | ||
83 | { | 76 | { |
84 | if (span == line->first_span || span->spacing > 1) | 77 | if (ch->font != font || ch->size != size) |
85 | { | 78 | { |
86 | if (style) | 79 | if (font) |
87 | { | 80 | fz_print_style_end_html(ctx, out, font, size); |
88 | fz_print_style_end_html(ctx, out, style); | 81 | font = ch->font; |
89 | fz_write_string(ctx, out, "</p>\n"); | 82 | size = ch->size; |
90 | style = NULL; | 83 | fz_print_style_begin_html(ctx, out, font, size); |
91 | } | ||
92 | x = span->bbox.x0; | ||
93 | y = span->bbox.y0; | ||
94 | fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x); | ||
95 | } | 84 | } |
96 | 85 | ||
97 | for (ch = span->text; ch < span->text + span->len; ++ch) | 86 | switch (ch->c) |
98 | { | 87 | { |
99 | if (ch->style != style) | 88 | default: |
100 | { | 89 | if (ch->c >= 32 && ch->c <= 127) |
101 | if (style) | 90 | fz_write_byte(ctx, out, ch->c); |
102 | fz_print_style_end_html(ctx, out, style); | 91 | else |
103 | style = ch->style; | 92 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
104 | fz_print_style_begin_html(ctx, out, style); | 93 | break; |
105 | } | 94 | case '<': fz_write_string(ctx, out, "<"); break; |
106 | 95 | case '>': fz_write_string(ctx, out, ">"); break; | |
107 | switch (ch->c) | 96 | case '&': fz_write_string(ctx, out, "&"); break; |
108 | { | 97 | case '"': fz_write_string(ctx, out, """); break; |
109 | default: | 98 | case '\'': fz_write_string(ctx, out, "'"); break; |
110 | if (ch->c >= 32 && ch->c <= 127) | ||
111 | fz_write_byte(ctx, out, ch->c); | ||
112 | else | ||
113 | fz_write_printf(ctx, out, "&#x%x;", ch->c); | ||
114 | break; | ||
115 | case '<': fz_write_string(ctx, out, "<"); break; | ||
116 | case '>': fz_write_string(ctx, out, ">"); break; | ||
117 | case '&': fz_write_string(ctx, out, "&"); break; | ||
118 | case '"': fz_write_string(ctx, out, """); break; | ||
119 | case '\'': fz_write_string(ctx, out, "'"); break; | ||
120 | } | ||
121 | } | 99 | } |
122 | } | 100 | } |
123 | 101 | ||
124 | if (style) | 102 | if (font) |
125 | { | 103 | fz_print_style_end_html(ctx, out, font, size); |
126 | fz_print_style_end_html(ctx, out, style); | 104 | |
127 | fz_write_string(ctx, out, "</p>\n"); | 105 | fz_write_string(ctx, out, "</p>\n"); |
128 | style = NULL; | ||
129 | } | ||
130 | } | 106 | } |
131 | } | 107 | } |
132 | 108 | ||
133 | void | 109 | void |
134 | fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page) | 110 | fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page) |
135 | { | 111 | { |
136 | fz_page_block *block; | 112 | fz_stext_block *block; |
137 | 113 | ||
138 | int w = page->mediabox.x1 - page->mediabox.x0; | 114 | int w = page->mediabox.x1 - page->mediabox.x0; |
139 | int h = page->mediabox.y1 - page->mediabox.y0; | 115 | int h = page->mediabox.y1 - page->mediabox.y0; |
140 | 116 | ||
141 | fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h); | 117 | fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h); |
142 | 118 | ||
143 | for (block = page->blocks; block < page->blocks + page->len; ++block) | 119 | for (block = page->first_block; block; block = block->next) |
144 | { | 120 | { |
145 | if (block->type == FZ_PAGE_BLOCK_IMAGE) | 121 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
146 | fz_print_stext_image_as_html(ctx, out, block->u.image); | 122 | fz_print_stext_image_as_html(ctx, out, block); |
147 | else if (block->type == FZ_PAGE_BLOCK_TEXT) | 123 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
148 | fz_print_stext_block_as_html(ctx, out, block->u.text); | 124 | fz_print_stext_block_as_html(ctx, out, block); |
149 | } | 125 | } |
150 | 126 | ||
151 | fz_write_string(ctx, out, "</div>\n"); | 127 | fz_write_string(ctx, out, "</div>\n"); |
@@ -177,23 +153,22 @@ fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) | |||
177 | /* XHTML output (semantic, little layout, suitable for reflow) */ | 153 | /* XHTML output (semantic, little layout, suitable for reflow) */ |
178 | 154 | ||
179 | static void | 155 | static void |
180 | fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block) | 156 | fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
181 | { | 157 | { |
182 | int w = block->bbox.x1 - block->bbox.x0; | 158 | int w = block->bbox.x1 - block->bbox.x0; |
183 | int h = block->bbox.y1 - block->bbox.y0; | 159 | int h = block->bbox.y1 - block->bbox.y0; |
184 | 160 | ||
185 | fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h); | 161 | fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h); |
186 | fz_write_image_as_data_uri(ctx, out, block->image); | 162 | fz_write_image_as_data_uri(ctx, out, block->u.i.image); |
187 | fz_write_string(ctx, out, "\"/></p>\n"); | 163 | fz_write_string(ctx, out, "\"/></p>\n"); |
188 | } | 164 | } |
189 | 165 | ||
190 | static void | 166 | static void |
191 | fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) | 167 | fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size) |
192 | { | 168 | { |
193 | int is_mono = fz_font_is_monospaced(ctx, style->font); | 169 | int is_mono = fz_font_is_monospaced(ctx, font); |
194 | int is_bold = fz_font_is_bold(ctx, style->font); | 170 | int is_bold = fz_font_is_bold(ctx, font); |
195 | int is_italic = fz_font_is_italic(ctx, style->font); | 171 | int is_italic = fz_font_is_italic(ctx, font); |
196 | int script = style->script; | ||
197 | 172 | ||
198 | if (is_mono) | 173 | if (is_mono) |
199 | fz_write_string(ctx, out, "<tt>"); | 174 | fz_write_string(ctx, out, "<tt>"); |
@@ -201,25 +176,14 @@ fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *styl | |||
201 | fz_write_string(ctx, out, "<b>"); | 176 | fz_write_string(ctx, out, "<b>"); |
202 | if (is_italic) | 177 | if (is_italic) |
203 | fz_write_string(ctx, out, "<i>"); | 178 | fz_write_string(ctx, out, "<i>"); |
204 | |||
205 | while (script-- > 0) | ||
206 | fz_write_string(ctx, out, "<sup>"); | ||
207 | while (++script < 0) | ||
208 | fz_write_string(ctx, out, "<sub>"); | ||
209 | } | 179 | } |
210 | 180 | ||
211 | static void | 181 | static void |
212 | fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) | 182 | fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size) |
213 | { | 183 | { |
214 | int is_mono = fz_font_is_monospaced(ctx, style->font); | 184 | int is_mono = fz_font_is_monospaced(ctx, font); |
215 | int is_bold = fz_font_is_bold(ctx, style->font); | 185 | int is_bold = fz_font_is_bold(ctx, font); |
216 | int is_italic = fz_font_is_italic(ctx, style->font); | 186 | int is_italic = fz_font_is_italic(ctx, font); |
217 | int script = style->script; | ||
218 | |||
219 | while (script-- > 0) | ||
220 | fz_write_string(ctx, out, "</sup>"); | ||
221 | while (++script < 0) | ||
222 | fz_write_string(ctx, out, "</sub>"); | ||
223 | 187 | ||
224 | if (is_italic) | 188 | if (is_italic) |
225 | fz_write_string(ctx, out, "</i>"); | 189 | fz_write_string(ctx, out, "</i>"); |
@@ -232,68 +196,63 @@ fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) | |||
232 | static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) | 196 | static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) |
233 | { | 197 | { |
234 | fz_stext_line *line; | 198 | fz_stext_line *line; |
235 | fz_stext_span *span; | ||
236 | fz_stext_char *ch; | 199 | fz_stext_char *ch; |
237 | fz_stext_style *style; | ||
238 | 200 | ||
239 | style = NULL; | 201 | fz_font *font = NULL; |
240 | fz_write_string(ctx, out, "<p>\n"); | 202 | float size = 0; |
203 | |||
204 | fz_write_string(ctx, out, "<p>"); | ||
241 | 205 | ||
242 | for (line = block->lines; line < block->lines + block->len; ++line) | 206 | for (line = block->u.t.first_line; line; line = line->next) |
243 | { | 207 | { |
244 | if (line > block->lines) | 208 | if (line != block->u.t.first_line) |
245 | fz_write_string(ctx, out, "<br/>\n"); | 209 | fz_write_string(ctx, out, "\n"); |
246 | for (span = line->first_span; span; span = span->next) | 210 | for (ch = line->first_char; ch; ch = ch->next) |
247 | { | 211 | { |
248 | if (span->spacing > 1) | 212 | if (ch->font != font || ch->size != size) |
249 | fz_write_byte(ctx, out, ' '); | ||
250 | |||
251 | for (ch = span->text; ch < span->text + span->len; ++ch) | ||
252 | { | 213 | { |
253 | if (ch->style != style) | 214 | if (font) |
254 | { | 215 | fz_print_style_end_xhtml(ctx, out, font, size); |
255 | if (style) | 216 | font = ch->font; |
256 | fz_print_style_end_xhtml(ctx, out, style); | 217 | size = ch->size; |
257 | style = ch->style; | 218 | fz_print_style_begin_xhtml(ctx, out, font, size); |
258 | fz_print_style_begin_xhtml(ctx, out, style); | 219 | } |
259 | } | ||
260 | 220 | ||
261 | switch (ch->c) | 221 | switch (ch->c) |
262 | { | 222 | { |
263 | default: | 223 | default: |
264 | if (ch->c >= 32 && ch->c <= 127) | 224 | if (ch->c >= 32 && ch->c <= 127) |
265 | fz_write_byte(ctx, out, ch->c); | 225 | fz_write_byte(ctx, out, ch->c); |
266 | else | 226 | else |
267 | fz_write_printf(ctx, out, "&#x%x;", ch->c); | 227 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
268 | break; | 228 | break; |
269 | case '<': fz_write_string(ctx, out, "<"); break; | 229 | case '<': fz_write_string(ctx, out, "<"); break; |
270 | case '>': fz_write_string(ctx, out, ">"); break; | 230 | case '>': fz_write_string(ctx, out, ">"); break; |
271 | case '&': fz_write_string(ctx, out, "&"); break; | 231 | case '&': fz_write_string(ctx, out, "&"); break; |
272 | case '"': fz_write_string(ctx, out, """); break; | 232 | case '"': fz_write_string(ctx, out, """); break; |
273 | case '\'': fz_write_string(ctx, out, "'"); break; | 233 | case '\'': fz_write_string(ctx, out, "'"); break; |
274 | } | ||
275 | } | 234 | } |
276 | } | 235 | } |
277 | } | 236 | } |
278 | 237 | ||
279 | if (style) | 238 | if (font) |
280 | fz_print_style_end_xhtml(ctx, out, style); | 239 | fz_print_style_end_xhtml(ctx, out, font, size); |
281 | fz_write_string(ctx, out, "\n</p>\n"); | 240 | fz_write_string(ctx, out, "</p>\n"); |
282 | } | 241 | } |
283 | 242 | ||
284 | void | 243 | void |
285 | fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page) | 244 | fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page) |
286 | { | 245 | { |
287 | fz_page_block *block; | 246 | fz_stext_block *block; |
288 | 247 | ||
289 | fz_write_string(ctx, out, "<div>\n"); | 248 | fz_write_string(ctx, out, "<div>\n"); |
290 | 249 | ||
291 | for (block = page->blocks; block < page->blocks + page->len; ++block) | 250 | for (block = page->first_block; block; block = block->next) |
292 | { | 251 | { |
293 | if (block->type == FZ_PAGE_BLOCK_IMAGE) | 252 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
294 | fz_print_stext_image_as_xhtml(ctx, out, block->u.image); | 253 | fz_print_stext_image_as_xhtml(ctx, out, block); |
295 | else if (block->type == FZ_PAGE_BLOCK_TEXT) | 254 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
296 | fz_print_stext_block_as_xhtml(ctx, out, block->u.text); | 255 | fz_print_stext_block_as_xhtml(ctx, out, block); |
297 | } | 256 | } |
298 | 257 | ||
299 | fz_write_string(ctx, out, "</div>\n"); | 258 | fz_write_string(ctx, out, "</div>\n"); |
@@ -311,6 +270,7 @@ fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) | |||
311 | fz_write_string(ctx, out, "<style>\n"); | 270 | fz_write_string(ctx, out, "<style>\n"); |
312 | fz_write_string(ctx, out, "body{background-color:gray}\n"); | 271 | fz_write_string(ctx, out, "body{background-color:gray}\n"); |
313 | fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n"); | 272 | fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n"); |
273 | fz_write_string(ctx, out, "p{white-space:pre-wrap}\n"); | ||
314 | fz_write_string(ctx, out, "</style>\n"); | 274 | fz_write_string(ctx, out, "</style>\n"); |
315 | fz_write_string(ctx, out, "</head>\n"); | 275 | fz_write_string(ctx, out, "</head>\n"); |
316 | fz_write_string(ctx, out, "<body>\n"); | 276 | fz_write_string(ctx, out, "<body>\n"); |
@@ -328,87 +288,79 @@ fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) | |||
328 | void | 288 | void |
329 | fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) | 289 | fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) |
330 | { | 290 | { |
331 | int block_n; | 291 | fz_stext_block *block; |
292 | fz_stext_line *line; | ||
293 | fz_stext_char *ch; | ||
332 | 294 | ||
333 | fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n", | 295 | fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n", |
334 | page->mediabox.x1 - page->mediabox.x0, | 296 | page->mediabox.x1 - page->mediabox.x0, |
335 | page->mediabox.y1 - page->mediabox.y0); | 297 | page->mediabox.y1 - page->mediabox.y0); |
336 | 298 | ||
337 | for (block_n = 0; block_n < page->len; block_n++) | 299 | for (block = page->first_block; block; block = block->next) |
338 | { | 300 | { |
339 | switch (page->blocks[block_n].type) | 301 | switch (block->type) |
340 | { | ||
341 | case FZ_PAGE_BLOCK_TEXT: | ||
342 | { | 302 | { |
343 | fz_stext_block *block = page->blocks[block_n].u.text; | 303 | case FZ_STEXT_BLOCK_TEXT: |
344 | fz_stext_line *line; | ||
345 | const char *s; | ||
346 | |||
347 | fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n", | 304 | fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n", |
348 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); | 305 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); |
349 | for (line = block->lines; line < block->lines + block->len; line++) | 306 | for (line = block->u.t.first_line; line; line = line->next) |
350 | { | 307 | { |
351 | fz_stext_span *span; | 308 | fz_font *font = NULL; |
309 | float size = 0; | ||
310 | const char *name = NULL; | ||
311 | const char *s; | ||
312 | fz_rect rect; | ||
313 | |||
352 | fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n", | 314 | fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n", |
353 | line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); | 315 | line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); |
354 | for (span = line->first_span; span; span = span->next) | 316 | |
317 | for (ch = line->first_char; ch; ch = ch->next) | ||
355 | { | 318 | { |
356 | fz_stext_style *style = NULL; | 319 | if (ch->font != font || ch->size != size) |
357 | const char *name = NULL; | 320 | { |
358 | int char_num; | 321 | if (font) |
359 | for (char_num = 0; char_num < span->len; char_num++) | 322 | fz_write_string(ctx, out, "</font>\n"); |
323 | font = ch->font; | ||
324 | size = ch->size; | ||
325 | name = fz_font_name(ctx, font); | ||
326 | s = strchr(name, '+'); | ||
327 | s = s ? s + 1 : name; | ||
328 | fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", s, size); | ||
329 | } | ||
330 | fz_stext_char_bbox(ctx, &rect, line, ch); | ||
331 | fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"", | ||
332 | rect.x0, rect.y0, rect.x1, rect.y1, ch->origin.x, ch->origin.y); | ||
333 | switch (ch->c) | ||
360 | { | 334 | { |
361 | fz_stext_char *ch = &span->text[char_num]; | 335 | case '<': fz_write_string(ctx, out, "<"); break; |
362 | if (ch->style != style) | 336 | case '>': fz_write_string(ctx, out, ">"); break; |
363 | { | 337 | case '&': fz_write_string(ctx, out, "&"); break; |
364 | if (style) | 338 | case '"': fz_write_string(ctx, out, """); break; |
365 | { | 339 | case '\'': fz_write_string(ctx, out, "'"); break; |
366 | fz_write_string(ctx, out, "</span>\n"); | 340 | default: |
367 | } | 341 | if (ch->c >= 32 && ch->c <= 127) |
368 | style = ch->style; | 342 | fz_write_printf(ctx, out, "%c", ch->c); |
369 | name = fz_font_name(ctx, style->font); | 343 | else |
370 | s = strchr(name, '+'); | 344 | fz_write_printf(ctx, out, "&#x%x;", ch->c); |
371 | s = s ? s + 1 : name; | 345 | break; |
372 | fz_write_printf(ctx, out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n", | ||
373 | span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1, | ||
374 | s, style->size); | ||
375 | } | ||
376 | { | ||
377 | fz_rect rect; | ||
378 | fz_stext_char_bbox(ctx, &rect, span, char_num); | ||
379 | fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"", | ||
380 | rect.x0, rect.y0, rect.x1, rect.y1, ch->p.x, ch->p.y); | ||
381 | } | ||
382 | switch (ch->c) | ||
383 | { | ||
384 | case '<': fz_write_string(ctx, out, "<"); break; | ||
385 | case '>': fz_write_string(ctx, out, ">"); break; | ||
386 | case '&': fz_write_string(ctx, out, "&"); break; | ||
387 | case '"': fz_write_string(ctx, out, """); break; | ||
388 | case '\'': fz_write_string(ctx, out, "'"); break; | ||
389 | default: | ||
390 | if (ch->c >= 32 && ch->c <= 127) | ||
391 | fz_write_printf(ctx, out, "%c", ch->c); | ||
392 | else | ||
393 | fz_write_printf(ctx, out, "&#x%x;", ch->c); | ||
394 | break; | ||
395 | } | ||
396 | fz_write_string(ctx, out, "\"/>\n"); | ||
397 | } | 346 | } |
398 | if (style) | 347 | fz_write_string(ctx, out, "\"/>\n"); |
399 | fz_write_string(ctx, out, "</span>\n"); | ||
400 | } | 348 | } |
349 | |||
350 | if (font) | ||
351 | fz_write_string(ctx, out, "</font>\n"); | ||
352 | |||
401 | fz_write_string(ctx, out, "</line>\n"); | 353 | fz_write_string(ctx, out, "</line>\n"); |
402 | } | 354 | } |
403 | fz_write_string(ctx, out, "</block>\n"); | 355 | fz_write_string(ctx, out, "</block>\n"); |
404 | break; | 356 | break; |
405 | } | 357 | |
406 | case FZ_PAGE_BLOCK_IMAGE: | 358 | case FZ_STEXT_BLOCK_IMAGE: |
407 | { | 359 | fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n", |
360 | block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); | ||
408 | break; | 361 | break; |
409 | } | 362 | } |
410 | } | 363 | } |
411 | } | ||
412 | fz_write_string(ctx, out, "</page>\n"); | 364 | fz_write_string(ctx, out, "</page>\n"); |
413 | } | 365 | } |
414 | 366 | ||
@@ -417,31 +369,23 @@ fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) | |||
417 | void | 369 | void |
418 | fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) | 370 | fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) |
419 | { | 371 | { |
420 | fz_page_block *pblock; | 372 | fz_stext_block *block; |
373 | fz_stext_line *line; | ||
374 | fz_stext_char *ch; | ||
375 | char utf[10]; | ||
376 | int i, n; | ||
421 | 377 | ||
422 | for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock) | 378 | for (block = page->first_block; block; block = block->next) |
423 | { | 379 | { |
424 | if (pblock->type == FZ_PAGE_BLOCK_TEXT) | 380 | if (block->type == FZ_STEXT_BLOCK_TEXT) |
425 | { | 381 | { |
426 | fz_stext_block *block = pblock->u.text; | 382 | for (line = block->u.t.first_line; line; line = line->next) |
427 | fz_stext_line *line; | ||
428 | fz_stext_char *ch; | ||
429 | char utf[10]; | ||
430 | int i, n; | ||
431 | |||
432 | for (line = block->lines; line < block->lines + block->len; line++) | ||
433 | { | 383 | { |
434 | fz_stext_span *span; | 384 | for (ch = line->first_char; ch; ch = ch->next) |
435 | for (span = line->first_span; span; span = span->next) | ||
436 | { | 385 | { |
437 | if (span->spacing > 1) | 386 | n = fz_runetochar(utf, ch->c); |
438 | fz_write_byte(ctx, out, ' '); | 387 | for (i = 0; i < n; i++) |
439 | for (ch = span->text; ch < span->text + span->len; ch++) | 388 | fz_write_byte(ctx, out, utf[i]); |
440 | { | ||
441 | n = fz_runetochar(utf, ch->c); | ||
442 | for (i = 0; i < n; i++) | ||
443 | fz_write_byte(ctx, out, utf[i]); | ||
444 | } | ||
445 | } | 389 | } |
446 | fz_write_string(ctx, out, "\n"); | 390 | fz_write_string(ctx, out, "\n"); |
447 | } | 391 | } |
@@ -466,7 +410,6 @@ struct fz_text_writer_s | |||
466 | fz_document_writer super; | 410 | fz_document_writer super; |
467 | int format; | 411 | int format; |
468 | fz_stext_options opts; | 412 | fz_stext_options opts; |
469 | fz_stext_sheet *sheet; | ||
470 | fz_stext_page *page; | 413 | fz_stext_page *page; |
471 | fz_output *out; | 414 | fz_output *out; |
472 | }; | 415 | }; |
@@ -483,7 +426,7 @@ text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediab | |||
483 | } | 426 | } |
484 | 427 | ||
485 | wri->page = fz_new_stext_page(ctx, mediabox); | 428 | wri->page = fz_new_stext_page(ctx, mediabox); |
486 | return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts); | 429 | return fz_new_stext_device(ctx, wri->page, &wri->opts); |
487 | } | 430 | } |
488 | 431 | ||
489 | static void | 432 | static void |
@@ -537,7 +480,6 @@ text_drop_writer(fz_context *ctx, fz_document_writer *wri_) | |||
537 | { | 480 | { |
538 | fz_text_writer *wri = (fz_text_writer*)wri_; | 481 | fz_text_writer *wri = (fz_text_writer*)wri_; |
539 | fz_drop_stext_page(ctx, wri->page); | 482 | fz_drop_stext_page(ctx, wri->page); |
540 | fz_drop_stext_sheet(ctx, wri->sheet); | ||
541 | fz_drop_output(ctx, wri->out); | 483 | fz_drop_output(ctx, wri->out); |
542 | } | 484 | } |
543 | 485 | ||
@@ -561,7 +503,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const | |||
561 | else if (!strcmp(format, "stext")) | 503 | else if (!strcmp(format, "stext")) |
562 | wri->format = FZ_FORMAT_STEXT; | 504 | wri->format = FZ_FORMAT_STEXT; |
563 | 505 | ||
564 | wri->sheet = fz_new_stext_sheet(ctx); | ||
565 | wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); | 506 | wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); |
566 | 507 | ||
567 | switch (wri->format) | 508 | switch (wri->format) |
@@ -581,7 +522,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const | |||
581 | fz_catch(ctx) | 522 | fz_catch(ctx) |
582 | { | 523 | { |
583 | fz_drop_output(ctx, wri->out); | 524 | fz_drop_output(ctx, wri->out); |
584 | fz_drop_stext_sheet(ctx, wri->sheet); | ||
585 | fz_free(ctx, wri); | 525 | fz_free(ctx, wri); |
586 | fz_rethrow(ctx); | 526 | fz_rethrow(ctx); |
587 | } | 527 | } |
diff --git a/source/fitz/stext-paragraph.c b/source/fitz/stext-paragraph.c deleted file mode 100644 index e275ecae1..000000000 --- a/source/fitz/stext-paragraph.c +++ /dev/null | |||
@@ -1,1538 +0,0 @@ | |||
1 | #include "mupdf/fitz.h" | ||
2 | |||
3 | #include <string.h> | ||
4 | #include <assert.h> | ||
5 | #include <math.h> | ||
6 | |||
7 | /* Assemble span soup into blocks and lines. */ | ||
8 | |||
9 | #define MY_EPSILON 0.001f | ||
10 | |||
11 | #include <stdio.h> /* for debug printing */ | ||
12 | #undef DEBUG_LINE_HEIGHTS | ||
13 | #undef DEBUG_MASKS | ||
14 | #undef DEBUG_ALIGN | ||
15 | #undef DEBUG_INDENTS | ||
16 | |||
17 | #undef SPOT_LINE_NUMBERS | ||
18 | |||
19 | typedef struct line_height_s | ||
20 | { | ||
21 | float height; | ||
22 | int count; | ||
23 | fz_stext_style *style; | ||
24 | } line_height; | ||
25 | |||
26 | typedef struct line_heights_s | ||
27 | { | ||
28 | fz_context *ctx; | ||
29 | int cap; | ||
30 | int len; | ||
31 | line_height *lh; | ||
32 | } line_heights; | ||
33 | |||
34 | static line_heights * | ||
35 | new_line_heights(fz_context *ctx) | ||
36 | { | ||
37 | line_heights *lh = fz_malloc_struct(ctx, line_heights); | ||
38 | lh->ctx = ctx; | ||
39 | return lh; | ||
40 | } | ||
41 | |||
42 | static void | ||
43 | free_line_heights(line_heights *lh) | ||
44 | { | ||
45 | if (!lh) | ||
46 | return; | ||
47 | fz_free(lh->ctx, lh->lh); | ||
48 | fz_free(lh->ctx, lh); | ||
49 | } | ||
50 | |||
51 | static void | ||
52 | insert_line_height(line_heights *lh, fz_stext_style *style, float height) | ||
53 | { | ||
54 | int i; | ||
55 | |||
56 | #ifdef DEBUG_LINE_HEIGHTS | ||
57 | printf("style=%x height=%g\n", style, height); | ||
58 | #endif | ||
59 | |||
60 | /* If we have one already, add it in */ | ||
61 | for (i=0; i < lh->len; i++) | ||
62 | { | ||
63 | /* Match if we are within 5% */ | ||
64 | if (lh->lh[i].style == style && lh->lh[i].height * 0.95f <= height && lh->lh[i].height * 1.05f >= height) | ||
65 | { | ||
66 | /* Ensure that the average height is correct */ | ||
67 | lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1); | ||
68 | lh->lh[i].count++; | ||
69 | return; | ||
70 | } | ||
71 | } | ||
72 | |||
73 | /* Otherwise extend (if required) and add it */ | ||
74 | if (lh->cap == lh->len) | ||
75 | { | ||
76 | int newcap = (lh->cap ? lh->cap * 2 : 4); | ||
77 | lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height)); | ||
78 | lh->cap = newcap; | ||
79 | } | ||
80 | |||
81 | lh->lh[lh->len].count = 1; | ||
82 | lh->lh[lh->len].height = height; | ||
83 | lh->lh[lh->len].style = style; | ||
84 | lh->len++; | ||
85 | } | ||
86 | |||
87 | static void | ||
88 | cull_line_heights(line_heights *lh) | ||
89 | { | ||
90 | int i, j, k; | ||
91 | |||
92 | #ifdef DEBUG_LINE_HEIGHTS | ||
93 | printf("Before culling:\n"); | ||
94 | for (i = 0; i < lh->len; i++) | ||
95 | { | ||
96 | fz_stext_style *style = lh->lh[i].style; | ||
97 | printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count); | ||
98 | } | ||
99 | #endif | ||
100 | for (i = 0; i < lh->len; i++) | ||
101 | { | ||
102 | fz_stext_style *style = lh->lh[i].style; | ||
103 | int count = lh->lh[i].count; | ||
104 | int max = i; | ||
105 | |||
106 | /* Find the max for this style */ | ||
107 | for (j = i+1; j < lh->len; j++) | ||
108 | { | ||
109 | if (lh->lh[j].style == style && lh->lh[j].count > count) | ||
110 | { | ||
111 | max = j; | ||
112 | count = lh->lh[j].count; | ||
113 | } | ||
114 | } | ||
115 | |||
116 | /* Destroy all the ones other than the max */ | ||
117 | if (max != i) | ||
118 | { | ||
119 | lh->lh[i].count = count; | ||
120 | lh->lh[i].height = lh->lh[max].height; | ||
121 | lh->lh[max].count = 0; | ||
122 | } | ||
123 | j = i+1; | ||
124 | for (k = j; k < lh->len; k++) | ||
125 | { | ||
126 | if (lh->lh[k].style != style) | ||
127 | lh->lh[j++] = lh->lh[k]; | ||
128 | } | ||
129 | lh->len = j; | ||
130 | } | ||
131 | #ifdef DEBUG_LINE_HEIGHTS | ||
132 | printf("After culling:\n"); | ||
133 | for (i = 0; i < lh->len; i++) | ||
134 | { | ||
135 | fz_stext_style *style = lh->lh[i].style; | ||
136 | printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count); | ||
137 | } | ||
138 | #endif | ||
139 | } | ||
140 | |||
141 | static float | ||
142 | line_height_for_style(line_heights *lh, fz_stext_style *style) | ||
143 | { | ||
144 | int i; | ||
145 | |||
146 | for (i=0; i < lh->len; i++) | ||
147 | { | ||
148 | if (lh->lh[i].style == style) | ||
149 | return lh->lh[i].height; | ||
150 | } | ||
151 | return 0.0f; /* Never reached */ | ||
152 | } | ||
153 | |||
154 | static void | ||
155 | split_block(fz_context *ctx, fz_stext_page *page, int block_num, int linenum) | ||
156 | { | ||
157 | int split_len; | ||
158 | fz_stext_block *block, *block2; | ||
159 | |||
160 | if (page->len == page->cap) | ||
161 | { | ||
162 | int new_cap = fz_maxi(16, page->cap * 2); | ||
163 | page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks)); | ||
164 | page->cap = new_cap; | ||
165 | } | ||
166 | |||
167 | memmove(page->blocks+block_num+1, page->blocks+block_num, (page->len - block_num)*sizeof(*page->blocks)); | ||
168 | page->len++; | ||
169 | |||
170 | block2 = fz_malloc_struct(ctx, fz_stext_block); | ||
171 | block = page->blocks[block_num].u.text; | ||
172 | |||
173 | page->blocks[block_num+1].type = FZ_PAGE_BLOCK_TEXT; | ||
174 | page->blocks[block_num+1].u.text = block2; | ||
175 | split_len = block->len - linenum; | ||
176 | block2->bbox = block->bbox; /* FIXME! */ | ||
177 | block2->cap = 0; | ||
178 | block2->len = 0; | ||
179 | block2->lines = NULL; | ||
180 | block2->lines = fz_malloc_array(ctx, split_len, sizeof(fz_stext_line)); | ||
181 | block2->cap = block2->len; | ||
182 | block2->len = split_len; | ||
183 | block->len = linenum; | ||
184 | memcpy(block2->lines, block->lines + linenum, split_len * sizeof(fz_stext_line)); | ||
185 | block2->lines[0].distance = 0; | ||
186 | } | ||
187 | |||
188 | static inline int | ||
189 | is_unicode_wspace(int c) | ||
190 | { | ||
191 | return (c == 9 || /* TAB */ | ||
192 | c == 0x0a || /* HT */ | ||
193 | c == 0x0b || /* LF */ | ||
194 | c == 0x0c || /* VT */ | ||
195 | c == 0x0d || /* FF */ | ||
196 | c == 0x20 || /* CR */ | ||
197 | c == 0x85 || /* NEL */ | ||
198 | c == 0xA0 || /* No break space */ | ||
199 | c == 0x1680 || /* Ogham space mark */ | ||
200 | c == 0x180E || /* Mongolian Vowel Separator */ | ||
201 | c == 0x2000 || /* En quad */ | ||
202 | c == 0x2001 || /* Em quad */ | ||
203 | c == 0x2002 || /* En space */ | ||
204 | c == 0x2003 || /* Em space */ | ||
205 | c == 0x2004 || /* Three-per-Em space */ | ||
206 | c == 0x2005 || /* Four-per-Em space */ | ||
207 | c == 0x2006 || /* Five-per-Em space */ | ||
208 | c == 0x2007 || /* Figure space */ | ||
209 | c == 0x2008 || /* Punctuation space */ | ||
210 | c == 0x2009 || /* Thin space */ | ||
211 | c == 0x200A || /* Hair space */ | ||
212 | c == 0x2028 || /* Line separator */ | ||
213 | c == 0x2029 || /* Paragraph separator */ | ||
214 | c == 0x202F || /* Narrow no-break space */ | ||
215 | c == 0x205F || /* Medium mathematical space */ | ||
216 | c == 0x3000); /* Ideographic space */ | ||
217 | } | ||
218 | |||
219 | static inline int | ||
220 | is_unicode_bullet(int c) | ||
221 | { | ||
222 | /* The last 2 aren't strictly bullets, but will do for our usage here */ | ||
223 | return (c == 0x2022 || /* Bullet */ | ||
224 | c == 0x2023 || /* Triangular bullet */ | ||
225 | c == 0x25e6 || /* White bullet */ | ||
226 | c == 0x2043 || /* Hyphen bullet */ | ||
227 | c == 0x2219 || /* Bullet operator */ | ||
228 | c == 149 || /* Ascii bullet */ | ||
229 | c == '*'); | ||
230 | } | ||
231 | |||
232 | #ifdef SPOT_LINE_NUMBERS | ||
/*
	Report whether a character can form part of a (possibly dotted)
	number: an ASCII digit or a full stop.

	Returns 1 if so, 0 otherwise.
*/
static inline int
is_number(int c)
{
	if (c == '.')
		return 1;
	return (c >= '0' && c <= '9');
}
239 | |||
/*
	Report whether a character is an unaccented ASCII letter
	(A-Z or a-z).

	Returns 1 if so, 0 otherwise.
*/
static inline int
is_latin_char(int c)
{
	int upper = (c >= 'A' && c <= 'Z');
	int lower = (c >= 'a' && c <= 'z');

	return upper || lower;
}
246 | |||
/*
	Report whether a character is one of the letters used to write
	Roman numerals (I, V, X, L, C, D, M) in either case.

	Returns 1 if so, 0 otherwise.
*/
static inline int
is_roman(int c)
{
	switch (c)
	{
	case 'i': case 'I':
	case 'v': case 'V':
	case 'x': case 'X':
	case 'l': case 'L':
	case 'c': case 'C':
	case 'd': case 'D': /* 500; previously missing from the set */
	case 'm': case 'M':
		return 1;
	default:
		return 0;
	}
}
257 | #endif | ||
258 | |||
259 | static int | ||
260 | is_list_entry(fz_stext_line *line, fz_stext_span *span, int *char_num_ptr) | ||
261 | { | ||
262 | int char_num; | ||
263 | fz_stext_char *chr; | ||
264 | |||
265 | /* First, skip over any whitespace */ | ||
266 | for (char_num = 0; char_num < span->len; char_num++) | ||
267 | { | ||
268 | chr = &span->text[char_num]; | ||
269 | if (!is_unicode_wspace(chr->c)) | ||
270 | break; | ||
271 | } | ||
272 | *char_num_ptr = char_num; | ||
273 | |||
274 | if (span != line->first_span || char_num >= span->len) | ||
275 | return 0; | ||
276 | |||
277 | /* Now we check for various special cases, which we consider to mean | ||
278 | * that this is probably a list entry and therefore should always count | ||
279 | * as a separate paragraph (and hence not be entered in the line height | ||
280 | * table). */ | ||
281 | chr = &span->text[char_num]; | ||
282 | |||
283 | /* Is the first char on the line, a bullet point? */ | ||
284 | if (is_unicode_bullet(chr->c)) | ||
285 | return 1; | ||
286 | |||
287 | #ifdef SPOT_LINE_NUMBERS | ||
288 | /* Is the entire first span a number? Or does it start with a number | ||
289 | * followed by ) or : ? Allowed to involve single latin chars too. */ | ||
290 | if (is_number(chr->c) || is_latin_char(chr->c)) | ||
291 | { | ||
292 | int cn = char_num; | ||
293 | int met_char = is_latin_char(chr->c); | ||
294 | for (cn = char_num+1; cn < span->len; cn++) | ||
295 | { | ||
296 | fz_stext_char *chr2 = &span->text[cn]; | ||
297 | |||
298 | if (is_latin_char(chr2->c) && !met_char) | ||
299 | { | ||
300 | met_char = 1; | ||
301 | continue; | ||
302 | } | ||
303 | met_char = 0; | ||
304 | if (!is_number(chr2->c) && !is_unicode_wspace(chr2->c)) | ||
305 | break; | ||
306 | else if (chr2->c == ')' || chr2->c == ':') | ||
307 | { | ||
308 | cn = span->len; | ||
309 | break; | ||
310 | } | ||
311 | } | ||
312 | if (cn == span->len) | ||
313 | return 1; | ||
314 | } | ||
315 | |||
316 | /* Is the entire first span a roman numeral? Or does it start with | ||
317 | * a roman numeral followed by ) or : ? */ | ||
318 | if (is_roman(chr->c)) | ||
319 | { | ||
320 | int cn = char_num; | ||
321 | for (cn = char_num+1; cn < span->len; cn++) | ||
322 | { | ||
323 | fz_stext_char *chr2 = &span->text[cn]; | ||
324 | |||
325 | if (!is_roman(chr2->c) && !is_unicode_wspace(chr2->c)) | ||
326 | break; | ||
327 | else if (chr2->c == ')' || chr2->c == ':') | ||
328 | { | ||
329 | cn = span->len; | ||
330 | break; | ||
331 | } | ||
332 | } | ||
333 | if (cn == span->len) | ||
334 | return 1; | ||
335 | } | ||
336 | #endif | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | typedef struct region_masks_s region_masks; | ||
341 | |||
342 | typedef struct region_mask_s region_mask; | ||
343 | |||
344 | typedef struct region_s region; | ||
345 | |||
346 | struct region_s | ||
347 | { | ||
348 | float start; | ||
349 | float stop; | ||
350 | float ave_start; | ||
351 | float ave_stop; | ||
352 | int align; | ||
353 | float colw; | ||
354 | }; | ||
355 | |||
356 | struct region_mask_s | ||
357 | { | ||
358 | fz_context *ctx; | ||
359 | int freq; | ||
360 | fz_point blv; | ||
361 | int cap; | ||
362 | int len; | ||
363 | float size; | ||
364 | region *mask; | ||
365 | }; | ||
366 | |||
367 | struct region_masks_s | ||
368 | { | ||
369 | fz_context *ctx; | ||
370 | int cap; | ||
371 | int len; | ||
372 | region_mask **mask; | ||
373 | }; | ||
374 | |||
375 | static region_masks * | ||
376 | new_region_masks(fz_context *ctx) | ||
377 | { | ||
378 | region_masks *rms = fz_malloc_struct(ctx, region_masks); | ||
379 | rms->ctx = ctx; | ||
380 | rms->cap = 0; | ||
381 | rms->len = 0; | ||
382 | rms->mask = NULL; | ||
383 | return rms; | ||
384 | } | ||
385 | |||
386 | static void | ||
387 | free_region_mask(region_mask *rm) | ||
388 | { | ||
389 | if (!rm) | ||
390 | return; | ||
391 | fz_free(rm->ctx, rm->mask); | ||
392 | fz_free(rm->ctx, rm); | ||
393 | } | ||
394 | |||
395 | static void | ||
396 | free_region_masks(region_masks *rms) | ||
397 | { | ||
398 | int i; | ||
399 | |||
400 | if (!rms) | ||
401 | return; | ||
402 | for (i=0; i < rms->len; i++) | ||
403 | { | ||
404 | free_region_mask(rms->mask[i]); | ||
405 | } | ||
406 | fz_free(rms->ctx, rms->mask); | ||
407 | fz_free(rms->ctx, rms); | ||
408 | } | ||
409 | |||
410 | static int region_masks_mergeable(const region_mask *rm1, const region_mask *rm2, float *score) | ||
411 | { | ||
412 | int i1, i2; | ||
413 | int count = 0; | ||
414 | |||
415 | *score = 0; | ||
416 | if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON) | ||
417 | return 0; | ||
418 | |||
419 | for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; ) | ||
420 | { | ||
421 | if (rm1->mask[i1].stop < rm2->mask[i2].start) | ||
422 | { | ||
423 | /* rm1's region is entirely before rm2's */ | ||
424 | *score += rm1->mask[i1].stop - rm1->mask[i1].start; | ||
425 | i1++; | ||
426 | } | ||
427 | else if (rm1->mask[i1].start > rm2->mask[i2].stop) | ||
428 | { | ||
429 | /* rm2's region is entirely before rm1's */ | ||
430 | *score += rm2->mask[i2].stop - rm2->mask[i2].start; | ||
431 | i2++; | ||
432 | } | ||
433 | else | ||
434 | { | ||
435 | float lscore, rscore; | ||
436 | if (rm1->mask[i1].start < rm2->mask[i2].start) | ||
437 | { | ||
438 | if (i2 > 0 && rm2->mask[i2-1].stop >= rm1->mask[i1].start) | ||
439 | return 0; /* Not compatible */ | ||
440 | lscore = rm2->mask[i2].start - rm1->mask[i1].start; | ||
441 | } | ||
442 | else | ||
443 | { | ||
444 | if (i1 > 0 && rm1->mask[i1-1].stop >= rm2->mask[i2].start) | ||
445 | return 0; /* Not compatible */ | ||
446 | lscore = rm1->mask[i1].start - rm2->mask[i2].start; | ||
447 | } | ||
448 | if (rm1->mask[i1].stop > rm2->mask[i2].stop) | ||
449 | { | ||
450 | if (i2+1 < rm2->len && rm2->mask[i2+1].start <= rm1->mask[i1].stop) | ||
451 | return 0; /* Not compatible */ | ||
452 | rscore = rm1->mask[i1].stop - rm2->mask[i2].stop; | ||
453 | } | ||
454 | else | ||
455 | { | ||
456 | if (i1+1 < rm1->len && rm1->mask[i1+1].start <= rm2->mask[i2].stop) | ||
457 | return 0; /* Not compatible */ | ||
458 | rscore = rm2->mask[i2].stop - rm1->mask[i1].stop; | ||
459 | } | ||
460 | /* In order to allow a region to merge, either the | ||
461 | * left, the right, or the centre must agree */ | ||
462 | if (lscore < 1) | ||
463 | { | ||
464 | if (rscore < 1) | ||
465 | { | ||
466 | rscore = 0; | ||
467 | } | ||
468 | lscore = 0; | ||
469 | } | ||
470 | else if (rscore < 1) | ||
471 | { | ||
472 | rscore = 0; | ||
473 | } | ||
474 | else | ||
475 | { | ||
476 | /* Neither Left or right agree. Does the centre? */ | ||
477 | float ave1 = rm1->mask[i1].start + rm1->mask[i1].stop; | ||
478 | float ave2 = rm2->mask[i2].start + rm2->mask[i2].stop; | ||
479 | if (fabsf(ave1-ave2) > 1) | ||
480 | { | ||
481 | /* Nothing agrees, so don't merge */ | ||
482 | return 0; | ||
483 | } | ||
484 | lscore = 0; | ||
485 | rscore = 0; | ||
486 | } | ||
487 | *score += lscore + rscore; | ||
488 | /* These two regions could be merged */ | ||
489 | i1++; | ||
490 | i2++; | ||
491 | } | ||
492 | count++; | ||
493 | } | ||
494 | count += rm1->len-i1 + rm2->len-i2; | ||
495 | return count; | ||
496 | } | ||
497 | |||
498 | static int region_mask_matches(const region_mask *rm1, const region_mask *rm2, float *score) | ||
499 | { | ||
500 | int i1, i2; | ||
501 | int close = 1; | ||
502 | |||
503 | *score = 0; | ||
504 | if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON) | ||
505 | return 0; | ||
506 | |||
507 | for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; ) | ||
508 | { | ||
509 | if (rm1->mask[i1].stop < rm2->mask[i2].start) | ||
510 | { | ||
511 | /* rm1's region is entirely before rm2's */ | ||
512 | *score += rm1->mask[i1].stop - rm1->mask[i1].start; | ||
513 | i1++; | ||
514 | } | ||
515 | else if (rm1->mask[i1].start > rm2->mask[i2].stop) | ||
516 | { | ||
517 | /* Not compatible */ | ||
518 | return 0; | ||
519 | } | ||
520 | else | ||
521 | { | ||
522 | float lscore, rscore; | ||
523 | if (rm1->mask[i1].start > rm2->mask[i2].start) | ||
524 | { | ||
525 | /* Not compatible */ | ||
526 | return 0; | ||
527 | } | ||
528 | if (rm1->mask[i1].stop < rm2->mask[i2].stop) | ||
529 | { | ||
530 | /* Not compatible */ | ||
531 | return 0; | ||
532 | } | ||
533 | lscore = rm2->mask[i2].start - rm1->mask[i1].start; | ||
534 | rscore = rm1->mask[i1].stop - rm2->mask[i2].stop; | ||
535 | if (lscore < 1) | ||
536 | { | ||
537 | if (rscore < 1) | ||
538 | close++; | ||
539 | close++; | ||
540 | } | ||
541 | else if (rscore < 1) | ||
542 | close++; | ||
543 | else if (fabsf(lscore - rscore) < 1) | ||
544 | { | ||
545 | lscore = fabsf(lscore-rscore); | ||
546 | rscore = 0; | ||
547 | close++; | ||
548 | } | ||
549 | *score += lscore + rscore; | ||
550 | i1++; | ||
551 | i2++; | ||
552 | } | ||
553 | } | ||
554 | if (i1 < rm1->len) | ||
555 | { | ||
556 | /* Still more to go in rm1 */ | ||
557 | if (rm1->mask[i1].start < rm2->mask[rm2->len-1].stop) | ||
558 | return 0; | ||
559 | } | ||
560 | else if (i2 < rm2->len) | ||
561 | { | ||
562 | /* Still more to go in rm2 */ | ||
563 | if (rm2->mask[i2].start < rm1->mask[rm1->len-1].stop) | ||
564 | return 0; | ||
565 | } | ||
566 | |||
567 | return close; | ||
568 | } | ||
569 | |||
570 | static void region_mask_merge(region_mask *rm1, const region_mask *rm2, int newlen) | ||
571 | { | ||
572 | int o, i1, i2; | ||
573 | |||
574 | /* First, ensure that rm1 is long enough */ | ||
575 | if (rm1->cap < newlen) | ||
576 | { | ||
577 | int newcap = rm1->cap ? rm1->cap : 2; | ||
578 | do | ||
579 | { | ||
580 | newcap *= 2; | ||
581 | } | ||
582 | while (newcap < newlen); | ||
583 | rm1->mask = fz_resize_array(rm1->ctx, rm1->mask, newcap, sizeof(*rm1->mask)); | ||
584 | rm1->cap = newcap; | ||
585 | } | ||
586 | |||
587 | /* Now run backwards along rm1, filling it out with the merged regions */ | ||
588 | for (o = newlen-1, i1 = rm1->len-1, i2 = rm2->len-1; o >= 0; o--) | ||
589 | { | ||
590 | /* So we read from i1 and i2 and store in o */ | ||
591 | if (i1 < 0) | ||
592 | { | ||
593 | /* Just copy i2 */ | ||
594 | rm1->mask[o] = rm2->mask[i2]; | ||
595 | i2--; | ||
596 | } | ||
597 | else if (i2 < 0) | ||
598 | { | ||
599 | /* Just copy i1 */ | ||
600 | rm1->mask[o] = rm1->mask[i1]; | ||
601 | i1--; | ||
602 | } | ||
603 | else if (rm1->mask[i1].stop < rm2->mask[i2].start) | ||
604 | { | ||
605 | /* rm1's region is entirely before rm2's - copy rm2's */ | ||
606 | rm1->mask[o] = rm2->mask[i2]; | ||
607 | i2--; | ||
608 | } | ||
609 | else if (rm2->mask[i2].stop < rm1->mask[i1].start) | ||
610 | { | ||
611 | /* rm2's region is entirely before rm1's - copy rm1's */ | ||
612 | rm1->mask[o] = rm1->mask[i1]; | ||
613 | i1--; | ||
614 | } | ||
615 | else | ||
616 | { | ||
617 | /* We must be merging */ | ||
618 | rm1->mask[o].ave_start = (rm1->mask[i1].start * rm1->freq + rm2->mask[i2].start * rm2->freq)/(rm1->freq + rm2->freq); | ||
619 | rm1->mask[o].ave_stop = (rm1->mask[i1].stop * rm1->freq + rm2->mask[i2].stop * rm2->freq)/(rm1->freq + rm2->freq); | ||
620 | rm1->mask[o].start = fz_min(rm1->mask[i1].start, rm2->mask[i2].start); | ||
621 | rm1->mask[o].stop = fz_max(rm1->mask[i1].stop, rm2->mask[i2].stop); | ||
622 | i1--; | ||
623 | i2--; | ||
624 | } | ||
625 | } | ||
626 | rm1->freq += rm2->freq; | ||
627 | rm1->len = newlen; | ||
628 | } | ||
629 | |||
630 | static region_mask *region_masks_match(const region_masks *rms, const region_mask *rm, fz_stext_line *line, region_mask *prev_match) | ||
631 | { | ||
632 | int i; | ||
633 | float best_score = 9999999; | ||
634 | float score; | ||
635 | int best = -1; | ||
636 | int best_count = 0; | ||
637 | |||
638 | /* If the 'previous match' matches, use it regardless. */ | ||
639 | if (prev_match && region_mask_matches(prev_match, rm, &score)) | ||
640 | { | ||
641 | return prev_match; | ||
642 | } | ||
643 | |||
644 | /* Run through and find the 'most compatible' region mask. We are | ||
645 | * guaranteed that there will always be at least one compatible one! | ||
646 | */ | ||
647 | for (i=0; i < rms->len; i++) | ||
648 | { | ||
649 | int count = region_mask_matches(rms->mask[i], rm, &score); | ||
650 | if (count > best_count || (count == best_count && (score < best_score || best == -1))) | ||
651 | { | ||
652 | best = i; | ||
653 | best_score = score; | ||
654 | best_count = count; | ||
655 | } | ||
656 | } | ||
657 | assert(best >= 0 && best < rms->len); | ||
658 | |||
659 | /* So we have the matching mask. */ | ||
660 | return rms->mask[best]; | ||
661 | } | ||
662 | |||
#ifdef DEBUG_MASKS
/*
	Debug helper: print every region of a mask as "start->stop ",
	followed by the mask's frequency.
*/
static void
dump_region_mask(const region_mask *rm)
{
	const region *r = rm->mask;
	int remaining = rm->len;

	while (remaining-- > 0)
	{
		printf("%g->%g ", r->start, r->stop);
		r++;
	}
	printf("* %d\n", rm->freq);
}

/*
	Debug helper: print every mask in a list, one per line.
*/
static void
dump_region_masks(const region_masks *rms)
{
	int n = 0;

	while (n < rms->len)
	{
		dump_region_mask(rms->mask[n]);
		n++;
	}
}
#endif
687 | |||
688 | static void region_masks_add(region_masks *rms, region_mask *rm) | ||
689 | { | ||
690 | /* Add rm to rms */ | ||
691 | if (rms->len == rms->cap) | ||
692 | { | ||
693 | int newcap = (rms->cap ? rms->cap * 2 : 4); | ||
694 | rms->mask = fz_resize_array(rms->ctx, rms->mask, newcap, sizeof(*rms->mask)); | ||
695 | rms->cap = newcap; | ||
696 | } | ||
697 | rms->mask[rms->len] = rm; | ||
698 | rms->len++; | ||
699 | } | ||
700 | |||
701 | static void region_masks_sort(region_masks *rms) | ||
702 | { | ||
703 | int i, j; | ||
704 | |||
705 | /* First calculate sizes */ | ||
706 | for (i=0; i < rms->len; i++) | ||
707 | { | ||
708 | region_mask *rm = rms->mask[i]; | ||
709 | float size = 0; | ||
710 | for (j=0; j < rm->len; j++) | ||
711 | { | ||
712 | size += rm->mask[j].stop - rm->mask[j].start; | ||
713 | } | ||
714 | rm->size = size; | ||
715 | } | ||
716 | |||
717 | /* Now, sort on size */ | ||
718 | /* FIXME: bubble sort - use heapsort for efficiency */ | ||
719 | for (i=0; i < rms->len-1; i++) | ||
720 | { | ||
721 | for (j=i+1; j < rms->len; j++) | ||
722 | { | ||
723 | if (rms->mask[i]->size < rms->mask[j]->size) | ||
724 | { | ||
725 | region_mask *tmp = rms->mask[i]; | ||
726 | rms->mask[i] = rms->mask[j]; | ||
727 | rms->mask[j] = tmp; | ||
728 | } | ||
729 | } | ||
730 | } | ||
731 | } | ||
732 | |||
733 | static void region_masks_merge(region_masks *rms, region_mask *rm) | ||
734 | { | ||
735 | int i; | ||
736 | float best_score = 9999999; | ||
737 | float score; | ||
738 | int best = -1; | ||
739 | int best_count = 0; | ||
740 | |||
741 | #ifdef DEBUG_MASKS | ||
742 | printf("\nAdding:\n"); | ||
743 | dump_region_mask(rm); | ||
744 | printf("To:\n"); | ||
745 | dump_region_masks(rms); | ||
746 | #endif | ||
747 | for (i=0; i < rms->len; i++) | ||
748 | { | ||
749 | int count = region_masks_mergeable(rms->mask[i], rm, &score); | ||
750 | if (count && (score < best_score || best == -1)) | ||
751 | { | ||
752 | best = i; | ||
753 | best_count = count; | ||
754 | best_score = score; | ||
755 | } | ||
756 | } | ||
757 | if (best != -1) | ||
758 | { | ||
759 | region_mask_merge(rms->mask[best], rm, best_count); | ||
760 | #ifdef DEBUG_MASKS | ||
761 | printf("Merges to give:\n"); | ||
762 | dump_region_masks(rms); | ||
763 | #endif | ||
764 | free_region_mask(rm); | ||
765 | return; | ||
766 | } | ||
767 | region_masks_add(rms, rm); | ||
768 | #ifdef DEBUG_MASKS | ||
769 | printf("Adding new one to give:\n"); | ||
770 | dump_region_masks(rms); | ||
771 | #endif | ||
772 | } | ||
773 | |||
774 | static region_mask * | ||
775 | new_region_mask(fz_context *ctx, const fz_point *blv) | ||
776 | { | ||
777 | region_mask *rm = fz_malloc_struct(ctx, region_mask); | ||
778 | rm->ctx = ctx; | ||
779 | rm->freq = 1; | ||
780 | rm->blv = *blv; | ||
781 | rm->cap = 0; | ||
782 | rm->len = 0; | ||
783 | rm->mask = NULL; | ||
784 | return rm; | ||
785 | } | ||
786 | |||
787 | static void | ||
788 | region_mask_project(const region_mask *rm, const fz_point *min, const fz_point *max, float *start, float *end) | ||
789 | { | ||
790 | /* We project min and max down onto the blv */ | ||
791 | float s = min->x * rm->blv.x + min->y * rm->blv.y; | ||
792 | float e = max->x * rm->blv.x + max->y * rm->blv.y; | ||
793 | if (s > e) | ||
794 | { | ||
795 | *start = e; | ||
796 | *end = s; | ||
797 | } | ||
798 | else | ||
799 | { | ||
800 | *start = s; | ||
801 | *end = e; | ||
802 | } | ||
803 | } | ||
804 | |||
805 | static void | ||
806 | region_mask_add(region_mask *rm, const fz_point *min, const fz_point *max) | ||
807 | { | ||
808 | float start, end; | ||
809 | int i, j; | ||
810 | |||
811 | region_mask_project(rm, min, max, &start, &end); | ||
812 | |||
813 | /* Now add start/end into our region list. Typically we will be adding | ||
814 | * to the end of the region list, so search from there backwards. */ | ||
815 | for (i = rm->len; i > 0;) | ||
816 | { | ||
817 | if (start > rm->mask[i-1].stop) | ||
818 | break; | ||
819 | i--; | ||
820 | } | ||
821 | /* So we know that our interval can only affect list items >= i. | ||
822 | * We know that start is after our previous end. */ | ||
823 | if (i == rm->len || end < rm->mask[i].start) | ||
824 | { | ||
825 | /* Insert new one. No overlap. No merging */ | ||
826 | if (rm->len == rm->cap) | ||
827 | { | ||
828 | int newcap = (rm->cap ? rm->cap * 2 : 4); | ||
829 | rm->mask = fz_resize_array(rm->ctx, rm->mask, newcap, sizeof(*rm->mask)); | ||
830 | rm->cap = newcap; | ||
831 | } | ||
832 | if (rm->len > i) | ||
833 | memmove(&rm->mask[i+1], &rm->mask[i], (rm->len - i) * sizeof(*rm->mask)); | ||
834 | rm->mask[i].ave_start = start; | ||
835 | rm->mask[i].ave_stop = end; | ||
836 | rm->mask[i].start = start; | ||
837 | rm->mask[i].stop = end; | ||
838 | rm->len++; | ||
839 | } | ||
840 | else | ||
841 | { | ||
842 | /* Extend current one down. */ | ||
843 | rm->mask[i].ave_start = start; | ||
844 | rm->mask[i].start = start; | ||
845 | if (rm->mask[i].stop < end) | ||
846 | { | ||
847 | rm->mask[i].stop = end; | ||
848 | rm->mask[i].ave_stop = end; | ||
849 | /* Our region may now extend upwards too far */ | ||
850 | i++; | ||
851 | j = i; | ||
852 | while (j < rm->len && rm->mask[j].start <= end) | ||
853 | { | ||
854 | rm->mask[i-1].stop = end = rm->mask[j].stop; | ||
855 | j++; | ||
856 | } | ||
857 | if (i != j) | ||
858 | { | ||
859 | /* Move everything from j down to i */ | ||
860 | while (j < rm->len) | ||
861 | { | ||
862 | rm->mask[i++] = rm->mask[j++]; | ||
863 | } | ||
864 | } | ||
865 | rm->len -= j-i; | ||
866 | } | ||
867 | } | ||
868 | } | ||
869 | |||
870 | static int | ||
871 | region_mask_column(region_mask *rm, const fz_point *min, const fz_point *max, int *align, float *colw, float *left_) | ||
872 | { | ||
873 | float start, end, left, right; | ||
874 | int i; | ||
875 | |||
876 | region_mask_project(rm, min, max, &start, &end); | ||
877 | |||
878 | for (i = 0; i < rm->len; i++) | ||
879 | { | ||
880 | /* The use of MY_EPSILON here is because we might be matching | ||
881 | * start/end values calculated with slightly different blv's */ | ||
882 | if (rm->mask[i].start - MY_EPSILON <= start && rm->mask[i].stop + MY_EPSILON >= end) | ||
883 | break; | ||
884 | } | ||
885 | if (i >= rm->len) | ||
886 | { | ||
887 | *align = 0; | ||
888 | *colw = 0; | ||
889 | return 0; | ||
890 | } | ||
891 | left = start - rm->mask[i].start; | ||
892 | right = rm->mask[i].stop - end; | ||
893 | if (left < 1 && right < 1) | ||
894 | *align = rm->mask[i].align; | ||
895 | else if (left*2 <= right) | ||
896 | *align = 0; /* Left */ | ||
897 | else if (right * 2 < left) | ||
898 | *align = 2; /* Right */ | ||
899 | else | ||
900 | *align = 1; | ||
901 | *left_ = left; | ||
902 | *colw = rm->mask[i].colw; | ||
903 | return i; | ||
904 | } | ||
905 | |||
906 | static void | ||
907 | region_mask_alignment(region_mask *rm) | ||
908 | { | ||
909 | int i; | ||
910 | float width = 0; | ||
911 | |||
912 | for (i = 0; i < rm->len; i++) | ||
913 | { | ||
914 | width += rm->mask[i].stop - rm->mask[i].start; | ||
915 | } | ||
916 | for (i = 0; i < rm->len; i++) | ||
917 | { | ||
918 | region *r = &rm->mask[i]; | ||
919 | float left = r->ave_start - r->start; | ||
920 | float right = r->stop - r->ave_stop; | ||
921 | if (left*2 <= right) | ||
922 | r->align = 0; /* Left */ | ||
923 | else if (right * 2 < left) | ||
924 | r->align = 2; /* Right */ | ||
925 | else | ||
926 | r->align = 1; | ||
927 | r->colw = 100 * (rm->mask[i].stop - rm->mask[i].start) / width; | ||
928 | } | ||
929 | } | ||
930 | |||
931 | static void | ||
932 | region_masks_alignment(region_masks *rms) | ||
933 | { | ||
934 | int i; | ||
935 | |||
936 | for (i = 0; i < rms->len; i++) | ||
937 | { | ||
938 | region_mask_alignment(rms->mask[i]); | ||
939 | } | ||
940 | } | ||
941 | |||
/*
	Report whether a code point is a hyphen that may mark a word
	broken across lines.

	Returns 1 if so, 0 otherwise. 0x2011 (Non breaking hyphen) and
	0x2043 (Hyphen Bullet) are deliberately not in the list.
*/
static int
is_unicode_hyphen(int c)
{
	switch (c)
	{
	case 0x002d: /* Hyphen-Minus ('-') */
	case 0x00ad: /* Soft hyphen */
	case 0x058a: /* Armenian Hyphen */
	case 0x1400: /* Canadian Syllabics Hyphen */
	case 0x1806: /* Mongolian Todo soft hyphen */
	case 0x2010: /* Hyphen */
		return 1;
	default:
		return 0;
	}
}
955 | |||
/*
	Report whether a code point is a letter that can plausibly sit on
	either side of a line-breaking hyphen.

	Returns 1 if so, 0 otherwise.
*/
static int
is_unicode_hyphenatable(int c)
{
	/* This is a pretty ad-hoc collection. It may need tuning.
	 * Each entry is an inclusive { first, last } range. */
	static const int ranges[][2] = {
		{ 'A', 'Z' },
		{ 'a', 'z' },
		{ 0x00c0, 0x00d6 },
		{ 0x00d8, 0x00f6 },
		{ 0x00f8, 0x02af },
		{ 0x1d00, 0x1dbf },
		{ 0x1e00, 0x1eff },
		{ 0x2c60, 0x2c7f },
		{ 0xa722, 0xa78e },
		{ 0xa790, 0xa793 },
		{ 0xa7a8, 0xa7af },
		{ 0xfb00, 0xfb07 },
		{ 0xff21, 0xff3a },
		{ 0xff41, 0xff5a },
	};
	unsigned int k;

	for (k = 0; k < sizeof(ranges) / sizeof(ranges[0]); k++)
	{
		if (c >= ranges[k][0] && c <= ranges[k][1])
			return 1;
	}
	return 0;
}
975 | |||
976 | static void | ||
977 | dehyphenate(fz_stext_span *s1, fz_stext_span *s2) | ||
978 | { | ||
979 | int i; | ||
980 | |||
981 | for (i = s1->len-1; i > 0; i--) | ||
982 | if (!is_unicode_wspace(s1->text[i].c)) | ||
983 | break; | ||
984 | /* Can't leave an empty span. */ | ||
985 | if (i == 0) | ||
986 | return; | ||
987 | |||
988 | if (!is_unicode_hyphen(s1->text[i].c)) | ||
989 | return; | ||
990 | if (!is_unicode_hyphenatable(s1->text[i-1].c)) | ||
991 | return; | ||
992 | if (!is_unicode_hyphenatable(s2->text[0].c)) | ||
993 | return; | ||
994 | s1->len = i; | ||
995 | s2->spacing = 0; | ||
996 | } | ||
997 | |||
#ifdef DEBUG_ALIGN
/*
	Debug helper for a single span. Currently prints nothing.
*/
static void
dump_span(fz_stext_span *span)
{
}

/*
	Debug helper: print a line's distance followed by each of its
	spans (via dump_span) and a newline. A NULL line prints nothing.
*/
static void
dump_line(fz_stext_line *line)
{
	fz_stext_span *cur;

	if (line == NULL)
		return;

	printf("d=%g: ", line->distance);
	for (cur = line->first_span; cur != NULL; cur = cur->next)
		dump_span(cur);
	printf("\n");
}
#endif
1023 | |||
1024 | void | ||
1025 | fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page) | ||
1026 | { | ||
1027 | fz_stext_line *line; | ||
1028 | fz_stext_span *span; | ||
1029 | line_heights *lh; | ||
1030 | region_masks *rms; | ||
1031 | int block_num; | ||
1032 | |||
1033 | /* Simple paragraph analysis; look for the most common 'inter line' | ||
1034 | * spacing. This will be assumed to be our line spacing. Anything | ||
1035 | * more than 25% wider than this will be assumed to be a paragraph | ||
1036 | * space. */ | ||
1037 | |||
1038 | /* Step 1: Gather the line height information */ | ||
1039 | lh = new_line_heights(ctx); | ||
1040 | for (block_num = 0; block_num < page->len; block_num++) | ||
1041 | { | ||
1042 | fz_stext_block *block; | ||
1043 | |||
1044 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
1045 | continue; | ||
1046 | block = page->blocks[block_num].u.text; | ||
1047 | |||
1048 | for (line = block->lines; line < block->lines + block->len; line++) | ||
1049 | { | ||
1050 | /* For every style in the line, add lineheight to the | ||
1051 | * record for that style. FIXME: This is a nasty n^2 | ||
1052 | * algorithm at the moment. */ | ||
1053 | fz_stext_style *style = NULL; | ||
1054 | |||
1055 | if (line->distance == 0) | ||
1056 | continue; | ||
1057 | |||
1058 | for (span = line->first_span; span; span = span->next) | ||
1059 | { | ||
1060 | int char_num; | ||
1061 | |||
1062 | if (is_list_entry(line, span, &char_num)) | ||
1063 | goto list_entry; | ||
1064 | |||
1065 | for (; char_num < span->len; char_num++) | ||
1066 | { | ||
1067 | fz_stext_char *chr = &span->text[char_num]; | ||
1068 | |||
1069 | /* Ignore any whitespace chars */ | ||
1070 | if (is_unicode_wspace(chr->c)) | ||
1071 | continue; | ||
1072 | |||
1073 | if (chr->style != style) | ||
1074 | { | ||
1075 | /* Have we had this style before? */ | ||
1076 | int match = 0; | ||
1077 | fz_stext_span *span2; | ||
1078 | for (span2 = line->first_span; span2 != span; span2 = span2->next) | ||
1079 | { | ||
1080 | int char_num2; | ||
1081 | for (char_num2 = 0; char_num2 < span2->len; char_num2++) | ||
1082 | { | ||
1083 | fz_stext_char *chr2 = &span2->text[char_num2]; | ||
1084 | if (chr2->style == chr->style) | ||
1085 | { | ||
1086 | match = 1; | ||
1087 | break; | ||
1088 | } | ||
1089 | } | ||
1090 | } | ||
1091 | if (char_num > 0 && match == 0) | ||
1092 | { | ||
1093 | fz_stext_span *span2 = span; | ||
1094 | int char_num2; | ||
1095 | for (char_num2 = 0; char_num2 < char_num; char_num2++) | ||
1096 | { | ||
1097 | fz_stext_char *chr2 = &span2->text[char_num2]; | ||
1098 | if (chr2->style == chr->style) | ||
1099 | { | ||
1100 | match = 1; | ||
1101 | break; | ||
1102 | } | ||
1103 | } | ||
1104 | } | ||
1105 | if (match == 0) | ||
1106 | insert_line_height(lh, chr->style, line->distance); | ||
1107 | style = chr->style; | ||
1108 | } | ||
1109 | } | ||
1110 | list_entry: | ||
1111 | {} | ||
1112 | } | ||
1113 | } | ||
1114 | } | ||
1115 | |||
1116 | /* Step 2: Find the most popular line height for each style */ | ||
1117 | cull_line_heights(lh); | ||
1118 | |||
1119 | /* Step 3: Run through the blocks, breaking each block into two if | ||
1120 | * the line height isn't right. */ | ||
1121 | for (block_num = 0; block_num < page->len; block_num++) | ||
1122 | { | ||
1123 | int line_num; | ||
1124 | fz_stext_block *block; | ||
1125 | |||
1126 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
1127 | continue; | ||
1128 | block = page->blocks[block_num].u.text; | ||
1129 | |||
1130 | for (line_num = 0; line_num < block->len; line_num++) | ||
1131 | { | ||
1132 | /* For every style in the line, check to see if lineheight | ||
1133 | * is correct for that style. FIXME: We check each style | ||
1134 | * more than once, currently. */ | ||
1135 | int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. */ | ||
1136 | fz_stext_style *style = NULL; | ||
1137 | line = &block->lines[line_num]; | ||
1138 | |||
1139 | if (line->distance == 0) | ||
1140 | continue; | ||
1141 | |||
1142 | #ifdef DEBUG_LINE_HEIGHTS | ||
1143 | printf("line height=%g\n", line->distance); | ||
1144 | #endif | ||
1145 | for (span = line->first_span; span; span = span->next) | ||
1146 | { | ||
1147 | int char_num; | ||
1148 | |||
1149 | if (is_list_entry(line, span, &char_num)) | ||
1150 | goto force_paragraph; | ||
1151 | |||
1152 | /* Now we do the rest of the line */ | ||
1153 | for (; char_num < span->len; char_num++) | ||
1154 | { | ||
1155 | fz_stext_char *chr = &span->text[char_num]; | ||
1156 | |||
1157 | /* Ignore any whitespace chars */ | ||
1158 | if (is_unicode_wspace(chr->c)) | ||
1159 | continue; | ||
1160 | |||
1161 | if (chr->style != style) | ||
1162 | { | ||
1163 | float proper_step = line_height_for_style(lh, chr->style); | ||
1164 | if (proper_step * 0.95f <= line->distance && line->distance <= proper_step * 1.05f) | ||
1165 | { | ||
1166 | ok = 1; | ||
1167 | break; | ||
1168 | } | ||
1169 | style = chr->style; | ||
1170 | } | ||
1171 | } | ||
1172 | if (ok) | ||
1173 | break; | ||
1174 | } | ||
1175 | if (!ok) | ||
1176 | { | ||
1177 | force_paragraph: | ||
1178 | split_block(ctx, page, block_num, line_num); | ||
1179 | break; | ||
1180 | } | ||
1181 | } | ||
1182 | } | ||
1183 | free_line_heights(lh); | ||
1184 | |||
1185 | /* Simple line region analysis: | ||
1186 | * For each line: | ||
1187 | * form a list of 'start/stop' points (henceforth a 'region mask') | ||
1188 | * find the normalised baseline vector for the line. | ||
1189 | * Store the region mask and baseline vector. | ||
1190 | * Collate lines that have compatible region masks and identical | ||
1191 | * baseline vectors. | ||
1192 | * If the collated masks are column-like, then split into columns. | ||
1193 | * Otherwise split into tables. | ||
1194 | */ | ||
1195 | rms = new_region_masks(ctx); | ||
1196 | |||
1197 | /* Step 1: Form the region masks and store them into a list with the | ||
1198 | * normalised baseline vectors. */ | ||
1199 | for (block_num = 0; block_num < page->len; block_num++) | ||
1200 | { | ||
1201 | fz_stext_block *block; | ||
1202 | |||
1203 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
1204 | continue; | ||
1205 | block = page->blocks[block_num].u.text; | ||
1206 | |||
1207 | for (line = block->lines; line < block->lines + block->len; line++) | ||
1208 | { | ||
1209 | fz_point blv; | ||
1210 | region_mask *rm; | ||
1211 | |||
1212 | #ifdef DEBUG_MASKS | ||
1213 | printf("Line: "); | ||
1214 | dump_line(line); | ||
1215 | #endif | ||
1216 | blv = line->first_span->max; | ||
1217 | blv.x -= line->first_span->min.x; | ||
1218 | blv.y -= line->first_span->min.y; | ||
1219 | fz_normalize_vector(&blv); | ||
1220 | |||
1221 | rm = new_region_mask(ctx, &blv); | ||
1222 | for (span = line->first_span; span; span = span->next) | ||
1223 | { | ||
1224 | fz_point *region_min = &span->min; | ||
1225 | fz_point *region_max = &span->max; | ||
1226 | |||
1227 | /* Treat adjacent spans as one big region */ | ||
1228 | while (span->next && span->next->spacing < 1.5f) | ||
1229 | { | ||
1230 | span = span->next; | ||
1231 | region_max = &span->max; | ||
1232 | } | ||
1233 | |||
1234 | region_mask_add(rm, region_min, region_max); | ||
1235 | } | ||
1236 | #ifdef DEBUG_MASKS | ||
1237 | dump_region_mask(rm); | ||
1238 | #endif | ||
1239 | region_masks_add(rms, rm); | ||
1240 | } | ||
1241 | } | ||
1242 | |||
1243 | /* Step 2: Sort the region_masks by size of masked region */ | ||
1244 | region_masks_sort(rms); | ||
1245 | |||
1246 | #ifdef DEBUG_MASKS | ||
1247 | printf("Sorted list of regions:\n"); | ||
1248 | dump_region_masks(rms); | ||
1249 | #endif | ||
1250 | /* Step 3: Merge the region masks where possible (large ones first) */ | ||
1251 | { | ||
1252 | int i; | ||
1253 | region_masks *rms2; | ||
1254 | rms2 = new_region_masks(ctx); | ||
1255 | for (i=0; i < rms->len; i++) | ||
1256 | { | ||
1257 | region_mask *rm = rms->mask[i]; | ||
1258 | rms->mask[i] = NULL; | ||
1259 | region_masks_merge(rms2, rm); | ||
1260 | } | ||
1261 | free_region_masks(rms); | ||
1262 | rms = rms2; | ||
1263 | } | ||
1264 | |||
1265 | #ifdef DEBUG_MASKS | ||
1266 | printf("Merged list of regions:\n"); | ||
1267 | dump_region_masks(rms); | ||
1268 | #endif | ||
1269 | |||
1270 | /* Step 4: Figure out alignment */ | ||
1271 | region_masks_alignment(rms); | ||
1272 | |||
1273 | /* Step 5: At this point, we should probably look at the region masks | ||
1274 | * to try to guess which ones represent columns on the page. With our | ||
1275 | * current code, we could only get blocks of lines that span 2 or more | ||
1276 | * columns if the PDF producer wrote text out horizontally across 2 | ||
1277 | * or more columns, and we've never seen that (yet!). So we skip this | ||
1278 | * step for now. */ | ||
1279 | |||
1280 | /* Step 6: Run through the lines again, deciding which ones fit into | ||
1281 | * which region mask. */ | ||
1282 | { | ||
1283 | region_mask *prev_match = NULL; | ||
1284 | for (block_num = 0; block_num < page->len; block_num++) | ||
1285 | { | ||
1286 | fz_stext_block *block; | ||
1287 | |||
1288 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
1289 | continue; | ||
1290 | block = page->blocks[block_num].u.text; | ||
1291 | |||
1292 | for (line = block->lines; line < block->lines + block->len; line++) | ||
1293 | { | ||
1294 | fz_point blv; | ||
1295 | region_mask *rm; | ||
1296 | region_mask *match; | ||
1297 | |||
1298 | blv = line->first_span->max; | ||
1299 | blv.x -= line->first_span->min.x; | ||
1300 | blv.y -= line->first_span->min.y; | ||
1301 | fz_normalize_vector(&blv); | ||
1302 | |||
1303 | #ifdef DEBUG_MASKS | ||
1304 | dump_line(line); | ||
1305 | #endif | ||
1306 | rm = new_region_mask(ctx, &blv); | ||
1307 | for (span = line->first_span; span; span = span->next) | ||
1308 | { | ||
1309 | fz_point *region_min = &span->min; | ||
1310 | fz_point *region_max = &span->max; | ||
1311 | |||
1312 | /* Treat adjacent spans as one big region */ | ||
1313 | while (span->next && span->next->spacing < 1.5f) | ||
1314 | { | ||
1315 | span = span->next; | ||
1316 | region_max = &span->max; | ||
1317 | } | ||
1318 | |||
1319 | region_mask_add(rm, region_min, region_max); | ||
1320 | } | ||
1321 | #ifdef DEBUG_MASKS | ||
1322 | printf("Mask: "); | ||
1323 | dump_region_mask(rm); | ||
1324 | #endif | ||
1325 | match = region_masks_match(rms, rm, line, prev_match); | ||
1326 | prev_match = match; | ||
1327 | #ifdef DEBUG_MASKS | ||
1328 | printf("Matches: "); | ||
1329 | dump_region_mask(match); | ||
1330 | #endif | ||
1331 | free_region_mask(rm); | ||
1332 | span = line->first_span; | ||
1333 | while (span) | ||
1334 | { | ||
1335 | fz_point *region_min = &span->min; | ||
1336 | fz_point *region_max = &span->max; | ||
1337 | fz_stext_span *sn; | ||
1338 | int col, align; | ||
1339 | float colw, left; | ||
1340 | |||
1341 | /* Treat adjacent spans as one big region */ | ||
1342 | #ifdef DEBUG_ALIGN | ||
1343 | dump_span(span); | ||
1344 | #endif | ||
1345 | for (sn = span->next; sn && sn->spacing < 1.5f; sn = sn->next) | ||
1346 | { | ||
1347 | region_max = &sn->max; | ||
1348 | #ifdef DEBUG_ALIGN | ||
1349 | dump_span(sn); | ||
1350 | #endif | ||
1351 | } | ||
1352 | col = region_mask_column(match, region_min, region_max, &align, &colw, &left); | ||
1353 | #ifdef DEBUG_ALIGN | ||
1354 | printf(" = col%d colw=%g align=%d\n", col, colw, align); | ||
1355 | #endif | ||
1356 | do | ||
1357 | { | ||
1358 | span->column = col; | ||
1359 | span->align = align; | ||
1360 | span->indent = left; | ||
1361 | span->column_width = colw; | ||
1362 | span = span->next; | ||
1363 | } | ||
1364 | while (span != sn); | ||
1365 | |||
1366 | if (span) | ||
1367 | span = span->next; | ||
1368 | } | ||
1369 | line->region = match; | ||
1370 | } | ||
1371 | } | ||
1372 | free_region_masks(rms); | ||
1373 | } | ||
1374 | |||
1375 | /* Step 7: Collate lines within a block that share the same region | ||
1376 | * mask. */ | ||
1377 | for (block_num = 0; block_num < page->len; block_num++) | ||
1378 | { | ||
1379 | int line_num; | ||
1380 | int prev_line_num; | ||
1381 | |||
1382 | fz_stext_block *block; | ||
1383 | |||
1384 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
1385 | continue; | ||
1386 | block = page->blocks[block_num].u.text; | ||
1387 | |||
1388 | /* First merge lines. This may leave empty lines behind. */ | ||
1389 | for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++) | ||
1390 | { | ||
1391 | fz_stext_line *prev_line; | ||
1392 | line = &block->lines[line_num]; | ||
1393 | if (!line->first_span) | ||
1394 | continue; | ||
1395 | prev_line = &block->lines[prev_line_num]; | ||
1396 | if (prev_line->region == line->region) | ||
1397 | { | ||
1398 | /* We only merge lines if the second line | ||
1399 | * only uses 1 of the columns. */ | ||
1400 | int col = line->first_span->column; | ||
1401 | /* Copy the left value for the first span | ||
1402 | * in the first column in this line forward | ||
1403 | * for all the rest of the spans in the same | ||
1404 | * column. */ | ||
1405 | float indent = line->first_span->indent; | ||
1406 | for (span = line->first_span->next; span; span = span->next) | ||
1407 | { | ||
1408 | if (col != span->column) | ||
1409 | break; | ||
1410 | span->indent = indent; | ||
1411 | } | ||
1412 | if (span) | ||
1413 | { | ||
1414 | prev_line_num = line_num; | ||
1415 | continue; | ||
1416 | } | ||
1417 | |||
1418 | /* Merge line into prev_line */ | ||
1419 | { | ||
1420 | fz_stext_span **prev_line_span = &prev_line->first_span; | ||
1421 | int try_dehyphen = -1; | ||
1422 | fz_stext_span *prev_span = NULL; | ||
1423 | span = line->first_span; | ||
1424 | while (span && *prev_line_span) | ||
1425 | { | ||
1426 | /* Skip forwards through the original | ||
1427 | * line, until we find a place where | ||
1428 | * span should go. */ | ||
1429 | if ((*prev_line_span)->column <= span->column) | ||
1430 | { | ||
1431 | /* The current span we are considering | ||
1432 | * in prev_line is earlier than span. | ||
1433 | * Just skip forwards in prev_line. */ | ||
1434 | prev_span = (*prev_line_span); | ||
1435 | prev_line_span = &prev_span->next; | ||
1436 | try_dehyphen = span->column; | ||
1437 | } | ||
1438 | else | ||
1439 | { | ||
1440 | /* We want to copy span into prev_line. */ | ||
1441 | fz_stext_span *next = (*prev_line_span)->next; | ||
1442 | |||
1443 | if (prev_line_span == &prev_line->first_span) | ||
1444 | prev_line->first_span = span; | ||
1445 | if (next == NULL) | ||
1446 | prev_line->last_span = span; | ||
1447 | if (try_dehyphen == span->column) | ||
1448 | dehyphenate(prev_span, span); | ||
1449 | try_dehyphen = -1; | ||
1450 | prev_span = *prev_line_span = span; | ||
1451 | span = span->next; | ||
1452 | (*prev_line_span)->next = next; | ||
1453 | prev_line_span = &(*prev_line_span)->next; | ||
1454 | } | ||
1455 | } | ||
1456 | if (span) | ||
1457 | { | ||
1458 | *prev_line_span = span; | ||
1459 | prev_line->last_span = line->last_span; | ||
1460 | } | ||
1461 | |||
1462 | line->first_span = NULL; | ||
1463 | line->last_span = NULL; | ||
1464 | } | ||
1465 | } | ||
1466 | else | ||
1467 | prev_line_num = line_num; | ||
1468 | } | ||
1469 | |||
1470 | /* Now get rid of the empty lines */ | ||
1471 | for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++) | ||
1472 | { | ||
1473 | line = &block->lines[line_num]; | ||
1474 | if (line->first_span) | ||
1475 | block->lines[prev_line_num++] = *line; | ||
1476 | } | ||
1477 | block->len = prev_line_num; | ||
1478 | |||
1479 | /* Now try to spot indents */ | ||
1480 | for (line_num = 0; line_num < block->len; line_num++) | ||
1481 | { | ||
1482 | fz_stext_span *span_num, *sn; | ||
1483 | int col, count; | ||
1484 | line = &block->lines[line_num]; | ||
1485 | |||
1486 | /* Run through the spans... */ | ||
1487 | span_num = line->first_span; | ||
1488 | { | ||
1489 | float indent = 0; | ||
1490 | /* For each set of spans that share the same | ||
1491 | * column... */ | ||
1492 | col = span_num->column; | ||
1493 | #ifdef DEBUG_INDENTS | ||
1494 | printf("Indent %g: ", span_num->indent); | ||
1495 | dump_span(span_num); | ||
1496 | printf("\n"); | ||
1497 | #endif | ||
1498 | |||
1499 | /* find the average indent of all but the first.. */ | ||
1500 | for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++) | ||
1501 | { | ||
1502 | #ifdef DEBUG_INDENTS | ||
1503 | printf("Indent %g: ", sn->indent); | ||
1504 | dump_span(sn); | ||
1505 | printf("\n"); | ||
1506 | #endif | ||
1507 | indent += sn->indent; | ||
1508 | sn->indent = 0; | ||
1509 | } | ||
1510 | if (sn != span_num->next) | ||
1511 | indent /= count; | ||
1512 | |||
1513 | /* And compare this indent with the first one... */ | ||
1514 | #ifdef DEBUG_INDENTS | ||
1515 | printf("Average indent %g ", indent); | ||
1516 | #endif | ||
1517 | indent -= span_num->indent; | ||
1518 | #ifdef DEBUG_INDENTS | ||
1519 | printf("delta %g ", indent); | ||
1520 | #endif | ||
1521 | if (fabsf(indent) < 1) | ||
1522 | { | ||
1523 | /* No indent worth speaking of */ | ||
1524 | indent = 0; | ||
1525 | } | ||
1526 | #ifdef DEBUG_INDENTS | ||
1527 | printf("recorded %g\n", indent); | ||
1528 | #endif | ||
1529 | span_num->indent = indent; | ||
1530 | span_num = sn; | ||
1531 | } | ||
1532 | for (; span_num; span_num = span_num->next) | ||
1533 | { | ||
1534 | span_num->indent = 0; | ||
1535 | } | ||
1536 | } | ||
1537 | } | ||
1538 | } | ||
diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c index 00705208f..6c30ea29a 100644 --- a/source/fitz/stext-search.c +++ b/source/fitz/stext-search.c | |||
@@ -18,30 +18,28 @@ static inline int iswhite(int c) | |||
18 | 18 | ||
19 | fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx) | 19 | fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx) |
20 | { | 20 | { |
21 | int block_num; | 21 | fz_stext_block *block; |
22 | fz_stext_line *line; | ||
23 | fz_stext_char *ch; | ||
22 | int ofs = 0; | 24 | int ofs = 0; |
23 | 25 | ||
24 | for (block_num = 0; block_num < page->len; block_num++) | 26 | for (block = page->first_block; block; block = block->next) |
25 | { | 27 | { |
26 | fz_stext_block *block; | 28 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
27 | fz_stext_line *line; | ||
28 | fz_stext_span *span; | ||
29 | |||
30 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
31 | continue; | 29 | continue; |
32 | block = page->blocks[block_num].u.text; | 30 | for (line = block->u.t.first_line; line; line = line->next) |
33 | for (line = block->lines; line < block->lines + block->len; line++) | ||
34 | { | 31 | { |
35 | for (span = line->first_span; span; span = span->next) | 32 | for (ch = line->first_char; ch; ch = ch->next) |
36 | { | 33 | { |
37 | if (idx < ofs + span->len) | 34 | if (ofs == idx) |
38 | { | 35 | { |
39 | cab->c = span->text[idx - ofs].c; | 36 | cab->c = ch->c; |
40 | fz_stext_char_bbox(ctx, &cab->bbox, span, idx - ofs); | 37 | fz_stext_char_bbox(ctx, &cab->bbox, line, ch); |
41 | return cab; | 38 | return cab; |
42 | } | 39 | } |
43 | ofs += span->len; | 40 | ++ofs; |
44 | } | 41 | } |
42 | |||
45 | /* pseudo-newline */ | 43 | /* pseudo-newline */ |
46 | if (idx == ofs) | 44 | if (idx == ofs) |
47 | { | 45 | { |
@@ -49,7 +47,7 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex | |||
49 | cab->c = ' '; | 47 | cab->c = ' '; |
50 | return cab; | 48 | return cab; |
51 | } | 49 | } |
52 | ofs++; | 50 | ++ofs; |
53 | } | 51 | } |
54 | } | 52 | } |
55 | cab->bbox = fz_empty_rect; | 53 | cab->bbox = fz_empty_rect; |
@@ -73,27 +71,23 @@ static fz_rect *bboxat(fz_context *ctx, fz_stext_page *page, int idx, fz_rect *b | |||
73 | 71 | ||
74 | static int textlen_stext(fz_context *ctx, fz_stext_page *page) | 72 | static int textlen_stext(fz_context *ctx, fz_stext_page *page) |
75 | { | 73 | { |
74 | fz_stext_block *block; | ||
75 | fz_stext_line *line; | ||
76 | fz_stext_char *ch; | ||
76 | int len = 0; | 77 | int len = 0; |
77 | int block_num; | ||
78 | 78 | ||
79 | for (block_num = 0; block_num < page->len; block_num++) | 79 | for (block = page->first_block; block; block = block->next) |
80 | { | 80 | { |
81 | fz_stext_block *block; | 81 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
82 | fz_stext_line *line; | ||
83 | fz_stext_span *span; | ||
84 | |||
85 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
86 | continue; | 82 | continue; |
87 | block = page->blocks[block_num].u.text; | 83 | for (line = block->u.t.first_line; line; line = line->next) |
88 | for (line = block->lines; line < block->lines + block->len; line++) | ||
89 | { | 84 | { |
90 | for (span = line->first_span; span; span = span->next) | 85 | for (ch = line->first_char; ch; ch = ch->next) |
91 | { | 86 | ++len; |
92 | len += span->len; | 87 | ++len; /* pseudo-newline */ |
93 | } | ||
94 | len++; /* pseudo-newline */ | ||
95 | } | 88 | } |
96 | } | 89 | } |
90 | |||
97 | return len; | 91 | return len; |
98 | } | 92 | } |
99 | 93 | ||
@@ -181,8 +175,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re | |||
181 | fz_rect linebox, charbox; | 175 | fz_rect linebox, charbox; |
182 | fz_stext_block *block; | 176 | fz_stext_block *block; |
183 | fz_stext_line *line; | 177 | fz_stext_line *line; |
184 | fz_stext_span *span; | 178 | fz_stext_char *ch; |
185 | int i, block_num, hit_count; | 179 | int hit_count; |
186 | 180 | ||
187 | float x0 = rect.x0; | 181 | float x0 = rect.x0; |
188 | float x1 = rect.x1; | 182 | float x1 = rect.x1; |
@@ -191,31 +185,27 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re | |||
191 | 185 | ||
192 | hit_count = 0; | 186 | hit_count = 0; |
193 | 187 | ||
194 | for (block_num = 0; block_num < page->len; block_num++) | 188 | for (block = page->first_block; block; block = block->next) |
195 | { | 189 | { |
196 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | 190 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
197 | continue; | 191 | continue; |
198 | block = page->blocks[block_num].u.text; | 192 | for (line = block->u.t.first_line; line; line = line->next) |
199 | for (line = block->lines; line < block->lines + block->len; line++) | ||
200 | { | 193 | { |
201 | linebox = fz_empty_rect; | 194 | linebox = fz_empty_rect; |
202 | for (span = line->first_span; span; span = span->next) | 195 | for (ch = line->first_char; ch; ch = ch->next) |
203 | { | 196 | { |
204 | for (i = 0; i < span->len; i++) | 197 | fz_stext_char_bbox(ctx, &charbox, line, ch); |
198 | if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1) | ||
205 | { | 199 | { |
206 | fz_stext_char_bbox(ctx, &charbox, span, i); | 200 | if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5) |
207 | if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1) | ||
208 | { | 201 | { |
209 | if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5) | 202 | if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) |
210 | { | 203 | hit_bbox[hit_count++] = linebox; |
211 | if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) | 204 | linebox = charbox; |
212 | hit_bbox[hit_count++] = linebox; | 205 | } |
213 | linebox = charbox; | 206 | else |
214 | } | 207 | { |
215 | else | 208 | fz_union_rect(&linebox, &charbox); |
216 | { | ||
217 | fz_union_rect(&linebox, &charbox); | ||
218 | } | ||
219 | } | 209 | } |
220 | } | 210 | } |
221 | } | 211 | } |
@@ -232,8 +222,11 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect) | |||
232 | { | 222 | { |
233 | fz_buffer *buffer; | 223 | fz_buffer *buffer; |
234 | fz_rect hitbox; | 224 | fz_rect hitbox; |
235 | int c, i, block_num, seen = 0; | 225 | int c, seen = 0; |
236 | unsigned char *s; | 226 | unsigned char *s; |
227 | fz_stext_block *block; | ||
228 | fz_stext_line *line; | ||
229 | fz_stext_char *ch; | ||
237 | 230 | ||
238 | float x0 = rect.x0; | 231 | float x0 = rect.x0; |
239 | float x1 = rect.x1; | 232 | float x1 = rect.x1; |
@@ -242,41 +235,33 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect) | |||
242 | 235 | ||
243 | buffer = fz_new_buffer(ctx, 1024); | 236 | buffer = fz_new_buffer(ctx, 1024); |
244 | 237 | ||
245 | for (block_num = 0; block_num < page->len; block_num++) | 238 | for (block = page->first_block; block; block = block->next) |
246 | { | 239 | { |
247 | fz_stext_block *block; | 240 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
248 | fz_stext_line *line; | ||
249 | fz_stext_span *span; | ||
250 | |||
251 | if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
252 | continue; | 241 | continue; |
253 | block = page->blocks[block_num].u.text; | 242 | for (line = block->u.t.first_line; line; line = line->next) |
254 | for (line = block->lines; line < block->lines + block->len; line++) | ||
255 | { | 243 | { |
256 | for (span = line->first_span; span; span = span->next) | 244 | if (seen) |
257 | { | 245 | { |
258 | if (seen) | 246 | fz_append_byte(ctx, buffer, '\n'); |
259 | { | 247 | } |
260 | fz_append_byte(ctx, buffer, '\n'); | ||
261 | } | ||
262 | 248 | ||
263 | seen = 0; | 249 | seen = 0; |
264 | 250 | ||
265 | for (i = 0; i < span->len; i++) | 251 | for (ch = line->first_char; ch; ch = ch->next) |
252 | { | ||
253 | fz_stext_char_bbox(ctx, &hitbox, line, ch); | ||
254 | c = ch->c; | ||
255 | if (c < 32) | ||
256 | c = 0xFFFD; | ||
257 | if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) | ||
266 | { | 258 | { |
267 | fz_stext_char_bbox(ctx, &hitbox, span, i); | 259 | fz_append_rune(ctx, buffer, c); |
268 | c = span->text[i].c; | 260 | seen = 1; |
269 | if (c < 32) | ||
270 | c = 0xFFFD; | ||
271 | if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) | ||
272 | { | ||
273 | fz_append_rune(ctx, buffer, c); | ||
274 | seen = 1; | ||
275 | } | ||
276 | } | 261 | } |
277 | |||
278 | seen = (seen && span == line->last_span); | ||
279 | } | 262 | } |
263 | |||
264 | seen = (seen && line == block->u.t.last_line); | ||
280 | } | 265 | } |
281 | } | 266 | } |
282 | 267 | ||
diff --git a/source/fitz/util.c b/source/fitz/util.c index 6f9001746..d6a7f3174 100644 --- a/source/fitz/util.c +++ b/source/fitz/util.c | |||
@@ -267,7 +267,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co | |||
267 | } | 267 | } |
268 | 268 | ||
269 | fz_stext_page * | 269 | fz_stext_page * |
270 | fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options) | 270 | fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options) |
271 | { | 271 | { |
272 | fz_stext_page *text; | 272 | fz_stext_page *text; |
273 | fz_device *dev; | 273 | fz_device *dev; |
@@ -279,7 +279,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s | |||
279 | text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox)); | 279 | text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox)); |
280 | fz_try(ctx) | 280 | fz_try(ctx) |
281 | { | 281 | { |
282 | dev = fz_new_stext_device(ctx, sheet, text, options); | 282 | dev = fz_new_stext_device(ctx, text, options); |
283 | fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL); | 283 | fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL); |
284 | fz_close_device(ctx, dev); | 284 | fz_close_device(ctx, dev); |
285 | } | 285 | } |
@@ -297,7 +297,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s | |||
297 | } | 297 | } |
298 | 298 | ||
299 | fz_stext_page * | 299 | fz_stext_page * |
300 | fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options) | 300 | fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options) |
301 | { | 301 | { |
302 | fz_stext_page *text; | 302 | fz_stext_page *text; |
303 | fz_device *dev; | 303 | fz_device *dev; |
@@ -309,7 +309,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee | |||
309 | text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox)); | 309 | text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox)); |
310 | fz_try(ctx) | 310 | fz_try(ctx) |
311 | { | 311 | { |
312 | dev = fz_new_stext_device(ctx, sheet, text, options); | 312 | dev = fz_new_stext_device(ctx, text, options); |
313 | fz_run_page(ctx, page, dev, &fz_identity, NULL); | 313 | fz_run_page(ctx, page, dev, &fz_identity, NULL); |
314 | fz_close_device(ctx, dev); | 314 | fz_close_device(ctx, dev); |
315 | } | 315 | } |
@@ -327,14 +327,14 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee | |||
327 | } | 327 | } |
328 | 328 | ||
329 | fz_stext_page * | 329 | fz_stext_page * |
330 | fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options) | 330 | fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options) |
331 | { | 331 | { |
332 | fz_page *page; | 332 | fz_page *page; |
333 | fz_stext_page *text; | 333 | fz_stext_page *text; |
334 | 334 | ||
335 | page = fz_load_page(ctx, doc, number); | 335 | page = fz_load_page(ctx, doc, number); |
336 | fz_try(ctx) | 336 | fz_try(ctx) |
337 | text = fz_new_stext_page_from_page(ctx, page, sheet, options); | 337 | text = fz_new_stext_page_from_page(ctx, page, options); |
338 | fz_always(ctx) | 338 | fz_always(ctx) |
339 | fz_drop_page(ctx, page); | 339 | fz_drop_page(ctx, page); |
340 | fz_catch(ctx) | 340 | fz_catch(ctx) |
@@ -345,24 +345,14 @@ fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number | |||
345 | int | 345 | int |
346 | fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needle, fz_rect *hit_bbox, int hit_max) | 346 | fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needle, fz_rect *hit_bbox, int hit_max) |
347 | { | 347 | { |
348 | fz_stext_sheet *sheet = NULL; | 348 | fz_stext_page *text; |
349 | fz_stext_page *text = NULL; | ||
350 | int count; | 349 | int count; |
351 | 350 | ||
352 | fz_var(sheet); | 351 | text = fz_new_stext_page_from_display_list(ctx, list, NULL); |
353 | fz_var(text); | ||
354 | |||
355 | fz_try(ctx) | 352 | fz_try(ctx) |
356 | { | ||
357 | sheet = fz_new_stext_sheet(ctx); | ||
358 | text = fz_new_stext_page_from_display_list(ctx, list, sheet, NULL); | ||
359 | count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); | 353 | count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); |
360 | } | ||
361 | fz_always(ctx) | 354 | fz_always(ctx) |
362 | { | ||
363 | fz_drop_stext_page(ctx, text); | 355 | fz_drop_stext_page(ctx, text); |
364 | fz_drop_stext_sheet(ctx, sheet); | ||
365 | } | ||
366 | fz_catch(ctx) | 356 | fz_catch(ctx) |
367 | fz_rethrow(ctx); | 357 | fz_rethrow(ctx); |
368 | return count; | 358 | return count; |
@@ -371,24 +361,14 @@ fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needl | |||
371 | int | 361 | int |
372 | fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max) | 362 | fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max) |
373 | { | 363 | { |
374 | fz_stext_sheet *sheet = NULL; | 364 | fz_stext_page *text; |
375 | fz_stext_page *text = NULL; | ||
376 | int count; | 365 | int count; |
377 | 366 | ||
378 | fz_var(sheet); | 367 | text = fz_new_stext_page_from_page(ctx, page, NULL); |
379 | fz_var(text); | ||
380 | |||
381 | fz_try(ctx) | 368 | fz_try(ctx) |
382 | { | ||
383 | sheet = fz_new_stext_sheet(ctx); | ||
384 | text = fz_new_stext_page_from_page(ctx, page, sheet, NULL); | ||
385 | count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); | 369 | count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); |
386 | } | ||
387 | fz_always(ctx) | 370 | fz_always(ctx) |
388 | { | ||
389 | fz_drop_stext_page(ctx, text); | 371 | fz_drop_stext_page(ctx, text); |
390 | fz_drop_stext_sheet(ctx, sheet); | ||
391 | } | ||
392 | fz_catch(ctx) | 372 | fz_catch(ctx) |
393 | fz_rethrow(ctx); | 373 | fz_rethrow(ctx); |
394 | return count; | 374 | return count; |
@@ -411,14 +391,15 @@ fz_search_page_number(fz_context *ctx, fz_document *doc, int number, const char | |||
411 | } | 391 | } |
412 | 392 | ||
413 | fz_buffer * | 393 | fz_buffer * |
414 | fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rect *sel, int crlf) | 394 | fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page, const fz_rect *sel, int crlf) |
415 | { | 395 | { |
416 | fz_buffer *buf; | 396 | fz_buffer *buf; |
417 | fz_rect hitbox; | 397 | fz_rect hitbox; |
418 | float x0, y0, x1, y1; | 398 | float x0, y0, x1, y1; |
419 | int block_num; | 399 | fz_stext_block *block; |
400 | fz_stext_line *line; | ||
401 | fz_stext_char *ch; | ||
420 | int need_newline; | 402 | int need_newline; |
421 | int i; | ||
422 | 403 | ||
423 | need_newline = 0; | 404 | need_newline = 0; |
424 | 405 | ||
@@ -438,45 +419,33 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec | |||
438 | buf = fz_new_buffer(ctx, 256); | 419 | buf = fz_new_buffer(ctx, 256); |
439 | fz_try(ctx) | 420 | fz_try(ctx) |
440 | { | 421 | { |
441 | for (block_num = 0; block_num < text->len; block_num++) | 422 | for (block = page->first_block; block; block = block->next) |
442 | { | 423 | { |
443 | fz_stext_line *line; | 424 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
444 | fz_stext_block *block; | ||
445 | fz_stext_span *span; | ||
446 | |||
447 | if (text->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) | ||
448 | continue; | 425 | continue; |
449 | 426 | ||
450 | block = text->blocks[block_num].u.text; | 427 | for (line = block->u.t.first_line; line; line = line->next) |
451 | for (line = block->lines; line < block->lines + block->len; line++) | ||
452 | { | 428 | { |
453 | int saw_text = 0; | 429 | int saw_text = 0; |
454 | for (span = line->first_span; span; span = span->next) | 430 | for (ch = line->first_char; ch; ch = ch->next) |
455 | { | 431 | { |
456 | if (span->spacing > 1) | 432 | int c = ch->c; |
457 | fz_append_byte(ctx, buf, ' '); | 433 | fz_stext_char_bbox(ctx, &hitbox, line, ch); |
458 | for (i = 0; i < span->len; i++) | 434 | if (c < 32) |
435 | c = 0xFFFD; | ||
436 | if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) | ||
459 | { | 437 | { |
460 | int c; | 438 | saw_text = 1; |
461 | fz_stext_char_bbox(ctx, &hitbox, span, i); | 439 | if (need_newline) |
462 | c = span->text[i].c; | ||
463 | if (c < 32) | ||
464 | c = 0xFFFD; | ||
465 | if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) | ||
466 | { | 440 | { |
467 | saw_text = 1; | 441 | if (crlf) |
468 | if (need_newline) | 442 | fz_append_byte(ctx, buf, '\r'); |
469 | { | 443 | fz_append_byte(ctx, buf, '\n'); |
470 | if (crlf) | 444 | need_newline = 0; |
471 | fz_append_byte(ctx, buf, '\r'); | ||
472 | fz_append_byte(ctx, buf, '\n'); | ||
473 | need_newline = 0; | ||
474 | } | ||
475 | fz_append_rune(ctx, buf, c); | ||
476 | } | 445 | } |
446 | fz_append_rune(ctx, buf, c); | ||
477 | } | 447 | } |
478 | } | 448 | } |
479 | |||
480 | if (saw_text) | 449 | if (saw_text) |
481 | need_newline = 1; | 450 | need_newline = 1; |
482 | } | 451 | } |
@@ -494,42 +463,32 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec | |||
494 | fz_buffer * | 463 | fz_buffer * |
495 | fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options) | 464 | fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options) |
496 | { | 465 | { |
497 | fz_stext_sheet *sheet; | ||
498 | fz_stext_page *text; | 466 | fz_stext_page *text; |
499 | fz_buffer *buf; | 467 | fz_buffer *buf; |
500 | 468 | ||
501 | sheet = fz_new_stext_sheet(ctx); | 469 | text = fz_new_stext_page_from_display_list(ctx, list, options); |
502 | fz_try(ctx) | 470 | fz_try(ctx) |
503 | { | ||
504 | text = fz_new_stext_page_from_display_list(ctx, list, sheet, options); | ||
505 | buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); | 471 | buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); |
506 | } | ||
507 | fz_always(ctx) | 472 | fz_always(ctx) |
508 | fz_drop_stext_sheet(ctx, sheet); | 473 | fz_drop_stext_page(ctx, text); |
509 | fz_catch(ctx) | 474 | fz_catch(ctx) |
510 | fz_rethrow(ctx); | 475 | fz_rethrow(ctx); |
511 | fz_drop_stext_page(ctx, text); | ||
512 | return buf; | 476 | return buf; |
513 | } | 477 | } |
514 | 478 | ||
515 | fz_buffer * | 479 | fz_buffer * |
516 | fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options) | 480 | fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options) |
517 | { | 481 | { |
518 | fz_stext_sheet *sheet; | ||
519 | fz_stext_page *text; | 482 | fz_stext_page *text; |
520 | fz_buffer *buf; | 483 | fz_buffer *buf; |
521 | 484 | ||
522 | sheet = fz_new_stext_sheet(ctx); | 485 | text = fz_new_stext_page_from_page(ctx, page, options); |
523 | fz_try(ctx) | 486 | fz_try(ctx) |
524 | { | ||
525 | text = fz_new_stext_page_from_page(ctx, page, sheet, options); | ||
526 | buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); | 487 | buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); |
527 | } | ||
528 | fz_always(ctx) | 488 | fz_always(ctx) |
529 | fz_drop_stext_sheet(ctx, sheet); | 489 | fz_drop_stext_page(ctx, text); |
530 | fz_catch(ctx) | 490 | fz_catch(ctx) |
531 | fz_rethrow(ctx); | 491 | fz_rethrow(ctx); |
532 | fz_drop_stext_page(ctx, text); | ||
533 | return buf; | 492 | return buf; |
534 | } | 493 | } |
535 | 494 | ||
diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c index de05ab659..e1303fb87 100644 --- a/source/tools/mudraw.c +++ b/source/tools/mudraw.c | |||
@@ -248,7 +248,6 @@ static int band_height = 0; | |||
248 | static int lowmemory = 0; | 248 | static int lowmemory = 0; |
249 | 249 | ||
250 | static int errored = 0; | 250 | static int errored = 0; |
251 | static fz_stext_sheet *sheet = NULL; | ||
252 | static fz_colorspace *colorspace; | 251 | static fz_colorspace *colorspace; |
253 | static int spots = 0; | 252 | static int spots = 0; |
254 | static int alpha; | 253 | static int alpha; |
@@ -391,9 +390,6 @@ file_level_headers(fz_context *ctx) | |||
391 | if (output_format == OUT_STEXT || output_format == OUT_TRACE) | 390 | if (output_format == OUT_STEXT || output_format == OUT_TRACE) |
392 | fz_write_printf(ctx, out, "<?xml version=\"1.0\"?>\n"); | 391 | fz_write_printf(ctx, out, "<?xml version=\"1.0\"?>\n"); |
393 | 392 | ||
394 | if (output_format == OUT_TEXT || output_format == OUT_HTML || output_format == OUT_XHTML || output_format == OUT_STEXT) | ||
395 | sheet = fz_new_stext_sheet(ctx); | ||
396 | |||
397 | if (output_format == OUT_HTML) | 393 | if (output_format == OUT_HTML) |
398 | fz_print_stext_header_as_html(ctx, out); | 394 | fz_print_stext_header_as_html(ctx, out); |
399 | if (output_format == OUT_XHTML) | 395 | if (output_format == OUT_XHTML) |
@@ -422,8 +418,6 @@ file_level_trailers(fz_context *ctx) | |||
422 | 418 | ||
423 | if (output_format == OUT_PS) | 419 | if (output_format == OUT_PS) |
424 | fz_write_ps_file_trailer(ctx, out, output_pagenum); | 420 | fz_write_ps_file_trailer(ctx, out, output_pagenum); |
425 | |||
426 | fz_drop_stext_sheet(ctx, sheet); | ||
427 | } | 421 | } |
428 | 422 | ||
429 | static void drawband(fz_context *ctx, fz_page *page, fz_display_list *list, const fz_matrix *ctm, const fz_rect *tbounds, fz_cookie *cookie, int band_start, fz_pixmap *pix, fz_bitmap **bit) | 423 | static void drawband(fz_context *ctx, fz_page *page, fz_display_list *list, const fz_matrix *ctm, const fz_rect *tbounds, fz_cookie *cookie, int band_start, fz_pixmap *pix, fz_bitmap **bit) |
@@ -534,7 +528,7 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in | |||
534 | 528 | ||
535 | stext_options.flags = (output_format == OUT_HTML || output_format == OUT_XHTML) ? FZ_STEXT_PRESERVE_IMAGES : 0; | 529 | stext_options.flags = (output_format == OUT_HTML || output_format == OUT_XHTML) ? FZ_STEXT_PRESERVE_IMAGES : 0; |
536 | text = fz_new_stext_page(ctx, &mediabox); | 530 | text = fz_new_stext_page(ctx, &mediabox); |
537 | dev = fz_new_stext_device(ctx, sheet, text, &stext_options); | 531 | dev = fz_new_stext_device(ctx, text, &stext_options); |
538 | if (lowmemory) | 532 | if (lowmemory) |
539 | fz_enable_device_hints(ctx, dev, FZ_NO_CACHE); | 533 | fz_enable_device_hints(ctx, dev, FZ_NO_CACHE); |
540 | if (list) | 534 | if (list) |
@@ -550,12 +544,10 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in | |||
550 | } | 544 | } |
551 | else if (output_format == OUT_HTML) | 545 | else if (output_format == OUT_HTML) |
552 | { | 546 | { |
553 | fz_analyze_text(ctx, sheet, text); | ||
554 | fz_print_stext_page_as_html(ctx, out, text); | 547 | fz_print_stext_page_as_html(ctx, out, text); |
555 | } | 548 | } |
556 | else if (output_format == OUT_XHTML) | 549 | else if (output_format == OUT_XHTML) |
557 | { | 550 | { |
558 | fz_analyze_text(ctx, sheet, text); | ||
559 | fz_print_stext_page_as_xhtml(ctx, out, text); | 551 | fz_print_stext_page_as_xhtml(ctx, out, text); |
560 | } | 552 | } |
561 | else if (output_format == OUT_TEXT) | 553 | else if (output_format == OUT_TEXT) |
diff --git a/source/tools/murun.c b/source/tools/murun.c index b7443286c..7a713903e 100644 --- a/source/tools/murun.c +++ b/source/tools/murun.c | |||
@@ -1827,19 +1827,13 @@ static void ffi_Page_toStructuredText(js_State *J) | |||
1827 | fz_context *ctx = js_getcontext(J); | 1827 | fz_context *ctx = js_getcontext(J); |
1828 | fz_page *page = ffi_topage(J, 0); | 1828 | fz_page *page = ffi_topage(J, 0); |
1829 | const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; | 1829 | const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; |
1830 | fz_stext_sheet *sheet = NULL; | ||
1831 | fz_stext_options so; | 1830 | fz_stext_options so; |
1832 | fz_stext_page *text; | 1831 | fz_stext_page *text; |
1833 | 1832 | ||
1834 | fz_var(sheet); | ||
1835 | |||
1836 | fz_try(ctx) { | 1833 | fz_try(ctx) { |
1837 | sheet = fz_new_stext_sheet(ctx); | ||
1838 | fz_parse_stext_options(ctx, &so, options); | 1834 | fz_parse_stext_options(ctx, &so, options); |
1839 | text = fz_new_stext_page_from_page(ctx, page, sheet, &so); | 1835 | text = fz_new_stext_page_from_page(ctx, page, &so); |
1840 | } | 1836 | } |
1841 | fz_always(ctx) | ||
1842 | fz_drop_stext_sheet(ctx, sheet); | ||
1843 | fz_catch(ctx) | 1837 | fz_catch(ctx) |
1844 | rethrow(J); | 1838 | rethrow(J); |
1845 | 1839 | ||
@@ -2673,19 +2667,13 @@ static void ffi_DisplayList_toStructuredText(js_State *J) | |||
2673 | fz_context *ctx = js_getcontext(J); | 2667 | fz_context *ctx = js_getcontext(J); |
2674 | fz_display_list *list = js_touserdata(J, 0, "fz_display_list"); | 2668 | fz_display_list *list = js_touserdata(J, 0, "fz_display_list"); |
2675 | const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; | 2669 | const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; |
2676 | fz_stext_sheet *sheet = NULL; | ||
2677 | fz_stext_options so; | 2670 | fz_stext_options so; |
2678 | fz_stext_page *text; | 2671 | fz_stext_page *text; |
2679 | 2672 | ||
2680 | fz_var(sheet); | ||
2681 | |||
2682 | fz_try(ctx) { | 2673 | fz_try(ctx) { |
2683 | sheet = fz_new_stext_sheet(ctx); | ||
2684 | fz_parse_stext_options(ctx, &so, options); | 2674 | fz_parse_stext_options(ctx, &so, options); |
2685 | text = fz_new_stext_page_from_display_list(ctx, list, sheet, &so); | 2675 | text = fz_new_stext_page_from_display_list(ctx, list, &so); |
2686 | } | 2676 | } |
2687 | fz_always(ctx) | ||
2688 | fz_drop_stext_sheet(ctx, sheet); | ||
2689 | fz_catch(ctx) | 2677 | fz_catch(ctx) |
2690 | rethrow(J); | 2678 | rethrow(J); |
2691 | 2679 | ||