summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/mupdf/fitz/font.h6
-rw-r--r--include/mupdf/fitz/structured-text.h188
-rw-r--r--include/mupdf/fitz/util.h8
-rw-r--r--platform/java/mupdf_native.c2
-rw-r--r--platform/win32/libmupdf.vcproj4
-rw-r--r--platform/x11/pdfapp.c61
-rw-r--r--platform/x11/pdfapp.h1
-rw-r--r--source/fitz/font.c22
-rw-r--r--source/fitz/stext-device.c1047
-rw-r--r--source/fitz/stext-output.c386
-rw-r--r--source/fitz/stext-paragraph.c1538
-rw-r--r--source/fitz/stext-search.c137
-rw-r--r--source/fitz/util.c109
-rw-r--r--source/tools/mudraw.c10
-rw-r--r--source/tools/murun.c16
15 files changed, 681 insertions, 2854 deletions
diff --git a/include/mupdf/fitz/font.h b/include/mupdf/fitz/font.h
index a6e172a1d..ef4cd74d4 100644
--- a/include/mupdf/fitz/font.h
+++ b/include/mupdf/fitz/font.h
@@ -601,6 +601,12 @@ int fz_encode_character_with_fallback(fz_context *ctx, fz_font *font, int unicod
601void fz_get_glyph_name(fz_context *ctx, fz_font *font, int glyph, char *buf, int size); 601void fz_get_glyph_name(fz_context *ctx, fz_font *font, int glyph, char *buf, int size);
602 602
603/* 603/*
604 Get font ascender and descender values.
605*/
606float fz_font_ascender(fz_context *ctx, fz_font *font);
607float fz_font_descender(fz_context *ctx, fz_font *font);
608
609/*
604 Internal functions for our Harfbuzz integration 610 Internal functions for our Harfbuzz integration
605 to work around the lack of thread safety. 611 to work around the lack of thread safety.
606*/ 612*/
diff --git a/include/mupdf/fitz/structured-text.h b/include/mupdf/fitz/structured-text.h
index 61ee30ad1..0f3364b3b 100644
--- a/include/mupdf/fitz/structured-text.h
+++ b/include/mupdf/fitz/structured-text.h
@@ -16,15 +16,9 @@
16 (In development - Subject to change in future versions) 16 (In development - Subject to change in future versions)
17*/ 17*/
18 18
19typedef struct fz_stext_style_s fz_stext_style;
20typedef struct fz_stext_char_s fz_stext_char; 19typedef struct fz_stext_char_s fz_stext_char;
21typedef struct fz_stext_span_s fz_stext_span;
22typedef struct fz_stext_line_s fz_stext_line; 20typedef struct fz_stext_line_s fz_stext_line;
23typedef struct fz_stext_block_s fz_stext_block; 21typedef struct fz_stext_block_s fz_stext_block;
24typedef struct fz_image_block_s fz_image_block;
25typedef struct fz_page_block_s fz_page_block;
26
27typedef struct fz_stext_sheet_s fz_stext_sheet;
28typedef struct fz_stext_page_s fz_stext_page; 22typedef struct fz_stext_page_s fz_stext_page;
29 23
30/* 24/*
@@ -52,150 +46,58 @@ enum
52}; 46};
53 47
54/* 48/*
55 fz_stext_sheet: A text sheet contains a list of distinct text styles 49 A text page is a list of blocks, together with an overall bounding box.
56 used on a page (or a series of pages).
57*/
58struct fz_stext_sheet_s
59{
60 int maxid;
61 fz_stext_style *style;
62};
63
64/*
65 fz_stext_style: A text style contains details of a distinct text style
66 used on a page.
67*/
68struct fz_stext_style_s
69{
70 fz_stext_style *next;
71 int id;
72 fz_font *font;
73 float size;
74 int wmode;
75 int script;
76 /* Ascender and Descender only have the conventional sense in
77 * horizontal mode; in vertical mode they are rotated too - they are
78 * the maximum and minimum bounds respectively. */
79 float ascender;
80 float descender;
81 /* etc... */
82};
83
84/*
85 fz_stext_page: A text page is a list of page blocks, together with
86 an overall bounding box.
87*/ 50*/
88struct fz_stext_page_s 51struct fz_stext_page_s
89{ 52{
53 fz_pool *pool;
90 fz_rect mediabox; 54 fz_rect mediabox;
91 int len, cap; 55 fz_stext_block *first_block, *last_block;
92 fz_page_block *blocks;
93 fz_stext_page *next;
94};
95
96/*
97 fz_page_block: A page block is a typed block pointer.
98*/
99struct fz_page_block_s
100{
101 int type;
102 union
103 {
104 fz_stext_block *text;
105 fz_image_block *image;
106 } u;
107}; 56};
108 57
109enum 58enum
110{ 59{
111 FZ_PAGE_BLOCK_TEXT = 0, 60 FZ_STEXT_BLOCK_TEXT = 0,
112 FZ_PAGE_BLOCK_IMAGE = 1 61 FZ_STEXT_BLOCK_IMAGE = 1
113}; 62};
114 63
115/* 64/*
116 fz_stext_block: A text block is a list of lines of text. In typical 65 A text block is a list of lines of text, or an image.
117 cases this may correspond to a paragraph or a column of text. A
118 collection of blocks makes up a page.
119*/ 66*/
120struct fz_stext_block_s 67struct fz_stext_block_s
121{ 68{
69 int type;
122 fz_rect bbox; 70 fz_rect bbox;
123 int len, cap; 71 union {
124 fz_stext_line *lines; 72 struct { fz_stext_line *first_line, *last_line; } t;
125}; 73 struct { fz_matrix transform; fz_image *image; } i;
126 74 } u;
127/* 75 fz_stext_block *next;
128 fz_image_block: An image block is an image, together with the list of lines of text. In typical
129 cases this may correspond to a paragraph or a column of text. A
130 collection of blocks makes up a page.
131*/
132struct fz_image_block_s
133{
134 fz_rect bbox;
135 fz_matrix mat;
136 fz_image *image;
137 fz_colorspace *cspace;
138 float colors[FZ_MAX_COLORS];
139}; 76};
140 77
141/* 78/*
142 fz_stext_line: A text line is a list of text spans, with the same 79 A text line is a list of characters that share a common baseline.
143 baseline. In typical cases this should correspond (as expected) to
144 complete lines of text. A collection of lines makes up a block.
145*/ 80*/
146struct fz_stext_line_s 81struct fz_stext_line_s
147{ 82{
148 fz_stext_span *first_span, *last_span;
149
150 /* Cached information */
151 float distance; /* Perpendicular distance from previous line */
152 fz_rect bbox;
153 void *region; /* Opaque value for matching line masks */
154};
155
156/*
157 fz_stext_span: A text span is a list of characters that share a common
158 baseline/transformation. In typical cases a single span may be enough
159 to represent a complete line. In cases where the text has big gaps in
160 it (perhaps as it crosses columns or tables), a line may be represented
161 by multiple spans.
162*/
163struct fz_stext_span_s
164{
165 int len, cap;
166 fz_stext_char *text;
167 fz_point min; /* Device space */
168 fz_point max; /* Device space */
169 int wmode; /* 0 for horizontal, 1 for vertical */ 83 int wmode; /* 0 for horizontal, 1 for vertical */
170 fz_matrix transform; /* e and f are always 0 here */ 84 fz_rect bbox;
171 /* Ascender_max and Descender_min only have the conventional sense in 85 fz_stext_char *first_char, *last_char;
172 * horizontal mode; in vertical mode they are rotated too - they are 86 fz_stext_line *next;
173 * the maximum and minimum bounds respectively. */
174 float ascender_max; /* Document space */
175 float descender_min; /* Document space */
176 fz_rect bbox; /* Device space */
177
178 /* Cached information */
179 float base_offset; /* Perpendicular distance from baseline of line */
180 float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
181 int column; /* If non zero, the column that it's in */
182 float column_width; /* Percentage */
183 int align; /* 0 = left, 1 = centre, 2 = right */
184 float indent; /* The indent position for this column. */
185
186 fz_stext_span *next;
187}; 87};
188 88
189/* 89/*
190 fz_stext_char: A text char is a unicode character, the style in which 90 A text char is a unicode character, the style in which is appears, and
191 is appears, and the point at which it is positioned. Transform 91 the point at which it is positioned.
192 (and hence bbox) information is given by the enclosing span.
193*/ 92*/
194struct fz_stext_char_s 93struct fz_stext_char_s
195{ 94{
196 fz_point p; /* Device space */ 95 int c, rtl;
197 int c; 96 fz_point origin;
198 fz_stext_style *style; 97 fz_rect bbox;
98 float size;
99 fz_font *font;
100 fz_stext_char *next;
199}; 101};
200 102
201typedef struct fz_char_and_box_s fz_char_and_box; 103typedef struct fz_char_and_box_s fz_char_and_box;
@@ -212,43 +114,29 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex
212 114
213/* 115/*
214 fz_stext_char_bbox: Return the bbox of a text char. Calculated from 116 fz_stext_char_bbox: Return the bbox of a text char. Calculated from
215 the supplied enclosing span. 117 the supplied enclosing line.
216
217 bbox: A place to store the bbox
218 118
219 span: The enclosing span 119 bbox: A place to store the bbox.
220 120
221 idx: The index of the char within the span 121 line: The enclosing line.
222 122
223 Returns bbox (updated) 123 ch: The character.
224 124
225 Does not throw exceptions 125 Returns bbox (updated).
226*/ 126*/
227fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int idx); 127fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch);
228
229/*
230 fz_new_stext_sheet: Create an empty style sheet.
231
232 The style sheet is filled out by the text device, creating
233 one style for each unique font, color, size combination that
234 is used.
235*/
236fz_stext_sheet *fz_new_stext_sheet(fz_context *ctx);
237void fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet);
238 128
239/* 129/*
240 fz_new_stext_page: Create an empty text page. 130 fz_new_stext_page: Create an empty text page.
241 131
242 The text page is filled out by the text device to contain the blocks, 132 The text page is filled out by the text device to contain the blocks
243 lines and spans of text on the page. 133 and lines of text on the page.
244 134
245 mediabox: optional mediabox information. 135 mediabox: optional mediabox information.
246*/ 136*/
247fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox); 137fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox);
248void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page); 138void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
249 139
250void fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page);
251
252/* 140/*
253 fz_print_stext_page_as_html: Output a page to a file in HTML (visual) format. 141 fz_print_stext_page_as_html: Output a page to a file in HTML (visual) format.
254*/ 142*/
@@ -314,14 +202,10 @@ fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts
314/* 202/*
315 fz_new_stext_device: Create a device to extract the text on a page. 203 fz_new_stext_device: Create a device to extract the text on a page.
316 204
317 Gather and sort the text on a page into spans of uniform style, 205 Gather the text on a page into blocks and lines.
318 arranged into lines and blocks by reading order. The reading order
319 is determined by various heuristics, so may not be accurate.
320 206
321 sheet: The text sheet to which styles should be added. This can 207 The reading order is taken from the order the text is drawn in the
322 either be a newly created (empty) text sheet, or one containing 208 source file, so may not be accurate.
323 styles from a previous text device. The same sheet cannot be used
324 in multiple threads simultaneously.
325 209
326 page: The text page to which content should be added. This will 210 page: The text page to which content should be added. This will
327 usually be a newly created (empty) text page, but it can be one 211 usually be a newly created (empty) text page, but it can be one
@@ -330,6 +214,6 @@ fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts
330 214
331 options: Options to configure the stext device. 215 options: Options to configure the stext device.
332*/ 216*/
333fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options); 217fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
334 218
335#endif 219#endif
diff --git a/include/mupdf/fitz/util.h b/include/mupdf/fitz/util.h
index d452b58a1..4b827cad1 100644
--- a/include/mupdf/fitz/util.h
+++ b/include/mupdf/fitz/util.h
@@ -36,11 +36,11 @@ fz_pixmap *fz_new_pixmap_from_page_contents(fz_context *ctx, fz_page *page, cons
36fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_matrix *ctm, fz_colorspace *cs, int alpha); 36fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_matrix *ctm, fz_colorspace *cs, int alpha);
37 37
38/* 38/*
39 fz_new_stext_page_from_page: Extract structured text from a page. The sheet must not be NULL. 39 fz_new_stext_page_from_page: Extract structured text from a page.
40*/ 40*/
41fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options); 41fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options);
42fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options); 42fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options);
43fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options); 43fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options);
44 44
45/* 45/*
46 fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle. 46 fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle.
diff --git a/platform/java/mupdf_native.c b/platform/java/mupdf_native.c
index bed3358da..ce5e6fea3 100644
--- a/platform/java/mupdf_native.c
+++ b/platform/java/mupdf_native.c
@@ -5111,8 +5111,6 @@ FUN(Page_textAsHtml)(JNIEnv *env, jobject self)
5111 fz_run_page(ctx, page, dev, &ctm, NULL); 5111 fz_run_page(ctx, page, dev, &ctm, NULL);
5112 fz_close_device(ctx, dev); 5112 fz_close_device(ctx, dev);
5113 5113
5114 fz_analyze_text(ctx, sheet, text);
5115
5116 buf = fz_new_buffer(ctx, 256); 5114 buf = fz_new_buffer(ctx, 256);
5117 out = fz_new_output_with_buffer(ctx, buf); 5115 out = fz_new_output_with_buffer(ctx, buf);
5118 fz_write_printf(ctx, out, "<html>\n"); 5116 fz_write_printf(ctx, out, "<html>\n");
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj
index fc195fcea..3add80edf 100644
--- a/platform/win32/libmupdf.vcproj
+++ b/platform/win32/libmupdf.vcproj
@@ -1869,10 +1869,6 @@
1869 > 1869 >
1870 </File> 1870 </File>
1871 <File 1871 <File
1872 RelativePath="..\..\source\fitz\stext-paragraph.c"
1873 >
1874 </File>
1875 <File
1876 RelativePath="..\..\source\fitz\stext-search.c" 1872 RelativePath="..\..\source\fitz\stext-search.c"
1877 > 1873 >
1878 </File> 1874 </File>
diff --git a/platform/x11/pdfapp.c b/platform/x11/pdfapp.c
index 61366a44e..6b08c4aa4 100644
--- a/platform/x11/pdfapp.c
+++ b/platform/x11/pdfapp.c
@@ -470,9 +470,6 @@ void pdfapp_close(pdfapp_t *app)
470 fz_drop_stext_page(app->ctx, app->page_text); 470 fz_drop_stext_page(app->ctx, app->page_text);
471 app->page_text = NULL; 471 app->page_text = NULL;
472 472
473 fz_drop_stext_sheet(app->ctx, app->page_sheet);
474 app->page_sheet = NULL;
475
476 fz_drop_link(app->ctx, app->page_links); 473 fz_drop_link(app->ctx, app->page_links);
477 app->page_links = NULL; 474 app->page_links = NULL;
478 475
@@ -655,14 +652,12 @@ static void pdfapp_loadpage(pdfapp_t *app, int no_cache)
655 fz_drop_display_list(app->ctx, app->page_list); 652 fz_drop_display_list(app->ctx, app->page_list);
656 fz_drop_display_list(app->ctx, app->annotations_list); 653 fz_drop_display_list(app->ctx, app->annotations_list);
657 fz_drop_stext_page(app->ctx, app->page_text); 654 fz_drop_stext_page(app->ctx, app->page_text);
658 fz_drop_stext_sheet(app->ctx, app->page_sheet);
659 fz_drop_link(app->ctx, app->page_links); 655 fz_drop_link(app->ctx, app->page_links);
660 fz_drop_page(app->ctx, app->page); 656 fz_drop_page(app->ctx, app->page);
661 657
662 app->page_list = NULL; 658 app->page_list = NULL;
663 app->annotations_list = NULL; 659 app->annotations_list = NULL;
664 app->page_text = NULL; 660 app->page_text = NULL;
665 app->page_sheet = NULL;
666 app->page_links = NULL; 661 app->page_links = NULL;
667 app->page = NULL; 662 app->page = NULL;
668 app->page_bbox.x0 = 0; 663 app->page_bbox.x0 = 0;
@@ -875,12 +870,11 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai
875 app->hit_count = 0; 870 app->hit_count = 0;
876 871
877 /* Extract text */ 872 /* Extract text */
878 app->page_sheet = fz_new_stext_sheet(app->ctx);
879 app->page_text = fz_new_stext_page(app->ctx, fz_bound_page(app->ctx, app->page, &mediabox)); 873 app->page_text = fz_new_stext_page(app->ctx, fz_bound_page(app->ctx, app->page, &mediabox));
880 874
881 if (app->page_list || app->annotations_list) 875 if (app->page_list || app->annotations_list)
882 { 876 {
883 tdev = fz_new_stext_device(app->ctx, app->page_sheet, app->page_text, NULL); 877 tdev = fz_new_stext_device(app->ctx, app->page_text, NULL);
884 pdfapp_runpage(app, tdev, &fz_identity, &fz_infinite_rect, &cookie); 878 pdfapp_runpage(app, tdev, &fz_identity, &fz_infinite_rect, &cookie);
885 fz_close_device(app->ctx, tdev); 879 fz_close_device(app->ctx, tdev);
886 fz_drop_device(app->ctx, tdev); 880 fz_drop_device(app->ctx, tdev);
@@ -1905,8 +1899,10 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
1905 fz_rect hitbox; 1899 fz_rect hitbox;
1906 fz_matrix ctm; 1900 fz_matrix ctm;
1907 fz_stext_page *page = app->page_text; 1901 fz_stext_page *page = app->page_text;
1908 int c, i, p, need_newline; 1902 int p, need_newline;
1909 int block_num; 1903 fz_stext_block *block;
1904 fz_stext_line *line;
1905 fz_stext_char *ch;
1910 1906
1911 int x0 = app->selr.x0; 1907 int x0 = app->selr.x0;
1912 int x1 = app->selr.x1; 1908 int x1 = app->selr.x1;
@@ -1918,50 +1914,37 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
1918 p = 0; 1914 p = 0;
1919 need_newline = 0; 1915 need_newline = 0;
1920 1916
1921 for (block_num = 0; block_num < page->len; block_num++) 1917 for (block = page->first_block; block; block = block->next)
1922 { 1918 {
1923 fz_stext_line *line; 1919 if (block->type != FZ_STEXT_BLOCK_TEXT)
1924 fz_stext_block *block;
1925 fz_stext_span *span;
1926
1927 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
1928 continue; 1920 continue;
1929 block = page->blocks[block_num].u.text;
1930 1921
1931 for (line = block->lines; line < block->lines + block->len; line++) 1922 for (line = block->u.t.first_line; line; line = line->next)
1932 { 1923 {
1933 int saw_text = 0; 1924 int saw_text = 0;
1934 1925 for (ch = line->first_char; ch; ch = ch->next)
1935 for (span = line->first_span; span; span = span->next)
1936 { 1926 {
1937 for (i = 0; i < span->len; i++) 1927 int c = ch->c;
1928 fz_stext_char_bbox(app->ctx, &hitbox, line, ch);
1929 if (c < 32)
1930 c = 0xFFFD;
1931 if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
1938 { 1932 {
1939 fz_stext_char_bbox(app->ctx, &hitbox, span, i); 1933 saw_text = 1;
1940 fz_transform_rect(&hitbox, &ctm); 1934 if (need_newline)
1941 c = span->text[i].c;
1942 if (c < 32)
1943 c = '?';
1944 if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
1945 { 1935 {
1946 saw_text = 1;
1947
1948 if (need_newline)
1949 {
1950#if defined(_WIN32) || defined(_WIN64) 1936#if defined(_WIN32) || defined(_WIN64)
1951 if (p < ucslen - 1) 1937 if (p < ucslen - 1)
1952 ucsbuf[p++] = '\r'; 1938 ucsbuf[p++] = '\r';
1953#endif 1939#endif
1954 if (p < ucslen - 1)
1955 ucsbuf[p++] = '\n';
1956 need_newline = 0;
1957 }
1958
1959 if (p < ucslen - 1) 1940 if (p < ucslen - 1)
1960 ucsbuf[p++] = c; 1941 ucsbuf[p++] = '\n';
1942 need_newline = 0;
1961 } 1943 }
1944 if (p < ucslen - 1)
1945 ucsbuf[p++] = c;
1962 } 1946 }
1963 } 1947 }
1964
1965 if (saw_text) 1948 if (saw_text)
1966 need_newline = 1; 1949 need_newline = 1;
1967 } 1950 }
diff --git a/platform/x11/pdfapp.h b/platform/x11/pdfapp.h
index 28a834815..09d8f16ad 100644
--- a/platform/x11/pdfapp.h
+++ b/platform/x11/pdfapp.h
@@ -91,7 +91,6 @@ struct pdfapp_s
91 fz_display_list *page_list; 91 fz_display_list *page_list;
92 fz_display_list *annotations_list; 92 fz_display_list *annotations_list;
93 fz_stext_page *page_text; 93 fz_stext_page *page_text;
94 fz_stext_sheet *page_sheet;
95 fz_link *page_links; 94 fz_link *page_links;
96 int errored; 95 int errored;
97 int incomplete; 96 int incomplete;
diff --git a/source/fitz/font.c b/source/fitz/font.c
index eb7c8c351..dfe4ab24c 100644
--- a/source/fitz/font.c
+++ b/source/fitz/font.c
@@ -193,6 +193,28 @@ fz_set_font_bbox(fz_context *ctx, fz_font *font, float xmin, float ymin, float x
193 } 193 }
194} 194}
195 195
196float fz_font_ascender(fz_context *ctx, fz_font *font)
197{
198 if (font->t3procs)
199 return font->bbox.y1;
200 else
201 {
202 FT_Face face = font->ft_face;
203 return (float)face->ascender / face->units_per_EM;
204 }
205}
206
207float fz_font_descender(fz_context *ctx, fz_font *font)
208{
209 if (font->t3procs)
210 return font->bbox.y0;
211 else
212 {
213 FT_Face face = font->ft_face;
214 return (float)face->descender / face->units_per_EM;
215 }
216}
217
196/* 218/*
197 * Freetype hooks 219 * Freetype hooks
198 */ 220 */
diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
index 73fa309e8..166f5aa0b 100644
--- a/source/fitz/stext-device.c
+++ b/source/fitz/stext-device.c
@@ -4,36 +4,25 @@
4#include <math.h> 4#include <math.h>
5#include <float.h> 5#include <float.h>
6 6
7/* Extract text into an unsorted span soup. */ 7#include <stdio.h> /* for debug printing */
8
9/* Extract text into blocks and lines. */
8 10
9#define LINE_DIST 0.9f 11#define LINE_DIST 0.9f
10#define SPACE_DIST 0.15f 12#define SPACE_DIST 0.15f
11#define SPACE_MAX_DIST 0.8f 13#define SPACE_MAX_DIST 0.8f
12#define PARAGRAPH_DIST 0.5f 14#define PARAGRAPH_DIST 0.5f
13 15
14#include <stdio.h> /* for debug printing */
15#undef DEBUG_SPANS
16#undef DEBUG_INTERNALS
17#undef DEBUG_LINE_HEIGHTS
18#undef DEBUG_MASKS
19#undef DEBUG_ALIGN
20#undef DEBUG_INDENTS
21
22#include <ft2build.h>
23#include FT_FREETYPE_H
24#include FT_ADVANCES_H
25
26typedef struct fz_stext_device_s fz_stext_device; 16typedef struct fz_stext_device_s fz_stext_device;
27 17
28typedef struct span_soup_s span_soup;
29
30struct fz_stext_device_s 18struct fz_stext_device_s
31{ 19{
32 fz_device super; 20 fz_device super;
33 fz_stext_sheet *sheet;
34 fz_stext_page *page; 21 fz_stext_page *page;
35 span_soup *spans; 22 fz_point pen, start;
36 fz_stext_span *cur_span; 23 fz_matrix trm;
24 int new_obj;
25 int curdir;
37 int lastchar; 26 int lastchar;
38 int flags; 27 int flags;
39}; 28};
@@ -42,553 +31,235 @@ const char *fz_stext_options_usage =
42 "Structured text output options:\n" 31 "Structured text output options:\n"
43 "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n" 32 "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n"
44 "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n" 33 "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n"
34 "\tpreserve-images: keep images in output\n"
45 "\n"; 35 "\n";
46 36
47static fz_rect * 37fz_rect *
48add_point_to_rect(fz_rect *a, const fz_point *p) 38fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch)
49{ 39{
50 if (p->x < a->x0) 40 *bbox = ch->bbox;
51 a->x0 = p->x; 41 return bbox;
52 if (p->x > a->x1)
53 a->x1 = p->x;
54 if (p->y < a->y0)
55 a->y0 = p->y;
56 if (p->y > a->y1)
57 a->y1 = p->y;
58 return a;
59} 42}
60 43
61fz_rect * 44fz_stext_page *
62fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int i) 45fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox)
63{ 46{
64 fz_point a, d; 47 fz_pool *pool = fz_new_pool(ctx);
65 const fz_point *max; 48 fz_stext_page *page;
66 fz_stext_char *ch; 49 fz_try(ctx)
67
68 if (!span || i >= span->len)
69 {
70 *bbox = fz_empty_rect;
71 return bbox;
72 }
73 ch = &span->text[i];
74 if (i == span->len-1)
75 max = &span->max;
76 else
77 max = &span->text[i+1].p;
78 if (span->wmode == 0)
79 { 50 {
80 a.x = 0; 51 page = fz_pool_alloc(ctx, pool, sizeof(*page));
81 a.y = span->ascender_max; 52 page->pool = pool;
82 d.x = 0; 53 page->mediabox = *mediabox;
83 d.y = span->descender_min; 54 page->first_block = NULL;
55 page->last_block = NULL;
84 } 56 }
85 else 57 fz_catch(ctx)
86 { 58 {
87 a.x = span->ascender_max; 59 fz_drop_pool(ctx, pool);
88 a.y = 0; 60 fz_rethrow(ctx);
89 d.x = span->descender_min;
90 d.y = 0;
91 } 61 }
92 fz_transform_vector(&a, &span->transform); 62 return page;
93 fz_transform_vector(&d, &span->transform);
94 bbox->x0 = bbox->x1 = ch->p.x + a.x;
95 bbox->y0 = bbox->y1 = ch->p.y + a.y;
96 a.x += max->x;
97 a.y += max->y;
98 add_point_to_rect(bbox, &a);
99 a.x = ch->p.x + d.x;
100 a.y = ch->p.y + d.y;
101 add_point_to_rect(bbox, &a);
102 a.x = max->x + d.x;
103 a.y = max->y + d.y;
104 add_point_to_rect(bbox, &a);
105 return bbox;
106} 63}
107 64
108static void 65void
109add_bbox_to_span(fz_stext_span *span) 66fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
110{ 67{
111 fz_point a, d; 68 if (page)
112 fz_rect *bbox = &span->bbox;
113
114 if (!span)
115 return;
116 if (span->wmode == 0)
117 {
118 a.x = 0;
119 a.y = span->ascender_max;
120 d.x = 0;
121 d.y = span->descender_min;
122 }
123 else
124 { 69 {
125 a.x = span->ascender_max; 70 fz_stext_block *block;
126 a.y = 0; 71 for (block = page->first_block; block; block = block->next)
127 d.x = span->descender_min; 72 if (block->type == FZ_STEXT_BLOCK_IMAGE)
128 d.y = 0; 73 fz_drop_image(ctx, block->u.i.image);
74 fz_drop_pool(ctx, page->pool);
129 } 75 }
130 fz_transform_vector(&a, &span->transform);
131 fz_transform_vector(&d, &span->transform);
132 bbox->x0 = bbox->x1 = span->min.x + a.x;
133 bbox->y0 = bbox->y1 = span->min.y + a.y;
134 a.x += span->max.x;
135 a.y += span->max.y;
136 add_point_to_rect(bbox, &a);
137 a.x = span->min.x + d.x;
138 a.y = span->min.y + d.y;
139 add_point_to_rect(bbox, &a);
140 a.x = span->max.x + d.x;
141 a.y = span->max.y + d.y;
142 add_point_to_rect(bbox, &a);
143} 76}
144 77
145struct span_soup_s 78static fz_stext_block *
146{ 79add_block_to_page(fz_context *ctx, fz_stext_page *page)
147 int len, cap;
148 fz_stext_span **spans;
149};
150
151static span_soup *
152new_span_soup(fz_context *ctx)
153{ 80{
154 span_soup *soup = fz_malloc_struct(ctx, span_soup); 81 fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
155 soup->len = 0; 82 if (!page->first_block)
156 soup->cap = 0; 83 page->first_block = page->last_block = block;
157 soup->spans = NULL; 84 else
158 return soup; 85 {
86 page->last_block->next = block;
87 page->last_block = block;
88 }
89 return block;
159} 90}
160 91
161static void 92static fz_stext_block *
162free_span_soup(fz_context *ctx, span_soup *soup) 93add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
163{ 94{
164 int i; 95 fz_stext_block *block = add_block_to_page(ctx, page);
165 96 block->type = FZ_STEXT_BLOCK_TEXT;
166 if (soup == NULL) 97 return block;
167 return;
168 for (i = 0; i < soup->len; i++)
169 {
170 fz_free(ctx, soup->spans[i]);
171 }
172 fz_free(ctx, soup->spans);
173 fz_free(ctx, soup);
174} 98}
175 99
176static void 100static fz_stext_block *
177add_span_to_soup(fz_context *ctx, span_soup *soup, fz_stext_span *span) 101add_image_block_to_page(fz_context *ctx, fz_stext_page *page, const fz_matrix *ctm, fz_image *image)
178{ 102{
179 if (span == NULL) 103 fz_stext_block *block = add_block_to_page(ctx, page);
180 return; 104 block->type = FZ_STEXT_BLOCK_IMAGE;
181 if (soup->len == soup->cap) 105 block->u.i.transform = *ctm;
182 { 106 block->u.i.image = fz_keep_image(ctx, image);
183 int newcap = (soup->cap ? soup->cap * 2 : 16); 107 block->bbox.x0 = 0;
184 soup->spans = fz_resize_array(ctx, soup->spans, newcap, sizeof(*soup->spans)); 108 block->bbox.y0 = 0;
185 soup->cap = newcap; 109 block->bbox.x1 = 1;
186 } 110 block->bbox.y1 = 1;
187 add_bbox_to_span(span); 111 fz_transform_rect(&block->bbox, ctm);
188 soup->spans[soup->len++] = span; 112 return block;
189} 113}
190 114
191static fz_stext_line * 115static fz_stext_line *
192push_span(fz_context *ctx, fz_stext_device *tdev, fz_stext_span *span, int new_line, float distance) 116add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, int wmode)
193{ 117{
194 fz_stext_line *line; 118 fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
195 fz_stext_block *block; 119 if (!block->u.t.first_line)
196 fz_stext_page *page = tdev->page; 120 block->u.t.first_line = block->u.t.last_line = line;
197 int prev_not_text = 0;
198
199 if (page->len == 0 || page->blocks[page->len-1].type != FZ_PAGE_BLOCK_TEXT)
200 prev_not_text = 1;
201
202 if (new_line || prev_not_text)
203 {
204 float size = fz_matrix_expansion(&span->transform);
205 /* So, a new line. Part of the same block or not? */
206 if (distance == 0 || distance > size * 1.5f || distance < -size * PARAGRAPH_DIST || page->len == 0 || prev_not_text)
207 {
208 /* New block */
209 if (page->len == page->cap)
210 {
211 int newcap = (page->cap ? page->cap*2 : 4);
212 page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks));
213 page->cap = newcap;
214 }
215 block = fz_malloc_struct(ctx, fz_stext_block);
216 page->blocks[page->len].type = FZ_PAGE_BLOCK_TEXT;
217 page->blocks[page->len].u.text = block;
218 block->cap = 0;
219 block->len = 0;
220 block->lines = 0;
221 block->bbox = fz_empty_rect;
222 page->len++;
223 distance = 0;
224 }
225
226 /* New line */
227 block = page->blocks[page->len-1].u.text;
228 if (block->len == block->cap)
229 {
230 int newcap = (block->cap ? block->cap*2 : 4);
231 block->lines = fz_resize_array(ctx, block->lines, newcap, sizeof(*block->lines));
232 block->cap = newcap;
233 }
234 block->lines[block->len].first_span = NULL;
235 block->lines[block->len].last_span = NULL;
236 block->lines[block->len].distance = distance;
237 block->lines[block->len].bbox = fz_empty_rect;
238 block->len++;
239 }
240
241 /* Find last line and append to it */
242 block = page->blocks[page->len-1].u.text;
243 line = &block->lines[block->len-1];
244
245 fz_union_rect(&block->lines[block->len-1].bbox, &span->bbox);
246 fz_union_rect(&block->bbox, &span->bbox);
247 span->base_offset = (new_line ? 0 : distance);
248
249 if (!line->first_span)
250 {
251 line->first_span = line->last_span = span;
252 span->next = NULL;
253 }
254 else 121 else
255 { 122 {
256 line->last_span->next = span; 123 block->u.t.last_line->next = line;
257 line->last_span = span; 124 block->u.t.last_line = line;
258 } 125 }
259 126
127 line->wmode = wmode;
128
260 return line; 129 return line;
261} 130}
262 131
263#if defined(DEBUG_SPANS) || defined(DEBUG_ALIGN) || defined(DEBUG_INDENTS) 132static float min4(float a, float b, float c, float d)
264static void
265dump_span(fz_stext_span *s)
266{ 133{
267 int i; 134 return fz_min(fz_min(a, b), fz_min(c, d));
268 for (i=0; i < s->len; i++)
269 {
270 printf("%c", s->text[i].c);
271 }
272} 135}
273#endif
274 136
275#ifdef DEBUG_ALIGN 137static float max4(float a, float b, float c, float d)
276static void
277dump_line(fz_stext_line *line)
278{ 138{
279 int i; 139 return fz_max(fz_max(a, b), fz_max(c, d));
280 for (i=0; i < line->len; i++)
281 {
282 fz_stext_span *s = line->spans[i];
283 if (s->spacing > 1)
284 printf(" ");
285 dump_span(s);
286 }
287 printf("\n");
288} 140}
289#endif
290 141
291static void 142static fz_stext_char *
292strain_soup(fz_context *ctx, fz_stext_device *tdev) 143add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, const fz_matrix *trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int rtl)
293{ 144{
294 span_soup *soup = tdev->spans; 145 fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
295 fz_stext_line *last_line = NULL; 146 fz_point a, d;
296 fz_stext_span *last_span = NULL;
297 int span_num;
298
299 if (soup == NULL)
300 return;
301 147
302 /* Really dumb implementation to match what we had before */ 148 if (!line->first_char)
303 for (span_num=0; span_num < soup->len; span_num++) 149 line->first_char = line->last_char = ch;
150 else
304 { 151 {
305 fz_stext_span *span = soup->spans[span_num]; 152 line->last_char->next = ch;
306 int new_line = 1; 153 line->last_char = ch;
307 float distance = 0;
308 float spacing = 0;
309 soup->spans[span_num] = NULL;
310 if (last_span)
311 {
312 /* If we have a last_span, we must have a last_line */
313 /* Do span and last_line share the same baseline? */
314 fz_point p, q, perp_r;
315 float dot;
316 float size = fz_matrix_expansion(&span->transform);
317
318#ifdef DEBUG_SPANS
319 {
320 printf("Comparing: \"");
321 dump_span(last_span);
322 printf("\" and \"");
323 dump_span(span);
324 printf("\"\n");
325 }
326#endif
327
328 p.x = last_line->first_span->max.x - last_line->first_span->min.x;
329 p.y = last_line->first_span->max.y - last_line->first_span->min.y;
330 fz_normalize_vector(&p);
331 q.x = span->max.x - span->min.x;
332 q.y = span->max.y - span->min.y;
333 fz_normalize_vector(&q);
334#ifdef DEBUG_SPANS
335 printf("last_span=%g %g -> %g %g = %g %g\n", last_span->min.x, last_span->min.y, last_span->max.x, last_span->max.y, p.x, p.y);
336 printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, q.x, q.y);
337#endif
338 perp_r.y = last_line->first_span->min.x - span->min.x;
339 perp_r.x = -(last_line->first_span->min.y - span->min.y);
340 /* Check if p and q are parallel. If so, then this
341 * line is parallel with the last one. */
342 dot = p.x * q.x + p.y * q.y;
343 if (fabsf(dot) > 0.9995f)
344 {
345 /* If we take the dot product of normalised(p) and
346 * perp(r), we get the perpendicular distance from
347 * one line to the next (assuming they are parallel). */
348 distance = p.x * perp_r.x + p.y * perp_r.y;
349 /* We allow 'small' distances of baseline changes
350 * to cope with super/subscript. FIXME: We should
351 * gather subscript/superscript information here. */
352 new_line = (fabsf(distance) > size * LINE_DIST);
353 }
354 else
355 {
356 new_line = 1;
357 distance = 0;
358 }
359 if (!new_line)
360 {
361 fz_point delta;
362
363 delta.x = span->min.x - last_span->max.x;
364 delta.y = span->min.y - last_span->max.y;
365
366 spacing = (p.x * delta.x + p.y * delta.y);
367 spacing = fabsf(spacing);
368 /* Only allow changes in baseline (subscript/superscript etc)
369 * when the spacing is small. */
370 if (spacing * fabsf(distance) > size * LINE_DIST && fabsf(distance) > size * 0.1f)
371 {
372 new_line = 1;
373 distance = 0;
374 spacing = 0;
375 }
376 else
377 {
378 spacing /= size * SPACE_DIST;
379 /* Apply the same logic here as when we're adding chars to build spans. */
380 if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
381 spacing = 1;
382 }
383 }
384#ifdef DEBUG_SPANS
385 printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, new_line, distance, size, spacing);
386#endif
387 }
388 span->spacing = spacing;
389 last_line = push_span(ctx, tdev, span, new_line, distance);
390 last_span = span;
391 } 154 }
392}
393
394fz_stext_sheet *
395fz_new_stext_sheet(fz_context *ctx)
396{
397 fz_stext_sheet *sheet = fz_malloc(ctx, sizeof *sheet);
398 sheet->maxid = 0;
399 sheet->style = NULL;
400 return sheet;
401}
402 155
403void 156 ch->c = c;
404fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet) 157 ch->rtl = rtl;
405{ 158 ch->origin = *p;
406 fz_stext_style *style; 159 ch->size = size;
160 ch->font = font; /* TODO: keep and drop */
407 161
408 if (sheet == NULL) 162 if (line->wmode == 0)
409 return;
410
411 style = sheet->style;
412 while (style)
413 { 163 {
414 fz_stext_style *next = style->next; 164 a.x = 0;
415 fz_drop_font(ctx, style->font); 165 d.x = 0;
416 fz_free(ctx, style); 166 a.y = fz_font_ascender(ctx, font);
417 style = next; 167 d.y = fz_font_descender(ctx, font);
418 } 168 }
419 fz_free(ctx, sheet); 169 else
420}
421
422static fz_stext_style *
423fz_lookup_stext_style_imp(fz_context *ctx, fz_stext_sheet *sheet,
424 float size, fz_font *font, int wmode, int script)
425{
426 fz_stext_style *style;
427
428 for (style = sheet->style; style; style = style->next)
429 { 170 {
430 if (style->font == font && 171 fz_rect *bbox = fz_font_bbox(ctx, font);
431 style->size == size && 172 a.x = bbox->x1;
432 style->wmode == wmode && 173 d.x = bbox->x0;
433 style->script == script) /* FIXME: others */ 174 a.y = 0;
434 { 175 d.y = 0;
435 return style;
436 }
437 } 176 }
177 fz_transform_vector(&a, trm);
178 fz_transform_vector(&d, trm);
438 179
439 /* Better make a new one and add it to our list */ 180 ch->bbox.x0 = min4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x);
440 style = fz_malloc(ctx, sizeof *style); 181 ch->bbox.x1 = max4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x);
441 style->id = sheet->maxid++; 182 ch->bbox.y0 = min4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y);
442 style->font = fz_keep_font(ctx, font); 183 ch->bbox.y1 = max4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y);
443 style->size = size;
444 style->wmode = wmode;
445 style->script = script;
446 style->next = sheet->style;
447 sheet->style = style;
448 return style;
449}
450 184
451static fz_stext_style * 185 if (fz_is_empty_rect(&line->bbox))
452fz_lookup_stext_style(fz_context *ctx, fz_stext_sheet *sheet, fz_text_span *span, const fz_matrix *ctm, 186 line->bbox = ch->bbox;
453 fz_colorspace *colorspace, const float *color, float alpha, const fz_stroke_state *stroke) 187 else
454{
455 float size = 1.0f;
456 fz_font *font = span ? span->font : NULL;
457 int wmode = span ? span->wmode : 0;
458 if (ctm && span)
459 { 188 {
460 fz_matrix tm = span->trm; 189 line->bbox.x0 = fz_min(line->bbox.x0, ch->bbox.x0);
461 fz_matrix trm; 190 line->bbox.y0 = fz_min(line->bbox.y0, ch->bbox.y0);
462 tm.e = 0; 191 line->bbox.x1 = fz_min(line->bbox.x1, ch->bbox.x1);
463 tm.f = 0; 192 line->bbox.y1 = fz_min(line->bbox.y1, ch->bbox.y1);
464 fz_concat(&trm, &tm, ctm);
465 size = fz_matrix_expansion(&trm);
466 } 193 }
467 return fz_lookup_stext_style_imp(ctx, sheet, size, font, wmode, 0);
468}
469 194
470fz_stext_page * 195 return ch;
471fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox)
472{
473 fz_stext_page *page = fz_malloc(ctx, sizeof(*page));
474 page->mediabox = *mediabox;
475 page->len = 0;
476 page->cap = 0;
477 page->blocks = NULL;
478 page->next = NULL;
479 return page;
480} 196}
481 197
482static void 198static int
483fz_drop_stext_line_contents(fz_context *ctx, fz_stext_line *line) 199direction_from_bidi_class(int bidiclass, int curdir)
484{ 200{
485 fz_stext_span *span, *next; 201 switch (bidiclass)
486 for (span = line->first_span; span; span=next)
487 { 202 {
488 next = span->next; 203 /* strong */
489 fz_free(ctx, span->text); 204 case UCDN_BIDI_CLASS_L: return 1;
490 fz_free(ctx, span); 205 case UCDN_BIDI_CLASS_R: return -1;
491 } 206 case UCDN_BIDI_CLASS_AL: return -1;
492}
493 207
494static void 208 /* weak */
495fz_drop_stext_block(fz_context *ctx, fz_stext_block *block) 209 case UCDN_BIDI_CLASS_EN:
496{ 210 case UCDN_BIDI_CLASS_ES:
497 fz_stext_line *line; 211 case UCDN_BIDI_CLASS_ET:
498 if (block == NULL) 212 case UCDN_BIDI_CLASS_AN:
499 return; 213 case UCDN_BIDI_CLASS_CS:
500 for (line = block->lines; line < block->lines + block->len; line++) 214 case UCDN_BIDI_CLASS_NSM:
501 fz_drop_stext_line_contents(ctx, line); 215 case UCDN_BIDI_CLASS_BN:
502 fz_free(ctx, block->lines); 216 return curdir;
503 fz_free(ctx, block);
504}
505 217
506static void 218 /* neutral */
507fz_drop_image_block(fz_context *ctx, fz_image_block *block) 219 case UCDN_BIDI_CLASS_B:
508{ 220 case UCDN_BIDI_CLASS_S:
509 if (block == NULL) 221 case UCDN_BIDI_CLASS_WS:
510 return; 222 case UCDN_BIDI_CLASS_ON:
511 fz_drop_image(ctx, block->image); 223 return curdir;
512 fz_drop_colorspace(ctx, block->cspace);
513 fz_free(ctx, block);
514}
515 224
516void 225 /* embedding, override, pop ... we don't support them */
517fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) 226 default:
518{ 227 return 0;
519 fz_page_block *block;
520 if (page == NULL)
521 return;
522 for (block = page->blocks; block < page->blocks + page->len; block++)
523 {
524 switch (block->type)
525 {
526 case FZ_PAGE_BLOCK_TEXT:
527 fz_drop_stext_block(ctx, block->u.text);
528 break;
529 case FZ_PAGE_BLOCK_IMAGE:
530 fz_drop_image_block(ctx, block->u.image);
531 break;
532 }
533 } 228 }
534 fz_free(ctx, page->blocks);
535 fz_free(ctx, page);
536} 229}
537 230
538static fz_stext_span * 231static int
539fz_new_stext_span(fz_context *ctx, const fz_point *p, int wmode, const fz_matrix *trm) 232sign_eq(float x, float y)
540{ 233{
541 fz_stext_span *span = fz_malloc_struct(ctx, fz_stext_span); 234 return (x < 0 && y < 0) || (x > 0 && y > 0) || (x == 0 && y == 0);
542 span->ascender_max = 0;
543 span->descender_min = 0;
544 span->cap = 0;
545 span->len = 0;
546 span->min = *p;
547 span->max = *p;
548 span->wmode = wmode;
549 span->transform.a = trm->a;
550 span->transform.b = trm->b;
551 span->transform.c = trm->c;
552 span->transform.d = trm->d;
553 span->transform.e = 0;
554 span->transform.f = 0;
555 span->text = NULL;
556 span->next = NULL;
557 return span;
558} 235}
559 236
560static void 237static int
561add_char_to_span(fz_context *ctx, fz_stext_span *span, int c, fz_point *p, fz_point *max, fz_stext_style *style) 238mat_sign_eq(const fz_matrix *x, const fz_matrix *y)
562{ 239{
563 if (span->len == span->cap) 240 return sign_eq(x->a, y->a) && sign_eq(x->b, y->b) && sign_eq(x->c, y->c) && sign_eq(x->d, y->d);
564 {
565 int newcap = (span->cap ? span->cap * 2 : 16);
566 span->text = fz_resize_array(ctx, span->text, newcap, sizeof(fz_stext_char));
567 span->cap = newcap;
568 span->bbox = fz_empty_rect;
569 }
570 span->max = *max;
571 if (style->ascender > span->ascender_max)
572 span->ascender_max = style->ascender;
573 if (style->descender < span->descender_min)
574 span->descender_min = style->descender;
575 span->text[span->len].c = c;
576 span->text[span->len].p = *p;
577 span->text[span->len].style = style;
578 span->len++;
579} 241}
580 242
581static void 243static void
582fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) 244fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode)
583{ 245{
584 int can_append = 1; 246 fz_stext_page *page = dev->page;
247 fz_stext_block *cur_block;
248 fz_stext_line *cur_line;
249
250 int new_para = 0;
251 int new_line = 1;
585 int add_space = 0; 252 int add_space = 0;
586 fz_point dir, ndir, p, q, r; 253 fz_point dir, ndir, p, q;
587 float size; 254 float size;
588 fz_point delta; 255 fz_point delta;
589 float spacing = 0; 256 float spacing = 0;
590 float base_offset = 0; 257 float base_offset = 0;
258 int rtl = 0;
259
260 dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir);
591 261
262 /* dir = direction vector for motion. ndir = normalised(dir) */
592 if (wmode == 0) 263 if (wmode == 0)
593 { 264 {
594 dir.x = 1; 265 dir.x = 1;
@@ -602,17 +273,16 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty
602 fz_transform_vector(&dir, trm); 273 fz_transform_vector(&dir, trm);
603 ndir = dir; 274 ndir = dir;
604 fz_normalize_vector(&ndir); 275 fz_normalize_vector(&ndir);
605 /* dir = direction vector for motion. ndir = normalised(dir) */
606 276
607 size = fz_matrix_expansion(trm); 277 size = fz_matrix_expansion(trm);
608 278
609 /* We need to identify where glyphs 'start' (p) and 'stop' (q). 279 /* We need to identify where glyphs 'start' (p) and 'stop' (q).
610 * Each glyph holds it's 'start' position, and the next glyph in the 280 * Each glyph holds its 'start' position, and the next glyph in the
611 * span (or span->max if there is no next glyph) holds it's 'end' 281 * span (or span->max if there is no next glyph) holds its 'end'
612 * position. 282 * position.
613 * 283 *
614 * For both horizontal and vertical motion, trm->{e,f} gives the 284 * For both horizontal and vertical motion, trm->{e,f} gives the
615 * bottom left corner of the glyph. 285 * origin (usually the bottom left) of the glyph.
616 * 286 *
617 * In horizontal mode: 287 * In horizontal mode:
618 * + p is bottom left. 288 * + p is bottom left.
@@ -636,37 +306,38 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty
636 q.y = trm->f; 306 q.y = trm->f;
637 } 307 }
638 308
639 if (glyph < 0) 309 /* Find current position to enter new text. */
310 cur_block = page->last_block;
311 if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
312 cur_block = NULL;
313 cur_line = cur_block ? cur_block->u.t.last_line : NULL;
314
315 if (cur_line && glyph < 0)
640 { 316 {
641 /* Don't reset 'pen' to start of no-glyph characters in cluster */ 317 /* Don't advance pen or break lines for no-glyph characters in a cluster */
642 if (dev->cur_span) 318 add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, 0);
643 q = dev->cur_span->max; 319 dev->lastchar = c;
644 goto no_glyph; 320 return;
645 } 321 }
646 322
647 if (dev->cur_span == NULL || 323 if (cur_line == NULL || !mat_sign_eq(trm, &dev->trm) || cur_line->wmode != wmode)
648 trm->a != dev->cur_span->transform.a || trm->b != dev->cur_span->transform.b ||
649 trm->c != dev->cur_span->transform.c || trm->d != dev->cur_span->transform.d ||
650 dev->cur_span->wmode != wmode)
651 { 324 {
652 /* If the matrix has changed, or the wmode is different (or 325 /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
653 * if we don't have a span at all), then we can't append. */ 326 * then we can't append to the current block/line. */
654#ifdef DEBUG_SPANS 327 new_para = 1;
655 printf("Transform/WMode changed\n"); 328 new_line = 1;
656#endif
657 can_append = 0;
658 } 329 }
659 else 330 else
660 { 331 {
661 delta.x = q.x - dev->cur_span->max.x; 332 /* Detect fake bold where text is printed twice in the same place. */
662 delta.y = q.y - dev->cur_span->max.y; 333 delta.x = q.x - dev->pen.x;
334 delta.y = q.y - dev->pen.y;
663 if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) 335 if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar)
664 return; 336 return;
665 337
666 /* Calculate how far we've moved since the end of the current 338 /* Calculate how far we've moved since the last character. */
667 * span. */ 339 delta.x = p.x - dev->pen.x;
668 delta.x = p.x - dev->cur_span->max.x; 340 delta.y = p.y - dev->pen.y;
669 delta.y = p.y - dev->cur_span->max.y;
670 341
671 /* The transform has not changed, so we know we're in the same 342 /* The transform has not changed, so we know we're in the same
672 * direction. Calculate 2 distances; how far off the previous 343 * direction. Calculate 2 distances; how far off the previous
@@ -675,102 +346,129 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty
675 spacing = ndir.x * delta.x + ndir.y * delta.y; 346 spacing = ndir.x * delta.x + ndir.y * delta.y;
676 base_offset = -ndir.y * delta.x + ndir.x * delta.y; 347 base_offset = -ndir.y * delta.x + ndir.x * delta.y;
677 348
678 spacing /= size * SPACE_DIST; 349 /* Only a small amount off the baseline - we'll take this */
679 if (fabsf(base_offset) < size * 0.1f) 350 if (fabsf(base_offset) < size * 0.8f)
680 { 351 {
681 /* Only a small amount off the baseline - we'll take this */ 352 /* LTR or neutral character */
682 if (fabsf(spacing) < 1.0f) 353 if (dev->curdir >= 0)
683 { 354 {
684 /* Motion is in line, and small. */ 355 if (fabs(spacing) < size * SPACE_DIST)
685 } 356 {
686 else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) 357 /* Motion is in line, and small. */
687 { 358 new_line = 0;
688 /* Motion is in line, but large enough 359 }
689 * to warrant us adding a space */ 360 else if (spacing >= size * SPACE_DIST && spacing < size * SPACE_MAX_DIST)
690 if (dev->lastchar != ' ' && wmode == 0) 361 {
691 add_space = 1; 362 /* Motion is in line, but large enough to warrant us adding a space. */
363 if (dev->lastchar != ' ' && wmode == 0)
364 add_space = 1;
365 new_line = 0;
366 }
367 else
368 {
369 /* Motion is in line, but large enough to warrant splitting to a new line */
370 new_line = 1;
371 }
692 } 372 }
373
374 /* RTL character -- disable space character and column detection heuristics */
693 else 375 else
694 { 376 {
695 /* Motion is in line, but too large - split to a new span */ 377 new_line = 0;
696 can_append = 0; 378 if (spacing > size * SPACE_DIST || spacing < 0)
379 rtl = 0; /* backward (or big jump to 'right' side) means logical order */
380 else
381 rtl = 1; /* visual order, we need to reverse in a post process pass */
697 } 382 }
698 } 383 }
384
385 /* Enough for a new line, but not enough for a new paragraph */
386 else if (fabsf(base_offset) < size * 1.3f)
387 {
388 /* Check indent to spot text-indent style paragraphs */
389 if (wmode == 0 && cur_line && dev->new_obj)
390 if (fabsf(p.x - dev->start.x) > size * 0.5f)
391 new_para = 1;
392 new_line = 1;
393 }
394
395 /* Way off the baseline - open a new paragraph */
699 else 396 else
700 { 397 {
701 can_append = 0; 398 new_para = 1;
702#ifdef DEBUG_SPANS 399 new_line = 1;
703 spacing = 0;
704#endif
705 } 400 }
706 } 401 }
707 402
708#ifdef DEBUG_SPANS 403 /* Start a new block (but only at the beginning of a text object) */
709 printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset); 404 if (new_para || !cur_block)
710#endif 405 {
406 cur_block = add_text_block_to_page(ctx, page);
407 cur_line = cur_block->u.t.last_line;
408 }
711 409
712 /* Start a new span */ 410 /* Start a new line */
713 if (!can_append) 411 if (new_line || !cur_line)
714 { 412 {
715 add_span_to_soup(ctx, dev->spans, dev->cur_span); 413 cur_line = add_line_to_block(ctx, page, cur_block, wmode);
716 dev->cur_span = NULL; 414 dev->start = p;
717 dev->cur_span = fz_new_stext_span(ctx, &p, wmode, trm);
718 dev->cur_span->spacing = 0;
719 } 415 }
720 416
721 /* Add synthetic space */ 417 /* Add synthetic space */
722 if (add_space) 418 if (add_space)
723 { 419 add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, rtl);
724 /* We know we always have a cur_span here */
725 r = dev->cur_span->max;
726 add_char_to_span(ctx, dev->cur_span, ' ', &r, &p, style);
727 }
728 420
729no_glyph: 421 add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, rtl);
730 add_char_to_span(ctx, dev->cur_span, c, &p, &q, style);
731 dev->lastchar = c; 422 dev->lastchar = c;
423 dev->pen = q;
424
425 dev->new_obj = 0;
426 dev->trm = *trm;
732} 427}
733 428
734static void 429static void
735fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) 430fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode)
736{ 431{
737 /* ignore when one unicode character maps to multiple glyphs */ 432 /* ignore when one unicode character maps to multiple glyphs */
738 if (c == -1) 433 if (c == -1)
739 return; 434 return;
740 435
741 if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) 436 if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
437 {
742 switch (c) 438 switch (c)
743 { 439 {
744 case 0xFB00: /* ff */ 440 case 0xFB00: /* ff */
745 fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); 441 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
746 fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); 442 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
747 return; 443 return;
748 case 0xFB01: /* fi */ 444 case 0xFB01: /* fi */
749 fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); 445 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
750 fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); 446 fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode);
751 return; 447 return;
752 case 0xFB02: /* fl */ 448 case 0xFB02: /* fl */
753 fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); 449 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
754 fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); 450 fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode);
755 return; 451 return;
756 case 0xFB03: /* ffi */ 452 case 0xFB03: /* ffi */
757 fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); 453 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
758 fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); 454 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
759 fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); 455 fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode);
760 return; 456 return;
761 case 0xFB04: /* ffl */ 457 case 0xFB04: /* ffl */
762 fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); 458 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
763 fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); 459 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
764 fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); 460 fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode);
765 return; 461 return;
766 case 0xFB05: /* long st */ 462 case 0xFB05: /* long st */
767 case 0xFB06: /* st */ 463 case 0xFB06: /* st */
768 fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode); 464 fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode);
769 fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode); 465 fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode);
770 return; 466 return;
771 } 467 }
468 }
772 469
773 if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) 470 if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
471 {
774 switch (c) 472 switch (c)
775 { 473 {
776 case 0x0009: /* tab */ 474 case 0x0009: /* tab */
@@ -794,56 +492,23 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style,
794 case 0x3000: /* ideographic space */ 492 case 0x3000: /* ideographic space */
795 c = ' '; 493 c = ' ';
796 } 494 }
495 }
797 496
798 fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode); 497 fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode);
799} 498}
800 499
801static void 500static void
802fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm, fz_stext_style *style) 501fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm)
803{ 502{
804 fz_font *font = span->font; 503 fz_font *font = span->font;
805 FT_Face face = fz_font_ft_face(ctx, font);
806 fz_buffer **t3procs = fz_font_t3_procs(ctx, font);
807 fz_rect *bbox = fz_font_bbox(ctx, font);
808 fz_matrix tm = span->trm; 504 fz_matrix tm = span->trm;
809 fz_matrix trm; 505 fz_matrix trm;
810 float adv; 506 float adv;
811 float ascender = 1; 507 int i;
812 float descender = 0;
813 int i, err;
814 508
815 if (span->len == 0) 509 if (span->len == 0)
816 return; 510 return;
817 511
818 if (dev->spans == NULL)
819 dev->spans = new_span_soup(ctx);
820
821 if (style->wmode == 0)
822 {
823 if (face)
824 {
825 fz_lock(ctx, FZ_LOCK_FREETYPE);
826 err = FT_Set_Char_Size(face, 64, 64, 72, 72);
827 if (err)
828 fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
829 ascender = (float)face->ascender / face->units_per_EM;
830 descender = (float)face->descender / face->units_per_EM;
831 fz_unlock(ctx, FZ_LOCK_FREETYPE);
832 }
833 else if (t3procs && !fz_is_empty_rect(bbox))
834 {
835 ascender = bbox->y1;
836 descender = bbox->y0;
837 }
838 }
839 else
840 {
841 ascender = bbox->x1;
842 descender = bbox->x0;
843 }
844 style->ascender = ascender;
845 style->descender = descender;
846
847 tm.e = 0; 512 tm.e = 0;
848 tm.f = 0; 513 tm.f = 0;
849 fz_concat(&trm, &tm, ctm); 514 fz_concat(&trm, &tm, ctm);
@@ -857,11 +522,11 @@ fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, cons
857 522
858 /* Calculate bounding box and new pen position based on font metrics */ 523 /* Calculate bounding box and new pen position based on font metrics */
859 if (span->items[i].gid >= 0) 524 if (span->items[i].gid >= 0)
860 adv = fz_advance_glyph(ctx, font, span->items[i].gid, style->wmode); 525 adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode);
861 else 526 else
862 adv = 0; 527 adv = 0;
863 528
864 fz_add_stext_char(ctx, dev, style, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode); 529 fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode);
865 } 530 }
866} 531}
867 532
@@ -870,13 +535,10 @@ fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, const f
870 fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) 535 fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params)
871{ 536{
872 fz_stext_device *tdev = (fz_stext_device*)dev; 537 fz_stext_device *tdev = (fz_stext_device*)dev;
873 fz_stext_style *style;
874 fz_text_span *span; 538 fz_text_span *span;
539 tdev->new_obj = 1;
875 for (span = text->head; span; span = span->next) 540 for (span = text->head; span; span = span->next)
876 { 541 fz_stext_extract(ctx, tdev, span, ctm);
877 style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, NULL);
878 fz_stext_extract(ctx, tdev, span, ctm, style);
879 }
880} 542}
881 543
882static void 544static void
@@ -884,94 +546,61 @@ fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const
884 fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) 546 fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params)
885{ 547{
886 fz_stext_device *tdev = (fz_stext_device*)dev; 548 fz_stext_device *tdev = (fz_stext_device*)dev;
887 fz_stext_style *style;
888 fz_text_span *span; 549 fz_text_span *span;
550 tdev->new_obj = 1;
889 for (span = text->head; span; span = span->next) 551 for (span = text->head; span; span = span->next)
890 { 552 fz_stext_extract(ctx, tdev, span, ctm);
891 style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, stroke);
892 fz_stext_extract(ctx, tdev, span, ctm, style);
893 }
894} 553}
895 554
896static void 555static void
897fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor) 556fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor)
898{ 557{
899 fz_stext_device *tdev = (fz_stext_device*)dev; 558 fz_stext_device *tdev = (fz_stext_device*)dev;
900 fz_stext_style *style;
901 fz_text_span *span; 559 fz_text_span *span;
560 tdev->new_obj = 1;
902 for (span = text->head; span; span = span->next) 561 for (span = text->head; span; span = span->next)
903 { 562 fz_stext_extract(ctx, tdev, span, ctm);
904 style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL);
905 fz_stext_extract(ctx, tdev, span, ctm, style);
906 }
907} 563}
908 564
909static void 565static void
910fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor) 566fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor)
911{ 567{
912 fz_stext_device *tdev = (fz_stext_device*)dev; 568 fz_stext_device *tdev = (fz_stext_device*)dev;
913 fz_stext_style *style;
914 fz_text_span *span; 569 fz_text_span *span;
570 tdev->new_obj = 1;
915 for (span = text->head; span; span = span->next) 571 for (span = text->head; span; span = span->next)
916 { 572 fz_stext_extract(ctx, tdev, span, ctm);
917 style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, stroke);
918 fz_stext_extract(ctx, tdev, span, ctm, style);
919 }
920} 573}
921 574
922static void 575static void
923fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm) 576fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm)
924{ 577{
925 fz_stext_device *tdev = (fz_stext_device*)dev; 578 fz_stext_device *tdev = (fz_stext_device*)dev;
926 fz_stext_style *style;
927 fz_text_span *span; 579 fz_text_span *span;
580 tdev->new_obj = 1;
928 for (span = text->head; span; span = span->next) 581 for (span = text->head; span; span = span->next)
929 { 582 fz_stext_extract(ctx, tdev, span, ctm);
930 style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL);
931 fz_stext_extract(ctx, tdev, span, ctm, style);
932 }
933} 583}
934 584
585/* Images and shadings */
586
935static void 587static void
936fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, 588fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params)
937 fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params)
938{ 589{
939 fz_stext_device *tdev = (fz_stext_device*)dev; 590 fz_stext_device *tdev = (fz_stext_device*)dev;
940 fz_stext_page *page = tdev->page;
941 fz_image_block *block;
942 591
943 /* If the alpha is less than 50% then it's probably a watermark or 592 /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
944 * effect or something. Skip it */
945 if (alpha < 0.5f) 593 if (alpha < 0.5f)
946 return; 594 return;
947 595
948 /* New block */ 596 add_image_block_to_page(ctx, tdev->page, ctm, img);
949 if (page->len == page->cap)
950 {
951 int newcap = (page->cap ? page->cap*2 : 4);
952 page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks));
953 page->cap = newcap;
954 }
955 block = fz_malloc_struct(ctx, fz_image_block);
956 page->blocks[page->len].type = FZ_PAGE_BLOCK_IMAGE;
957 page->blocks[page->len].u.image = block;
958 block->image = fz_keep_image(ctx, img);
959 block->cspace = fz_keep_colorspace(ctx, cspace);
960 if (cspace)
961 memcpy(block->colors, color, sizeof(block->colors[0])*fz_colorspace_n(ctx, cspace));
962 block->mat = *ctm;
963 block->bbox.x0 = 0;
964 block->bbox.y0 = 0;
965 block->bbox.x1 = 1;
966 block->bbox.y1 = 1;
967 fz_transform_rect(&block->bbox, ctm);
968 page->len++;
969} 597}
970 598
971static void 599static void
972fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params) 600fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm,
601 fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params)
973{ 602{
974 fz_stext_fill_image_mask(ctx, dev, img, ctm, NULL, NULL, alpha, color_params); 603 fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
975} 604}
976 605
977static fz_image * 606static fz_image *
@@ -1025,103 +654,89 @@ fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, const fz_m
1025 fz_rethrow(ctx); 654 fz_rethrow(ctx);
1026} 655}
1027 656
1028static int 657/* RTL visual to logical order pass */
1029direction_from_bidi_class(int bidiclass, int curdir)
1030{
1031 switch (bidiclass)
1032 {
1033 /* strong */
1034 case UCDN_BIDI_CLASS_L: return 1;
1035 case UCDN_BIDI_CLASS_R: return -1;
1036 case UCDN_BIDI_CLASS_AL: return -1;
1037
1038 /* weak */
1039 case UCDN_BIDI_CLASS_EN:
1040 case UCDN_BIDI_CLASS_ES:
1041 case UCDN_BIDI_CLASS_ET:
1042 case UCDN_BIDI_CLASS_AN:
1043 case UCDN_BIDI_CLASS_CS:
1044 case UCDN_BIDI_CLASS_NSM:
1045 case UCDN_BIDI_CLASS_BN:
1046 return curdir;
1047
1048 /* neutral */
1049 case UCDN_BIDI_CLASS_B:
1050 case UCDN_BIDI_CLASS_S:
1051 case UCDN_BIDI_CLASS_WS:
1052 case UCDN_BIDI_CLASS_ON:
1053 return curdir;
1054
1055 /* embedding, override, pop ... we don't support them */
1056 default:
1057 return 0;
1058 }
1059}
1060 658
1061static void 659static void
1062fz_bidi_reorder_run(fz_stext_span *span, int a, int b, int dir) 660fz_bidi_reorder_run(fz_stext_char *a, fz_stext_char *b, int dir)
1063{ 661{
1064 if (a < b && dir == -1) 662 if (a < b && dir == -1)
1065 { 663 {
1066 fz_stext_char c; 664 fz_stext_char tmp;
1067 int m = a + (b - a) / 2; 665 fz_stext_char *m = a + (b - a) / 2;
1068 while (a < m) 666 while (a < m)
1069 { 667 {
1070 b--; 668 b--;
1071 c = span->text[a]; 669
1072 span->text[a] = span->text[b]; 670 tmp.c = a->c;
1073 span->text[b] = c; 671 tmp.origin = a->origin;
672 tmp.bbox = a->bbox;
673 tmp.size = a->size;
674 tmp.font = a->font;
675
676 a->c = b->c;
677 a->origin = b->origin;
678 a->bbox = b->bbox;
679 a->size = b->size;
680 a->font = b->font;
681
682 b->c = tmp.c;
683 b->origin = tmp.origin;
684 b->bbox = tmp.bbox;
685 b->size = tmp.size;
686 b->font = tmp.font;
687
1074 a++; 688 a++;
1075 } 689 }
1076 } 690 }
1077} 691}
1078 692
1079static void 693static void
1080fz_bidi_reorder_span(fz_stext_span *span) 694fz_bidi_reorder_line(fz_stext_line *line)
1081{ 695{
1082 int a, b, dir, curdir; 696 fz_stext_char *a, *b;
697 int dir, curdir;
1083 698
1084 a = 0; 699 a = line->first_char;
1085 curdir = 1; 700 curdir = 0;
1086 for (b = 0; b < span->len; b++) 701 for (b = line->first_char; b; b = b->next)
1087 { 702 {
1088 dir = direction_from_bidi_class(ucdn_get_bidi_class(span->text[b].c), curdir); 703 dir = b->rtl;
1089 if (dir != curdir) 704 if (dir != curdir)
1090 { 705 {
1091 fz_bidi_reorder_run(span, a, b, curdir); 706 fz_bidi_reorder_run(a, b, curdir);
1092 curdir = dir; 707 curdir = dir;
1093 a = b; 708 a = b;
1094 } 709 }
1095 } 710 }
1096 fz_bidi_reorder_run(span, a, b, curdir); 711 fz_bidi_reorder_run(a, b, curdir);
1097} 712}
1098 713
1099static void 714static void
1100fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page) 715fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page)
1101{ 716{
1102 fz_page_block *pageblock;
1103 fz_stext_block *block; 717 fz_stext_block *block;
1104 fz_stext_line *line; 718 fz_stext_line *line;
1105 fz_stext_span *span;
1106 719
1107 for (pageblock = page->blocks; pageblock < page->blocks + page->len; pageblock++) 720 for (block = page->first_block; block; block = block->next)
1108 if (pageblock->type == FZ_PAGE_BLOCK_TEXT) 721 if (block->type == FZ_STEXT_BLOCK_TEXT)
1109 for (block = pageblock->u.text, line = block->lines; line < block->lines + block->len; line++) 722 for (line = block->u.t.first_line; line; line = line->next)
1110 for (span = line->first_span; span; span = span->next) 723 fz_bidi_reorder_line(line);
1111 fz_bidi_reorder_span(span);
1112} 724}
1113 725
1114static void 726static void
1115fz_stext_close_device(fz_context *ctx, fz_device *dev) 727fz_stext_close_device(fz_context *ctx, fz_device *dev)
1116{ 728{
1117 fz_stext_device *tdev = (fz_stext_device*)dev; 729 fz_stext_device *tdev = (fz_stext_device*)dev;
730 fz_stext_page *page = tdev->page;
731 fz_stext_block *block;
732 fz_stext_line *line;
1118 733
1119 add_span_to_soup(ctx, tdev->spans, tdev->cur_span); 734 for (block = page->first_block; block; block = block->next)
1120 tdev->cur_span = NULL; 735 if (block->type == FZ_STEXT_BLOCK_TEXT)
1121 736 for (line = block->u.t.first_line; line; line = line->next)
1122 strain_soup(ctx, tdev); 737 fz_union_rect(&block->bbox, &line->bbox);
1123 738
1124 /* TODO: smart sorting of blocks in reading order */ 739 /* TODO: smart sorting of blocks and lines in reading order */
1125 /* TODO: unicode NFC normalization */ 740 /* TODO: unicode NFC normalization */
1126 741
1127 fz_bidi_reorder_stext_page(ctx, tdev->page); 742 fz_bidi_reorder_stext_page(ctx, tdev->page);
@@ -1130,9 +745,6 @@ fz_stext_close_device(fz_context *ctx, fz_device *dev)
1130static void 745static void
1131fz_stext_drop_device(fz_context *ctx, fz_device *dev) 746fz_stext_drop_device(fz_context *ctx, fz_device *dev)
1132{ 747{
1133 fz_stext_device *tdev = (fz_stext_device*)dev;
1134 free_span_soup(ctx, tdev->spans);
1135 tdev->spans = NULL;
1136} 748}
1137 749
1138fz_stext_options * 750fz_stext_options *
@@ -1153,7 +765,7 @@ fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *stri
1153} 765}
1154 766
1155fz_device * 767fz_device *
1156fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts) 768fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
1157{ 769{
1158 fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); 770 fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
1159 771
@@ -1174,11 +786,12 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page,
1174 dev->super.fill_image_mask = fz_stext_fill_image_mask; 786 dev->super.fill_image_mask = fz_stext_fill_image_mask;
1175 } 787 }
1176 788
1177 dev->sheet = sheet;
1178 dev->page = page; 789 dev->page = page;
1179 dev->spans = NULL; 790 dev->pen.x = 0;
1180 dev->cur_span = NULL; 791 dev->pen.y = 0;
792 dev->trm = fz_identity;
1181 dev->lastchar = ' '; 793 dev->lastchar = ' ';
794 dev->curdir = 1;
1182 795
1183 return (fz_device*)dev; 796 return (fz_device*)dev;
1184} 797}
diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c
index 63124aa7f..f5f724121 100644
--- a/source/fitz/stext-output.c
+++ b/source/fitz/stext-output.c
@@ -9,40 +9,28 @@
9/* HTML output (visual formatting with preserved layout) */ 9/* HTML output (visual formatting with preserved layout) */
10 10
11static void 11static void
12fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style) 12fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size)
13{ 13{
14 int is_bold = fz_font_is_bold(ctx, style->font); 14 int is_bold = fz_font_is_bold(ctx, font);
15 int is_italic = fz_font_is_italic(ctx, style->font); 15 int is_italic = fz_font_is_italic(ctx, font);
16 int is_serif = fz_font_is_serif(ctx, style->font); 16 int is_serif = fz_font_is_serif(ctx, font);
17 int is_mono = fz_font_is_monospaced(ctx, style->font); 17 int is_mono = fz_font_is_monospaced(ctx, font);
18 int script = style->script;
19 18
20 fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", style->size); 19 fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", size);
21 if (is_mono) 20 if (is_mono)
22 fz_write_string(ctx, out, "<tt>"); 21 fz_write_string(ctx, out, "<tt>");
23 if (is_bold) 22 if (is_bold)
24 fz_write_string(ctx, out, "<b>"); 23 fz_write_string(ctx, out, "<b>");
25 if (is_italic) 24 if (is_italic)
26 fz_write_string(ctx, out, "<i>"); 25 fz_write_string(ctx, out, "<i>");
27
28 while (script-- > 0)
29 fz_write_string(ctx, out, "<sup>");
30 while (++script < 0)
31 fz_write_string(ctx, out, "<sub>");
32} 26}
33 27
34static void 28static void
35fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) 29fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size)
36{ 30{
37 int is_mono = fz_font_is_monospaced(ctx, style->font); 31 int is_mono = fz_font_is_monospaced(ctx, font);
38 int is_bold = fz_font_is_bold(ctx, style->font); 32 int is_bold = fz_font_is_bold(ctx,font);
39 int is_italic = fz_font_is_italic(ctx, style->font); 33 int is_italic = fz_font_is_italic(ctx, font);
40 int script = style->script;
41
42 while (script-- > 0)
43 fz_write_string(ctx, out, "</sup>");
44 while (++script < 0)
45 fz_write_string(ctx, out, "</sub>");
46 34
47 if (is_italic) 35 if (is_italic)
48 fz_write_string(ctx, out, "</i>"); 36 fz_write_string(ctx, out, "</i>");
@@ -54,7 +42,7 @@ fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
54} 42}
55 43
56static void 44static void
57fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block) 45fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
58{ 46{
59 int x = block->bbox.x0; 47 int x = block->bbox.x0;
60 int y = block->bbox.y0; 48 int y = block->bbox.y0;
@@ -62,90 +50,78 @@ fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *bl
62 int h = block->bbox.y1 - block->bbox.y0; 50 int h = block->bbox.y1 - block->bbox.y0;
63 51
64 fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h); 52 fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h);
65 fz_write_image_as_data_uri(ctx, out, block->image); 53 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
66 fz_write_string(ctx, out, "\">\n"); 54 fz_write_string(ctx, out, "\">\n");
67} 55}
68 56
69void 57void
70fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) 58fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
71{ 59{
72 fz_stext_style *style = NULL;
73 fz_stext_line *line; 60 fz_stext_line *line;
74 fz_stext_span *span;
75 fz_stext_char *ch; 61 fz_stext_char *ch;
76 int x, y; 62 int x, y;
77 63
78 style = NULL; 64 fz_font *font = NULL;
65 float size = 0;
79 66
80 for (line = block->lines; line < block->lines + block->len; ++line) 67 for (line = block->u.t.first_line; line; line = line->next)
81 { 68 {
82 for (span = line->first_span; span; span = span->next) 69 x = line->bbox.x0;
70 y = line->bbox.y0;
71
72 fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x);
73 font = NULL;
74
75 for (ch = line->first_char; ch; ch = ch->next)
83 { 76 {
84 if (span == line->first_span || span->spacing > 1) 77 if (ch->font != font || ch->size != size)
85 { 78 {
86 if (style) 79 if (font)
87 { 80 fz_print_style_end_html(ctx, out, font, size);
88 fz_print_style_end_html(ctx, out, style); 81 font = ch->font;
89 fz_write_string(ctx, out, "</p>\n"); 82 size = ch->size;
90 style = NULL; 83 fz_print_style_begin_html(ctx, out, font, size);
91 }
92 x = span->bbox.x0;
93 y = span->bbox.y0;
94 fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x);
95 } 84 }
96 85
97 for (ch = span->text; ch < span->text + span->len; ++ch) 86 switch (ch->c)
98 { 87 {
99 if (ch->style != style) 88 default:
100 { 89 if (ch->c >= 32 && ch->c <= 127)
101 if (style) 90 fz_write_byte(ctx, out, ch->c);
102 fz_print_style_end_html(ctx, out, style); 91 else
103 style = ch->style; 92 fz_write_printf(ctx, out, "&#x%x;", ch->c);
104 fz_print_style_begin_html(ctx, out, style); 93 break;
105 } 94 case '<': fz_write_string(ctx, out, "&lt;"); break;
106 95 case '>': fz_write_string(ctx, out, "&gt;"); break;
107 switch (ch->c) 96 case '&': fz_write_string(ctx, out, "&amp;"); break;
108 { 97 case '"': fz_write_string(ctx, out, "&quot;"); break;
109 default: 98 case '\'': fz_write_string(ctx, out, "&apos;"); break;
110 if (ch->c >= 32 && ch->c <= 127)
111 fz_write_byte(ctx, out, ch->c);
112 else
113 fz_write_printf(ctx, out, "&#x%x;", ch->c);
114 break;
115 case '<': fz_write_string(ctx, out, "&lt;"); break;
116 case '>': fz_write_string(ctx, out, "&gt;"); break;
117 case '&': fz_write_string(ctx, out, "&amp;"); break;
118 case '"': fz_write_string(ctx, out, "&quot;"); break;
119 case '\'': fz_write_string(ctx, out, "&apos;"); break;
120 }
121 } 99 }
122 } 100 }
123 101
124 if (style) 102 if (font)
125 { 103 fz_print_style_end_html(ctx, out, font, size);
126 fz_print_style_end_html(ctx, out, style); 104
127 fz_write_string(ctx, out, "</p>\n"); 105 fz_write_string(ctx, out, "</p>\n");
128 style = NULL;
129 }
130 } 106 }
131} 107}
132 108
133void 109void
134fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page) 110fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page)
135{ 111{
136 fz_page_block *block; 112 fz_stext_block *block;
137 113
138 int w = page->mediabox.x1 - page->mediabox.x0; 114 int w = page->mediabox.x1 - page->mediabox.x0;
139 int h = page->mediabox.y1 - page->mediabox.y0; 115 int h = page->mediabox.y1 - page->mediabox.y0;
140 116
141 fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h); 117 fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h);
142 118
143 for (block = page->blocks; block < page->blocks + page->len; ++block) 119 for (block = page->first_block; block; block = block->next)
144 { 120 {
145 if (block->type == FZ_PAGE_BLOCK_IMAGE) 121 if (block->type == FZ_STEXT_BLOCK_IMAGE)
146 fz_print_stext_image_as_html(ctx, out, block->u.image); 122 fz_print_stext_image_as_html(ctx, out, block);
147 else if (block->type == FZ_PAGE_BLOCK_TEXT) 123 else if (block->type == FZ_STEXT_BLOCK_TEXT)
148 fz_print_stext_block_as_html(ctx, out, block->u.text); 124 fz_print_stext_block_as_html(ctx, out, block);
149 } 125 }
150 126
151 fz_write_string(ctx, out, "</div>\n"); 127 fz_write_string(ctx, out, "</div>\n");
@@ -177,23 +153,22 @@ fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
177/* XHTML output (semantic, little layout, suitable for reflow) */ 153/* XHTML output (semantic, little layout, suitable for reflow) */
178 154
179static void 155static void
180fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block) 156fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
181{ 157{
182 int w = block->bbox.x1 - block->bbox.x0; 158 int w = block->bbox.x1 - block->bbox.x0;
183 int h = block->bbox.y1 - block->bbox.y0; 159 int h = block->bbox.y1 - block->bbox.y0;
184 160
185 fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h); 161 fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h);
186 fz_write_image_as_data_uri(ctx, out, block->image); 162 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
187 fz_write_string(ctx, out, "\"/></p>\n"); 163 fz_write_string(ctx, out, "\"/></p>\n");
188} 164}
189 165
190static void 166static void
191fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) 167fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size)
192{ 168{
193 int is_mono = fz_font_is_monospaced(ctx, style->font); 169 int is_mono = fz_font_is_monospaced(ctx, font);
194 int is_bold = fz_font_is_bold(ctx, style->font); 170 int is_bold = fz_font_is_bold(ctx, font);
195 int is_italic = fz_font_is_italic(ctx, style->font); 171 int is_italic = fz_font_is_italic(ctx, font);
196 int script = style->script;
197 172
198 if (is_mono) 173 if (is_mono)
199 fz_write_string(ctx, out, "<tt>"); 174 fz_write_string(ctx, out, "<tt>");
@@ -201,25 +176,14 @@ fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *styl
201 fz_write_string(ctx, out, "<b>"); 176 fz_write_string(ctx, out, "<b>");
202 if (is_italic) 177 if (is_italic)
203 fz_write_string(ctx, out, "<i>"); 178 fz_write_string(ctx, out, "<i>");
204
205 while (script-- > 0)
206 fz_write_string(ctx, out, "<sup>");
207 while (++script < 0)
208 fz_write_string(ctx, out, "<sub>");
209} 179}
210 180
211static void 181static void
212fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) 182fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size)
213{ 183{
214 int is_mono = fz_font_is_monospaced(ctx, style->font); 184 int is_mono = fz_font_is_monospaced(ctx, font);
215 int is_bold = fz_font_is_bold(ctx, style->font); 185 int is_bold = fz_font_is_bold(ctx, font);
216 int is_italic = fz_font_is_italic(ctx, style->font); 186 int is_italic = fz_font_is_italic(ctx, font);
217 int script = style->script;
218
219 while (script-- > 0)
220 fz_write_string(ctx, out, "</sup>");
221 while (++script < 0)
222 fz_write_string(ctx, out, "</sub>");
223 187
224 if (is_italic) 188 if (is_italic)
225 fz_write_string(ctx, out, "</i>"); 189 fz_write_string(ctx, out, "</i>");
@@ -232,68 +196,63 @@ fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
232static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) 196static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
233{ 197{
234 fz_stext_line *line; 198 fz_stext_line *line;
235 fz_stext_span *span;
236 fz_stext_char *ch; 199 fz_stext_char *ch;
237 fz_stext_style *style;
238 200
239 style = NULL; 201 fz_font *font = NULL;
240 fz_write_string(ctx, out, "<p>\n"); 202 float size = 0;
203
204 fz_write_string(ctx, out, "<p>");
241 205
242 for (line = block->lines; line < block->lines + block->len; ++line) 206 for (line = block->u.t.first_line; line; line = line->next)
243 { 207 {
244 if (line > block->lines) 208 if (line != block->u.t.first_line)
245 fz_write_string(ctx, out, "<br/>\n"); 209 fz_write_string(ctx, out, "\n");
246 for (span = line->first_span; span; span = span->next) 210 for (ch = line->first_char; ch; ch = ch->next)
247 { 211 {
248 if (span->spacing > 1) 212 if (ch->font != font || ch->size != size)
249 fz_write_byte(ctx, out, ' ');
250
251 for (ch = span->text; ch < span->text + span->len; ++ch)
252 { 213 {
253 if (ch->style != style) 214 if (font)
254 { 215 fz_print_style_end_xhtml(ctx, out, font, size);
255 if (style) 216 font = ch->font;
256 fz_print_style_end_xhtml(ctx, out, style); 217 size = ch->size;
257 style = ch->style; 218 fz_print_style_begin_xhtml(ctx, out, font, size);
258 fz_print_style_begin_xhtml(ctx, out, style); 219 }
259 }
260 220
261 switch (ch->c) 221 switch (ch->c)
262 { 222 {
263 default: 223 default:
264 if (ch->c >= 32 && ch->c <= 127) 224 if (ch->c >= 32 && ch->c <= 127)
265 fz_write_byte(ctx, out, ch->c); 225 fz_write_byte(ctx, out, ch->c);
266 else 226 else
267 fz_write_printf(ctx, out, "&#x%x;", ch->c); 227 fz_write_printf(ctx, out, "&#x%x;", ch->c);
268 break; 228 break;
269 case '<': fz_write_string(ctx, out, "&lt;"); break; 229 case '<': fz_write_string(ctx, out, "&lt;"); break;
270 case '>': fz_write_string(ctx, out, "&gt;"); break; 230 case '>': fz_write_string(ctx, out, "&gt;"); break;
271 case '&': fz_write_string(ctx, out, "&amp;"); break; 231 case '&': fz_write_string(ctx, out, "&amp;"); break;
272 case '"': fz_write_string(ctx, out, "&quot;"); break; 232 case '"': fz_write_string(ctx, out, "&quot;"); break;
273 case '\'': fz_write_string(ctx, out, "&apos;"); break; 233 case '\'': fz_write_string(ctx, out, "&apos;"); break;
274 }
275 } 234 }
276 } 235 }
277 } 236 }
278 237
279 if (style) 238 if (font)
280 fz_print_style_end_xhtml(ctx, out, style); 239 fz_print_style_end_xhtml(ctx, out, font, size);
281 fz_write_string(ctx, out, "\n</p>\n"); 240 fz_write_string(ctx, out, "</p>\n");
282} 241}
283 242
284void 243void
285fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page) 244fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page)
286{ 245{
287 fz_page_block *block; 246 fz_stext_block *block;
288 247
289 fz_write_string(ctx, out, "<div>\n"); 248 fz_write_string(ctx, out, "<div>\n");
290 249
291 for (block = page->blocks; block < page->blocks + page->len; ++block) 250 for (block = page->first_block; block; block = block->next)
292 { 251 {
293 if (block->type == FZ_PAGE_BLOCK_IMAGE) 252 if (block->type == FZ_STEXT_BLOCK_IMAGE)
294 fz_print_stext_image_as_xhtml(ctx, out, block->u.image); 253 fz_print_stext_image_as_xhtml(ctx, out, block);
295 else if (block->type == FZ_PAGE_BLOCK_TEXT) 254 else if (block->type == FZ_STEXT_BLOCK_TEXT)
296 fz_print_stext_block_as_xhtml(ctx, out, block->u.text); 255 fz_print_stext_block_as_xhtml(ctx, out, block);
297 } 256 }
298 257
299 fz_write_string(ctx, out, "</div>\n"); 258 fz_write_string(ctx, out, "</div>\n");
@@ -311,6 +270,7 @@ fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
311 fz_write_string(ctx, out, "<style>\n"); 270 fz_write_string(ctx, out, "<style>\n");
312 fz_write_string(ctx, out, "body{background-color:gray}\n"); 271 fz_write_string(ctx, out, "body{background-color:gray}\n");
313 fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n"); 272 fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n");
273 fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
314 fz_write_string(ctx, out, "</style>\n"); 274 fz_write_string(ctx, out, "</style>\n");
315 fz_write_string(ctx, out, "</head>\n"); 275 fz_write_string(ctx, out, "</head>\n");
316 fz_write_string(ctx, out, "<body>\n"); 276 fz_write_string(ctx, out, "<body>\n");
@@ -328,87 +288,79 @@ fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
328void 288void
329fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) 289fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
330{ 290{
331 int block_n; 291 fz_stext_block *block;
292 fz_stext_line *line;
293 fz_stext_char *ch;
332 294
333 fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n", 295 fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n",
334 page->mediabox.x1 - page->mediabox.x0, 296 page->mediabox.x1 - page->mediabox.x0,
335 page->mediabox.y1 - page->mediabox.y0); 297 page->mediabox.y1 - page->mediabox.y0);
336 298
337 for (block_n = 0; block_n < page->len; block_n++) 299 for (block = page->first_block; block; block = block->next)
338 { 300 {
339 switch (page->blocks[block_n].type) 301 switch (block->type)
340 {
341 case FZ_PAGE_BLOCK_TEXT:
342 { 302 {
343 fz_stext_block *block = page->blocks[block_n].u.text; 303 case FZ_STEXT_BLOCK_TEXT:
344 fz_stext_line *line;
345 const char *s;
346
347 fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n", 304 fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
348 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); 305 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
349 for (line = block->lines; line < block->lines + block->len; line++) 306 for (line = block->u.t.first_line; line; line = line->next)
350 { 307 {
351 fz_stext_span *span; 308 fz_font *font = NULL;
309 float size = 0;
310 const char *name = NULL;
311 const char *s;
312 fz_rect rect;
313
352 fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n", 314 fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n",
353 line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); 315 line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
354 for (span = line->first_span; span; span = span->next) 316
317 for (ch = line->first_char; ch; ch = ch->next)
355 { 318 {
356 fz_stext_style *style = NULL; 319 if (ch->font != font || ch->size != size)
357 const char *name = NULL; 320 {
358 int char_num; 321 if (font)
359 for (char_num = 0; char_num < span->len; char_num++) 322 fz_write_string(ctx, out, "</font>\n");
323 font = ch->font;
324 size = ch->size;
325 name = fz_font_name(ctx, font);
326 s = strchr(name, '+');
327 s = s ? s + 1 : name;
328 fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", s, size);
329 }
330 fz_stext_char_bbox(ctx, &rect, line, ch);
331 fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
332 rect.x0, rect.y0, rect.x1, rect.y1, ch->origin.x, ch->origin.y);
333 switch (ch->c)
360 { 334 {
361 fz_stext_char *ch = &span->text[char_num]; 335 case '<': fz_write_string(ctx, out, "&lt;"); break;
362 if (ch->style != style) 336 case '>': fz_write_string(ctx, out, "&gt;"); break;
363 { 337 case '&': fz_write_string(ctx, out, "&amp;"); break;
364 if (style) 338 case '"': fz_write_string(ctx, out, "&quot;"); break;
365 { 339 case '\'': fz_write_string(ctx, out, "&apos;"); break;
366 fz_write_string(ctx, out, "</span>\n"); 340 default:
367 } 341 if (ch->c >= 32 && ch->c <= 127)
368 style = ch->style; 342 fz_write_printf(ctx, out, "%c", ch->c);
369 name = fz_font_name(ctx, style->font); 343 else
370 s = strchr(name, '+'); 344 fz_write_printf(ctx, out, "&#x%x;", ch->c);
371 s = s ? s + 1 : name; 345 break;
372 fz_write_printf(ctx, out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n",
373 span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
374 s, style->size);
375 }
376 {
377 fz_rect rect;
378 fz_stext_char_bbox(ctx, &rect, span, char_num);
379 fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
380 rect.x0, rect.y0, rect.x1, rect.y1, ch->p.x, ch->p.y);
381 }
382 switch (ch->c)
383 {
384 case '<': fz_write_string(ctx, out, "&lt;"); break;
385 case '>': fz_write_string(ctx, out, "&gt;"); break;
386 case '&': fz_write_string(ctx, out, "&amp;"); break;
387 case '"': fz_write_string(ctx, out, "&quot;"); break;
388 case '\'': fz_write_string(ctx, out, "&apos;"); break;
389 default:
390 if (ch->c >= 32 && ch->c <= 127)
391 fz_write_printf(ctx, out, "%c", ch->c);
392 else
393 fz_write_printf(ctx, out, "&#x%x;", ch->c);
394 break;
395 }
396 fz_write_string(ctx, out, "\"/>\n");
397 } 346 }
398 if (style) 347 fz_write_string(ctx, out, "\"/>\n");
399 fz_write_string(ctx, out, "</span>\n");
400 } 348 }
349
350 if (font)
351 fz_write_string(ctx, out, "</font>\n");
352
401 fz_write_string(ctx, out, "</line>\n"); 353 fz_write_string(ctx, out, "</line>\n");
402 } 354 }
403 fz_write_string(ctx, out, "</block>\n"); 355 fz_write_string(ctx, out, "</block>\n");
404 break; 356 break;
405 } 357
406 case FZ_PAGE_BLOCK_IMAGE: 358 case FZ_STEXT_BLOCK_IMAGE:
407 { 359 fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
360 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
408 break; 361 break;
409 } 362 }
410 } 363 }
411 }
412 fz_write_string(ctx, out, "</page>\n"); 364 fz_write_string(ctx, out, "</page>\n");
413} 365}
414 366
@@ -417,31 +369,23 @@ fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
417void 369void
418fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) 370fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
419{ 371{
420 fz_page_block *pblock; 372 fz_stext_block *block;
373 fz_stext_line *line;
374 fz_stext_char *ch;
375 char utf[10];
376 int i, n;
421 377
422 for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock) 378 for (block = page->first_block; block; block = block->next)
423 { 379 {
424 if (pblock->type == FZ_PAGE_BLOCK_TEXT) 380 if (block->type == FZ_STEXT_BLOCK_TEXT)
425 { 381 {
426 fz_stext_block *block = pblock->u.text; 382 for (line = block->u.t.first_line; line; line = line->next)
427 fz_stext_line *line;
428 fz_stext_char *ch;
429 char utf[10];
430 int i, n;
431
432 for (line = block->lines; line < block->lines + block->len; line++)
433 { 383 {
434 fz_stext_span *span; 384 for (ch = line->first_char; ch; ch = ch->next)
435 for (span = line->first_span; span; span = span->next)
436 { 385 {
437 if (span->spacing > 1) 386 n = fz_runetochar(utf, ch->c);
438 fz_write_byte(ctx, out, ' '); 387 for (i = 0; i < n; i++)
439 for (ch = span->text; ch < span->text + span->len; ch++) 388 fz_write_byte(ctx, out, utf[i]);
440 {
441 n = fz_runetochar(utf, ch->c);
442 for (i = 0; i < n; i++)
443 fz_write_byte(ctx, out, utf[i]);
444 }
445 } 389 }
446 fz_write_string(ctx, out, "\n"); 390 fz_write_string(ctx, out, "\n");
447 } 391 }
@@ -466,7 +410,6 @@ struct fz_text_writer_s
466 fz_document_writer super; 410 fz_document_writer super;
467 int format; 411 int format;
468 fz_stext_options opts; 412 fz_stext_options opts;
469 fz_stext_sheet *sheet;
470 fz_stext_page *page; 413 fz_stext_page *page;
471 fz_output *out; 414 fz_output *out;
472}; 415};
@@ -483,7 +426,7 @@ text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediab
483 } 426 }
484 427
485 wri->page = fz_new_stext_page(ctx, mediabox); 428 wri->page = fz_new_stext_page(ctx, mediabox);
486 return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts); 429 return fz_new_stext_device(ctx, wri->page, &wri->opts);
487} 430}
488 431
489static void 432static void
@@ -537,7 +480,6 @@ text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
537{ 480{
538 fz_text_writer *wri = (fz_text_writer*)wri_; 481 fz_text_writer *wri = (fz_text_writer*)wri_;
539 fz_drop_stext_page(ctx, wri->page); 482 fz_drop_stext_page(ctx, wri->page);
540 fz_drop_stext_sheet(ctx, wri->sheet);
541 fz_drop_output(ctx, wri->out); 483 fz_drop_output(ctx, wri->out);
542} 484}
543 485
@@ -561,7 +503,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const
561 else if (!strcmp(format, "stext")) 503 else if (!strcmp(format, "stext"))
562 wri->format = FZ_FORMAT_STEXT; 504 wri->format = FZ_FORMAT_STEXT;
563 505
564 wri->sheet = fz_new_stext_sheet(ctx);
565 wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); 506 wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
566 507
567 switch (wri->format) 508 switch (wri->format)
@@ -581,7 +522,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const
581 fz_catch(ctx) 522 fz_catch(ctx)
582 { 523 {
583 fz_drop_output(ctx, wri->out); 524 fz_drop_output(ctx, wri->out);
584 fz_drop_stext_sheet(ctx, wri->sheet);
585 fz_free(ctx, wri); 525 fz_free(ctx, wri);
586 fz_rethrow(ctx); 526 fz_rethrow(ctx);
587 } 527 }
diff --git a/source/fitz/stext-paragraph.c b/source/fitz/stext-paragraph.c
deleted file mode 100644
index e275ecae1..000000000
--- a/source/fitz/stext-paragraph.c
+++ /dev/null
@@ -1,1538 +0,0 @@
1#include "mupdf/fitz.h"
2
3#include <string.h>
4#include <assert.h>
5#include <math.h>
6
7/* Assemble span soup into blocks and lines. */
8
9#define MY_EPSILON 0.001f
10
11#include <stdio.h> /* for debug printing */
12#undef DEBUG_LINE_HEIGHTS
13#undef DEBUG_MASKS
14#undef DEBUG_ALIGN
15#undef DEBUG_INDENTS
16
17#undef SPOT_LINE_NUMBERS
18
19typedef struct line_height_s
20{
21 float height;
22 int count;
23 fz_stext_style *style;
24} line_height;
25
26typedef struct line_heights_s
27{
28 fz_context *ctx;
29 int cap;
30 int len;
31 line_height *lh;
32} line_heights;
33
34static line_heights *
35new_line_heights(fz_context *ctx)
36{
37 line_heights *lh = fz_malloc_struct(ctx, line_heights);
38 lh->ctx = ctx;
39 return lh;
40}
41
42static void
43free_line_heights(line_heights *lh)
44{
45 if (!lh)
46 return;
47 fz_free(lh->ctx, lh->lh);
48 fz_free(lh->ctx, lh);
49}
50
51static void
52insert_line_height(line_heights *lh, fz_stext_style *style, float height)
53{
54 int i;
55
56#ifdef DEBUG_LINE_HEIGHTS
57 printf("style=%x height=%g\n", style, height);
58#endif
59
60 /* If we have one already, add it in */
61 for (i=0; i < lh->len; i++)
62 {
63 /* Match if we are within 5% */
64 if (lh->lh[i].style == style && lh->lh[i].height * 0.95f <= height && lh->lh[i].height * 1.05f >= height)
65 {
66 /* Ensure that the average height is correct */
67 lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1);
68 lh->lh[i].count++;
69 return;
70 }
71 }
72
73 /* Otherwise extend (if required) and add it */
74 if (lh->cap == lh->len)
75 {
76 int newcap = (lh->cap ? lh->cap * 2 : 4);
77 lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height));
78 lh->cap = newcap;
79 }
80
81 lh->lh[lh->len].count = 1;
82 lh->lh[lh->len].height = height;
83 lh->lh[lh->len].style = style;
84 lh->len++;
85}
86
87static void
88cull_line_heights(line_heights *lh)
89{
90 int i, j, k;
91
92#ifdef DEBUG_LINE_HEIGHTS
93 printf("Before culling:\n");
94 for (i = 0; i < lh->len; i++)
95 {
96 fz_stext_style *style = lh->lh[i].style;
97 printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
98 }
99#endif
100 for (i = 0; i < lh->len; i++)
101 {
102 fz_stext_style *style = lh->lh[i].style;
103 int count = lh->lh[i].count;
104 int max = i;
105
106 /* Find the max for this style */
107 for (j = i+1; j < lh->len; j++)
108 {
109 if (lh->lh[j].style == style && lh->lh[j].count > count)
110 {
111 max = j;
112 count = lh->lh[j].count;
113 }
114 }
115
116 /* Destroy all the ones other than the max */
117 if (max != i)
118 {
119 lh->lh[i].count = count;
120 lh->lh[i].height = lh->lh[max].height;
121 lh->lh[max].count = 0;
122 }
123 j = i+1;
124 for (k = j; k < lh->len; k++)
125 {
126 if (lh->lh[k].style != style)
127 lh->lh[j++] = lh->lh[k];
128 }
129 lh->len = j;
130 }
131#ifdef DEBUG_LINE_HEIGHTS
132 printf("After culling:\n");
133 for (i = 0; i < lh->len; i++)
134 {
135 fz_stext_style *style = lh->lh[i].style;
136 printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
137 }
138#endif
139}
140
141static float
142line_height_for_style(line_heights *lh, fz_stext_style *style)
143{
144 int i;
145
146 for (i=0; i < lh->len; i++)
147 {
148 if (lh->lh[i].style == style)
149 return lh->lh[i].height;
150 }
151 return 0.0f; /* Never reached */
152}
153
154static void
155split_block(fz_context *ctx, fz_stext_page *page, int block_num, int linenum)
156{
157 int split_len;
158 fz_stext_block *block, *block2;
159
160 if (page->len == page->cap)
161 {
162 int new_cap = fz_maxi(16, page->cap * 2);
163 page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
164 page->cap = new_cap;
165 }
166
167 memmove(page->blocks+block_num+1, page->blocks+block_num, (page->len - block_num)*sizeof(*page->blocks));
168 page->len++;
169
170 block2 = fz_malloc_struct(ctx, fz_stext_block);
171 block = page->blocks[block_num].u.text;
172
173 page->blocks[block_num+1].type = FZ_PAGE_BLOCK_TEXT;
174 page->blocks[block_num+1].u.text = block2;
175 split_len = block->len - linenum;
176 block2->bbox = block->bbox; /* FIXME! */
177 block2->cap = 0;
178 block2->len = 0;
179 block2->lines = NULL;
180 block2->lines = fz_malloc_array(ctx, split_len, sizeof(fz_stext_line));
181 block2->cap = block2->len;
182 block2->len = split_len;
183 block->len = linenum;
184 memcpy(block2->lines, block->lines + linenum, split_len * sizeof(fz_stext_line));
185 block2->lines[0].distance = 0;
186}
187
/* Return 1 if c is one of the code points this file treats as
 * whitespace, 0 otherwise. */
static inline int
is_unicode_wspace(int c)
{
	switch (c)
	{
	case 0x09:	/* Horizontal tab */
	case 0x0a:	/* Line feed */
	case 0x0b:	/* Vertical tab */
	case 0x0c:	/* Form feed */
	case 0x0d:	/* Carriage return */
	case 0x20:	/* Space */
	case 0x85:	/* Next line (NEL) */
	case 0xA0:	/* No-break space */
	case 0x1680:	/* Ogham space mark */
	case 0x180E:	/* Mongolian vowel separator */
	case 0x2000:	/* En quad */
	case 0x2001:	/* Em quad */
	case 0x2002:	/* En space */
	case 0x2003:	/* Em space */
	case 0x2004:	/* Three-per-em space */
	case 0x2005:	/* Four-per-em space */
	case 0x2006:	/* Six-per-em space */
	case 0x2007:	/* Figure space */
	case 0x2008:	/* Punctuation space */
	case 0x2009:	/* Thin space */
	case 0x200A:	/* Hair space */
	case 0x2028:	/* Line separator */
	case 0x2029:	/* Paragraph separator */
	case 0x202F:	/* Narrow no-break space */
	case 0x205F:	/* Medium mathematical space */
	case 0x3000:	/* Ideographic space */
		return 1;
	default:
		return 0;
	}
}
218
/* Return 1 if c is a character that is taken to introduce a bulleted
 * list item, 0 otherwise. Code 149 and '*' are not true bullets but
 * are accepted for the purposes of this file. */
static inline int
is_unicode_bullet(int c)
{
	switch (c)
	{
	case 0x2022:	/* Bullet */
	case 0x2023:	/* Triangular bullet */
	case 0x25e6:	/* White bullet */
	case 0x2043:	/* Hyphen bullet */
	case 0x2219:	/* Bullet operator */
	case 149:	/* Bullet in 8-bit ("Ascii") encodings */
	case '*':
		return 1;
	default:
		return 0;
	}
}
231
232#ifdef SPOT_LINE_NUMBERS
/* Return 1 if c is an ASCII digit or a full stop, 0 otherwise. */
static inline int
is_number(int c)
{
	if (c == '.')
		return 1;
	return (c >= '0' && c <= '9');
}
239
/* Return 1 if c is an unaccented ASCII letter (A-Z or a-z), 0 otherwise. */
static inline int
is_latin_char(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	return (c >= 'A' && c <= 'Z');
}
246
/* Return 1 if c is one of the letters i, v, x, l, c or m in either
 * case, 0 otherwise. Note that 'd' is not in the accepted set. */
static inline int
is_roman(int c)
{
	switch (c)
	{
	case 'i': case 'I':
	case 'v': case 'V':
	case 'x': case 'X':
	case 'l': case 'L':
	case 'c': case 'C':
	case 'm': case 'M':
		return 1;
	default:
		return 0;
	}
}
257#endif
258
259static int
260is_list_entry(fz_stext_line *line, fz_stext_span *span, int *char_num_ptr)
261{
262 int char_num;
263 fz_stext_char *chr;
264
265 /* First, skip over any whitespace */
266 for (char_num = 0; char_num < span->len; char_num++)
267 {
268 chr = &span->text[char_num];
269 if (!is_unicode_wspace(chr->c))
270 break;
271 }
272 *char_num_ptr = char_num;
273
274 if (span != line->first_span || char_num >= span->len)
275 return 0;
276
277 /* Now we check for various special cases, which we consider to mean
278 * that this is probably a list entry and therefore should always count
279 * as a separate paragraph (and hence not be entered in the line height
280 * table). */
281 chr = &span->text[char_num];
282
283 /* Is the first char on the line, a bullet point? */
284 if (is_unicode_bullet(chr->c))
285 return 1;
286
287#ifdef SPOT_LINE_NUMBERS
288 /* Is the entire first span a number? Or does it start with a number
289 * followed by ) or : ? Allowed to involve single latin chars too. */
290 if (is_number(chr->c) || is_latin_char(chr->c))
291 {
292 int cn = char_num;
293 int met_char = is_latin_char(chr->c);
294 for (cn = char_num+1; cn < span->len; cn++)
295 {
296 fz_stext_char *chr2 = &span->text[cn];
297
298 if (is_latin_char(chr2->c) && !met_char)
299 {
300 met_char = 1;
301 continue;
302 }
303 met_char = 0;
304 if (!is_number(chr2->c) && !is_unicode_wspace(chr2->c))
305 break;
306 else if (chr2->c == ')' || chr2->c == ':')
307 {
308 cn = span->len;
309 break;
310 }
311 }
312 if (cn == span->len)
313 return 1;
314 }
315
316 /* Is the entire first span a roman numeral? Or does it start with
317 * a roman numeral followed by ) or : ? */
318 if (is_roman(chr->c))
319 {
320 int cn = char_num;
321 for (cn = char_num+1; cn < span->len; cn++)
322 {
323 fz_stext_char *chr2 = &span->text[cn];
324
325 if (!is_roman(chr2->c) && !is_unicode_wspace(chr2->c))
326 break;
327 else if (chr2->c == ')' || chr2->c == ':')
328 {
329 cn = span->len;
330 break;
331 }
332 }
333 if (cn == span->len)
334 return 1;
335 }
336#endif
337 return 0;
338}
339
340typedef struct region_masks_s region_masks;
341
342typedef struct region_mask_s region_mask;
343
344typedef struct region_s region;
345
/* One occupied interval along a line's baseline direction. */
struct region_s
{
	float start;		/* Lowest projected position covered */
	float stop;		/* Highest projected position covered */
	float ave_start;	/* Frequency-weighted average start across merged masks */
	float ave_stop;		/* Frequency-weighted average stop across merged masks */
	int align;		/* 0 = left, 1 = centre, 2 = right */
	float colw;		/* Share of the mask's total width, as a percentage */
};
355
356struct region_mask_s
357{
358 fz_context *ctx;
359 int freq;
360 fz_point blv;
361 int cap;
362 int len;
363 float size;
364 region *mask;
365};
366
367struct region_masks_s
368{
369 fz_context *ctx;
370 int cap;
371 int len;
372 region_mask **mask;
373};
374
375static region_masks *
376new_region_masks(fz_context *ctx)
377{
378 region_masks *rms = fz_malloc_struct(ctx, region_masks);
379 rms->ctx = ctx;
380 rms->cap = 0;
381 rms->len = 0;
382 rms->mask = NULL;
383 return rms;
384}
385
386static void
387free_region_mask(region_mask *rm)
388{
389 if (!rm)
390 return;
391 fz_free(rm->ctx, rm->mask);
392 fz_free(rm->ctx, rm);
393}
394
395static void
396free_region_masks(region_masks *rms)
397{
398 int i;
399
400 if (!rms)
401 return;
402 for (i=0; i < rms->len; i++)
403 {
404 free_region_mask(rms->mask[i]);
405 }
406 fz_free(rms->ctx, rms->mask);
407 fz_free(rms->ctx, rms);
408}
409
/* Test whether two region masks can be merged into one.
 *
 * Returns 0 if they cannot: either the baseline vectors differ by
 * MY_EPSILON or more in a component, or a pair of overlapping regions
 * would also touch a neighbouring region, or overlapping regions agree
 * on neither edge nor centre. Otherwise returns the number of regions
 * the merged mask would contain.
 * NOTE(review): two empty masks also yield 0, which a caller cannot
 * tell apart from "not mergeable".
 *
 * *score is always written. It accumulates the widths of the
 * non-overlapping regions passed over inside the loop, plus the edge
 * differences of each overlapping pair (differences below 1 count as
 * 0). Regions left over after the loop add to the count but not to the
 * score. A lower score indicates a closer fit. */
410static int region_masks_mergeable(const region_mask *rm1, const region_mask *rm2, float *score)
411{
412 int i1, i2;
413 int count = 0;
414
415 *score = 0;
416 if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
417 return 0;
418
/* Walk both sorted region lists in step. */
419 for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
420 {
421 if (rm1->mask[i1].stop < rm2->mask[i2].start)
422 {
423 /* rm1's region is entirely before rm2's */
424 *score += rm1->mask[i1].stop - rm1->mask[i1].start;
425 i1++;
426 }
427 else if (rm1->mask[i1].start > rm2->mask[i2].stop)
428 {
429 /* rm2's region is entirely before rm1's */
430 *score += rm2->mask[i2].stop - rm2->mask[i2].start;
431 i2++;
432 }
433 else
434 {
/* The two regions overlap. The union must not reach into the
 * previous or next region of the other mask. */
435 float lscore, rscore;
436 if (rm1->mask[i1].start < rm2->mask[i2].start)
437 {
438 if (i2 > 0 && rm2->mask[i2-1].stop >= rm1->mask[i1].start)
439 return 0; /* Not compatible */
440 lscore = rm2->mask[i2].start - rm1->mask[i1].start;
441 }
442 else
443 {
444 if (i1 > 0 && rm1->mask[i1-1].stop >= rm2->mask[i2].start)
445 return 0; /* Not compatible */
446 lscore = rm1->mask[i1].start - rm2->mask[i2].start;
447 }
448 if (rm1->mask[i1].stop > rm2->mask[i2].stop)
449 {
450 if (i2+1 < rm2->len && rm2->mask[i2+1].start <= rm1->mask[i1].stop)
451 return 0; /* Not compatible */
452 rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
453 }
454 else
455 {
456 if (i1+1 < rm1->len && rm1->mask[i1+1].start <= rm2->mask[i2].stop)
457 return 0; /* Not compatible */
458 rscore = rm2->mask[i2].stop - rm1->mask[i1].stop;
459 }
460 /* In order to allow a region to merge, either the
461 * left, the right, or the centre must agree */
462 if (lscore < 1)
463 {
464 if (rscore < 1)
465 {
466 rscore = 0;
467 }
468 lscore = 0;
469 }
470 else if (rscore < 1)
471 {
472 rscore = 0;
473 }
474 else
475 {
476 /* Neither Left or right agree. Does the centre? */
/* ave1/ave2 are start+stop sums (twice the centre), so the
 * centres must lie within 0.5 of each other. */
477 float ave1 = rm1->mask[i1].start + rm1->mask[i1].stop;
478 float ave2 = rm2->mask[i2].start + rm2->mask[i2].stop;
479 if (fabsf(ave1-ave2) > 1)
480 {
481 /* Nothing agrees, so don't merge */
482 return 0;
483 }
484 lscore = 0;
485 rscore = 0;
486 }
487 *score += lscore + rscore;
488 /* These two regions could be merged */
489 i1++;
490 i2++;
491 }
/* Each pass produces exactly one region of the merged mask. */
492 count++;
493 }
/* Whatever remains in either list is carried over unchanged. */
494 count += rm1->len-i1 + rm2->len-i2;
495 return count;
496}
497
/* Test whether the line mask rm2 fits within the (merged) mask rm1.
 *
 * Returns 0 if it does not fit: the baseline vectors differ by
 * MY_EPSILON or more in a component, or a region of rm2 is not wholly
 * contained in the rm1 region it is paired with, or leftover regions
 * overlap the end of the other mask. Otherwise returns a closeness
 * value of at least 1 that grows with the number of region edges (or
 * centres) that agree to within 1.
 *
 * *score is always written. It accumulates the widths of rm1 regions
 * skipped inside the loop, plus the edge slack of each matched pair. A
 * lower score indicates a closer fit.
 *
 * NOTE(review): the checks after the loop index mask[len-1] of the
 * other mask, so they appear to rely on neither mask being empty when
 * the other still has regions left - confirm against callers. */
498static int region_mask_matches(const region_mask *rm1, const region_mask *rm2, float *score)
499{
500 int i1, i2;
501 int close = 1;
502
503 *score = 0;
504 if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
505 return 0;
506
507 for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
508 {
509 if (rm1->mask[i1].stop < rm2->mask[i2].start)
510 {
511 /* rm1's region is entirely before rm2's */
512 *score += rm1->mask[i1].stop - rm1->mask[i1].start;
513 i1++;
514 }
515 else if (rm1->mask[i1].start > rm2->mask[i2].stop)
516 {
517 /* Not compatible */
518 return 0;
519 }
520 else
521 {
/* Overlap: rm2's region must sit entirely inside rm1's. */
522 float lscore, rscore;
523 if (rm1->mask[i1].start > rm2->mask[i2].start)
524 {
525 /* Not compatible */
526 return 0;
527 }
528 if (rm1->mask[i1].stop < rm2->mask[i2].stop)
529 {
530 /* Not compatible */
531 return 0;
532 }
533 lscore = rm2->mask[i2].start - rm1->mask[i1].start;
534 rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
/* Both edges close: +2. One edge close: +1. Neither edge close
 * but the slack on each side is similar (centred): +1. */
535 if (lscore < 1)
536 {
537 if (rscore < 1)
538 close++;
539 close++;
540 }
541 else if (rscore < 1)
542 close++;
543 else if (fabsf(lscore - rscore) < 1)
544 {
545 lscore = fabsf(lscore-rscore);
546 rscore = 0;
547 close++;
548 }
549 *score += lscore + rscore;
550 i1++;
551 i2++;
552 }
553 }
554 if (i1 < rm1->len)
555 {
556 /* Still more to go in rm1 */
557 if (rm1->mask[i1].start < rm2->mask[rm2->len-1].stop)
558 return 0;
559 }
560 else if (i2 < rm2->len)
561 {
562 /* Still more to go in rm2 */
563 if (rm2->mask[i2].start < rm1->mask[rm1->len-1].stop)
564 return 0;
565 }
566
567 return close;
568}
569
/* Merge the regions of rm2 into rm1, in place.
 *
 * rm1:    destination mask; its region array is grown if needed.
 * rm2:    mask being merged in; it is only read.
 * newlen: number of regions the merged mask will contain. The caller
 *         in this file passes the value returned by
 *         region_masks_mergeable for the same pair.
 *
 * Overlapping regions are replaced by their union, with ave_start and
 * ave_stop set to the frequency-weighted average of the two inputs'
 * start and stop. Non-overlapping regions are copied across unchanged.
 * Afterwards rm1->freq is the sum of both frequencies and rm1->len is
 * newlen. */
570static void region_mask_merge(region_mask *rm1, const region_mask *rm2, int newlen)
571{
572 int o, i1, i2;
573
574 /* First, ensure that rm1 is long enough */
575 if (rm1->cap < newlen)
576 {
/* Start from the current capacity (or 2) and keep doubling, always
 * at least once, until newlen fits. */
577 int newcap = rm1->cap ? rm1->cap : 2;
578 do
579 {
580 newcap *= 2;
581 }
582 while (newcap < newlen);
583 rm1->mask = fz_resize_array(rm1->ctx, rm1->mask, newcap, sizeof(*rm1->mask));
584 rm1->cap = newcap;
585 }
586
587 /* Now run backwards along rm1, filling it out with the merged regions */
/* Writing from the top down lets rm1 be merged in place: the output
 * slot is expected never to be below the next unread rm1 slot. */
588 for (o = newlen-1, i1 = rm1->len-1, i2 = rm2->len-1; o >= 0; o--)
589 {
590 /* So we read from i1 and i2 and store in o */
591 if (i1 < 0)
592 {
593 /* Just copy i2 */
594 rm1->mask[o] = rm2->mask[i2];
595 i2--;
596 }
597 else if (i2 < 0)
598 {
599 /* Just copy i1 */
600 rm1->mask[o] = rm1->mask[i1];
601 i1--;
602 }
603 else if (rm1->mask[i1].stop < rm2->mask[i2].start)
604 {
605 /* rm1's region is entirely before rm2's - copy rm2's */
606 rm1->mask[o] = rm2->mask[i2];
607 i2--;
608 }
609 else if (rm2->mask[i2].stop < rm1->mask[i1].start)
610 {
611 /* rm2's region is entirely before rm1's - copy rm1's */
612 rm1->mask[o] = rm1->mask[i1];
613 i1--;
614 }
615 else
616 {
617 /* We must be merging */
/* The averages are computed from start/stop before those two
 * fields are overwritten, which matters when o == i1. */
618 rm1->mask[o].ave_start = (rm1->mask[i1].start * rm1->freq + rm2->mask[i2].start * rm2->freq)/(rm1->freq + rm2->freq);
619 rm1->mask[o].ave_stop = (rm1->mask[i1].stop * rm1->freq + rm2->mask[i2].stop * rm2->freq)/(rm1->freq + rm2->freq);
620 rm1->mask[o].start = fz_min(rm1->mask[i1].start, rm2->mask[i2].start);
621 rm1->mask[o].stop = fz_max(rm1->mask[i1].stop, rm2->mask[i2].stop);
622 i1--;
623 i2--;
624 }
625 }
626 rm1->freq += rm2->freq;
627 rm1->len = newlen;
628}
629
630static region_mask *region_masks_match(const region_masks *rms, const region_mask *rm, fz_stext_line *line, region_mask *prev_match)
631{
632 int i;
633 float best_score = 9999999;
634 float score;
635 int best = -1;
636 int best_count = 0;
637
638 /* If the 'previous match' matches, use it regardless. */
639 if (prev_match && region_mask_matches(prev_match, rm, &score))
640 {
641 return prev_match;
642 }
643
644 /* Run through and find the 'most compatible' region mask. We are
645 * guaranteed that there will always be at least one compatible one!
646 */
647 for (i=0; i < rms->len; i++)
648 {
649 int count = region_mask_matches(rms->mask[i], rm, &score);
650 if (count > best_count || (count == best_count && (score < best_score || best == -1)))
651 {
652 best = i;
653 best_score = score;
654 best_count = count;
655 }
656 }
657 assert(best >= 0 && best < rms->len);
658
659 /* So we have the matching mask. */
660 return rms->mask[best];
661}
662
663#ifdef DEBUG_MASKS
664static void
665dump_region_mask(const region_mask *rm)
666{
667 int j;
668 for (j = 0; j < rm->len; j++)
669 {
670 printf("%g->%g ", rm->mask[j].start, rm->mask[j].stop);
671 }
672 printf("* %d\n", rm->freq);
673}
674
675static void
676dump_region_masks(const region_masks *rms)
677{
678 int i;
679
680 for (i = 0; i < rms->len; i++)
681 {
682 region_mask *rm = rms->mask[i];
683 dump_region_mask(rm);
684 }
685}
686#endif
687
688static void region_masks_add(region_masks *rms, region_mask *rm)
689{
690 /* Add rm to rms */
691 if (rms->len == rms->cap)
692 {
693 int newcap = (rms->cap ? rms->cap * 2 : 4);
694 rms->mask = fz_resize_array(rms->ctx, rms->mask, newcap, sizeof(*rms->mask));
695 rms->cap = newcap;
696 }
697 rms->mask[rms->len] = rm;
698 rms->len++;
699}
700
701static void region_masks_sort(region_masks *rms)
702{
703 int i, j;
704
705 /* First calculate sizes */
706 for (i=0; i < rms->len; i++)
707 {
708 region_mask *rm = rms->mask[i];
709 float size = 0;
710 for (j=0; j < rm->len; j++)
711 {
712 size += rm->mask[j].stop - rm->mask[j].start;
713 }
714 rm->size = size;
715 }
716
717 /* Now, sort on size */
718 /* FIXME: bubble sort - use heapsort for efficiency */
719 for (i=0; i < rms->len-1; i++)
720 {
721 for (j=i+1; j < rms->len; j++)
722 {
723 if (rms->mask[i]->size < rms->mask[j]->size)
724 {
725 region_mask *tmp = rms->mask[i];
726 rms->mask[i] = rms->mask[j];
727 rms->mask[j] = tmp;
728 }
729 }
730 }
731}
732
733static void region_masks_merge(region_masks *rms, region_mask *rm)
734{
735 int i;
736 float best_score = 9999999;
737 float score;
738 int best = -1;
739 int best_count = 0;
740
741#ifdef DEBUG_MASKS
742 printf("\nAdding:\n");
743 dump_region_mask(rm);
744 printf("To:\n");
745 dump_region_masks(rms);
746#endif
747 for (i=0; i < rms->len; i++)
748 {
749 int count = region_masks_mergeable(rms->mask[i], rm, &score);
750 if (count && (score < best_score || best == -1))
751 {
752 best = i;
753 best_count = count;
754 best_score = score;
755 }
756 }
757 if (best != -1)
758 {
759 region_mask_merge(rms->mask[best], rm, best_count);
760#ifdef DEBUG_MASKS
761 printf("Merges to give:\n");
762 dump_region_masks(rms);
763#endif
764 free_region_mask(rm);
765 return;
766 }
767 region_masks_add(rms, rm);
768#ifdef DEBUG_MASKS
769 printf("Adding new one to give:\n");
770 dump_region_masks(rms);
771#endif
772}
773
774static region_mask *
775new_region_mask(fz_context *ctx, const fz_point *blv)
776{
777 region_mask *rm = fz_malloc_struct(ctx, region_mask);
778 rm->ctx = ctx;
779 rm->freq = 1;
780 rm->blv = *blv;
781 rm->cap = 0;
782 rm->len = 0;
783 rm->mask = NULL;
784 return rm;
785}
786
787static void
788region_mask_project(const region_mask *rm, const fz_point *min, const fz_point *max, float *start, float *end)
789{
790 /* We project min and max down onto the blv */
791 float s = min->x * rm->blv.x + min->y * rm->blv.y;
792 float e = max->x * rm->blv.x + max->y * rm->blv.y;
793 if (s > e)
794 {
795 *start = e;
796 *end = s;
797 }
798 else
799 {
800 *start = s;
801 *end = e;
802 }
803}
804
805static void
806region_mask_add(region_mask *rm, const fz_point *min, const fz_point *max)
807{
808 float start, end;
809 int i, j;
810
811 region_mask_project(rm, min, max, &start, &end);
812
813 /* Now add start/end into our region list. Typically we will be adding
814 * to the end of the region list, so search from there backwards. */
815 for (i = rm->len; i > 0;)
816 {
817 if (start > rm->mask[i-1].stop)
818 break;
819 i--;
820 }
821 /* So we know that our interval can only affect list items >= i.
822 * We know that start is after our previous end. */
823 if (i == rm->len || end < rm->mask[i].start)
824 {
825 /* Insert new one. No overlap. No merging */
826 if (rm->len == rm->cap)
827 {
828 int newcap = (rm->cap ? rm->cap * 2 : 4);
829 rm->mask = fz_resize_array(rm->ctx, rm->mask, newcap, sizeof(*rm->mask));
830 rm->cap = newcap;
831 }
832 if (rm->len > i)
833 memmove(&rm->mask[i+1], &rm->mask[i], (rm->len - i) * sizeof(*rm->mask));
834 rm->mask[i].ave_start = start;
835 rm->mask[i].ave_stop = end;
836 rm->mask[i].start = start;
837 rm->mask[i].stop = end;
838 rm->len++;
839 }
840 else
841 {
842 /* Extend current one down. */
843 rm->mask[i].ave_start = start;
844 rm->mask[i].start = start;
845 if (rm->mask[i].stop < end)
846 {
847 rm->mask[i].stop = end;
848 rm->mask[i].ave_stop = end;
849 /* Our region may now extend upwards too far */
850 i++;
851 j = i;
852 while (j < rm->len && rm->mask[j].start <= end)
853 {
854 rm->mask[i-1].stop = end = rm->mask[j].stop;
855 j++;
856 }
857 if (i != j)
858 {
859 /* Move everything from j down to i */
860 while (j < rm->len)
861 {
862 rm->mask[i++] = rm->mask[j++];
863 }
864 }
865 rm->len -= j-i;
866 }
867 }
868}
869
870static int
871region_mask_column(region_mask *rm, const fz_point *min, const fz_point *max, int *align, float *colw, float *left_)
872{
873 float start, end, left, right;
874 int i;
875
876 region_mask_project(rm, min, max, &start, &end);
877
878 for (i = 0; i < rm->len; i++)
879 {
880 /* The use of MY_EPSILON here is because we might be matching
881 * start/end values calculated with slightly different blv's */
882 if (rm->mask[i].start - MY_EPSILON <= start && rm->mask[i].stop + MY_EPSILON >= end)
883 break;
884 }
885 if (i >= rm->len)
886 {
887 *align = 0;
888 *colw = 0;
889 return 0;
890 }
891 left = start - rm->mask[i].start;
892 right = rm->mask[i].stop - end;
893 if (left < 1 && right < 1)
894 *align = rm->mask[i].align;
895 else if (left*2 <= right)
896 *align = 0; /* Left */
897 else if (right * 2 < left)
898 *align = 2; /* Right */
899 else
900 *align = 1;
901 *left_ = left;
902 *colw = rm->mask[i].colw;
903 return i;
904}
905
906static void
907region_mask_alignment(region_mask *rm)
908{
909 int i;
910 float width = 0;
911
912 for (i = 0; i < rm->len; i++)
913 {
914 width += rm->mask[i].stop - rm->mask[i].start;
915 }
916 for (i = 0; i < rm->len; i++)
917 {
918 region *r = &rm->mask[i];
919 float left = r->ave_start - r->start;
920 float right = r->stop - r->ave_stop;
921 if (left*2 <= right)
922 r->align = 0; /* Left */
923 else if (right * 2 < left)
924 r->align = 2; /* Right */
925 else
926 r->align = 1;
927 r->colw = 100 * (rm->mask[i].stop - rm->mask[i].start) / width;
928 }
929}
930
931static void
932region_masks_alignment(region_masks *rms)
933{
934 int i;
935
936 for (i = 0; i < rms->len; i++)
937 {
938 region_mask_alignment(rms->mask[i]);
939 }
940}
941
/* Return 1 if c is a hyphen that may mark a word broken across lines,
 * 0 otherwise. 0x2011 (non-breaking hyphen) and 0x2043 (hyphen bullet)
 * are deliberately not accepted. */
static int
is_unicode_hyphen(int c)
{
	switch (c)
	{
	case 0x002d:	/* Hyphen-minus ('-') */
	case 0x00ad:	/* Soft hyphen */
	case 0x058a:	/* Armenian hyphen */
	case 0x1400:	/* Canadian syllabics hyphen */
	case 0x1806:	/* Mongolian todo soft hyphen */
	case 0x2010:	/* Hyphen */
		return 1;
	default:
		return 0;
	}
}
955
/* Return 1 if c is a letter that may appear on either side of a
 * line-break hyphen, 0 otherwise. The accepted ranges are an ad-hoc
 * collection and may need tuning. */
static int
is_unicode_hyphenatable(int c)
{
	/* Inclusive code point ranges, lowest first. */
	static const int ranges[][2] =
	{
		{ 'A', 'Z' },
		{ 'a', 'z' },
		{ 0x00c0, 0x00d6 },
		{ 0x00d8, 0x00f6 },
		{ 0x00f8, 0x02af },
		{ 0x1d00, 0x1dbf },
		{ 0x1e00, 0x1eff },
		{ 0x2c60, 0x2c7f },
		{ 0xa722, 0xa78e },
		{ 0xa790, 0xa793 },
		{ 0xa7a8, 0xa7af },
		{ 0xfb00, 0xfb07 },
		{ 0xff21, 0xff3a },
		{ 0xff41, 0xff5a },
	};
	int count = (int)(sizeof(ranges) / sizeof(ranges[0]));
	int k;

	for (k = 0; k < count; k++)
	{
		if (c >= ranges[k][0] && c <= ranges[k][1])
			return 1;
	}
	return 0;
}
975
976static void
977dehyphenate(fz_stext_span *s1, fz_stext_span *s2)
978{
979 int i;
980
981 for (i = s1->len-1; i > 0; i--)
982 if (!is_unicode_wspace(s1->text[i].c))
983 break;
984 /* Can't leave an empty span. */
985 if (i == 0)
986 return;
987
988 if (!is_unicode_hyphen(s1->text[i].c))
989 return;
990 if (!is_unicode_hyphenatable(s1->text[i-1].c))
991 return;
992 if (!is_unicode_hyphenatable(s2->text[0].c))
993 return;
994 s1->len = i;
995 s2->spacing = 0;
996}
997
998#ifdef DEBUG_ALIGN
/* Debug hook for printing one span. The body is empty, so calling it
 * prints nothing; it exists so that the DEBUG_ALIGN and DEBUG_MASKS
 * call sites compile. */
999static void
1000dump_span(fz_stext_span *span)
1001{
1002}
1003
1004static void
1005dump_line(fz_stext_line *line)
1006{
1007 fz_stext_span *span;
1008
1009 if (!line)
1010 return;
1011 printf("d=%g: ", line->distance);
1012
1013 span = line->first_span;
1014 while (span)
1015 {
1016 dump_span(span);
1017 span = span->next;
1018 }
1019
1020 printf("\n");
1021}
1022#endif
1023
1024void
1025fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page)
1026{
1027 fz_stext_line *line;
1028 fz_stext_span *span;
1029 line_heights *lh;
1030 region_masks *rms;
1031 int block_num;
1032
1033 /* Simple paragraph analysis; look for the most common 'inter line'
1034 * spacing. This will be assumed to be our line spacing. Anything
1035 * more than 25% wider than this will be assumed to be a paragraph
1036 * space. */
1037
1038 /* Step 1: Gather the line height information */
1039 lh = new_line_heights(ctx);
1040 for (block_num = 0; block_num < page->len; block_num++)
1041 {
1042 fz_stext_block *block;
1043
1044 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
1045 continue;
1046 block = page->blocks[block_num].u.text;
1047
1048 for (line = block->lines; line < block->lines + block->len; line++)
1049 {
1050 /* For every style in the line, add lineheight to the
1051 * record for that style. FIXME: This is a nasty n^2
1052 * algorithm at the moment. */
1053 fz_stext_style *style = NULL;
1054
1055 if (line->distance == 0)
1056 continue;
1057
1058 for (span = line->first_span; span; span = span->next)
1059 {
1060 int char_num;
1061
1062 if (is_list_entry(line, span, &char_num))
1063 goto list_entry;
1064
1065 for (; char_num < span->len; char_num++)
1066 {
1067 fz_stext_char *chr = &span->text[char_num];
1068
1069 /* Ignore any whitespace chars */
1070 if (is_unicode_wspace(chr->c))
1071 continue;
1072
1073 if (chr->style != style)
1074 {
1075 /* Have we had this style before? */
1076 int match = 0;
1077 fz_stext_span *span2;
1078 for (span2 = line->first_span; span2 != span; span2 = span2->next)
1079 {
1080 int char_num2;
1081 for (char_num2 = 0; char_num2 < span2->len; char_num2++)
1082 {
1083 fz_stext_char *chr2 = &span2->text[char_num2];
1084 if (chr2->style == chr->style)
1085 {
1086 match = 1;
1087 break;
1088 }
1089 }
1090 }
1091 if (char_num > 0 && match == 0)
1092 {
1093 fz_stext_span *span2 = span;
1094 int char_num2;
1095 for (char_num2 = 0; char_num2 < char_num; char_num2++)
1096 {
1097 fz_stext_char *chr2 = &span2->text[char_num2];
1098 if (chr2->style == chr->style)
1099 {
1100 match = 1;
1101 break;
1102 }
1103 }
1104 }
1105 if (match == 0)
1106 insert_line_height(lh, chr->style, line->distance);
1107 style = chr->style;
1108 }
1109 }
1110list_entry:
1111 {}
1112 }
1113 }
1114 }
1115
1116 /* Step 2: Find the most popular line height for each style */
1117 cull_line_heights(lh);
1118
1119 /* Step 3: Run through the blocks, breaking each block into two if
1120 * the line height isn't right. */
1121 for (block_num = 0; block_num < page->len; block_num++)
1122 {
1123 int line_num;
1124 fz_stext_block *block;
1125
1126 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
1127 continue;
1128 block = page->blocks[block_num].u.text;
1129
1130 for (line_num = 0; line_num < block->len; line_num++)
1131 {
1132 /* For every style in the line, check to see if lineheight
1133 * is correct for that style. FIXME: We check each style
1134 * more than once, currently. */
1135 int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. */
1136 fz_stext_style *style = NULL;
1137 line = &block->lines[line_num];
1138
1139 if (line->distance == 0)
1140 continue;
1141
1142#ifdef DEBUG_LINE_HEIGHTS
1143 printf("line height=%g\n", line->distance);
1144#endif
1145 for (span = line->first_span; span; span = span->next)
1146 {
1147 int char_num;
1148
1149 if (is_list_entry(line, span, &char_num))
1150 goto force_paragraph;
1151
1152 /* Now we do the rest of the line */
1153 for (; char_num < span->len; char_num++)
1154 {
1155 fz_stext_char *chr = &span->text[char_num];
1156
1157 /* Ignore any whitespace chars */
1158 if (is_unicode_wspace(chr->c))
1159 continue;
1160
1161 if (chr->style != style)
1162 {
1163 float proper_step = line_height_for_style(lh, chr->style);
1164 if (proper_step * 0.95f <= line->distance && line->distance <= proper_step * 1.05f)
1165 {
1166 ok = 1;
1167 break;
1168 }
1169 style = chr->style;
1170 }
1171 }
1172 if (ok)
1173 break;
1174 }
1175 if (!ok)
1176 {
1177force_paragraph:
1178 split_block(ctx, page, block_num, line_num);
1179 break;
1180 }
1181 }
1182 }
1183 free_line_heights(lh);
1184
1185 /* Simple line region analysis:
1186 * For each line:
1187 * form a list of 'start/stop' points (henceforth a 'region mask')
1188 * find the normalised baseline vector for the line.
1189 * Store the region mask and baseline vector.
1190 * Collate lines that have compatible region masks and identical
1191 * baseline vectors.
1192 * If the collated masks are column-like, then split into columns.
1193 * Otherwise split into tables.
1194 */
1195 rms = new_region_masks(ctx);
1196
1197 /* Step 1: Form the region masks and store them into a list with the
1198 * normalised baseline vectors. */
1199 for (block_num = 0; block_num < page->len; block_num++)
1200 {
1201 fz_stext_block *block;
1202
1203 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
1204 continue;
1205 block = page->blocks[block_num].u.text;
1206
1207 for (line = block->lines; line < block->lines + block->len; line++)
1208 {
1209 fz_point blv;
1210 region_mask *rm;
1211
1212#ifdef DEBUG_MASKS
1213 printf("Line: ");
1214 dump_line(line);
1215#endif
1216 blv = line->first_span->max;
1217 blv.x -= line->first_span->min.x;
1218 blv.y -= line->first_span->min.y;
1219 fz_normalize_vector(&blv);
1220
1221 rm = new_region_mask(ctx, &blv);
1222 for (span = line->first_span; span; span = span->next)
1223 {
1224 fz_point *region_min = &span->min;
1225 fz_point *region_max = &span->max;
1226
1227 /* Treat adjacent spans as one big region */
1228 while (span->next && span->next->spacing < 1.5f)
1229 {
1230 span = span->next;
1231 region_max = &span->max;
1232 }
1233
1234 region_mask_add(rm, region_min, region_max);
1235 }
1236#ifdef DEBUG_MASKS
1237 dump_region_mask(rm);
1238#endif
1239 region_masks_add(rms, rm);
1240 }
1241 }
1242
1243 /* Step 2: Sort the region_masks by size of masked region */
1244 region_masks_sort(rms);
1245
1246#ifdef DEBUG_MASKS
1247 printf("Sorted list of regions:\n");
1248 dump_region_masks(rms);
1249#endif
1250 /* Step 3: Merge the region masks where possible (large ones first) */
1251 {
1252 int i;
1253 region_masks *rms2;
1254 rms2 = new_region_masks(ctx);
1255 for (i=0; i < rms->len; i++)
1256 {
1257 region_mask *rm = rms->mask[i];
1258 rms->mask[i] = NULL;
1259 region_masks_merge(rms2, rm);
1260 }
1261 free_region_masks(rms);
1262 rms = rms2;
1263 }
1264
1265#ifdef DEBUG_MASKS
1266 printf("Merged list of regions:\n");
1267 dump_region_masks(rms);
1268#endif
1269
1270 /* Step 4: Figure out alignment */
1271 region_masks_alignment(rms);
1272
1273 /* Step 5: At this point, we should probably look at the region masks
1274 * to try to guess which ones represent columns on the page. With our
1275 * current code, we could only get blocks of lines that span 2 or more
1276 * columns if the PDF producer wrote text out horizontally across 2
1277 * or more columns, and we've never seen that (yet!). So we skip this
1278 * step for now. */
1279
1280 /* Step 6: Run through the lines again, deciding which ones fit into
1281 * which region mask. */
1282 {
1283 region_mask *prev_match = NULL;
1284 for (block_num = 0; block_num < page->len; block_num++)
1285 {
1286 fz_stext_block *block;
1287
1288 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
1289 continue;
1290 block = page->blocks[block_num].u.text;
1291
1292 for (line = block->lines; line < block->lines + block->len; line++)
1293 {
1294 fz_point blv;
1295 region_mask *rm;
1296 region_mask *match;
1297
1298 blv = line->first_span->max;
1299 blv.x -= line->first_span->min.x;
1300 blv.y -= line->first_span->min.y;
1301 fz_normalize_vector(&blv);
1302
1303#ifdef DEBUG_MASKS
1304 dump_line(line);
1305#endif
1306 rm = new_region_mask(ctx, &blv);
1307 for (span = line->first_span; span; span = span->next)
1308 {
1309 fz_point *region_min = &span->min;
1310 fz_point *region_max = &span->max;
1311
1312 /* Treat adjacent spans as one big region */
1313 while (span->next && span->next->spacing < 1.5f)
1314 {
1315 span = span->next;
1316 region_max = &span->max;
1317 }
1318
1319 region_mask_add(rm, region_min, region_max);
1320 }
1321#ifdef DEBUG_MASKS
1322 printf("Mask: ");
1323 dump_region_mask(rm);
1324#endif
1325 match = region_masks_match(rms, rm, line, prev_match);
1326 prev_match = match;
1327#ifdef DEBUG_MASKS
1328 printf("Matches: ");
1329 dump_region_mask(match);
1330#endif
1331 free_region_mask(rm);
1332 span = line->first_span;
1333 while (span)
1334 {
1335 fz_point *region_min = &span->min;
1336 fz_point *region_max = &span->max;
1337 fz_stext_span *sn;
1338 int col, align;
1339 float colw, left;
1340
1341 /* Treat adjacent spans as one big region */
1342#ifdef DEBUG_ALIGN
1343 dump_span(span);
1344#endif
1345 for (sn = span->next; sn && sn->spacing < 1.5f; sn = sn->next)
1346 {
1347 region_max = &sn->max;
1348#ifdef DEBUG_ALIGN
1349 dump_span(sn);
1350#endif
1351 }
1352 col = region_mask_column(match, region_min, region_max, &align, &colw, &left);
1353#ifdef DEBUG_ALIGN
1354 printf(" = col%d colw=%g align=%d\n", col, colw, align);
1355#endif
1356 do
1357 {
1358 span->column = col;
1359 span->align = align;
1360 span->indent = left;
1361 span->column_width = colw;
1362 span = span->next;
1363 }
1364 while (span != sn);
1365
1366 if (span)
1367 span = span->next;
1368 }
1369 line->region = match;
1370 }
1371 }
1372 free_region_masks(rms);
1373 }
1374
1375 /* Step 7: Collate lines within a block that share the same region
1376 * mask. */
1377 for (block_num = 0; block_num < page->len; block_num++)
1378 {
1379 int line_num;
1380 int prev_line_num;
1381
1382 fz_stext_block *block;
1383
1384 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
1385 continue;
1386 block = page->blocks[block_num].u.text;
1387
1388 /* First merge lines. This may leave empty lines behind. */
1389 for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++)
1390 {
1391 fz_stext_line *prev_line;
1392 line = &block->lines[line_num];
1393 if (!line->first_span)
1394 continue;
1395 prev_line = &block->lines[prev_line_num];
1396 if (prev_line->region == line->region)
1397 {
1398 /* We only merge lines if the second line
1399 * only uses 1 of the columns. */
1400 int col = line->first_span->column;
1401 /* Copy the left value for the first span
1402 * in the first column in this line forward
1403 * for all the rest of the spans in the same
1404 * column. */
1405 float indent = line->first_span->indent;
1406 for (span = line->first_span->next; span; span = span->next)
1407 {
1408 if (col != span->column)
1409 break;
1410 span->indent = indent;
1411 }
1412 if (span)
1413 {
1414 prev_line_num = line_num;
1415 continue;
1416 }
1417
1418 /* Merge line into prev_line */
1419 {
1420 fz_stext_span **prev_line_span = &prev_line->first_span;
1421 int try_dehyphen = -1;
1422 fz_stext_span *prev_span = NULL;
1423 span = line->first_span;
1424 while (span && *prev_line_span)
1425 {
1426 /* Skip forwards through the original
1427 * line, until we find a place where
1428 * span should go. */
1429 if ((*prev_line_span)->column <= span->column)
1430 {
1431 /* The current span we are considering
1432 * in prev_line is earlier than span.
1433 * Just skip forwards in prev_line. */
1434 prev_span = (*prev_line_span);
1435 prev_line_span = &prev_span->next;
1436 try_dehyphen = span->column;
1437 }
1438 else
1439 {
1440 /* We want to copy span into prev_line. */
1441 fz_stext_span *next = (*prev_line_span)->next;
1442
1443 if (prev_line_span == &prev_line->first_span)
1444 prev_line->first_span = span;
1445 if (next == NULL)
1446 prev_line->last_span = span;
1447 if (try_dehyphen == span->column)
1448 dehyphenate(prev_span, span);
1449 try_dehyphen = -1;
1450 prev_span = *prev_line_span = span;
1451 span = span->next;
1452 (*prev_line_span)->next = next;
1453 prev_line_span = &(*prev_line_span)->next;
1454 }
1455 }
1456 if (span)
1457 {
1458 *prev_line_span = span;
1459 prev_line->last_span = line->last_span;
1460 }
1461
1462 line->first_span = NULL;
1463 line->last_span = NULL;
1464 }
1465 }
1466 else
1467 prev_line_num = line_num;
1468 }
1469
1470 /* Now get rid of the empty lines */
1471 for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++)
1472 {
1473 line = &block->lines[line_num];
1474 if (line->first_span)
1475 block->lines[prev_line_num++] = *line;
1476 }
1477 block->len = prev_line_num;
1478
1479 /* Now try to spot indents */
1480 for (line_num = 0; line_num < block->len; line_num++)
1481 {
1482 fz_stext_span *span_num, *sn;
1483 int col, count;
1484 line = &block->lines[line_num];
1485
1486 /* Run through the spans... */
1487 span_num = line->first_span;
1488 {
1489 float indent = 0;
1490 /* For each set of spans that share the same
1491 * column... */
1492 col = span_num->column;
1493#ifdef DEBUG_INDENTS
1494 printf("Indent %g: ", span_num->indent);
1495 dump_span(span_num);
1496 printf("\n");
1497#endif
1498
1499 /* find the average indent of all but the first.. */
1500 for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++)
1501 {
1502#ifdef DEBUG_INDENTS
1503 printf("Indent %g: ", sn->indent);
1504 dump_span(sn);
1505 printf("\n");
1506#endif
1507 indent += sn->indent;
1508 sn->indent = 0;
1509 }
1510 if (sn != span_num->next)
1511 indent /= count;
1512
1513 /* And compare this indent with the first one... */
1514#ifdef DEBUG_INDENTS
1515 printf("Average indent %g ", indent);
1516#endif
1517 indent -= span_num->indent;
1518#ifdef DEBUG_INDENTS
1519 printf("delta %g ", indent);
1520#endif
1521 if (fabsf(indent) < 1)
1522 {
1523 /* No indent worth speaking of */
1524 indent = 0;
1525 }
1526#ifdef DEBUG_INDENTS
1527 printf("recorded %g\n", indent);
1528#endif
1529 span_num->indent = indent;
1530 span_num = sn;
1531 }
1532 for (; span_num; span_num = span_num->next)
1533 {
1534 span_num->indent = 0;
1535 }
1536 }
1537 }
1538}
diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c
index 00705208f..6c30ea29a 100644
--- a/source/fitz/stext-search.c
+++ b/source/fitz/stext-search.c
@@ -18,30 +18,28 @@ static inline int iswhite(int c)
18 18
19fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx) 19fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx)
20{ 20{
21 int block_num; 21 fz_stext_block *block;
22 fz_stext_line *line;
23 fz_stext_char *ch;
22 int ofs = 0; 24 int ofs = 0;
23 25
24 for (block_num = 0; block_num < page->len; block_num++) 26 for (block = page->first_block; block; block = block->next)
25 { 27 {
26 fz_stext_block *block; 28 if (block->type != FZ_STEXT_BLOCK_TEXT)
27 fz_stext_line *line;
28 fz_stext_span *span;
29
30 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
31 continue; 29 continue;
32 block = page->blocks[block_num].u.text; 30 for (line = block->u.t.first_line; line; line = line->next)
33 for (line = block->lines; line < block->lines + block->len; line++)
34 { 31 {
35 for (span = line->first_span; span; span = span->next) 32 for (ch = line->first_char; ch; ch = ch->next)
36 { 33 {
37 if (idx < ofs + span->len) 34 if (ofs == idx)
38 { 35 {
39 cab->c = span->text[idx - ofs].c; 36 cab->c = ch->c;
40 fz_stext_char_bbox(ctx, &cab->bbox, span, idx - ofs); 37 fz_stext_char_bbox(ctx, &cab->bbox, line, ch);
41 return cab; 38 return cab;
42 } 39 }
43 ofs += span->len; 40 ++ofs;
44 } 41 }
42
45 /* pseudo-newline */ 43 /* pseudo-newline */
46 if (idx == ofs) 44 if (idx == ofs)
47 { 45 {
@@ -49,7 +47,7 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex
49 cab->c = ' '; 47 cab->c = ' ';
50 return cab; 48 return cab;
51 } 49 }
52 ofs++; 50 ++ofs;
53 } 51 }
54 } 52 }
55 cab->bbox = fz_empty_rect; 53 cab->bbox = fz_empty_rect;
@@ -73,27 +71,23 @@ static fz_rect *bboxat(fz_context *ctx, fz_stext_page *page, int idx, fz_rect *b
73 71
74static int textlen_stext(fz_context *ctx, fz_stext_page *page) 72static int textlen_stext(fz_context *ctx, fz_stext_page *page)
75{ 73{
74 fz_stext_block *block;
75 fz_stext_line *line;
76 fz_stext_char *ch;
76 int len = 0; 77 int len = 0;
77 int block_num;
78 78
79 for (block_num = 0; block_num < page->len; block_num++) 79 for (block = page->first_block; block; block = block->next)
80 { 80 {
81 fz_stext_block *block; 81 if (block->type != FZ_STEXT_BLOCK_TEXT)
82 fz_stext_line *line;
83 fz_stext_span *span;
84
85 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
86 continue; 82 continue;
87 block = page->blocks[block_num].u.text; 83 for (line = block->u.t.first_line; line; line = line->next)
88 for (line = block->lines; line < block->lines + block->len; line++)
89 { 84 {
90 for (span = line->first_span; span; span = span->next) 85 for (ch = line->first_char; ch; ch = ch->next)
91 { 86 ++len;
92 len += span->len; 87 ++len; /* pseudo-newline */
93 }
94 len++; /* pseudo-newline */
95 } 88 }
96 } 89 }
90
97 return len; 91 return len;
98} 92}
99 93
@@ -181,8 +175,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re
181 fz_rect linebox, charbox; 175 fz_rect linebox, charbox;
182 fz_stext_block *block; 176 fz_stext_block *block;
183 fz_stext_line *line; 177 fz_stext_line *line;
184 fz_stext_span *span; 178 fz_stext_char *ch;
185 int i, block_num, hit_count; 179 int hit_count;
186 180
187 float x0 = rect.x0; 181 float x0 = rect.x0;
188 float x1 = rect.x1; 182 float x1 = rect.x1;
@@ -191,31 +185,27 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re
191 185
192 hit_count = 0; 186 hit_count = 0;
193 187
194 for (block_num = 0; block_num < page->len; block_num++) 188 for (block = page->first_block; block; block = block->next)
195 { 189 {
196 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) 190 if (block->type != FZ_STEXT_BLOCK_TEXT)
197 continue; 191 continue;
198 block = page->blocks[block_num].u.text; 192 for (line = block->u.t.first_line; line; line = line->next)
199 for (line = block->lines; line < block->lines + block->len; line++)
200 { 193 {
201 linebox = fz_empty_rect; 194 linebox = fz_empty_rect;
202 for (span = line->first_span; span; span = span->next) 195 for (ch = line->first_char; ch; ch = ch->next)
203 { 196 {
204 for (i = 0; i < span->len; i++) 197 fz_stext_char_bbox(ctx, &charbox, line, ch);
198 if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1)
205 { 199 {
206 fz_stext_char_bbox(ctx, &charbox, span, i); 200 if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5)
207 if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1)
208 { 201 {
209 if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5) 202 if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
210 { 203 hit_bbox[hit_count++] = linebox;
211 if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) 204 linebox = charbox;
212 hit_bbox[hit_count++] = linebox; 205 }
213 linebox = charbox; 206 else
214 } 207 {
215 else 208 fz_union_rect(&linebox, &charbox);
216 {
217 fz_union_rect(&linebox, &charbox);
218 }
219 } 209 }
220 } 210 }
221 } 211 }
@@ -232,8 +222,11 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect)
232{ 222{
233 fz_buffer *buffer; 223 fz_buffer *buffer;
234 fz_rect hitbox; 224 fz_rect hitbox;
235 int c, i, block_num, seen = 0; 225 int c, seen = 0;
236 unsigned char *s; 226 unsigned char *s;
227 fz_stext_block *block;
228 fz_stext_line *line;
229 fz_stext_char *ch;
237 230
238 float x0 = rect.x0; 231 float x0 = rect.x0;
239 float x1 = rect.x1; 232 float x1 = rect.x1;
@@ -242,41 +235,33 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect)
242 235
243 buffer = fz_new_buffer(ctx, 1024); 236 buffer = fz_new_buffer(ctx, 1024);
244 237
245 for (block_num = 0; block_num < page->len; block_num++) 238 for (block = page->first_block; block; block = block->next)
246 { 239 {
247 fz_stext_block *block; 240 if (block->type != FZ_STEXT_BLOCK_TEXT)
248 fz_stext_line *line;
249 fz_stext_span *span;
250
251 if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
252 continue; 241 continue;
253 block = page->blocks[block_num].u.text; 242 for (line = block->u.t.first_line; line; line = line->next)
254 for (line = block->lines; line < block->lines + block->len; line++)
255 { 243 {
256 for (span = line->first_span; span; span = span->next) 244 if (seen)
257 { 245 {
258 if (seen) 246 fz_append_byte(ctx, buffer, '\n');
259 { 247 }
260 fz_append_byte(ctx, buffer, '\n');
261 }
262 248
263 seen = 0; 249 seen = 0;
264 250
265 for (i = 0; i < span->len; i++) 251 for (ch = line->first_char; ch; ch = ch->next)
252 {
253 fz_stext_char_bbox(ctx, &hitbox, line, ch);
254 c = ch->c;
255 if (c < 32)
256 c = 0xFFFD;
257 if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
266 { 258 {
267 fz_stext_char_bbox(ctx, &hitbox, span, i); 259 fz_append_rune(ctx, buffer, c);
268 c = span->text[i].c; 260 seen = 1;
269 if (c < 32)
270 c = 0xFFFD;
271 if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
272 {
273 fz_append_rune(ctx, buffer, c);
274 seen = 1;
275 }
276 } 261 }
277
278 seen = (seen && span == line->last_span);
279 } 262 }
263
264 seen = (seen && line == block->u.t.last_line);
280 } 265 }
281 } 266 }
282 267
diff --git a/source/fitz/util.c b/source/fitz/util.c
index 6f9001746..d6a7f3174 100644
--- a/source/fitz/util.c
+++ b/source/fitz/util.c
@@ -267,7 +267,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co
267} 267}
268 268
269fz_stext_page * 269fz_stext_page *
270fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options) 270fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options)
271{ 271{
272 fz_stext_page *text; 272 fz_stext_page *text;
273 fz_device *dev; 273 fz_device *dev;
@@ -279,7 +279,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
279 text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox)); 279 text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox));
280 fz_try(ctx) 280 fz_try(ctx)
281 { 281 {
282 dev = fz_new_stext_device(ctx, sheet, text, options); 282 dev = fz_new_stext_device(ctx, text, options);
283 fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL); 283 fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL);
284 fz_close_device(ctx, dev); 284 fz_close_device(ctx, dev);
285 } 285 }
@@ -297,7 +297,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
297} 297}
298 298
299fz_stext_page * 299fz_stext_page *
300fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options) 300fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options)
301{ 301{
302 fz_stext_page *text; 302 fz_stext_page *text;
303 fz_device *dev; 303 fz_device *dev;
@@ -309,7 +309,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
309 text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox)); 309 text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox));
310 fz_try(ctx) 310 fz_try(ctx)
311 { 311 {
312 dev = fz_new_stext_device(ctx, sheet, text, options); 312 dev = fz_new_stext_device(ctx, text, options);
313 fz_run_page(ctx, page, dev, &fz_identity, NULL); 313 fz_run_page(ctx, page, dev, &fz_identity, NULL);
314 fz_close_device(ctx, dev); 314 fz_close_device(ctx, dev);
315 } 315 }
@@ -327,14 +327,14 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
327} 327}
328 328
329fz_stext_page * 329fz_stext_page *
330fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options) 330fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options)
331{ 331{
332 fz_page *page; 332 fz_page *page;
333 fz_stext_page *text; 333 fz_stext_page *text;
334 334
335 page = fz_load_page(ctx, doc, number); 335 page = fz_load_page(ctx, doc, number);
336 fz_try(ctx) 336 fz_try(ctx)
337 text = fz_new_stext_page_from_page(ctx, page, sheet, options); 337 text = fz_new_stext_page_from_page(ctx, page, options);
338 fz_always(ctx) 338 fz_always(ctx)
339 fz_drop_page(ctx, page); 339 fz_drop_page(ctx, page);
340 fz_catch(ctx) 340 fz_catch(ctx)
@@ -345,24 +345,14 @@ fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number
345int 345int
346fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needle, fz_rect *hit_bbox, int hit_max) 346fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needle, fz_rect *hit_bbox, int hit_max)
347{ 347{
348 fz_stext_sheet *sheet = NULL; 348 fz_stext_page *text;
349 fz_stext_page *text = NULL;
350 int count; 349 int count;
351 350
352 fz_var(sheet); 351 text = fz_new_stext_page_from_display_list(ctx, list, NULL);
353 fz_var(text);
354
355 fz_try(ctx) 352 fz_try(ctx)
356 {
357 sheet = fz_new_stext_sheet(ctx);
358 text = fz_new_stext_page_from_display_list(ctx, list, sheet, NULL);
359 count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); 353 count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
360 }
361 fz_always(ctx) 354 fz_always(ctx)
362 {
363 fz_drop_stext_page(ctx, text); 355 fz_drop_stext_page(ctx, text);
364 fz_drop_stext_sheet(ctx, sheet);
365 }
366 fz_catch(ctx) 356 fz_catch(ctx)
367 fz_rethrow(ctx); 357 fz_rethrow(ctx);
368 return count; 358 return count;
@@ -371,24 +361,14 @@ fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needl
371int 361int
372fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max) 362fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max)
373{ 363{
374 fz_stext_sheet *sheet = NULL; 364 fz_stext_page *text;
375 fz_stext_page *text = NULL;
376 int count; 365 int count;
377 366
378 fz_var(sheet); 367 text = fz_new_stext_page_from_page(ctx, page, NULL);
379 fz_var(text);
380
381 fz_try(ctx) 368 fz_try(ctx)
382 {
383 sheet = fz_new_stext_sheet(ctx);
384 text = fz_new_stext_page_from_page(ctx, page, sheet, NULL);
385 count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); 369 count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
386 }
387 fz_always(ctx) 370 fz_always(ctx)
388 {
389 fz_drop_stext_page(ctx, text); 371 fz_drop_stext_page(ctx, text);
390 fz_drop_stext_sheet(ctx, sheet);
391 }
392 fz_catch(ctx) 372 fz_catch(ctx)
393 fz_rethrow(ctx); 373 fz_rethrow(ctx);
394 return count; 374 return count;
@@ -411,14 +391,15 @@ fz_search_page_number(fz_context *ctx, fz_document *doc, int number, const char
411} 391}
412 392
413fz_buffer * 393fz_buffer *
414fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rect *sel, int crlf) 394fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page, const fz_rect *sel, int crlf)
415{ 395{
416 fz_buffer *buf; 396 fz_buffer *buf;
417 fz_rect hitbox; 397 fz_rect hitbox;
418 float x0, y0, x1, y1; 398 float x0, y0, x1, y1;
419 int block_num; 399 fz_stext_block *block;
400 fz_stext_line *line;
401 fz_stext_char *ch;
420 int need_newline; 402 int need_newline;
421 int i;
422 403
423 need_newline = 0; 404 need_newline = 0;
424 405
@@ -438,45 +419,33 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
438 buf = fz_new_buffer(ctx, 256); 419 buf = fz_new_buffer(ctx, 256);
439 fz_try(ctx) 420 fz_try(ctx)
440 { 421 {
441 for (block_num = 0; block_num < text->len; block_num++) 422 for (block = page->first_block; block; block = block->next)
442 { 423 {
443 fz_stext_line *line; 424 if (block->type != FZ_STEXT_BLOCK_TEXT)
444 fz_stext_block *block;
445 fz_stext_span *span;
446
447 if (text->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
448 continue; 425 continue;
449 426
450 block = text->blocks[block_num].u.text; 427 for (line = block->u.t.first_line; line; line = line->next)
451 for (line = block->lines; line < block->lines + block->len; line++)
452 { 428 {
453 int saw_text = 0; 429 int saw_text = 0;
454 for (span = line->first_span; span; span = span->next) 430 for (ch = line->first_char; ch; ch = ch->next)
455 { 431 {
456 if (span->spacing > 1) 432 int c = ch->c;
457 fz_append_byte(ctx, buf, ' '); 433 fz_stext_char_bbox(ctx, &hitbox, line, ch);
458 for (i = 0; i < span->len; i++) 434 if (c < 32)
435 c = 0xFFFD;
436 if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
459 { 437 {
460 int c; 438 saw_text = 1;
461 fz_stext_char_bbox(ctx, &hitbox, span, i); 439 if (need_newline)
462 c = span->text[i].c;
463 if (c < 32)
464 c = 0xFFFD;
465 if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
466 { 440 {
467 saw_text = 1; 441 if (crlf)
468 if (need_newline) 442 fz_append_byte(ctx, buf, '\r');
469 { 443 fz_append_byte(ctx, buf, '\n');
470 if (crlf) 444 need_newline = 0;
471 fz_append_byte(ctx, buf, '\r');
472 fz_append_byte(ctx, buf, '\n');
473 need_newline = 0;
474 }
475 fz_append_rune(ctx, buf, c);
476 } 445 }
446 fz_append_rune(ctx, buf, c);
477 } 447 }
478 } 448 }
479
480 if (saw_text) 449 if (saw_text)
481 need_newline = 1; 450 need_newline = 1;
482 } 451 }
@@ -494,42 +463,32 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
494fz_buffer * 463fz_buffer *
495fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options) 464fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options)
496{ 465{
497 fz_stext_sheet *sheet;
498 fz_stext_page *text; 466 fz_stext_page *text;
499 fz_buffer *buf; 467 fz_buffer *buf;
500 468
501 sheet = fz_new_stext_sheet(ctx); 469 text = fz_new_stext_page_from_display_list(ctx, list, options);
502 fz_try(ctx) 470 fz_try(ctx)
503 {
504 text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
505 buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); 471 buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
506 }
507 fz_always(ctx) 472 fz_always(ctx)
508 fz_drop_stext_sheet(ctx, sheet); 473 fz_drop_stext_page(ctx, text);
509 fz_catch(ctx) 474 fz_catch(ctx)
510 fz_rethrow(ctx); 475 fz_rethrow(ctx);
511 fz_drop_stext_page(ctx, text);
512 return buf; 476 return buf;
513} 477}
514 478
515fz_buffer * 479fz_buffer *
516fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options) 480fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options)
517{ 481{
518 fz_stext_sheet *sheet;
519 fz_stext_page *text; 482 fz_stext_page *text;
520 fz_buffer *buf; 483 fz_buffer *buf;
521 484
522 sheet = fz_new_stext_sheet(ctx); 485 text = fz_new_stext_page_from_page(ctx, page, options);
523 fz_try(ctx) 486 fz_try(ctx)
524 {
525 text = fz_new_stext_page_from_page(ctx, page, sheet, options);
526 buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); 487 buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
527 }
528 fz_always(ctx) 488 fz_always(ctx)
529 fz_drop_stext_sheet(ctx, sheet); 489 fz_drop_stext_page(ctx, text);
530 fz_catch(ctx) 490 fz_catch(ctx)
531 fz_rethrow(ctx); 491 fz_rethrow(ctx);
532 fz_drop_stext_page(ctx, text);
533 return buf; 492 return buf;
534} 493}
535 494
diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c
index de05ab659..e1303fb87 100644
--- a/source/tools/mudraw.c
+++ b/source/tools/mudraw.c
@@ -248,7 +248,6 @@ static int band_height = 0;
248static int lowmemory = 0; 248static int lowmemory = 0;
249 249
250static int errored = 0; 250static int errored = 0;
251static fz_stext_sheet *sheet = NULL;
252static fz_colorspace *colorspace; 251static fz_colorspace *colorspace;
253static int spots = 0; 252static int spots = 0;
254static int alpha; 253static int alpha;
@@ -391,9 +390,6 @@ file_level_headers(fz_context *ctx)
391 if (output_format == OUT_STEXT || output_format == OUT_TRACE) 390 if (output_format == OUT_STEXT || output_format == OUT_TRACE)
392 fz_write_printf(ctx, out, "<?xml version=\"1.0\"?>\n"); 391 fz_write_printf(ctx, out, "<?xml version=\"1.0\"?>\n");
393 392
394 if (output_format == OUT_TEXT || output_format == OUT_HTML || output_format == OUT_XHTML || output_format == OUT_STEXT)
395 sheet = fz_new_stext_sheet(ctx);
396
397 if (output_format == OUT_HTML) 393 if (output_format == OUT_HTML)
398 fz_print_stext_header_as_html(ctx, out); 394 fz_print_stext_header_as_html(ctx, out);
399 if (output_format == OUT_XHTML) 395 if (output_format == OUT_XHTML)
@@ -422,8 +418,6 @@ file_level_trailers(fz_context *ctx)
422 418
423 if (output_format == OUT_PS) 419 if (output_format == OUT_PS)
424 fz_write_ps_file_trailer(ctx, out, output_pagenum); 420 fz_write_ps_file_trailer(ctx, out, output_pagenum);
425
426 fz_drop_stext_sheet(ctx, sheet);
427} 421}
428 422
429static void drawband(fz_context *ctx, fz_page *page, fz_display_list *list, const fz_matrix *ctm, const fz_rect *tbounds, fz_cookie *cookie, int band_start, fz_pixmap *pix, fz_bitmap **bit) 423static void drawband(fz_context *ctx, fz_page *page, fz_display_list *list, const fz_matrix *ctm, const fz_rect *tbounds, fz_cookie *cookie, int band_start, fz_pixmap *pix, fz_bitmap **bit)
@@ -534,7 +528,7 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in
534 528
535 stext_options.flags = (output_format == OUT_HTML || output_format == OUT_XHTML) ? FZ_STEXT_PRESERVE_IMAGES : 0; 529 stext_options.flags = (output_format == OUT_HTML || output_format == OUT_XHTML) ? FZ_STEXT_PRESERVE_IMAGES : 0;
536 text = fz_new_stext_page(ctx, &mediabox); 530 text = fz_new_stext_page(ctx, &mediabox);
537 dev = fz_new_stext_device(ctx, sheet, text, &stext_options); 531 dev = fz_new_stext_device(ctx, text, &stext_options);
538 if (lowmemory) 532 if (lowmemory)
539 fz_enable_device_hints(ctx, dev, FZ_NO_CACHE); 533 fz_enable_device_hints(ctx, dev, FZ_NO_CACHE);
540 if (list) 534 if (list)
@@ -550,12 +544,10 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in
550 } 544 }
551 else if (output_format == OUT_HTML) 545 else if (output_format == OUT_HTML)
552 { 546 {
553 fz_analyze_text(ctx, sheet, text);
554 fz_print_stext_page_as_html(ctx, out, text); 547 fz_print_stext_page_as_html(ctx, out, text);
555 } 548 }
556 else if (output_format == OUT_XHTML) 549 else if (output_format == OUT_XHTML)
557 { 550 {
558 fz_analyze_text(ctx, sheet, text);
559 fz_print_stext_page_as_xhtml(ctx, out, text); 551 fz_print_stext_page_as_xhtml(ctx, out, text);
560 } 552 }
561 else if (output_format == OUT_TEXT) 553 else if (output_format == OUT_TEXT)
diff --git a/source/tools/murun.c b/source/tools/murun.c
index b7443286c..7a713903e 100644
--- a/source/tools/murun.c
+++ b/source/tools/murun.c
@@ -1827,19 +1827,13 @@ static void ffi_Page_toStructuredText(js_State *J)
1827 fz_context *ctx = js_getcontext(J); 1827 fz_context *ctx = js_getcontext(J);
1828 fz_page *page = ffi_topage(J, 0); 1828 fz_page *page = ffi_topage(J, 0);
1829 const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; 1829 const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL;
1830 fz_stext_sheet *sheet = NULL;
1831 fz_stext_options so; 1830 fz_stext_options so;
1832 fz_stext_page *text; 1831 fz_stext_page *text;
1833 1832
1834 fz_var(sheet);
1835
1836 fz_try(ctx) { 1833 fz_try(ctx) {
1837 sheet = fz_new_stext_sheet(ctx);
1838 fz_parse_stext_options(ctx, &so, options); 1834 fz_parse_stext_options(ctx, &so, options);
1839 text = fz_new_stext_page_from_page(ctx, page, sheet, &so); 1835 text = fz_new_stext_page_from_page(ctx, page, &so);
1840 } 1836 }
1841 fz_always(ctx)
1842 fz_drop_stext_sheet(ctx, sheet);
1843 fz_catch(ctx) 1837 fz_catch(ctx)
1844 rethrow(J); 1838 rethrow(J);
1845 1839
@@ -2673,19 +2667,13 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
2673 fz_context *ctx = js_getcontext(J); 2667 fz_context *ctx = js_getcontext(J);
2674 fz_display_list *list = js_touserdata(J, 0, "fz_display_list"); 2668 fz_display_list *list = js_touserdata(J, 0, "fz_display_list");
2675 const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; 2669 const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL;
2676 fz_stext_sheet *sheet = NULL;
2677 fz_stext_options so; 2670 fz_stext_options so;
2678 fz_stext_page *text; 2671 fz_stext_page *text;
2679 2672
2680 fz_var(sheet);
2681
2682 fz_try(ctx) { 2673 fz_try(ctx) {
2683 sheet = fz_new_stext_sheet(ctx);
2684 fz_parse_stext_options(ctx, &so, options); 2674 fz_parse_stext_options(ctx, &so, options);
2685 text = fz_new_stext_page_from_display_list(ctx, list, sheet, &so); 2675 text = fz_new_stext_page_from_display_list(ctx, list, &so);
2686 } 2676 }
2687 fz_always(ctx)
2688 fz_drop_stext_sheet(ctx, sheet);
2689 fz_catch(ctx) 2677 fz_catch(ctx)
2690 rethrow(J); 2678 rethrow(J);
2691 2679