comparison mupdf-source/source/fitz/stext-output.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24
25 #define SUBSCRIPT_OFFSET 0.2f
26 #define SUPERSCRIPT_OFFSET -0.2f
27
28 #include <ft2build.h>
29 #include FT_FREETYPE_H
30
31 // Text black color when converted from DeviceCMYK to RGB
32 #define CMYK_BLACK 0x221f1f
33
34 static void
35 scale_run(fz_context *ctx, fz_stext_block *block, float scale)
36 {
37 fz_matrix m = fz_scale(scale, scale);
38 fz_stext_line *line;
39 fz_stext_char *ch;
40
41 while (block)
42 {
43 block->bbox = fz_transform_rect(block->bbox, m);
44 switch (block->type)
45 {
46 case FZ_STEXT_BLOCK_TEXT:
47 for (line = block->u.t.first_line; line; line = line->next)
48 {
49 line->bbox = fz_transform_rect(block->bbox, m);
50 for (ch = line->first_char; ch; ch = ch->next)
51 {
52 ch->origin = fz_transform_point(ch->origin, m);
53 ch->quad = fz_transform_quad(ch->quad, m);
54 ch->size = ch->size * scale;
55 }
56 }
57 break;
58
59 case FZ_STEXT_BLOCK_IMAGE:
60 block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale);
61 break;
62
63 case FZ_STEXT_BLOCK_STRUCT:
64 if (block->u.s.down)
65 scale_run(ctx, block->u.s.down->first_block, scale);
66 break;
67 }
68 block = block->next;
69 }
70 }
71
72 static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale)
73 {
74 scale_run(ctx, page->first_block, scale);
75 }
76
77 /* HTML output (visual formatting with preserved layout) */
78
79 static int
80 detect_super_script(fz_stext_line *line, fz_stext_char *ch)
81 {
82 if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
83 return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
84 return 0;
85 }
86
87 static const char *
88 font_full_name(fz_context *ctx, fz_font *font)
89 {
90 const char *name = fz_font_name(ctx, font);
91 const char *s = strchr(name, '+');
92 return s ? s + 1 : name;
93 }
94
/*
	Map a font name onto a common family name for use in HTML.

	Names containing "Times" become "Times New Roman"; names containing
	"Arial" or "Helvetica" become "Arial" (or "Arial Narrow" when they also
	contain "Narrow" or "Condensed"); names containing "Courier" become
	"Courier". Any other name is returned unchanged (same pointer).
*/
static const char *
html_clean_font_name(const char *fontname)
{
	int arial_like;

	if (strstr(fontname, "Times") != NULL)
		return "Times New Roman";

	arial_like = strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL;
	if (arial_like)
	{
		int narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
		return narrow ? "Arial Narrow" : "Arial";
	}

	return strstr(fontname, "Courier") != NULL ? "Courier" : fontname;
}
110
111 static void
112 font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
113 {
114 const char *name = html_clean_font_name(font_full_name(ctx, font));
115 char *s;
116 fz_strlcpy(buf, name, size);
117 s = strrchr(buf, '-');
118 if (s)
119 *s = 0;
120 if (is_mono)
121 fz_strlcat(buf, ",monospace", size);
122 else
123 fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
124 }
125
126 static void
127 fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
128 {
129 char family[80];
130
131 int is_bold = fz_font_is_bold(ctx, font);
132 int is_italic = fz_font_is_italic(ctx, font);
133 int is_serif = fz_font_is_serif(ctx, font);
134 int is_mono = fz_font_is_monospaced(ctx, font);
135
136 font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
137
138 if (sup) fz_write_string(ctx, out, "<sup>");
139 if (is_mono) fz_write_string(ctx, out, "<tt>");
140 if (is_bold) fz_write_string(ctx, out, "<b>");
141 if (is_italic) fz_write_string(ctx, out, "<i>");
142 fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size);
143 if (color != 0 && color != CMYK_BLACK)
144 fz_write_printf(ctx, out, ";color:#%06x", color & 0xffffff);
145 fz_write_printf(ctx, out, "\">");
146 }
147
148 static void
149 fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
150 {
151 int is_mono = fz_font_is_monospaced(ctx, font);
152 int is_bold = fz_font_is_bold(ctx,font);
153 int is_italic = fz_font_is_italic(ctx, font);
154
155 fz_write_string(ctx, out, "</span>");
156 if (is_italic) fz_write_string(ctx, out, "</i>");
157 if (is_bold) fz_write_string(ctx, out, "</b>");
158 if (is_mono) fz_write_string(ctx, out, "</tt>");
159 if (sup) fz_write_string(ctx, out, "</sup>");
160 }
161
162 static void
163 fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
164 {
165 fz_matrix ctm = block->u.i.transform;
166
167 #define USE_CSS_MATRIX_TRANSFORMS
168 #ifdef USE_CSS_MATRIX_TRANSFORMS
169 /* Matrix maths notes.
170 * When we get here ctm maps the unit square to the position in device
171 * space occupied by the image.
172 *
173 * That is to say that mapping the 4 corners of the unit square through
174 * the transform, give us the 4 target corners. We extend the corners
175 * by adding an extra '1' into them to allow transforms to work. Thus
176 * (x,y) maps through ctm = (a b c d e f) as:
177 *
178 * (x y 1) (a b 0) = (X Y 1)
179 * (c d 0)
180 * (e f 1)
181 *
182 * To simplify reading of matrix maths, we use the trick where we
183 * 'drop' the first matrix down the page. Thus the corners c0=(0,0),
184 * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively:
185 *
186 * ( a b 0)
187 * ( c d 0)
188 * ( e f 1)
189 * (0 0 1) ( e f 1)
190 * (0 1 1) ( c+e d+f 1)
191 * (1 1 1) (a+c+e b+d+f 1)
192 * (1 0 1) ( a+e b+f 1)
193 *
194 * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f)
195 *
196 * Unfortunately, the CSS matrix transform, does not map the unit square.
197 * Rather it does something moderately mad. As far as I can work out, the
198 * top left corner of a (0,0) -> (w, h) box is transformed using the .e
199 * and .f entries of the matrix. Then the image from within that square
200 * is transformed using the centre of that square as the origin.
201 *
202 * So, an image placed at (0,0) in destination space with 1:1 transform
203 * will result in an image a (0,0) as you'd expect. But an image at (0,0)
204 * with a scale of 2, will result in 25% of the image off the left of the
205 * screen, and 25% off the top.
206 *
207 * Accordingly, we have to adjust the ctm in several steps.
208 */
209 /* Move to moving the centre of the image. */
210 ctm.e += (ctm.a+ctm.c)/2;
211 ctm.f += (ctm.b+ctm.d)/2;
212 /* Move from transforming the unit square to w/h */
213 ctm.a /= block->u.i.image->w;
214 ctm.b /= block->u.i.image->w;
215 ctm.c /= block->u.i.image->h;
216 ctm.d /= block->u.i.image->h;
217 /* Move from points to pixels */
218 ctm.a *= 96.0f/72;
219 ctm.b *= 96.0f/72;
220 ctm.c *= 96.0f/72;
221 ctm.d *= 96.0f/72;
222 ctm.e *= 96.0f/72;
223 ctm.f *= 96.0f/72;
224 /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */
225 ctm.e -= block->u.i.image->w/2;
226 ctm.f -= block->u.i.image->h/2;
227
228 fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"",
229 ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f);
230 #else
231 /* Alternative version of the code that uses scaleX/Y and rotate
232 * instead, but only copes with axis aligned cases. */
233 int t;
234
235 int x = block->bbox.x0;
236 int y = block->bbox.y0;
237 int w = block->bbox.x1 - block->bbox.x0;
238 int h = block->bbox.y1 - block->bbox.y0;
239
240 const char *flip = "";
241
242 if (ctm.b == 0 && ctm.c == 0)
243 {
244 if (ctm.a < 0 && ctm.d < 0)
245 flip = "transform: scaleX(-1) scaleY(-1);";
246 else if (ctm.a < 0)
247 {
248 flip = "transform: scaleX(-1);";
249 }
250 else if (ctm.d < 0)
251 {
252 flip = "transform: scaleY(-1);";
253 }
254 } else if (ctm.a == 0 && ctm.d == 0) {
255 if (ctm.b < 0 && ctm.c < 0)
256 {
257 flip = "transform: scaleY(-1) rotate(90deg);";
258 x += (w-h)/2;
259 y -= (w-h)/2;
260 t = w; w = h; h = t;
261 }
262 else if (ctm.b < 0)
263 {
264 flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);";
265 x += (w-h)/2;
266 y -= (w-h)/2;
267 t = w; w = h; h = t;
268 }
269 else if (ctm.c < 0)
270 {
271 flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);";
272 x += (w-h)/2;
273 y -= (w-h)/2;
274 t = w; w = h; h = t;
275 }
276 else
277 {
278 flip = "transform: scaleY(-1) rotate(270deg);";
279 x += (w-h)/2;
280 y -= (w-h)/2;
281 t = w; w = h; h = t;
282 }
283 }
284
285 fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h);
286 #endif
287 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
288 fz_write_string(ctx, out, "\">\n");
289 }
290
291 void
292 fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
293 {
294 fz_stext_line *line;
295 fz_stext_char *ch;
296 float x, y, h;
297
298 fz_font *font = NULL;
299 float size = 0;
300 int sup = 0;
301 uint32_t color = 0;
302
303 for (line = block->u.t.first_line; line; line = line->next)
304 {
305 x = line->bbox.x0;
306 y = line->bbox.y0;
307 h = line->bbox.y1 - line->bbox.y0;
308
309 if (line->first_char)
310 {
311 h = line->first_char->size;
312 y = line->first_char->origin.y - h * 0.8f;
313 }
314
315 fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h);
316 font = NULL;
317
318 for (ch = line->first_char; ch; ch = ch->next)
319 {
320 int ch_sup = detect_super_script(line, ch);
321 if (ch->font != font || ch->size != size || ch_sup != sup || ch->argb != color)
322 {
323 if (font)
324 fz_print_style_end_html(ctx, out, font, size, sup, color);
325 font = ch->font;
326 size = ch->size;
327 color = ch->argb;
328 sup = ch_sup;
329 fz_print_style_begin_html(ctx, out, font, size, sup, color);
330 }
331
332 switch (ch->c)
333 {
334 default:
335 if (ch->c >= 32 && ch->c <= 127)
336 fz_write_byte(ctx, out, ch->c);
337 else
338 fz_write_printf(ctx, out, "&#x%x;", ch->c);
339 break;
340 case '<': fz_write_string(ctx, out, "&lt;"); break;
341 case '>': fz_write_string(ctx, out, "&gt;"); break;
342 case '&': fz_write_string(ctx, out, "&amp;"); break;
343 case '"': fz_write_string(ctx, out, "&quot;"); break;
344 case '\'': fz_write_string(ctx, out, "&apos;"); break;
345 }
346 }
347
348 if (font)
349 fz_print_style_end_html(ctx, out, font, size, sup, color);
350
351 fz_write_string(ctx, out, "</p>\n");
352 }
353 }
354
355 static const char *
356 html_tag_for_struct(fz_stext_struct *s)
357 {
358 const char *raw;
359
360 if (s == NULL)
361 return "DIV";
362
363 raw = s->raw;
364 if (raw == NULL)
365 raw = fz_structure_to_string(s->standard);
366
367 if (!fz_strcasecmp(raw, "blockquote"))
368 return "blockquote";
369 if (!fz_strcasecmp(raw, "title"))
370 return "h1";
371 if (!fz_strcasecmp(raw, "sub"))
372 return "sub";
373 if (!fz_strcasecmp(raw, "p"))
374 return "p";
375 if (!fz_strcasecmp(raw, "h"))
376 return "h1"; /* Pick one! */
377 if (!fz_strcasecmp(raw, "h1"))
378 return "h1";
379 if (!fz_strcasecmp(raw, "h2"))
380 return "h2";
381 if (!fz_strcasecmp(raw, "h3"))
382 return "h3";
383 if (!fz_strcasecmp(raw, "h4"))
384 return "h4";
385 if (!fz_strcasecmp(raw, "h5"))
386 return "h5";
387 if (!fz_strcasecmp(raw, "h6"))
388 return "h6";
389
390 if (!fz_strcasecmp(raw, "list"))
391 return "ul";
392 if (!fz_strcasecmp(raw, "listitem"))
393 return "li";
394 if (!fz_strcasecmp(raw, "table"))
395 return "table";
396 if (!fz_strcasecmp(raw, "tr"))
397 return "tr";
398 if (!fz_strcasecmp(raw, "th"))
399 return "th";
400 if (!fz_strcasecmp(raw, "td"))
401 return "td";
402 if (!fz_strcasecmp(raw, "thead"))
403 return "thead";
404 if (!fz_strcasecmp(raw, "tbody"))
405 return "tbody";
406 if (!fz_strcasecmp(raw, "tfoot"))
407 return "tfoot";
408
409 if (!fz_strcasecmp(raw, "span"))
410 return "span";
411 if (!fz_strcasecmp(raw, "code"))
412 return "code";
413 if (!fz_strcasecmp(raw, "em"))
414 return "em";
415 if (!fz_strcasecmp(raw, "strong"))
416 return "strong";
417
418 return "div";
419 }
420
421 static void
422 print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block);
423
424 static void
425 fz_print_stext_struct_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
426 {
427 const char *tag;
428
429 if (block->u.s.down == NULL)
430 return;
431
432 tag = html_tag_for_struct(block->u.s.down);
433
434 fz_write_printf(ctx, out, "<%s>\n", tag);
435
436 print_blocks_as_html(ctx, out, block->u.s.down->first_block);
437
438 fz_write_printf(ctx, out, "</%s>\n", tag);
439 }
440
441 static void
442 print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
443 {
444 for (; block; block = block->next)
445 {
446 if (block->type == FZ_STEXT_BLOCK_IMAGE)
447 fz_print_stext_image_as_html(ctx, out, block);
448 else if (block->type == FZ_STEXT_BLOCK_TEXT)
449 fz_print_stext_block_as_html(ctx, out, block);
450 else if (block->type == FZ_STEXT_BLOCK_STRUCT)
451 fz_print_stext_struct_as_html(ctx, out, block);
452 }
453 }
454
455 void
456 fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
457 {
458 float w = page->mediabox.x1 - page->mediabox.x0;
459 float h = page->mediabox.y1 - page->mediabox.y0;
460
461 fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h);
462
463 print_blocks_as_html(ctx, out, page->first_block);
464
465 fz_write_string(ctx, out, "</div>\n");
466 }
467
468 void
469 fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
470 {
471 fz_write_string(ctx, out, "<!DOCTYPE html>\n");
472 fz_write_string(ctx, out, "<html>\n");
473 fz_write_string(ctx, out, "<head>\n");
474 fz_write_string(ctx, out, "<style>\n");
475 fz_write_string(ctx, out, "body{background-color:slategray}\n");
476 fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n");
477 fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n");
478 fz_write_string(ctx, out, "</style>\n");
479 fz_write_string(ctx, out, "</head>\n");
480 fz_write_string(ctx, out, "<body>\n");
481 }
482
483 void
484 fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
485 {
486 fz_write_string(ctx, out, "</body>\n");
487 fz_write_string(ctx, out, "</html>\n");
488 }
489
490 /* XHTML output (semantic, little layout, suitable for reflow) */
491
492 static void
493 find_table_pos(fz_stext_grid_positions *xs, float x0, float x1, int *ix0, int *ix1)
494 {
495 int i;
496
497 *ix0 = -1;
498 *ix1 = -1;
499
500 for (i = 1; i < xs->len; i++)
501 if (x0 < xs->list[i].pos)
502 {
503 *ix0 = i-1;
504 break;
505 }
506 for (; i < xs->len; i++)
507 if (x1 < xs->list[i].pos)
508 {
509 *ix1 = i-1;
510 break;
511 }
512 if (i == xs->len)
513 *ix1 = i-1;
514 }
515
516 static void
517 run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out);
518
519 static void
520 fz_print_stext_table_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
521 {
522 fz_stext_block *grid, *tr, *td;
523 int w, h;
524 int x, y;
525 uint8_t *cells;
526 int malformed = 0;
527
528 for (grid = block; grid != NULL; grid = grid->next)
529 if (grid->type == FZ_STEXT_BLOCK_GRID)
530 break;
531 if (grid == NULL)
532 {
533 fz_warn(ctx, "Malformed table data");
534 return;
535 }
536 w = grid->u.b.xs->len;
537 h = grid->u.b.ys->len;
538 cells = fz_calloc(ctx, w, h);
539
540 fz_try(ctx)
541 {
542 fz_write_printf(ctx, out, "<table>\n");
543
544 y = 0;
545 for (tr = grid->next; tr != NULL; tr = tr->next)
546 {
547 if (tr->type != FZ_STEXT_BLOCK_STRUCT || tr->u.s.down == NULL || tr->u.s.down->standard != FZ_STRUCTURE_TR)
548 {
549 malformed = 1;
550 continue;
551 }
552 fz_write_printf(ctx, out, "<tr>\n");
553 x = 0;
554 for (td = tr->u.s.down->first_block; td != NULL; td = td->next)
555 {
556 int x0, y0, x1, y1;
557 if (td->type != FZ_STEXT_BLOCK_STRUCT || td->u.s.down == NULL || td->u.s.down->standard != FZ_STRUCTURE_TD)
558 {
559 malformed = 1;
560 continue;
561 }
562 find_table_pos(grid->u.b.xs, td->bbox.x0, td->bbox.x1, &x0, &x1);
563 find_table_pos(grid->u.b.ys, td->bbox.y0, td->bbox.y1, &y0, &y1);
564 if (x0 < 0 || x1 < 0 || x1 >= w)
565 {
566 malformed = 1;
567 x0 = x;
568 x1 = x+1;
569 }
570 if (y0 < 0 || y1 < 0 || y1 >= h)
571 {
572 malformed = 1;
573 y0 = y;
574 y1 = y+1;
575 }
576 if (y < y0)
577 {
578 malformed = 1;
579 continue;
580 }
581 if (x > x0)
582 {
583 malformed = 1;
584 }
585 while (x < x0)
586 {
587 uint8_t *c = &cells[x + w*y];
588 if (*c == 0)
589 {
590 fz_write_printf(ctx, out, "<td></td>");
591 *c = 1;
592 }
593 x++;
594 }
595 fz_write_string(ctx, out, "<td");
596 if (x1 > x0+1)
597 fz_write_printf(ctx, out, " rowspan=%d", x1-x0);
598 if (y1 > y0+1)
599 fz_write_printf(ctx, out, " colspan=%d", y1-y0);
600 fz_write_string(ctx, out, ">\n");
601 run_to_xhtml(ctx, td->u.s.down->first_block, out);
602 fz_write_printf(ctx, out, "</td>\n");
603 for ( ; y0 < y1; y0++)
604 for (x = x0; x < x1; x++)
605 {
606 uint8_t *c = &cells[x + w*y0];
607 if (*c != 0)
608 malformed = 1;
609 *c = 1;
610 }
611 }
612 fz_write_printf(ctx, out, "</tr>\n");
613 y++;
614 }
615
616 fz_write_printf(ctx, out, "</table>\n");
617 }
618 fz_always(ctx)
619 fz_free(ctx, cells);
620 fz_catch(ctx)
621 fz_rethrow(ctx);
622
623 if (malformed)
624 fz_warn(ctx, "Malformed table data");
625 }
626
627 static void
628 fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
629 {
630 int w = block->bbox.x1 - block->bbox.x0;
631 int h = block->bbox.y1 - block->bbox.y0;
632
633 fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
634 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
635 fz_write_string(ctx, out, "\"/></p>\n");
636 }
637
638 static void
639 fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
640 {
641 int is_mono = fz_font_is_monospaced(ctx, font);
642 int is_bold = fz_font_is_bold(ctx, font);
643 int is_italic = fz_font_is_italic(ctx, font);
644
645 if (sup)
646 fz_write_string(ctx, out, "<sup>");
647 if (is_mono)
648 fz_write_string(ctx, out, "<tt>");
649 if (is_bold)
650 fz_write_string(ctx, out, "<b>");
651 if (is_italic)
652 fz_write_string(ctx, out, "<i>");
653 }
654
655 static void
656 fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
657 {
658 int is_mono = fz_font_is_monospaced(ctx, font);
659 int is_bold = fz_font_is_bold(ctx, font);
660 int is_italic = fz_font_is_italic(ctx, font);
661
662 if (is_italic)
663 fz_write_string(ctx, out, "</i>");
664 if (is_bold)
665 fz_write_string(ctx, out, "</b>");
666 if (is_mono)
667 fz_write_string(ctx, out, "</tt>");
668 if (sup)
669 fz_write_string(ctx, out, "</sup>");
670 }
671
672 static float avg_font_size_of_line(fz_stext_char *ch)
673 {
674 float size = 0;
675 int n = 0;
676 if (!ch)
677 return 0;
678 while (ch)
679 {
680 size += ch->size;
681 ++n;
682 ch = ch->next;
683 }
684 return size / n;
685 }
686
/*
	Pick a block level element name from a font size: "h1" for sizes of 20
	and up, "h2" from 15, "h3" from 12, and "p" for anything smaller.
*/
static const char *tag_from_font_size(float size)
{
	return size >= 20 ? "h1"
		: size >= 15 ? "h2"
		: size >= 12 ? "h3"
		: "p";
}
694
695 static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
696 {
697 fz_stext_line *line;
698 fz_stext_char *ch;
699
700 fz_font *font = NULL;
701 int sup = 0;
702 int sp = 1;
703 const char *tag = NULL;
704 const char *new_tag;
705
706 for (line = block->u.t.first_line; line; line = line->next)
707 {
708 new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
709 if (tag != new_tag)
710 {
711 if (tag)
712 {
713 if (font)
714 fz_print_style_end_xhtml(ctx, out, font, sup);
715 fz_write_printf(ctx, out, "</%s>", tag);
716 }
717 tag = new_tag;
718 fz_write_printf(ctx, out, "<%s>", tag);
719 if (font)
720 fz_print_style_begin_xhtml(ctx, out, font, sup);
721 }
722
723 if (!sp)
724 fz_write_byte(ctx, out, ' ');
725
726 for (ch = line->first_char; ch; ch = ch->next)
727 {
728 int ch_sup = detect_super_script(line, ch);
729 if (ch->font != font || ch_sup != sup)
730 {
731 if (font)
732 fz_print_style_end_xhtml(ctx, out, font, sup);
733 font = ch->font;
734 sup = ch_sup;
735 fz_print_style_begin_xhtml(ctx, out, font, sup);
736 }
737
738 sp = (ch->c == ' ');
739 switch (ch->c)
740 {
741 default:
742 if (ch->c >= 32 && ch->c <= 127)
743 fz_write_byte(ctx, out, ch->c);
744 else
745 fz_write_printf(ctx, out, "&#x%x;", ch->c);
746 break;
747 case '<': fz_write_string(ctx, out, "&lt;"); break;
748 case '>': fz_write_string(ctx, out, "&gt;"); break;
749 case '&': fz_write_string(ctx, out, "&amp;"); break;
750 case '"': fz_write_string(ctx, out, "&quot;"); break;
751 case '\'': fz_write_string(ctx, out, "&apos;"); break;
752 }
753 }
754 }
755
756 if (font)
757 fz_print_style_end_xhtml(ctx, out, font, sup);
758 fz_write_printf(ctx, out, "</%s>\n", tag);
759 }
760
761 static void
762 fz_print_struct_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
763 {
764 const char *tag;
765
766 if (block->u.s.down == NULL)
767 return;
768
769 if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
770 {
771 fz_print_stext_table_as_xhtml(ctx, out, block->u.s.down->first_block);
772 return;
773 }
774
775 tag = html_tag_for_struct(block->u.s.down);
776
777 fz_write_printf(ctx, out, "<%s>\n", tag);
778
779 run_to_xhtml(ctx, block->u.s.down->first_block, out);
780
781 fz_write_printf(ctx, out, "</%s>\n", tag);
782 }
783
784 static void
785 run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out)
786 {
787 while (block)
788 {
789 switch(block->type)
790 {
791 case FZ_STEXT_BLOCK_IMAGE:
792 fz_print_stext_image_as_xhtml(ctx, out, block);
793 break;
794 case FZ_STEXT_BLOCK_TEXT:
795 fz_print_stext_block_as_xhtml(ctx, out, block);
796 break;
797 case FZ_STEXT_BLOCK_STRUCT:
798 fz_print_struct_as_xhtml(ctx, out, block);
799 break;
800 }
801 block = block->next;
802 }
803 }
804
805 void
806 fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
807 {
808 fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
809
810 run_to_xhtml(ctx, page->first_block, out);
811
812 fz_write_string(ctx, out, "</div>\n");
813 }
814
815 void
816 fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
817 {
818 fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
819 fz_write_string(ctx, out, "<!DOCTYPE html");
820 fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
821 fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
822 fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
823 fz_write_string(ctx, out, "<head>\n");
824 fz_write_string(ctx, out, "<style>\n");
825 fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
826 fz_write_string(ctx, out, "</style>\n");
827 fz_write_string(ctx, out, "</head>\n");
828 fz_write_string(ctx, out, "<body>\n");
829 }
830
831 void
832 fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
833 {
834 fz_write_string(ctx, out, "</body>\n");
835 fz_write_string(ctx, out, "</html>\n");
836 }
837
838 /* Detailed XML dump of the entire structured text data */
839
840 static void
841 xml_write_char(fz_context *ctx, fz_output *out, int c)
842 {
843 switch (c)
844 {
845 case '<': fz_write_string(ctx, out, "&lt;"); break;
846 case '>': fz_write_string(ctx, out, "&gt;"); break;
847 case '&': fz_write_string(ctx, out, "&amp;"); break;
848 case '"': fz_write_string(ctx, out, "&quot;"); break;
849 case '\'': fz_write_string(ctx, out, "&apos;"); break;
850 default:
851 if (c >= 32 && c <= 127)
852 fz_write_printf(ctx, out, "%c", c);
853 else
854 fz_write_printf(ctx, out, "&#x%x;", c);
855 break;
856 }
857 }
858
859 static void
860 as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out)
861 {
862 fz_stext_line *line;
863 fz_stext_char *ch;
864 int i;
865
866 while (block)
867 {
868 switch (block->type)
869 {
870 case FZ_STEXT_BLOCK_TEXT:
871 fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\"",
872 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
873 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
874 fz_write_printf(ctx, out, " justify=\"unknown\"");
875 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_LEFT)
876 fz_write_printf(ctx, out, " justify=\"left\"");
877 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_CENTRE)
878 fz_write_printf(ctx, out, " justify=\"centre\"");
879 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_RIGHT)
880 fz_write_printf(ctx, out, " justify=\"right\"");
881 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
882 fz_write_printf(ctx, out, " justify=\"full\"");
883 fz_write_printf(ctx, out, ">\n");
884 for (line = block->u.t.first_line; line; line = line->next)
885 {
886 fz_font *font = NULL;
887 float size = 0;
888 const char *name = NULL;
889
890 fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\"",
891 line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
892 line->wmode,
893 line->dir.x, line->dir.y);
894
895 /* This is duplication of information, but it makes it MUCH easier to search for
896 * text fragments in large output. */
897 {
898 int valid = 1;
899 fz_write_printf(ctx, out, " text=\"");
900 for (ch = line->first_char; ch; ch = ch->next)
901 {
902 if (valid)
903 valid = fz_is_valid_xml_char(ch->c);
904 xml_write_char(ctx, out, fz_range_limit_xml_char(ch->c));
905 }
906 if (!valid)
907 {
908 fz_write_printf(ctx, out, "\" hextext=\"");
909 for (ch = line->first_char; ch; ch = ch->next)
910 {
911 char text[8];
912 int n = fz_runetochar(text, ch->c);
913 for (i = 0; i < n; i++)
914 fz_write_printf(ctx, out, "%02x", text[i]);
915 }
916 }
917 fz_write_printf(ctx, out, "\"");
918 }
919
920 fz_write_printf(ctx, out, ">\n");
921
922 for (ch = line->first_char; ch; ch = ch->next)
923 {
924 if (ch->font != font || ch->size != size)
925 {
926 const char *s;
927 if (font)
928 fz_write_string(ctx, out, "</font>\n");
929 font = ch->font;
930 size = ch->size;
931 s = name = font_full_name(ctx, font);
932 while (*s)
933 {
934 int c = *s++;
935 if (c < 32 || c >= 127)
936 break;
937 }
938 if (*s)
939 fz_write_printf(ctx, out, "<font hexname=%>", name);
940 else
941 fz_write_printf(ctx, out, "<font name=\"%s\"", name);
942 fz_write_printf(ctx, out, " size=\"%g\">\n", size);
943 }
944 fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" alpha=\"#%02x\" flags=\"%d\" c=\"",
945 ch->quad.ul.x, ch->quad.ul.y,
946 ch->quad.ur.x, ch->quad.ur.y,
947 ch->quad.ll.x, ch->quad.ll.y,
948 ch->quad.lr.x, ch->quad.lr.y,
949 ch->origin.x, ch->origin.y,
950 ch->bidi,
951 ch->argb & 0xFFFFFF,
952 ch->argb>>24,
953 ch->flags);
954 xml_write_char(ctx, out, ch->c);
955 if (!fz_is_valid_xml_char(ch->c))
956 {
957 char text[8];
958 int n = fz_runetochar(text, ch->c);
959 fz_write_string(ctx, out, "\" hexc=\"");
960 for (i = 0; i < n; i++)
961 fz_write_printf(ctx, out, "%02x", text[i]);
962 }
963 fz_write_string(ctx, out, "\"/>\n");
964 }
965
966 if (font)
967 fz_write_string(ctx, out, "</font>\n");
968
969 fz_write_string(ctx, out, "</line>\n");
970 }
971 fz_write_string(ctx, out, "</block>\n");
972 break;
973
974 case FZ_STEXT_BLOCK_IMAGE:
975 fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
976 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
977 break;
978
979 case FZ_STEXT_BLOCK_STRUCT:
980 fz_write_printf(ctx, out, "<struct idx=\"%d\" bbox=\"%g %g %g %g\"", block->u.s.index,
981 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
982 if (block->u.s.down)
983 fz_write_printf(ctx, out, " raw=\"%s\" std=\"%s\"",
984 block->u.s.down->raw, fz_structure_to_string(block->u.s.down->standard));
985 fz_write_printf(ctx, out, ">\n");
986 if (block->u.s.down)
987 as_xml(ctx, block->u.s.down->first_block, out);
988 fz_write_printf(ctx, out, "</struct>\n");
989 break;
990
991 case FZ_STEXT_BLOCK_VECTOR:
992 fz_write_printf(ctx, out, "<vector bbox=\"%g %g %g %g\" stroke=\"%d\" rectangle=\"%d\" continues=\"%d\" argb=\"%08x\"/>\n",
993 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1,
994 !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED),
995 !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE),
996 !!(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES),
997 block->u.v.argb);
998 break;
999
1000 case FZ_STEXT_BLOCK_GRID:
1001 fz_write_printf(ctx, out, "<grid xpos=\"");
1002 for (i = 0; i < block->u.b.xs->len; i++)
1003 fz_write_printf(ctx, out, "%g ", block->u.b.xs->list[i].pos);
1004 fz_write_printf(ctx, out, "\" xuncertainty=\"");
1005 for (i = 0; i < block->u.b.xs->len; i++)
1006 fz_write_printf(ctx, out, "%d ", block->u.b.xs->list[i].uncertainty);
1007 fz_write_printf(ctx, out, "\" xmaxuncertainty=\"%d\" ypos=\"", block->u.b.xs->max_uncertainty);
1008 for (i = 0; i < block->u.b.ys->len; i++)
1009 fz_write_printf(ctx, out, "%g ", block->u.b.ys->list[i].pos);
1010 fz_write_printf(ctx, out, "\" yuncertainty=\"");
1011 for (i = 0; i < block->u.b.ys->len; i++)
1012 fz_write_printf(ctx, out, "%d ", block->u.b.ys->list[i].uncertainty);
1013 fz_write_printf(ctx, out, "\" ymaxuncertainty=\"%d\" />\n", block->u.b.ys->max_uncertainty);
1014 break;
1015 }
1016 block = block->next;
1017 }
1018 }
1019
1020 void
1021 fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
1022 {
1023 fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
1024 page->mediabox.x1 - page->mediabox.x0,
1025 page->mediabox.y1 - page->mediabox.y0);
1026
1027 as_xml(ctx, page->first_block, out);
1028
1029 fz_write_string(ctx, out, "</page>\n");
1030 }
1031
1032 /* JSON dump */
1033
1034 static void
1035 as_json(fz_context *ctx, fz_stext_block *block, fz_output *out, float scale)
1036 {
1037 fz_stext_line *line;
1038 fz_stext_char *ch;
1039 int comma = 0;
1040
1041 while (block)
1042 {
1043 if (comma)
1044 fz_write_string(ctx, out, ",");
1045 comma = 1;
1046
1047 switch (block->type)
1048 {
1049 case FZ_STEXT_BLOCK_TEXT:
1050 fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
1051 fz_write_printf(ctx, out, "%q:{", "bbox");
1052 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
1053 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
1054 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
1055 fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
1056 fz_write_printf(ctx, out, "%q:[", "lines");
1057
1058 for (line = block->u.t.first_line; line; line = line->next)
1059 {
1060 if (line != block->u.t.first_line)
1061 fz_write_string(ctx, out, ",");
1062 fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
1063 fz_write_printf(ctx, out, "%q:{", "bbox");
1064 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
1065 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
1066 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
1067 fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
1068
1069 /* Since we force preserve-spans, the first char has the style for the entire line. */
1070 if (line->first_char)
1071 {
1072 fz_font *font = line->first_char->font;
1073 char *font_family = "sans-serif";
1074 char *font_weight = "normal";
1075 char *font_style = "normal";
1076 if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
1077 else if (fz_font_is_serif(ctx, font)) font_family = "serif";
1078 if (fz_font_is_bold(ctx, font)) font_weight = "bold";
1079 if (fz_font_is_italic(ctx, font)) font_style = "italic";
1080 fz_write_printf(ctx, out, "%q:{", "font");
1081 fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
1082 fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
1083 fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
1084 fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
1085 fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
1086 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
1087 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
1088 }
1089
1090 fz_write_printf(ctx, out, "%q:\"", "text");
1091 for (ch = line->first_char; ch; ch = ch->next)
1092 {
1093 if (ch->c == '"' || ch->c == '\\')
1094 fz_write_printf(ctx, out, "\\%c", ch->c);
1095 else if (ch->c < 32)
1096 fz_write_printf(ctx, out, "\\u%04x", ch->c);
1097 else
1098 fz_write_printf(ctx, out, "%C", ch->c);
1099 }
1100 fz_write_printf(ctx, out, "\"}");
1101 }
1102 fz_write_string(ctx, out, "]}");
1103 break;
1104
1105 case FZ_STEXT_BLOCK_IMAGE:
1106 fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
1107 fz_write_printf(ctx, out, "%q:{", "bbox");
1108 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
1109 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
1110 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
1111 fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
1112 break;
1113
1114 case FZ_STEXT_BLOCK_STRUCT:
1115 fz_write_printf(ctx, out, "{%q:%q,", "type", "structure");
1116 fz_write_printf(ctx, out, "%q:%d", "index", block->u.s.index);
1117 if (block->u.s.down)
1118 {
1119 fz_write_printf(ctx, out, ",%q:%q", "raw", block->u.s.down->raw);
1120 fz_write_printf(ctx, out, ",%q:%q", "std", fz_structure_to_string(block->u.s.down->standard));
1121 fz_write_printf(ctx, out, ",%q:[", "contents");
1122 as_json(ctx, block->u.s.down->first_block, out, scale);
1123 fz_write_printf(ctx, out, "]");
1124 }
1125 fz_write_printf(ctx, out, "}");
1126 break;
1127
1128 }
1129 block = block->next;
1130 }
1131 }
1132
1133 void
1134 fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
1135 {
1136 fz_write_printf(ctx, out, "{%q:[", "blocks");
1137
1138 as_json(ctx, page->first_block, out, scale);
1139
1140 fz_write_string(ctx, out, "]}");
1141 }
1142
1143 /* Plain text */
1144
1145 static void
1146 do_as_text(fz_context *ctx, fz_output *out, fz_stext_block *first_block)
1147 {
1148 fz_stext_block *block;
1149 fz_stext_line *line;
1150 fz_stext_char *ch;
1151 char utf[10];
1152 int i, n;
1153
1154 for (block = first_block; block; block = block->next)
1155 {
1156 switch (block->type)
1157 {
1158 case FZ_STEXT_BLOCK_TEXT:
1159 for (line = block->u.t.first_line; line; line = line->next)
1160 {
1161 for (ch = line->first_char; ch; ch = ch->next)
1162 {
1163 n = fz_runetochar(utf, ch->c);
1164 for (i = 0; i < n; i++)
1165 fz_write_byte(ctx, out, utf[i]);
1166 }
1167 fz_write_string(ctx, out, "\n");
1168 }
1169 fz_write_string(ctx, out, "\n");
1170 break;
1171 case FZ_STEXT_BLOCK_STRUCT:
1172 if (block->u.s.down != NULL)
1173 do_as_text(ctx, out, block->u.s.down->first_block);
1174 break;
1175 }
1176 }
1177 }
1178
1179 void
1180 fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
1181 {
1182 do_as_text(ctx, out, page->first_block);
1183 }
1184
1185 /* Text output writer */
1186
/* Output formats understood by the text document writer. */
enum {
	FZ_FORMAT_TEXT = 0,	/* plain text; also the fallback format */
	FZ_FORMAT_HTML,		/* HTML */
	FZ_FORMAT_XHTML,	/* XHTML */
	FZ_FORMAT_STEXT_XML,	/* structured text as XML */
	FZ_FORMAT_STEXT_JSON,	/* structured text as JSON */
};
1194
1195 typedef struct
1196 {
1197 fz_document_writer super;
1198 int format;
1199 int number;
1200 fz_stext_options opts;
1201 fz_stext_page *page;
1202 fz_output *out;
1203 } fz_text_writer;
1204
1205 static fz_device *
1206 text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
1207 {
1208 fz_text_writer *wri = (fz_text_writer*)wri_;
1209 float s = wri->opts.scale;
1210
1211 if (wri->page)
1212 {
1213 fz_drop_stext_page(ctx, wri->page);
1214 wri->page = NULL;
1215 }
1216
1217 wri->number++;
1218
1219 wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s)));
1220 return fz_new_stext_device(ctx, wri->page, &wri->opts);
1221 }
1222
1223 static void
1224 text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
1225 {
1226 fz_text_writer *wri = (fz_text_writer*)wri_;
1227 float s = wri->opts.scale;
1228
1229 fz_scale_stext_page(ctx, wri->page, s);
1230
1231 fz_try(ctx)
1232 {
1233 fz_close_device(ctx, dev);
1234 switch (wri->format)
1235 {
1236 default:
1237 case FZ_FORMAT_TEXT:
1238 fz_print_stext_page_as_text(ctx, wri->out, wri->page);
1239 break;
1240 case FZ_FORMAT_HTML:
1241 fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
1242 break;
1243 case FZ_FORMAT_XHTML:
1244 fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
1245 break;
1246 case FZ_FORMAT_STEXT_XML:
1247 fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
1248 break;
1249 case FZ_FORMAT_STEXT_JSON:
1250 if (wri->number > 1)
1251 fz_write_string(ctx, wri->out, ",");
1252 fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
1253 break;
1254 }
1255 }
1256 fz_always(ctx)
1257 {
1258 fz_drop_device(ctx, dev);
1259 fz_drop_stext_page(ctx, wri->page);
1260 wri->page = NULL;
1261 }
1262 fz_catch(ctx)
1263 fz_rethrow(ctx);
1264 }
1265
1266 static void
1267 text_close_writer(fz_context *ctx, fz_document_writer *wri_)
1268 {
1269 fz_text_writer *wri = (fz_text_writer*)wri_;
1270 switch (wri->format)
1271 {
1272 case FZ_FORMAT_HTML:
1273 fz_print_stext_trailer_as_html(ctx, wri->out);
1274 break;
1275 case FZ_FORMAT_XHTML:
1276 fz_print_stext_trailer_as_xhtml(ctx, wri->out);
1277 break;
1278 case FZ_FORMAT_STEXT_XML:
1279 fz_write_string(ctx, wri->out, "</document>\n");
1280 break;
1281 case FZ_FORMAT_STEXT_JSON:
1282 fz_write_string(ctx, wri->out, "]\n");
1283 break;
1284 }
1285 fz_close_output(ctx, wri->out);
1286 }
1287
1288 static void
1289 text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
1290 {
1291 fz_text_writer *wri = (fz_text_writer*)wri_;
1292 fz_drop_stext_page(ctx, wri->page);
1293 fz_drop_output(ctx, wri->out);
1294 }
1295
1296 fz_document_writer *
1297 fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
1298 {
1299 fz_text_writer *wri = NULL;
1300
1301 fz_var(wri);
1302
1303 fz_try(ctx)
1304 {
1305 wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
1306 fz_parse_stext_options(ctx, &wri->opts, options);
1307
1308 wri->format = FZ_FORMAT_TEXT;
1309 if (!strcmp(format, "text"))
1310 wri->format = FZ_FORMAT_TEXT;
1311 else if (!strcmp(format, "html"))
1312 wri->format = FZ_FORMAT_HTML;
1313 else if (!strcmp(format, "xhtml"))
1314 wri->format = FZ_FORMAT_XHTML;
1315 else if (!strcmp(format, "stext"))
1316 wri->format = FZ_FORMAT_STEXT_XML;
1317 else if (!strcmp(format, "stext.xml"))
1318 wri->format = FZ_FORMAT_STEXT_XML;
1319 else if (!strcmp(format, "stext.json"))
1320 {
1321 wri->format = FZ_FORMAT_STEXT_JSON;
1322 wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
1323 }
1324
1325 wri->out = out;
1326
1327 switch (wri->format)
1328 {
1329 case FZ_FORMAT_HTML:
1330 fz_print_stext_header_as_html(ctx, wri->out);
1331 break;
1332 case FZ_FORMAT_XHTML:
1333 fz_print_stext_header_as_xhtml(ctx, wri->out);
1334 break;
1335 case FZ_FORMAT_STEXT_XML:
1336 fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
1337 fz_write_string(ctx, wri->out, "<document>\n");
1338 break;
1339 case FZ_FORMAT_STEXT_JSON:
1340 fz_write_string(ctx, wri->out, "[");
1341 break;
1342 }
1343 }
1344 fz_catch(ctx)
1345 {
1346 fz_drop_output(ctx, out);
1347 fz_free(ctx, wri);
1348 fz_rethrow(ctx);
1349 }
1350
1351 return (fz_document_writer*)wri;
1352 }
1353
1354 fz_document_writer *
1355 fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
1356 {
1357 fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
1358 return fz_new_text_writer_with_output(ctx, format, out, options);
1359 }