Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/stext-output.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 | |
| 25 #define SUBSCRIPT_OFFSET 0.2f | |
| 26 #define SUPERSCRIPT_OFFSET -0.2f | |
| 27 | |
| 28 #include <ft2build.h> | |
| 29 #include FT_FREETYPE_H | |
| 30 | |
| 31 // Text black color when converted from DeviceCMYK to RGB | |
| 32 #define CMYK_BLACK 0x221f1f | |
| 33 | |
| 34 static void | |
| 35 scale_run(fz_context *ctx, fz_stext_block *block, float scale) | |
| 36 { | |
| 37 fz_matrix m = fz_scale(scale, scale); | |
| 38 fz_stext_line *line; | |
| 39 fz_stext_char *ch; | |
| 40 | |
| 41 while (block) | |
| 42 { | |
| 43 block->bbox = fz_transform_rect(block->bbox, m); | |
| 44 switch (block->type) | |
| 45 { | |
| 46 case FZ_STEXT_BLOCK_TEXT: | |
| 47 for (line = block->u.t.first_line; line; line = line->next) | |
| 48 { | |
| 49 line->bbox = fz_transform_rect(block->bbox, m); | |
| 50 for (ch = line->first_char; ch; ch = ch->next) | |
| 51 { | |
| 52 ch->origin = fz_transform_point(ch->origin, m); | |
| 53 ch->quad = fz_transform_quad(ch->quad, m); | |
| 54 ch->size = ch->size * scale; | |
| 55 } | |
| 56 } | |
| 57 break; | |
| 58 | |
| 59 case FZ_STEXT_BLOCK_IMAGE: | |
| 60 block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale); | |
| 61 break; | |
| 62 | |
| 63 case FZ_STEXT_BLOCK_STRUCT: | |
| 64 if (block->u.s.down) | |
| 65 scale_run(ctx, block->u.s.down->first_block, scale); | |
| 66 break; | |
| 67 } | |
| 68 block = block->next; | |
| 69 } | |
| 70 } | |
| 71 | |
| 72 static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale) | |
| 73 { | |
| 74 scale_run(ctx, page->first_block, scale); | |
| 75 } | |
| 76 | |
| 77 /* HTML output (visual formatting with preserved layout) */ | |
| 78 | |
| 79 static int | |
| 80 detect_super_script(fz_stext_line *line, fz_stext_char *ch) | |
| 81 { | |
| 82 if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) | |
| 83 return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; | |
| 84 return 0; | |
| 85 } | |
| 86 | |
| 87 static const char * | |
| 88 font_full_name(fz_context *ctx, fz_font *font) | |
| 89 { | |
| 90 const char *name = fz_font_name(ctx, font); | |
| 91 const char *s = strchr(name, '+'); | |
| 92 return s ? s + 1 : name; | |
| 93 } | |
| 94 | |
/*
	Map a font name onto a common family name for use in HTML.

	Names containing "Times" become "Times New Roman"; names containing
	"Arial" or "Helvetica" become "Arial" (or "Arial Narrow" when they
	also contain "Narrow" or "Condensed"); names containing "Courier"
	become "Courier". Any other name is returned unchanged.
*/
static const char *
html_clean_font_name(const char *fontname)
{
	int narrow;

	if (strstr(fontname, "Times") != NULL)
		return "Times New Roman";

	if (strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL)
	{
		narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
		return narrow ? "Arial Narrow" : "Arial";
	}

	if (strstr(fontname, "Courier") != NULL)
		return "Courier";

	return fontname;
}
| 110 | |
| 111 static void | |
| 112 font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif) | |
| 113 { | |
| 114 const char *name = html_clean_font_name(font_full_name(ctx, font)); | |
| 115 char *s; | |
| 116 fz_strlcpy(buf, name, size); | |
| 117 s = strrchr(buf, '-'); | |
| 118 if (s) | |
| 119 *s = 0; | |
| 120 if (is_mono) | |
| 121 fz_strlcat(buf, ",monospace", size); | |
| 122 else | |
| 123 fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size); | |
| 124 } | |
| 125 | |
| 126 static void | |
| 127 fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color) | |
| 128 { | |
| 129 char family[80]; | |
| 130 | |
| 131 int is_bold = fz_font_is_bold(ctx, font); | |
| 132 int is_italic = fz_font_is_italic(ctx, font); | |
| 133 int is_serif = fz_font_is_serif(ctx, font); | |
| 134 int is_mono = fz_font_is_monospaced(ctx, font); | |
| 135 | |
| 136 font_family_name(ctx, font, family, sizeof family, is_mono, is_serif); | |
| 137 | |
| 138 if (sup) fz_write_string(ctx, out, "<sup>"); | |
| 139 if (is_mono) fz_write_string(ctx, out, "<tt>"); | |
| 140 if (is_bold) fz_write_string(ctx, out, "<b>"); | |
| 141 if (is_italic) fz_write_string(ctx, out, "<i>"); | |
| 142 fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size); | |
| 143 if (color != 0 && color != CMYK_BLACK) | |
| 144 fz_write_printf(ctx, out, ";color:#%06x", color & 0xffffff); | |
| 145 fz_write_printf(ctx, out, "\">"); | |
| 146 } | |
| 147 | |
| 148 static void | |
| 149 fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color) | |
| 150 { | |
| 151 int is_mono = fz_font_is_monospaced(ctx, font); | |
| 152 int is_bold = fz_font_is_bold(ctx,font); | |
| 153 int is_italic = fz_font_is_italic(ctx, font); | |
| 154 | |
| 155 fz_write_string(ctx, out, "</span>"); | |
| 156 if (is_italic) fz_write_string(ctx, out, "</i>"); | |
| 157 if (is_bold) fz_write_string(ctx, out, "</b>"); | |
| 158 if (is_mono) fz_write_string(ctx, out, "</tt>"); | |
| 159 if (sup) fz_write_string(ctx, out, "</sup>"); | |
| 160 } | |
| 161 | |
| 162 static void | |
| 163 fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 164 { | |
| 165 fz_matrix ctm = block->u.i.transform; | |
| 166 | |
| 167 #define USE_CSS_MATRIX_TRANSFORMS | |
| 168 #ifdef USE_CSS_MATRIX_TRANSFORMS | |
| 169 /* Matrix maths notes. | |
| 170 * When we get here ctm maps the unit square to the position in device | |
| 171 * space occupied by the image. | |
| 172 * | |
| 173 * That is to say that mapping the 4 corners of the unit square through | |
| 174 * the transform, give us the 4 target corners. We extend the corners | |
| 175 * by adding an extra '1' into them to allow transforms to work. Thus | |
| 176 * (x,y) maps through ctm = (a b c d e f) as: | |
| 177 * | |
| 178 * (x y 1) (a b 0) = (X Y 1) | |
| 179 * (c d 0) | |
| 180 * (e f 1) | |
| 181 * | |
| 182 * To simplify reading of matrix maths, we use the trick where we | |
| 183 * 'drop' the first matrix down the page. Thus the corners c0=(0,0), | |
| 184 * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively: | |
| 185 * | |
| 186 * ( a b 0) | |
| 187 * ( c d 0) | |
| 188 * ( e f 1) | |
| 189 * (0 0 1) ( e f 1) | |
| 190 * (0 1 1) ( c+e d+f 1) | |
| 191 * (1 1 1) (a+c+e b+d+f 1) | |
| 192 * (1 0 1) ( a+e b+f 1) | |
| 193 * | |
| 194 * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f) | |
| 195 * | |
| 196 * Unfortunately, the CSS matrix transform, does not map the unit square. | |
| 197 * Rather it does something moderately mad. As far as I can work out, the | |
| 198 * top left corner of a (0,0) -> (w, h) box is transformed using the .e | |
| 199 * and .f entries of the matrix. Then the image from within that square | |
| 200 * is transformed using the centre of that square as the origin. | |
| 201 * | |
| 202 * So, an image placed at (0,0) in destination space with 1:1 transform | |
| 203 * will result in an image a (0,0) as you'd expect. But an image at (0,0) | |
| 204 * with a scale of 2, will result in 25% of the image off the left of the | |
| 205 * screen, and 25% off the top. | |
| 206 * | |
| 207 * Accordingly, we have to adjust the ctm in several steps. | |
| 208 */ | |
| 209 /* Move to moving the centre of the image. */ | |
| 210 ctm.e += (ctm.a+ctm.c)/2; | |
| 211 ctm.f += (ctm.b+ctm.d)/2; | |
| 212 /* Move from transforming the unit square to w/h */ | |
| 213 ctm.a /= block->u.i.image->w; | |
| 214 ctm.b /= block->u.i.image->w; | |
| 215 ctm.c /= block->u.i.image->h; | |
| 216 ctm.d /= block->u.i.image->h; | |
| 217 /* Move from points to pixels */ | |
| 218 ctm.a *= 96.0f/72; | |
| 219 ctm.b *= 96.0f/72; | |
| 220 ctm.c *= 96.0f/72; | |
| 221 ctm.d *= 96.0f/72; | |
| 222 ctm.e *= 96.0f/72; | |
| 223 ctm.f *= 96.0f/72; | |
| 224 /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */ | |
| 225 ctm.e -= block->u.i.image->w/2; | |
| 226 ctm.f -= block->u.i.image->h/2; | |
| 227 | |
| 228 fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"", | |
| 229 ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f); | |
| 230 #else | |
| 231 /* Alternative version of the code that uses scaleX/Y and rotate | |
| 232 * instead, but only copes with axis aligned cases. */ | |
| 233 int t; | |
| 234 | |
| 235 int x = block->bbox.x0; | |
| 236 int y = block->bbox.y0; | |
| 237 int w = block->bbox.x1 - block->bbox.x0; | |
| 238 int h = block->bbox.y1 - block->bbox.y0; | |
| 239 | |
| 240 const char *flip = ""; | |
| 241 | |
| 242 if (ctm.b == 0 && ctm.c == 0) | |
| 243 { | |
| 244 if (ctm.a < 0 && ctm.d < 0) | |
| 245 flip = "transform: scaleX(-1) scaleY(-1);"; | |
| 246 else if (ctm.a < 0) | |
| 247 { | |
| 248 flip = "transform: scaleX(-1);"; | |
| 249 } | |
| 250 else if (ctm.d < 0) | |
| 251 { | |
| 252 flip = "transform: scaleY(-1);"; | |
| 253 } | |
| 254 } else if (ctm.a == 0 && ctm.d == 0) { | |
| 255 if (ctm.b < 0 && ctm.c < 0) | |
| 256 { | |
| 257 flip = "transform: scaleY(-1) rotate(90deg);"; | |
| 258 x += (w-h)/2; | |
| 259 y -= (w-h)/2; | |
| 260 t = w; w = h; h = t; | |
| 261 } | |
| 262 else if (ctm.b < 0) | |
| 263 { | |
| 264 flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);"; | |
| 265 x += (w-h)/2; | |
| 266 y -= (w-h)/2; | |
| 267 t = w; w = h; h = t; | |
| 268 } | |
| 269 else if (ctm.c < 0) | |
| 270 { | |
| 271 flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);"; | |
| 272 x += (w-h)/2; | |
| 273 y -= (w-h)/2; | |
| 274 t = w; w = h; h = t; | |
| 275 } | |
| 276 else | |
| 277 { | |
| 278 flip = "transform: scaleY(-1) rotate(270deg);"; | |
| 279 x += (w-h)/2; | |
| 280 y -= (w-h)/2; | |
| 281 t = w; w = h; h = t; | |
| 282 } | |
| 283 } | |
| 284 | |
| 285 fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h); | |
| 286 #endif | |
| 287 fz_write_image_as_data_uri(ctx, out, block->u.i.image); | |
| 288 fz_write_string(ctx, out, "\">\n"); | |
| 289 } | |
| 290 | |
| 291 void | |
| 292 fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 293 { | |
| 294 fz_stext_line *line; | |
| 295 fz_stext_char *ch; | |
| 296 float x, y, h; | |
| 297 | |
| 298 fz_font *font = NULL; | |
| 299 float size = 0; | |
| 300 int sup = 0; | |
| 301 uint32_t color = 0; | |
| 302 | |
| 303 for (line = block->u.t.first_line; line; line = line->next) | |
| 304 { | |
| 305 x = line->bbox.x0; | |
| 306 y = line->bbox.y0; | |
| 307 h = line->bbox.y1 - line->bbox.y0; | |
| 308 | |
| 309 if (line->first_char) | |
| 310 { | |
| 311 h = line->first_char->size; | |
| 312 y = line->first_char->origin.y - h * 0.8f; | |
| 313 } | |
| 314 | |
| 315 fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h); | |
| 316 font = NULL; | |
| 317 | |
| 318 for (ch = line->first_char; ch; ch = ch->next) | |
| 319 { | |
| 320 int ch_sup = detect_super_script(line, ch); | |
| 321 if (ch->font != font || ch->size != size || ch_sup != sup || ch->argb != color) | |
| 322 { | |
| 323 if (font) | |
| 324 fz_print_style_end_html(ctx, out, font, size, sup, color); | |
| 325 font = ch->font; | |
| 326 size = ch->size; | |
| 327 color = ch->argb; | |
| 328 sup = ch_sup; | |
| 329 fz_print_style_begin_html(ctx, out, font, size, sup, color); | |
| 330 } | |
| 331 | |
| 332 switch (ch->c) | |
| 333 { | |
| 334 default: | |
| 335 if (ch->c >= 32 && ch->c <= 127) | |
| 336 fz_write_byte(ctx, out, ch->c); | |
| 337 else | |
| 338 fz_write_printf(ctx, out, "&#x%x;", ch->c); | |
| 339 break; | |
| 340 case '<': fz_write_string(ctx, out, "<"); break; | |
| 341 case '>': fz_write_string(ctx, out, ">"); break; | |
| 342 case '&': fz_write_string(ctx, out, "&"); break; | |
| 343 case '"': fz_write_string(ctx, out, """); break; | |
| 344 case '\'': fz_write_string(ctx, out, "'"); break; | |
| 345 } | |
| 346 } | |
| 347 | |
| 348 if (font) | |
| 349 fz_print_style_end_html(ctx, out, font, size, sup, color); | |
| 350 | |
| 351 fz_write_string(ctx, out, "</p>\n"); | |
| 352 } | |
| 353 } | |
| 354 | |
| 355 static const char * | |
| 356 html_tag_for_struct(fz_stext_struct *s) | |
| 357 { | |
| 358 const char *raw; | |
| 359 | |
| 360 if (s == NULL) | |
| 361 return "DIV"; | |
| 362 | |
| 363 raw = s->raw; | |
| 364 if (raw == NULL) | |
| 365 raw = fz_structure_to_string(s->standard); | |
| 366 | |
| 367 if (!fz_strcasecmp(raw, "blockquote")) | |
| 368 return "blockquote"; | |
| 369 if (!fz_strcasecmp(raw, "title")) | |
| 370 return "h1"; | |
| 371 if (!fz_strcasecmp(raw, "sub")) | |
| 372 return "sub"; | |
| 373 if (!fz_strcasecmp(raw, "p")) | |
| 374 return "p"; | |
| 375 if (!fz_strcasecmp(raw, "h")) | |
| 376 return "h1"; /* Pick one! */ | |
| 377 if (!fz_strcasecmp(raw, "h1")) | |
| 378 return "h1"; | |
| 379 if (!fz_strcasecmp(raw, "h2")) | |
| 380 return "h2"; | |
| 381 if (!fz_strcasecmp(raw, "h3")) | |
| 382 return "h3"; | |
| 383 if (!fz_strcasecmp(raw, "h4")) | |
| 384 return "h4"; | |
| 385 if (!fz_strcasecmp(raw, "h5")) | |
| 386 return "h5"; | |
| 387 if (!fz_strcasecmp(raw, "h6")) | |
| 388 return "h6"; | |
| 389 | |
| 390 if (!fz_strcasecmp(raw, "list")) | |
| 391 return "ul"; | |
| 392 if (!fz_strcasecmp(raw, "listitem")) | |
| 393 return "li"; | |
| 394 if (!fz_strcasecmp(raw, "table")) | |
| 395 return "table"; | |
| 396 if (!fz_strcasecmp(raw, "tr")) | |
| 397 return "tr"; | |
| 398 if (!fz_strcasecmp(raw, "th")) | |
| 399 return "th"; | |
| 400 if (!fz_strcasecmp(raw, "td")) | |
| 401 return "td"; | |
| 402 if (!fz_strcasecmp(raw, "thead")) | |
| 403 return "thead"; | |
| 404 if (!fz_strcasecmp(raw, "tbody")) | |
| 405 return "tbody"; | |
| 406 if (!fz_strcasecmp(raw, "tfoot")) | |
| 407 return "tfoot"; | |
| 408 | |
| 409 if (!fz_strcasecmp(raw, "span")) | |
| 410 return "span"; | |
| 411 if (!fz_strcasecmp(raw, "code")) | |
| 412 return "code"; | |
| 413 if (!fz_strcasecmp(raw, "em")) | |
| 414 return "em"; | |
| 415 if (!fz_strcasecmp(raw, "strong")) | |
| 416 return "strong"; | |
| 417 | |
| 418 return "div"; | |
| 419 } | |
| 420 | |
| 421 static void | |
| 422 print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block); | |
| 423 | |
| 424 static void | |
| 425 fz_print_stext_struct_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 426 { | |
| 427 const char *tag; | |
| 428 | |
| 429 if (block->u.s.down == NULL) | |
| 430 return; | |
| 431 | |
| 432 tag = html_tag_for_struct(block->u.s.down); | |
| 433 | |
| 434 fz_write_printf(ctx, out, "<%s>\n", tag); | |
| 435 | |
| 436 print_blocks_as_html(ctx, out, block->u.s.down->first_block); | |
| 437 | |
| 438 fz_write_printf(ctx, out, "</%s>\n", tag); | |
| 439 } | |
| 440 | |
| 441 static void | |
| 442 print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 443 { | |
| 444 for (; block; block = block->next) | |
| 445 { | |
| 446 if (block->type == FZ_STEXT_BLOCK_IMAGE) | |
| 447 fz_print_stext_image_as_html(ctx, out, block); | |
| 448 else if (block->type == FZ_STEXT_BLOCK_TEXT) | |
| 449 fz_print_stext_block_as_html(ctx, out, block); | |
| 450 else if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 451 fz_print_stext_struct_as_html(ctx, out, block); | |
| 452 } | |
| 453 } | |
| 454 | |
| 455 void | |
| 456 fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) | |
| 457 { | |
| 458 float w = page->mediabox.x1 - page->mediabox.x0; | |
| 459 float h = page->mediabox.y1 - page->mediabox.y0; | |
| 460 | |
| 461 fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h); | |
| 462 | |
| 463 print_blocks_as_html(ctx, out, page->first_block); | |
| 464 | |
| 465 fz_write_string(ctx, out, "</div>\n"); | |
| 466 } | |
| 467 | |
| 468 void | |
| 469 fz_print_stext_header_as_html(fz_context *ctx, fz_output *out) | |
| 470 { | |
| 471 fz_write_string(ctx, out, "<!DOCTYPE html>\n"); | |
| 472 fz_write_string(ctx, out, "<html>\n"); | |
| 473 fz_write_string(ctx, out, "<head>\n"); | |
| 474 fz_write_string(ctx, out, "<style>\n"); | |
| 475 fz_write_string(ctx, out, "body{background-color:slategray}\n"); | |
| 476 fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n"); | |
| 477 fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n"); | |
| 478 fz_write_string(ctx, out, "</style>\n"); | |
| 479 fz_write_string(ctx, out, "</head>\n"); | |
| 480 fz_write_string(ctx, out, "<body>\n"); | |
| 481 } | |
| 482 | |
| 483 void | |
| 484 fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) | |
| 485 { | |
| 486 fz_write_string(ctx, out, "</body>\n"); | |
| 487 fz_write_string(ctx, out, "</html>\n"); | |
| 488 } | |
| 489 | |
| 490 /* XHTML output (semantic, little layout, suitable for reflow) */ | |
| 491 | |
| 492 static void | |
| 493 find_table_pos(fz_stext_grid_positions *xs, float x0, float x1, int *ix0, int *ix1) | |
| 494 { | |
| 495 int i; | |
| 496 | |
| 497 *ix0 = -1; | |
| 498 *ix1 = -1; | |
| 499 | |
| 500 for (i = 1; i < xs->len; i++) | |
| 501 if (x0 < xs->list[i].pos) | |
| 502 { | |
| 503 *ix0 = i-1; | |
| 504 break; | |
| 505 } | |
| 506 for (; i < xs->len; i++) | |
| 507 if (x1 < xs->list[i].pos) | |
| 508 { | |
| 509 *ix1 = i-1; | |
| 510 break; | |
| 511 } | |
| 512 if (i == xs->len) | |
| 513 *ix1 = i-1; | |
| 514 } | |
| 515 | |
| 516 static void | |
| 517 run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out); | |
| 518 | |
| 519 static void | |
| 520 fz_print_stext_table_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 521 { | |
| 522 fz_stext_block *grid, *tr, *td; | |
| 523 int w, h; | |
| 524 int x, y; | |
| 525 uint8_t *cells; | |
| 526 int malformed = 0; | |
| 527 | |
| 528 for (grid = block; grid != NULL; grid = grid->next) | |
| 529 if (grid->type == FZ_STEXT_BLOCK_GRID) | |
| 530 break; | |
| 531 if (grid == NULL) | |
| 532 { | |
| 533 fz_warn(ctx, "Malformed table data"); | |
| 534 return; | |
| 535 } | |
| 536 w = grid->u.b.xs->len; | |
| 537 h = grid->u.b.ys->len; | |
| 538 cells = fz_calloc(ctx, w, h); | |
| 539 | |
| 540 fz_try(ctx) | |
| 541 { | |
| 542 fz_write_printf(ctx, out, "<table>\n"); | |
| 543 | |
| 544 y = 0; | |
| 545 for (tr = grid->next; tr != NULL; tr = tr->next) | |
| 546 { | |
| 547 if (tr->type != FZ_STEXT_BLOCK_STRUCT || tr->u.s.down == NULL || tr->u.s.down->standard != FZ_STRUCTURE_TR) | |
| 548 { | |
| 549 malformed = 1; | |
| 550 continue; | |
| 551 } | |
| 552 fz_write_printf(ctx, out, "<tr>\n"); | |
| 553 x = 0; | |
| 554 for (td = tr->u.s.down->first_block; td != NULL; td = td->next) | |
| 555 { | |
| 556 int x0, y0, x1, y1; | |
| 557 if (td->type != FZ_STEXT_BLOCK_STRUCT || td->u.s.down == NULL || td->u.s.down->standard != FZ_STRUCTURE_TD) | |
| 558 { | |
| 559 malformed = 1; | |
| 560 continue; | |
| 561 } | |
| 562 find_table_pos(grid->u.b.xs, td->bbox.x0, td->bbox.x1, &x0, &x1); | |
| 563 find_table_pos(grid->u.b.ys, td->bbox.y0, td->bbox.y1, &y0, &y1); | |
| 564 if (x0 < 0 || x1 < 0 || x1 >= w) | |
| 565 { | |
| 566 malformed = 1; | |
| 567 x0 = x; | |
| 568 x1 = x+1; | |
| 569 } | |
| 570 if (y0 < 0 || y1 < 0 || y1 >= h) | |
| 571 { | |
| 572 malformed = 1; | |
| 573 y0 = y; | |
| 574 y1 = y+1; | |
| 575 } | |
| 576 if (y < y0) | |
| 577 { | |
| 578 malformed = 1; | |
| 579 continue; | |
| 580 } | |
| 581 if (x > x0) | |
| 582 { | |
| 583 malformed = 1; | |
| 584 } | |
| 585 while (x < x0) | |
| 586 { | |
| 587 uint8_t *c = &cells[x + w*y]; | |
| 588 if (*c == 0) | |
| 589 { | |
| 590 fz_write_printf(ctx, out, "<td></td>"); | |
| 591 *c = 1; | |
| 592 } | |
| 593 x++; | |
| 594 } | |
| 595 fz_write_string(ctx, out, "<td"); | |
| 596 if (x1 > x0+1) | |
| 597 fz_write_printf(ctx, out, " rowspan=%d", x1-x0); | |
| 598 if (y1 > y0+1) | |
| 599 fz_write_printf(ctx, out, " colspan=%d", y1-y0); | |
| 600 fz_write_string(ctx, out, ">\n"); | |
| 601 run_to_xhtml(ctx, td->u.s.down->first_block, out); | |
| 602 fz_write_printf(ctx, out, "</td>\n"); | |
| 603 for ( ; y0 < y1; y0++) | |
| 604 for (x = x0; x < x1; x++) | |
| 605 { | |
| 606 uint8_t *c = &cells[x + w*y0]; | |
| 607 if (*c != 0) | |
| 608 malformed = 1; | |
| 609 *c = 1; | |
| 610 } | |
| 611 } | |
| 612 fz_write_printf(ctx, out, "</tr>\n"); | |
| 613 y++; | |
| 614 } | |
| 615 | |
| 616 fz_write_printf(ctx, out, "</table>\n"); | |
| 617 } | |
| 618 fz_always(ctx) | |
| 619 fz_free(ctx, cells); | |
| 620 fz_catch(ctx) | |
| 621 fz_rethrow(ctx); | |
| 622 | |
| 623 if (malformed) | |
| 624 fz_warn(ctx, "Malformed table data"); | |
| 625 } | |
| 626 | |
| 627 static void | |
| 628 fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 629 { | |
| 630 int w = block->bbox.x1 - block->bbox.x0; | |
| 631 int h = block->bbox.y1 - block->bbox.y0; | |
| 632 | |
| 633 fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h); | |
| 634 fz_write_image_as_data_uri(ctx, out, block->u.i.image); | |
| 635 fz_write_string(ctx, out, "\"/></p>\n"); | |
| 636 } | |
| 637 | |
| 638 static void | |
| 639 fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup) | |
| 640 { | |
| 641 int is_mono = fz_font_is_monospaced(ctx, font); | |
| 642 int is_bold = fz_font_is_bold(ctx, font); | |
| 643 int is_italic = fz_font_is_italic(ctx, font); | |
| 644 | |
| 645 if (sup) | |
| 646 fz_write_string(ctx, out, "<sup>"); | |
| 647 if (is_mono) | |
| 648 fz_write_string(ctx, out, "<tt>"); | |
| 649 if (is_bold) | |
| 650 fz_write_string(ctx, out, "<b>"); | |
| 651 if (is_italic) | |
| 652 fz_write_string(ctx, out, "<i>"); | |
| 653 } | |
| 654 | |
| 655 static void | |
| 656 fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup) | |
| 657 { | |
| 658 int is_mono = fz_font_is_monospaced(ctx, font); | |
| 659 int is_bold = fz_font_is_bold(ctx, font); | |
| 660 int is_italic = fz_font_is_italic(ctx, font); | |
| 661 | |
| 662 if (is_italic) | |
| 663 fz_write_string(ctx, out, "</i>"); | |
| 664 if (is_bold) | |
| 665 fz_write_string(ctx, out, "</b>"); | |
| 666 if (is_mono) | |
| 667 fz_write_string(ctx, out, "</tt>"); | |
| 668 if (sup) | |
| 669 fz_write_string(ctx, out, "</sup>"); | |
| 670 } | |
| 671 | |
| 672 static float avg_font_size_of_line(fz_stext_char *ch) | |
| 673 { | |
| 674 float size = 0; | |
| 675 int n = 0; | |
| 676 if (!ch) | |
| 677 return 0; | |
| 678 while (ch) | |
| 679 { | |
| 680 size += ch->size; | |
| 681 ++n; | |
| 682 ch = ch->next; | |
| 683 } | |
| 684 return size / n; | |
| 685 } | |
| 686 | |
/*
	Pick an XHTML element name from a font size: "h1" for sizes of at
	least 20, "h2" for at least 15, "h3" for at least 12, and "p" for
	everything else.
*/
static const char *tag_from_font_size(float size)
{
	static const struct { float min_size; const char *tag; } levels[] = {
		{ 20, "h1" },
		{ 15, "h2" },
		{ 12, "h3" },
	};
	size_t i;

	for (i = 0; i < sizeof levels / sizeof levels[0]; i++)
		if (size >= levels[i].min_size)
			return levels[i].tag;

	return "p";
}
| 694 | |
| 695 static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 696 { | |
| 697 fz_stext_line *line; | |
| 698 fz_stext_char *ch; | |
| 699 | |
| 700 fz_font *font = NULL; | |
| 701 int sup = 0; | |
| 702 int sp = 1; | |
| 703 const char *tag = NULL; | |
| 704 const char *new_tag; | |
| 705 | |
| 706 for (line = block->u.t.first_line; line; line = line->next) | |
| 707 { | |
| 708 new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char)); | |
| 709 if (tag != new_tag) | |
| 710 { | |
| 711 if (tag) | |
| 712 { | |
| 713 if (font) | |
| 714 fz_print_style_end_xhtml(ctx, out, font, sup); | |
| 715 fz_write_printf(ctx, out, "</%s>", tag); | |
| 716 } | |
| 717 tag = new_tag; | |
| 718 fz_write_printf(ctx, out, "<%s>", tag); | |
| 719 if (font) | |
| 720 fz_print_style_begin_xhtml(ctx, out, font, sup); | |
| 721 } | |
| 722 | |
| 723 if (!sp) | |
| 724 fz_write_byte(ctx, out, ' '); | |
| 725 | |
| 726 for (ch = line->first_char; ch; ch = ch->next) | |
| 727 { | |
| 728 int ch_sup = detect_super_script(line, ch); | |
| 729 if (ch->font != font || ch_sup != sup) | |
| 730 { | |
| 731 if (font) | |
| 732 fz_print_style_end_xhtml(ctx, out, font, sup); | |
| 733 font = ch->font; | |
| 734 sup = ch_sup; | |
| 735 fz_print_style_begin_xhtml(ctx, out, font, sup); | |
| 736 } | |
| 737 | |
| 738 sp = (ch->c == ' '); | |
| 739 switch (ch->c) | |
| 740 { | |
| 741 default: | |
| 742 if (ch->c >= 32 && ch->c <= 127) | |
| 743 fz_write_byte(ctx, out, ch->c); | |
| 744 else | |
| 745 fz_write_printf(ctx, out, "&#x%x;", ch->c); | |
| 746 break; | |
| 747 case '<': fz_write_string(ctx, out, "<"); break; | |
| 748 case '>': fz_write_string(ctx, out, ">"); break; | |
| 749 case '&': fz_write_string(ctx, out, "&"); break; | |
| 750 case '"': fz_write_string(ctx, out, """); break; | |
| 751 case '\'': fz_write_string(ctx, out, "'"); break; | |
| 752 } | |
| 753 } | |
| 754 } | |
| 755 | |
| 756 if (font) | |
| 757 fz_print_style_end_xhtml(ctx, out, font, sup); | |
| 758 fz_write_printf(ctx, out, "</%s>\n", tag); | |
| 759 } | |
| 760 | |
| 761 static void | |
| 762 fz_print_struct_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) | |
| 763 { | |
| 764 const char *tag; | |
| 765 | |
| 766 if (block->u.s.down == NULL) | |
| 767 return; | |
| 768 | |
| 769 if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) | |
| 770 { | |
| 771 fz_print_stext_table_as_xhtml(ctx, out, block->u.s.down->first_block); | |
| 772 return; | |
| 773 } | |
| 774 | |
| 775 tag = html_tag_for_struct(block->u.s.down); | |
| 776 | |
| 777 fz_write_printf(ctx, out, "<%s>\n", tag); | |
| 778 | |
| 779 run_to_xhtml(ctx, block->u.s.down->first_block, out); | |
| 780 | |
| 781 fz_write_printf(ctx, out, "</%s>\n", tag); | |
| 782 } | |
| 783 | |
| 784 static void | |
| 785 run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out) | |
| 786 { | |
| 787 while (block) | |
| 788 { | |
| 789 switch(block->type) | |
| 790 { | |
| 791 case FZ_STEXT_BLOCK_IMAGE: | |
| 792 fz_print_stext_image_as_xhtml(ctx, out, block); | |
| 793 break; | |
| 794 case FZ_STEXT_BLOCK_TEXT: | |
| 795 fz_print_stext_block_as_xhtml(ctx, out, block); | |
| 796 break; | |
| 797 case FZ_STEXT_BLOCK_STRUCT: | |
| 798 fz_print_struct_as_xhtml(ctx, out, block); | |
| 799 break; | |
| 800 } | |
| 801 block = block->next; | |
| 802 } | |
| 803 } | |
| 804 | |
| 805 void | |
| 806 fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) | |
| 807 { | |
| 808 fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id); | |
| 809 | |
| 810 run_to_xhtml(ctx, page->first_block, out); | |
| 811 | |
| 812 fz_write_string(ctx, out, "</div>\n"); | |
| 813 } | |
| 814 | |
| 815 void | |
| 816 fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) | |
| 817 { | |
| 818 fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n"); | |
| 819 fz_write_string(ctx, out, "<!DOCTYPE html"); | |
| 820 fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""); | |
| 821 fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"); | |
| 822 fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"); | |
| 823 fz_write_string(ctx, out, "<head>\n"); | |
| 824 fz_write_string(ctx, out, "<style>\n"); | |
| 825 fz_write_string(ctx, out, "p{white-space:pre-wrap}\n"); | |
| 826 fz_write_string(ctx, out, "</style>\n"); | |
| 827 fz_write_string(ctx, out, "</head>\n"); | |
| 828 fz_write_string(ctx, out, "<body>\n"); | |
| 829 } | |
| 830 | |
| 831 void | |
| 832 fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) | |
| 833 { | |
| 834 fz_write_string(ctx, out, "</body>\n"); | |
| 835 fz_write_string(ctx, out, "</html>\n"); | |
| 836 } | |
| 837 | |
| 838 /* Detailed XML dump of the entire structured text data */ | |
| 839 | |
| 840 static void | |
| 841 xml_write_char(fz_context *ctx, fz_output *out, int c) | |
| 842 { | |
| 843 switch (c) | |
| 844 { | |
| 845 case '<': fz_write_string(ctx, out, "<"); break; | |
| 846 case '>': fz_write_string(ctx, out, ">"); break; | |
| 847 case '&': fz_write_string(ctx, out, "&"); break; | |
| 848 case '"': fz_write_string(ctx, out, """); break; | |
| 849 case '\'': fz_write_string(ctx, out, "'"); break; | |
| 850 default: | |
| 851 if (c >= 32 && c <= 127) | |
| 852 fz_write_printf(ctx, out, "%c", c); | |
| 853 else | |
| 854 fz_write_printf(ctx, out, "&#x%x;", c); | |
| 855 break; | |
| 856 } | |
| 857 } | |
| 858 | |
| 859 static void | |
| 860 as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out) | |
| 861 { | |
| 862 fz_stext_line *line; | |
| 863 fz_stext_char *ch; | |
| 864 int i; | |
| 865 | |
| 866 while (block) | |
| 867 { | |
| 868 switch (block->type) | |
| 869 { | |
| 870 case FZ_STEXT_BLOCK_TEXT: | |
| 871 fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\"", | |
| 872 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); | |
| 873 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_UNKNOWN) | |
| 874 fz_write_printf(ctx, out, " justify=\"unknown\""); | |
| 875 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_LEFT) | |
| 876 fz_write_printf(ctx, out, " justify=\"left\""); | |
| 877 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_CENTRE) | |
| 878 fz_write_printf(ctx, out, " justify=\"centre\""); | |
| 879 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_RIGHT) | |
| 880 fz_write_printf(ctx, out, " justify=\"right\""); | |
| 881 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) | |
| 882 fz_write_printf(ctx, out, " justify=\"full\""); | |
| 883 fz_write_printf(ctx, out, ">\n"); | |
| 884 for (line = block->u.t.first_line; line; line = line->next) | |
| 885 { | |
| 886 fz_font *font = NULL; | |
| 887 float size = 0; | |
| 888 const char *name = NULL; | |
| 889 | |
| 890 fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\"", | |
| 891 line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1, | |
| 892 line->wmode, | |
| 893 line->dir.x, line->dir.y); | |
| 894 | |
| 895 /* This is duplication of information, but it makes it MUCH easier to search for | |
| 896 * text fragments in large output. */ | |
| 897 { | |
| 898 int valid = 1; | |
| 899 fz_write_printf(ctx, out, " text=\""); | |
| 900 for (ch = line->first_char; ch; ch = ch->next) | |
| 901 { | |
| 902 if (valid) | |
| 903 valid = fz_is_valid_xml_char(ch->c); | |
| 904 xml_write_char(ctx, out, fz_range_limit_xml_char(ch->c)); | |
| 905 } | |
| 906 if (!valid) | |
| 907 { | |
| 908 fz_write_printf(ctx, out, "\" hextext=\""); | |
| 909 for (ch = line->first_char; ch; ch = ch->next) | |
| 910 { | |
| 911 char text[8]; | |
| 912 int n = fz_runetochar(text, ch->c); | |
| 913 for (i = 0; i < n; i++) | |
| 914 fz_write_printf(ctx, out, "%02x", text[i]); | |
| 915 } | |
| 916 } | |
| 917 fz_write_printf(ctx, out, "\""); | |
| 918 } | |
| 919 | |
| 920 fz_write_printf(ctx, out, ">\n"); | |
| 921 | |
| 922 for (ch = line->first_char; ch; ch = ch->next) | |
| 923 { | |
| 924 if (ch->font != font || ch->size != size) | |
| 925 { | |
| 926 const char *s; | |
| 927 if (font) | |
| 928 fz_write_string(ctx, out, "</font>\n"); | |
| 929 font = ch->font; | |
| 930 size = ch->size; | |
| 931 s = name = font_full_name(ctx, font); | |
| 932 while (*s) | |
| 933 { | |
| 934 int c = *s++; | |
| 935 if (c < 32 || c >= 127) | |
| 936 break; | |
| 937 } | |
| 938 if (*s) | |
| 939 fz_write_printf(ctx, out, "<font hexname=%>", name); | |
| 940 else | |
| 941 fz_write_printf(ctx, out, "<font name=\"%s\"", name); | |
| 942 fz_write_printf(ctx, out, " size=\"%g\">\n", size); | |
| 943 } | |
| 944 fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" alpha=\"#%02x\" flags=\"%d\" c=\"", | |
| 945 ch->quad.ul.x, ch->quad.ul.y, | |
| 946 ch->quad.ur.x, ch->quad.ur.y, | |
| 947 ch->quad.ll.x, ch->quad.ll.y, | |
| 948 ch->quad.lr.x, ch->quad.lr.y, | |
| 949 ch->origin.x, ch->origin.y, | |
| 950 ch->bidi, | |
| 951 ch->argb & 0xFFFFFF, | |
| 952 ch->argb>>24, | |
| 953 ch->flags); | |
| 954 xml_write_char(ctx, out, ch->c); | |
| 955 if (!fz_is_valid_xml_char(ch->c)) | |
| 956 { | |
| 957 char text[8]; | |
| 958 int n = fz_runetochar(text, ch->c); | |
| 959 fz_write_string(ctx, out, "\" hexc=\""); | |
| 960 for (i = 0; i < n; i++) | |
| 961 fz_write_printf(ctx, out, "%02x", text[i]); | |
| 962 } | |
| 963 fz_write_string(ctx, out, "\"/>\n"); | |
| 964 } | |
| 965 | |
| 966 if (font) | |
| 967 fz_write_string(ctx, out, "</font>\n"); | |
| 968 | |
| 969 fz_write_string(ctx, out, "</line>\n"); | |
| 970 } | |
| 971 fz_write_string(ctx, out, "</block>\n"); | |
| 972 break; | |
| 973 | |
| 974 case FZ_STEXT_BLOCK_IMAGE: | |
| 975 fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n", | |
| 976 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); | |
| 977 break; | |
| 978 | |
| 979 case FZ_STEXT_BLOCK_STRUCT: | |
| 980 fz_write_printf(ctx, out, "<struct idx=\"%d\" bbox=\"%g %g %g %g\"", block->u.s.index, | |
| 981 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); | |
| 982 if (block->u.s.down) | |
| 983 fz_write_printf(ctx, out, " raw=\"%s\" std=\"%s\"", | |
| 984 block->u.s.down->raw, fz_structure_to_string(block->u.s.down->standard)); | |
| 985 fz_write_printf(ctx, out, ">\n"); | |
| 986 if (block->u.s.down) | |
| 987 as_xml(ctx, block->u.s.down->first_block, out); | |
| 988 fz_write_printf(ctx, out, "</struct>\n"); | |
| 989 break; | |
| 990 | |
| 991 case FZ_STEXT_BLOCK_VECTOR: | |
| 992 fz_write_printf(ctx, out, "<vector bbox=\"%g %g %g %g\" stroke=\"%d\" rectangle=\"%d\" continues=\"%d\" argb=\"%08x\"/>\n", | |
| 993 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1, | |
| 994 !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED), | |
| 995 !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE), | |
| 996 !!(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES), | |
| 997 block->u.v.argb); | |
| 998 break; | |
| 999 | |
| 1000 case FZ_STEXT_BLOCK_GRID: | |
| 1001 fz_write_printf(ctx, out, "<grid xpos=\""); | |
| 1002 for (i = 0; i < block->u.b.xs->len; i++) | |
| 1003 fz_write_printf(ctx, out, "%g ", block->u.b.xs->list[i].pos); | |
| 1004 fz_write_printf(ctx, out, "\" xuncertainty=\""); | |
| 1005 for (i = 0; i < block->u.b.xs->len; i++) | |
| 1006 fz_write_printf(ctx, out, "%d ", block->u.b.xs->list[i].uncertainty); | |
| 1007 fz_write_printf(ctx, out, "\" xmaxuncertainty=\"%d\" ypos=\"", block->u.b.xs->max_uncertainty); | |
| 1008 for (i = 0; i < block->u.b.ys->len; i++) | |
| 1009 fz_write_printf(ctx, out, "%g ", block->u.b.ys->list[i].pos); | |
| 1010 fz_write_printf(ctx, out, "\" yuncertainty=\""); | |
| 1011 for (i = 0; i < block->u.b.ys->len; i++) | |
| 1012 fz_write_printf(ctx, out, "%d ", block->u.b.ys->list[i].uncertainty); | |
| 1013 fz_write_printf(ctx, out, "\" ymaxuncertainty=\"%d\" />\n", block->u.b.ys->max_uncertainty); | |
| 1014 break; | |
| 1015 } | |
| 1016 block = block->next; | |
| 1017 } | |
| 1018 } | |
| 1019 | |
| 1020 void | |
| 1021 fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id) | |
| 1022 { | |
| 1023 fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id, | |
| 1024 page->mediabox.x1 - page->mediabox.x0, | |
| 1025 page->mediabox.y1 - page->mediabox.y0); | |
| 1026 | |
| 1027 as_xml(ctx, page->first_block, out); | |
| 1028 | |
| 1029 fz_write_string(ctx, out, "</page>\n"); | |
| 1030 } | |
| 1031 | |
| 1032 /* JSON dump */ | |
| 1033 | |
| 1034 static void | |
| 1035 as_json(fz_context *ctx, fz_stext_block *block, fz_output *out, float scale) | |
| 1036 { | |
| 1037 fz_stext_line *line; | |
| 1038 fz_stext_char *ch; | |
| 1039 int comma = 0; | |
| 1040 | |
| 1041 while (block) | |
| 1042 { | |
| 1043 if (comma) | |
| 1044 fz_write_string(ctx, out, ","); | |
| 1045 comma = 1; | |
| 1046 | |
| 1047 switch (block->type) | |
| 1048 { | |
| 1049 case FZ_STEXT_BLOCK_TEXT: | |
| 1050 fz_write_printf(ctx, out, "{%q:%q,", "type", "text"); | |
| 1051 fz_write_printf(ctx, out, "%q:{", "bbox"); | |
| 1052 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale)); | |
| 1053 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale)); | |
| 1054 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale)); | |
| 1055 fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale)); | |
| 1056 fz_write_printf(ctx, out, "%q:[", "lines"); | |
| 1057 | |
| 1058 for (line = block->u.t.first_line; line; line = line->next) | |
| 1059 { | |
| 1060 if (line != block->u.t.first_line) | |
| 1061 fz_write_string(ctx, out, ","); | |
| 1062 fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode); | |
| 1063 fz_write_printf(ctx, out, "%q:{", "bbox"); | |
| 1064 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale)); | |
| 1065 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale)); | |
| 1066 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale)); | |
| 1067 fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale)); | |
| 1068 | |
| 1069 /* Since we force preserve-spans, the first char has the style for the entire line. */ | |
| 1070 if (line->first_char) | |
| 1071 { | |
| 1072 fz_font *font = line->first_char->font; | |
| 1073 char *font_family = "sans-serif"; | |
| 1074 char *font_weight = "normal"; | |
| 1075 char *font_style = "normal"; | |
| 1076 if (fz_font_is_monospaced(ctx, font)) font_family = "monospace"; | |
| 1077 else if (fz_font_is_serif(ctx, font)) font_family = "serif"; | |
| 1078 if (fz_font_is_bold(ctx, font)) font_weight = "bold"; | |
| 1079 if (fz_font_is_italic(ctx, font)) font_style = "italic"; | |
| 1080 fz_write_printf(ctx, out, "%q:{", "font"); | |
| 1081 fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font)); | |
| 1082 fz_write_printf(ctx, out, "%q:%q,", "family", font_family); | |
| 1083 fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight); | |
| 1084 fz_write_printf(ctx, out, "%q:%q,", "style", font_style); | |
| 1085 fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale)); | |
| 1086 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale)); | |
| 1087 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale)); | |
| 1088 } | |
| 1089 | |
| 1090 fz_write_printf(ctx, out, "%q:\"", "text"); | |
| 1091 for (ch = line->first_char; ch; ch = ch->next) | |
| 1092 { | |
| 1093 if (ch->c == '"' || ch->c == '\\') | |
| 1094 fz_write_printf(ctx, out, "\\%c", ch->c); | |
| 1095 else if (ch->c < 32) | |
| 1096 fz_write_printf(ctx, out, "\\u%04x", ch->c); | |
| 1097 else | |
| 1098 fz_write_printf(ctx, out, "%C", ch->c); | |
| 1099 } | |
| 1100 fz_write_printf(ctx, out, "\"}"); | |
| 1101 } | |
| 1102 fz_write_string(ctx, out, "]}"); | |
| 1103 break; | |
| 1104 | |
| 1105 case FZ_STEXT_BLOCK_IMAGE: | |
| 1106 fz_write_printf(ctx, out, "{%q:%q,", "type", "image"); | |
| 1107 fz_write_printf(ctx, out, "%q:{", "bbox"); | |
| 1108 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale)); | |
| 1109 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale)); | |
| 1110 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale)); | |
| 1111 fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale)); | |
| 1112 break; | |
| 1113 | |
| 1114 case FZ_STEXT_BLOCK_STRUCT: | |
| 1115 fz_write_printf(ctx, out, "{%q:%q,", "type", "structure"); | |
| 1116 fz_write_printf(ctx, out, "%q:%d", "index", block->u.s.index); | |
| 1117 if (block->u.s.down) | |
| 1118 { | |
| 1119 fz_write_printf(ctx, out, ",%q:%q", "raw", block->u.s.down->raw); | |
| 1120 fz_write_printf(ctx, out, ",%q:%q", "std", fz_structure_to_string(block->u.s.down->standard)); | |
| 1121 fz_write_printf(ctx, out, ",%q:[", "contents"); | |
| 1122 as_json(ctx, block->u.s.down->first_block, out, scale); | |
| 1123 fz_write_printf(ctx, out, "]"); | |
| 1124 } | |
| 1125 fz_write_printf(ctx, out, "}"); | |
| 1126 break; | |
| 1127 | |
| 1128 } | |
| 1129 block = block->next; | |
| 1130 } | |
| 1131 } | |
| 1132 | |
| 1133 void | |
| 1134 fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale) | |
| 1135 { | |
| 1136 fz_write_printf(ctx, out, "{%q:[", "blocks"); | |
| 1137 | |
| 1138 as_json(ctx, page->first_block, out, scale); | |
| 1139 | |
| 1140 fz_write_string(ctx, out, "]}"); | |
| 1141 } | |
| 1142 | |
| 1143 /* Plain text */ | |
| 1144 | |
| 1145 static void | |
| 1146 do_as_text(fz_context *ctx, fz_output *out, fz_stext_block *first_block) | |
| 1147 { | |
| 1148 fz_stext_block *block; | |
| 1149 fz_stext_line *line; | |
| 1150 fz_stext_char *ch; | |
| 1151 char utf[10]; | |
| 1152 int i, n; | |
| 1153 | |
| 1154 for (block = first_block; block; block = block->next) | |
| 1155 { | |
| 1156 switch (block->type) | |
| 1157 { | |
| 1158 case FZ_STEXT_BLOCK_TEXT: | |
| 1159 for (line = block->u.t.first_line; line; line = line->next) | |
| 1160 { | |
| 1161 for (ch = line->first_char; ch; ch = ch->next) | |
| 1162 { | |
| 1163 n = fz_runetochar(utf, ch->c); | |
| 1164 for (i = 0; i < n; i++) | |
| 1165 fz_write_byte(ctx, out, utf[i]); | |
| 1166 } | |
| 1167 fz_write_string(ctx, out, "\n"); | |
| 1168 } | |
| 1169 fz_write_string(ctx, out, "\n"); | |
| 1170 break; | |
| 1171 case FZ_STEXT_BLOCK_STRUCT: | |
| 1172 if (block->u.s.down != NULL) | |
| 1173 do_as_text(ctx, out, block->u.s.down->first_block); | |
| 1174 break; | |
| 1175 } | |
| 1176 } | |
| 1177 } | |
| 1178 | |
| 1179 void | |
| 1180 fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) | |
| 1181 { | |
| 1182 do_as_text(ctx, out, page->first_block); | |
| 1183 } | |
| 1184 | |
| 1185 /* Text output writer */ | |
| 1186 | |
/* Output formats the text document writer can produce. */
enum {
	FZ_FORMAT_TEXT = 0,       /* plain text */
	FZ_FORMAT_HTML = 1,       /* HTML */
	FZ_FORMAT_XHTML = 2,      /* XHTML */
	FZ_FORMAT_STEXT_XML = 3,  /* structured text as XML */
	FZ_FORMAT_STEXT_JSON = 4, /* structured text as JSON */
};
| 1194 | |
| 1195 typedef struct | |
| 1196 { | |
| 1197 fz_document_writer super; | |
| 1198 int format; | |
| 1199 int number; | |
| 1200 fz_stext_options opts; | |
| 1201 fz_stext_page *page; | |
| 1202 fz_output *out; | |
| 1203 } fz_text_writer; | |
| 1204 | |
| 1205 static fz_device * | |
| 1206 text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) | |
| 1207 { | |
| 1208 fz_text_writer *wri = (fz_text_writer*)wri_; | |
| 1209 float s = wri->opts.scale; | |
| 1210 | |
| 1211 if (wri->page) | |
| 1212 { | |
| 1213 fz_drop_stext_page(ctx, wri->page); | |
| 1214 wri->page = NULL; | |
| 1215 } | |
| 1216 | |
| 1217 wri->number++; | |
| 1218 | |
| 1219 wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s))); | |
| 1220 return fz_new_stext_device(ctx, wri->page, &wri->opts); | |
| 1221 } | |
| 1222 | |
| 1223 static void | |
| 1224 text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) | |
| 1225 { | |
| 1226 fz_text_writer *wri = (fz_text_writer*)wri_; | |
| 1227 float s = wri->opts.scale; | |
| 1228 | |
| 1229 fz_scale_stext_page(ctx, wri->page, s); | |
| 1230 | |
| 1231 fz_try(ctx) | |
| 1232 { | |
| 1233 fz_close_device(ctx, dev); | |
| 1234 switch (wri->format) | |
| 1235 { | |
| 1236 default: | |
| 1237 case FZ_FORMAT_TEXT: | |
| 1238 fz_print_stext_page_as_text(ctx, wri->out, wri->page); | |
| 1239 break; | |
| 1240 case FZ_FORMAT_HTML: | |
| 1241 fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number); | |
| 1242 break; | |
| 1243 case FZ_FORMAT_XHTML: | |
| 1244 fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number); | |
| 1245 break; | |
| 1246 case FZ_FORMAT_STEXT_XML: | |
| 1247 fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number); | |
| 1248 break; | |
| 1249 case FZ_FORMAT_STEXT_JSON: | |
| 1250 if (wri->number > 1) | |
| 1251 fz_write_string(ctx, wri->out, ","); | |
| 1252 fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1); | |
| 1253 break; | |
| 1254 } | |
| 1255 } | |
| 1256 fz_always(ctx) | |
| 1257 { | |
| 1258 fz_drop_device(ctx, dev); | |
| 1259 fz_drop_stext_page(ctx, wri->page); | |
| 1260 wri->page = NULL; | |
| 1261 } | |
| 1262 fz_catch(ctx) | |
| 1263 fz_rethrow(ctx); | |
| 1264 } | |
| 1265 | |
| 1266 static void | |
| 1267 text_close_writer(fz_context *ctx, fz_document_writer *wri_) | |
| 1268 { | |
| 1269 fz_text_writer *wri = (fz_text_writer*)wri_; | |
| 1270 switch (wri->format) | |
| 1271 { | |
| 1272 case FZ_FORMAT_HTML: | |
| 1273 fz_print_stext_trailer_as_html(ctx, wri->out); | |
| 1274 break; | |
| 1275 case FZ_FORMAT_XHTML: | |
| 1276 fz_print_stext_trailer_as_xhtml(ctx, wri->out); | |
| 1277 break; | |
| 1278 case FZ_FORMAT_STEXT_XML: | |
| 1279 fz_write_string(ctx, wri->out, "</document>\n"); | |
| 1280 break; | |
| 1281 case FZ_FORMAT_STEXT_JSON: | |
| 1282 fz_write_string(ctx, wri->out, "]\n"); | |
| 1283 break; | |
| 1284 } | |
| 1285 fz_close_output(ctx, wri->out); | |
| 1286 } | |
| 1287 | |
| 1288 static void | |
| 1289 text_drop_writer(fz_context *ctx, fz_document_writer *wri_) | |
| 1290 { | |
| 1291 fz_text_writer *wri = (fz_text_writer*)wri_; | |
| 1292 fz_drop_stext_page(ctx, wri->page); | |
| 1293 fz_drop_output(ctx, wri->out); | |
| 1294 } | |
| 1295 | |
| 1296 fz_document_writer * | |
| 1297 fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options) | |
| 1298 { | |
| 1299 fz_text_writer *wri = NULL; | |
| 1300 | |
| 1301 fz_var(wri); | |
| 1302 | |
| 1303 fz_try(ctx) | |
| 1304 { | |
| 1305 wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer); | |
| 1306 fz_parse_stext_options(ctx, &wri->opts, options); | |
| 1307 | |
| 1308 wri->format = FZ_FORMAT_TEXT; | |
| 1309 if (!strcmp(format, "text")) | |
| 1310 wri->format = FZ_FORMAT_TEXT; | |
| 1311 else if (!strcmp(format, "html")) | |
| 1312 wri->format = FZ_FORMAT_HTML; | |
| 1313 else if (!strcmp(format, "xhtml")) | |
| 1314 wri->format = FZ_FORMAT_XHTML; | |
| 1315 else if (!strcmp(format, "stext")) | |
| 1316 wri->format = FZ_FORMAT_STEXT_XML; | |
| 1317 else if (!strcmp(format, "stext.xml")) | |
| 1318 wri->format = FZ_FORMAT_STEXT_XML; | |
| 1319 else if (!strcmp(format, "stext.json")) | |
| 1320 { | |
| 1321 wri->format = FZ_FORMAT_STEXT_JSON; | |
| 1322 wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS; | |
| 1323 } | |
| 1324 | |
| 1325 wri->out = out; | |
| 1326 | |
| 1327 switch (wri->format) | |
| 1328 { | |
| 1329 case FZ_FORMAT_HTML: | |
| 1330 fz_print_stext_header_as_html(ctx, wri->out); | |
| 1331 break; | |
| 1332 case FZ_FORMAT_XHTML: | |
| 1333 fz_print_stext_header_as_xhtml(ctx, wri->out); | |
| 1334 break; | |
| 1335 case FZ_FORMAT_STEXT_XML: | |
| 1336 fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n"); | |
| 1337 fz_write_string(ctx, wri->out, "<document>\n"); | |
| 1338 break; | |
| 1339 case FZ_FORMAT_STEXT_JSON: | |
| 1340 fz_write_string(ctx, wri->out, "["); | |
| 1341 break; | |
| 1342 } | |
| 1343 } | |
| 1344 fz_catch(ctx) | |
| 1345 { | |
| 1346 fz_drop_output(ctx, out); | |
| 1347 fz_free(ctx, wri); | |
| 1348 fz_rethrow(ctx); | |
| 1349 } | |
| 1350 | |
| 1351 return (fz_document_writer*)wri; | |
| 1352 } | |
| 1353 | |
| 1354 fz_document_writer * | |
| 1355 fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options) | |
| 1356 { | |
| 1357 fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); | |
| 1358 return fz_new_text_writer_with_output(ctx, format, out, options); | |
| 1359 } |
