Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/join.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 #include "extract/extract.h" | |
| 2 #include "extract/alloc.h" | |
| 3 | |
| 4 #include "astring.h" | |
| 5 #include "document.h" | |
| 6 #include "mem.h" | |
| 7 #include "outf.h" | |
| 8 | |
| 9 #include <assert.h> | |
| 10 #include <float.h> | |
| 11 #include <math.h> | |
| 12 #include <stdio.h> | |
| 13 | |
| 14 | |
| 15 static char_t *span_char_first(span_t *span) | |
| 16 { | |
| 17 assert(span->chars_num > 0); | |
| 18 return &span->chars[0]; | |
| 19 } | |
| 20 | |
| 21 static char_t *span_char_last(span_t *span) | |
| 22 { | |
| 23 assert(span->chars_num > 0); | |
| 24 return &span->chars[span->chars_num-1]; | |
| 25 } | |
| 26 | |
| 27 const char *extract_matrix_string(const matrix_t *matrix) | |
| 28 { | |
| 29 static char ret[5][64]; | |
| 30 static int i = 0; | |
| 31 i = (i + 1) % 5; | |
| 32 snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f %f %f}", | |
| 33 matrix->a, | |
| 34 matrix->b, | |
| 35 matrix->c, | |
| 36 matrix->d, | |
| 37 matrix->e, | |
| 38 matrix->f); | |
| 39 | |
| 40 return ret[i]; | |
| 41 } | |
| 42 | |
| 43 const char *extract_matrix4_string(const matrix4_t *matrix) | |
| 44 { | |
| 45 static char ret[5][64]; | |
| 46 static int i = 0; | |
| 47 i = (i + 1) % 5; | |
| 48 snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f}", | |
| 49 matrix->a, | |
| 50 matrix->b, | |
| 51 matrix->c, | |
| 52 matrix->d); | |
| 53 return ret[i]; | |
| 54 } | |
| 55 | |
| 56 /* Returns first line in paragraph. */ | |
| 57 static line_t *paragraph_line_first(const paragraph_t *paragraph) | |
| 58 { | |
| 59 return content_first_line(¶graph->content); | |
| 60 } | |
| 61 | |
| 62 /* Returns last line in paragraph. */ | |
| 63 static line_t *paragraph_line_last(const paragraph_t *paragraph) | |
| 64 { | |
| 65 return content_last_line(¶graph->content); | |
| 66 } | |
| 67 | |
| 68 | |
| 69 | |
| 70 /* Things for direct conversion of text spans into lines and paragraphs. */ | |
| 71 | |
| 72 static int | |
| 73 matrices_are_compatible(const matrix4_t *ctm_a, const matrix4_t *ctm_b, int wmode) | |
| 74 { | |
| 75 double dot, pdot; | |
| 76 | |
| 77 /* Calculate the dot product between the direction vector transformed by each ctm. This should be large for | |
| 78 * compatible lines (cos they should be colinear). Also calculate the dot product between the direction | |
| 79 * vector transformed by the first ctm, and perp(direction vector) transformed by the second ctm. This | |
| 80 * should be zero. */ | |
| 81 if (wmode) | |
| 82 { | |
| 83 dot = ctm_a->c * ctm_b->c + ctm_a->d * ctm_b->d; | |
| 84 pdot = ctm_a->c * ctm_b->d - ctm_a->d * ctm_b->c; | |
| 85 } | |
| 86 else | |
| 87 { | |
| 88 dot = ctm_a->a * ctm_b->a + ctm_a->b * ctm_b->b; | |
| 89 pdot = ctm_a->a * ctm_b->b - ctm_a->b * ctm_b->a; | |
| 90 } | |
| 91 /* Negative dot means lines are in opposite sense. */ | |
| 92 if (dot <= 0) | |
| 93 return 0; | |
| 94 /* Remove the scaling from pdot to get back to a unit vector. */ | |
| 95 pdot /= dot; | |
| 96 | |
| 97 return (fabs(pdot) < 0.1); | |
| 98 } | |
| 99 | |
| 100 /* Returns 1 if lines have same wmode and have the same baseline vector, else 0. */ | |
| 101 static int | |
| 102 lines_are_compatible(line_t *a, | |
| 103 line_t *b) | |
| 104 { | |
| 105 span_t *first_span_a = content_first_span(&a->content); | |
| 106 span_t *first_span_b = content_first_span(&b->content); | |
| 107 | |
| 108 if (a == b) return 0; | |
| 109 if (!first_span_a || !first_span_b) return 0; | |
| 110 | |
| 111 /* We only join lines with the same wmode. */ | |
| 112 if (first_span_a->flags.wmode != first_span_b->flags.wmode) | |
| 113 return 0; | |
| 114 | |
| 115 return matrices_are_compatible(&first_span_a->ctm, &first_span_b->ctm, first_span_a->flags.wmode); | |
| 116 } | |
| 117 | |
| 118 | |
| 119 static const unsigned ucs_NONE = ((unsigned) -1); | |
| 120 | |
| 121 /* Returns with <o_span> containing char_t's from <span> that are inside | |
| 122 rect, and *span modified to remove any char_t's that we have moved to | |
| 123 <o_span>. | |
| 124 | |
| 125 May return with span->chars_num == 0, in which case the caller must remove the | |
| 126 span (including freeing .font_name), because lots of code assumes that there | |
| 127 are no empty spans. */ | |
| 128 static int | |
| 129 span_inside_rect( | |
| 130 extract_alloc_t *alloc, | |
| 131 span_t *span, | |
| 132 rect_t *rect, | |
| 133 span_t *o_span) | |
| 134 { | |
| 135 int c; | |
| 136 content_t save = *(content_t *)o_span; | |
| 137 | |
| 138 *o_span = *span; | |
| 139 *(content_t *)o_span = save; /* Avoid changing prev/next. */ | |
| 140 extract_strdup(alloc, span->font_name, &o_span->font_name); | |
| 141 o_span->chars = NULL; | |
| 142 o_span->chars_num = 0; | |
| 143 for (c=0; c<span->chars_num; ++c) | |
| 144 { | |
| 145 /* For now we just look at whether span's (x, y) is within any | |
| 146 rects[]. We could instead try to find character's bounding box etc. */ | |
| 147 char_t *char_ = &span->chars[c]; | |
| 148 if (char_->x >= rect->min.x && | |
| 149 char_->x < rect->max.x && | |
| 150 char_->y >= rect->min.y && | |
| 151 char_->y < rect->max.y) | |
| 152 { | |
| 153 char_t *c = extract_span_append_c(alloc, o_span, char_->ucs); | |
| 154 if (c == NULL) return -1; | |
| 155 *c = *char_; | |
| 156 char_->ucs = ucs_NONE; /* Mark for removal below, so it is not used again. */ | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 /* Remove any char_t's that we've used. */ | |
| 161 { | |
| 162 int cc = 0; | |
| 163 for (c=0; c<span->chars_num; ++c) | |
| 164 { | |
| 165 char_t* char_ = &span->chars[c]; | |
| 166 if (char_->ucs != ucs_NONE) | |
| 167 { | |
| 168 span->chars[cc] = span->chars[c]; | |
| 169 cc += 1; | |
| 170 } | |
| 171 } | |
| 172 /* This might set span->chars_num to zero; our caller needs to remove | |
| 173 the span - lots of code assumes that all spans contain at least one | |
| 174 character. */ | |
| 175 span->chars_num = cc; | |
| 176 } | |
| 177 | |
| 178 if (o_span->chars_num) | |
| 179 { | |
| 180 outf("o_span: %s", extract_span_string(alloc, o_span)); | |
| 181 } | |
| 182 return 0; | |
| 183 } | |
| 184 | |
| 185 /* | |
| 186 On entry: | |
| 187 <lines> is a list of span_t's. | |
| 188 | |
| 189 On exit: | |
| 190 <lines> is a list of line_t's, made up by having pulled as many of the span_t's | |
| 191 as are appropriate together. | |
| 192 */ | |
| 193 static int | |
| 194 make_lines( | |
| 195 extract_alloc_t *alloc, | |
| 196 content_root_t *lines, | |
| 197 double master_space_guess) | |
| 198 { | |
| 199 int ret = -1; | |
| 200 int a; | |
| 201 content_line_iterator lit; | |
| 202 line_t *line_a; | |
| 203 content_span_iterator sit; | |
| 204 span_t *span; | |
| 205 | |
| 206 /* On entry <lines> contains spans. Make each span part of a <line>. */ | |
| 207 for (a = 0, span = content_span_iterator_init(&sit, lines); span != NULL; span = content_span_iterator_next(&sit), a++) | |
| 208 { | |
| 209 line_t *line; | |
| 210 | |
| 211 if (content_replace_new_line(alloc, &span->base, &line)) goto end; | |
| 212 content_append_span(&line->content, span); | |
| 213 outfx("initial line a=%i: %s", a, line_string(line)); | |
| 214 } | |
| 215 | |
| 216 /* For each line, look for nearest aligned line, and append if found. */ | |
| 217 for (a=0, line_a = content_line_iterator_init(&lit, lines); line_a != NULL; a++, line_a = content_line_iterator_next(&lit)) | |
| 218 { | |
| 219 content_line_iterator lit2; | |
| 220 line_t *line_b; | |
| 221 int b; | |
| 222 int nearest_line_b = -1; | |
| 223 double nearest_score = 0; | |
| 224 line_t *nearest_line = NULL; | |
| 225 double nearest_colinear = 0; | |
| 226 double nearest_space_guess = 0; | |
| 227 span_t *span_a; | |
| 228 | |
| 229 span_a = extract_line_span_last(line_a); | |
| 230 | |
| 231 for (b = 0, line_b = content_line_iterator_init(&lit2, lines); line_b != NULL; b++, line_b = content_line_iterator_next(&lit2)) | |
| 232 { | |
| 233 if (line_a == line_b) | |
| 234 continue; | |
| 235 | |
| 236 if (!lines_are_compatible(line_a, line_b)) | |
| 237 continue; | |
| 238 | |
| 239 { | |
| 240 span_t *span_b = extract_line_span_first(line_b); | |
| 241 char_t *last_a = extract_span_char_last(span_a); | |
| 242 /* Predict the end of span_a. */ | |
| 243 point_t dir = { last_a->adv * (1 - span_a->flags.wmode), last_a->adv * span_a->flags.wmode }; | |
| 244 point_t tdir = extract_matrix4_transform_point(span_a->ctm, dir); | |
| 245 point_t span_a_end = { last_a->x + tdir.x, last_a->y + tdir.y }; | |
| 246 /* Find the difference between the end of span_a and the start of span_b. */ | |
| 247 char_t *first_b = span_char_first(span_b); | |
| 248 point_t diff = { first_b->x - span_a_end.x, first_b->y - span_a_end.y }; | |
| 249 double scale_squared = ((span_a->flags.wmode) ? | |
| 250 (span_a->ctm.c * span_a->ctm.c + span_a->ctm.d * span_a->ctm.d) : | |
| 251 (span_a->ctm.a * span_a->ctm.a + span_a->ctm.b * span_a->ctm.b)); | |
| 252 /* Now find the differences in position, both colinear and perpendicular. */ | |
| 253 double colinear = (diff.x * tdir.x + diff.y * tdir.y) / last_a->adv / scale_squared; | |
| 254 double perp = (diff.x * tdir.y - diff.y * tdir.x) / last_a->adv / scale_squared; | |
| 255 /* colinear and perp are now both pre-transform space distances, to match adv etc. */ | |
| 256 double score; | |
| 257 double space_guess = (last_a->adv + first_b->adv)/2 * master_space_guess; | |
| 258 | |
| 259 /* Heuristic: perpendicular distance larger than half of adv rules it out as a match. */ | |
| 260 /* Ideally we should be using font bbox here, but we don't have that, currently. */ | |
| 261 /* NOTE: We should match the logic in extract_add_char here! */ | |
| 262 if (fabs(perp) > 3*space_guess/2 || fabs(colinear) > space_guess * 8) | |
| 263 continue; | |
| 264 | |
| 265 /* We now form a score for this match. */ | |
| 266 score = fabs(colinear); | |
| 267 if (score < fabs(perp) * 10) /* perpendicular distance matters much more. */ | |
| 268 score = fabs(perp) * 10; | |
| 269 | |
| 270 if (!nearest_line || score < nearest_score) | |
| 271 { | |
| 272 nearest_line = line_b; | |
| 273 nearest_score = score; | |
| 274 nearest_line_b = b; | |
| 275 nearest_colinear = colinear; | |
| 276 nearest_space_guess = space_guess; | |
| 277 } | |
| 278 } | |
| 279 } | |
| 280 | |
| 281 if (nearest_line) | |
| 282 { | |
| 283 /* line_a and nearest_line are aligned so we can move line_b's | |
| 284 spans on to the end of line_a. */ | |
| 285 span_t *span_b = extract_line_span_first(nearest_line); | |
| 286 b = nearest_line_b; | |
| 287 | |
| 288 if (extract_span_char_last(span_a)->ucs != ' ' && | |
| 289 span_char_first(span_b)->ucs != ' ') | |
| 290 { | |
| 291 /* Again, match the logic in extract_add_char here. */ | |
| 292 int insert_space = (nearest_colinear > 2*nearest_space_guess/3); | |
| 293 if (insert_space) | |
| 294 { | |
| 295 /* Append space to span_a before concatenation. */ | |
| 296 char_t *item = extract_span_append_c(alloc, span_a, ' '); | |
| 297 if (item == NULL) goto end; | |
| 298 item->adv = 0; /* FIXME */ | |
| 299 /* This is a hack to give our extra space a vaguely useful | |
| 300 (x,y) coordinate - this can be used later on when ordering | |
| 301 paragraphs. We could try to be more accurate by adding | |
| 302 item[-1]'s .adv suitably transformed by .wmode, .ctm and | |
| 303 .trm. */ | |
| 304 item->x = item[-1].x; | |
| 305 item->y = item[-1].y; | |
| 306 } | |
| 307 } | |
| 308 | |
| 309 /* We might end up with two adjacent spaces here. But removing a | |
| 310 space could result in an empty line_t, which could break various | |
| 311 assumptions elsewhere. */ | |
| 312 | |
| 313 /* Move all the content from nearest_line to line_a. */ | |
| 314 content_concat(&line_a->content, &nearest_line->content); | |
| 315 | |
| 316 /* Ensure that we ignore nearest_line from now on. */ | |
| 317 if (lit.next == &nearest_line->base) | |
| 318 lit.next = lit.next->next; | |
| 319 extract_line_free(alloc, &nearest_line); | |
| 320 | |
| 321 if (b > a) { | |
| 322 /* We haven't yet tried appending any spans to nearest_line, so | |
| 323 the new extended line_a needs checking again. */ | |
| 324 lit.next = &line_a->base; | |
| 325 a--; | |
| 326 } else { | |
| 327 a--; /* b is before a, so a's number needs to move back one. */ | |
| 328 } | |
| 329 } | |
| 330 } | |
| 331 | |
| 332 ret = 0; | |
| 333 | |
| 334 end: | |
| 335 if (ret) { | |
| 336 /* Free everything. */ | |
| 337 extract_span_free(alloc, &span); | |
| 338 content_clear(alloc, lines); | |
| 339 } | |
| 340 return ret; | |
| 341 } | |
| 342 | |
| 343 | |
| 344 /* A comparison function, for sorting paragraphs within a | |
| 345 page. */ | |
| 346 static int paragraphs_cmp(const content_t *a, const content_t *b) | |
| 347 { | |
| 348 const paragraph_t *a_paragraph = (const paragraph_t *)a; | |
| 349 const paragraph_t *b_paragraph = (const paragraph_t *)b; | |
| 350 line_t *a_line, *b_line; | |
| 351 span_t *a_span, *b_span; | |
| 352 | |
| 353 if (a->type != content_paragraph || b->type != content_paragraph) | |
| 354 return 0; | |
| 355 | |
| 356 a_line = paragraph_line_first(a_paragraph); | |
| 357 b_line = paragraph_line_first(b_paragraph); | |
| 358 a_span = extract_line_span_first(a_line); | |
| 359 b_span = extract_line_span_first(b_line); | |
| 360 | |
| 361 /* We can't directly compare stuff with different wmodes. */ | |
| 362 if (a_span->flags.wmode != b_span->flags.wmode) | |
| 363 { | |
| 364 return a_span->flags.wmode - b_span->flags.wmode; | |
| 365 } | |
| 366 | |
| 367 /* If matrices are compatible (i.e. they share the same baseline vector), don't consider that as | |
| 368 * part of the sort. | |
| 369 * | |
| 370 * Actually, is this safe? We don't necessarily have transitivity here due to the epsilon in the | |
| 371 * comparison. A might be compatible with B, and B with C, but A might not be with C. | |
| 372 * Worry about that if we get an example. | |
| 373 */ | |
| 374 if (!matrices_are_compatible(&a_span->ctm, &b_span->ctm, a_span->flags.wmode)) | |
| 375 { | |
| 376 /* If ctm matrices differ, always return this diff first. Note that we | |
| 377 ignore .e and .f because if data is from ghostscript then .e and .f | |
| 378 vary for each span, and we don't care about these differences. */ | |
| 379 return extract_matrix4_cmp(&a_span->ctm, &b_span->ctm); | |
| 380 } | |
| 381 | |
| 382 /* So, we know the matrices are compatible - i.e. the baselines are parallel. | |
| 383 * Just sort on how far down the page we are going. */ | |
| 384 { | |
| 385 span_t *span_a = content_first_span(&a_line->content); | |
| 386 span_t *span_b = content_first_span(&b_line->content); | |
| 387 point_t dir = { 1 - span_a->flags.wmode, span_a->flags.wmode }; | |
| 388 point_t tdir = extract_matrix4_transform_point(span_a->ctm, dir); | |
| 389 point_t diff = { span_a->chars[0].x - span_b->chars[0].x, span_a->chars[0].y - span_b->chars[0].y }; | |
| 390 double perp = (diff.x * tdir.y - diff.y * tdir.x); | |
| 391 | |
| 392 #if 0 | |
| 393 printf("Comparing:\n"); | |
| 394 content_dump_brief(&a_line->content); | |
| 395 printf("And:\n"); | |
| 396 content_dump_brief(&b_line->content); | |
| 397 printf("perp=%g\n", perp); | |
| 398 #endif | |
| 399 | |
| 400 if (perp < 0) | |
| 401 return 1; | |
| 402 if (perp > 0) | |
| 403 return -1; | |
| 404 } | |
| 405 return 0; | |
| 406 } | |
| 407 | |
| 408 static double | |
| 409 font_size_from_ctm(const matrix4_t *ctm) | |
| 410 { | |
| 411 if (ctm->b == 0) | |
| 412 return fabs(ctm->a); | |
| 413 if (ctm->a == 0) | |
| 414 return fabs(ctm->b); | |
| 415 | |
| 416 return sqrt(ctm->a * ctm->a + ctm->b * ctm->b); | |
| 417 } | |
| 418 | |
| 419 static void | |
| 420 calculate_line_height(line_t *line) | |
| 421 { | |
| 422 content_span_iterator sit; | |
| 423 span_t *span; | |
| 424 double asc = 0, desc = 0; | |
| 425 | |
| 426 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 427 { | |
| 428 double span_font_size = font_size_from_ctm(&span->ctm); | |
| 429 double min_y = span->font_bbox.min.y * span_font_size; | |
| 430 double max_y = span->font_bbox.max.y * span_font_size; | |
| 431 if (min_y < desc) | |
| 432 desc = min_y; | |
| 433 if (max_y > asc) | |
| 434 asc = max_y; | |
| 435 } | |
| 436 line->ascender = asc; | |
| 437 line->descender = desc; | |
| 438 } | |
| 439 | |
| 440 /* | |
| 441 On entry: | |
| 442 <content> is a list of lines. | |
| 443 | |
| 444 On exit: | |
| 445 <content> is a list of paragraphs, formed from pulling appropriate lines | |
| 446 together. | |
| 447 */ | |
| 448 static int | |
| 449 make_paragraphs( | |
| 450 extract_alloc_t *alloc, | |
| 451 content_root_t *content) | |
| 452 { | |
| 453 int ret = -1; | |
| 454 int a; | |
| 455 content_line_iterator lit; | |
| 456 line_t *line; | |
| 457 content_paragraph_iterator pit; | |
| 458 paragraph_t *paragraph_a; | |
| 459 | |
| 460 /* Convert every line_t to be a paragraph_t containing that line_t. */ | |
| 461 for (line = content_line_iterator_init(&lit, content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 462 { | |
| 463 paragraph_t *paragraph; | |
| 464 if (content_replace_new_paragraph(alloc, &line->base, ¶graph)) | |
| 465 goto end; | |
| 466 content_append_line(¶graph->content, line); | |
| 467 calculate_line_height(line); | |
| 468 } | |
| 469 | |
| 470 /* Now join paragraphs together where possible. */ | |
| 471 for (a=0, paragraph_a = content_paragraph_iterator_init(&pit, content); paragraph_a != NULL; a++, paragraph_a = content_paragraph_iterator_next(&pit)) { | |
| 472 paragraph_t *nearest_paragraph = NULL; | |
| 473 int nearest_paragraph_b = -1; | |
| 474 double nearest_score = 0; | |
| 475 line_t *line_a; | |
| 476 paragraph_t *paragraph_b; | |
| 477 content_paragraph_iterator pit2; | |
| 478 int b; | |
| 479 span_t *span_a; | |
| 480 | |
| 481 line_a = paragraph_line_last(paragraph_a); | |
| 482 assert(line_a != NULL); | |
| 483 span_a = extract_line_span_last(line_a); | |
| 484 assert(span_a != NULL); | |
| 485 | |
| 486 /* Look for nearest paragraph_t that could be appended to | |
| 487 paragraph_a. */ | |
| 488 for (b=0, paragraph_b = content_paragraph_iterator_init(&pit2, content); paragraph_b != NULL; b++, paragraph_b = content_paragraph_iterator_next(&pit2)) | |
| 489 { | |
| 490 line_t *line_b; | |
| 491 | |
| 492 if (paragraph_a == paragraph_b) | |
| 493 continue; | |
| 494 line_b = paragraph_line_first(paragraph_b); | |
| 495 if (!lines_are_compatible(line_a, line_b)) { | |
| 496 continue; | |
| 497 } | |
| 498 | |
| 499 { | |
| 500 span_t *line_a_first_span = extract_line_span_first(line_a); | |
| 501 span_t *line_a_last_span = extract_line_span_last(line_a); | |
| 502 span_t *line_b_first_span = extract_line_span_first(line_b); | |
| 503 span_t *line_b_last_span = extract_line_span_last(line_b); | |
| 504 char_t *first_a = span_char_first(line_a_first_span); | |
| 505 char_t *first_b = span_char_first(line_b_first_span); | |
| 506 char_t *last_a = span_char_last(line_a_last_span); | |
| 507 char_t *last_b = span_char_last(line_b_last_span); | |
| 508 point_t dir = { 1 - span_a->flags.wmode, span_a->flags.wmode }; | |
| 509 point_t tdir_a = extract_matrix4_transform_point(line_a_last_span->ctm, dir); | |
| 510 point_t tdir_b = extract_matrix4_transform_point(line_b_last_span->ctm, dir); | |
| 511 /* Find the difference between the start of span_a and the start of span_b. */ | |
| 512 point_t start_diff = { first_b->x - first_a->x, first_b->y - first_a->y }; | |
| 513 point_t end_a = { last_a->x + last_a->adv * tdir_a.x, last_a->y + last_a->adv * tdir_a.y }; | |
| 514 point_t end_b = { last_b->x + last_b->adv * tdir_b.x, last_b->y + last_b->adv * tdir_b.y }; | |
| 515 /* Now find the perpendicular difference in position. */ | |
| 516 double scale_squared = ((span_a->flags.wmode) ? | |
| 517 (span_a->ctm.c * span_a->ctm.c + span_a->ctm.d * span_a->ctm.d) : | |
| 518 (span_a->ctm.a * span_a->ctm.a + span_a->ctm.b * span_a->ctm.b)); | |
| 519 double perp = (start_diff.x * tdir_a.y - start_diff.y * tdir_a.x) / sqrt(scale_squared); | |
| 520 /* perp is now a post-transform space distance. */ | |
| 521 double score; | |
| 522 /* Now consider the linear difference between: 1) start of a to end of a, 2) start of a to start of b, | |
| 523 * 3) start of a and the end of b. */ | |
| 524 point_t saea = { end_a.x - first_a->x, end_a.y - first_a->y }; | |
| 525 point_t sasb = { first_b->x - first_a->x, first_b->y - first_a->y }; | |
| 526 point_t saeb = { end_b.x - first_a->x, end_b.y - first_a->y }; | |
| 527 double dot_saea = ( saea.x * tdir_a.x + saea.y * tdir_a.y ); | |
| 528 double dot_sasb = ( sasb.x * tdir_a.x + sasb.y * tdir_a.y ); | |
| 529 double dot_saeb = ( saeb.x * tdir_a.x + saeb.y * tdir_a.y ); | |
| 530 | |
| 531 /* We are only interested in scoring down the page. */ | |
| 532 score = -perp; | |
| 533 | |
| 534 #if 0 | |
| 535 printf("Comparing:\n"); | |
| 536 content_dump_brief(¶graph_a->content); | |
| 537 printf("And:\n"); | |
| 538 content_dump_brief(¶graph_b->content); | |
| 539 printf("score=%g\n", score); | |
| 540 printf("saea=%g sasb=%g saeb=%g\n", dot_saea, dot_sasb, dot_saeb); | |
| 541 #endif | |
| 542 | |
| 543 /* Check for "horizontal" alignment of the two lines. If line_b starts | |
| 544 * entirely to the right of the end of line_a, we can't join with it. */ | |
| 545 if (dot_sasb > dot_saea) | |
| 546 continue; | |
| 547 /* If line_b ends entirely to the left of the start of line_a, we can't | |
| 548 * join with it. */ | |
| 549 if (dot_saeb < 0) | |
| 550 continue; | |
| 551 | |
| 552 if (score >= 0 && (!nearest_paragraph || score < nearest_score)) | |
| 553 { | |
| 554 nearest_paragraph = paragraph_b; | |
| 555 nearest_score = score; | |
| 556 nearest_paragraph_b = b; | |
| 557 } | |
| 558 } | |
| 559 } | |
| 560 | |
| 561 if (nearest_paragraph) { | |
| 562 double line_a_height = line_a->ascender - line_a->descender; | |
| 563 line_t *line_b = paragraph_line_first(nearest_paragraph); | |
| 564 double line_b_height = line_b->ascender - line_b->descender; | |
| 565 double expected_height = (line_a_height + line_b_height)/2; | |
| 566 | |
| 567 #if 0 | |
| 568 printf("Best score = %g, expected_height=%g\n", nearest_score, expected_height); | |
| 569 #endif | |
| 570 | |
| 571 if (nearest_score > 0 && nearest_score < 2 * expected_height) { | |
| 572 /* Paragraphs are close together vertically compared to maximum | |
| 573 font size of first line in second paragraph, so we'll join them | |
| 574 into a single paragraph. */ | |
| 575 span_t *a_span = extract_line_span_last(line_a); | |
| 576 | |
| 577 if (extract_span_char_last(a_span)->ucs == '-' || | |
| 578 extract_span_char_last(a_span)->ucs == 0x2212 /* unicode dash */) | |
| 579 { | |
| 580 /* remove trailing '-' at end of prev line. char_t doesn't | |
| 581 contain any malloc-heap pointers so this doesn't leak. */ | |
| 582 a_span->chars_num -= 1; | |
| 583 if (a_span->chars_num == 0) | |
| 584 { | |
| 585 /* The span is now empty, unlink and free it. */ | |
| 586 extract_span_free(alloc, &a_span); | |
| 587 | |
| 588 /* If this leaves the line empty, remove the line. */ | |
| 589 if (line_a->content.base.next == &line_a->content.base) | |
| 590 { | |
| 591 extract_line_free(alloc, &line_a); | |
| 592 a--; /* Keep the index correct. */ | |
| 593 } | |
| 594 } | |
| 595 } | |
| 596 else if (extract_span_char_last(a_span)->ucs == ' ') | |
| 597 { | |
| 598 } | |
| 599 else if (extract_span_char_last(a_span)->ucs == '/') | |
| 600 { | |
| 601 } | |
| 602 else | |
| 603 { | |
| 604 /* Insert space before joining adjacent lines. */ | |
| 605 char_t *c_prev; | |
| 606 char_t *c = extract_span_append_c(alloc, extract_line_span_last(line_a), ' '); | |
| 607 if (c == NULL) goto end; | |
| 608 c_prev = &a_span->chars[ a_span->chars_num-2]; | |
| 609 c->x = c_prev->x + c_prev->adv * a_span->ctm.a; | |
| 610 c->y = c_prev->y + c_prev->adv * a_span->ctm.c; | |
| 611 } | |
| 612 | |
| 613 /* Join the two paragraphs by moving content from nearest_paragraph to paragraph_a. */ | |
| 614 content_concat(¶graph_a->content, &nearest_paragraph->content); | |
| 615 | |
| 616 #if 0 | |
| 617 printf("Joining to give:\n"); | |
| 618 content_dump_brief(¶graph_a->content); | |
| 619 #endif | |
| 620 | |
| 621 /* Ensure that we skip nearest_paragraph in future. */ | |
| 622 if (pit.next == &nearest_paragraph->base) | |
| 623 pit.next = pit.next->next; | |
| 624 extract_paragraph_free(alloc, &nearest_paragraph); | |
| 625 | |
| 626 if (nearest_paragraph_b > a) { | |
| 627 /* We haven't yet tried appending any paragraphs to | |
| 628 nearest_paragraph_b, so the new extended paragraph_a needs | |
| 629 checking again. */ | |
| 630 pit.next = ¶graph_a->base; | |
| 631 a -= 1; | |
| 632 } else { | |
| 633 a -= 1; | |
| 634 } | |
| 635 } | |
| 636 } | |
| 637 } | |
| 638 | |
| 639 /* Sort paragraphs so they appear in correct order, using paragraphs_cmp(). | |
| 640 */ | |
| 641 content_sort(content, paragraphs_cmp); | |
| 642 | |
| 643 ret = 0; | |
| 644 | |
| 645 end: | |
| 646 | |
| 647 return ret; | |
| 648 } | |
| 649 | |
| 650 /* Return the last non space char of a span (or the first one if | |
| 651 * they are all spaces. */ | |
| 652 static char_t * | |
| 653 last_non_space_char(span_t *span) | |
| 654 { | |
| 655 int i = span->chars_num - 1; | |
| 656 | |
| 657 while (i > 0 && span->chars[i].ucs == 32) | |
| 658 i--; | |
| 659 | |
| 660 return &span->chars[i]; | |
| 661 } | |
| 662 | |
| 663 /* | |
| 664 On entry: | |
| 665 <content> is a list of paragraphs. | |
| 666 | |
| 667 On exit: | |
| 668 <content> is a list of paragraphs, with information about alignment etc. | |
| 669 */ | |
| 670 static int | |
| 671 analyse_paragraphs(content_root_t *content) | |
| 672 { | |
| 673 content_paragraph_iterator pit; | |
| 674 paragraph_t *paragraph; | |
| 675 | |
| 676 /* Examine each paragraph in turn. */ | |
| 677 for (paragraph = content_paragraph_iterator_init(&pit, content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 678 { | |
| 679 content_line_iterator lit; | |
| 680 line_t *line; | |
| 681 double para_l = 0, para_r = 0; | |
| 682 int first_span_of_para = 1; | |
| 683 matrix4_t inverse; | |
| 684 double space_guess = 0; /* Stop clever-clever compilers warning. */ | |
| 685 int previous_line_flags = -1; | |
| 686 double previous_line_spare = 0; | |
| 687 int first_line = 1; | |
| 688 | |
| 689 /* Bound this paragraph on the left and right, pre-transform. */ | |
| 690 for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 691 { | |
| 692 content_span_iterator sit; | |
| 693 span_t *span; | |
| 694 | |
| 695 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 696 { | |
| 697 char_t *lc = &span->chars[0]; | |
| 698 char_t *rc = last_non_space_char(span); | |
| 699 point_t dir = { rc->adv * (1 - span->flags.wmode), rc->adv * span->flags.wmode }; | |
| 700 point_t tdir = extract_matrix4_transform_point(span->ctm, dir); | |
| 701 point_t left = { lc->x, lc->y }; | |
| 702 point_t right = { rc->x + tdir.x, rc->y + tdir.y }; | |
| 703 double l, r; | |
| 704 | |
| 705 /* We examine the ctm on the first span, and store its inverse. We then map all | |
| 706 * the coords we encounter back through this, to give us a position in a consistent | |
| 707 * source space (i.e. we don't use each different ctm we meet for different spans). */ | |
| 708 if (first_span_of_para) | |
| 709 { | |
| 710 inverse = extract_matrix4_invert(&span->ctm); | |
| 711 space_guess = (span->font_bbox.max.x - span->font_bbox.min.x)/2; | |
| 712 } | |
| 713 | |
| 714 left = extract_matrix4_transform_point(inverse, left); | |
| 715 right = extract_matrix4_transform_point(inverse, right); | |
| 716 l = span->flags.wmode ? left.y : left.x; | |
| 717 r = span->flags.wmode ? right.y : right.x; | |
| 718 | |
| 719 if (l < para_l || first_span_of_para) | |
| 720 para_l = l; | |
| 721 if (r > para_r || first_span_of_para) | |
| 722 para_r = r; | |
| 723 first_span_of_para = 0; | |
| 724 } | |
| 725 } | |
| 726 | |
| 727 /* So now we know that para_l/para_r are the bounds for this paragraph, under the ctm of the first_span. */ | |
| 728 | |
| 729 /* Now, look again at the lines that made up that paragraph, to figure out how well each line | |
| 730 * fills those bounds. */ | |
| 731 for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 732 { | |
| 733 content_span_iterator sit; | |
| 734 span_t *span; | |
| 735 double line_l = 0, line_r = 0; | |
| 736 int first_span = 1; | |
| 737 | |
| 738 /* If we're not the first line, then we want to find the first word. */ | |
| 739 int first_word = !first_line; | |
| 740 int word_width_found = 0; | |
| 741 point_t word_end; | |
| 742 int word_wmode; | |
| 743 | |
| 744 /* For each line, find the line bounds. */ | |
| 745 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 746 { | |
| 747 char_t *lc = &span->chars[0]; | |
| 748 char_t *rc = last_non_space_char(span); | |
| 749 point_t dir = { 1 - span->flags.wmode, span->flags.wmode }; | |
| 750 point_t tdir = extract_matrix4_transform_point(span->ctm, dir); | |
| 751 point_t left = { lc->x, lc->y }; | |
| 752 point_t right = { rc->x + tdir.x * rc->adv, rc->y + tdir.y * rc->adv }; | |
| 753 double l, r; | |
| 754 | |
| 755 /* If we're not the first line, then calculate the length of the first word on the | |
| 756 * new line. */ | |
| 757 if (first_word) | |
| 758 { | |
| 759 int i; | |
| 760 | |
| 761 for (i = 0; i < span->chars_num; i++) | |
| 762 { | |
| 763 if (span->chars[i].ucs == 32) | |
| 764 break; | |
| 765 } | |
| 766 if (i > 0) | |
| 767 { | |
| 768 double adv = span->chars[i-1].adv; | |
| 769 word_end.x = span->chars[i-1].x + adv * tdir.x; | |
| 770 word_end.y = span->chars[i-1].y + adv * tdir.y; | |
| 771 word_wmode = span->flags.wmode; | |
| 772 word_width_found = 1; | |
| 773 if (i < span->chars_num) | |
| 774 first_word = 0; | |
| 775 } | |
| 776 } | |
| 777 | |
| 778 left = extract_matrix4_transform_point(inverse, left); | |
| 779 right = extract_matrix4_transform_point(inverse, right); | |
| 780 l = span->flags.wmode ? left.y : left.x; | |
| 781 r = span->flags.wmode ? right.y : right.x; | |
| 782 | |
| 783 if (l < line_l || first_span) | |
| 784 line_l = l; | |
| 785 if (r < line_r || first_span) | |
| 786 line_r = r; | |
| 787 first_span = 0; | |
| 788 } | |
| 789 | |
| 790 #if 0 | |
| 791 printf("Considering:\n"); | |
| 792 content_dump_brief(&line->content); | |
| 793 printf("\n"); | |
| 794 #endif | |
| 795 | |
| 796 /* Did we find a width for the first word? */ | |
| 797 if (word_width_found) | |
| 798 { | |
| 799 double w; | |
| 800 word_end = extract_matrix4_transform_point(inverse, word_end); | |
| 801 w = word_wmode ? word_end.y : word_end.x; | |
| 802 w -= line_l; | |
| 803 /* Now w = the extent of the word. */ | |
| 804 /* If the previous line had enough room for this extent, plus a space, | |
| 805 * then we know that this paragraph can't just be breaking lines when | |
| 806 * they get full. */ | |
| 807 if (previous_line_spare > w + space_guess) | |
| 808 paragraph->line_flags |= paragraph_breaks_strangely; | |
| 809 } | |
| 810 | |
| 811 if (previous_line_flags != -1) | |
| 812 { | |
| 813 paragraph->line_flags |= previous_line_flags; | |
| 814 } | |
| 815 previous_line_flags = 0; | |
| 816 if (line_l > para_l + space_guess) | |
| 817 previous_line_flags |= paragraph_not_aligned_left; | |
| 818 if (line_r < para_r - space_guess) | |
| 819 previous_line_flags |= paragraph_not_aligned_right; | |
| 820 { | |
| 821 /* In order to figure out if we are plausibly centred, | |
| 822 * calculate the l and r gaps. */ | |
| 823 double l = line_l - para_l; | |
| 824 double r = para_r - line_r; | |
| 825 | |
| 826 if (fabs(l - r) > space_guess/2) | |
| 827 paragraph->line_flags |= paragraph_not_centred; | |
| 828 if (l > space_guess/2) | |
| 829 paragraph->line_flags |= paragraph_not_fully_justified; | |
| 830 if (r > space_guess/2) | |
| 831 previous_line_flags |= paragraph_not_fully_justified; | |
| 832 } | |
| 833 previous_line_spare = para_r - line_r + line_l - para_l; | |
| 834 first_line = 0; | |
| 835 } | |
| 836 | |
| 837 if (previous_line_flags != -1) | |
| 838 { | |
| 839 paragraph->line_flags |= (previous_line_flags & paragraph_not_aligned_left); | |
| 840 } | |
| 841 } | |
| 842 | |
| 843 return 0; | |
| 844 } | |
| 845 | |
| 846 static int | |
| 847 spot_rotated_blocks( | |
| 848 extract_alloc_t *alloc, | |
| 849 content_root_t *lines) | |
| 850 { | |
| 851 /* On entry, we have that the content in lines has been | |
| 852 sorted so that paragraphs with the same rotation are together, | |
| 853 and sorted in order of increasing y. All we have to do here is | |
| 854 to spot a run of paragraphs with the same rotation, and to gather | |
| 855 them into a block. */ | |
| 856 content_iterator cit; | |
| 857 content_t *content; | |
| 858 content_iterator cit0 = { 0 }; /* Stop clever-clever compilers warning. */ | |
| 859 content_t *content0 = NULL; | |
| 860 int ret = -1; | |
| 861 matrix4_t ctm0; | |
| 862 int wmode, wmode0; | |
| 863 int ctm0_set = 0; | |
| 864 | |
| 865 for (content = content_iterator_init(&cit, lines); content != NULL; content = content_iterator_next(&cit)) | |
| 866 { | |
| 867 matrix4_t ctm; | |
| 868 int ctm_set = 0; | |
| 869 int flush = 0; | |
| 870 | |
| 871 switch (content->type) | |
| 872 { | |
| 873 case content_paragraph: | |
| 874 { | |
| 875 double rotate; | |
| 876 span_t *span = content_first_span(&content_first_line(&((paragraph_t *)content)->content)->content); | |
| 877 wmode = span->flags.wmode; | |
| 878 ctm = span->ctm; | |
| 879 rotate = atan2(ctm.b, ctm.a); | |
| 880 /* We are not gathering rotated stuff into blocks. If the rotation returns to zero | |
| 881 * then flush any collection we might have found. Otherwise, remember that we have | |
| 882 * a ctm value set, so we can compare to it. */ | |
| 883 if (rotate == 0) | |
| 884 flush = 1; | |
| 885 else | |
| 886 ctm_set = 1; | |
| 887 /* If the ctm value differs from the first ctm0 we met for the current collection, | |
| 888 * flush the collection. */ | |
| 889 if (ctm0_set && (wmode != wmode0 || !matrices_are_compatible(&ctm, &ctm0, wmode0))) | |
| 890 flush = 1; | |
| 891 break; | |
| 892 } | |
| 893 default: | |
| 894 flush = 1; | |
| 895 break; | |
| 896 } | |
| 897 | |
| 898 if (flush && content0) | |
| 899 { | |
| 900 /* Move [content0..content) to a block */ | |
| 901 block_t *block; | |
| 902 content_t *c = content_iterator_next(&cit0); | |
| 903 /* Replace content0 with new block. */ | |
| 904 if (content_replace_new_block(alloc, content0, &block)) goto end; | |
| 905 /* Insert content0 into block. */ | |
| 906 content_append(&block->content, content0); | |
| 907 /* Now move the rest of the list into block too. */ | |
| 908 for (; c != content; c = content_iterator_next(&cit0)) | |
| 909 { | |
| 910 content_append(&block->content, c); | |
| 911 } | |
| 912 ctm0_set = 0; | |
| 913 content0 = NULL; | |
| 914 } | |
| 915 if (ctm_set && !ctm0_set) | |
| 916 { | |
| 917 ctm0 = ctm; | |
| 918 ctm0_set = 1; | |
| 919 wmode0 = wmode; | |
| 920 content0 = content; | |
| 921 cit0 = cit; | |
| 922 } | |
| 923 } | |
| 924 | |
| 925 if (content0) | |
| 926 { | |
| 927 /* Move [content0..NULL) to a block */ | |
| 928 block_t *block; | |
| 929 content_t *c = content_iterator_next(&cit0); | |
| 930 /* Replace content0 with new block. */ | |
| 931 if (content_replace_new_block(alloc, content0, &block)) goto end; | |
| 932 /* Insert content0 into block. */ | |
| 933 content_append(&block->content, content0); | |
| 934 /* Now move the rest of the list into block too. */ | |
| 935 for (; c != content; c = content_iterator_next(&cit0)) | |
| 936 { | |
| 937 content_append(&block->content, c); | |
| 938 } | |
| 939 } | |
| 940 | |
| 941 ret = 0; | |
| 942 end: | |
| 943 | |
| 944 return ret; | |
| 945 } | |
| 946 | |
| 947 /* Take any content from content that falls inside rect, and append | |
| 948 * it to subset. */ | |
| 949 static int | |
| 950 spans_within_rect( | |
| 951 extract_alloc_t *alloc, | |
| 952 content_root_t *content, | |
| 953 rect_t *rect, | |
| 954 content_root_t *subset) | |
| 955 { | |
| 956 content_span_iterator it; | |
| 957 span_t *candidate; | |
| 958 | |
| 959 /* Make <lines> contain new span_t's and char_t's that are inside rects[]. */ | |
| 960 for (candidate = content_span_iterator_init(&it, content); candidate != NULL; candidate = content_span_iterator_next(&it)) | |
| 961 { | |
| 962 span_t *span; | |
| 963 | |
| 964 if (candidate->chars_num == 0) | |
| 965 continue; /* In case used for table, */ | |
| 966 | |
| 967 /* Create a new span. */ | |
| 968 if (content_new_span(alloc, &span, candidate->structure)) | |
| 969 return -1; | |
| 970 /* Extract any chars from candidate that fall inside rect, inserting | |
| 971 * those chars into subset. */ | |
| 972 if (span_inside_rect(alloc, candidate, rect, span)) | |
| 973 return -1; | |
| 974 if (span->chars_num) | |
| 975 { | |
| 976 /* We populated it with some chars! */ | |
| 977 /* Unlink span from where it was, and insert into subset. */ | |
| 978 content_append_span(subset, span); | |
| 979 } | |
| 980 else | |
| 981 { | |
| 982 /* No chars found. Bin it. */ | |
| 983 extract_span_free(alloc, &span); | |
| 984 } | |
| 985 span = NULL; /* Avoid us freeing on error now ownership has moved. */ | |
| 986 | |
| 987 if (!candidate->chars_num) | |
| 988 { | |
| 989 /* All characters in this span are inside table, so remove | |
| 990 * the vestigial span. */ | |
| 991 extract_span_free(alloc, &candidate); | |
| 992 } | |
| 993 } | |
| 994 | |
| 995 return 0; | |
| 996 } | |
| 997 | |
| 998 static int | |
| 999 join_content( | |
| 1000 extract_alloc_t *alloc, | |
| 1001 content_root_t *lines, | |
| 1002 double master_space_guess) | |
| 1003 { | |
| 1004 if (make_lines(alloc, lines, master_space_guess)) | |
| 1005 return -1; | |
| 1006 if (make_paragraphs(alloc, lines)) | |
| 1007 return -1; | |
| 1008 if (analyse_paragraphs(lines)) | |
| 1009 return -1; | |
| 1010 if (spot_rotated_blocks(alloc, lines)) | |
| 1011 return -1; | |
| 1012 | |
| 1013 return 0; | |
| 1014 } | |
| 1015 | |
| 1016 | |
| 1017 /* Compares two tableline_t's rectangles using x as primary key. */ | |
| 1018 static int tablelines_compare_x(const void *a, const void *b) | |
| 1019 { | |
| 1020 const tableline_t *aa = a; | |
| 1021 const tableline_t *bb = b; | |
| 1022 | |
| 1023 if (aa->rect.min.x > bb->rect.min.x) return +1; | |
| 1024 if (aa->rect.min.x < bb->rect.min.x) return -1; | |
| 1025 if (aa->rect.min.y > bb->rect.min.y) return +1; | |
| 1026 if (aa->rect.min.y < bb->rect.min.y) return -1; | |
| 1027 | |
| 1028 return 0; | |
| 1029 } | |
| 1030 | |
| 1031 /* Compares two tableline_t's rectangles using y as primary key. */ | |
| 1032 static int tablelines_compare_y(const void *a, const void *b) | |
| 1033 { | |
| 1034 const tableline_t *aa = a; | |
| 1035 const tableline_t *bb = b; | |
| 1036 | |
| 1037 if (aa->rect.min.y > bb->rect.min.y) return +1; | |
| 1038 if (aa->rect.min.y < bb->rect.min.y) return -1; | |
| 1039 if (aa->rect.min.x > bb->rect.min.x) return +1; | |
| 1040 if (aa->rect.min.x < bb->rect.min.x) return -1; | |
| 1041 | |
| 1042 return 0; | |
| 1043 } | |
| 1044 | |
| 1045 /* Makes <out> to contain all lines in <all> with y coordinate in the range | |
| 1046 y_min..y_max. */ | |
| 1047 static int | |
| 1048 table_find_y_range( | |
| 1049 extract_alloc_t *alloc, | |
| 1050 tablelines_t *all, | |
| 1051 double y_min, | |
| 1052 double y_max, | |
| 1053 tablelines_t *out) | |
| 1054 { | |
| 1055 int i; | |
| 1056 | |
| 1057 for (i=0; i<all->tablelines_num; ++i) | |
| 1058 { | |
| 1059 if (all->tablelines[i].rect.min.y >= y_min && all->tablelines[i].rect.min.y < y_max) | |
| 1060 { | |
| 1061 if (extract_realloc(alloc, &out->tablelines, sizeof(*out->tablelines) * (out->tablelines_num + 1))) return -1; | |
| 1062 out->tablelines[out->tablelines_num] = all->tablelines[i]; | |
| 1063 out->tablelines_num += 1; | |
| 1064 } | |
| 1065 else | |
| 1066 { | |
| 1067 outf("Excluding line because outside y=%f..%f: %s", y_min, y_max, extract_rect_string(&all->tablelines[i].rect)); | |
| 1068 } | |
| 1069 } | |
| 1070 | |
| 1071 return 0; | |
| 1072 } | |
| 1073 | |
| 1074 | |
| 1075 /* Returns one if a_min..a_max significantly overlapps b_min..b_max, otherwise | |
| 1076 zero. */ | |
| 1077 static int | |
| 1078 overlap(double a_min, double a_max, double b_min, double b_max) | |
| 1079 { | |
| 1080 double overlap; | |
| 1081 int ret0; | |
| 1082 int ret1; | |
| 1083 | |
| 1084 assert(a_min < a_max); | |
| 1085 assert(b_min < b_max); | |
| 1086 if (b_min < a_min) b_min = a_min; | |
| 1087 if (b_max > a_max) b_max = a_max; | |
| 1088 if (b_max < b_min) b_max = b_min; | |
| 1089 overlap = (b_max - b_min) / (a_max - a_min); | |
| 1090 ret0 = overlap > 0.2; | |
| 1091 ret1 = overlap > 0.8; | |
| 1092 if (ret0 != ret1) | |
| 1093 { | |
| 1094 if (0) outf0("warning, unclear overlap=%f: a=%f..%f b=%f..%f", overlap, a_min, a_max, b_min, b_max); | |
| 1095 } | |
| 1096 | |
| 1097 return overlap > 0.8; | |
| 1098 } | |
| 1099 | |
| 1100 void extract_cell_init(cell_t *cell) | |
| 1101 { | |
| 1102 cell->rect.min.x = 0; | |
| 1103 cell->rect.min.y = 0; | |
| 1104 cell->rect.max.x = 0; | |
| 1105 cell->rect.max.y = 0; | |
| 1106 cell->above = 0; | |
| 1107 cell->left = 0; | |
| 1108 cell->extend_right = 0; | |
| 1109 cell->extend_down = 0; | |
| 1110 content_init_root(&cell->content, NULL); | |
| 1111 } | |
| 1112 | |
| 1113 | |
| 1114 /* Find cell extensions to right and down by looking at cells' .left and | |
| 1115 above flags. | |
| 1116 | |
| 1117 For example for adjacent cells ABC..., we extend A to include cells BC.. | |
| 1118 until we reach a cell with .left set to one. | |
| 1119 | |
| 1120 ABCDE | |
| 1121 FGHIJ | |
| 1122 KLMNO | |
| 1123 | |
| 1124 When looking to extend cell A, we only look at cells in the same column or | |
| 1125 same row, (i.e. in the above example we look at BCDE and FK, and not at | |
| 1126 GHIJ and LMNO). | |
| 1127 | |
| 1128 For example if BCDE have no left lines and FK have no above lines, we | |
| 1129 ignore any lines in GHIJ and LMNO and make A extend to the entire 3x4 | |
| 1130 box. Having found this box, we set .above=0 and .left to 0 in all enclosed | |
| 1131 cells, which simplifies html table generation code. | |
| 1132 */ | |
| 1133 static int table_find_extend(cell_t **cells, int cells_num_x, int cells_num_y) | |
| 1134 { | |
| 1135 int y; | |
| 1136 | |
| 1137 for (y=0; y<cells_num_y; ++y) | |
| 1138 { | |
| 1139 int x; | |
| 1140 for (x=0; x<cells_num_x; ++x) | |
| 1141 { | |
| 1142 cell_t* cell = cells[y * cells_num_x + x]; | |
| 1143 outf("xy=(%i %i) above=%i left=%i", x, y, cell->above, cell->left); | |
| 1144 if (cell->left && cell->above) | |
| 1145 { | |
| 1146 /* See how far this cell extends to right and down. */ | |
| 1147 int xx; | |
| 1148 int yy; | |
| 1149 for (xx=x+1; xx<cells_num_x; ++xx) | |
| 1150 { | |
| 1151 if (cells[y * cells_num_x + xx]->left) break; | |
| 1152 } | |
| 1153 cell->extend_right = xx - x; | |
| 1154 cell->rect.max.x = cells[y * cells_num_x + xx-1]->rect.max.x; | |
| 1155 for (yy=y+1; yy<cells_num_y; ++yy) | |
| 1156 { | |
| 1157 if (cells[yy * cells_num_x + x]->above) break; | |
| 1158 } | |
| 1159 cell->extend_down = yy - y; | |
| 1160 cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y; | |
| 1161 | |
| 1162 /* Clear .above and .left in enclosed cells. */ | |
| 1163 for (xx = x; xx < x + cell->extend_right; ++xx) | |
| 1164 { | |
| 1165 int yy; | |
| 1166 for (yy = y; yy < y + cell->extend_down; ++yy) | |
| 1167 { | |
| 1168 cell_t* cell2 = cells[cells_num_x * yy + xx]; | |
| 1169 if ( xx==x && yy==y) | |
| 1170 {} | |
| 1171 else | |
| 1172 { | |
| 1173 if (xx==x) | |
| 1174 { | |
| 1175 cell2->extend_right = cell->extend_right; | |
| 1176 } | |
| 1177 cell2->above = 0; | |
| 1178 /* We set .left to 1 for left-most cells - e.g. F | |
| 1179 and K in the above diagram; this allows us to | |
| 1180 generate correct html without lots of recursing | |
| 1181 looking for extend_down in earlier cells. */ | |
| 1182 cell2->left = (xx == x); | |
| 1183 outf("xy=(%i %i) xxyy=(%i %i) have set cell2->above=%i left=%i", | |
| 1184 x, y, xx, yy, cell2->above, cell2->left | |
| 1185 ); | |
| 1186 } | |
| 1187 } | |
| 1188 } | |
| 1189 } | |
| 1190 } | |
| 1191 } | |
| 1192 return 0; | |
| 1193 } | |
| 1194 | |
| 1195 | |
| 1196 /* Sets each cell to contain the text that is within the cell's boundary. We | |
| 1197 remove any found text from the page. */ | |
| 1198 static int | |
| 1199 table_find_cells_text( | |
| 1200 extract_alloc_t *alloc, | |
| 1201 subpage_t *subpage, | |
| 1202 cell_t **cells, | |
| 1203 int cells_num_x, | |
| 1204 int cells_num_y, | |
| 1205 double master_space_guess) | |
| 1206 { | |
| 1207 /* Find text within each cell. We don't attempt to handle images within | |
| 1208 cells. */ | |
| 1209 int e = -1; | |
| 1210 int i; | |
| 1211 int cells_num = cells_num_x * cells_num_y; | |
| 1212 table_t *table; | |
| 1213 | |
| 1214 for (i=0; i<cells_num; ++i) | |
| 1215 { | |
| 1216 cell_t* cell = cells[i]; | |
| 1217 if (!cell->above || !cell->left) continue; | |
| 1218 | |
| 1219 if (spans_within_rect(alloc, &subpage->content, &cell->rect, &cell->content)) | |
| 1220 return -1; | |
| 1221 if (join_content(alloc, &cell->content, master_space_guess)) | |
| 1222 return -1; | |
| 1223 } | |
| 1224 | |
| 1225 /* Append the table we have found to page->tables[]. */ | |
| 1226 if (content_append_new_table(alloc, &subpage->tables, &table)) goto end; | |
| 1227 table->pos.x = cells[0]->rect.min.x; | |
| 1228 table->pos.y = cells[0]->rect.min.y; | |
| 1229 table->cells = cells; | |
| 1230 table->cells_num_x = cells_num_x; | |
| 1231 table->cells_num_y = cells_num_y; | |
| 1232 | |
| 1233 if (0) | |
| 1234 { | |
| 1235 /* For debugging. */ | |
| 1236 int y; | |
| 1237 outf0("table:\n"); | |
| 1238 for (y=0; y<cells_num_y; ++y) | |
| 1239 { | |
| 1240 int x; | |
| 1241 for (x=0; x<cells_num_x; ++x) | |
| 1242 { | |
| 1243 cell_t* cell = cells[cells_num_x * y + x]; | |
| 1244 fprintf(stderr, " %c%c x=%i y=% 3i 3i w=%i h=%i", | |
| 1245 cell->left ? '|' : ' ', | |
| 1246 cell->above ? '-' : ' ', | |
| 1247 x, | |
| 1248 y, | |
| 1249 cell->extend_right, | |
| 1250 cell->extend_down | |
| 1251 ); | |
| 1252 } | |
| 1253 fprintf(stderr, "\n"); | |
| 1254 } | |
| 1255 | |
| 1256 } | |
| 1257 | |
| 1258 e = 0; | |
| 1259 end: | |
| 1260 | |
| 1261 return e; | |
| 1262 } | |
| 1263 | |
| 1264 | |
| 1265 /* Finds single table made from lines whose y coordinates are in the range | |
| 1266 y_min..y_max. */ | |
| 1267 static int | |
| 1268 table_find(extract_alloc_t *alloc, subpage_t *subpage, double y_min, double y_max, double master_space_guess) | |
| 1269 { | |
| 1270 tablelines_t *all_h = &subpage->tablelines_horizontal; | |
| 1271 tablelines_t *all_v = &subpage->tablelines_vertical; | |
| 1272 int e = -1; | |
| 1273 int i; | |
| 1274 | |
| 1275 /* Find subset of vertical and horizontal lines that are within range | |
| 1276 y_min..y_max, and sort by y coordinate. */ | |
| 1277 tablelines_t tl_h = {NULL, 0}; | |
| 1278 tablelines_t tl_v = {NULL, 0}; | |
| 1279 cell_t **cells = NULL; | |
| 1280 int cells_num = 0; | |
| 1281 int cells_num_x = 0; | |
| 1282 int cells_num_y = 0; | |
| 1283 int x; | |
| 1284 int y; | |
| 1285 | |
| 1286 outf("y=(%f %f)", y_min, y_max); | |
| 1287 | |
| 1288 if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end; | |
| 1289 if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end; | |
| 1290 /* Suppress false coverity warning - qsort() does not dereference null | |
| 1291 pointer if nmemb is zero. */ | |
| 1292 /* coverity[var_deref_model] */ | |
| 1293 qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x); | |
| 1294 | |
| 1295 if (0) | |
| 1296 { | |
| 1297 /* Show raw lines info. */ | |
| 1298 outf0("all_h->tablelines_num=%i tl_h.tablelines_num=%i", all_h->tablelines_num, tl_h.tablelines_num); | |
| 1299 for (i=0; i<tl_h.tablelines_num; ++i) | |
| 1300 { | |
| 1301 outf0(" %i: %s", i, extract_rect_string(&tl_h.tablelines[i].rect)); | |
| 1302 } | |
| 1303 | |
| 1304 outf0("all_v->tablelines_num=%i tl_v.tablelines_num=%i", all_v->tablelines_num, tl_v.tablelines_num); | |
| 1305 for (i=0; i<tl_v.tablelines_num; ++i) | |
| 1306 { | |
| 1307 outf0(" %i: %s", i, extract_rect_string(&tl_v.tablelines[i].rect)); | |
| 1308 } | |
| 1309 } | |
| 1310 /* Find the cells defined by the vertical and horizontal lines. | |
| 1311 | |
| 1312 It seems that lines can be disjoint, e.g. what looks like a single | |
| 1313 horizontal line could be made up of multiple lines all with the same | |
| 1314 y coordinate, so we use i_next and j_next to skip these sublines when | |
| 1315 iterating. */ | |
| 1316 cells = NULL; | |
| 1317 cells_num = 0; | |
| 1318 cells_num_x = 0; | |
| 1319 cells_num_y = 0; | |
| 1320 for (i=0; i<tl_h.tablelines_num; ) | |
| 1321 { | |
| 1322 int i_next; | |
| 1323 int j; | |
| 1324 for (i_next=i+1; i_next<tl_h.tablelines_num; ++i_next) | |
| 1325 { | |
| 1326 if (tl_h.tablelines[i_next].rect.min.y - tl_h.tablelines[i].rect.min.y > 5) break; | |
| 1327 } | |
| 1328 if (i_next == tl_h.tablelines_num) | |
| 1329 { | |
| 1330 /* Ignore last row of points - cells need another row below. */ | |
| 1331 break; | |
| 1332 } | |
| 1333 cells_num_y += 1; | |
| 1334 | |
| 1335 for (j=0; j<tl_v.tablelines_num; ) | |
| 1336 { | |
| 1337 int j_next; | |
| 1338 int ii; | |
| 1339 int jj; | |
| 1340 cell_t* cell; | |
| 1341 | |
| 1342 for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next) | |
| 1343 { | |
| 1344 if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break; | |
| 1345 } | |
| 1346 outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect)); | |
| 1347 | |
| 1348 if (j_next == tl_v.tablelines_num) break; | |
| 1349 | |
| 1350 if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end; | |
| 1351 if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end; | |
| 1352 cell = cells[cells_num]; | |
| 1353 cells_num += 1; | |
| 1354 if (i==0) cells_num_x += 1; | |
| 1355 | |
| 1356 cell->rect.min.x = tl_v.tablelines[j].rect.min.x; | |
| 1357 cell->rect.min.y = tl_h.tablelines[i].rect.min.y; | |
| 1358 cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x; | |
| 1359 cell->rect.max.y = (i_next < tl_h.tablelines_num) ? tl_h.tablelines[i_next].rect.min.y : cell->rect.min.y; | |
| 1360 cell->above = (i==0); | |
| 1361 cell->left = (j==0); | |
| 1362 cell->extend_right = 1; | |
| 1363 cell->extend_down = 1; | |
| 1364 content_init_root(&cell->content, NULL); | |
| 1365 | |
| 1366 /* Set cell->above if there is a horizontal line above the cell. */ | |
| 1367 outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect)); | |
| 1368 for (ii = i; ii < i_next; ++ii) | |
| 1369 { | |
| 1370 tableline_t* h = &tl_h.tablelines[ii]; | |
| 1371 if (overlap( | |
| 1372 cell->rect.min.x, | |
| 1373 cell->rect.max.x, | |
| 1374 h->rect.min.x, | |
| 1375 h->rect.max.x | |
| 1376 )) | |
| 1377 { | |
| 1378 cell->above = 1; | |
| 1379 break; | |
| 1380 } | |
| 1381 } | |
| 1382 | |
| 1383 /* Set cell->left if there is a vertical line to the left of the cell. */ | |
| 1384 for (jj = j; jj < j_next; ++jj) | |
| 1385 { | |
| 1386 tableline_t* v = &tl_v.tablelines[jj]; | |
| 1387 if (overlap( | |
| 1388 cell->rect.min.y, | |
| 1389 cell->rect.max.y, | |
| 1390 v->rect.min.y, | |
| 1391 v->rect.max.y | |
| 1392 )) | |
| 1393 { | |
| 1394 cell->left = 1; | |
| 1395 break; | |
| 1396 } | |
| 1397 } | |
| 1398 | |
| 1399 j = j_next; | |
| 1400 } | |
| 1401 | |
| 1402 i = i_next; | |
| 1403 } | |
| 1404 | |
| 1405 assert(cells_num == cells_num_x * cells_num_y); | |
| 1406 | |
| 1407 /* Remove cols and rows where no cells have .above and .left - these | |
| 1408 will not appear. It also avoids spurious empty columns when table uses | |
| 1409 closely-spaced double lines as separators. */ | |
| 1410 for (x=0; x<cells_num_x; ++x) | |
| 1411 { | |
| 1412 int has_cells = 0; | |
| 1413 for (y=0; y<cells_num_y; ++y) | |
| 1414 { | |
| 1415 cell_t* cell = cells[y * cells_num_x + x]; | |
| 1416 if (cell->above && cell->left) | |
| 1417 { | |
| 1418 has_cells = 1; | |
| 1419 break; | |
| 1420 } | |
| 1421 } | |
| 1422 if (!has_cells) | |
| 1423 { | |
| 1424 /* Remove column <x>. */ | |
| 1425 int j = 0; | |
| 1426 outf("Removing column %i. cells_num=%i cells_num_x=%i cells_num_y=%i", x, cells_num, cells_num_x, cells_num_y); | |
| 1427 for (i=0; i<cells_num; ++i) | |
| 1428 { | |
| 1429 if (i % cells_num_x == x) | |
| 1430 { | |
| 1431 extract_cell_free(alloc, &cells[i]); | |
| 1432 continue; | |
| 1433 } | |
| 1434 cells[j] = cells[i]; | |
| 1435 j += 1; | |
| 1436 } | |
| 1437 cells_num -= cells_num_y; | |
| 1438 cells_num_x -= 1; | |
| 1439 } | |
| 1440 } | |
| 1441 | |
| 1442 if (cells_num == 0) | |
| 1443 { | |
| 1444 e = 0; | |
| 1445 goto end; | |
| 1446 } | |
| 1447 | |
| 1448 if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end; | |
| 1449 | |
| 1450 if (table_find_cells_text(alloc, subpage, cells, cells_num_x, cells_num_y, master_space_guess)) goto end; | |
| 1451 | |
| 1452 e = 0; | |
| 1453 end: | |
| 1454 | |
| 1455 extract_free(alloc, &tl_h.tablelines); | |
| 1456 extract_free(alloc, &tl_v.tablelines); | |
| 1457 if (e) | |
| 1458 { | |
| 1459 for (i=0; i<cells_num; ++i) | |
| 1460 { | |
| 1461 extract_cell_free(alloc, &cells[i]); | |
| 1462 } | |
| 1463 extract_free(alloc, &cells); | |
| 1464 } | |
| 1465 | |
| 1466 return e; | |
| 1467 } | |
| 1468 | |
| 1469 | |
| 1470 /* Finds tables in <page> by looking for lines in page->tablelines_horizontal | |
| 1471 and page->tablelines_vertical that look like table dividers. | |
| 1472 | |
| 1473 Any text found inside tables is removed from page->spans[]. | |
| 1474 */ | |
| 1475 static int extract_subpage_tables_find_lines( | |
| 1476 extract_alloc_t *alloc, | |
| 1477 subpage_t *subpage, | |
| 1478 double master_space_guess) | |
| 1479 { | |
| 1480 double miny; | |
| 1481 double maxy; | |
| 1482 double margin = 1; | |
| 1483 int iv; | |
| 1484 int ih; | |
| 1485 outf("page->tablelines_horizontal.tablelines_num=%i", subpage->tablelines_horizontal.tablelines_num); | |
| 1486 outf("page->tablelines_vertical.tablelines_num=%i", subpage->tablelines_vertical.tablelines_num); | |
| 1487 | |
| 1488 /* Sort all lines by y coordinate. */ | |
| 1489 qsort(subpage->tablelines_horizontal.tablelines, | |
| 1490 subpage->tablelines_horizontal.tablelines_num, | |
| 1491 sizeof(*subpage->tablelines_horizontal.tablelines), | |
| 1492 tablelines_compare_y); | |
| 1493 qsort(subpage->tablelines_vertical.tablelines, | |
| 1494 subpage->tablelines_vertical.tablelines_num, | |
| 1495 sizeof(*subpage->tablelines_vertical.tablelines), | |
| 1496 tablelines_compare_y); | |
| 1497 | |
| 1498 if (0) | |
| 1499 { | |
| 1500 /* Show info about lines. */ | |
| 1501 int i; | |
| 1502 outf0("tablelines_horizontal:"); | |
| 1503 for (i=0; i<subpage->tablelines_horizontal.tablelines_num; ++i) | |
| 1504 { | |
| 1505 outf0(" color=%f: %s", | |
| 1506 subpage->tablelines_horizontal.tablelines[i].color, | |
| 1507 extract_rect_string(&subpage->tablelines_horizontal.tablelines[i].rect) | |
| 1508 ); | |
| 1509 } | |
| 1510 outf0("tablelines_vertical:"); | |
| 1511 for (i=0; i<subpage->tablelines_vertical.tablelines_num; ++i) | |
| 1512 { | |
| 1513 outf0(" color=%f: %s", | |
| 1514 subpage->tablelines_vertical.tablelines[i].color, | |
| 1515 extract_rect_string(&subpage->tablelines_vertical.tablelines[i].rect) | |
| 1516 ); | |
| 1517 } | |
| 1518 } | |
| 1519 | |
| 1520 /* Look for completely separate vertical regions that define different | |
| 1521 tables, by looking for vertical gaps between the rects of each | |
| 1522 horizontal/vertical line. */ | |
| 1523 maxy = -DBL_MAX; | |
| 1524 miny = -DBL_MAX; | |
| 1525 iv = 0; | |
| 1526 ih = 0; | |
| 1527 for(;;) | |
| 1528 { | |
| 1529 tableline_t *tlv = NULL; | |
| 1530 tableline_t *tlh = NULL; | |
| 1531 tableline_t *tl; | |
| 1532 if (iv < subpage->tablelines_vertical.tablelines_num) | |
| 1533 { | |
| 1534 tlv = &subpage->tablelines_vertical.tablelines[iv]; | |
| 1535 } | |
| 1536 /* We only consider horizontal lines that are not white. This is a bit | |
| 1537 of a cheat to get the right behaviour with twotables_2.pdf. */ | |
| 1538 while (ih < subpage->tablelines_horizontal.tablelines_num) | |
| 1539 { | |
| 1540 if (subpage->tablelines_horizontal.tablelines[ih].color == 1) | |
| 1541 { | |
| 1542 /* Ignore white horizontal lines. */ | |
| 1543 ++ih; | |
| 1544 } | |
| 1545 else | |
| 1546 { | |
| 1547 tlh = &subpage->tablelines_horizontal.tablelines[ih]; | |
| 1548 break; | |
| 1549 } | |
| 1550 } | |
| 1551 if (tlv && tlh) | |
| 1552 { | |
| 1553 tl = (tlv->rect.min.y < tlh->rect.min.y) ? tlv : tlh; | |
| 1554 } | |
| 1555 else if (tlv) tl = tlv; | |
| 1556 else if (tlh) tl = tlh; | |
| 1557 else break; | |
| 1558 if (tl == tlv) iv += 1; | |
| 1559 else ih += 1; | |
| 1560 if (tl->rect.min.y > maxy + margin) | |
| 1561 { | |
| 1562 if (maxy > miny) | |
| 1563 { | |
| 1564 outf("New table. maxy=%f miny=%f", maxy, miny); | |
| 1565 /* Find table. */ | |
| 1566 table_find(alloc, subpage, miny - margin, maxy + margin, master_space_guess); | |
| 1567 } | |
| 1568 miny = tl->rect.min.y; | |
| 1569 } | |
| 1570 if (tl->rect.max.y > maxy) maxy = tl->rect.max.y; | |
| 1571 } | |
| 1572 | |
| 1573 /* Find last table. */ | |
| 1574 table_find(alloc, subpage, miny - margin, maxy + margin, master_space_guess); | |
| 1575 | |
| 1576 return 0; | |
| 1577 } | |
| 1578 | |
| 1579 | |
| 1580 /* For debugging only. */ | |
| 1581 static void show_tables(content_root_t *tables) | |
| 1582 { | |
| 1583 content_table_iterator tit; | |
| 1584 table_t *table; | |
| 1585 | |
| 1586 outf0("tables_num=%i", content_count_tables(tables)); | |
| 1587 for (table = content_table_iterator_init(&tit, tables); table != NULL; table = content_table_iterator_next(&tit)) | |
| 1588 { | |
| 1589 int y; | |
| 1590 outf0("table: cells_num_y=%i cells_num_x=%i", table->cells_num_y, table->cells_num_x); | |
| 1591 for (y=0; y<table->cells_num_y; ++y) | |
| 1592 { | |
| 1593 int x; | |
| 1594 for (x=0; x<table->cells_num_x; ++x) | |
| 1595 { | |
| 1596 cell_t* cell = table->cells[table->cells_num_x * y + x]; | |
| 1597 outf0("cell: y=% 3i x=% 3i: left=%i above=%i rect=%s", | |
| 1598 y, x, cell->left, cell->above, extract_rect_string(&cell->rect)); | |
| 1599 } | |
| 1600 } | |
| 1601 } | |
| 1602 } | |
| 1603 | |
| 1604 /* Find tables in <page>. | |
| 1605 | |
| 1606 At the moment this only calls extract_page_tables_find_lines(), but in future | |
| 1607 will call other functions that find tables in different ways, e.g. by analysing | |
| 1608 an image of a page, or looking for blocks of whitespace in between chunks of | |
| 1609 text. */ | |
| 1610 static int | |
| 1611 extract_subpage_tables_find( | |
| 1612 extract_alloc_t *alloc, | |
| 1613 subpage_t *subpage, | |
| 1614 double master_space_guess) | |
| 1615 { | |
| 1616 if (extract_subpage_tables_find_lines(alloc, subpage, master_space_guess)) return -1; | |
| 1617 | |
| 1618 if (0) | |
| 1619 { | |
| 1620 outf0("=== tables from extract_page_tables_find_lines():"); | |
| 1621 show_tables(&subpage->tables); | |
| 1622 } | |
| 1623 | |
| 1624 return 0; | |
| 1625 } | |
| 1626 | |
| 1627 /* Finds tables and paragraphs on <page>. */ | |
| 1628 static int | |
| 1629 extract_join_subpage( | |
| 1630 extract_alloc_t *alloc, | |
| 1631 subpage_t *subpage, | |
| 1632 double master_space_guess) | |
| 1633 { | |
| 1634 /* Find tables on this page first. This will remove text that is within | |
| 1635 tables from page->spans, so that text doesn't appear more than once in | |
| 1636 the final output. */ | |
| 1637 if (extract_subpage_tables_find(alloc, subpage, master_space_guess)) return -1; | |
| 1638 | |
| 1639 /* Now join remaining spans into lines and paragraphs. */ | |
| 1640 if (join_content(alloc, &subpage->content, master_space_guess)) | |
| 1641 return -1; | |
| 1642 | |
| 1643 return 0; | |
| 1644 } | |
| 1645 | |
| 1646 | |
| 1647 /* For each page in <document> we find tables and join spans into lines and paragraphs. | |
| 1648 | |
| 1649 A line is a list of spans that are at the same angle and on the same | |
| 1650 line. A paragraph is a list of lines that are at the same angle and close | |
| 1651 together. | |
| 1652 */ | |
| 1653 int extract_document_join(extract_alloc_t *alloc, document_t *document, int layout_analysis, double master_space_guess) | |
| 1654 { | |
| 1655 int p; | |
| 1656 | |
| 1657 for (p=0; p<document->pages_num; ++p) { | |
| 1658 extract_page_t* page = document->pages[p]; | |
| 1659 int c; | |
| 1660 | |
| 1661 /* If we have layout analysis enabled, then we do our 'boxer' analysis to | |
| 1662 * try to spot subdivisions and subpages. */ | |
| 1663 if (layout_analysis && extract_page_analyse(alloc, page)) return -1; | |
| 1664 | |
| 1665 for (c=0; c<page->subpages_num; ++c) { | |
| 1666 subpage_t* subpage = page->subpages[c]; | |
| 1667 | |
| 1668 outf("processing page %i, subpage %i: num_spans=%i", p, c, content_count_spans(&subpage->content)); | |
| 1669 if (extract_join_subpage(alloc, subpage, master_space_guess)) return -1; | |
| 1670 } | |
| 1671 } | |
| 1672 | |
| 1673 return 0; | |
| 1674 } |
