comparison mupdf-source/thirdparty/extract/src/join.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 #include "extract/extract.h"
2 #include "extract/alloc.h"
3
4 #include "astring.h"
5 #include "document.h"
6 #include "mem.h"
7 #include "outf.h"
8
9 #include <assert.h>
10 #include <float.h>
11 #include <math.h>
12 #include <stdio.h>
13
14
15 static char_t *span_char_first(span_t *span)
16 {
17 assert(span->chars_num > 0);
18 return &span->chars[0];
19 }
20
21 static char_t *span_char_last(span_t *span)
22 {
23 assert(span->chars_num > 0);
24 return &span->chars[span->chars_num-1];
25 }
26
27 const char *extract_matrix_string(const matrix_t *matrix)
28 {
29 static char ret[5][64];
30 static int i = 0;
31 i = (i + 1) % 5;
32 snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f %f %f}",
33 matrix->a,
34 matrix->b,
35 matrix->c,
36 matrix->d,
37 matrix->e,
38 matrix->f);
39
40 return ret[i];
41 }
42
43 const char *extract_matrix4_string(const matrix4_t *matrix)
44 {
45 static char ret[5][64];
46 static int i = 0;
47 i = (i + 1) % 5;
48 snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f}",
49 matrix->a,
50 matrix->b,
51 matrix->c,
52 matrix->d);
53 return ret[i];
54 }
55
56 /* Returns first line in paragraph. */
57 static line_t *paragraph_line_first(const paragraph_t *paragraph)
58 {
59 return content_first_line(&paragraph->content);
60 }
61
62 /* Returns last line in paragraph. */
63 static line_t *paragraph_line_last(const paragraph_t *paragraph)
64 {
65 return content_last_line(&paragraph->content);
66 }
67
68
69
70 /* Things for direct conversion of text spans into lines and paragraphs. */
71
72 static int
73 matrices_are_compatible(const matrix4_t *ctm_a, const matrix4_t *ctm_b, int wmode)
74 {
75 double dot, pdot;
76
77 /* Calculate the dot product between the direction vector transformed by each ctm. This should be large for
78 * compatible lines (cos they should be colinear). Also calculate the dot product between the direction
79 * vector transformed by the first ctm, and perp(direction vector) transformed by the second ctm. This
80 * should be zero. */
81 if (wmode)
82 {
83 dot = ctm_a->c * ctm_b->c + ctm_a->d * ctm_b->d;
84 pdot = ctm_a->c * ctm_b->d - ctm_a->d * ctm_b->c;
85 }
86 else
87 {
88 dot = ctm_a->a * ctm_b->a + ctm_a->b * ctm_b->b;
89 pdot = ctm_a->a * ctm_b->b - ctm_a->b * ctm_b->a;
90 }
91 /* Negative dot means lines are in opposite sense. */
92 if (dot <= 0)
93 return 0;
94 /* Remove the scaling from pdot to get back to a unit vector. */
95 pdot /= dot;
96
97 return (fabs(pdot) < 0.1);
98 }
99
100 /* Returns 1 if lines have same wmode and have the same baseline vector, else 0. */
101 static int
102 lines_are_compatible(line_t *a,
103 line_t *b)
104 {
105 span_t *first_span_a = content_first_span(&a->content);
106 span_t *first_span_b = content_first_span(&b->content);
107
108 if (a == b) return 0;
109 if (!first_span_a || !first_span_b) return 0;
110
111 /* We only join lines with the same wmode. */
112 if (first_span_a->flags.wmode != first_span_b->flags.wmode)
113 return 0;
114
115 return matrices_are_compatible(&first_span_a->ctm, &first_span_b->ctm, first_span_a->flags.wmode);
116 }
117
118
119 static const unsigned ucs_NONE = ((unsigned) -1);
120
121 /* Returns with <o_span> containing char_t's from <span> that are inside
122 rect, and *span modified to remove any char_t's that we have moved to
123 <o_span>.
124
125 May return with span->chars_num == 0, in which case the caller must remove the
126 span (including freeing .font_name), because lots of code assumes that there
127 are no empty spans. */
128 static int
129 span_inside_rect(
130 extract_alloc_t *alloc,
131 span_t *span,
132 rect_t *rect,
133 span_t *o_span)
134 {
135 int c;
136 content_t save = *(content_t *)o_span;
137
138 *o_span = *span;
139 *(content_t *)o_span = save; /* Avoid changing prev/next. */
140 extract_strdup(alloc, span->font_name, &o_span->font_name);
141 o_span->chars = NULL;
142 o_span->chars_num = 0;
143 for (c=0; c<span->chars_num; ++c)
144 {
145 /* For now we just look at whether span's (x, y) is within any
146 rects[]. We could instead try to find character's bounding box etc. */
147 char_t *char_ = &span->chars[c];
148 if (char_->x >= rect->min.x &&
149 char_->x < rect->max.x &&
150 char_->y >= rect->min.y &&
151 char_->y < rect->max.y)
152 {
153 char_t *c = extract_span_append_c(alloc, o_span, char_->ucs);
154 if (c == NULL) return -1;
155 *c = *char_;
156 char_->ucs = ucs_NONE; /* Mark for removal below, so it is not used again. */
157 }
158 }
159
160 /* Remove any char_t's that we've used. */
161 {
162 int cc = 0;
163 for (c=0; c<span->chars_num; ++c)
164 {
165 char_t* char_ = &span->chars[c];
166 if (char_->ucs != ucs_NONE)
167 {
168 span->chars[cc] = span->chars[c];
169 cc += 1;
170 }
171 }
172 /* This might set span->chars_num to zero; our caller needs to remove
173 the span - lots of code assumes that all spans contain at least one
174 character. */
175 span->chars_num = cc;
176 }
177
178 if (o_span->chars_num)
179 {
180 outf("o_span: %s", extract_span_string(alloc, o_span));
181 }
182 return 0;
183 }
184
185 /*
186 On entry:
187 <lines> is a list of span_t's.
188
189 On exit:
190 <lines> is a list of line_t's, made up by having pulled as many of the span_t's
191 as are appropriate together.
192 */
193 static int
194 make_lines(
195 extract_alloc_t *alloc,
196 content_root_t *lines,
197 double master_space_guess)
198 {
199 int ret = -1;
200 int a;
201 content_line_iterator lit;
202 line_t *line_a;
203 content_span_iterator sit;
204 span_t *span;
205
206 /* On entry <lines> contains spans. Make each span part of a <line>. */
207 for (a = 0, span = content_span_iterator_init(&sit, lines); span != NULL; span = content_span_iterator_next(&sit), a++)
208 {
209 line_t *line;
210
211 if (content_replace_new_line(alloc, &span->base, &line)) goto end;
212 content_append_span(&line->content, span);
213 outfx("initial line a=%i: %s", a, line_string(line));
214 }
215
216 /* For each line, look for nearest aligned line, and append if found. */
217 for (a=0, line_a = content_line_iterator_init(&lit, lines); line_a != NULL; a++, line_a = content_line_iterator_next(&lit))
218 {
219 content_line_iterator lit2;
220 line_t *line_b;
221 int b;
222 int nearest_line_b = -1;
223 double nearest_score = 0;
224 line_t *nearest_line = NULL;
225 double nearest_colinear = 0;
226 double nearest_space_guess = 0;
227 span_t *span_a;
228
229 span_a = extract_line_span_last(line_a);
230
231 for (b = 0, line_b = content_line_iterator_init(&lit2, lines); line_b != NULL; b++, line_b = content_line_iterator_next(&lit2))
232 {
233 if (line_a == line_b)
234 continue;
235
236 if (!lines_are_compatible(line_a, line_b))
237 continue;
238
239 {
240 span_t *span_b = extract_line_span_first(line_b);
241 char_t *last_a = extract_span_char_last(span_a);
242 /* Predict the end of span_a. */
243 point_t dir = { last_a->adv * (1 - span_a->flags.wmode), last_a->adv * span_a->flags.wmode };
244 point_t tdir = extract_matrix4_transform_point(span_a->ctm, dir);
245 point_t span_a_end = { last_a->x + tdir.x, last_a->y + tdir.y };
246 /* Find the difference between the end of span_a and the start of span_b. */
247 char_t *first_b = span_char_first(span_b);
248 point_t diff = { first_b->x - span_a_end.x, first_b->y - span_a_end.y };
249 double scale_squared = ((span_a->flags.wmode) ?
250 (span_a->ctm.c * span_a->ctm.c + span_a->ctm.d * span_a->ctm.d) :
251 (span_a->ctm.a * span_a->ctm.a + span_a->ctm.b * span_a->ctm.b));
252 /* Now find the differences in position, both colinear and perpendicular. */
253 double colinear = (diff.x * tdir.x + diff.y * tdir.y) / last_a->adv / scale_squared;
254 double perp = (diff.x * tdir.y - diff.y * tdir.x) / last_a->adv / scale_squared;
255 /* colinear and perp are now both pre-transform space distances, to match adv etc. */
256 double score;
257 double space_guess = (last_a->adv + first_b->adv)/2 * master_space_guess;
258
259 /* Heuristic: perpendicular distance larger than half of adv rules it out as a match. */
260 /* Ideally we should be using font bbox here, but we don't have that, currently. */
261 /* NOTE: We should match the logic in extract_add_char here! */
262 if (fabs(perp) > 3*space_guess/2 || fabs(colinear) > space_guess * 8)
263 continue;
264
265 /* We now form a score for this match. */
266 score = fabs(colinear);
267 if (score < fabs(perp) * 10) /* perpendicular distance matters much more. */
268 score = fabs(perp) * 10;
269
270 if (!nearest_line || score < nearest_score)
271 {
272 nearest_line = line_b;
273 nearest_score = score;
274 nearest_line_b = b;
275 nearest_colinear = colinear;
276 nearest_space_guess = space_guess;
277 }
278 }
279 }
280
281 if (nearest_line)
282 {
283 /* line_a and nearest_line are aligned so we can move line_b's
284 spans on to the end of line_a. */
285 span_t *span_b = extract_line_span_first(nearest_line);
286 b = nearest_line_b;
287
288 if (extract_span_char_last(span_a)->ucs != ' ' &&
289 span_char_first(span_b)->ucs != ' ')
290 {
291 /* Again, match the logic in extract_add_char here. */
292 int insert_space = (nearest_colinear > 2*nearest_space_guess/3);
293 if (insert_space)
294 {
295 /* Append space to span_a before concatenation. */
296 char_t *item = extract_span_append_c(alloc, span_a, ' ');
297 if (item == NULL) goto end;
298 item->adv = 0; /* FIXME */
299 /* This is a hack to give our extra space a vaguely useful
300 (x,y) coordinate - this can be used later on when ordering
301 paragraphs. We could try to be more accurate by adding
302 item[-1]'s .adv suitably transformed by .wmode, .ctm and
303 .trm. */
304 item->x = item[-1].x;
305 item->y = item[-1].y;
306 }
307 }
308
309 /* We might end up with two adjacent spaces here. But removing a
310 space could result in an empty line_t, which could break various
311 assumptions elsewhere. */
312
313 /* Move all the content from nearest_line to line_a. */
314 content_concat(&line_a->content, &nearest_line->content);
315
316 /* Ensure that we ignore nearest_line from now on. */
317 if (lit.next == &nearest_line->base)
318 lit.next = lit.next->next;
319 extract_line_free(alloc, &nearest_line);
320
321 if (b > a) {
322 /* We haven't yet tried appending any spans to nearest_line, so
323 the new extended line_a needs checking again. */
324 lit.next = &line_a->base;
325 a--;
326 } else {
327 a--; /* b is before a, so a's number needs to move back one. */
328 }
329 }
330 }
331
332 ret = 0;
333
334 end:
335 if (ret) {
336 /* Free everything. */
337 extract_span_free(alloc, &span);
338 content_clear(alloc, lines);
339 }
340 return ret;
341 }
342
343
344 /* A comparison function, for sorting paragraphs within a
345 page. */
346 static int paragraphs_cmp(const content_t *a, const content_t *b)
347 {
348 const paragraph_t *a_paragraph = (const paragraph_t *)a;
349 const paragraph_t *b_paragraph = (const paragraph_t *)b;
350 line_t *a_line, *b_line;
351 span_t *a_span, *b_span;
352
353 if (a->type != content_paragraph || b->type != content_paragraph)
354 return 0;
355
356 a_line = paragraph_line_first(a_paragraph);
357 b_line = paragraph_line_first(b_paragraph);
358 a_span = extract_line_span_first(a_line);
359 b_span = extract_line_span_first(b_line);
360
361 /* We can't directly compare stuff with different wmodes. */
362 if (a_span->flags.wmode != b_span->flags.wmode)
363 {
364 return a_span->flags.wmode - b_span->flags.wmode;
365 }
366
367 /* If matrices are compatible (i.e. they share the same baseline vector), don't consider that as
368 * part of the sort.
369 *
370 * Actually, is this safe? We don't necessarily have transitivity here due to the epsilon in the
371 * comparison. A might be compatible with B, and B with C, but A might not be with C.
372 * Worry about that if we get an example.
373 */
374 if (!matrices_are_compatible(&a_span->ctm, &b_span->ctm, a_span->flags.wmode))
375 {
376 /* If ctm matrices differ, always return this diff first. Note that we
377 ignore .e and .f because if data is from ghostscript then .e and .f
378 vary for each span, and we don't care about these differences. */
379 return extract_matrix4_cmp(&a_span->ctm, &b_span->ctm);
380 }
381
382 /* So, we know the matrices are compatible - i.e. the baselines are parallel.
383 * Just sort on how far down the page we are going. */
384 {
385 span_t *span_a = content_first_span(&a_line->content);
386 span_t *span_b = content_first_span(&b_line->content);
387 point_t dir = { 1 - span_a->flags.wmode, span_a->flags.wmode };
388 point_t tdir = extract_matrix4_transform_point(span_a->ctm, dir);
389 point_t diff = { span_a->chars[0].x - span_b->chars[0].x, span_a->chars[0].y - span_b->chars[0].y };
390 double perp = (diff.x * tdir.y - diff.y * tdir.x);
391
392 #if 0
393 printf("Comparing:\n");
394 content_dump_brief(&a_line->content);
395 printf("And:\n");
396 content_dump_brief(&b_line->content);
397 printf("perp=%g\n", perp);
398 #endif
399
400 if (perp < 0)
401 return 1;
402 if (perp > 0)
403 return -1;
404 }
405 return 0;
406 }
407
408 static double
409 font_size_from_ctm(const matrix4_t *ctm)
410 {
411 if (ctm->b == 0)
412 return fabs(ctm->a);
413 if (ctm->a == 0)
414 return fabs(ctm->b);
415
416 return sqrt(ctm->a * ctm->a + ctm->b * ctm->b);
417 }
418
419 static void
420 calculate_line_height(line_t *line)
421 {
422 content_span_iterator sit;
423 span_t *span;
424 double asc = 0, desc = 0;
425
426 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
427 {
428 double span_font_size = font_size_from_ctm(&span->ctm);
429 double min_y = span->font_bbox.min.y * span_font_size;
430 double max_y = span->font_bbox.max.y * span_font_size;
431 if (min_y < desc)
432 desc = min_y;
433 if (max_y > asc)
434 asc = max_y;
435 }
436 line->ascender = asc;
437 line->descender = desc;
438 }
439
440 /*
441 On entry:
442 <content> is a list of lines.
443
444 On exit:
445 <content> is a list of paragraphs, formed from pulling appropriate lines
446 together.
447 */
448 static int
449 make_paragraphs(
450 extract_alloc_t *alloc,
451 content_root_t *content)
452 {
453 int ret = -1;
454 int a;
455 content_line_iterator lit;
456 line_t *line;
457 content_paragraph_iterator pit;
458 paragraph_t *paragraph_a;
459
460 /* Convert every line_t to be a paragraph_t containing that line_t. */
461 for (line = content_line_iterator_init(&lit, content); line != NULL; line = content_line_iterator_next(&lit))
462 {
463 paragraph_t *paragraph;
464 if (content_replace_new_paragraph(alloc, &line->base, &paragraph))
465 goto end;
466 content_append_line(&paragraph->content, line);
467 calculate_line_height(line);
468 }
469
470 /* Now join paragraphs together where possible. */
471 for (a=0, paragraph_a = content_paragraph_iterator_init(&pit, content); paragraph_a != NULL; a++, paragraph_a = content_paragraph_iterator_next(&pit)) {
472 paragraph_t *nearest_paragraph = NULL;
473 int nearest_paragraph_b = -1;
474 double nearest_score = 0;
475 line_t *line_a;
476 paragraph_t *paragraph_b;
477 content_paragraph_iterator pit2;
478 int b;
479 span_t *span_a;
480
481 line_a = paragraph_line_last(paragraph_a);
482 assert(line_a != NULL);
483 span_a = extract_line_span_last(line_a);
484 assert(span_a != NULL);
485
486 /* Look for nearest paragraph_t that could be appended to
487 paragraph_a. */
488 for (b=0, paragraph_b = content_paragraph_iterator_init(&pit2, content); paragraph_b != NULL; b++, paragraph_b = content_paragraph_iterator_next(&pit2))
489 {
490 line_t *line_b;
491
492 if (paragraph_a == paragraph_b)
493 continue;
494 line_b = paragraph_line_first(paragraph_b);
495 if (!lines_are_compatible(line_a, line_b)) {
496 continue;
497 }
498
499 {
500 span_t *line_a_first_span = extract_line_span_first(line_a);
501 span_t *line_a_last_span = extract_line_span_last(line_a);
502 span_t *line_b_first_span = extract_line_span_first(line_b);
503 span_t *line_b_last_span = extract_line_span_last(line_b);
504 char_t *first_a = span_char_first(line_a_first_span);
505 char_t *first_b = span_char_first(line_b_first_span);
506 char_t *last_a = span_char_last(line_a_last_span);
507 char_t *last_b = span_char_last(line_b_last_span);
508 point_t dir = { 1 - span_a->flags.wmode, span_a->flags.wmode };
509 point_t tdir_a = extract_matrix4_transform_point(line_a_last_span->ctm, dir);
510 point_t tdir_b = extract_matrix4_transform_point(line_b_last_span->ctm, dir);
511 /* Find the difference between the start of span_a and the start of span_b. */
512 point_t start_diff = { first_b->x - first_a->x, first_b->y - first_a->y };
513 point_t end_a = { last_a->x + last_a->adv * tdir_a.x, last_a->y + last_a->adv * tdir_a.y };
514 point_t end_b = { last_b->x + last_b->adv * tdir_b.x, last_b->y + last_b->adv * tdir_b.y };
515 /* Now find the perpendicular difference in position. */
516 double scale_squared = ((span_a->flags.wmode) ?
517 (span_a->ctm.c * span_a->ctm.c + span_a->ctm.d * span_a->ctm.d) :
518 (span_a->ctm.a * span_a->ctm.a + span_a->ctm.b * span_a->ctm.b));
519 double perp = (start_diff.x * tdir_a.y - start_diff.y * tdir_a.x) / sqrt(scale_squared);
520 /* perp is now a post-transform space distance. */
521 double score;
522 /* Now consider the linear difference between: 1) start of a to end of a, 2) start of a to start of b,
523 * 3) start of a and the end of b. */
524 point_t saea = { end_a.x - first_a->x, end_a.y - first_a->y };
525 point_t sasb = { first_b->x - first_a->x, first_b->y - first_a->y };
526 point_t saeb = { end_b.x - first_a->x, end_b.y - first_a->y };
527 double dot_saea = ( saea.x * tdir_a.x + saea.y * tdir_a.y );
528 double dot_sasb = ( sasb.x * tdir_a.x + sasb.y * tdir_a.y );
529 double dot_saeb = ( saeb.x * tdir_a.x + saeb.y * tdir_a.y );
530
531 /* We are only interested in scoring down the page. */
532 score = -perp;
533
534 #if 0
535 printf("Comparing:\n");
536 content_dump_brief(&paragraph_a->content);
537 printf("And:\n");
538 content_dump_brief(&paragraph_b->content);
539 printf("score=%g\n", score);
540 printf("saea=%g sasb=%g saeb=%g\n", dot_saea, dot_sasb, dot_saeb);
541 #endif
542
543 /* Check for "horizontal" alignment of the two lines. If line_b starts
544 * entirely to the right of the end of line_a, we can't join with it. */
545 if (dot_sasb > dot_saea)
546 continue;
547 /* If line_b ends entirely to the left of the start of line_a, we can't
548 * join with it. */
549 if (dot_saeb < 0)
550 continue;
551
552 if (score >= 0 && (!nearest_paragraph || score < nearest_score))
553 {
554 nearest_paragraph = paragraph_b;
555 nearest_score = score;
556 nearest_paragraph_b = b;
557 }
558 }
559 }
560
561 if (nearest_paragraph) {
562 double line_a_height = line_a->ascender - line_a->descender;
563 line_t *line_b = paragraph_line_first(nearest_paragraph);
564 double line_b_height = line_b->ascender - line_b->descender;
565 double expected_height = (line_a_height + line_b_height)/2;
566
567 #if 0
568 printf("Best score = %g, expected_height=%g\n", nearest_score, expected_height);
569 #endif
570
571 if (nearest_score > 0 && nearest_score < 2 * expected_height) {
572 /* Paragraphs are close together vertically compared to maximum
573 font size of first line in second paragraph, so we'll join them
574 into a single paragraph. */
575 span_t *a_span = extract_line_span_last(line_a);
576
577 if (extract_span_char_last(a_span)->ucs == '-' ||
578 extract_span_char_last(a_span)->ucs == 0x2212 /* unicode dash */)
579 {
580 /* remove trailing '-' at end of prev line. char_t doesn't
581 contain any malloc-heap pointers so this doesn't leak. */
582 a_span->chars_num -= 1;
583 if (a_span->chars_num == 0)
584 {
585 /* The span is now empty, unlink and free it. */
586 extract_span_free(alloc, &a_span);
587
588 /* If this leaves the line empty, remove the line. */
589 if (line_a->content.base.next == &line_a->content.base)
590 {
591 extract_line_free(alloc, &line_a);
592 a--; /* Keep the index correct. */
593 }
594 }
595 }
596 else if (extract_span_char_last(a_span)->ucs == ' ')
597 {
598 }
599 else if (extract_span_char_last(a_span)->ucs == '/')
600 {
601 }
602 else
603 {
604 /* Insert space before joining adjacent lines. */
605 char_t *c_prev;
606 char_t *c = extract_span_append_c(alloc, extract_line_span_last(line_a), ' ');
607 if (c == NULL) goto end;
608 c_prev = &a_span->chars[ a_span->chars_num-2];
609 c->x = c_prev->x + c_prev->adv * a_span->ctm.a;
610 c->y = c_prev->y + c_prev->adv * a_span->ctm.c;
611 }
612
613 /* Join the two paragraphs by moving content from nearest_paragraph to paragraph_a. */
614 content_concat(&paragraph_a->content, &nearest_paragraph->content);
615
616 #if 0
617 printf("Joining to give:\n");
618 content_dump_brief(&paragraph_a->content);
619 #endif
620
621 /* Ensure that we skip nearest_paragraph in future. */
622 if (pit.next == &nearest_paragraph->base)
623 pit.next = pit.next->next;
624 extract_paragraph_free(alloc, &nearest_paragraph);
625
626 if (nearest_paragraph_b > a) {
627 /* We haven't yet tried appending any paragraphs to
628 nearest_paragraph_b, so the new extended paragraph_a needs
629 checking again. */
630 pit.next = &paragraph_a->base;
631 a -= 1;
632 } else {
633 a -= 1;
634 }
635 }
636 }
637 }
638
639 /* Sort paragraphs so they appear in correct order, using paragraphs_cmp().
640 */
641 content_sort(content, paragraphs_cmp);
642
643 ret = 0;
644
645 end:
646
647 return ret;
648 }
649
650 /* Return the last non space char of a span (or the first one if
651 * they are all spaces. */
652 static char_t *
653 last_non_space_char(span_t *span)
654 {
655 int i = span->chars_num - 1;
656
657 while (i > 0 && span->chars[i].ucs == 32)
658 i--;
659
660 return &span->chars[i];
661 }
662
663 /*
664 On entry:
665 <content> is a list of paragraphs.
666
667 On exit:
668 <content> is a list of paragraphs, with information about alignment etc.
669 */
670 static int
671 analyse_paragraphs(content_root_t *content)
672 {
673 content_paragraph_iterator pit;
674 paragraph_t *paragraph;
675
676 /* Examine each paragraph in turn. */
677 for (paragraph = content_paragraph_iterator_init(&pit, content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
678 {
679 content_line_iterator lit;
680 line_t *line;
681 double para_l = 0, para_r = 0;
682 int first_span_of_para = 1;
683 matrix4_t inverse;
684 double space_guess = 0; /* Stop clever-clever compilers warning. */
685 int previous_line_flags = -1;
686 double previous_line_spare = 0;
687 int first_line = 1;
688
689 /* Bound this paragraph on the left and right, pre-transform. */
690 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
691 {
692 content_span_iterator sit;
693 span_t *span;
694
695 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
696 {
697 char_t *lc = &span->chars[0];
698 char_t *rc = last_non_space_char(span);
699 point_t dir = { rc->adv * (1 - span->flags.wmode), rc->adv * span->flags.wmode };
700 point_t tdir = extract_matrix4_transform_point(span->ctm, dir);
701 point_t left = { lc->x, lc->y };
702 point_t right = { rc->x + tdir.x, rc->y + tdir.y };
703 double l, r;
704
705 /* We examine the ctm on the first span, and store its inverse. We then map all
706 * the coords we encounter back through this, to give us a position in a consistent
707 * source space (i.e. we don't use each different ctm we meet for different spans). */
708 if (first_span_of_para)
709 {
710 inverse = extract_matrix4_invert(&span->ctm);
711 space_guess = (span->font_bbox.max.x - span->font_bbox.min.x)/2;
712 }
713
714 left = extract_matrix4_transform_point(inverse, left);
715 right = extract_matrix4_transform_point(inverse, right);
716 l = span->flags.wmode ? left.y : left.x;
717 r = span->flags.wmode ? right.y : right.x;
718
719 if (l < para_l || first_span_of_para)
720 para_l = l;
721 if (r > para_r || first_span_of_para)
722 para_r = r;
723 first_span_of_para = 0;
724 }
725 }
726
727 /* So now we know that para_l/para_r are the bounds for this paragraph, under the ctm of the first_span. */
728
729 /* Now, look again at the lines that made up that paragraph, to figure out how well each line
730 * fills those bounds. */
731 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
732 {
733 content_span_iterator sit;
734 span_t *span;
735 double line_l = 0, line_r = 0;
736 int first_span = 1;
737
738 /* If we're not the first line, then we want to find the first word. */
739 int first_word = !first_line;
740 int word_width_found = 0;
741 point_t word_end;
742 int word_wmode;
743
744 /* For each line, find the line bounds. */
745 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
746 {
747 char_t *lc = &span->chars[0];
748 char_t *rc = last_non_space_char(span);
749 point_t dir = { 1 - span->flags.wmode, span->flags.wmode };
750 point_t tdir = extract_matrix4_transform_point(span->ctm, dir);
751 point_t left = { lc->x, lc->y };
752 point_t right = { rc->x + tdir.x * rc->adv, rc->y + tdir.y * rc->adv };
753 double l, r;
754
755 /* If we're not the first line, then calculate the length of the first word on the
756 * new line. */
757 if (first_word)
758 {
759 int i;
760
761 for (i = 0; i < span->chars_num; i++)
762 {
763 if (span->chars[i].ucs == 32)
764 break;
765 }
766 if (i > 0)
767 {
768 double adv = span->chars[i-1].adv;
769 word_end.x = span->chars[i-1].x + adv * tdir.x;
770 word_end.y = span->chars[i-1].y + adv * tdir.y;
771 word_wmode = span->flags.wmode;
772 word_width_found = 1;
773 if (i < span->chars_num)
774 first_word = 0;
775 }
776 }
777
778 left = extract_matrix4_transform_point(inverse, left);
779 right = extract_matrix4_transform_point(inverse, right);
780 l = span->flags.wmode ? left.y : left.x;
781 r = span->flags.wmode ? right.y : right.x;
782
783 if (l < line_l || first_span)
784 line_l = l;
785 if (r < line_r || first_span)
786 line_r = r;
787 first_span = 0;
788 }
789
790 #if 0
791 printf("Considering:\n");
792 content_dump_brief(&line->content);
793 printf("\n");
794 #endif
795
796 /* Did we find a width for the first word? */
797 if (word_width_found)
798 {
799 double w;
800 word_end = extract_matrix4_transform_point(inverse, word_end);
801 w = word_wmode ? word_end.y : word_end.x;
802 w -= line_l;
803 /* Now w = the extent of the word. */
804 /* If the previous line had enough room for this extent, plus a space,
805 * then we know that this paragraph can't just be breaking lines when
806 * they get full. */
807 if (previous_line_spare > w + space_guess)
808 paragraph->line_flags |= paragraph_breaks_strangely;
809 }
810
811 if (previous_line_flags != -1)
812 {
813 paragraph->line_flags |= previous_line_flags;
814 }
815 previous_line_flags = 0;
816 if (line_l > para_l + space_guess)
817 previous_line_flags |= paragraph_not_aligned_left;
818 if (line_r < para_r - space_guess)
819 previous_line_flags |= paragraph_not_aligned_right;
820 {
821 /* In order to figure out if we are plausibly centred,
822 * calculate the l and r gaps. */
823 double l = line_l - para_l;
824 double r = para_r - line_r;
825
826 if (fabs(l - r) > space_guess/2)
827 paragraph->line_flags |= paragraph_not_centred;
828 if (l > space_guess/2)
829 paragraph->line_flags |= paragraph_not_fully_justified;
830 if (r > space_guess/2)
831 previous_line_flags |= paragraph_not_fully_justified;
832 }
833 previous_line_spare = para_r - line_r + line_l - para_l;
834 first_line = 0;
835 }
836
837 if (previous_line_flags != -1)
838 {
839 paragraph->line_flags |= (previous_line_flags & paragraph_not_aligned_left);
840 }
841 }
842
843 return 0;
844 }
845
846 static int
847 spot_rotated_blocks(
848 extract_alloc_t *alloc,
849 content_root_t *lines)
850 {
851 /* On entry, we have that the content in lines has been
852 sorted so that paragraphs with the same rotation are together,
853 and sorted in order of increasing y. All we have to do here is
854 to spot a run of paragraphs with the same rotation, and to gather
855 them into a block. */
856 content_iterator cit;
857 content_t *content;
858 content_iterator cit0 = { 0 }; /* Stop clever-clever compilers warning. */
859 content_t *content0 = NULL;
860 int ret = -1;
861 matrix4_t ctm0;
862 int wmode, wmode0;
863 int ctm0_set = 0;
864
865 for (content = content_iterator_init(&cit, lines); content != NULL; content = content_iterator_next(&cit))
866 {
867 matrix4_t ctm;
868 int ctm_set = 0;
869 int flush = 0;
870
871 switch (content->type)
872 {
873 case content_paragraph:
874 {
875 double rotate;
876 span_t *span = content_first_span(&content_first_line(&((paragraph_t *)content)->content)->content);
877 wmode = span->flags.wmode;
878 ctm = span->ctm;
879 rotate = atan2(ctm.b, ctm.a);
880 /* We are not gathering rotated stuff into blocks. If the rotation returns to zero
881 * then flush any collection we might have found. Otherwise, remember that we have
882 * a ctm value set, so we can compare to it. */
883 if (rotate == 0)
884 flush = 1;
885 else
886 ctm_set = 1;
887 /* If the ctm value differs from the first ctm0 we met for the current collection,
888 * flush the collection. */
889 if (ctm0_set && (wmode != wmode0 || !matrices_are_compatible(&ctm, &ctm0, wmode0)))
890 flush = 1;
891 break;
892 }
893 default:
894 flush = 1;
895 break;
896 }
897
898 if (flush && content0)
899 {
900 /* Move [content0..content) to a block */
901 block_t *block;
902 content_t *c = content_iterator_next(&cit0);
903 /* Replace content0 with new block. */
904 if (content_replace_new_block(alloc, content0, &block)) goto end;
905 /* Insert content0 into block. */
906 content_append(&block->content, content0);
907 /* Now move the rest of the list into block too. */
908 for (; c != content; c = content_iterator_next(&cit0))
909 {
910 content_append(&block->content, c);
911 }
912 ctm0_set = 0;
913 content0 = NULL;
914 }
915 if (ctm_set && !ctm0_set)
916 {
917 ctm0 = ctm;
918 ctm0_set = 1;
919 wmode0 = wmode;
920 content0 = content;
921 cit0 = cit;
922 }
923 }
924
925 if (content0)
926 {
927 /* Move [content0..NULL) to a block */
928 block_t *block;
929 content_t *c = content_iterator_next(&cit0);
930 /* Replace content0 with new block. */
931 if (content_replace_new_block(alloc, content0, &block)) goto end;
932 /* Insert content0 into block. */
933 content_append(&block->content, content0);
934 /* Now move the rest of the list into block too. */
935 for (; c != content; c = content_iterator_next(&cit0))
936 {
937 content_append(&block->content, c);
938 }
939 }
940
941 ret = 0;
942 end:
943
944 return ret;
945 }
946
947 /* Take any content from content that falls inside rect, and append
948 * it to subset. */
949 static int
950 spans_within_rect(
951 extract_alloc_t *alloc,
952 content_root_t *content,
953 rect_t *rect,
954 content_root_t *subset)
955 {
956 content_span_iterator it;
957 span_t *candidate;
958
959 /* Make <lines> contain new span_t's and char_t's that are inside rects[]. */
960 for (candidate = content_span_iterator_init(&it, content); candidate != NULL; candidate = content_span_iterator_next(&it))
961 {
962 span_t *span;
963
964 if (candidate->chars_num == 0)
965 continue; /* In case used for table, */
966
967 /* Create a new span. */
968 if (content_new_span(alloc, &span, candidate->structure))
969 return -1;
970 /* Extract any chars from candidate that fall inside rect, inserting
971 * those chars into subset. */
972 if (span_inside_rect(alloc, candidate, rect, span))
973 return -1;
974 if (span->chars_num)
975 {
976 /* We populated it with some chars! */
977 /* Unlink span from where it was, and insert into subset. */
978 content_append_span(subset, span);
979 }
980 else
981 {
982 /* No chars found. Bin it. */
983 extract_span_free(alloc, &span);
984 }
985 span = NULL; /* Avoid us freeing on error now ownership has moved. */
986
987 if (!candidate->chars_num)
988 {
989 /* All characters in this span are inside table, so remove
990 * the vestigial span. */
991 extract_span_free(alloc, &candidate);
992 }
993 }
994
995 return 0;
996 }
997
998 static int
999 join_content(
1000 extract_alloc_t *alloc,
1001 content_root_t *lines,
1002 double master_space_guess)
1003 {
1004 if (make_lines(alloc, lines, master_space_guess))
1005 return -1;
1006 if (make_paragraphs(alloc, lines))
1007 return -1;
1008 if (analyse_paragraphs(lines))
1009 return -1;
1010 if (spot_rotated_blocks(alloc, lines))
1011 return -1;
1012
1013 return 0;
1014 }
1015
1016
1017 /* Compares two tableline_t's rectangles using x as primary key. */
1018 static int tablelines_compare_x(const void *a, const void *b)
1019 {
1020 const tableline_t *aa = a;
1021 const tableline_t *bb = b;
1022
1023 if (aa->rect.min.x > bb->rect.min.x) return +1;
1024 if (aa->rect.min.x < bb->rect.min.x) return -1;
1025 if (aa->rect.min.y > bb->rect.min.y) return +1;
1026 if (aa->rect.min.y < bb->rect.min.y) return -1;
1027
1028 return 0;
1029 }
1030
1031 /* Compares two tableline_t's rectangles using y as primary key. */
1032 static int tablelines_compare_y(const void *a, const void *b)
1033 {
1034 const tableline_t *aa = a;
1035 const tableline_t *bb = b;
1036
1037 if (aa->rect.min.y > bb->rect.min.y) return +1;
1038 if (aa->rect.min.y < bb->rect.min.y) return -1;
1039 if (aa->rect.min.x > bb->rect.min.x) return +1;
1040 if (aa->rect.min.x < bb->rect.min.x) return -1;
1041
1042 return 0;
1043 }
1044
1045 /* Makes <out> to contain all lines in <all> with y coordinate in the range
1046 y_min..y_max. */
1047 static int
1048 table_find_y_range(
1049 extract_alloc_t *alloc,
1050 tablelines_t *all,
1051 double y_min,
1052 double y_max,
1053 tablelines_t *out)
1054 {
1055 int i;
1056
1057 for (i=0; i<all->tablelines_num; ++i)
1058 {
1059 if (all->tablelines[i].rect.min.y >= y_min && all->tablelines[i].rect.min.y < y_max)
1060 {
1061 if (extract_realloc(alloc, &out->tablelines, sizeof(*out->tablelines) * (out->tablelines_num + 1))) return -1;
1062 out->tablelines[out->tablelines_num] = all->tablelines[i];
1063 out->tablelines_num += 1;
1064 }
1065 else
1066 {
1067 outf("Excluding line because outside y=%f..%f: %s", y_min, y_max, extract_rect_string(&all->tablelines[i].rect));
1068 }
1069 }
1070
1071 return 0;
1072 }
1073
1074
1075 /* Returns one if a_min..a_max significantly overlapps b_min..b_max, otherwise
1076 zero. */
1077 static int
1078 overlap(double a_min, double a_max, double b_min, double b_max)
1079 {
1080 double overlap;
1081 int ret0;
1082 int ret1;
1083
1084 assert(a_min < a_max);
1085 assert(b_min < b_max);
1086 if (b_min < a_min) b_min = a_min;
1087 if (b_max > a_max) b_max = a_max;
1088 if (b_max < b_min) b_max = b_min;
1089 overlap = (b_max - b_min) / (a_max - a_min);
1090 ret0 = overlap > 0.2;
1091 ret1 = overlap > 0.8;
1092 if (ret0 != ret1)
1093 {
1094 if (0) outf0("warning, unclear overlap=%f: a=%f..%f b=%f..%f", overlap, a_min, a_max, b_min, b_max);
1095 }
1096
1097 return overlap > 0.8;
1098 }
1099
1100 void extract_cell_init(cell_t *cell)
1101 {
1102 cell->rect.min.x = 0;
1103 cell->rect.min.y = 0;
1104 cell->rect.max.x = 0;
1105 cell->rect.max.y = 0;
1106 cell->above = 0;
1107 cell->left = 0;
1108 cell->extend_right = 0;
1109 cell->extend_down = 0;
1110 content_init_root(&cell->content, NULL);
1111 }
1112
1113
1114 /* Find cell extensions to right and down by looking at cells' .left and
1115 above flags.
1116
1117 For example for adjacent cells ABC..., we extend A to include cells BC..
1118 until we reach a cell with .left set to one.
1119
1120 ABCDE
1121 FGHIJ
1122 KLMNO
1123
1124 When looking to extend cell A, we only look at cells in the same column or
1125 same row, (i.e. in the above example we look at BCDE and FK, and not at
1126 GHIJ and LMNO).
1127
1128 For example if BCDE have no left lines and FK have no above lines, we
1129 ignore any lines in GHIJ and LMNO and make A extend to the entire 3x4
1130 box. Having found this box, we set .above=0 and .left to 0 in all enclosed
1131 cells, which simplifies html table generation code.
1132 */
1133 static int table_find_extend(cell_t **cells, int cells_num_x, int cells_num_y)
1134 {
1135 int y;
1136
1137 for (y=0; y<cells_num_y; ++y)
1138 {
1139 int x;
1140 for (x=0; x<cells_num_x; ++x)
1141 {
1142 cell_t* cell = cells[y * cells_num_x + x];
1143 outf("xy=(%i %i) above=%i left=%i", x, y, cell->above, cell->left);
1144 if (cell->left && cell->above)
1145 {
1146 /* See how far this cell extends to right and down. */
1147 int xx;
1148 int yy;
1149 for (xx=x+1; xx<cells_num_x; ++xx)
1150 {
1151 if (cells[y * cells_num_x + xx]->left) break;
1152 }
1153 cell->extend_right = xx - x;
1154 cell->rect.max.x = cells[y * cells_num_x + xx-1]->rect.max.x;
1155 for (yy=y+1; yy<cells_num_y; ++yy)
1156 {
1157 if (cells[yy * cells_num_x + x]->above) break;
1158 }
1159 cell->extend_down = yy - y;
1160 cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y;
1161
1162 /* Clear .above and .left in enclosed cells. */
1163 for (xx = x; xx < x + cell->extend_right; ++xx)
1164 {
1165 int yy;
1166 for (yy = y; yy < y + cell->extend_down; ++yy)
1167 {
1168 cell_t* cell2 = cells[cells_num_x * yy + xx];
1169 if ( xx==x && yy==y)
1170 {}
1171 else
1172 {
1173 if (xx==x)
1174 {
1175 cell2->extend_right = cell->extend_right;
1176 }
1177 cell2->above = 0;
1178 /* We set .left to 1 for left-most cells - e.g. F
1179 and K in the above diagram; this allows us to
1180 generate correct html without lots of recursing
1181 looking for extend_down in earlier cells. */
1182 cell2->left = (xx == x);
1183 outf("xy=(%i %i) xxyy=(%i %i) have set cell2->above=%i left=%i",
1184 x, y, xx, yy, cell2->above, cell2->left
1185 );
1186 }
1187 }
1188 }
1189 }
1190 }
1191 }
1192 return 0;
1193 }
1194
1195
1196 /* Sets each cell to contain the text that is within the cell's boundary. We
1197 remove any found text from the page. */
1198 static int
1199 table_find_cells_text(
1200 extract_alloc_t *alloc,
1201 subpage_t *subpage,
1202 cell_t **cells,
1203 int cells_num_x,
1204 int cells_num_y,
1205 double master_space_guess)
1206 {
1207 /* Find text within each cell. We don't attempt to handle images within
1208 cells. */
1209 int e = -1;
1210 int i;
1211 int cells_num = cells_num_x * cells_num_y;
1212 table_t *table;
1213
1214 for (i=0; i<cells_num; ++i)
1215 {
1216 cell_t* cell = cells[i];
1217 if (!cell->above || !cell->left) continue;
1218
1219 if (spans_within_rect(alloc, &subpage->content, &cell->rect, &cell->content))
1220 return -1;
1221 if (join_content(alloc, &cell->content, master_space_guess))
1222 return -1;
1223 }
1224
1225 /* Append the table we have found to page->tables[]. */
1226 if (content_append_new_table(alloc, &subpage->tables, &table)) goto end;
1227 table->pos.x = cells[0]->rect.min.x;
1228 table->pos.y = cells[0]->rect.min.y;
1229 table->cells = cells;
1230 table->cells_num_x = cells_num_x;
1231 table->cells_num_y = cells_num_y;
1232
1233 if (0)
1234 {
1235 /* For debugging. */
1236 int y;
1237 outf0("table:\n");
1238 for (y=0; y<cells_num_y; ++y)
1239 {
1240 int x;
1241 for (x=0; x<cells_num_x; ++x)
1242 {
1243 cell_t* cell = cells[cells_num_x * y + x];
1244 fprintf(stderr, " %c%c x=%i y=% 3i 3i w=%i h=%i",
1245 cell->left ? '|' : ' ',
1246 cell->above ? '-' : ' ',
1247 x,
1248 y,
1249 cell->extend_right,
1250 cell->extend_down
1251 );
1252 }
1253 fprintf(stderr, "\n");
1254 }
1255
1256 }
1257
1258 e = 0;
1259 end:
1260
1261 return e;
1262 }
1263
1264
1265 /* Finds single table made from lines whose y coordinates are in the range
1266 y_min..y_max. */
1267 static int
1268 table_find(extract_alloc_t *alloc, subpage_t *subpage, double y_min, double y_max, double master_space_guess)
1269 {
1270 tablelines_t *all_h = &subpage->tablelines_horizontal;
1271 tablelines_t *all_v = &subpage->tablelines_vertical;
1272 int e = -1;
1273 int i;
1274
1275 /* Find subset of vertical and horizontal lines that are within range
1276 y_min..y_max, and sort by y coordinate. */
1277 tablelines_t tl_h = {NULL, 0};
1278 tablelines_t tl_v = {NULL, 0};
1279 cell_t **cells = NULL;
1280 int cells_num = 0;
1281 int cells_num_x = 0;
1282 int cells_num_y = 0;
1283 int x;
1284 int y;
1285
1286 outf("y=(%f %f)", y_min, y_max);
1287
1288 if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end;
1289 if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end;
1290 /* Suppress false coverity warning - qsort() does not dereference null
1291 pointer if nmemb is zero. */
1292 /* coverity[var_deref_model] */
1293 qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x);
1294
1295 if (0)
1296 {
1297 /* Show raw lines info. */
1298 outf0("all_h->tablelines_num=%i tl_h.tablelines_num=%i", all_h->tablelines_num, tl_h.tablelines_num);
1299 for (i=0; i<tl_h.tablelines_num; ++i)
1300 {
1301 outf0(" %i: %s", i, extract_rect_string(&tl_h.tablelines[i].rect));
1302 }
1303
1304 outf0("all_v->tablelines_num=%i tl_v.tablelines_num=%i", all_v->tablelines_num, tl_v.tablelines_num);
1305 for (i=0; i<tl_v.tablelines_num; ++i)
1306 {
1307 outf0(" %i: %s", i, extract_rect_string(&tl_v.tablelines[i].rect));
1308 }
1309 }
1310 /* Find the cells defined by the vertical and horizontal lines.
1311
1312 It seems that lines can be disjoint, e.g. what looks like a single
1313 horizontal line could be made up of multiple lines all with the same
1314 y coordinate, so we use i_next and j_next to skip these sublines when
1315 iterating. */
1316 cells = NULL;
1317 cells_num = 0;
1318 cells_num_x = 0;
1319 cells_num_y = 0;
1320 for (i=0; i<tl_h.tablelines_num; )
1321 {
1322 int i_next;
1323 int j;
1324 for (i_next=i+1; i_next<tl_h.tablelines_num; ++i_next)
1325 {
1326 if (tl_h.tablelines[i_next].rect.min.y - tl_h.tablelines[i].rect.min.y > 5) break;
1327 }
1328 if (i_next == tl_h.tablelines_num)
1329 {
1330 /* Ignore last row of points - cells need another row below. */
1331 break;
1332 }
1333 cells_num_y += 1;
1334
1335 for (j=0; j<tl_v.tablelines_num; )
1336 {
1337 int j_next;
1338 int ii;
1339 int jj;
1340 cell_t* cell;
1341
1342 for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next)
1343 {
1344 if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break;
1345 }
1346 outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect));
1347
1348 if (j_next == tl_v.tablelines_num) break;
1349
1350 if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end;
1351 if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end;
1352 cell = cells[cells_num];
1353 cells_num += 1;
1354 if (i==0) cells_num_x += 1;
1355
1356 cell->rect.min.x = tl_v.tablelines[j].rect.min.x;
1357 cell->rect.min.y = tl_h.tablelines[i].rect.min.y;
1358 cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x;
1359 cell->rect.max.y = (i_next < tl_h.tablelines_num) ? tl_h.tablelines[i_next].rect.min.y : cell->rect.min.y;
1360 cell->above = (i==0);
1361 cell->left = (j==0);
1362 cell->extend_right = 1;
1363 cell->extend_down = 1;
1364 content_init_root(&cell->content, NULL);
1365
1366 /* Set cell->above if there is a horizontal line above the cell. */
1367 outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect));
1368 for (ii = i; ii < i_next; ++ii)
1369 {
1370 tableline_t* h = &tl_h.tablelines[ii];
1371 if (overlap(
1372 cell->rect.min.x,
1373 cell->rect.max.x,
1374 h->rect.min.x,
1375 h->rect.max.x
1376 ))
1377 {
1378 cell->above = 1;
1379 break;
1380 }
1381 }
1382
1383 /* Set cell->left if there is a vertical line to the left of the cell. */
1384 for (jj = j; jj < j_next; ++jj)
1385 {
1386 tableline_t* v = &tl_v.tablelines[jj];
1387 if (overlap(
1388 cell->rect.min.y,
1389 cell->rect.max.y,
1390 v->rect.min.y,
1391 v->rect.max.y
1392 ))
1393 {
1394 cell->left = 1;
1395 break;
1396 }
1397 }
1398
1399 j = j_next;
1400 }
1401
1402 i = i_next;
1403 }
1404
1405 assert(cells_num == cells_num_x * cells_num_y);
1406
1407 /* Remove cols and rows where no cells have .above and .left - these
1408 will not appear. It also avoids spurious empty columns when table uses
1409 closely-spaced double lines as separators. */
1410 for (x=0; x<cells_num_x; ++x)
1411 {
1412 int has_cells = 0;
1413 for (y=0; y<cells_num_y; ++y)
1414 {
1415 cell_t* cell = cells[y * cells_num_x + x];
1416 if (cell->above && cell->left)
1417 {
1418 has_cells = 1;
1419 break;
1420 }
1421 }
1422 if (!has_cells)
1423 {
1424 /* Remove column <x>. */
1425 int j = 0;
1426 outf("Removing column %i. cells_num=%i cells_num_x=%i cells_num_y=%i", x, cells_num, cells_num_x, cells_num_y);
1427 for (i=0; i<cells_num; ++i)
1428 {
1429 if (i % cells_num_x == x)
1430 {
1431 extract_cell_free(alloc, &cells[i]);
1432 continue;
1433 }
1434 cells[j] = cells[i];
1435 j += 1;
1436 }
1437 cells_num -= cells_num_y;
1438 cells_num_x -= 1;
1439 }
1440 }
1441
1442 if (cells_num == 0)
1443 {
1444 e = 0;
1445 goto end;
1446 }
1447
1448 if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end;
1449
1450 if (table_find_cells_text(alloc, subpage, cells, cells_num_x, cells_num_y, master_space_guess)) goto end;
1451
1452 e = 0;
1453 end:
1454
1455 extract_free(alloc, &tl_h.tablelines);
1456 extract_free(alloc, &tl_v.tablelines);
1457 if (e)
1458 {
1459 for (i=0; i<cells_num; ++i)
1460 {
1461 extract_cell_free(alloc, &cells[i]);
1462 }
1463 extract_free(alloc, &cells);
1464 }
1465
1466 return e;
1467 }
1468
1469
1470 /* Finds tables in <page> by looking for lines in page->tablelines_horizontal
1471 and page->tablelines_vertical that look like table dividers.
1472
1473 Any text found inside tables is removed from page->spans[].
1474 */
1475 static int extract_subpage_tables_find_lines(
1476 extract_alloc_t *alloc,
1477 subpage_t *subpage,
1478 double master_space_guess)
1479 {
1480 double miny;
1481 double maxy;
1482 double margin = 1;
1483 int iv;
1484 int ih;
1485 outf("page->tablelines_horizontal.tablelines_num=%i", subpage->tablelines_horizontal.tablelines_num);
1486 outf("page->tablelines_vertical.tablelines_num=%i", subpage->tablelines_vertical.tablelines_num);
1487
1488 /* Sort all lines by y coordinate. */
1489 qsort(subpage->tablelines_horizontal.tablelines,
1490 subpage->tablelines_horizontal.tablelines_num,
1491 sizeof(*subpage->tablelines_horizontal.tablelines),
1492 tablelines_compare_y);
1493 qsort(subpage->tablelines_vertical.tablelines,
1494 subpage->tablelines_vertical.tablelines_num,
1495 sizeof(*subpage->tablelines_vertical.tablelines),
1496 tablelines_compare_y);
1497
1498 if (0)
1499 {
1500 /* Show info about lines. */
1501 int i;
1502 outf0("tablelines_horizontal:");
1503 for (i=0; i<subpage->tablelines_horizontal.tablelines_num; ++i)
1504 {
1505 outf0(" color=%f: %s",
1506 subpage->tablelines_horizontal.tablelines[i].color,
1507 extract_rect_string(&subpage->tablelines_horizontal.tablelines[i].rect)
1508 );
1509 }
1510 outf0("tablelines_vertical:");
1511 for (i=0; i<subpage->tablelines_vertical.tablelines_num; ++i)
1512 {
1513 outf0(" color=%f: %s",
1514 subpage->tablelines_vertical.tablelines[i].color,
1515 extract_rect_string(&subpage->tablelines_vertical.tablelines[i].rect)
1516 );
1517 }
1518 }
1519
1520 /* Look for completely separate vertical regions that define different
1521 tables, by looking for vertical gaps between the rects of each
1522 horizontal/vertical line. */
1523 maxy = -DBL_MAX;
1524 miny = -DBL_MAX;
1525 iv = 0;
1526 ih = 0;
1527 for(;;)
1528 {
1529 tableline_t *tlv = NULL;
1530 tableline_t *tlh = NULL;
1531 tableline_t *tl;
1532 if (iv < subpage->tablelines_vertical.tablelines_num)
1533 {
1534 tlv = &subpage->tablelines_vertical.tablelines[iv];
1535 }
1536 /* We only consider horizontal lines that are not white. This is a bit
1537 of a cheat to get the right behaviour with twotables_2.pdf. */
1538 while (ih < subpage->tablelines_horizontal.tablelines_num)
1539 {
1540 if (subpage->tablelines_horizontal.tablelines[ih].color == 1)
1541 {
1542 /* Ignore white horizontal lines. */
1543 ++ih;
1544 }
1545 else
1546 {
1547 tlh = &subpage->tablelines_horizontal.tablelines[ih];
1548 break;
1549 }
1550 }
1551 if (tlv && tlh)
1552 {
1553 tl = (tlv->rect.min.y < tlh->rect.min.y) ? tlv : tlh;
1554 }
1555 else if (tlv) tl = tlv;
1556 else if (tlh) tl = tlh;
1557 else break;
1558 if (tl == tlv) iv += 1;
1559 else ih += 1;
1560 if (tl->rect.min.y > maxy + margin)
1561 {
1562 if (maxy > miny)
1563 {
1564 outf("New table. maxy=%f miny=%f", maxy, miny);
1565 /* Find table. */
1566 table_find(alloc, subpage, miny - margin, maxy + margin, master_space_guess);
1567 }
1568 miny = tl->rect.min.y;
1569 }
1570 if (tl->rect.max.y > maxy) maxy = tl->rect.max.y;
1571 }
1572
1573 /* Find last table. */
1574 table_find(alloc, subpage, miny - margin, maxy + margin, master_space_guess);
1575
1576 return 0;
1577 }
1578
1579
1580 /* For debugging only. */
1581 static void show_tables(content_root_t *tables)
1582 {
1583 content_table_iterator tit;
1584 table_t *table;
1585
1586 outf0("tables_num=%i", content_count_tables(tables));
1587 for (table = content_table_iterator_init(&tit, tables); table != NULL; table = content_table_iterator_next(&tit))
1588 {
1589 int y;
1590 outf0("table: cells_num_y=%i cells_num_x=%i", table->cells_num_y, table->cells_num_x);
1591 for (y=0; y<table->cells_num_y; ++y)
1592 {
1593 int x;
1594 for (x=0; x<table->cells_num_x; ++x)
1595 {
1596 cell_t* cell = table->cells[table->cells_num_x * y + x];
1597 outf0("cell: y=% 3i x=% 3i: left=%i above=%i rect=%s",
1598 y, x, cell->left, cell->above, extract_rect_string(&cell->rect));
1599 }
1600 }
1601 }
1602 }
1603
1604 /* Find tables in <page>.
1605
1606 At the moment this only calls extract_page_tables_find_lines(), but in future
1607 will call other functions that find tables in different ways, e.g. by analysing
1608 an image of a page, or looking for blocks of whitespace in between chunks of
1609 text. */
1610 static int
1611 extract_subpage_tables_find(
1612 extract_alloc_t *alloc,
1613 subpage_t *subpage,
1614 double master_space_guess)
1615 {
1616 if (extract_subpage_tables_find_lines(alloc, subpage, master_space_guess)) return -1;
1617
1618 if (0)
1619 {
1620 outf0("=== tables from extract_page_tables_find_lines():");
1621 show_tables(&subpage->tables);
1622 }
1623
1624 return 0;
1625 }
1626
1627 /* Finds tables and paragraphs on <page>. */
1628 static int
1629 extract_join_subpage(
1630 extract_alloc_t *alloc,
1631 subpage_t *subpage,
1632 double master_space_guess)
1633 {
1634 /* Find tables on this page first. This will remove text that is within
1635 tables from page->spans, so that text doesn't appear more than once in
1636 the final output. */
1637 if (extract_subpage_tables_find(alloc, subpage, master_space_guess)) return -1;
1638
1639 /* Now join remaining spans into lines and paragraphs. */
1640 if (join_content(alloc, &subpage->content, master_space_guess))
1641 return -1;
1642
1643 return 0;
1644 }
1645
1646
1647 /* For each page in <document> we find tables and join spans into lines and paragraphs.
1648
1649 A line is a list of spans that are at the same angle and on the same
1650 line. A paragraph is a list of lines that are at the same angle and close
1651 together.
1652 */
1653 int extract_document_join(extract_alloc_t *alloc, document_t *document, int layout_analysis, double master_space_guess)
1654 {
1655 int p;
1656
1657 for (p=0; p<document->pages_num; ++p) {
1658 extract_page_t* page = document->pages[p];
1659 int c;
1660
1661 /* If we have layout analysis enabled, then we do our 'boxer' analysis to
1662 * try to spot subdivisions and subpages. */
1663 if (layout_analysis && extract_page_analyse(alloc, page)) return -1;
1664
1665 for (c=0; c<page->subpages_num; ++c) {
1666 subpage_t* subpage = page->subpages[c];
1667
1668 outf("processing page %i, subpage %i: num_spans=%i", p, c, content_count_spans(&subpage->content));
1669 if (extract_join_subpage(alloc, subpage, master_space_guess)) return -1;
1670 }
1671 }
1672
1673 return 0;
1674 }