comparison mupdf-source/source/fitz/stext-para.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24
25 #include <assert.h>
26
27 /* #define DEBUG_SPLITS */
28
29 /* #define DEBUG_PARA_SPLITS */
30
31 static void
32 recalc_bbox(fz_stext_block *block)
33 {
34 fz_rect bbox = fz_empty_rect;
35 fz_stext_line *line;
36
37 for (line = block->u.t.first_line; line != NULL; line = line->next)
38 bbox = fz_union_rect(bbox, line->bbox);
39
40 block->bbox = bbox;
41 }
42
/* Underline classification of the horizontal line currently being
 * scanned by the underlined-title pass. */
typedef enum
{
	UNDERLINE_UNKNOWN = 0,	/* No character of the line examined yet. */
	UNDERLINE_YES = 1,	/* Every character examined so far is underlined. */
	UNDERLINE_NO = 2,	/* No character examined so far is underlined. */
	UNDERLINE_MIXED = 3	/* Underlined and plain characters both seen. */
} underline_state;
50
51 /* Some crap heuristics to spot a bold font. */
52 static int
53 font_is_bold(fz_font *font)
54 {
55 const char *c;
56
57 if (font == NULL)
58 return 0;
59 if (font->flags.is_bold)
60 return 1;
61
62 if (fz_strstrcase(font->name, "Bold") != NULL)
63 return 1;
64 if (fz_strstrcase(font->name, "Black") != NULL)
65 return 1;
66 if (fz_strstrcase(font->name, "Medium") != NULL)
67 return 0;
68 if (fz_strstrcase(font->name, "Light") != NULL)
69 return 0;
70
71 c = fz_strstr(font->name, " B");
72 if (c && (c[2] == ' ' || c[2] == 0))
73 return 1;
74
75 return 0;
76 }
77
78 /* Check to see if lines move left to right and downwards. */
79 /* FIXME: Maybe allow right to left? checking unicode values? */
80 static int
81 lines_move_plausibly_like_paragraph(fz_stext_block *block)
82 {
83 fz_stext_line *line;
84 int firstline = 1;
85 float line_height, line_x, line_y;
86
87 /* Do the lines that make up this block move in an appropriate way? */
88 for (line = block->u.t.first_line; line != NULL; line = line->next)
89 {
90 float x = (line->bbox.x0 + line->bbox.x1)/2;
91 float y = (line->bbox.y0 + line->bbox.y1)/2;
92 float height = line->bbox.y1 - line->bbox.y0;
93 fz_stext_char *ch;
94
95 /* Ignore any completely empty lines */
96 for (ch = line->first_char; ch != NULL; ch = ch->next)
97 if (ch->c != ' ')
98 break;
99 if (ch == NULL)
100 continue;
101
102 if (firstline)
103 {
104 line_height = height;
105 line_x = x;
106 line_y = y;
107 firstline = 0;
108 }
109 else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
110 {
111 /* We are plausibly the same line. Only accept if we move right. */
112 if (x < line_x)
113 return 0;
114 else
115 line_x = x;
116 }
117 else if (line_y < y)
118 {
119 /* Moving downwards. Plausible. */
120 line_y = y;
121 line_height = height;
122 line_x = x;
123 }
124 else
125 {
126 /* Nothing else is plausible. */
127 return 0;
128 }
129 }
130 return 1;
131 }
132
133 #ifdef DEBUG_SPLITS
134 static void dump_line(fz_context *ctx, const char *str, fz_stext_line *line)
135 {
136 fz_stext_char *ch;
137
138 if (str)
139 fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", str);
140
141 if (line == NULL)
142 return;
143
144 for (ch = line->first_char; ch != NULL; ch = ch->next)
145 fz_write_printf(ctx, fz_stddbg(ctx), "%c", (char)ch->c);
146 fz_write_printf(ctx, fz_stddbg(ctx), "\n");
147 }
148
149 static void dump_block(fz_context *ctx, const char *fmt, fz_stext_block *block)
150 {
151 fz_stext_line *line;
152
153 fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", fmt);
154 if (block == NULL || block->type != FZ_STEXT_BLOCK_TEXT)
155 return;
156
157 for (line = block->u.t.first_line; line != NULL; line = line->next)
158 dump_line(ctx, NULL, line);
159 }
160 #endif
161
162 typedef struct
163 {
164 fz_pool *pool;
165 fz_stext_struct *parent;
166 int idx;
167 fz_stext_block **pfirst;
168 fz_stext_block **plast;
169 } stext_pos;
170
171 static fz_stext_block *split_block_at_line(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_stext_line *line)
172 {
173 fz_stext_block *newblock = fz_pool_alloc(ctx, pos->pool, sizeof *newblock);
174
175 #ifdef DEBUG_SPLITS
176 dump_block(ctx, "Splitting:", block);
177 dump_line(ctx, "At line:", line);
178 #endif
179
180 newblock->bbox = fz_empty_rect;
181 newblock->prev = block;
182 newblock->next = block->next;
183 if (block->next)
184 block->next->prev = newblock;
185 else
186 {
187 assert(*pos->plast == block);
188 *pos->plast = newblock;
189 }
190 block->next = newblock;
191 newblock->type = FZ_STEXT_BLOCK_TEXT;
192 newblock->u.t.flags = block->u.t.flags;
193 newblock->u.t.first_line = line;
194 newblock->u.t.last_line = block->u.t.last_line;
195 block->u.t.last_line = line->prev;
196 line->prev->next = NULL;
197 line->prev = NULL;
198 recalc_bbox(block);
199 recalc_bbox(newblock);
200
201 #ifdef DEBUG_SPLITS
202 dump_block(ctx, "Giving:", block);
203 dump_block(ctx, "and:", newblock);
204 #endif
205
206 return newblock;
207 }
208
209 /* Convert a block to being a struct that contains just that block. */
210 static void block_to_struct(fz_context *ctx, stext_pos *pos, fz_stext_block *block, int structtype)
211 {
212 fz_stext_struct *str = fz_pool_alloc_flexible(ctx, pos->pool, fz_stext_struct, raw, 1);
213 fz_stext_block *new_block = fz_pool_alloc(ctx, pos->pool, sizeof(*new_block));
214
215 str->up = block;
216 str->parent = pos->parent;
217 str->first_block = new_block;
218 str->last_block = new_block;
219 str->standard = structtype;
220 str->raw[0] = 0;
221
222 new_block->type = block->type;
223 new_block->bbox = block->bbox;
224 new_block->u = block->u;
225
226 block->type = FZ_STEXT_BLOCK_STRUCT;
227 block->u.s.down = str;
228 block->u.s.index = pos->idx++;
229 }
230
231 /*
232 We are going to repeatedly walk the lines that make up a block.
233 To reduce the boilerplate here, we'll use a line_walker function.
234 This will call a bunch of callbacks as it goes.
235
236 newline_fn Called whenever we move to a new horizontal line (i.e.
237 as if we've got a newline). This is not the same as being
238 called every fz_stext_line, as we frequently get multiple
239 fz_stext_line's on a single horizontal line. If this returns
240 0, execution continues. Return 1 to stop the walking.
241 line_fn Called for every fz_stext_line (typically used to process
242 characters).
243 end_fn Called at the end of the block (with line being the final
244 line of the block.
245 arg An opaque pointer passed to all the callbacks.
246 */
247 typedef int (line_walker_newline_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height);
248 typedef int (line_walker_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
249 typedef void (line_walker_end_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg);
250
251 static void
252 line_walker(fz_context *ctx, fz_stext_block *block, line_walker_newline_fn *newline_fn, line_walker_fn *line_fn, line_walker_end_fn *end_fn, void *arg)
253 {
254 int firstline = 1;
255 fz_stext_line *line;
256 float line_height, line_y;
257
258 if (block->u.t.first_line == NULL)
259 return;
260
261 for (line = block->u.t.first_line; line != NULL; line = line->next)
262 {
263 float y = (line->bbox.y0 + line->bbox.y1)/2;
264 float height = line->bbox.y1 - line->bbox.y0;
265
266 if (line->first_char == NULL)
267 continue; /* Should never happen, but makes life easier to assume this later. */
268
269 if (firstline)
270 {
271 line_height = height;
272 firstline = 0;
273 line_y = y;
274 }
275 else if (line_y - line_height/2 < y && line_y + line_height/2 > y)
276 {
277 /* We are plausibly the same horizontal line. */
278 }
279 else if (line_y < y)
280 {
281 /* Moving downwards. */
282 line_height = height;
283 line_y = y;
284 if (newline_fn && newline_fn(ctx, block, line, arg, line_height))
285 return;
286 }
287 if (line_fn && line_fn(ctx, block, line, arg))
288 return;
289 }
290 if (end_fn)
291 end_fn(ctx, block, block->u.t.last_line, arg);
292 }
293
294 /* We scan through the block, collecting lines up that look
295 * "title-ish" (by which here, we mean "are completely
296 * underlined"). As soon as we finish such a region, we split
297 * the block (either before or after it as appropriate), and
298 * mark it as a title.
299 *
300 * e.g.
301 *
302 * _THIS_IS_LIKELY_A
303 * _TITLE_ ___ < BREAK HERE
304 * Lorem ipsum dolor sit
305 * amet, consectetur
306 * adipiscing elit. ___ < BREAK HERE
307 * _LIKELY_ANOTHER_TITLE_ ____< BREAK HERE
308 * Sed do eiusmod tempor
309 * incididunt ut labore
310 * et dolore magna aliqua.
311 */
312 typedef struct
313 {
314 stext_pos *pos;
315 fz_stext_line *title_start;
316 fz_stext_line *title_end;
317 underline_state underlined;
318 int changed;
319 } underlined_data;
320
321 static int
322 underlined_break(fz_context *ctx, fz_stext_block *block, underlined_data *data)
323 {
324 fz_stext_line *line;
325
326 /* We have a block that looks like a title. */
327 if (data->title_start != block->u.t.first_line)
328 {
329 /* We need to split the block before title_start */
330 line = data->title_start;
331 }
332 else if (data->title_end != block->u.t.last_line)
333 {
334 /* We need to split the block after title_end */
335 line = data->title_end->next;
336 }
337 else
338 {
339 /* This block is already entirely title. */
340 line = NULL;
341 }
342 if (line)
343 {
344 (void)split_block_at_line(ctx, data->pos, block, line);
345 data->changed = 1;
346 if (line == data->title_start)
347 {
348 /* Don't label the latter part as a title yet, we'll do it when
349 * we step back in, but we don't know how much of the latter
350 * block is title yet. */
351 }
352 else
353 {
354 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
355 }
356 }
357 else
358 {
359 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
360 }
361 return 1;
362 }
363
364 static int
365 underlined_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
366 {
367 underlined_data *data = (underlined_data *)arg;
368
369 if (data->underlined == UNDERLINE_YES)
370 {
371 /* Add the line we've just finished to the start/stop region */
372 if (data->title_start == NULL)
373 data->title_start = line->prev;
374 data->title_end = line->prev;
375 }
376 else if (data->title_start != NULL)
377 {
378 /* We've reached the end of a title region. */
379 return underlined_break(ctx, block, data);
380 }
381 data->underlined = UNDERLINE_UNKNOWN;
382
383 return 0;
384 }
385
386 static int
387 underlined_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
388 {
389 underlined_data *data = (underlined_data *)arg;
390 fz_stext_char *ch;
391
392 /* If we already know that this line is mixed underlined, then no point in
393 * wasting time. */
394 if (data->underlined == UNDERLINE_MIXED)
395 return 0;
396
397 /* If we haven't started looking yet, prime the value. */
398 if (data->underlined == UNDERLINE_UNKNOWN)
399 data->underlined = (line->first_char->flags & FZ_STEXT_UNDERLINE) ? UNDERLINE_YES : UNDERLINE_NO;
400
401 /* Check that all the rest of the the chars match our expected value. */
402 for (ch = line->first_char; ch != NULL; ch = ch->next)
403 if ((!!(ch->flags & FZ_STEXT_UNDERLINE)) ^ (data->underlined == UNDERLINE_YES))
404 {
405 /* Differs! So, Mixed. */
406 data->underlined = UNDERLINE_MIXED;
407 break;
408 }
409
410 return 0;
411 }
412
413 static void
414 underlined_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
415 {
416 underlined_data *data = (underlined_data *)arg;
417
418 if (data->underlined == UNDERLINE_YES)
419 {
420 /* Add the line we've just finished to the start/stop region */
421 if (data->title_start == NULL)
422 data->title_start = block->u.t.last_line;
423 data->title_end = block->u.t.last_line;
424 }
425
426 /* If we didn't find a region, bale. */
427 if (data->title_start)
428 underlined_break(ctx, block, data);
429 }
430
431 static int
432 detect_underlined_titles(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
433 {
434 /* Let's do the title scanning, where our criteria is
435 * "the entire line is underlined". */
436 underlined_data data[1];
437
438 data->pos = pos;
439 data->title_start = NULL;
440 data->title_end = NULL;
441 data->underlined = UNDERLINE_UNKNOWN;
442 data->changed = 0;
443
444 line_walker(ctx, block, underlined_newline, underlined_line, underlined_end, data);
445
446 return data->changed;
447 }
448
449
450 /* Now we scan again, where the 'title' criteria is based upon
451 * the titles being entirely in a different font. */
452 typedef struct
453 {
454 stext_pos *pos;
455 fz_stext_line *title_start;
456 fz_stext_line *title_end;
457 fz_font *font;
458 int changed;
459 } font_data;
460
461 #define MIXED_FONT ((fz_font *)1)
462
463 static int
464 font_break(fz_context *ctx, fz_stext_block *block, font_data *data)
465 {
466 fz_stext_line *line;
467
468 /* We have a block that looks like a title. */
469 if (data->title_start != block->u.t.first_line)
470 {
471 /* We need to split the block before title_start */
472 line = data->title_start;
473 }
474 else if (data->title_end != block->u.t.last_line)
475 {
476 /* We need to split the block after title_end */
477 line = data->title_end->next;
478 }
479 else
480 {
481 /* This block is already entirely title. */
482 line = NULL;
483 }
484 if (line)
485 {
486 (void)split_block_at_line(ctx, data->pos, block, line);
487 data->changed = 1;
488 if (line == data->title_start)
489 {
490 /* Don't label the latter part as a title yet, we'll do it when
491 * we step back in, but we don't know how much of the latter
492 * block is title yet. */
493 }
494 else
495 {
496 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
497 }
498 }
499 else
500 {
501 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H);
502 }
503
504 return 1;
505 }
506
507 static int
508 font_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
509 {
510 font_data *data = (font_data *)arg;
511
512 if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
513 {
514 /* Add the line we've just finished to the start/stop region */
515 if (data->title_start == NULL)
516 data->title_start = line->prev;
517 data->title_end = line->prev;
518 }
519 else if (data->title_start != NULL)
520 {
521 /* We've reached the end of a title region. */
522 return font_break(ctx, block, data);
523 }
524 data->font = NULL;
525
526 return 0;
527 }
528
529 static int
530 font_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
531 {
532 font_data *data = (font_data *)arg;
533 fz_stext_char *ch;
534
535 /* If we already know that this line is mixed fonts, then no point in
536 * wasting time. */
537 if (data->font == MIXED_FONT)
538 return 0;
539
540 /* If we are just starting, prime it. */
541 if (data->font == NULL)
542 data->font = line->first_char->font;
543
544 for (ch = line->first_char; ch != NULL; ch = ch->next)
545 if (ch->font != data->font)
546 {
547 data->font = MIXED_FONT;
548 break;
549 }
550
551 return 0;
552 }
553
554 static void
555 font_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
556 {
557 font_data *data = (font_data *)arg;
558
559 if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font))
560 {
561 /* Add the line we've just finished to the start/stop region */
562 if (data->title_start == NULL)
563 data->title_start = block->u.t.last_line;
564 data->title_end = block->u.t.last_line;
565 }
566
567 if (data->title_start)
568 font_break(ctx, block, data);
569 }
570
571 static int
572 detect_titles_by_font_usage(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
573 {
574 font_data data[1];
575
576 data->pos = pos;
577 data->title_start = NULL;
578 data->title_end = NULL;
579 data->font = NULL;
580 data->changed = 0;
581
582 line_walker(ctx, block, font_newline, font_line, font_end, data);
583
584 return data->changed;
585 }
586
587 typedef struct
588 {
589 fz_rect bbox;
590 stext_pos *pos;
591 int changed;
592 } indent_data;
593
594 static int
595 indent_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
596 {
597 indent_data *data = (indent_data *)arg;
598 float indent = line->bbox.x0 - data->bbox.x0;
599
600 if (indent > line_height)
601 {
602 /* Break the block here! */
603 (void)split_block_at_line(ctx, data->pos, block, line);
604 data->changed = 1;
605 return 1;
606 }
607
608 return 0;
609 }
610
611 static int
612 break_paragraphs_by_indent(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
613 {
614 indent_data data[1];
615
616 data->pos = pos;
617 data->bbox = bbox;
618 data->changed = 0;
619
620 line_walker(ctx, block, indent_newline, NULL, NULL, data);
621
622 return data->changed;
623 }
624
625 typedef struct
626 {
627 fz_rect bbox;
628 stext_pos *pos;
629 float line_gap;
630 float prev_line_gap;
631 int looking_for_space;
632 float space_size;
633 int maybe_ends_paragraph;
634 int changed;
635 } trailing_data;
636
637 static int
638 trailing_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
639 {
640 trailing_data *data = (trailing_data *)arg;
641
642 data->prev_line_gap = data->line_gap;
643
644 if (data->looking_for_space)
645 {
646 /* We've moved downwards onto a line, and failed to find
647 * a space on that line. Presumably that means that whole
648 * line is a single word. */
649 float line_len = line->bbox.x1 - line->bbox.x0;
650
651 if (line_len + data->space_size < data->prev_line_gap)
652 {
653 /* We could have fitted this word into the previous line. */
654 /* So presumably that was a paragraph break. Split here. */
655 (void)split_block_at_line(ctx, data->pos, block, line);
656 data->changed = 1;
657 return 1;
658 }
659 data->looking_for_space = 0;
660 }
661
662 /* If we the last line we looked at ended plausibly for a paragraph,
663 * then look for a space in this line... */
664 data->looking_for_space = data->maybe_ends_paragraph;
665
666 return 0;
667 }
668
669 static int
670 trailing_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
671 {
672 trailing_data *data = (trailing_data *)arg;
673 fz_stext_char *ch;
674
675 data->line_gap = data->bbox.x1 - line->bbox.x1;
676 if (line->last_char && (
677 (line->last_char->c >= 'A' && line->last_char->c <= 'Z') ||
678 (line->last_char->c >= 'a' && line->last_char->c <= 'z') ||
679 (line->last_char->c >= '0' && line->last_char->c <= '9')))
680 {
681 /* In Latin text, paragraphs should always end up some form
682 * of punctuation. I suspect that's less true of some other
683 * languages (particularly far-eastern ones). Let's just say
684 * that if we end in A-Za-z0-9 we can't possibly be the last
685 * line of a paragraph. */
686 data->maybe_ends_paragraph = 0;
687 }
688 else
689 {
690 /* Plausibly the next line might be the first line of a new paragraph */
691 data->maybe_ends_paragraph = 1;
692 }
693 for (ch = line->first_char; ch != NULL; ch = ch->next)
694 {
695 fz_rect r;
696 float w, line_len;
697
698 if (ch->c != ' ')
699 continue;
700
701 r = fz_rect_from_quad(ch->quad);
702 w = r.x1 - r.x0;
703
704 if (w < data->space_size)
705 data->space_size = w;
706
707 /* If we aren't looking_for_space, then no point in checking for
708 * whether the prefix will fit. But keep looping as we want to
709 * continue to refine our idea of how big a space is. */
710 if (!data->looking_for_space)
711 continue;
712
713 line_len = r.x0 - line->bbox.x0;
714 if (line_len + data->space_size < data->prev_line_gap)
715 {
716 /* We could have fitted this word into the previous line. */
717 /* So presumably that was a paragraph break. Split here. */
718 (void)split_block_at_line(ctx, data->pos, block, line);
719 data->changed = 1;
720 return 1;
721 }
722 data->looking_for_space = 0;
723 }
724
725 return 0;
726 }
727
728 static int
729 break_paragraphs_by_analysing_trailing_gaps(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
730 {
731 trailing_data data[1];
732
733 data->bbox = bbox;
734 data->pos = pos;
735 data->line_gap = 0;
736 data->prev_line_gap = 0;
737 data->looking_for_space = 0;
738 data->space_size = 99999;
739 data->maybe_ends_paragraph = 0;
740 data->changed = 0;
741
742 line_walker(ctx, block, trailing_newline, trailing_line, NULL, data);
743
744 return data->changed;
745 }
746
747 typedef struct
748 {
749 fz_rect bbox;
750 stext_pos *pos;
751 int count_lines;
752 int count_justified;
753 int non_digits_exist_in_this_line;
754 fz_rect fragment_box;
755 fz_rect line_box;
756 int gap_count_this_line;
757 float gap_size_this_line;
758 int bad_gap;
759 float xmin, xmax;
760 float last_min_space;
761 int changed;
762 } justify_data;
763
764 #define JUSTIFY_THRESHOLD 1
765
766 static int
767 justify_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
768 {
769 justify_data *data = (justify_data *)arg;
770
771 if (line->prev)
772 line = line->prev;
773
774 data->line_box = fz_union_rect(data->line_box, data->fragment_box);
775 if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
776 data->count_justified++;
777 data->non_digits_exist_in_this_line = 0;
778 data->count_lines++;
779 data->gap_size_this_line = 0;
780 data->gap_count_this_line = 0;
781 data->fragment_box = fz_empty_rect;
782 data->line_box = fz_empty_rect;
783
784 data->xmin = INFINITY;
785 data->xmax = -INFINITY;
786
787 return 0;
788 }
789
790 static void
791 fragment_end(justify_data *data)
792 {
793 float gap;
794
795 if (fz_is_empty_rect(data->fragment_box))
796 {
797 /* No fragment. Nothing to do. */
798 return;
799 }
800 if (fz_is_empty_rect(data->line_box))
801 {
802 /* First fragment of the line; no gap yet. */
803 gap = 0;
804 }
805 else if (data->fragment_box.x0 > data->line_box.x1)
806 {
807 /* This whole fragment is to the right of the line so far. */
808 gap = data->fragment_box.x0 - data->line_box.x1;
809 }
810 else if (data->fragment_box.x1 < data->line_box.x0)
811 {
812 /* This whole fragment is the left of the line so far. */
813 gap = data->line_box.x1 - data->fragment_box.x0;
814 }
815 else
816 {
817 /* Abutting or overlapping fragment. Ignore it. */
818 gap = 0;
819 }
820 data->line_box = fz_union_rect(data->line_box, data->fragment_box);
821 data->fragment_box = fz_empty_rect;
822 if (gap < data->last_min_space)
823 return;
824 /* So we have a gap to consider */
825 if (data->gap_count_this_line > 0)
826 {
827 /* Allow for double spaces, cos some layouts put
828 * double spaces before full stops. */
829 if (fabs(gap - data->gap_size_this_line) > 1 &&
830 fabs(gap/2.0 - data->gap_size_this_line) < 1)
831 gap /= 2;
832 if (fabs(gap - data->gap_size_this_line) > 1)
833 data->bad_gap = 1;
834 }
835 data->gap_size_this_line = (data->gap_size_this_line * data->gap_count_this_line + gap) / (data->gap_count_this_line + 1);
836 data->gap_count_this_line++;
837 }
838
839 /* This is trickier than you'd imagine. We want to walk the line, looking
840 * for how large the spaces are. In a justified line, all the spaces should
841 * be pretty much the same size. (Except maybe before periods). But we want
842 * to cope with bidirectional text which can send glyphs in unexpected orders.
843 * e.g. abc fed ghi
844 * So we have to walk over "fragments" at a time.
845 */
846 static int
847 justify_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
848 {
849 justify_data *data = (justify_data *)arg;
850 fz_stext_char *ch;
851
852 for (ch = line->first_char; ch != NULL; ch = ch->next)
853 {
854 fz_rect r = fz_rect_from_quad(ch->quad);
855 float min_space = ch->size * 0.15f; /* Matches SPACE_DIST from stext-device. */
856
857 if (ch->c == ' ')
858 {
859 /* This ends a fragment, but we don't treat it as such.
860 * Just continue, because we'll end the fragment next time
861 * around the loop (this copes with trailing spaces, and
862 * multiple spaces, and gaps between 'lines' that are on
863 * the same line. */
864 data->last_min_space = min_space;
865 continue;
866 }
867 if ((ch->c <= '0' || ch->c >= '9') && ch->c != '.')
868 data->non_digits_exist_in_this_line = 1;
869 if (!fz_is_empty_rect(data->fragment_box))
870 {
871 if (r.x0 > data->fragment_box.x1 + data->last_min_space)
872 {
873 /* Fragment ends due to gap on right. */
874 fragment_end(data);
875 }
876 else if (r.x1 < data->fragment_box.x0 - data->last_min_space)
877 {
878 /* Fragment ends due to gap on left. */
879 fragment_end(data);
880 }
881 }
882 /* Extend the fragment */
883 data->fragment_box = fz_union_rect(data->fragment_box, r);
884 data->last_min_space = min_space;
885 }
886
887 return 0;
888 }
889
890 static void
891 justify_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
892 {
893 justify_data *data = (justify_data *)arg;
894
895 fragment_end(data);
896 data->line_box = fz_union_rect(data->line_box, data->fragment_box);
897 if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line)
898 data->count_justified++;
899 data->count_lines++;
900 }
901
902 static int
903 justify2_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
904 {
905 justify_data *data = (justify_data *)arg;
906
907 if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD)
908 {
909 /* Justified */
910 }
911 else
912 {
913 /* Break after line */
914 (void)split_block_at_line(ctx, data->pos, block, line);
915 data->changed = 1;
916 return 1;
917 }
918
919 data->line_box = fz_empty_rect;
920
921 return 0;
922 }
923
924 static int
925 justify2_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
926 {
927 justify_data *data = (justify_data *)arg;
928 fz_stext_char *ch;
929
930 for (ch = line->first_char; ch != NULL; ch = ch->next)
931 {
932 if (ch->c == ' ')
933 continue;
934
935 data->line_box = fz_union_rect(data->line_box, fz_rect_from_quad(ch->quad));
936 }
937
938 return 0;
939 }
940
941 static fz_rect
942 text_block_marked_bbox(fz_context *ctx, fz_stext_block *block)
943 {
944 fz_stext_line *line;
945 fz_stext_char *ch;
946 fz_rect r = fz_empty_rect;
947
948 for (line = block->u.t.first_line; line != NULL; line = line->next)
949 {
950 for (ch = line->first_char; ch != NULL; ch = ch->next)
951 {
952 if (ch->c == ' ')
953 continue;
954 r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
955 }
956 }
957
958 return r;
959 }
960
961 static int
962 break_paragraphs_within_justified_text(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox)
963 {
964 justify_data data[1];
965
966 if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
967 return 0;
968
969 data->bbox = bbox;
970
971 data->pos = pos;
972 data->count_lines = 0;
973 data->count_justified = 0;
974 data->non_digits_exist_in_this_line = 0;
975 data->bad_gap = 0;
976 data->gap_size_this_line = 0;
977 data->gap_count_this_line = 0;
978 data->fragment_box = fz_empty_rect;
979 data->line_box = fz_empty_rect;
980 data->xmin = INFINITY;
981 data->xmax = -INFINITY;
982 data->changed = 0;
983
984 line_walker(ctx, block, justify_newline, justify_line, justify_end, data);
985
986 /* We can't really derive anything about single lines! */
987 if (data->count_lines < 2)
988 return 0;
989 /* If at least half of the lines don't appear to be justified, then
990 * don't trust 'em. */
991 if (data->count_justified * 2 < data->count_lines)
992 return 0;
993 /* If the "badness" we've seen to do with big gaps (i.e. how much
994 * bigger the gaps are than we'd reasonably expect) is too large
995 * then we can't be a justified block. We are prepared to forgive
996 * larger sizes in larger paragraphs. */
997 if (data->bad_gap)
998 return 0;
999 block->u.t.flags = FZ_STEXT_TEXT_JUSTIFY_FULL;
1000
1001 line_walker(ctx, block, justify2_newline, justify2_line, NULL, data);
1002
1003 return data->changed;
1004 }
1005
/* Progress of bullet detection on the horizontal line currently being
 * scanned. Every new horizontal line starts in LOOKING_FOR_BULLET. */
typedef enum
{
	LOOKING_FOR_BULLET,			/* 0: nothing decided yet. */
	LOOKING_FOR_POST_BULLET,		/* 1 */
	LOOKING_FOR_POST_NUMERICAL_BULLET,	/* 2 */
	FOUND_BULLET,				/* 3: the line starts with a bullet. */
	CONTINUATION_LINE,			/* 4 */
	NO_BULLET				/* 5: neither a bullet line nor a continuation. */
} list_state;
1015
1016 typedef struct
1017 {
1018 stext_pos *pos;
1019 list_state state;
1020 int buffer[10];
1021 int buffer_fill;
1022 float bullet_r;
1023 float post_bullet_indent;
1024 float l;
1025 fz_stext_line *bullet_line_start;
1026 fz_stext_line *this_line_start;
1027 int changed;
1028 } list_data;
1029
1030 static int
1031 list_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height)
1032 {
1033 list_data *data = (list_data *)arg;
1034
1035 if (data->state == FOUND_BULLET)
1036 {
1037 if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
1038 {
1039 /* We need to split the block before the bullet started. */
1040 (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
1041 data->changed = 1;
1042 return 1;
1043 }
1044 if (data->bullet_line_start != data->this_line_start)
1045 {
1046 /* We've found a second bullet. Break before the previous line. */
1047 (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
1048 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
1049 data->changed = 1;
1050 return 1;
1051 }
1052 }
1053 else if (data->state == NO_BULLET && data->bullet_line_start)
1054 {
1055 /* We've found a bullet before, and the line we've just completed
1056 * is neither a new bullet line, or a continuation so, we need to
1057 * break that into a new block. */
1058 (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
1059 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
1060 data->changed = 1;
1061 return 1;
1062 }
1063
1064 data->this_line_start = line;
1065 data->state = LOOKING_FOR_BULLET;
1066 data->buffer_fill = 0;
1067 data->l = block->bbox.x1;
1068 data->bullet_r = block->bbox.x0;
1069
1070 return 0;
1071 }
1072
/* Return 1 if a and b differ by no more than the tolerance c,
 * otherwise 0. */
static int
approx_eq(float a, float b, float c)
{
	double distance = fabs(a - b);

	if (distance <= c)
		return 1;
	return 0;
}
1078
/* Return 1 if c is one of the letters used to write Roman numerals
 * (I, V, X, L, C, D, M in either case), otherwise 0.
 *
 * 'D' (500) was previously missing from the set. */
static int
is_roman(int c)
{
	switch (c)
	{
	case 'm': case 'M':
	case 'd': case 'D':
	case 'c': case 'C':
	case 'l': case 'L':
	case 'x': case 'X':
	case 'v': case 'V':
	case 'i': case 'I':
		return 1;
	}
	return 0;
}
1094
/* Classification of a candidate bullet string. */
typedef enum
{
	NOT_A_BULLET = 0,	/* Not recognised as a bullet. */
	BULLET = 1,		/* A bullet. */
	NUMERICAL_BULLET = 2	/* A numerical bullet. */
} bullet_t;
1100
1101 static bullet_t
1102 is_bullet_aux(int *buffer, int len, int contained)
1103 {
1104 int i, decimal_pos, decimals_found;
1105
1106 if (len == 1 && (
1107 buffer[0] == '*' ||
1108 buffer[0] == 0x00B7 || /* Middle Dot */
1109 buffer[0] == 0x2022 || /* Bullet */
1110 buffer[0] == 0x2023 || /* Triangular Bullet */
1111 buffer[0] == 0x2043 || /* Hyphen Bullet */
1112 buffer[0] == 0x204C || /* Back leftwards bullet */
1113 buffer[0] == 0x204D || /* Back rightwards bullet */
1114 buffer[0] == 0x2219 || /* Bullet operator */
1115 buffer[0] == 0x25C9 || /* Fisheye */
1116 buffer[0] == 0x25CB || /* White circle */
1117 buffer[0] == 0x25CF || /* Black circle */
1118 buffer[0] == 0x25D8 || /* Inverse Bullet */
1119 buffer[0] == 0x25E6 || /* White Bullet */
1120 buffer[0] == 0x2619 || /* Reversed Rotated Floral Heart Bullet / Fleuron */
1121 buffer[0] == 0x261a || /* Black left pointing index */
1122 buffer[0] == 0x261b || /* Black right pointing index */
1123 buffer[0] == 0x261c || /* White left pointing index */
1124 buffer[0] == 0x261d || /* White up pointing index */
1125 buffer[0] == 0x261e || /* White right pointing index */
1126 buffer[0] == 0x261f || /* White down pointing index */
1127 buffer[0] == 0x2765 || /* Rotated Heavy Heart Black Heart Bullet */
1128 buffer[0] == 0x2767 || /* Rotated Floral Heart Bullet / Fleuron */
1129 buffer[0] == 0x29BE || /* Circled White Bullet */
1130 buffer[0] == 0x29BF || /* Circled Bullet */
1131 buffer[0] == 0x2660 || /* Black Spade suit */
1132 buffer[0] == 0x2661 || /* White Heart suit */
1133 buffer[0] == 0x2662 || /* White Diamond suit */
1134 buffer[0] == 0x2663 || /* Black Club suit */
1135 buffer[0] == 0x2664 || /* White Spade suit */
1136 buffer[0] == 0x2665 || /* Black Heart suit */
1137 buffer[0] == 0x2666 || /* Black Diamond suit */
1138 buffer[0] == 0x2667 || /* White Clud suit */
1139 buffer[0] == 0x1F446 || /* WHITE UP POINTING BACKHAND INDEX */
1140 buffer[0] == 0x1F447 || /* WHITE DOWN POINTING BACKHAND INDEX */
1141 buffer[0] == 0x1F448 || /* WHITE LEFT POINTING BACKHAND INDEX */
1142 buffer[0] == 0x1F449 || /* WHITE RIGHT POINTING BACKHAND INDEX */
1143 buffer[0] == 0x1f597 || /* White down pointing left hand index */
1144 buffer[0] == 0x1F598 || /* SIDEWAYS WHITE LEFT POINTING INDEX */
1145 buffer[0] == 0x1F599 || /* SIDEWAYS WHITE RIGHT POINTING INDEX */
1146 buffer[0] == 0x1F59A || /* SIDEWAYS BLACK LEFT POINTING INDEX */
1147 buffer[0] == 0x1F59B || /* SIDEWAYS BLACK RIGHT POINTING INDEX */
1148 buffer[0] == 0x1F59C || /* BLACK LEFT POINTING BACKHAND INDEX */
1149 buffer[0] == 0x1F59D || /* BLACK RIGHT POINTING BACKHAND INDEX */
1150 buffer[0] == 0x1F59E || /* SIDEWAYS WHITE UP POINTING INDEX */
1151 buffer[0] == 0x1F59F || /* SIDEWAYS WHITE DOWN POINTING INDEX */
1152 buffer[0] == 0x1F5A0 || /* SIDEWAYS BLACK UP POINTING INDEX */
1153 buffer[0] == 0x1F5A1 || /* SIDEWAYS BLACK DOWN POINTING INDEX */
1154 buffer[0] == 0x1F5A2 || /* BLACK UP POINTING BACKHAND INDEX */
1155 buffer[0] == 0x1F5A3 || /* BLACK DOWN POINTING BACKHAND INDEX */
1156 buffer[0] == 0x1FBC1 || /* LEFT THIRD WHITE RIGHT POINTING INDEX */
1157 buffer[0] == 0x1FBC2 || /* MIDDLE THIRD WHITE RIGHT POINTING INDEX */
1158 buffer[0] == 0x1FBC3 || /* RIGHT THIRD WHITE RIGHT POINTING INDEX */
1159 buffer[0] == 0xFFFD || /* UNICODE_REPLACEMENT_CHARACTER */
1160 0))
1161 return BULLET;
1162
1163 if (!contained)
1164 {
1165 if (len > 2 && buffer[0] == '(' && buffer[len-1] == ')')
1166 return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
1167 if (len > 2 && buffer[0] == '<' && buffer[len-1] == '>')
1168 return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
1169 if (len > 2 && buffer[0] == '[' && buffer[len-1] == ']')
1170 return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
1171 if (len > 2 && buffer[0] == '{' && buffer[len-1] == '}')
1172 return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET;
1173
1174 if (len > 1 && buffer[len-1] == ':')
1175 return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
1176 if (len > 1 && buffer[len-1] == ')')
1177 return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET;
1178 }
1179
1180 /* Look for numbers */
1181 /* Be careful not to interpret rows of numbers, like:
1182 * 10.02 12.03
1183 * as bullets.
1184 */
1185 decimal_pos = 0;
1186 decimals_found = 0;
1187 for (i = 0; i < len; i++)
1188 {
1189 if (buffer[i] >= '0' && buffer[i] <= '9')
1190 {
1191 }
1192 else if (buffer[i] == '.')
1193 {
1194 decimal_pos = i;
1195 decimals_found++;
1196 }
1197 else
1198 break;
1199 }
1200 if (i == len && decimals_found <= 1)
1201 return NUMERICAL_BULLET;
1202 /* or number.something */
1203 if (decimals_found && i == decimal_pos+1 && i < len)
1204 return is_bullet_aux(buffer+i, len-i, 0) ? BULLET : NOT_A_BULLET;;
1205
1206 /* Look for roman */
1207 for (i = 0; i < len; i++)
1208 if (!is_roman(buffer[i]))
1209 break;
1210 if (i == len)
1211 return 1;
1212 /* or roman.something */
1213 if (buffer[i] == '.' && i < len-1)
1214 return is_bullet_aux(buffer+i+1, len-i-1, 0) ? BULLET : NOT_A_BULLET;
1215
1216 /* FIXME: Others. */
1217 return NOT_A_BULLET;
1218 }
1219
1220 static bullet_t
1221 is_bullet(int *buffer, int len)
1222 {
1223 return is_bullet_aux(buffer, len, 0);
1224 }
1225
1226 static int
1227 eval_buffer_for_bullet(fz_context *ctx, list_data *data, float size)
1228 {
1229 bullet_t bullet_type;
1230
1231 bullet_type = is_bullet(data->buffer, data->buffer_fill);
1232 if (bullet_type == NUMERICAL_BULLET)
1233 data->state = LOOKING_FOR_POST_NUMERICAL_BULLET;
1234 else if (bullet_type)
1235 data->state = LOOKING_FOR_POST_BULLET;
1236 else
1237 {
1238 if (approx_eq(data->l, data->post_bullet_indent, size/2))
1239 data->state = CONTINUATION_LINE;
1240 else
1241 data->state = NO_BULLET;
1242 return 1;
1243 }
1244 return 0;
1245 }
1246
1247 static int
1248 list_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
1249 {
1250 list_data *data = (list_data *)arg;
1251 fz_stext_char *ch;
1252
1253 for (ch = line->first_char; ch != NULL; ch = ch->next)
1254 {
1255 fz_rect r = fz_rect_from_quad(ch->quad);
1256
1257 if (r.x0 < data->l)
1258 data->l = line->bbox.x0;
1259
1260 switch (data->state)
1261 {
1262 case LOOKING_FOR_BULLET:
1263 if (ch->c == ' ')
1264 {
1265 /* We have a space */
1266 if (data->buffer_fill == 0)
1267 continue; /* Just skip leading spaces */
1268 if (eval_buffer_for_bullet(ctx, data, ch->size))
1269 return 0;
1270 }
1271 else if (data->buffer_fill > 0 && r.x0 - data->bullet_r > ch->size/2)
1272 {
1273 /* We have a gap large enough to be a space while we've
1274 * got something in the buffer. */
1275 if (eval_buffer_for_bullet(ctx, data, ch->size))
1276 return 0;
1277 }
1278 else if (data->buffer_fill < (int)nelem(data->buffer))
1279 {
1280 /* Stick it in the buffer for evaluation later. */
1281 data->buffer[data->buffer_fill++] = ch->c;
1282 }
1283 else
1284 {
1285 /* Buffer overflowed. Can't be a bullet. */
1286 if (approx_eq(data->l, data->post_bullet_indent, ch->size))
1287 data->state = CONTINUATION_LINE;
1288 else
1289 data->state = NO_BULLET;
1290 return 0;
1291 }
1292 data->bullet_r = r.x1;
1293 break;
1294 case LOOKING_FOR_POST_BULLET:
1295 if (ch->c != ' ')
1296 {
1297 data->state = FOUND_BULLET;
1298 if (data->bullet_line_start == NULL)
1299 data->bullet_line_start = data->this_line_start;
1300 data->post_bullet_indent = r.x0;
1301 }
1302 break;
1303 case LOOKING_FOR_POST_NUMERICAL_BULLET:
1304 if (ch->c >= '0' && ch->c <= '9')
1305 {
1306 /* Numerical bullets can't be followed by numbers. */
1307 if (approx_eq(data->l, data->post_bullet_indent, ch->size))
1308 data->state = CONTINUATION_LINE;
1309 else
1310 data->state = NO_BULLET;
1311 return 0;
1312 }
1313 if (ch->c != ' ')
1314 {
1315 data->state = FOUND_BULLET;
1316 if (data->bullet_line_start == NULL)
1317 data->bullet_line_start = data->this_line_start;
1318 data->post_bullet_indent = r.x0;
1319 }
1320 break;
1321 default:
1322 break;
1323 }
1324 }
1325
1326 return 0;
1327 }
1328
1329 static void
1330 list_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg)
1331 {
1332 list_data *data = (list_data *)arg;
1333
1334 if (data->state == LOOKING_FOR_BULLET)
1335 {
1336 eval_buffer_for_bullet(ctx, data, 0);
1337 /* If we ended up thinking we'd found a bullet, subject to
1338 * what follows not being of a specific form, then we're
1339 * fine, because nothing follows us! */
1340 if (data->state == LOOKING_FOR_POST_NUMERICAL_BULLET ||
1341 data->state == LOOKING_FOR_POST_BULLET)
1342 {
1343 data->state = FOUND_BULLET;
1344 if (data->bullet_line_start == NULL)
1345 data->bullet_line_start = data->this_line_start;
1346 }
1347 /* FIXME: This block contains just a bullet - not the content
1348 * for the bullet. We see this with page-12.pdf.
1349 * <> Rising commitment to battery...
1350 * committed to in-house battery...
1351 * developing and manufacturing...
1352 *
1353 * The <> is in a whole different DIV to the following text.
1354 * Really we want to look for if the "next" content (for some
1355 * definition of next) is on the same line as the bullet. If
1356 * it is, we want to merge the 2 divs.
1357 *
1358 * But that's a really tricky thing to do given the recursive
1359 * block walk we are current doing. Think about this.
1360 * For now, we just mark the <> as being a list item.
1361 */
1362 }
1363 if (data->state == FOUND_BULLET)
1364 {
1365 if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET)
1366 {
1367 /* We need to split the block before the start of the bullet. */
1368 (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start);
1369 data->changed = 1;
1370 return;
1371 }
1372 if (data->bullet_line_start != data->this_line_start)
1373 {
1374 /* We've found a second bullet. Break before the line. */
1375 (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
1376 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
1377 data->changed = 1;
1378 return;
1379 }
1380 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
1381 }
1382 else if (data->state == NO_BULLET && data->bullet_line_start)
1383 {
1384 /* We've found a bullet before, and the line we've just completed
1385 * is neither a new bullet line, or a continuation so, we need to
1386 * break that into a new block. */
1387 (void)split_block_at_line(ctx, data->pos, block, data->this_line_start);
1388 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
1389 data->changed = 1;
1390 return;
1391 }
1392 else if (data->bullet_line_start)
1393 {
1394 /* We've come to the end of the block still in the list item. */
1395 block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM);
1396 }
1397 }
1398
1399 static int
1400 break_list_items(fz_context *ctx, stext_pos *pos, fz_stext_block *block)
1401 {
1402 list_data data[1];
1403
1404 if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
1405 return 0;
1406
1407 data->pos = pos;
1408 data->state = LOOKING_FOR_BULLET;
1409 data->buffer_fill = 0;
1410 data->l = block->bbox.x1;
1411 data->bullet_line_start = NULL;
1412 data->this_line_start = block->u.t.first_line;
1413 data->bullet_r = block->bbox.x0;
1414 data->changed = 0;
1415
1416 line_walker(ctx, block, list_newline, list_line, list_end, data);
1417
1418 return data->changed;
1419 }
1420
1421 static int
1422 is_header(fz_structure s)
1423 {
1424 return (s == FZ_STRUCTURE_H ||
1425 s == FZ_STRUCTURE_H1 ||
1426 s == FZ_STRUCTURE_H2 ||
1427 s == FZ_STRUCTURE_H3 ||
1428 s == FZ_STRUCTURE_H4 ||
1429 s == FZ_STRUCTURE_H5 ||
1430 s == FZ_STRUCTURE_H6);
1431 }
1432
1433 static void
1434 do_para_break(fz_context *ctx, fz_stext_page *page, fz_stext_block **pfirst, fz_stext_block **plast, fz_stext_struct *parent, int in_header)
1435 {
1436 fz_stext_block *block, *next_block;
1437 stext_pos pos;
1438 fz_rect bbox;
1439
1440 pos.pool = page->pool;
1441 pos.idx = 0;
1442 pos.pfirst = pfirst;
1443 pos.plast = plast;
1444 pos.parent = parent;
1445
1446 /* First off, in order for us to consider a block to be suitable for paragraph
1447 * splitting, we want it to be a series of lines moving down the page, (or left
1448 * to right within a line). */
1449 for (block = *pfirst; block != NULL; block = next_block)
1450 {
1451 next_block = block->next;
1452
1453 switch (block->type)
1454 {
1455 case FZ_STEXT_BLOCK_STRUCT:
1456 if (block->u.s.index < pos.idx)
1457 block->u.s.index = pos.idx++;
1458 else
1459 pos.idx = block->u.s.index+1;
1460 if (block->u.s.down)
1461 {
1462 int header = in_header | is_header(block->u.s.down->standard);
1463 do_para_break(ctx, page, &block->u.s.down->first_block, &block->u.s.down->last_block, block->u.s.down, header);
1464 }
1465 break;
1466 case FZ_STEXT_BLOCK_TEXT:
1467 if (!lines_move_plausibly_like_paragraph(block))
1468 break;
1469
1470 #ifdef DEBUG_SPLITS
1471 dump_block(ctx, "Around the top level block loop:", block);
1472 #endif
1473
1474 /* Firstly, and somewhat annoyingly we need to find the bbox of the
1475 * block that doesn't include for trailing spaces. If we just use
1476 * the normal bbox, then lines that end in "foo " will end further
1477 * to the right of lines that end in "ba-", and consequently we'll
1478 * fail to detect blocks as being justified.
1479 * See PMC2656817_00002.pdf as an example. */
1480 bbox = text_block_marked_bbox(ctx, block);
1481
1482 #ifdef DEBUG_PARA_SPLITS
1483 {
1484 fz_stext_line *line;
1485
1486 for (line = block->u.t.first_line; line != NULL; line = line->next)
1487 {
1488 fz_stext_char *ch;
1489
1490 for (ch = line->first_char; ch != NULL; ch = ch->next)
1491 {
1492 fz_write_printf(ctx, fz_stddbg(ctx), "%C", ch->c);
1493 }
1494 }
1495 }
1496 #endif
1497
1498 /* Think about breaking lines at Titles. */
1499 /* First, underlined ones. */
1500 if (detect_underlined_titles(ctx, &pos, block))
1501 next_block = block->next; /* We split the block! */
1502 if (block->type != FZ_STEXT_BLOCK_TEXT)
1503 {
1504 next_block = block;
1505 break;
1506 }
1507
1508 #ifdef DEBUG_PARA_SPLITS
1509 fz_write_printf(ctx, fz_stddbg(ctx), "A");
1510 #endif
1511
1512 /* Next, ones that use bold fonts. */
1513 if (!in_header)
1514 {
1515 if (detect_titles_by_font_usage(ctx, &pos, block))
1516 next_block = block->next; /* We split the block! */
1517 if (block->type != FZ_STEXT_BLOCK_TEXT)
1518 {
1519 next_block = block;
1520 break;
1521 }
1522 }
1523
1524 #ifdef DEBUG_PARA_SPLITS
1525 fz_write_printf(ctx, fz_stddbg(ctx), "B");
1526 #endif
1527
1528 /* Now look at breaking based upon indents */
1529 if (break_paragraphs_by_indent(ctx, &pos, block, bbox))
1530 next_block = block->next; /* We split the block! */
1531 if (block->type != FZ_STEXT_BLOCK_TEXT)
1532 {
1533 next_block = block;
1534 break;
1535 }
1536
1537 #ifdef DEBUG_PARA_SPLITS
1538 fz_write_printf(ctx, fz_stddbg(ctx), "C");
1539 #endif
1540
1541 /* Now we're going to look for unindented paragraphs. We do this by
1542 * considering if the first word on the next line would have fitted
1543 * into the space left at the end of the previous line. */
1544 if (break_paragraphs_by_analysing_trailing_gaps(ctx, &pos, block, bbox))
1545 next_block = block->next; /* We split the block! */
1546 if (block->type != FZ_STEXT_BLOCK_TEXT)
1547 {
1548 next_block = block;
1549 break;
1550 }
1551
1552 #ifdef DEBUG_PARA_SPLITS
1553 fz_write_printf(ctx, fz_stddbg(ctx), "D");
1554 #endif
1555
1556 /* Now look to see if a block looks like fully justified text. If it
1557 * does, then any line that doesn't reach the right hand side must be
1558 * a paragraph break. */
1559 if (break_paragraphs_within_justified_text(ctx, &pos, block, bbox))
1560 next_block = block->next; /* We split the block! */
1561 if (block->type != FZ_STEXT_BLOCK_TEXT)
1562 {
1563 next_block = block;
1564 break;
1565 }
1566
1567 #ifdef DEBUG_PARA_SPLITS
1568 fz_write_printf(ctx, fz_stddbg(ctx), "E");
1569 #endif
1570
1571 /* Look for bulleted list items. */
1572 if (break_list_items(ctx, &pos, block))
1573 next_block = block->next; /* We split the block! */
1574
1575 break;
1576 }
1577 }
1578 }
1579
1580 void
1581 fz_paragraph_break(fz_context *ctx, fz_stext_page *page)
1582 {
1583 do_para_break(ctx, page, &page->first_block, &page->last_block, NULL, 0);
1584 }