comparison mupdf-source/include/mupdf/fitz/structured-text.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
24 #define MUPDF_FITZ_STRUCTURED_TEXT_H
25
26 #include "mupdf/fitz/system.h"
27 #include "mupdf/fitz/types.h"
28 #include "mupdf/fitz/context.h"
29 #include "mupdf/fitz/geometry.h"
30 #include "mupdf/fitz/font.h"
31 #include "mupdf/fitz/image.h"
32 #include "mupdf/fitz/output.h"
33 #include "mupdf/fitz/device.h"
34 #include "mupdf/fitz/pool.h"
35
36 /**
37 Simple text layout (for use with annotation editing primarily).
38 */
39 typedef struct fz_layout_char
40 {
41 float x, advance;
42 const char *p; /* location in source text of character */
43 struct fz_layout_char *next;
44 } fz_layout_char;
45
46 typedef struct fz_layout_line
47 {
48 float x, y, font_size;
49 const char *p; /* location in source text of start of line */
50 fz_layout_char *text;
51 struct fz_layout_line *next;
52 } fz_layout_line;
53
54 typedef struct /* A layout block: an allocation pool, two matrices, and a linked list of lines. */
55 {
56 fz_pool *pool; /* allocation pool created by fz_new_layout and freed by fz_drop_layout */
57 fz_matrix matrix; /* zeroed by fz_new_layout */
58 fz_matrix inv_matrix; /* zeroed by fz_new_layout; presumably the inverse of matrix once set - confirm */
59 fz_layout_line *head, **tailp; /* head: first line; tailp: presumably the link written when a line is appended - confirm */
60 fz_layout_char **text_tailp; /* presumably the link written when a char is appended to the last line - confirm */
61 } fz_layout_block;
62
63 /**
64 Create a new layout block, with new allocation pool, zero
65 matrices, and initialise linked pointers.
66 */
67 fz_layout_block *fz_new_layout(fz_context *ctx);
68
69 /**
70 Drop layout block. Free the pool, and linked blocks.
71
72 Never throws exceptions.
73 */
74 void fz_drop_layout(fz_context *ctx, fz_layout_block *block);
75
76 /**
77 Add a new line to the end of the layout block.
78 */
79 void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p);
80
81 /**
82 Add a new char to the line at the end of the layout block.
83 */
84 void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p);
85
86 /**
87 Text extraction device: Used for searching, format conversion etc.
88
89 (In development - Subject to change in future versions)
90 */
91
92 typedef struct fz_stext_char fz_stext_char;
93 typedef struct fz_stext_line fz_stext_line;
94 typedef struct fz_stext_block fz_stext_block;
95 typedef struct fz_stext_struct fz_stext_struct;
96 typedef struct fz_stext_grid_positions fz_stext_grid_positions;
97
98 /**
99 FZ_STEXT_PRESERVE_LIGATURES: If this option is activated
100 ligatures are passed through to the application in their
101 original form. If this option is deactivated ligatures are
102 expanded into their constituent parts, e.g. the ligature ffi is
103 expanded into three separate characters f, f and i.
104
105 FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated
106 whitespace is passed through to the application in its original
107 form. If this option is deactivated any type of horizontal
108 whitespace (including horizontal tabs) will be replaced with
109 space characters of variable width.
110
111 FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images
112 will be stored in the structured text structure. The default is
113 to ignore all images.
114
115 FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try
116 to add missing space characters where there are large gaps
117 between characters.
118
119 FZ_STEXT_DEHYPHENATE: If this option is set, hyphens at the
120 end of a line will be removed and the lines will be merged.
121
122 FZ_STEXT_PRESERVE_SPANS: If this option is set, spans on the same line
123 will not be merged. Each line will thus be a span of text with the same
124 font, colour, and size.
125
126 FZ_STEXT_CLIP: If this option is set, characters that would be entirely
127 clipped away by the current clipping path (or, more accurately, the smallest
128 bbox that contains the current clipping path) will be ignored. The
129 clip path is guaranteed to be smaller than the page mediabox, hence
130 this option subsumes an older, now deprecated, FZ_STEXT_MEDIABOX_CLIP
131 option.
132
133 FZ_STEXT_CLIP_RECT: If this option is set, characters that would be entirely
134 clipped away by the specified 'clip' rectangle in the options struct
135 will be ignored. This enables content from specific subsections of pages to
136 be extracted.
137
138 FZ_STEXT_COLLECT_STRUCTURE: If this option is set, we will collect
139 the structure as specified using begin/end_structure calls. This will
140 change the returned stext structure from being a simple list of blocks
141 into effectively being a 'tree' that should be walked in depth-first
142 order.
143
144 FZ_STEXT_COLLECT_VECTORS: If this option is set, we will collect
145 details (currently just the bbox) of vector graphics. This is intended
146 to be of use in segmentation analysis.
147
148 FZ_STEXT_IGNORE_ACTUALTEXT: If this option is set, we will no longer
149 replace text by the ActualText replacement specified in the document.
150
151 FZ_STEXT_SEGMENT: If this option is set, we will attempt to segment
152 the page into different regions. This will deliberately not do anything
153 to pages with structure information present.
154
155 FZ_STEXT_PARAGRAPH_BREAK: If this option is set, we will break blocks
156 of text at what appear to be paragraph boundaries. This only works
157 for left-to-right, top-to-bottom paragraphs. Works best on a segmented
158 page.
159
160 FZ_STEXT_TABLE_HUNT: If this option is set, we will hunt for tables
161 within the stext. Details of the potential tables found will be
162 inserted into the stext for the caller to interpret. This will work
163 best on a segmented page.
164
165 FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE: If this option is set, then
166 in the event that we fail to find a unicode value for a given
167 character, we will instead return its CID in the unicode field. We
168 will set the FZ_STEXT_UNICODE_IS_CID bit in the char flags word to
169 indicate that this has happened.
170
171 FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE: If this option is set, then
172 in the event that we fail to find a unicode value for a given
173 character, we will instead return its glyph in the unicode field.
174 We will set the FZ_STEXT_UNICODE_IS_GID bit in the char flags word
175 to indicate that this has happened.
176
177 Setting both FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE and
178 FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE will give undefined behaviour.
179
180 */
181 enum /* Option bits for stext extraction; every option is a distinct single bit (the deprecated alias aside). */
182 {
183 FZ_STEXT_PRESERVE_LIGATURES = 1,
184 FZ_STEXT_PRESERVE_WHITESPACE = 2,
185 FZ_STEXT_PRESERVE_IMAGES = 4,
186 FZ_STEXT_INHIBIT_SPACES = 8,
187 FZ_STEXT_DEHYPHENATE = 16,
188 FZ_STEXT_PRESERVE_SPANS = 32,
189 FZ_STEXT_CLIP = 64,
190 FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE = 128,
191 FZ_STEXT_COLLECT_STRUCTURE = 256,
192 FZ_STEXT_ACCURATE_BBOXES = 512, /* NOTE(review): not described in the option notes above */
193 FZ_STEXT_COLLECT_VECTORS = 1024,
194 FZ_STEXT_IGNORE_ACTUALTEXT = 2048,
195 FZ_STEXT_SEGMENT = 4096,
196 FZ_STEXT_PARAGRAPH_BREAK = 8192,
197 FZ_STEXT_TABLE_HUNT = 16384,
198 FZ_STEXT_COLLECT_STYLES = 32768, /* NOTE(review): not described in the option notes above */
199 FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE = 65536,
200 FZ_STEXT_CLIP_RECT = (1<<17),
201 FZ_STEXT_ACCURATE_ASCENDERS = (1<<18), /* NOTE(review): not described in the option notes above */
202 FZ_STEXT_ACCURATE_SIDE_BEARINGS = (1<<19), /* NOTE(review): not described in the option notes above */
203
204 /* An old, deprecated option, kept as an alias of FZ_STEXT_CLIP. */
205 FZ_STEXT_MEDIABOX_CLIP = FZ_STEXT_CLIP
206 };
207
208 /**
209 * A note on stext's handling of structure.
210 *
211 * A PDF document can contain a structure tree. This gives the
212 * structure of a document in its entirety as a tree. e.g.
213 *
214 * Tree MCID INDEX
215 * -------------------------------------
216 * DOC 0 0
217 * TOC 1 0
218 * TOC_ITEM 2 0
219 * TOC_ITEM 3 1
220 * TOC_ITEM 4 2
221 * ...
222 * STORY 100 1
223 * SECTION 101 0
224 * HEADING 102 0
225 * SUBSECTION 103 1
226 * PARAGRAPH 104 0
227 * PARAGRAPH 105 1
228 * PARAGRAPH 106 2
229 * SUBSECTION 107 2
230 * PARAGRAPH 108 0
231 * PARAGRAPH 109 1
232 * PARAGRAPH 110 2
233 * ...
234 * SECTION 200 1
235 * ...
236 *
237 * Each different section of the tree is identified as part of an
238 * MCID by a number (this is a slight simplification, but makes the
239 * explanation easier).
240 *
241 * The PDF document contains markings that say "Entering MCID 0"
242 * and "Leaving MCID 0". Any content within that region is therefore
243 * identified as appearing in that particular structural region.
244 *
245 * This means that content can be sent in the document in a different
246 * order to which it appears 'logically' in the tree.
247 *
248 * MuPDF converts this tree form into a nested series of calls to
249 * begin_structure and end_structure.
250 *
251 * For instance, if the document started out with MCID 100, then
252 * we'd send:
253 * begin_structure("DOC")
254 * begin_structure("STORY")
255 *
256 * The problem with this is that if we send:
257 * begin_structure("DOC")
258 * begin_structure("STORY")
259 * begin_structure("SECTION")
260 * begin_structure("SUBSECTION")
261 *
262 * or
263 * begin_structure("DOC")
264 * begin_structure("STORY")
265 * begin_structure("SECTION")
266 * begin_structure("HEADING")
267 *
268 * How do I know what order the SECTION and HEADING should appear in?
269 * Are they even in the same STORY? Or the same DOC?
270 *
271 * Accordingly, every begin_structure is accompanied not only with the
272 * node type, but with an index. The index is the number of this node
273 * within this level of the tree. Hence:
274 *
275 * begin_structure("DOC", 0)
276 * begin_structure("STORY", 0)
277 * begin_structure("SECTION", 0)
278 * begin_structure("HEADING", 0)
279 * and
280 * begin_structure("DOC", 0)
281 * begin_structure("STORY", 0)
282 * begin_structure("SECTION", 0)
283 * begin_structure("SUBSECTION", 1)
284 *
285 * now describe the tree unambiguously.
286 *
287 * MuPDF automatically sends the minimal end_structure/begin_structure
288 * pairs to move us between nodes in the tree.
289 *
290 * In order to accommodate this information within the structured text
291 * data structures an additional block type is used. Previously a
292 * "page" was just a list of blocks, either text or images. e.g.
293 *
294 * [BLOCK:TEXT] <-> [BLOCK:IMG] <-> [BLOCK:TEXT] <-> [BLOCK:TEXT] ...
295 *
296 * We now introduce a new type of block, STRUCT, that turns this into
297 * a tree:
298 *
299 * [BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ...
300 * /|\
301 * [STRUCT:TYPE=DOC] <----
302 * |
303 * [BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ...
304 * /|\
305 * [STRUCT:TYPE=STORY] <--
306 * |
307 * ...
308 *
309 * Rather than doing a simple linear traversal of the list to extract
310 * the logical data, a caller now has to do a depth-first traversal.
311 */
312
313 /**
314 A text page is a list of blocks, together with an overall
315 bounding box.
316 */
317 typedef struct
318 {
319 fz_pool *pool;
320 fz_rect mediabox;
321 fz_stext_block *first_block;
322
323 /* The following fields are only of use to the routines that
324 * build an fz_stext_page. They change during page construction
325 * and their meaning is subject to change. These values should
326 * not be used by anything outside of the stext device. */
327 fz_stext_block *last_block;
328 fz_stext_struct *last_struct;
329 } fz_stext_page;
330
331 enum
332 {
333 FZ_STEXT_BLOCK_TEXT = 0,
334 FZ_STEXT_BLOCK_IMAGE = 1,
335 FZ_STEXT_BLOCK_STRUCT = 2,
336 FZ_STEXT_BLOCK_VECTOR = 3,
337 FZ_STEXT_BLOCK_GRID = 4
338 };
339
340 enum
341 {
342 FZ_STEXT_TEXT_JUSTIFY_UNKNOWN = 0,
343 FZ_STEXT_TEXT_JUSTIFY_LEFT = 1,
344 FZ_STEXT_TEXT_JUSTIFY_CENTRE = 2,
345 FZ_STEXT_TEXT_JUSTIFY_RIGHT = 3,
346 FZ_STEXT_TEXT_JUSTIFY_FULL = 4,
347 };
348
349 enum
350 {
351 /* Indicates that this vector came from a stroked
352 * path. */
353 FZ_STEXT_VECTOR_IS_STROKED = 1,
354
355 /* Indicates that this vector came from a rectangular
356 * (axis-aligned) path (or path segment). */
357 FZ_STEXT_VECTOR_IS_RECTANGLE = 2,
358
359 /* Indicates that this vector came from a path
360 * segment, and more segments from this same path are
361 * still to come. */
362 FZ_STEXT_VECTOR_CONTINUES = 4
363 };
364
365 /**
366 A block is a list of lines of text (typically a paragraph), an image,
367 a structure node, a vector, or a grid (see the FZ_STEXT_BLOCK_* values).
368 */
369 struct fz_stext_block
370 {
371 int type; /* one of FZ_STEXT_BLOCK_*; presumably selects the active member of u - confirm */
372 fz_rect bbox;
373 union {
374 struct { fz_stext_line *first_line, *last_line; int flags;} t; /* text: list of lines; flags presumably FZ_STEXT_TEXT_JUSTIFY_* - confirm */
375 struct { fz_matrix transform; fz_image *image; } i; /* image and its transform */
376 struct { fz_stext_struct *down; int index; } s; /* structure: down is the child fz_stext_struct; index is this node's number within its level of the tree */
377 struct { uint32_t flags; uint32_t argb; } v; /* vector: flags presumably FZ_STEXT_VECTOR_* - confirm; argb colour */
378 struct { fz_stext_grid_positions *xs; fz_stext_grid_positions *ys; } b; /* grid positions in x and y; presumably used by FZ_STEXT_BLOCK_GRID - confirm */
379 } u;
380 fz_stext_block *prev, *next; /* links to neighbouring blocks in the same list */
381 };
382
383 /**
384 A text line is a list of characters that share a common baseline.
385 */
386 struct fz_stext_line
387 {
388 int wmode; /* 0 for horizontal, 1 for vertical */
389 fz_point dir; /* normalized direction of baseline */
390 fz_rect bbox;
391 fz_stext_char *first_char, *last_char;
392 fz_stext_line *prev, *next;
393 };
394
395 /**
396 A text char is a unicode character, the style in which it
397 appears, and the point at which it is positioned.
398 */
399 struct fz_stext_char
400 {
401 int c; /* unicode character value */
402 uint16_t bidi; /* even for LTR, odd for RTL - probably only needs 8 bits? */
403 uint16_t flags;
404 uint32_t argb; /* sRGB hex color (alpha in top 8 bits, then r, then g, then b in low bits) */
405 fz_point origin;
406 fz_quad quad;
407 float size;
408 fz_font *font;
409 fz_stext_char *next;
410 };
411
412 enum /* Bit values for the char flags word (fz_stext_char.flags). */
413 {
414 FZ_STEXT_STRIKEOUT = 1,
415 FZ_STEXT_UNDERLINE = 2,
416 FZ_STEXT_SYNTHETIC = 4,
417 FZ_STEXT_BOLD = 8, /* Either real or 'fake' bold */
418 FZ_STEXT_FILLED = 16,
419 FZ_STEXT_STROKED = 32,
420 FZ_STEXT_CLIPPED = 64,
421 FZ_STEXT_UNICODE_IS_CID = 128, /* the unicode field holds a CID; see FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE */
422 FZ_STEXT_UNICODE_IS_GID = 256, /* the unicode field holds a glyph; see FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE */
423 };
424
425 /**
426 When we are collecting the structure information from
427 PDF structure trees/tags, we end up with a tree of
428 nodes. The structure should be walked in depth-first
429 traversal order to extract the content.
430
431 An fz_stext_struct pointer can be NULL to indicate that
432 we know there is a child there within the complete tree,
433 but we don't know what it is yet.
434 */
435 struct fz_stext_struct
436 {
437 /* up points to the block that contains this fz_stext_struct. */
438 fz_stext_block *up;
439 /* parent points to the struct that has up as one of its children.
440 * parent is useful for doing depth first traversal without having
441 * to store the entire chain of structs in the iterator. */
442 fz_stext_struct *parent;
443
444 /* first_block points to the first child of this node (or NULL
445 * if there are none). */
446 fz_stext_block *first_block;
447 /* last_block points to the last child of this node (or NULL
448 * if there are none). */
449 fz_stext_block *last_block;
450
451 /* We have a set of 'standard' structure types. Every structure
452 * element should correspond to one of these. */
453 fz_structure standard;
454 /* Documents can use their own non-standard structure types, which
455 * are held as 'raw' strings. */
456 char raw[FZ_FLEXIBLE_ARRAY];
457 };
458
459 /* An example to show how fz_stext_blocks and fz_stext_structs interact:
460 *
461 * [fz_stext_page]
462 * |
463 * first_block|
464 * |
465 * \|/
466 * [fz_stext_block:TEXT]<->[fz_stext_block:STRUCT]<->[fz_stext_block:IMG]
467 * u.s.down| /|\
468 * | |
469 * \|/ |up
470 * [fz_stext_struct]<---------.
471 * | | |
472 * first_block| |last_block |
473 * _______________________| | |
474 * | | |
475 * | | |
476 * \|/ \|/ |
477 * [fz_stext_block:...]<->...<->[fz_stext_block:STRUCT] |
478 * | /|\ |
479 * u.s.down| |up |
480 * \|/ | parent|
481 * [fz_stext_struct]--------'
482 * | |
483 * first_block| |last_block
484 * : :
485 */
486
487 struct fz_stext_grid_positions
488 {
489 int len;
490 int max_uncertainty;
491 struct {
492 int reinforcement;
493 float pos;
494 float min;
495 float max;
496 int uncertainty;
497 } list[FZ_FLEXIBLE_ARRAY];
498 };
499
500 FZ_DATA extern const char *fz_stext_options_usage;
501
502 /**
503 Create an empty text page.
504
505 The text page is filled out by the text device to contain the
506 blocks and lines of text on the page.
507
508 mediabox: optional mediabox information.
509 */
510 fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox);
511 void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
512
513 /**
514 Output structured text to a file in HTML (visual) format.
515 */
516 void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
517 void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out);
518 void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out);
519
520 /**
521 Output structured text to a file in XHTML (semantic) format.
522 */
523 void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
524 void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out);
525 void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out);
526
527 /**
528 Output structured text to a file in XML format.
529 */
530 void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
531
532 /**
533 Output structured text to a file in JSON format.
534 */
535 void fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale);
536
537 /**
538 Output structured text to a file in plain-text UTF-8 format.
539 */
540 void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page);
541
542 /**
543 Search for occurrence of 'needle' in text page.
544
545 Return the number of quads and store hit quads in the passed in
546 array.
547
548 NOTE: This is an experimental interface and subject to change
549 without notice.
550 */
551 int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, int *hit_mark, fz_quad *hit_bbox, int hit_max);
552
553 /**
554 Callback function for use in searching.
555
556 Called with the list of quads that correspond to a single hit.
557
558 The callback should return with 0 to continue the search, or 1 to abort it.
559 All other values are reserved at this point.
560 */
561 typedef int (fz_search_callback_fn)(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox);
562
563 /**
564 Search for occurrence of 'needle' in text page.
565
566 Call callback once for each hit. This callback will receive
567 (potentially) multiple quads for each hit.
568
569 Returns the number of hits - note that this is potentially
570 different from (i.e. is not greater than) the number of quads
571 as returned by the non-callback API.
572
573 NOTE: This is an experimental interface and subject to change
574 without notice.
575 */
576 int fz_search_stext_page_cb(fz_context *ctx, fz_stext_page *text, const char *needle, fz_search_callback_fn *cb, void *opaque);
577
578
579 /**
580 Return a list of quads to highlight lines inside the selection
581 points.
582 */
583 int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads);
584
585 enum /* NOTE(review): presumably the 'mode' values accepted by fz_snap_selection - confirm. */
586 {
587 FZ_SELECT_CHARS, /* = 0 */
588 FZ_SELECT_WORDS, /* = 1 */
589 FZ_SELECT_LINES, /* = 2 */
590 };
591
592 fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode);
593
594 /**
595 Return a newly allocated UTF-8 string with the text for a given
596 selection.
597
598 crlf: If true, write "\r\n" style line endings (otherwise "\n"
599 only).
600 */
601 char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf);
602
603 /**
604 Return a newly allocated UTF-8 string with the text for a given
605 selection rectangle.
606
607 crlf: If true, write "\r\n" style line endings (otherwise "\n"
608 only).
609 */
610 char *fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf);
611
612 /**
613 Options for creating structured text.
614 */
615 typedef struct
616 {
617 int flags; /* presumably a bitwise OR of the FZ_STEXT_* option values - confirm */
618 float scale; /* NOTE(review): meaning not documented in this header */
619 fz_rect clip; /* the 'clip' rectangle consulted when FZ_STEXT_CLIP_RECT is set */
620 } fz_stext_options;
621
622 /**
623 Parse stext device options from a comma separated key-value
624 string.
625 */
626 fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
627
628 /**
629 Perform segmentation analysis on an (unstructured) page to look for
630 recursive subdivisions.
631
632 Essentially this code attempts to split the page horizontally and/or
633 vertically repeatedly into smaller and smaller "segments" (divisions).
634
635 Returns 0 if no changes were made to the document.
636
637 This is experimental code, and may change (or be removed) in future
638 versions!
639 */
640 int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page);
641
642 /**
643 Attempt to break paragraphs at plausible places.
644 */
645 void fz_paragraph_break(fz_context *ctx, fz_stext_page *page);
646
647 /**
648 Hunt for possible tables on a page, and update the stext with
649 information.
650 */
651 void fz_table_hunt(fz_context *ctx, fz_stext_page *page);
652
653 /**
654 Interpret the bounded contents of a given stext page as
655 a table.
656
657 The page contents will be rewritten to contain a Table
658 structure with the identified content in it.
659
660 This uses the same logic as for fz_table_hunt, without the
661 actual hunting. fz_table_hunt hunts to find possible bounds
662 for multiple tables on the page; this routine just finds a
663 single table contained within the given rectangle.
664
665 Returns the stext_block list that contains the content of
666 the table.
667 */
668 fz_stext_block *
669 fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds);
670
671 /**
672 Create a device to extract the text on a page.
673
674 Gather the text on a page into blocks and lines.
675
676 The reading order is taken from the order the text is drawn in
677 the source file, so may not be accurate.
678
679 page: The text page to which content should be added. This will
680 usually be a newly created (empty) text page, but it can be one
681 containing data already (for example when merging multiple
682 pages, or watermarking).
683
684 options: Options to configure the stext device.
685 */
686 fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
687
688 /**
689 Create a device to OCR the text on the page.
690
691 Renders the page internally to a bitmap that is then OCRd. Text
692 is then forwarded onto the target device.
693
694 target: The target device to receive the OCRd text.
695
696 ctm: The transform to apply to the mediabox to get the size for
697 the rendered page image. Also used to calculate the resolution
698 for the page image. In general, this will be the same as the CTM
699 that you pass to fz_run_page (or fz_run_display_list) to feed
700 this device.
701
702 mediabox: The mediabox (in points). Combined with the CTM to get
703 the bounds of the pixmap used internally for the rendered page
704 image.
705
706 with_list: If with_list is false, then all non-text operations
707 are forwarded instantly to the target device. This results in
708 the target device seeing all NON-text operations, followed by
709 all the text operations (derived from OCR).
710
711 If with_list is true, then all the marking operations are
712 collated into a display list which is then replayed to the
713 target device at the end.
714
715 language: NULL (for "eng"), or a pointer to a string to describe
716 the languages/scripts that should be used for OCR (e.g.
717 "eng,ara").
718
719 datadir: NULL (for ""), or a pointer to a path string otherwise
720 provided to Tesseract in the TESSDATA_PREFIX environment variable.
721
722 progress: NULL, or function to be called periodically to indicate
723 progress. Return 0 to continue, or 1 to cancel. progress_arg is
724 returned as the void *. The int is a value between 0 and 100 to
725 indicate progress.
726
727 progress_arg: A void * value to be parroted back to the progress
728 function.
729 */
730 fz_device *fz_new_ocr_device(fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language,
731 const char *datadir, int (*progress)(fz_context *, void *, int), void *progress_arg);
732
733 fz_document *fz_open_reflowed_document(fz_context *ctx, fz_document *underdoc, const fz_stext_options *opts);
734
735
736 #endif