Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/include/mupdf/fitz/structured-text.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H | |
| 24 #define MUPDF_FITZ_STRUCTURED_TEXT_H | |
| 25 | |
| 26 #include "mupdf/fitz/system.h" | |
| 27 #include "mupdf/fitz/types.h" | |
| 28 #include "mupdf/fitz/context.h" | |
| 29 #include "mupdf/fitz/geometry.h" | |
| 30 #include "mupdf/fitz/font.h" | |
| 31 #include "mupdf/fitz/image.h" | |
| 32 #include "mupdf/fitz/output.h" | |
| 33 #include "mupdf/fitz/device.h" | |
| 34 #include "mupdf/fitz/pool.h" | |
| 35 | |
| 36 /** | |
| 37 Simple text layout (for use with annotation editing primarily). | |
| 38 */ | |
| 39 typedef struct fz_layout_char | |
| 40 { | |
| 41 float x, advance; | |
| 42 const char *p; /* location in source text of character */ | |
| 43 struct fz_layout_char *next; | |
| 44 } fz_layout_char; | |
| 45 | |
| 46 typedef struct fz_layout_line | |
| 47 { | |
| 48 float x, y, font_size; | |
| 49 const char *p; /* location in source text of start of line */ | |
| 50 fz_layout_char *text; | |
| 51 struct fz_layout_line *next; | |
| 52 } fz_layout_line; | |
| 53 | |
| 54 typedef struct | |
| 55 { | |
| 56 fz_pool *pool; | |
| 57 fz_matrix matrix; | |
| 58 fz_matrix inv_matrix; | |
| 59 fz_layout_line *head, **tailp; | |
| 60 fz_layout_char **text_tailp; | |
| 61 } fz_layout_block; | |
| 62 | |
| 63 /** | |
| 64 Create a new layout block, with new allocation pool, zero | |
| 65 matrices, and initialise linked pointers. | |
| 66 */ | |
| 67 fz_layout_block *fz_new_layout(fz_context *ctx); | |
| 68 | |
| 69 /** | |
| 70 Drop layout block. Free the pool, and linked blocks. | |
| 71 | |
| 72 Never throws exceptions. | |
| 73 */ | |
| 74 void fz_drop_layout(fz_context *ctx, fz_layout_block *block); | |
| 75 | |
| 76 /** | |
| 77 Add a new line to the end of the layout block. | |
| 78 */ | |
| 79 void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p); | |
| 80 | |
| 81 /** | |
| 82 Add a new char to the line at the end of the layout block. | |
| 83 */ | |
| 84 void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p); | |
| 85 | |
| 86 /** | |
| 87 Text extraction device: Used for searching, format conversion etc. | |
| 88 | |
| 89 (In development - Subject to change in future versions) | |
| 90 */ | |
| 91 | |
| 92 typedef struct fz_stext_char fz_stext_char; | |
| 93 typedef struct fz_stext_line fz_stext_line; | |
| 94 typedef struct fz_stext_block fz_stext_block; | |
| 95 typedef struct fz_stext_struct fz_stext_struct; | |
| 96 typedef struct fz_stext_grid_positions fz_stext_grid_positions; | |
| 97 | |
| 98 /** | |
| 99 FZ_STEXT_PRESERVE_LIGATURES: If this option is activated | |
| 100 ligatures are passed through to the application in their | |
| 101 original form. If this option is deactivated ligatures are | |
| 102 expanded into their constituent parts, e.g. the ligature ffi is | |
| 103 expanded into three separate characters f, f and i. | |
| 104 | |
| 105 FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated | |
| 106 whitespace is passed through to the application in its original | |
| 107 form. If this option is deactivated any type of horizontal | |
| 108 whitespace (including horizontal tabs) will be replaced with | |
| 109 space characters of variable width. | |
| 110 | |
| 111 FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images | |
| 112 will be stored in the structured text structure. The default is | |
| 113 to ignore all images. | |
| 114 | |
| 115 FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try | |
| 116 to add missing space characters where there are large gaps | |
| 117 between characters. | |
| 118 | |
| 119 FZ_STEXT_DEHYPHENATE: If this option is set, hyphens at the | |
| 120 end of a line will be removed and the lines will be merged. | |
| 121 | |
| 122 FZ_STEXT_PRESERVE_SPANS: If this option is set, spans on the same line | |
| 123 will not be merged. Each line will thus be a span of text with the same | |
| 124 font, colour, and size. | |
| 125 | |
| 126 FZ_STEXT_CLIP: If this option is set, characters that would be entirely | |
| 127 clipped away by the current clipping path (or, more accurately, the smallest | |
| 128 bbox that contains the current clipping path) will be ignored. The | |
| 129 clip path is guaranteed to be smaller than the page mediabox, hence | |
| 130 this option subsumes an older, now deprecated, FZ_STEXT_MEDIABOX_CLIP | |
| 131 option. | |
| 132 | |
| 133 FZ_STEXT_CLIP_RECT: If this option is set, characters that would be entirely | |
| 134 clipped away by the specified 'clip' rectangle in the options struct | |
| 135 will be ignored. This enables content from specific subsections of pages to | |
| 136 be extracted. | |
| 137 | |
| 138 FZ_STEXT_COLLECT_STRUCTURE: If this option is set, we will collect | |
| 139 the structure as specified using begin/end_structure calls. This will | |
| 140 change the returned stext structure from being a simple list of blocks | |
| 141 into effectively being a 'tree' that should be walked in depth-first | |
| 142 order. | |
| 143 | |
| 144 FZ_STEXT_COLLECT_VECTORS: If this option is set, we will collect | |
| 145 details (currently just the bbox) of vector graphics. This is intended | |
| 146 to be of use in segmentation analysis. | |
| 147 | |
| 148 FZ_STEXT_IGNORE_ACTUALTEXT: If this option is set, we will no longer | |
| 149 replace text by the ActualText replacement specified in the document. | |
| 150 | |
| 151 FZ_STEXT_SEGMENT: If this option is set, we will attempt to segment | |
| 152 the page into different regions. This will deliberately not do anything | |
| 153 to pages with structure information present. | |
| 154 | |
| 155 FZ_STEXT_PARAGRAPH_BREAK: If this option is set, we will break blocks | |
| 156 of text at what appear to be paragraph boundaries. This only works | |
| 157 for left-to-right, top-to-bottom paragraphs. Works best on a segmented | |
| 158 page. | |
| 159 | |
| 160 FZ_STEXT_TABLE_HUNT: If this option is set, we will hunt for tables | |
| 161 within the stext. Details of the potential tables found will be | |
| 162 inserted into the stext for the caller to interpret. This will work | |
| 163 best on a segmented page. | |
| 164 | |
| 165 FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE: If this option is set, then | |
| 166 in the event that we fail to find a unicode value for a given | |
| 167 character, we will instead return its CID in the unicode field. We | |
| 168 will set the FZ_STEXT_UNICODE_IS_CID bit in the char flags word to | |
| 169 indicate that this has happened. | |
| 170 | |
| 171 FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE: If this option is set, then | |
| 172 in the event that we fail to find a unicode value for a given | |
| 173 character, we will instead return its glyph in the unicode field. | |
| 174 We will set the FZ_STEXT_UNICODE_IS_GID bit in the char flags word | |
| 175 to indicate that this has happened. | |
| 176 | |
| 177 Setting both FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE and | |
| 178 FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE will give undefined behaviour. | |
| 179 | |
| 180 */ | |
| 181 enum | |
| 182 { | |
| 183 FZ_STEXT_PRESERVE_LIGATURES = 1, | |
| 184 FZ_STEXT_PRESERVE_WHITESPACE = 2, | |
| 185 FZ_STEXT_PRESERVE_IMAGES = 4, | |
| 186 FZ_STEXT_INHIBIT_SPACES = 8, | |
| 187 FZ_STEXT_DEHYPHENATE = 16, | |
| 188 FZ_STEXT_PRESERVE_SPANS = 32, | |
| 189 FZ_STEXT_CLIP = 64, | |
| 190 FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE = 128, | |
| 191 FZ_STEXT_COLLECT_STRUCTURE = 256, | |
| 192 FZ_STEXT_ACCURATE_BBOXES = 512, | |
| 193 FZ_STEXT_COLLECT_VECTORS = 1024, | |
| 194 FZ_STEXT_IGNORE_ACTUALTEXT = 2048, | |
| 195 FZ_STEXT_SEGMENT = 4096, | |
| 196 FZ_STEXT_PARAGRAPH_BREAK = 8192, | |
| 197 FZ_STEXT_TABLE_HUNT = 16384, | |
| 198 FZ_STEXT_COLLECT_STYLES = 32768, | |
| 199 FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE = 65536, | |
| 200 FZ_STEXT_CLIP_RECT = (1<<17), | |
| 201 FZ_STEXT_ACCURATE_ASCENDERS = (1<<18), | |
| 202 FZ_STEXT_ACCURATE_SIDE_BEARINGS = (1<<19), | |
| 203 | |
| 204 /* An old, deprecated option. */ | |
| 205 FZ_STEXT_MEDIABOX_CLIP = FZ_STEXT_CLIP | |
| 206 }; | |
| 207 | |
| 208 /** | |
| 209 * A note on stext's handling of structure. | |
| 210 * | |
| 211 * A PDF document can contain a structure tree. This gives the | |
| 212 * structure of a document in its entirety as a tree. e.g. | |
| 213 * | |
| 214 * Tree MCID INDEX | |
| 215 * ------------------------------------- | |
| 216 * DOC 0 0 | |
| 217 * TOC 1 0 | |
| 218 * TOC_ITEM 2 0 | |
| 219 * TOC_ITEM 3 1 | |
| 220 * TOC_ITEM 4 2 | |
| 221 * ... | |
| 222 * STORY 100 1 | |
| 223 * SECTION 101 0 | |
| 224 * HEADING 102 0 | |
| 225 * SUBSECTION 103 1 | |
| 226 * PARAGRAPH 104 0 | |
| 227 * PARAGRAPH 105 1 | |
| 228 * PARAGRAPH 106 2 | |
| 229 * SUBSECTION 107 2 | |
| 230 * PARAGRAPH 108 0 | |
| 231 * PARAGRAPH 109 1 | |
| 232 * PARAGRAPH 110 2 | |
| 233 * ... | |
| 234 * SECTION 200 1 | |
| 235 * ... | |
| 236 * | |
| 237 * Each different section of the tree is identified as part of an | |
| 238 * MCID by a number (this is a slight simplification, but makes the | |
| 239 * explanation easier). | |
| 240 * | |
| 241 * The PDF document contains markings that say "Entering MCID 0" | |
| 242 * and "Leaving MCID 0". Any content within that region is therefore | |
| 243 * identified as appearing in that particular structural region. | |
| 244 * | |
| 245 * This means that content can be sent in the document in a different | |
| 246 * order to which it appears 'logically' in the tree. | |
| 247 * | |
| 248 * MuPDF converts this tree form into a nested series of calls to | |
| 249 * begin_structure and end_structure. | |
| 250 * | |
| 251 * For instance, if the document started out with MCID 100, then | |
| 252 * we'd send: | |
| 253 * begin_structure("DOC") | |
| 254 * begin_structure("STORY") | |
| 255 * | |
| 256 * The problem with this is that if we send: | |
| 257 * begin_structure("DOC") | |
| 258 * begin_structure("STORY") | |
| 259 * begin_structure("SECTION") | |
| 260 * begin_structure("SUBSECTION") | |
| 261 * | |
| 262 * or | |
| 263 * begin_structure("DOC") | |
| 264 * begin_structure("STORY") | |
| 265 * begin_structure("SECTION") | |
| 266 * begin_structure("HEADING") | |
| 267 * | |
| 268 * How do I know what order the SECTION and HEADING should appear in? | |
| 269 * Are they even in the same STORY? Or the same DOC? | |
| 270 * | |
| 271 * Accordingly, every begin_structure is accompanied not only with the | |
| 272 * node type, but with an index. The index is the number of this node | |
| 273 * within this level of the tree. Hence: | |
| 274 * | |
| 275 * begin_structure("DOC", 0) | |
| 276 * begin_structure("STORY", 0) | |
| 277 * begin_structure("SECTION", 0) | |
| 278 * begin_structure("HEADING", 0) | |
| 279 * and | |
| 280 * begin_structure("DOC", 0) | |
| 281 * begin_structure("STORY", 0) | |
| 282 * begin_structure("SECTION", 0) | |
| 283 * begin_structure("SUBSECTION", 1) | |
| 284 * | |
| 285 * are now unambiguous in their description of the tree. | |
| 286 * | |
| 287 * MuPDF automatically sends the minimal end_structure/begin_structure | |
| 288 * pairs to move us between nodes in the tree. | |
| 289 * | |
| 290 * In order to accommodate this information within the structured text | |
| 291 * data structures an additional block type is used. Previously a | |
| 292 * "page" was just a list of blocks, either text or images. e.g. | |
| 293 * | |
| 294 * [BLOCK:TEXT] <-> [BLOCK:IMG] <-> [BLOCK:TEXT] <-> [BLOCK:TEXT] ... | |
| 295 * | |
| 296 * We now introduce a new type of block, STRUCT, that turns this into | |
| 297 * a tree: | |
| 298 * | |
| 299 * [BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ... | |
| 300 * /|\ | |
| 301 * [STRUCT:TYPE=DOC] <---- | |
| 302 * | | |
| 303 * [BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ... | |
| 304 * /|\ | |
| 305 * [STRUCT:TYPE=STORY] <-- | |
| 306 * | | |
| 307 * ... | |
| 308 * | |
| 309 * Rather than doing a simple linear traversal of the list to extract | |
| 310 * the logical data, a caller now has to do a depth-first traversal. | |
| 311 */ | |
| 312 | |
| 313 /** | |
| 314 A text page is a list of blocks, together with an overall | |
| 315 bounding box. | |
| 316 */ | |
| 317 typedef struct | |
| 318 { | |
| 319 fz_pool *pool; | |
| 320 fz_rect mediabox; | |
| 321 fz_stext_block *first_block; | |
| 322 | |
| 323 /* The following fields are only of use to the routines that | |
| 324 * build an fz_stext_page. They change during page construction | |
| 325 * and their meaning is subject to change. These values should | |
| 326 * not be used by anything outside of the stext device. */ | |
| 327 fz_stext_block *last_block; | |
| 328 fz_stext_struct *last_struct; | |
| 329 } fz_stext_page; | |
| 330 | |
| 331 enum | |
| 332 { | |
| 333 FZ_STEXT_BLOCK_TEXT = 0, | |
| 334 FZ_STEXT_BLOCK_IMAGE = 1, | |
| 335 FZ_STEXT_BLOCK_STRUCT = 2, | |
| 336 FZ_STEXT_BLOCK_VECTOR = 3, | |
| 337 FZ_STEXT_BLOCK_GRID = 4 | |
| 338 }; | |
| 339 | |
| 340 enum | |
| 341 { | |
| 342 FZ_STEXT_TEXT_JUSTIFY_UNKNOWN = 0, | |
| 343 FZ_STEXT_TEXT_JUSTIFY_LEFT = 1, | |
| 344 FZ_STEXT_TEXT_JUSTIFY_CENTRE = 2, | |
| 345 FZ_STEXT_TEXT_JUSTIFY_RIGHT = 3, | |
| 346 FZ_STEXT_TEXT_JUSTIFY_FULL = 4, | |
| 347 }; | |
| 348 | |
| 349 enum | |
| 350 { | |
| 351 /* Indicates that this vector came from a stroked | |
| 352 * path. */ | |
| 353 FZ_STEXT_VECTOR_IS_STROKED = 1, | |
| 354 | |
| 355 /* Indicates that this vector came from a rectangular | |
| 356 * (axis-aligned) path (or path segment). */ | |
| 357 FZ_STEXT_VECTOR_IS_RECTANGLE = 2, | |
| 358 | |
| 359 /* Indicates that this vector came from a path | |
| 360 * segment, and more segments from this same path are | |
| 361 * still to come. */ | |
| 362 FZ_STEXT_VECTOR_CONTINUES = 4 | |
| 363 }; | |
| 364 | |
| 365 /** | |
| 366 A text block is a list of lines of text (typically a paragraph), | |
| 367 or an image, a structure node, a vector or a grid (see 'type'). | |
| 368 */ | |
| 369 struct fz_stext_block | |
| 370 { | |
| 371 int type; | |
| 372 fz_rect bbox; | |
| 373 union { | |
| 374 struct { fz_stext_line *first_line, *last_line; int flags;} t; | |
| 375 struct { fz_matrix transform; fz_image *image; } i; | |
| 376 struct { fz_stext_struct *down; int index; } s; | |
| 377 struct { uint32_t flags; uint32_t argb; } v; | |
| 378 struct { fz_stext_grid_positions *xs; fz_stext_grid_positions *ys; } b; | |
| 379 } u; | |
| 380 fz_stext_block *prev, *next; | |
| 381 }; | |
| 382 | |
| 383 /** | |
| 384 A text line is a list of characters that share a common baseline. | |
| 385 */ | |
| 386 struct fz_stext_line | |
| 387 { | |
| 388 int wmode; /* 0 for horizontal, 1 for vertical */ | |
| 389 fz_point dir; /* normalized direction of baseline */ | |
| 390 fz_rect bbox; | |
| 391 fz_stext_char *first_char, *last_char; | |
| 392 fz_stext_line *prev, *next; | |
| 393 }; | |
| 394 | |
| 395 /** | |
| 396 A text char is a unicode character, the style in which it | |
| 397 appears, and the point at which it is positioned. | |
| 398 */ | |
| 399 struct fz_stext_char | |
| 400 { | |
| 401 int c; /* unicode character value */ | |
| 402 uint16_t bidi; /* even for LTR, odd for RTL - probably only needs 8 bits? */ | |
| 403 uint16_t flags; | |
| 404 uint32_t argb; /* sRGB hex color (alpha in top 8 bits, then r, then g, then b in low bits) */ | |
| 405 fz_point origin; | |
| 406 fz_quad quad; | |
| 407 float size; | |
| 408 fz_font *font; | |
| 409 fz_stext_char *next; | |
| 410 }; | |
| 411 | |
| 412 enum | |
| 413 { | |
| 414 FZ_STEXT_STRIKEOUT = 1, | |
| 415 FZ_STEXT_UNDERLINE = 2, | |
| 416 FZ_STEXT_SYNTHETIC = 4, | |
| 417 FZ_STEXT_BOLD = 8, /* Either real or 'fake' bold */ | |
| 418 FZ_STEXT_FILLED = 16, | |
| 419 FZ_STEXT_STROKED = 32, | |
| 420 FZ_STEXT_CLIPPED = 64, | |
| 421 FZ_STEXT_UNICODE_IS_CID = 128, | |
| 422 FZ_STEXT_UNICODE_IS_GID = 256, | |
| 423 }; | |
| 424 | |
| 425 /** | |
| 426 When we are collecting the structure information from | |
| 427 PDF structure trees/tags, we end up with a tree of | |
| 428 nodes. The structure should be walked in depth-first | |
| 429 traversal order to extract the content. | |
| 430 | |
| 431 An fz_stext_struct pointer can be NULL to indicate that | |
| 432 we know there is a child there within the complete tree, | |
| 433 but we don't know what it is yet. | |
| 434 */ | |
| 435 struct fz_stext_struct | |
| 436 { | |
| 437 /* up points to the block that contains this fz_stext_struct. */ | |
| 438 fz_stext_block *up; | |
| 439 /* parent points to the struct that has up as one of its children. | |
| 440 * parent is useful for doing depth first traversal without having | |
| 441 * to store the entire chain of structs in the iterator. */ | |
| 442 fz_stext_struct *parent; | |
| 443 | |
| 444 /* first_block points to the first child of this node (or NULL | |
| 445 * if there are none). */ | |
| 446 fz_stext_block *first_block; | |
| 447 /* last_block points to the last child of this node (or NULL | |
| 448 * if there are none). */ | |
| 449 fz_stext_block *last_block; | |
| 450 | |
| 451 /* We have a set of 'standard' structure types. Every structure | |
| 452 * element should correspond to one of these. */ | |
| 453 fz_structure standard; | |
| 454 /* Documents can use their own non-standard structure types, which | |
| 455 * are held as 'raw' strings. */ | |
| 456 char raw[FZ_FLEXIBLE_ARRAY]; | |
| 457 }; | |
| 458 | |
| 459 /* An example to show how fz_stext_blocks and fz_stext_structs interact: | |
| 460 * | |
| 461 * [fz_stext_page] | |
| 462 * | | |
| 463 * first_block| | |
| 464 * | | |
| 465 * \|/ | |
| 466 * [fz_stext_block:TEXT]<->[fz_stext_block:STRUCT]<->[fz_stext_block:IMG] | |
| 467 * u.s.down| /|\ | |
| 468 * | | | |
| 469 * \|/ |up | |
| 470 * [fz_stext_struct]<---------. | |
| 471 * | | | | |
| 472 * first_block| |last_block | | |
| 473 * _______________________| | | | |
| 474 * | | | | |
| 475 * | | | | |
| 476 * \|/ \|/ | | |
| 477 * [fz_stext_block:...]<->...<->[fz_stext_block:STRUCT] | | |
| 478 * | /|\ | | |
| 479 * u.s.down| |up | | |
| 480 * \|/ | parent| | |
| 481 * [fz_stext_struct]--------' | |
| 482 * | | | |
| 483 * first_block| |last_block | |
| 484 * : : | |
| 485 */ | |
| 486 | |
| 487 struct fz_stext_grid_positions | |
| 488 { | |
| 489 int len; | |
| 490 int max_uncertainty; | |
| 491 struct { | |
| 492 int reinforcement; | |
| 493 float pos; | |
| 494 float min; | |
| 495 float max; | |
| 496 int uncertainty; | |
| 497 } list[FZ_FLEXIBLE_ARRAY]; | |
| 498 }; | |
| 499 | |
| 500 FZ_DATA extern const char *fz_stext_options_usage; | |
| 501 | |
| 502 /** | |
| 503 Create an empty text page. | |
| 504 | |
| 505 The text page is filled out by the text device to contain the | |
| 506 blocks and lines of text on the page. | |
| 507 | |
| 508 mediabox: optional mediabox information. | |
| 509 */ | |
| 510 fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox); | |
| 511 void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page); | |
| 512 | |
| 513 /** | |
| 514 Output structured text to a file in HTML (visual) format. | |
| 515 */ | |
| 516 void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id); | |
| 517 void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out); | |
| 518 void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out); | |
| 519 | |
| 520 /** | |
| 521 Output structured text to a file in XHTML (semantic) format. | |
| 522 */ | |
| 523 void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id); | |
| 524 void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out); | |
| 525 void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out); | |
| 526 | |
| 527 /** | |
| 528 Output structured text to a file in XML format. | |
| 529 */ | |
| 530 void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id); | |
| 531 | |
| 532 /** | |
| 533 Output structured text to a file in JSON format. | |
| 534 */ | |
| 535 void fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale); | |
| 536 | |
| 537 /** | |
| 538 Output structured text to a file in plain-text UTF-8 format. | |
| 539 */ | |
| 540 void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page); | |
| 541 | |
| 542 /** | |
| 543 Search for occurrence of 'needle' in text page. | |
| 544 | |
| 545 Return the number of quads and store hit quads in the passed in | |
| 546 array. | |
| 547 | |
| 548 NOTE: This is an experimental interface and subject to change | |
| 549 without notice. | |
| 550 */ | |
| 551 int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, int *hit_mark, fz_quad *hit_bbox, int hit_max); | |
| 552 | |
| 553 /** | |
| 554 Callback function for use in searching. | |
| 555 | |
| 556 Called with the list of quads that correspond to a single hit. | |
| 557 | |
| 558 The callback should return with 0 to continue the search, or 1 to abort it. | |
| 559 All other values are reserved at this point. | |
| 560 */ | |
| 561 typedef int (fz_search_callback_fn)(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox); | |
| 562 | |
| 563 /** | |
| 564 Search for occurrence of 'needle' in text page. | |
| 565 | |
| 566 Call callback once for each hit. This callback will receive | |
| 567 (potentially) multiple quads for each hit. | |
| 568 | |
| 569 Returns the number of hits - note that this is potentially | |
| 570 different from (i.e. is not greater than) the number of quads | |
| 571 as returned by the non callback API. | |
| 572 | |
| 573 NOTE: This is an experimental interface and subject to change | |
| 574 without notice. | |
| 575 */ | |
| 576 int fz_search_stext_page_cb(fz_context *ctx, fz_stext_page *text, const char *needle, fz_search_callback_fn *cb, void *opaque); | |
| 577 | |
| 578 | |
| 579 /** | |
| 580 Return a list of quads to highlight lines inside the selection | |
| 581 points. | |
| 582 */ | |
| 583 int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads); | |
| 584 | |
| 585 enum | |
| 586 { | |
| 587 FZ_SELECT_CHARS, | |
| 588 FZ_SELECT_WORDS, | |
| 589 FZ_SELECT_LINES, | |
| 590 }; | |
| 591 | |
| 592 fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode); | |
| 593 | |
| 594 /** | |
| 595 Return a newly allocated UTF-8 string with the text for a given | |
| 596 selection. | |
| 597 | |
| 598 crlf: If true, write "\r\n" style line endings (otherwise "\n" | |
| 599 only). | |
| 600 */ | |
| 601 char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf); | |
| 602 | |
| 603 /** | |
| 604 Return a newly allocated UTF-8 string with the text for a given | |
| 605 selection rectangle. | |
| 606 | |
| 607 crlf: If true, write "\r\n" style line endings (otherwise "\n" | |
| 608 only). | |
| 609 */ | |
| 610 char *fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf); | |
| 611 | |
| 612 /** | |
| 613 Options for creating structured text. | |
| 614 */ | |
| 615 typedef struct | |
| 616 { | |
| 617 int flags; | |
| 618 float scale; | |
| 619 fz_rect clip; | |
| 620 } fz_stext_options; | |
| 621 | |
| 622 /** | |
| 623 Parse stext device options from a comma separated key-value | |
| 624 string. | |
| 625 */ | |
| 626 fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string); | |
| 627 | |
| 628 /** | |
| 629 Perform segmentation analysis on an (unstructured) page to look for | |
| 630 recursive subdivisions. | |
| 631 | |
| 632 Essentially this code attempts to split the page horizontally and/or | |
| 633 vertically repeatedly into smaller and smaller "segments" (divisions). | |
| 634 | |
| 635 Returns 0 if no changes were made to the document. | |
| 636 | |
| 637 This is experimental code, and may change (or be removed) in future | |
| 638 versions! | |
| 639 */ | |
| 640 int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page); | |
| 641 | |
| 642 /** | |
| 643 Attempt to break paragraphs at plausible places. | |
| 644 */ | |
| 645 void fz_paragraph_break(fz_context *ctx, fz_stext_page *page); | |
| 646 | |
| 647 /** | |
| 648 Hunt for possible tables on a page, and update the stext with | |
| 649 information. | |
| 650 */ | |
| 651 void fz_table_hunt(fz_context *ctx, fz_stext_page *page); | |
| 652 | |
| 653 /** | |
| 654 Interpret the bounded contents of a given stext page as | |
| 655 a table. | |
| 656 | |
| 657 The page contents will be rewritten to contain a Table | |
| 658 structure with the identified content in it. | |
| 659 | |
| 660 This uses the same logic as for fz_table_hunt, without the | |
| 661 actual hunting. fz_table_hunt hunts to find possible bounds | |
| 662 for multiple tables on the page; this routine just finds a | |
| 663 single table contained within the given rectangle. | |
| 664 | |
| 665 Returns the stext_block list that contains the content of | |
| 666 the table. | |
| 667 */ | |
| 668 fz_stext_block * | |
| 669 fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds); | |
| 670 | |
| 671 /** | |
| 672 Create a device to extract the text on a page. | |
| 673 | |
| 674 Gather the text on a page into blocks and lines. | |
| 675 | |
| 676 The reading order is taken from the order the text is drawn in | |
| 677 the source file, so may not be accurate. | |
| 678 | |
| 679 page: The text page to which content should be added. This will | |
| 680 usually be a newly created (empty) text page, but it can be one | |
| 681 containing data already (for example when merging multiple | |
| 682 pages, or watermarking). | |
| 683 | |
| 684 options: Options to configure the stext device. | |
| 685 */ | |
| 686 fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options); | |
| 687 | |
| 688 /** | |
| 689 Create a device to OCR the text on the page. | |
| 690 | |
| 691 Renders the page internally to a bitmap that is then OCRd. Text | |
| 692 is then forwarded onto the target device. | |
| 693 | |
| 694 target: The target device to receive the OCRd text. | |
| 695 | |
| 696 ctm: The transform to apply to the mediabox to get the size for | |
| 697 the rendered page image. Also used to calculate the resolution | |
| 698 for the page image. In general, this will be the same as the CTM | |
| 699 that you pass to fz_run_page (or fz_run_display_list) to feed | |
| 700 this device. | |
| 701 | |
| 702 mediabox: The mediabox (in points). Combined with the CTM to get | |
| 703 the bounds of the pixmap used internally for the rendered page | |
| 704 image. | |
| 705 | |
| 706 with_list: If with_list is false, then all non-text operations | |
| 707 are forwarded instantly to the target device. This results in | |
| 708 the target device seeing all NON-text operations, followed by | |
| 709 all the text operations (derived from OCR). | |
| 710 | |
| 711 If with_list is true, then all the marking operations are | |
| 712 collated into a display list which is then replayed to the | |
| 713 target device at the end. | |
| 714 | |
| 715 language: NULL (for "eng"), or a pointer to a string to describe | |
| 716 the languages/scripts that should be used for OCR (e.g. | |
| 717 "eng,ara"). | |
| 718 | |
| 719 datadir: NULL (for ""), or a pointer to a path string otherwise | |
| 720 provided to Tesseract in the TESSDATA_PREFIX environment variable. | |
| 721 | |
| 722 progress: NULL, or function to be called periodically to indicate | |
| 723 progress. Return 0 to continue, or 1 to cancel. progress_arg is | |
| 724 returned as the void *. The int is a value between 0 and 100 to | |
| 725 indicate progress. | |
| 726 | |
| 727 progress_arg: A void * value to be parroted back to the progress | |
| 728 function. | |
| 729 */ | |
| 730 fz_device *fz_new_ocr_device(fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language, | |
| 731 const char *datadir, int (*progress)(fz_context *, void *, int), void *progress_arg); | |
| 732 | |
| 733 fz_document *fz_open_reflowed_document(fz_context *ctx, fz_document *underdoc, const fz_stext_options *opts); | |
| 734 | |
| 735 | |
| 736 #endif |
