Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/document.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #ifndef ARTIFEX_EXTRACT_DOCUMENT_H | |
| 2 #define ARTIFEX_EXTRACT_DOCUMENT_H | |
| 3 | |
| 4 #include "extract/extract.h" | |
| 5 #include "extract/alloc.h" | |
| 6 | |
| 7 #include "compat_stdint.h" | |
| 8 #include <assert.h> | |
| 9 | |
| 10 typedef struct span_t span_t; | |
| 11 typedef struct line_t line_t; | |
| 12 typedef struct paragraph_t paragraph_t; | |
| 13 typedef struct image_t image_t; | |
| 14 typedef struct table_t table_t; | |
| 15 typedef struct block_t block_t; | |
| 16 typedef struct structure_t structure_t; | |
| 17 | |
| 18 static const double pi = 3.141592653589793; | |
| 19 | |
| 20 /* | |
| 21 All content is stored as content_t nodes in a doubly linked-list. | |
| 22 The first node in the list is a 'content_root' node. The last | |
| 23 node in the list is the same node again. | |
| 24 | |
| 25 Thus: | |
| 26 Every node in a list (including the root) has next and prev != NULL. | |
| 27 The root node in an empty list has next and prev pointing to itself. | |
| 28 Any non-root node with prev and next == NULL is not in a list. | |
| 29 | |
| 30 Content nodes record a 'type' for the node. Each node is 'derived' in | |
| 31 an OO style from the basic content_t. | |
| 32 | |
| 33 The different content types form a hierarchy: | |
| 34 | |
| 35 A span is an array of "char_t"s (note, an array, NOT a content list). | |
| 36 | |
| 37 Lines contain a content list, which should mostly consist of spans. | |
| 38 | |
| 39 Paragraphs contain a content list, which should mostly consist of lines. | |
| 40 | |
| 41 Image nodes contain details of a bitmap image. | |
| 42 | |
| 43 Table nodes contain an array of cells, each of which contains a content | |
| 44 list that can contain any other type. | |
| 45 | |
| 46 Blocks contain a content list consisting of paragraphs, tables and images. | |
| 47 Conceptually these represent a block of content on a page. | |
| 48 */ | |
| 49 typedef enum { | |
| 50 content_root, | |
| 51 content_span, | |
| 52 content_line, | |
| 53 content_paragraph, | |
| 54 content_image, | |
| 55 content_table, | |
| 56 content_block | |
| 57 } content_type_t; | |
| 58 | |
| 59 typedef struct content_t { | |
| 60 /* The type field tells us what derived type we actually are. */ | |
| 61 content_type_t type; | |
| 62 | |
| 63 /* This holds us in the linked list of sibling content nodes. */ | |
| 64 struct content_t *prev; | |
| 65 struct content_t *next; | |
| 66 } content_t; | |
| 67 | |
| 68 /* Initialise a content_t (just the base struct). */ | |
| 69 void content_init(content_t *content, content_type_t type); | |
| 70 | |
| 71 /* Unlink a (non-root) content_t from any list. */ | |
| 72 void content_unlink(content_t *content); | |
| 73 | |
| 74 /* Unlink a span_t from any list. */ | |
| 75 void content_unlink_span(span_t *span); | |
| 76 | |
| 77 typedef struct { | |
| 78 content_t base; | |
| 79 content_t *parent; | |
| 80 } content_root_t; | |
| 81 | |
| 82 void content_init_root(content_root_t *root, content_t *parent); | |
| 83 | |
| 84 /* Free all the content, from a (root) content_t. */ | |
| 85 void content_clear(extract_alloc_t* alloc, content_root_t *root); | |
| 86 | |
| 87 span_t *content_first_span(const content_root_t *root); | |
| 88 span_t *content_last_span(const content_root_t *root); | |
| 89 line_t *content_first_line(const content_root_t *root); | |
| 90 line_t *content_last_line(const content_root_t *root); | |
| 91 paragraph_t *content_first_paragraph(const content_root_t *root); | |
| 92 paragraph_t *content_last_paragraph(const content_root_t *root); | |
| 93 | |
| 94 span_t *content_next_span(const content_t *node); | |
| 95 span_t *content_prev_span(const content_t *node); | |
| 96 line_t *content_next_line(const content_t *node); | |
| 97 line_t *content_prev_line(const content_t *node); | |
| 98 paragraph_t *content_next_paragraph(const content_t *node); | |
| 99 paragraph_t *content_prev_paragraph(const content_t *node); | |
| 100 | |
| 101 int content_count(content_root_t *root); | |
| 102 int content_count_images(content_root_t *root); | |
| 103 int content_count_spans(content_root_t *root); | |
| 104 int content_count_lines(content_root_t *root); | |
| 105 int content_count_paragraphs(content_root_t *root); | |
| 106 int content_count_tables(content_root_t *root); | |
| 107 | |
| 108 int content_new_root(extract_alloc_t *alloc, content_root_t **proot); | |
| 109 int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure); | |
| 110 int content_new_line(extract_alloc_t *alloc, line_t **pline); | |
| 111 int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph); | |
| 112 int content_new_table(extract_alloc_t *alloc, table_t **ptable); | |
| 113 int content_new_block(extract_alloc_t *alloc, block_t **pblock); | |
| 114 | |
| 115 int content_append_new_span(extract_alloc_t* alloc, content_root_t *root, span_t **pspan, structure_t *structure); | |
| 116 int content_append_new_line(extract_alloc_t* alloc, content_root_t *root, line_t **pline); | |
| 117 int content_append_new_paragraph(extract_alloc_t* alloc, content_root_t *root, paragraph_t **pparagraph); | |
| 118 int content_append_new_image(extract_alloc_t* alloc, content_root_t *root, image_t **pimage); | |
| 119 int content_append_new_table(extract_alloc_t* alloc, content_root_t *root, table_t **ptable); | |
| 120 int content_append_new_block(extract_alloc_t* alloc, content_root_t *root, block_t **pblock); | |
| 121 | |
| 122 void content_replace(content_t *current, content_t *replacement); | |
| 123 int content_replace_new_line(extract_alloc_t* alloc, content_t *current, line_t **pline); | |
| 124 int content_replace_new_paragraph(extract_alloc_t* alloc, content_t *current, paragraph_t **pparagraph); | |
| 125 int content_replace_new_block(extract_alloc_t* alloc, content_t *current, block_t **pblock); | |
| 126 | |
| 127 | |
| 128 void content_append(content_root_t *root, content_t *content); | |
| 129 void content_append_span(content_root_t *root, span_t *span); | |
| 130 void content_append_line(content_root_t *root, line_t *line); | |
| 131 void content_append_paragraph(content_root_t *root, paragraph_t *paragraph); | |
| 132 void content_append_table(content_root_t *root, table_t *table); | |
| 133 void content_append_block(content_root_t *root, block_t *block); | |
| 134 | |
| 135 void content_concat(content_root_t *dst, content_root_t *src); | |
| 136 | |
| 137 void content_dump(const content_root_t *content); | |
| 138 void content_dump_line(const line_t *line); | |
| 139 void content_dump_span(const span_t *span); | |
| 140 void content_dump_brief(const content_root_t *content); | |
| 141 | |
| 142 | |
| 143 typedef int (content_cmp_fn)(const content_t *, const content_t *); | |
| 144 | |
| 145 void content_sort(content_root_t *content, content_cmp_fn *cmp); | |
| 146 | |
| 147 /* To iterate over the line elements of a content list: | |
| 148 | |
| 149 content_line_iterator it; | |
| 150 line_t *line; | |
| 151 | |
| 152 for(line = content_line_iterator_init(&it, content); line != NULL; line = content_line_iterator_next(&it)) | |
| 153 { | |
| 154 } | |
| 155 | |
| 156 */ | |
| 157 /* Cursor for visiting only the paragraph nodes of one content list. */ | |
| 158 typedef struct { | |
| 159 content_root_t *root; /* List being walked; reaching its base node ends the iteration. */ | |
| 160 content_t *next; /* Node that content_paragraph_iterator_next() will examine first. */ | |
| 161 } content_paragraph_iterator; | |
| 162 /* Return the next paragraph in the list, skipping nodes of any other type, or NULL once the walk arrives back at the root. */ | |
| 163 static inline paragraph_t *content_paragraph_iterator_next(content_paragraph_iterator *it) | |
| 164 { | |
| 165 content_t *next; | |
| 166 /* it->next is stepped past each node before that node's type is tested, so the cursor never refers to the node being returned. */ | |
| 167 do { | |
| 168 next = it->next; | |
| 169 if (next == &it->root->base) | |
| 170 return NULL; | |
| 171 assert(next->type != content_root); | |
| 172 it->next = next->next; | |
| 173 } while (next->type != content_paragraph); | |
| 174 /* paragraph_t has its content_t base as first member, so the node pointer can be cast to the derived type. */ | |
| 175 return (paragraph_t *)next; | |
| 176 } | |
| 177 /* Begin iterating over the paragraphs of root's list; returns the first paragraph, or NULL if the list holds none. */ | |
| 178 static inline paragraph_t *content_paragraph_iterator_init(content_paragraph_iterator *it, content_root_t *root) | |
| 179 { | |
| 180 it->root = root; | |
| 181 it->next = root->base.next; | |
| 182 /* Start at the node after the root and let _next() do the type filtering. */ | |
| 183 return content_paragraph_iterator_next(it); | |
| 184 } | |
| 185 /* Cursor for visiting only the line nodes of one content list. */ | |
| 186 typedef struct { | |
| 187 content_root_t *root; /* List being walked; reaching its base node ends the iteration. */ | |
| 188 content_t *next; /* Node that content_line_iterator_next() will examine first. */ | |
| 189 } content_line_iterator; | |
| 190 /* Return the next line in the list, skipping nodes of any other type, or NULL once the walk arrives back at the root. */ | |
| 191 static inline line_t *content_line_iterator_next(content_line_iterator *it) | |
| 192 { | |
| 193 content_t *next; | |
| 194 /* The cursor is moved on before the type test, so it already points beyond whichever node ends up being returned. */ | |
| 195 do { | |
| 196 next = it->next; | |
| 197 if (next == &it->root->base) | |
| 198 return NULL; | |
| 199 assert(next->type != content_root); | |
| 200 it->next = next->next; | |
| 201 } while (next->type != content_line); | |
| 202 /* line_t starts with its content_t base, which is what makes this downcast valid. */ | |
| 203 return (line_t *)next; | |
| 204 } | |
| 205 /* Begin iterating over the lines of root's list; returns the first line, or NULL if the list holds none. */ | |
| 206 static inline line_t *content_line_iterator_init(content_line_iterator *it, content_root_t *root) | |
| 207 { | |
| 208 it->root = root; | |
| 209 it->next = root->base.next; | |
| 210 /* Position on the first node after the root; _next() skips anything that is not a line. */ | |
| 211 return content_line_iterator_next(it); | |
| 212 } | |
| 213 /* Cursor for visiting only the span nodes of one content list. */ | |
| 214 typedef struct { | |
| 215 content_root_t *root; /* List being walked; reaching its base node ends the iteration. */ | |
| 216 content_t *next; /* Node that content_span_iterator_next() will examine first. */ | |
| 217 } content_span_iterator; | |
| 218 /* Return the next span in the list, skipping nodes of any other type, or NULL once the walk arrives back at the root. */ | |
| 219 static inline span_t *content_span_iterator_next(content_span_iterator *it) | |
| 220 { | |
| 221 content_t *next; | |
| 222 /* Each pass consumes one node: the cursor is advanced first, then the consumed node's type decides whether to stop. */ | |
| 223 do { | |
| 224 next = it->next; | |
| 225 if (next == &it->root->base) | |
| 226 return NULL; | |
| 227 assert(next->type != content_root); | |
| 228 it->next = next->next; | |
| 229 } while (next->type != content_span); | |
| 230 /* span_t starts with its content_t base, so the node pointer is also the span pointer. */ | |
| 231 return (span_t *)next; | |
| 232 } | |
| 233 /* Begin iterating over the spans of root's list; returns the first span, or NULL if the list holds none. */ | |
| 234 static inline span_t *content_span_iterator_init(content_span_iterator *it, content_root_t *root) | |
| 235 { | |
| 236 it->root = root; | |
| 237 it->next = root->base.next; | |
| 238 /* The first candidate is the node following the root; _next() filters by type. */ | |
| 239 return content_span_iterator_next(it); | |
| 240 } | |
| 241 /* Cursor for visiting only the image nodes of one content list. */ | |
| 242 typedef struct { | |
| 243 content_root_t *root; /* List being walked; reaching its base node ends the iteration. */ | |
| 244 content_t *next; /* Node that content_image_iterator_next() will examine first. */ | |
| 245 } content_image_iterator; | |
| 246 /* Return the next image in the list, skipping nodes of any other type, or NULL once the walk arrives back at the root. */ | |
| 247 static inline image_t *content_image_iterator_next(content_image_iterator *it) | |
| 248 { | |
| 249 content_t *next; | |
| 250 /* The cursor moves past a node before the node is checked, so on return it->next is already the following sibling. */ | |
| 251 do { | |
| 252 next = it->next; | |
| 253 if (next == &it->root->base) | |
| 254 return NULL; | |
| 255 assert(next->type != content_root); | |
| 256 it->next = next->next; | |
| 257 } while (next->type != content_image); | |
| 258 /* image_t starts with its content_t base, which permits the cast below. */ | |
| 259 return (image_t *)next; | |
| 260 } | |
| 261 /* Begin iterating over the images of root's list; returns the first image, or NULL if the list holds none. */ | |
| 262 static inline image_t *content_image_iterator_init(content_image_iterator *it, content_root_t *root) | |
| 263 { | |
| 264 it->root = root; | |
| 265 it->next = root->base.next; | |
| 266 /* Begin with the node after the root and delegate the search to _next(). */ | |
| 267 return content_image_iterator_next(it); | |
| 268 } | |
| 269 /* Cursor for visiting only the table nodes of one content list. */ | |
| 270 typedef struct { | |
| 271 content_root_t *root; /* List being walked; reaching its base node ends the iteration. */ | |
| 272 content_t *next; /* Node that content_table_iterator_next() will examine first. */ | |
| 273 } content_table_iterator; | |
| 274 /* Return the next table in the list, skipping nodes of any other type, or NULL once the walk arrives back at the root. */ | |
| 275 static inline table_t *content_table_iterator_next(content_table_iterator *it) | |
| 276 { | |
| 277 content_t *next; | |
| 278 /* Advance-then-test: it->next is updated before the type comparison that ends the loop. */ | |
| 279 do { | |
| 280 next = it->next; | |
| 281 if (next == &it->root->base) | |
| 282 return NULL; | |
| 283 assert(next->type != content_root); | |
| 284 it->next = next->next; | |
| 285 } while (next->type != content_table); | |
| 286 /* table_t starts with its content_t base, so the cast yields the enclosing table. */ | |
| 287 return (table_t *)next; | |
| 288 } | |
| 289 /* Begin iterating over the tables of root's list; returns the first table, or NULL if the list holds none. */ | |
| 290 static inline table_t *content_table_iterator_init(content_table_iterator *it, content_root_t *root) | |
| 291 { | |
| 292 it->root = root; | |
| 293 it->next = root->base.next; | |
| 294 /* Start from the root's successor; _next() returns the first table it meets. */ | |
| 295 return content_table_iterator_next(it); | |
| 296 } | |
| 297 /* Cursor for visiting every node of one content list, whatever its type. */ | |
| 298 typedef struct { | |
| 299 content_root_t *root; /* List being walked; reaching its base node ends the iteration. */ | |
| 300 content_t *next; /* Node that content_iterator_next() will return. */ | |
| 301 } content_iterator; | |
| 302 /* Return the next node of the list regardless of type, or NULL once the walk arrives back at the root. */ | |
| 303 static inline content_t *content_iterator_next(content_iterator *it) | |
| 304 { | |
| 305 content_t *next = it->next; | |
| 306 /* The root's own base node doubles as the end-of-list sentinel. */ | |
| 307 if (next == &it->root->base) | |
| 308 return NULL; | |
| 309 assert(next->type != content_root); | |
| 310 it->next = next->next; | |
| 311 /* it->next now points at the following sibling, not at the node handed back. */ | |
| 312 return next; | |
| 313 } | |
| 314 /* Begin iterating over all nodes of root's list; returns the first node, or NULL if the root's next pointer is the root itself. */ | |
| 315 static inline content_t *content_iterator_init(content_iterator *it, content_root_t *root) | |
| 316 { | |
| 317 it->root = root; | |
| 318 it->next = root->base.next; | |
| 319 /* Fetch the first element through the normal stepping function. */ | |
| 320 return content_iterator_next(it); | |
| 321 } | |
| 322 | |
| 323 typedef struct | |
| 324 { | |
| 325 double x; | |
| 326 double y; | |
| 327 } point_t; | |
| 328 | |
| 329 const char *extract_point_string(const point_t *point); | |
| 330 | |
| 331 typedef struct | |
| 332 { | |
| 333 point_t min; | |
| 334 point_t max; | |
| 335 } rect_t; | |
| 336 | |
| 337 extern const rect_t extract_rect_infinite; | |
| 338 extern const rect_t extract_rect_empty; | |
| 339 | |
| 340 rect_t extract_rect_intersect(rect_t a, rect_t b); | |
| 341 | |
| 342 rect_t extract_rect_union(rect_t a, rect_t b); | |
| 343 | |
| 344 rect_t extract_rect_union_point(rect_t a, point_t b); | |
| 345 | |
| 346 int extract_rect_contains_rect(rect_t a, rect_t b); | |
| 347 | |
| 348 int extract_rect_valid(rect_t a); | |
| 349 | |
| 350 const char *extract_rect_string(const rect_t *rect); | |
| 351 | |
| 352 typedef struct | |
| 353 { | |
| 354 double a; | |
| 355 double b; | |
| 356 double c; | |
| 357 double d; | |
| 358 double e; | |
| 359 double f; | |
| 360 } matrix_t; | |
| 361 | |
| 362 typedef struct | |
| 363 { | |
| 364 double a; | |
| 365 double b; | |
| 366 double c; | |
| 367 double d; | |
| 368 } matrix4_t; | |
| 369 | |
| 370 const char *extract_matrix_string(const matrix_t *matrix); | |
| 371 const char *extract_matrix4_string(const matrix4_t *matrix); | |
| 372 | |
| 373 /* Returns a*d - b*c. */ | |
| 374 double extract_matrix_expansion(matrix4_t m); | |
| 375 | |
| 376 /* Returns the inverse of a matrix (or identity for degenerate). */ | |
| 377 matrix4_t extract_matrix4_invert(const matrix4_t *ctm); | |
| 378 | |
| 379 point_t extract_matrix4_transform_point(matrix4_t m, point_t p); | |
| 380 point_t extract_matrix4_transform_xy(matrix4_t m, double x, double y); | |
| 381 matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2); | |
| 382 matrix4_t extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2); | |
| 383 | |
| 384 /* Returns zero if first four members of *lhs and *rhs are equal, otherwise | |
| 385 +/-1. */ | |
| 386 int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs); | |
| 387 | |
| 388 /* A single char in a span. */ | |
| 389 typedef struct | |
| 390 { | |
| 391 /* (x,y) after transformation by ctm. */ | |
| 392 double x; | |
| 393 double y; | |
| 394 | |
| 395 unsigned ucs; | |
| 396 double adv; /* Advance, before transform by ctm */ | |
| 397 | |
| 398 rect_t bbox; | |
| 399 } char_t; | |
| 400 | |
| 401 /* List of chars that have same font and are usually adjacent. */ | |
| 402 struct span_t | |
| 403 { | |
| 404 content_t base; | |
| 405 matrix4_t ctm; | |
| 406 char *font_name; | |
| 407 rect_t font_bbox; | |
| 408 structure_t *structure; | |
| 409 | |
| 410 struct { | |
| 411 unsigned font_bold : 1; | |
| 412 unsigned font_italic : 1; | |
| 413 unsigned wmode : 1; | |
| 414 } flags; | |
| 415 | |
| 416 char_t *chars; | |
| 417 int chars_num; | |
| 418 }; | |
| 419 | |
| 420 void extract_span_init(span_t *span, structure_t *structure); | |
| 421 | |
| 422 /* Frees a span_t, returning with *pspan set to NULL. */ | |
| 423 void extract_span_free(extract_alloc_t *alloc, span_t **pspan); | |
| 424 | |
| 425 /* Returns last character in span. */ | |
| 426 char_t *extract_span_char_last(span_t *span); | |
| 427 | |
| 428 /* Appends new char_t to a span_t with .ucs=c and all other | |
| 429 fields zeroed. Returns pointer to new char_t record, or NULL if allocation | |
| 430 failed. */ | |
| 431 char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c); | |
| 432 | |
| 433 /* Returns static string containing info about span_t. */ | |
| 434 const char *extract_span_string(extract_alloc_t *alloc, span_t *span); | |
| 435 | |
| 436 /* List of spans that are aligned on same line. */ | |
| 437 struct line_t | |
| 438 { | |
| 439 content_t base; | |
| 440 double ascender; | |
| 441 double descender; | |
| 442 content_root_t content; | |
| 443 }; | |
| 444 | |
| 445 void extract_line_init(line_t *line); | |
| 446 | |
| 447 void extract_line_free(extract_alloc_t* alloc, line_t **pline); | |
| 448 | |
| 449 /* Returns first span in a line. */ | |
| 450 span_t *extract_line_span_first(line_t *line); | |
| 451 | |
| 452 /* Returns last span in a line. */ | |
| 453 span_t *extract_line_span_last(line_t *line); | |
| 454 | |
| 455 /* List of lines that are aligned and adjacent to each other so as to form a | |
| 456 paragraph. */ | |
| 457 struct paragraph_t | |
| 458 { | |
| 459 content_t base; | |
| 460 int line_flags; | |
| 461 content_root_t content; | |
| 462 }; | |
| 463 | |
| 464 typedef enum | |
| 465 { | |
| 466 /* If the paragraph is ever not aligned to the left hand edge, we set this flag. */ | |
| 467 paragraph_not_aligned_left = 1, | |
| 468 | |
| 469 /* If the paragraph is ever not aligned to the right hand edge, we set this flag. */ | |
| 470 paragraph_not_aligned_right = 2, | |
| 471 | |
| 472 /* If the paragraph ever has a line that doesn't look centred, we set this flag. */ | |
| 473 paragraph_not_centred = 4, | |
| 474 | |
| 475 /* If the paragraph ever has a line that doesn't look fully justified, we set this flag. */ | |
| 476 paragraph_not_fully_justified = 8, | |
| 477 | |
| 478 /* If the paragraph ever breaks at a place where it looks like first word from the | |
| 479 * next line could have fitted, then set this flag.*/ | |
| 480 paragraph_breaks_strangely = 16 | |
| 481 } paragraph_flags; | |
| 482 | |
| 483 void extract_paragraph_init(paragraph_t *paragraph); | |
| 484 | |
| 485 void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph); | |
| 486 | |
| 487 /* List of content that we believe should be treated as a whole. */ | |
| 488 struct block_t | |
| 489 { | |
| 490 content_t base; | |
| 491 content_root_t content; | |
| 492 }; | |
| 493 | |
| 494 void extract_block_init(block_t *block); | |
| 495 | |
| 496 void extract_block_free(extract_alloc_t *alloc, block_t **pblock); | |
| 497 | |
| 498 | |
| 499 | |
| 500 /* Information about an image. <type> is as passed to extract_add_image(); | |
| 501 <name> and <id> are created to be unique identifiers for use in generated docx | |
| 502 file. */ | |
| 503 struct image_t | |
| 504 { | |
| 505 content_t base; | |
| 506 char *type; /* jpg, png etc. */ | |
| 507 char *name; /* Name of image file within docx. */ | |
| 508 char *id; /* ID of image within docx. */ | |
| 509 double x; | |
| 510 double y; | |
| 511 double w; | |
| 512 double h; | |
| 513 void *data; | |
| 514 size_t data_size; | |
| 515 | |
| 516 extract_image_data_free *data_free; | |
| 517 void *data_free_handle; | |
| 518 }; | |
| 519 | |
| 520 void extract_image_init(image_t *image); | |
| 521 | |
| 522 void extract_image_clear(extract_alloc_t *alloc, image_t *image); | |
| 523 | |
| 524 void extract_image_free(extract_alloc_t *alloc, image_t **pimage); | |
| 525 | |
| 526 /* A line that is part of a table. */ | |
| 527 typedef struct | |
| 528 { | |
| 529 float color; | |
| 530 rect_t rect; | |
| 531 } tableline_t; | |
| 532 | |
| 533 typedef struct | |
| 534 { | |
| 535 tableline_t *tablelines; | |
| 536 int tablelines_num; | |
| 537 } tablelines_t; | |
| 538 | |
| 539 | |
| 540 /* A cell within a table. */ | |
| 541 typedef struct | |
| 542 { | |
| 543 rect_t rect; | |
| 544 | |
| 545 /* If left/above is true, this cell is not obscured by cell to its | |
| 546 * left/above. */ | |
| 547 uint8_t left; | |
| 548 uint8_t above; | |
| 549 | |
| 550 /* extend_right and extend_down are 1 for normal cells, 2 for cells which | |
| 551 * extend right/down to cover an additional column/row, 3 to cover two | |
| 552 * additional columns/rows etc. */ | |
| 553 int extend_right; | |
| 554 int extend_down; | |
| 555 | |
| 556 /* Contents of this cell. */ | |
| 557 content_root_t content; | |
| 558 } cell_t; | |
| 559 | |
| 560 void extract_cell_init(cell_t *cell); | |
| 561 void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell); | |
| 562 void extract_table_init(table_t *table); | |
| 563 | |
| 564 struct table_t | |
| 565 { | |
| 566 content_t base; | |
| 567 point_t pos; /* top-left. */ | |
| 568 | |
| 569 /* Array of cells_num_x*cells_num_y cells; cell (x, y) is: | |
| 570 * cells_num_x * y + x. | |
| 571 */ | |
| 572 cell_t **cells; | |
| 573 int cells_num_x; | |
| 574 int cells_num_y; | |
| 575 }; | |
| 576 | |
| 577 void extract_table_free(extract_alloc_t *alloc, table_t **ptable); | |
| 578 | |
| 579 typedef enum | |
| 580 { | |
| 581 SPLIT_NONE = 0, | |
| 582 SPLIT_HORIZONTAL, | |
| 583 SPLIT_VERTICAL | |
| 584 } split_type_t; | |
| 585 | |
| 586 | |
| 587 typedef struct split_t | |
| 588 { | |
| 589 split_type_t type; | |
| 590 double weight; | |
| 591 int count; | |
| 592 struct split_t *split[1]; | |
| 593 } split_t; | |
| 594 | |
| 595 struct structure_t | |
| 596 { | |
| 597 structure_t *parent; | |
| 598 structure_t *sibling_next; | |
| 599 structure_t *sibling_prev; | |
| 600 structure_t *kids_first; | |
| 601 structure_t **kids_tail; | |
| 602 int uid; | |
| 603 extract_struct_t type; | |
| 604 int score; | |
| 605 }; | |
| 606 | |
| 607 /* A subpage. Contains different representations of the list of spans. */ | |
| 608 typedef struct | |
| 609 { | |
| 610 rect_t mediabox; | |
| 611 | |
| 612 int images_num; | |
| 613 | |
| 614 /* All the content on the page. */ | |
| 615 content_root_t content; | |
| 616 | |
| 617 tablelines_t tablelines_horizontal; | |
| 618 tablelines_t tablelines_vertical; | |
| 619 | |
| 620 content_root_t tables; | |
| 621 } subpage_t; | |
| 622 | |
| 623 | |
| 624 /* A page. Contains a list of subpages. NB not | |
| 625 called page_t because this clashes with a system type on hpux. */ | |
| 626 typedef struct | |
| 627 { | |
| 628 rect_t mediabox; | |
| 629 | |
| 630 subpage_t **subpages; | |
| 631 int subpages_num; | |
| 632 | |
| 633 split_t *split; | |
| 634 } extract_page_t; | |
| 635 | |
| 636 | |
| 637 /* A list of pages. */ | |
| 638 typedef struct | |
| 639 { | |
| 640 extract_page_t **pages; | |
| 641 int pages_num; | |
| 642 | |
| 643 /* All the structure for the document. */ | |
| 644 structure_t *structure; | |
| 645 | |
| 646 /* During construction, current points to the current point | |
| 647 * within the structure tree where things should be added. */ | |
| 648 structure_t *current; | |
| 649 } document_t; | |
| 650 | |
| 651 | |
| 652 typedef struct | |
| 653 { | |
| 654 image_t **images; | |
| 655 int images_num; | |
| 656 char **imagetypes; | |
| 657 int imagetypes_num; | |
| 658 } images_t; | |
| 659 | |
| 660 | |
| 661 /* This does all the work of finding paragraphs and tables. */ | |
| 662 int extract_document_join(extract_alloc_t *alloc, document_t *document, int layout_analysis, double master_space_guess); | |
| 663 | |
| 664 double extract_font_size(matrix4_t *ctm); | |
| 665 | |
| 666 /* Things below here are used when generating output. */ | |
| 667 | |
| 668 /* Basic information about current font. */ | |
| 669 typedef struct | |
| 670 { | |
| 671 char *name; | |
| 672 double size; | |
| 673 int bold; | |
| 674 int italic; | |
| 675 } font_t; | |
| 676 | |
| 677 /* Used to keep track of font information when writing paragraphs of odt | |
| 678 content, e.g. so we know whether a font has changed so need to start a new odt | |
| 679 span. */ | |
| 680 typedef struct | |
| 681 { | |
| 682 font_t font; | |
| 683 matrix4_t *ctm_prev; | |
| 684 } content_state_t; | |
| 685 | |
| 686 /* Analyse page content for layouts. */ | |
| 687 int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page); | |
| 688 | |
| 689 /* subpage_t constructor. */ | |
| 690 int extract_subpage_alloc(extract_alloc_t *extract, rect_t mediabox, extract_page_t *page, subpage_t **psubpage); | |
| 691 | |
| 692 /* subpage_t destructor. */ | |
| 693 void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage); | |
| 694 | |
| 695 /* Allocate a split_t. */ | |
| 696 int extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit); | |
| 697 | |
| 698 void extract_split_free(extract_alloc_t *alloc, split_t **psplit); | |
| 699 /* Cursor for a walk that descends into the content lists nested inside lines and paragraphs. */ | |
| 700 typedef struct { | |
| 701 content_root_t *root; /* Root the walk was started from; stored by _init() but not read by content_tree_iterator_next(). */ | |
| 702 content_t *next; /* Node (possibly a nested root) at which the next step resumes. */ | |
| 703 } content_tree_iterator; | |
| 704 /* Step a tree walk: return the next node, parents before their children, or NULL when the walk climbs out of a root whose parent is NULL. */ | |
| 705 static inline content_t *content_tree_iterator_next(content_tree_iterator *it) | |
| 706 { | |
| 707 content_t *next = it->next; | |
| 708 /* Arriving at a root means the current sibling list is exhausted: resume after the node that owns that list. it->root is not consulted, so only a NULL parent stops the climb. */ | |
| 709 while (next->type == content_root) | |
| 710 { | |
| 711 content_t *parent = ((content_root_t *)next)->parent; | |
| 712 if (parent == NULL) | |
| 713 return NULL; | |
| 714 next = parent->next; | |
| 715 } | |
| 716 assert(next->type != content_root); | |
| 717 /* Pick where the following call resumes: inside the child list for lines and paragraphs, at the sibling for spans. */ | |
| 718 switch (next->type) | |
| 719 { | |
| 720 default: | |
| 721 case content_root: | |
| 722 assert("Never happens!" == NULL); | |
| 723 it->next = next->next; break; /* Image, table and block nodes land here. With NDEBUG the assert is compiled out, so still advance to the sibling; otherwise the same node would be returned on every call. */ | |
| 724 case content_span: | |
| 725 it->next = next->next; | |
| 726 break; | |
| 727 case content_line: | |
| 728 it->next = ((line_t *)next)->content.base.next; | |
| 729 break; | |
| 730 case content_paragraph: | |
| 731 it->next = ((paragraph_t *)next)->content.base.next; | |
| 732 break; | |
| 733 } | |
| 734 /* The node itself is handed back before any of its children. */ | |
| 735 return next; | |
| 736 } | |
| 737 /* Begin a tree walk at the first node of root's list and return the first node visited (NULL if the walk ends immediately). */ | |
| 738 static inline content_t *content_tree_iterator_init(content_tree_iterator *it, content_root_t *root) | |
| 739 { | |
| 740 it->root = root; | |
| 741 it->next = root->base.next; | |
| 742 /* NOTE(review): if root's list is empty and root->parent is non-NULL, _next() continues with the parent's siblings rather than stopping - confirm callers expect this. */ | |
| 743 return content_tree_iterator_next(it); | |
| 744 } | |
| 745 | |
| 746 /* Some helper functions */ | |
| 747 | |
| 748 /* Return the first element of a content list as a span_t *. The assert requires root to be a content_root whose next pointer is either NULL or a span. NOTE(review): the list description at the top of this header says an empty root points at itself rather than at NULL; such a root would fail the assert, and with NDEBUG the root itself would be returned cast to span_t * - confirm intent. */ | |
| 749 static inline span_t *content_head_as_span(content_root_t *root) | |
| 750 { | |
| 751 assert(root != NULL && root->base.type == content_root && (root->base.next == NULL || root->base.next->type == content_span)); | |
| 752 return (span_t *)root->base.next; | |
| 753 } | |
| 754 | |
| 755 /* Return a point for the post-advance position of a char in a given span. */ | |
| 756 point_t extract_predicted_end_of_char(char_t *char_, const span_t *span); | |
| 757 | |
| 758 /* Return a point for the post-advance position of the final char in a given span. */ | |
| 759 point_t extract_end_of_span(const span_t *span); | |
| 760 | |
| 761 /* Return the bounds for a block before it was rotated around its origin. */ | |
| 762 rect_t extract_block_pre_rotation_bounds(block_t *block, double rotate); | |
| 763 | |
| 764 double extract_baseline_angle(const matrix4_t *ctm); | |
| 765 | |
| 766 #endif |
