Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/extract/src/document.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line source
#include "document.h" #include "outf.h" #include <assert.h> #include <stdio.h> #include <string.h> void content_init(content_t *content, content_type_t type) { content->type = type; content->next = content->prev = (type == content_root) ? content : NULL; } void content_init_root(content_root_t *content, content_t *parent) { content->base.type = content_root; content->base.next = content->base.prev = &content->base; content->parent = parent; } void content_unlink(content_t *content) { if (content == NULL) return; assert(content->type != content_root); if (content->prev == NULL) { assert(content->next == NULL); /* Already unlinked */ } else { assert(content->next != content && content->prev != content); content->prev->next = content->next; content->next->prev = content->prev; content->next = content->prev = NULL; } } void content_unlink_span(span_t *span) { content_unlink(&span->base); } void extract_span_init(span_t *span, structure_t *structure) { static const span_t blank = { 0 }; *span = blank; content_init(&span->base, content_span); span->structure = structure; } void extract_span_free(extract_alloc_t *alloc, span_t **pspan) { if (*pspan == NULL) return; content_unlink(&(*pspan)->base); extract_free(alloc, &(*pspan)->font_name); extract_free(alloc, &(*pspan)->chars); extract_free(alloc, pspan); } void extract_line_init(line_t *line) { static const line_t blank = { 0 }; *line = blank; content_init(&line->base, content_line); content_init_root(&line->content, &line->base); } void extract_paragraph_init(paragraph_t *paragraph) { static const paragraph_t blank = { 0 }; *paragraph = blank; content_init(¶graph->base, content_paragraph); content_init_root(¶graph->content, ¶graph->base); } void extract_block_init(block_t *block) { static const block_t blank = { 0 }; *block = blank; content_init(&block->base, content_block); content_init_root(&block->content, &block->base); } void extract_table_init(table_t *table) { static const table_t blank = { 0 }; *table = blank; content_init(&table->base, content_table); } void extract_image_init(image_t *image) { static const image_t blank = { 0 }; *image = blank; content_init(&image->base, content_image); } void content_clear(extract_alloc_t *alloc, content_root_t *proot) { content_t *content, *next; assert(proot->base.type == content_root && proot->base.next != NULL && proot->base.prev != NULL); for (content = proot->base.next; content != &proot->base; content = next) { assert(content->type != content_root); next = content->next; switch (content->type) { default: case content_root: assert("This never happens" == NULL); break; case content_span: extract_span_free(alloc, (span_t **)&content); break; case content_line: extract_line_free(alloc, (line_t **)&content); break; case content_paragraph: extract_paragraph_free(alloc, (paragraph_t **)&content); break; case content_block: extract_block_free(alloc, (block_t **)&content); break; case content_table: extract_table_free(alloc, (table_t **)&content); break; case content_image: extract_image_free(alloc, (image_t **)&content); break; } } } int content_count(content_root_t *root) { int n = 0; content_t *s; for (s = root->base.next; s != &root->base; s = s->next) n++; return n; } static int content_count_type(content_root_t *root, content_type_t type) { int n = 0; content_t *s; for (s = root->base.next; s != &root->base; s = s->next) if (s->type == type) n++; return n; } int content_count_spans(content_root_t *root) { return content_count_type(root, content_span); } int content_count_images(content_root_t *root) { return content_count_type(root, content_image); } int content_count_lines(content_root_t *root) { return content_count_type(root, content_line); } int content_count_paragraphs(content_root_t *root) { return content_count_type(root, content_paragraph); } int content_count_tables(content_root_t *root) { return content_count_type(root, content_table); } static content_t * content_first_of_type(const content_root_t *root, content_type_t type) { content_t *content; assert(root && root->base.type == content_root); for (content = root->base.next; content != &root->base; content = content->next) { if (content->type == type) return content; } return NULL; } static content_t * content_last_of_type(const content_root_t *root, content_type_t type) { content_t *content; assert(root && root->base.type == content_root); for (content = root->base.prev; content != &root->base; content = content->prev) { if (content->type == type) return content; } return NULL; } span_t *content_first_span(const content_root_t *root) { return (span_t *)content_first_of_type(root, content_span); } span_t *content_last_span(const content_root_t *root) { return (span_t *)content_last_of_type(root, content_span); } line_t *content_first_line(const content_root_t *root) { return (line_t *)content_first_of_type(root, content_line); } line_t *content_last_line(const content_root_t *root) { return (line_t *)content_last_of_type(root, content_line); } paragraph_t *content_first_paragraph(const content_root_t *root) { return (paragraph_t *)content_first_of_type(root, content_paragraph); } paragraph_t *content_last_paragraph(const content_root_t *root) { return (paragraph_t *)content_last_of_type(root, content_paragraph); } static content_t * content_next_of_type(const content_t *node, content_type_t type) { content_t *content; assert(node && node->type != content_root); for (content = node->next; content->type != content_root; content = content->next) { if (content->type == type) return content; } return NULL; } static content_t * content_prev_of_type(const content_t *node, content_type_t type) { content_t *content; assert(node && node->type != content_root); for (content = node->prev; content->type != content_root; content = content->prev) { if (content->type == type) return content; } return NULL; } span_t *content_next_span(const content_t *root) { return (span_t *)content_next_of_type(root, content_span); } span_t *content_prev_span(const content_t *root) { return (span_t *)content_prev_of_type(root, content_span); } line_t *content_next_line(const content_t *root) { return (line_t *)content_next_of_type(root, content_line); } line_t *content_prev_line(const content_t *root) { return (line_t *)content_prev_of_type(root, content_line); } paragraph_t *content_next_paragraph(const content_t *root) { return (paragraph_t *)content_next_of_type(root, content_paragraph); } paragraph_t *content_prev_paragraph(const content_t *root) { return (paragraph_t *)content_prev_of_type(root, content_paragraph); } void content_concat(content_root_t *dst, content_root_t *src) { content_t *walk, *walk_next; if (src == NULL) return; for (walk = src->base.next; walk != &src->base; walk = walk_next) { walk_next = walk->next; content_append(dst, walk); } } void extract_line_free(extract_alloc_t* alloc, line_t **pline) { line_t *line = *pline; content_unlink(&line->base); content_clear(alloc, &line->content); extract_free(alloc, pline); } void extract_image_clear(extract_alloc_t *alloc, image_t *image) { extract_free(alloc, &image->type); extract_free(alloc, &image->name); extract_free(alloc, &image->id); if (image->data_free) { image->data_free(image->data_free_handle, image->data); image->data_free = NULL; image->data_free_handle = NULL; image->data = NULL; } } void extract_image_free(extract_alloc_t *alloc, image_t **pimage) { if (*pimage == NULL) return; extract_image_clear(alloc, *pimage); extract_free(alloc, pimage); } void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell) { cell_t *cell = *pcell; if (cell == NULL) return; content_clear(alloc, &cell->content); extract_free(alloc, pcell); } int extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit) { split_t *split; if (extract_malloc(alloc, psplit, sizeof(*split) + (count-1) * sizeof(split_t *))) return -1; split = *psplit; split->type = type; split->weight = 0; split->count = count; memset(&split->split[0], 0, sizeof(split_t *) * count); return 0; } void extract_split_free(extract_alloc_t *alloc, split_t **psplit) { int i; split_t *split = *psplit; if (split == NULL) return; for (i = 0; i < split->count; i++) extract_split_free(alloc, &split->split[i]); extract_free(alloc, psplit); } static void space_prefix(int depth) { while (depth-- > 0) putc(' ', stdout); } static void content_dump_aux(const content_root_t *content, int depth); static void dump_span(const span_t *span, int depth) { int i; for (i = 0; i < span->chars_num; i++) { char_t *c = &span->chars[i]; space_prefix(depth); printf("<char ucs=\""); if (c->ucs >= 32 && c->ucs <= 127) putc((char)c->ucs, stdout); else printf("<%04x>", c->ucs); printf("\" x=%f y=%f adv=%f />\n", c->x, c->y, c->adv); } } static void dump_structure_path(structure_t *structure) { if (structure->parent) { dump_structure_path(structure->parent); printf("/"); } printf("%s(%d)", extract_struct_string(structure->type), structure->uid); } static void content_dump_span_aux(const span_t *span, int depth) { space_prefix(depth); printf("<span ctm=[%f %f %f %f]\n", span->ctm.a, span->ctm.b, span->ctm.c, span->ctm.d); if (span->structure) { space_prefix(depth); printf(" structure=\""); dump_structure_path(span->structure); printf("\"\n"); } space_prefix(depth); printf(" font-name=\"%s\" font_bbox=[%f %f %f %f]>\n", span->font_name, span->font_bbox.min.x, span->font_bbox.min.y, span->font_bbox.max.x, span->font_bbox.max.y); dump_span(span, depth+1); space_prefix(depth); printf("</span>\n"); } void content_dump_span(const span_t *span) { content_dump_span_aux(span, 0); } static void content_dump_line_aux(const line_t *line, int depth) { span_t *span0 = content_first_span(&line->content); span_t *span1 = content_last_span(&line->content); char_t *char0 = (span0 && span0->chars_num > 0) ? &span0->chars[0] : NULL; char_t *char1 = (span1 && span1->chars_num > 0) ? &span1->chars[span1->chars_num-1] : NULL; space_prefix(depth); printf("<line"); if (char0 && char1) { printf(" x0=%g y0=%g x1=%g y1=%g\n", char0->x, char0->y, char1->x, char1->y); } content_dump_aux(&line->content, depth+1); space_prefix(depth); printf("</line>\n"); } void content_dump_line(const line_t *line) { content_dump_line_aux(line, 0); } static void content_dump_aux(const content_root_t *content, int depth) { const content_t *walk; assert(content->base.type == content_root); for (walk = content->base.next; walk != &content->base; walk = walk->next) { assert(walk->next->prev == walk && walk->prev->next == walk); switch (walk->type) { case content_span: content_dump_span_aux((const span_t *)walk, depth); break; case content_line: content_dump_line_aux((const line_t *)walk, depth); break; case content_paragraph: space_prefix(depth); printf("<paragraph>\n"); content_dump_aux(&((const paragraph_t *)walk)->content, depth+1); space_prefix(depth); printf("</paragraph>\n"); break; case content_block: space_prefix(depth); printf("<block>\n"); content_dump_aux(&((const block_t *)walk)->content, depth+1); space_prefix(depth); printf("</block>\n"); break; case content_table: { const table_t *table = (const table_t *)walk; int i, j, k; space_prefix(depth); printf("<table w=%d h=%d>\n", table->cells_num_x, table->cells_num_y); k = 0; for (j = 0; j < table->cells_num_y; j++) { for (i = 0; i < table->cells_num_x; i++) { space_prefix(depth+1); printf("<cell>\n"); content_dump_aux(&table->cells[k]->content, depth+2); space_prefix(depth+1); printf("</cell>\n"); k++; } } space_prefix(depth); printf("</table>\n"); break; } case content_image: space_prefix(depth); printf("<image/>\n"); break; default: assert("Unexpected type found while dumping content list." == NULL); break; } } } void content_dump(const content_root_t *content) { content_dump_aux(content, 0); } static void content_dump_brief_aux(const content_root_t *content, int depth); static void content_dump_brief_span_aux(const span_t *span) { int i; printf("\""); for (i = 0; i < span->chars_num; i++) { char_t *c = &span->chars[i]; if (c->ucs >= 32 && c->ucs <= 127) putc((char)c->ucs, stdout); else printf("<%04x>", c->ucs); } printf("\""); } static void content_dump_brief_line_aux(const line_t *line, int depth) { printf("<line text="); content_dump_brief_aux(&line->content, depth+1); printf(">\n"); } static void content_dump_brief_aux(const content_root_t *content, int depth) { const content_t *walk; assert(content->base.type == content_root); for (walk = content->base.next; walk != &content->base; walk = walk->next) { assert(walk->next->prev == walk && walk->prev->next == walk); switch (walk->type) { case content_span: content_dump_brief_span_aux((const span_t *)walk); break; case content_line: content_dump_brief_line_aux((const line_t *)walk, depth); break; case content_paragraph: content_dump_brief_aux(&((const paragraph_t *)walk)->content, depth+1); break; case content_block: content_dump_brief_aux(&((const block_t *)walk)->content, depth+1); break; case content_table: { const table_t *table = (const table_t *)walk; int i, j, k; k = 0; for (j = 0; j < table->cells_num_y; j++) { for (i = 0; i < table->cells_num_x; i++) { content_dump_brief_aux(&table->cells[k]->content, depth+2); k++; } } break; } case content_image: break; default: assert("Unexpected type found while dumping content list." == NULL); break; } } } void content_dump_brief(const content_root_t *content) { content_dump_brief_aux(content, 0); } static content_t * cmp_and_merge(content_t *q1, int q1pos, int len1, int n, content_cmp_fn *cmp) { int len2 = q1pos + len1*2; /* end of both lists assuming we don't overrun */ int p; content_t *q2 = q1; /* Don't overrun the end, and then convert to length from end. */ if (len2 > n) len2 = n; len2 -= q1pos + len1; if (len2 <= 0) len1 += len2; /* Find the start of q2. We know this fits. */ for (p = 0; p < len1; p++) q2 = q2->next; if (len2 <= 0) return q2; /* So we have [q1..(q1+len1)) as the first list to merge, and * [q2..q2+len2)) as the second list to merge. We know that * q2 = q1+len1. So, if we can reduce len1 or len2 to 0, we have * the lists sorted. */ while (1) { if (cmp(q1, q2) > 0) { /* q2 is smaller. q2 should be before q1. Move it. */ /* So: * a<->q1<->c..d<->q2<->b => a<->q2<->q1<->c..d<->b * (where c and d can either be the same, or can be q2 and q1!) */ content_t *a = q1->prev; content_t *b = q2->next; content_t *d = q2->prev; d->next = b; b->prev = d; a->next = q2; q2->prev = a; q2->next = q1; q1->prev = q2; /* Now advance q2 */ q2 = b; len2--; if (len2 == 0) break; } else { /* Advance q1 */ q1 = q1->next; len1--; if (len1 == 0) break; } } while (len2) { q2 = q2->next; len2--; } return q2; } /* Spiffy in-place merge sort. */ void content_sort(content_root_t *content, content_cmp_fn *cmp) { int n = content_count(content); int size; for (size = 1; size < n; size <<= 1) { int q1_idx = 0; content_t *q1 = content->base.next; for (q1_idx = 0; q1_idx < n; q1_idx += size*2) q1 = cmp_and_merge(q1, q1_idx, size, n, cmp); assert(q1->type == content_root); } assert(content_count(content) == n); }
