Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/json.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 /* These extract_json_*() functions generate json content data. */ | |
| 2 | |
| 3 #include "extract/extract.h" | |
| 4 | |
| 5 #include "astring.h" | |
| 6 #include "document.h" | |
| 7 #include "html.h" | |
| 8 #include "mem.h" | |
| 9 #include "memento.h" | |
| 10 #include "outf.h" | |
| 11 #include "sys.h" | |
| 12 #include "text.h" | |
| 13 #include "zip.h" | |
| 14 | |
| 15 #include <assert.h> | |
| 16 #include <errno.h> | |
| 17 #include <float.h> | |
| 18 #include <math.h> | |
| 19 #include <stdlib.h> | |
| 20 #include <stdio.h> | |
| 21 #include <string.h> | |
| 22 | |
| 23 #include <sys/stat.h> | |
| 24 | |
| 25 | |
| 26 | |
| 27 static int osp(extract_alloc_t *alloc, extract_astring_t *content, structure_t *structure) | |
| 28 { | |
| 29 if (structure->parent) | |
| 30 { | |
| 31 if (osp(alloc, content, structure->parent) || | |
| 32 extract_astring_catc(alloc, content, '\\')) | |
| 33 return -1; | |
| 34 } | |
| 35 | |
| 36 if (structure->uid != 0) | |
| 37 { | |
| 38 if (extract_astring_catf(alloc, content, "%s[%d]", extract_struct_string(structure->type), structure->uid)) | |
| 39 return -1; | |
| 40 } | |
| 41 else | |
| 42 { | |
| 43 if (extract_astring_catf(alloc, content, "%s", extract_struct_string(structure->type))) | |
| 44 return -1; | |
| 45 } | |
| 46 | |
| 47 return 0; | |
| 48 } | |
| 49 | |
| 50 static int output_structure_path(extract_alloc_t *alloc, extract_astring_t *content, structure_t *structure) | |
| 51 { | |
| 52 if (structure == NULL) | |
| 53 return 0; | |
| 54 | |
| 55 if (extract_astring_cat(alloc, content, ",\n\"Path\" : \"") || | |
| 56 osp(alloc, content, structure) || | |
| 57 extract_astring_cat(alloc, content, "\"")) | |
| 58 return -1; | |
| 59 return 0; | |
| 60 } | |
| 61 | |
| 62 static int flush(extract_alloc_t *alloc, extract_astring_t *content, span_t *span, structure_t *structure, extract_astring_t *text, rect_t *bbox) | |
| 63 { | |
| 64 if (span == NULL) | |
| 65 return 0; | |
| 66 if (content->chars_num) | |
| 67 if (extract_astring_cat(alloc, content, ",\n")) | |
| 68 return -1; | |
| 69 if (extract_astring_catf(alloc, content, "{\n\"Bounds\": [ %f, %f, %f, %f ],\n\"Text\": \"", bbox->min.x, bbox->min.y, bbox->max.x, bbox->max.y) || | |
| 70 extract_astring_catl(alloc, content, text->chars, text->chars_num) || | |
| 71 extract_astring_catf(alloc, content, "\",\n\"Font\": { \"family_name\": \"%s\" },\n\"TextSize\": %g", span->font_name, extract_font_size(&span->ctm))) | |
| 72 return -1; | |
| 73 if (output_structure_path(alloc, content, structure)) | |
| 74 return -1; | |
| 75 if (extract_astring_cat(alloc, content, "\n}")) | |
| 76 return -1; | |
| 77 extract_astring_free(alloc, text); | |
| 78 *bbox = extract_rect_empty; | |
| 79 | |
| 80 return 0; | |
| 81 } | |
| 82 | |
| 83 int extract_document_to_json_content( | |
| 84 extract_alloc_t *alloc, | |
| 85 document_t *document, | |
| 86 int rotation, | |
| 87 int images, | |
| 88 extract_astring_t *content) | |
| 89 { | |
| 90 int ret = -1; | |
| 91 int n; | |
| 92 content_tree_iterator cti; | |
| 93 extract_astring_t text; | |
| 94 | |
| 95 (void) rotation; | |
| 96 (void) images; | |
| 97 | |
| 98 extract_astring_init(&text); | |
| 99 //extract_astring_cat(alloc, content, "<html>\n"); | |
| 100 //extract_astring_cat(alloc, content, "<body>\n"); | |
| 101 | |
| 102 /* Write paragraphs into <content>. */ | |
| 103 for (n=0; n<document->pages_num; ++n) | |
| 104 { | |
| 105 int i; | |
| 106 extract_page_t *page = document->pages[n]; | |
| 107 subpage_t **psubpage = page->subpages; | |
| 108 | |
| 109 for (i=0; i<page->subpages_num; ++i) | |
| 110 { | |
| 111 content_t *cont; | |
| 112 structure_t *structure = NULL; | |
| 113 span_t *last_span = NULL; | |
| 114 rect_t bbox = extract_rect_empty; | |
| 115 | |
| 116 for (cont = content_tree_iterator_init(&cti, &psubpage[i]->content); cont != NULL; cont = content_tree_iterator_next(&cti)) | |
| 117 { | |
| 118 switch (cont->type) | |
| 119 { | |
| 120 case content_span: | |
| 121 { | |
| 122 int j; | |
| 123 span_t *span = (span_t *)cont; | |
| 124 if (last_span && | |
| 125 (structure != span->structure || | |
| 126 last_span->flags.font_bold != span->flags.font_bold || | |
| 127 last_span->flags.font_italic != span->flags.font_italic || | |
| 128 last_span->flags.wmode != span->flags.wmode || | |
| 129 strcmp(last_span->font_name, span->font_name))) | |
| 130 { | |
| 131 // flush stored text. | |
| 132 flush(alloc, content, last_span, structure, &text, &bbox); | |
| 133 } | |
| 134 last_span = span; | |
| 135 structure = span->structure; | |
| 136 for (j = 0; j < span->chars_num; j++) | |
| 137 { | |
| 138 if (span->chars[j].ucs == (unsigned int)-1) | |
| 139 continue; | |
| 140 if (extract_astring_catc_unicode(alloc, &text, span->chars[j].ucs, 1, 0, 0, 0)) | |
| 141 goto end; | |
| 142 bbox = extract_rect_union(bbox, span->chars[j].bbox); | |
| 143 } | |
| 144 break; | |
| 145 } | |
| 146 case content_image: | |
| 147 case content_table: | |
| 148 case content_block: | |
| 149 case content_line: | |
| 150 case content_paragraph: | |
| 151 // Nothing to do for lines and paragraphs as they just enclose spans. | |
| 152 // Nothing to do for the others for now. | |
| 153 break; | |
| 154 default: | |
| 155 assert("This should never happen\n" == NULL); | |
| 156 break; | |
| 157 } | |
| 158 } | |
| 159 flush(alloc, content, last_span, structure, &text, &bbox); | |
| 160 } | |
| 161 } | |
| 162 | |
| 163 ret = 0; | |
| 164 end: | |
| 165 | |
| 166 extract_astring_free(alloc, &text); | |
| 167 | |
| 168 return ret; | |
| 169 } |
