comparison mupdf-source/thirdparty/extract/src/json.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 /* These extract_json_*() functions generate json content data. */
2
3 #include "extract/extract.h"
4
5 #include "astring.h"
6 #include "document.h"
7 #include "html.h"
8 #include "mem.h"
9 #include "memento.h"
10 #include "outf.h"
11 #include "sys.h"
12 #include "text.h"
13 #include "zip.h"
14
15 #include <assert.h>
16 #include <errno.h>
17 #include <float.h>
18 #include <math.h>
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22
23 #include <sys/stat.h>
24
25
26
27 static int osp(extract_alloc_t *alloc, extract_astring_t *content, structure_t *structure)
28 {
29 if (structure->parent)
30 {
31 if (osp(alloc, content, structure->parent) ||
32 extract_astring_catc(alloc, content, '\\'))
33 return -1;
34 }
35
36 if (structure->uid != 0)
37 {
38 if (extract_astring_catf(alloc, content, "%s[%d]", extract_struct_string(structure->type), structure->uid))
39 return -1;
40 }
41 else
42 {
43 if (extract_astring_catf(alloc, content, "%s", extract_struct_string(structure->type)))
44 return -1;
45 }
46
47 return 0;
48 }
49
50 static int output_structure_path(extract_alloc_t *alloc, extract_astring_t *content, structure_t *structure)
51 {
52 if (structure == NULL)
53 return 0;
54
55 if (extract_astring_cat(alloc, content, ",\n\"Path\" : \"") ||
56 osp(alloc, content, structure) ||
57 extract_astring_cat(alloc, content, "\""))
58 return -1;
59 return 0;
60 }
61
62 static int flush(extract_alloc_t *alloc, extract_astring_t *content, span_t *span, structure_t *structure, extract_astring_t *text, rect_t *bbox)
63 {
64 if (span == NULL)
65 return 0;
66 if (content->chars_num)
67 if (extract_astring_cat(alloc, content, ",\n"))
68 return -1;
69 if (extract_astring_catf(alloc, content, "{\n\"Bounds\": [ %f, %f, %f, %f ],\n\"Text\": \"", bbox->min.x, bbox->min.y, bbox->max.x, bbox->max.y) ||
70 extract_astring_catl(alloc, content, text->chars, text->chars_num) ||
71 extract_astring_catf(alloc, content, "\",\n\"Font\": { \"family_name\": \"%s\" },\n\"TextSize\": %g", span->font_name, extract_font_size(&span->ctm)))
72 return -1;
73 if (output_structure_path(alloc, content, structure))
74 return -1;
75 if (extract_astring_cat(alloc, content, "\n}"))
76 return -1;
77 extract_astring_free(alloc, text);
78 *bbox = extract_rect_empty;
79
80 return 0;
81 }
82
83 int extract_document_to_json_content(
84 extract_alloc_t *alloc,
85 document_t *document,
86 int rotation,
87 int images,
88 extract_astring_t *content)
89 {
90 int ret = -1;
91 int n;
92 content_tree_iterator cti;
93 extract_astring_t text;
94
95 (void) rotation;
96 (void) images;
97
98 extract_astring_init(&text);
99 //extract_astring_cat(alloc, content, "<html>\n");
100 //extract_astring_cat(alloc, content, "<body>\n");
101
102 /* Write paragraphs into <content>. */
103 for (n=0; n<document->pages_num; ++n)
104 {
105 int i;
106 extract_page_t *page = document->pages[n];
107 subpage_t **psubpage = page->subpages;
108
109 for (i=0; i<page->subpages_num; ++i)
110 {
111 content_t *cont;
112 structure_t *structure = NULL;
113 span_t *last_span = NULL;
114 rect_t bbox = extract_rect_empty;
115
116 for (cont = content_tree_iterator_init(&cti, &psubpage[i]->content); cont != NULL; cont = content_tree_iterator_next(&cti))
117 {
118 switch (cont->type)
119 {
120 case content_span:
121 {
122 int j;
123 span_t *span = (span_t *)cont;
124 if (last_span &&
125 (structure != span->structure ||
126 last_span->flags.font_bold != span->flags.font_bold ||
127 last_span->flags.font_italic != span->flags.font_italic ||
128 last_span->flags.wmode != span->flags.wmode ||
129 strcmp(last_span->font_name, span->font_name)))
130 {
131 // flush stored text.
132 flush(alloc, content, last_span, structure, &text, &bbox);
133 }
134 last_span = span;
135 structure = span->structure;
136 for (j = 0; j < span->chars_num; j++)
137 {
138 if (span->chars[j].ucs == (unsigned int)-1)
139 continue;
140 if (extract_astring_catc_unicode(alloc, &text, span->chars[j].ucs, 1, 0, 0, 0))
141 goto end;
142 bbox = extract_rect_union(bbox, span->chars[j].bbox);
143 }
144 break;
145 }
146 case content_image:
147 case content_table:
148 case content_block:
149 case content_line:
150 case content_paragraph:
151 // Nothing to do for lines and paragraphs as they just enclose spans.
152 // Nothing to do for the others for now.
153 break;
154 default:
155 assert("This should never happen\n" == NULL);
156 break;
157 }
158 }
159 flush(alloc, content, last_span, structure, &text, &bbox);
160 }
161 }
162
163 ret = 0;
164 end:
165
166 extract_astring_free(alloc, &text);
167
168 return ret;
169 }