comparison mupdf-source/thirdparty/extract/src/extract-exe.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /* Command-line programme for extract_ API. */
2
3 #ifdef _WIN32
4 #define _CRT_SECURE_NO_WARNINGS
5 #endif
6
7 #include "../include/extract.h"
8 #include "../include/extract_alloc.h"
9
10 #include "memento.h"
11 #include "outf.h"
12
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
18
19
/* Fetches the argument that follows argv[*i] as a string; error-detecting
equivalent to *out = argv[++i].

On success advances *i to the consumed argument, points *out at it and returns
0. If argv[*i] is the last argument, prints a diagnostic, sets errno to EINVAL
and returns -1, leaving *i and *out untouched.
*/
static int arg_next_string(char** argv, int argc, int* i, const char** out)
{
    int following = *i + 1;

    if (following < argc) {
        *i = following;
        *out = argv[following];
        return 0;
    }

    /* Nothing follows the current option. */
    printf("Expected arg after: %s\n", argv[*i]);
    errno = EINVAL;
    return -1;
}
33
/* Error-detecting equivalent to *out = atoi(argv[++i]).

On success advances *i to the consumed argument, stores its value in *out and
returns 0.

Fails if argv[*i] is the last argument, or if the following argument is not a
complete base-10 integer that fits in an int. On failure prints a diagnostic,
sets errno to EINVAL and returns -1; *out is left unchanged. *i is advanced
only when a following argument exists.
*/
static int arg_next_int(char** argv, int argc, int* i, int* out)
{
    const char* text;
    char*       end = NULL;
    long        value;

    if (*i + 1 >= argc) {
        printf("Expected integer arg after: %s\n", argv[*i]);
        errno = EINVAL;
        return -1;
    }
    *i += 1;
    text = argv[*i];

    /* Use strtol() rather than atoi(): atoi() silently yields 0 for
    non-numeric text and has undefined behaviour if the value is out of
    range, so it cannot detect bad input. */
    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text                 /* No digits at all. */
            || *end != '\0'         /* Trailing junk after the number. */
            || errno == ERANGE      /* Does not fit in a long. */
            || value < INT_MIN
            || value > INT_MAX      /* Does not fit in an int. */
            ) {
        printf("Expected integer arg after: %s, not '%s'\n", argv[*i - 1], text);
        errno = EINVAL;
        return -1;
    }

    *out = (int) value;
    return 0;
}
47
/* Allocation callback that main() passes to extract_alloc_create().

Asserts that <state> is the opaque value main() registered, (void*) 123, then
forwards to realloc() and returns its result.
*/
static void* s_realloc(void* state, void* prev, size_t size)
{
    void* const expected_state = (void*) 123;
    void*       block;

    assert(state == expected_state);
    block = realloc(prev, size);
    return block;
}
53
54 int main(int argc, char** argv)
55 {
56 int e = -1;
57 const char* docx_out_path = NULL;
58 const char* input_path = NULL;
59 const char* docx_template_path = NULL;
60 const char* content_path = NULL;
61 int preserve_dir = 0;
62 int spacing = 1;
63 int rotation = 1;
64 int autosplit = 0;
65 int images = 1;
66 int alloc_stats = 0;
67 int format = -1;
68 int i;
69
70 extract_alloc_t* alloc = NULL;
71 extract_buffer_t* out_buffer = NULL;
72 extract_buffer_t* intermediate = NULL;
73 extract_t* extract = NULL;
74
75 /* Create an allocator so we test the allocation code. */
76 if (extract_alloc_create(s_realloc, (void*) 123, &alloc))
77 {
78 assert(0);
79 }
80
81 for (i=1; i<argc; ++i) {
82 const char* arg = argv[i];
83 if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
84 printf(
85 "Converts intermediate data from mupdf or gs into a docx file.\n"
86 "\n"
87 "We require a file containing XML output from one of these commands:\n"
88 " mutool draw -F xmltext ...\n"
89 " gs -sDEVICE=txtwrite -dTextFormat=4 ...\n"
90 "\n"
91 "We also requires a template docx file.\n"
92 "\n"
93 "Args:\n"
94 " --alloc-exp-min <bytes>\n"
95 " Internal: set exponential allocation with minimum alloc size.\n"
96 " --autosplit 0|1\n"
97 " If 1, we initially split spans when y coordinate changes. This\n"
98 " stresses our handling of spans when input is from mupdf.\n"
99 " -f odt | docx\n"
100 " Sets output format. Required.\n"
101 " -i <intermediate-path>\n"
102 " Path of XML file containing intermediate text spans.\n"
103 " -o <docx-path>\n"
104 " If specified, we generate the specified docx file.\n"
105 " --o-content <path>\n"
106 " If specified, we write raw docx content to <path>; this is the\n"
107 " text that we embed inside the template word/document.xml file\n"
108 " when generating the docx file.\n"
109 " -p 0|1\n"
110 " If 1 and -t <docx-template> is specified, we preserve the\n"
111 " uncompressed <docx-path>.lib/ directory.\n"
112 " -r 0|1\n"
113 " If 1, we we output rotated text inside a rotated drawing. Otherwise\n"
114 " output text is always horizontal.\n"
115 " -s 0|1\n"
116 " If 1, we insert extra vertical space between paragraphs and extra\n"
117 " vertical space between paragraphs that had different ctm matrices\n"
118 " in the original document.\n"
119 " -t <docx-template>\n"
120 " If specified we use <docx-template> as template. Otheerwise we use\n"
121 " an internal template.\n"
122 " -v <verbose>\n"
123 " Set verbose level.\n"
124 " -v-alloc\n"
125 " Show alloc stats.\n"
126 );
127 if (i + 1 == argc) {
128 e = 0;
129 goto end;
130 }
131 }
132 else if (!strcmp(arg, "--alloc-exp-min")) {
133 int size;
134 if (arg_next_int(argv, argc, &i, &size)) goto end;
135 outf("Calling alloc_set_min_alloc_size(%i)", size);
136 extract_exp_min(extract, size);
137 }
138 else if (!strcmp(arg, "--autosplit")) {
139 if (arg_next_int(argv, argc, &i, &autosplit)) goto end;
140 }
141 else if (!strcmp(arg, "-f")) {
142 const char* format_name;
143 if (arg_next_string(argv, argc, &i, &format_name)) goto end;
144 if (!strcmp(format_name, "odt")) format = extract_format_ODT;
145 else if (!strcmp(format_name, "docx")) format = extract_format_DOCX;
146 else if (!strcmp(format_name, "html")) format = extract_format_HTML;
147 else
148 {
149 printf("-f value should be 'odt' or 'docx', not '%s'.\n", format_name);
150 errno = EINVAL;
151 goto end;
152 }
153 }
154 else if (!strcmp(arg, "-i")) {
155 if (arg_next_string(argv, argc, &i, &input_path)) goto end;
156 }
157 else if (!strcmp(arg, "-o")) {
158 if (arg_next_string(argv, argc, &i, &docx_out_path)) goto end;
159 }
160 else if (!strcmp(arg, "--o-content")) {
161 if (arg_next_string(argv, argc, &i, &content_path)) goto end;
162 }
163 else if (!strcmp(arg, "-p")) {
164 if (arg_next_int(argv, argc, &i, &preserve_dir)) goto end;
165 }
166 else if (!strcmp(arg, "-r")) {
167 if (arg_next_int(argv, argc, &i, &rotation)) goto end;
168 }
169 else if (!strcmp(arg, "-s")) {
170 if (arg_next_int(argv, argc, &i, &spacing)) goto end;
171 }
172 else if (!strcmp(arg, "-t")) {
173 if (arg_next_string(argv, argc, &i, &docx_template_path)) goto end;
174 }
175 else if (!strcmp(arg, "-v")) {
176 int verbose;
177 if (arg_next_int(argv, argc, &i, &verbose)) goto end;
178 extract_outf_verbose_set(verbose);
179 outf("Have changed verbose to %i", verbose);
180 }
181 else if (!strcmp(arg, "--v-alloc")) {
182 if (arg_next_int(argv, argc, &i, &alloc_stats)) goto end;
183 }
184 else {
185 printf("Unrecognised arg: '%s'\n", arg);
186 errno = EINVAL;
187 goto end;
188 }
189
190 assert(i < argc);
191 }
192
193 if (format == -1)
194 {
195 printf("'-f odt | docx' must be specified\n");
196 errno = EINVAL;
197 goto end;
198 }
199
200 if (!input_path) {
201 printf("-i <input-path> not specified.\n");
202 errno = EINVAL;
203 goto end;
204 }
205
206 if (extract_buffer_open_file(alloc, input_path, 0 /*writable*/, &intermediate)) {
207 printf("Failed to open intermediate file: %s\n", input_path);
208 goto end;
209 }
210
211 if (extract_begin(alloc, format, &extract)) goto end;
212 if (extract_read_intermediate(extract, intermediate)) goto end;
213
214 if (extract_process(extract, spacing, rotation, images)) goto end;
215
216 if (content_path) {
217 if (extract_buffer_open_file(alloc, content_path, 1 /*writable*/, &out_buffer)) goto end;
218 if (extract_write_content(extract, out_buffer)) goto end;
219 if (extract_buffer_close(&out_buffer)) goto end;
220 }
221 if (docx_out_path) {
222 if (docx_template_path) {
223 if (extract_write_template(
224 extract,
225 docx_template_path,
226 docx_out_path,
227 preserve_dir
228 )) {
229 printf("Failed to create docx file: %s\n", docx_out_path);
230 goto end;
231 }
232 }
233 else {
234 if (extract_buffer_open_file(alloc, docx_out_path, 1 /*writable*/, &out_buffer)) goto end;
235 if (extract_write(extract, out_buffer)) {
236 printf("Failed to create docx file: %s\n", docx_out_path);
237 goto end;
238 }
239 if (extract_buffer_close(&out_buffer)) goto end;
240 }
241 }
242
243 e = 0;
244 end:
245
246 extract_buffer_close(&intermediate);
247 extract_buffer_close(&out_buffer);
248 extract_end(&extract);
249
250 if (e) {
251 printf("Failed (errno=%i): %s\n", errno, strerror(errno));
252 return 1;
253 }
254
255 extract_internal_end();
256
257 if (alloc_stats) {
258 extract_alloc_stats_t* stats = extract_alloc_stats(alloc);
259 printf("Alloc stats: num_malloc=%i num_realloc=%i num_free=%i num_libc_realloc=%i\n",
260 stats->num_malloc,
261 stats->num_realloc,
262 stats->num_free,
263 stats->num_libc_realloc
264 );
265 }
266
267 extract_alloc_destroy(&alloc);
268 assert(alloc == NULL);
269
270 printf("Finished.\n");
271 return 0;
272 }