Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/extract-exe.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* Command-line programme for extract_ API. */ | |
| 2 | |
| 3 #ifdef _WIN32 | |
| 4 #define _CRT_SECURE_NO_WARNINGS | |
| 5 #endif | |
| 6 | |
#include "../include/extract.h"
#include "../include/extract_alloc.h"

#include "memento.h"
#include "outf.h"

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
| 18 | |
| 19 | |
/* Error-detecting equivalent to *out = argv[++i].

If an argument follows argv[*i], advances *i to it, stores it in *out and
returns 0. Otherwise prints a message naming the option that lacks its value,
sets errno to EINVAL and returns -1, leaving *i and *out untouched.
*/
static int arg_next_string(char** argv, int argc, int* i, const char** out)
{
    int next = *i + 1;

    if (next < argc) {
        *i = next;
        *out = argv[next];
        return 0;
    }

    printf("Expected arg after: %s\n", argv[*i]);
    errno = EINVAL;
    return -1;
}
| 33 | |
/* Error-detecting equivalent to *out = atoi(argv[++i]).

On success advances *i to the following argument, stores its value in *out and
returns 0 with errno unchanged.

If there is no following argument, or it is not a complete base-10 integer
that fits in an int, prints a diagnostic, sets errno to EINVAL and returns -1,
leaving *i and *out untouched.
*/
static int arg_next_int(char** argv, int argc, int* i, int* out)
{
    const char* text;
    char* end = NULL;
    long value;
    int saved_errno = errno;

    if (*i + 1 >= argc) {
        printf("Expected integer arg after: %s\n", argv[*i]);
        errno = EINVAL;
        return -1;
    }

    text = argv[*i + 1];

    /* strtol() only reports overflow through errno, so clear it first. */
    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text             /* No digits at all. */
            || *end != '\0'     /* Trailing junk, e.g. "12x". */
            || errno == ERANGE  /* Does not fit in a long. */
            || value < INT_MIN
            || value > INT_MAX
            ) {
        printf("Expected integer arg after: %s, not '%s'\n", argv[*i], text);
        errno = EINVAL;
        return -1;
    }

    errno = saved_errno;
    *i += 1;
    *out = (int) value;
    return 0;
}
| 47 | |
/* Allocation callback handed to extract_alloc_create() by main().

Checks that <state> is the (void*) 123 cookie that main() registers alongside
this callback, then forwards the request to the C library's realloc().
*/
static void* s_realloc(void* state, void* prev, size_t size)
{
    void* const expected_state = (void*) 123;

    assert(state == expected_state);
    return realloc(prev, size);
}
| 53 | |
| 54 int main(int argc, char** argv) | |
| 55 { | |
| 56 int e = -1; | |
| 57 const char* docx_out_path = NULL; | |
| 58 const char* input_path = NULL; | |
| 59 const char* docx_template_path = NULL; | |
| 60 const char* content_path = NULL; | |
| 61 int preserve_dir = 0; | |
| 62 int spacing = 1; | |
| 63 int rotation = 1; | |
| 64 int autosplit = 0; | |
| 65 int images = 1; | |
| 66 int alloc_stats = 0; | |
| 67 int format = -1; | |
| 68 int i; | |
| 69 | |
| 70 extract_alloc_t* alloc = NULL; | |
| 71 extract_buffer_t* out_buffer = NULL; | |
| 72 extract_buffer_t* intermediate = NULL; | |
| 73 extract_t* extract = NULL; | |
| 74 | |
| 75 /* Create an allocator so we test the allocation code. */ | |
| 76 if (extract_alloc_create(s_realloc, (void*) 123, &alloc)) | |
| 77 { | |
| 78 assert(0); | |
| 79 } | |
| 80 | |
| 81 for (i=1; i<argc; ++i) { | |
| 82 const char* arg = argv[i]; | |
| 83 if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) { | |
| 84 printf( | |
| 85 "Converts intermediate data from mupdf or gs into a docx file.\n" | |
| 86 "\n" | |
| 87 "We require a file containing XML output from one of these commands:\n" | |
| 88 " mutool draw -F xmltext ...\n" | |
| 89 " gs -sDEVICE=txtwrite -dTextFormat=4 ...\n" | |
| 90 "\n" | |
| 91 "We also requires a template docx file.\n" | |
| 92 "\n" | |
| 93 "Args:\n" | |
| 94 " --alloc-exp-min <bytes>\n" | |
| 95 " Internal: set exponential allocation with minimum alloc size.\n" | |
| 96 " --autosplit 0|1\n" | |
| 97 " If 1, we initially split spans when y coordinate changes. This\n" | |
| 98 " stresses our handling of spans when input is from mupdf.\n" | |
| 99 " -f odt | docx\n" | |
| 100 " Sets output format. Required.\n" | |
| 101 " -i <intermediate-path>\n" | |
| 102 " Path of XML file containing intermediate text spans.\n" | |
| 103 " -o <docx-path>\n" | |
| 104 " If specified, we generate the specified docx file.\n" | |
| 105 " --o-content <path>\n" | |
| 106 " If specified, we write raw docx content to <path>; this is the\n" | |
| 107 " text that we embed inside the template word/document.xml file\n" | |
| 108 " when generating the docx file.\n" | |
| 109 " -p 0|1\n" | |
| 110 " If 1 and -t <docx-template> is specified, we preserve the\n" | |
| 111 " uncompressed <docx-path>.lib/ directory.\n" | |
| 112 " -r 0|1\n" | |
| 113 " If 1, we we output rotated text inside a rotated drawing. Otherwise\n" | |
| 114 " output text is always horizontal.\n" | |
| 115 " -s 0|1\n" | |
| 116 " If 1, we insert extra vertical space between paragraphs and extra\n" | |
| 117 " vertical space between paragraphs that had different ctm matrices\n" | |
| 118 " in the original document.\n" | |
| 119 " -t <docx-template>\n" | |
| 120 " If specified we use <docx-template> as template. Otheerwise we use\n" | |
| 121 " an internal template.\n" | |
| 122 " -v <verbose>\n" | |
| 123 " Set verbose level.\n" | |
| 124 " -v-alloc\n" | |
| 125 " Show alloc stats.\n" | |
| 126 ); | |
| 127 if (i + 1 == argc) { | |
| 128 e = 0; | |
| 129 goto end; | |
| 130 } | |
| 131 } | |
| 132 else if (!strcmp(arg, "--alloc-exp-min")) { | |
| 133 int size; | |
| 134 if (arg_next_int(argv, argc, &i, &size)) goto end; | |
| 135 outf("Calling alloc_set_min_alloc_size(%i)", size); | |
| 136 extract_exp_min(extract, size); | |
| 137 } | |
| 138 else if (!strcmp(arg, "--autosplit")) { | |
| 139 if (arg_next_int(argv, argc, &i, &autosplit)) goto end; | |
| 140 } | |
| 141 else if (!strcmp(arg, "-f")) { | |
| 142 const char* format_name; | |
| 143 if (arg_next_string(argv, argc, &i, &format_name)) goto end; | |
| 144 if (!strcmp(format_name, "odt")) format = extract_format_ODT; | |
| 145 else if (!strcmp(format_name, "docx")) format = extract_format_DOCX; | |
| 146 else if (!strcmp(format_name, "html")) format = extract_format_HTML; | |
| 147 else | |
| 148 { | |
| 149 printf("-f value should be 'odt' or 'docx', not '%s'.\n", format_name); | |
| 150 errno = EINVAL; | |
| 151 goto end; | |
| 152 } | |
| 153 } | |
| 154 else if (!strcmp(arg, "-i")) { | |
| 155 if (arg_next_string(argv, argc, &i, &input_path)) goto end; | |
| 156 } | |
| 157 else if (!strcmp(arg, "-o")) { | |
| 158 if (arg_next_string(argv, argc, &i, &docx_out_path)) goto end; | |
| 159 } | |
| 160 else if (!strcmp(arg, "--o-content")) { | |
| 161 if (arg_next_string(argv, argc, &i, &content_path)) goto end; | |
| 162 } | |
| 163 else if (!strcmp(arg, "-p")) { | |
| 164 if (arg_next_int(argv, argc, &i, &preserve_dir)) goto end; | |
| 165 } | |
| 166 else if (!strcmp(arg, "-r")) { | |
| 167 if (arg_next_int(argv, argc, &i, &rotation)) goto end; | |
| 168 } | |
| 169 else if (!strcmp(arg, "-s")) { | |
| 170 if (arg_next_int(argv, argc, &i, &spacing)) goto end; | |
| 171 } | |
| 172 else if (!strcmp(arg, "-t")) { | |
| 173 if (arg_next_string(argv, argc, &i, &docx_template_path)) goto end; | |
| 174 } | |
| 175 else if (!strcmp(arg, "-v")) { | |
| 176 int verbose; | |
| 177 if (arg_next_int(argv, argc, &i, &verbose)) goto end; | |
| 178 extract_outf_verbose_set(verbose); | |
| 179 outf("Have changed verbose to %i", verbose); | |
| 180 } | |
| 181 else if (!strcmp(arg, "--v-alloc")) { | |
| 182 if (arg_next_int(argv, argc, &i, &alloc_stats)) goto end; | |
| 183 } | |
| 184 else { | |
| 185 printf("Unrecognised arg: '%s'\n", arg); | |
| 186 errno = EINVAL; | |
| 187 goto end; | |
| 188 } | |
| 189 | |
| 190 assert(i < argc); | |
| 191 } | |
| 192 | |
| 193 if (format == -1) | |
| 194 { | |
| 195 printf("'-f odt | docx' must be specified\n"); | |
| 196 errno = EINVAL; | |
| 197 goto end; | |
| 198 } | |
| 199 | |
| 200 if (!input_path) { | |
| 201 printf("-i <input-path> not specified.\n"); | |
| 202 errno = EINVAL; | |
| 203 goto end; | |
| 204 } | |
| 205 | |
| 206 if (extract_buffer_open_file(alloc, input_path, 0 /*writable*/, &intermediate)) { | |
| 207 printf("Failed to open intermediate file: %s\n", input_path); | |
| 208 goto end; | |
| 209 } | |
| 210 | |
| 211 if (extract_begin(alloc, format, &extract)) goto end; | |
| 212 if (extract_read_intermediate(extract, intermediate)) goto end; | |
| 213 | |
| 214 if (extract_process(extract, spacing, rotation, images)) goto end; | |
| 215 | |
| 216 if (content_path) { | |
| 217 if (extract_buffer_open_file(alloc, content_path, 1 /*writable*/, &out_buffer)) goto end; | |
| 218 if (extract_write_content(extract, out_buffer)) goto end; | |
| 219 if (extract_buffer_close(&out_buffer)) goto end; | |
| 220 } | |
| 221 if (docx_out_path) { | |
| 222 if (docx_template_path) { | |
| 223 if (extract_write_template( | |
| 224 extract, | |
| 225 docx_template_path, | |
| 226 docx_out_path, | |
| 227 preserve_dir | |
| 228 )) { | |
| 229 printf("Failed to create docx file: %s\n", docx_out_path); | |
| 230 goto end; | |
| 231 } | |
| 232 } | |
| 233 else { | |
| 234 if (extract_buffer_open_file(alloc, docx_out_path, 1 /*writable*/, &out_buffer)) goto end; | |
| 235 if (extract_write(extract, out_buffer)) { | |
| 236 printf("Failed to create docx file: %s\n", docx_out_path); | |
| 237 goto end; | |
| 238 } | |
| 239 if (extract_buffer_close(&out_buffer)) goto end; | |
| 240 } | |
| 241 } | |
| 242 | |
| 243 e = 0; | |
| 244 end: | |
| 245 | |
| 246 extract_buffer_close(&intermediate); | |
| 247 extract_buffer_close(&out_buffer); | |
| 248 extract_end(&extract); | |
| 249 | |
| 250 if (e) { | |
| 251 printf("Failed (errno=%i): %s\n", errno, strerror(errno)); | |
| 252 return 1; | |
| 253 } | |
| 254 | |
| 255 extract_internal_end(); | |
| 256 | |
| 257 if (alloc_stats) { | |
| 258 extract_alloc_stats_t* stats = extract_alloc_stats(alloc); | |
| 259 printf("Alloc stats: num_malloc=%i num_realloc=%i num_free=%i num_libc_realloc=%i\n", | |
| 260 stats->num_malloc, | |
| 261 stats->num_realloc, | |
| 262 stats->num_free, | |
| 263 stats->num_libc_realloc | |
| 264 ); | |
| 265 } | |
| 266 | |
| 267 extract_alloc_destroy(&alloc); | |
| 268 assert(alloc == NULL); | |
| 269 | |
| 270 printf("Finished.\n"); | |
| 271 return 0; | |
| 272 } |
