Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/extract/src/extract-exe.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/extract/src/extract-exe.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,272 @@ +/* Command-line programme for extract_ API. */ + +#ifdef _WIN32 +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include "../include/extract.h" +#include "../include/extract_alloc.h" + +#include "memento.h" +#include "outf.h" + +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +/* Error-detecting equivalent to *out = argv[++i]. +*/ +static int arg_next_string(char** argv, int argc, int* i, const char** out) +{ + if (*i + 1 >= argc) { + printf("Expected arg after: %s\n", argv[*i]); + errno = EINVAL; + return -1; + } + *i += 1; + *out = argv[*i]; + return 0; +} + +/* Error-detecting equivalent to *out = atoi(argv[++i]). +*/ +static int arg_next_int(char** argv, int argc, int* i, int* out) +{ + if (*i + 1 >= argc) { + printf("Expected integer arg after: %s\n", argv[*i]); + errno = EINVAL; + return -1; + } + *i += 1; + *out = atoi(argv[*i]); + return 0; +} + +static void* s_realloc(void* state, void* prev, size_t size) +{ + assert(state == (void*) 123); + return realloc(prev, size); +} + +int main(int argc, char** argv) +{ + int e = -1; + const char* docx_out_path = NULL; + const char* input_path = NULL; + const char* docx_template_path = NULL; + const char* content_path = NULL; + int preserve_dir = 0; + int spacing = 1; + int rotation = 1; + int autosplit = 0; + int images = 1; + int alloc_stats = 0; + int format = -1; + int i; + + extract_alloc_t* alloc = NULL; + extract_buffer_t* out_buffer = NULL; + extract_buffer_t* intermediate = NULL; + extract_t* extract = NULL; + + /* Create an allocator so we test the allocation code. */ + if (extract_alloc_create(s_realloc, (void*) 123, &alloc)) + { + assert(0); + } + + for (i=1; i<argc; ++i) { + const char* arg = argv[i]; + if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) { + printf( + "Converts intermediate data from mupdf or gs into a docx file.\n" + "\n" + "We require a file containing XML output from one of these commands:\n" + " mutool draw -F xmltext ...\n" + " gs -sDEVICE=txtwrite -dTextFormat=4 ...\n" + "\n" + "We also requires a template docx file.\n" + "\n" + "Args:\n" + " --alloc-exp-min <bytes>\n" + " Internal: set exponential allocation with minimum alloc size.\n" + " --autosplit 0|1\n" + " If 1, we initially split spans when y coordinate changes. This\n" + " stresses our handling of spans when input is from mupdf.\n" + " -f odt | docx\n" + " Sets output format. Required.\n" + " -i <intermediate-path>\n" + " Path of XML file containing intermediate text spans.\n" + " -o <docx-path>\n" + " If specified, we generate the specified docx file.\n" + " --o-content <path>\n" + " If specified, we write raw docx content to <path>; this is the\n" + " text that we embed inside the template word/document.xml file\n" + " when generating the docx file.\n" + " -p 0|1\n" + " If 1 and -t <docx-template> is specified, we preserve the\n" + " uncompressed <docx-path>.lib/ directory.\n" + " -r 0|1\n" + " If 1, we we output rotated text inside a rotated drawing. Otherwise\n" + " output text is always horizontal.\n" + " -s 0|1\n" + " If 1, we insert extra vertical space between paragraphs and extra\n" + " vertical space between paragraphs that had different ctm matrices\n" + " in the original document.\n" + " -t <docx-template>\n" + " If specified we use <docx-template> as template. Otheerwise we use\n" + " an internal template.\n" + " -v <verbose>\n" + " Set verbose level.\n" + " -v-alloc\n" + " Show alloc stats.\n" + ); + if (i + 1 == argc) { + e = 0; + goto end; + } + } + else if (!strcmp(arg, "--alloc-exp-min")) { + int size; + if (arg_next_int(argv, argc, &i, &size)) goto end; + outf("Calling alloc_set_min_alloc_size(%i)", size); + extract_exp_min(extract, size); + } + else if (!strcmp(arg, "--autosplit")) { + if (arg_next_int(argv, argc, &i, &autosplit)) goto end; + } + else if (!strcmp(arg, "-f")) { + const char* format_name; + if (arg_next_string(argv, argc, &i, &format_name)) goto end; + if (!strcmp(format_name, "odt")) format = extract_format_ODT; + else if (!strcmp(format_name, "docx")) format = extract_format_DOCX; + else if (!strcmp(format_name, "html")) format = extract_format_HTML; + else + { + printf("-f value should be 'odt' or 'docx', not '%s'.\n", format_name); + errno = EINVAL; + goto end; + } + } + else if (!strcmp(arg, "-i")) { + if (arg_next_string(argv, argc, &i, &input_path)) goto end; + } + else if (!strcmp(arg, "-o")) { + if (arg_next_string(argv, argc, &i, &docx_out_path)) goto end; + } + else if (!strcmp(arg, "--o-content")) { + if (arg_next_string(argv, argc, &i, &content_path)) goto end; + } + else if (!strcmp(arg, "-p")) { + if (arg_next_int(argv, argc, &i, &preserve_dir)) goto end; + } + else if (!strcmp(arg, "-r")) { + if (arg_next_int(argv, argc, &i, &rotation)) goto end; + } + else if (!strcmp(arg, "-s")) { + if (arg_next_int(argv, argc, &i, &spacing)) goto end; + } + else if (!strcmp(arg, "-t")) { + if (arg_next_string(argv, argc, &i, &docx_template_path)) goto end; + } + else if (!strcmp(arg, "-v")) { + int verbose; + if (arg_next_int(argv, argc, &i, &verbose)) goto end; + extract_outf_verbose_set(verbose); + outf("Have changed verbose to %i", verbose); + } + else if (!strcmp(arg, "--v-alloc")) { + if (arg_next_int(argv, argc, &i, &alloc_stats)) goto end; + } + else { + printf("Unrecognised arg: '%s'\n", arg); + errno = EINVAL; + goto end; + } + + assert(i < argc); + } + + if (format == -1) + { + printf("'-f odt | docx' must be specified\n"); + errno = EINVAL; + goto end; + } + + if (!input_path) { + printf("-i <input-path> not specified.\n"); + errno = EINVAL; + goto end; + } + + if (extract_buffer_open_file(alloc, input_path, 0 /*writable*/, &intermediate)) { + printf("Failed to open intermediate file: %s\n", input_path); + goto end; + } + + if (extract_begin(alloc, format, &extract)) goto end; + if (extract_read_intermediate(extract, intermediate)) goto end; + + if (extract_process(extract, spacing, rotation, images)) goto end; + + if (content_path) { + if (extract_buffer_open_file(alloc, content_path, 1 /*writable*/, &out_buffer)) goto end; + if (extract_write_content(extract, out_buffer)) goto end; + if (extract_buffer_close(&out_buffer)) goto end; + } + if (docx_out_path) { + if (docx_template_path) { + if (extract_write_template( + extract, + docx_template_path, + docx_out_path, + preserve_dir + )) { + printf("Failed to create docx file: %s\n", docx_out_path); + goto end; + } + } + else { + if (extract_buffer_open_file(alloc, docx_out_path, 1 /*writable*/, &out_buffer)) goto end; + if (extract_write(extract, out_buffer)) { + printf("Failed to create docx file: %s\n", docx_out_path); + goto end; + } + if (extract_buffer_close(&out_buffer)) goto end; + } + } + + e = 0; + end: + + extract_buffer_close(&intermediate); + extract_buffer_close(&out_buffer); + extract_end(&extract); + + if (e) { + printf("Failed (errno=%i): %s\n", errno, strerror(errno)); + return 1; + } + + extract_internal_end(); + + if (alloc_stats) { + extract_alloc_stats_t* stats = extract_alloc_stats(alloc); + printf("Alloc stats: num_malloc=%i num_realloc=%i num_free=%i num_libc_realloc=%i\n", + stats->num_malloc, + stats->num_realloc, + stats->num_free, + stats->num_libc_realloc + ); + } + + extract_alloc_destroy(&alloc); + assert(alloc == NULL); + + printf("Finished.\n"); + return 0; +}
