Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/include/mupdf/pdf/interpret.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/include/mupdf/pdf/interpret.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,500 @@ +// Copyright (C) 2004-2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#ifndef PDF_INTERPRET_H +#define PDF_INTERPRET_H + +#include "mupdf/pdf/font.h" +#include "mupdf/pdf/resource.h" +#include "mupdf/pdf/document.h" + +typedef struct pdf_gstate pdf_gstate; +typedef struct pdf_processor pdf_processor; + +void *pdf_new_processor(fz_context *ctx, int size); +pdf_processor *pdf_keep_processor(fz_context *ctx, pdf_processor *proc); +void pdf_close_processor(fz_context *ctx, pdf_processor *proc); +void pdf_drop_processor(fz_context *ctx, pdf_processor *proc); + +typedef enum +{ + PDF_PROCESSOR_REQUIRES_DECODED_IMAGES = 1 +} pdf_processor_requirements; + +struct pdf_processor +{ + int refs; + + int closed; + + /* close the processor. Also closes any chained processors. 
*/ + void (*close_processor)(fz_context *ctx, pdf_processor *proc); + void (*drop_processor)(fz_context *ctx, pdf_processor *proc); + void (*reset_processor)(fz_context *ctx, pdf_processor *proc); + + /* At any stage, we can have one set of resources in place. + * This function gives us a set of resources to use. We remember + * any previous set on a stack, so we can pop back to it later. + * Our responsibility (as well as remembering it for our own use) + * is to pass either it, or a filtered version of it onto any + * chained processor. */ + void (*push_resources)(fz_context *ctx, pdf_processor *proc, pdf_obj *res); + /* Pop the resources stack. This must be passed on to any chained + * processors. This returns a pointer to the resource dict just + * popped by the deepest filter. The caller inherits this reference. */ + pdf_obj *(*pop_resources)(fz_context *ctx, pdf_processor *proc); + + /* general graphics state */ + void (*op_w)(fz_context *ctx, pdf_processor *proc, float linewidth); + void (*op_j)(fz_context *ctx, pdf_processor *proc, int linejoin); + void (*op_J)(fz_context *ctx, pdf_processor *proc, int linecap); + void (*op_M)(fz_context *ctx, pdf_processor *proc, float miterlimit); + void (*op_d)(fz_context *ctx, pdf_processor *proc, pdf_obj *array, float phase); + void (*op_ri)(fz_context *ctx, pdf_processor *proc, const char *intent); + void (*op_i)(fz_context *ctx, pdf_processor *proc, float flatness); + + void (*op_gs_begin)(fz_context *ctx, pdf_processor *proc, const char *name, pdf_obj *extgstate); + void (*op_gs_BM)(fz_context *ctx, pdf_processor *proc, const char *blendmode); + void (*op_gs_ca)(fz_context *ctx, pdf_processor *proc, float alpha); + void (*op_gs_CA)(fz_context *ctx, pdf_processor *proc, float alpha); + void (*op_gs_SMask)(fz_context *ctx, pdf_processor *proc, pdf_obj *smask, fz_colorspace *smask_cs, float *bc, int luminosity, pdf_obj *tr); + void (*op_gs_end)(fz_context *ctx, pdf_processor *proc); + + /* special graphics state */ + 
void (*op_q)(fz_context *ctx, pdf_processor *proc); + void (*op_Q)(fz_context *ctx, pdf_processor *proc); + void (*op_cm)(fz_context *ctx, pdf_processor *proc, float a, float b, float c, float d, float e, float f); + + /* path construction */ + void (*op_m)(fz_context *ctx, pdf_processor *proc, float x, float y); + void (*op_l)(fz_context *ctx, pdf_processor *proc, float x, float y); + void (*op_c)(fz_context *ctx, pdf_processor *proc, float x1, float y1, float x2, float y2, float x3, float y3); + void (*op_v)(fz_context *ctx, pdf_processor *proc, float x2, float y2, float x3, float y3); + void (*op_y)(fz_context *ctx, pdf_processor *proc, float x1, float y1, float x3, float y3); + void (*op_h)(fz_context *ctx, pdf_processor *proc); + void (*op_re)(fz_context *ctx, pdf_processor *proc, float x, float y, float w, float h); + + /* path painting */ + void (*op_S)(fz_context *ctx, pdf_processor *proc); + void (*op_s)(fz_context *ctx, pdf_processor *proc); + void (*op_F)(fz_context *ctx, pdf_processor *proc); + void (*op_f)(fz_context *ctx, pdf_processor *proc); + void (*op_fstar)(fz_context *ctx, pdf_processor *proc); + void (*op_B)(fz_context *ctx, pdf_processor *proc); + void (*op_Bstar)(fz_context *ctx, pdf_processor *proc); + void (*op_b)(fz_context *ctx, pdf_processor *proc); + void (*op_bstar)(fz_context *ctx, pdf_processor *proc); + void (*op_n)(fz_context *ctx, pdf_processor *proc); + + /* clipping paths */ + void (*op_W)(fz_context *ctx, pdf_processor *proc); + void (*op_Wstar)(fz_context *ctx, pdf_processor *proc); + + /* text objects */ + void (*op_BT)(fz_context *ctx, pdf_processor *proc); + void (*op_ET)(fz_context *ctx, pdf_processor *proc); + + /* text state */ + void (*op_Tc)(fz_context *ctx, pdf_processor *proc, float charspace); + void (*op_Tw)(fz_context *ctx, pdf_processor *proc, float wordspace); + void (*op_Tz)(fz_context *ctx, pdf_processor *proc, float scale); + void (*op_TL)(fz_context *ctx, pdf_processor *proc, float leading); + void 
(*op_Tf)(fz_context *ctx, pdf_processor *proc, const char *name, pdf_font_desc *font, float size); + void (*op_Tr)(fz_context *ctx, pdf_processor *proc, int render); + void (*op_Ts)(fz_context *ctx, pdf_processor *proc, float rise); + + /* text positioning */ + void (*op_Td)(fz_context *ctx, pdf_processor *proc, float tx, float ty); + void (*op_TD)(fz_context *ctx, pdf_processor *proc, float tx, float ty); + void (*op_Tm)(fz_context *ctx, pdf_processor *proc, float a, float b, float c, float d, float e, float f); + void (*op_Tstar)(fz_context *ctx, pdf_processor *proc); + + /* text showing */ + void (*op_TJ)(fz_context *ctx, pdf_processor *proc, pdf_obj *array); + void (*op_Tj)(fz_context *ctx, pdf_processor *proc, char *str, size_t len); + void (*op_squote)(fz_context *ctx, pdf_processor *proc, char *str, size_t len); + void (*op_dquote)(fz_context *ctx, pdf_processor *proc, float aw, float ac, char *str, size_t len); + + /* type 3 fonts */ + void (*op_d0)(fz_context *ctx, pdf_processor *proc, float wx, float wy); + void (*op_d1)(fz_context *ctx, pdf_processor *proc, float wx, float wy, float llx, float lly, float urx, float ury); + + /* color */ + void (*op_CS)(fz_context *ctx, pdf_processor *proc, const char *name, fz_colorspace *cs); + void (*op_cs)(fz_context *ctx, pdf_processor *proc, const char *name, fz_colorspace *cs); + void (*op_SC_pattern)(fz_context *ctx, pdf_processor *proc, const char *name, pdf_pattern *pat, int n, float *color); + void (*op_sc_pattern)(fz_context *ctx, pdf_processor *proc, const char *name, pdf_pattern *pat, int n, float *color); + void (*op_SC_shade)(fz_context *ctx, pdf_processor *proc, const char *name, fz_shade *shade); + void (*op_sc_shade)(fz_context *ctx, pdf_processor *proc, const char *name, fz_shade *shade); + void (*op_SC_color)(fz_context *ctx, pdf_processor *proc, int n, float *color); + void (*op_sc_color)(fz_context *ctx, pdf_processor *proc, int n, float *color); + + void (*op_G)(fz_context *ctx, pdf_processor 
*proc, float g); + void (*op_g)(fz_context *ctx, pdf_processor *proc, float g); + void (*op_RG)(fz_context *ctx, pdf_processor *proc, float r, float g, float b); + void (*op_rg)(fz_context *ctx, pdf_processor *proc, float r, float g, float b); + void (*op_K)(fz_context *ctx, pdf_processor *proc, float c, float m, float y, float k); + void (*op_k)(fz_context *ctx, pdf_processor *proc, float c, float m, float y, float k); + + /* shadings, images, xobjects */ + void (*op_BI)(fz_context *ctx, pdf_processor *proc, fz_image *image, const char *colorspace_name); + void (*op_sh)(fz_context *ctx, pdf_processor *proc, const char *name, fz_shade *shade); + void (*op_Do_image)(fz_context *ctx, pdf_processor *proc, const char *name, fz_image *image); + void (*op_Do_form)(fz_context *ctx, pdf_processor *proc, const char *name, pdf_obj *form); + + /* marked content */ + void (*op_MP)(fz_context *ctx, pdf_processor *proc, const char *tag); + void (*op_DP)(fz_context *ctx, pdf_processor *proc, const char *tag, pdf_obj *raw, pdf_obj *cooked); + void (*op_BMC)(fz_context *ctx, pdf_processor *proc, const char *tag); + void (*op_BDC)(fz_context *ctx, pdf_processor *proc, const char *tag, pdf_obj *raw, pdf_obj *cooked); + void (*op_EMC)(fz_context *ctx, pdf_processor *proc); + + /* compatibility */ + void (*op_BX)(fz_context *ctx, pdf_processor *proc); + void (*op_EX)(fz_context *ctx, pdf_processor *proc); + + /* Virtual ops for ExtGState entries */ + void (*op_gs_OP)(fz_context *ctx, pdf_processor *proc, int b); + void (*op_gs_op)(fz_context *ctx, pdf_processor *proc, int b); + void (*op_gs_OPM)(fz_context *ctx, pdf_processor *proc, int i); + void (*op_gs_UseBlackPtComp)(fz_context *ctx, pdf_processor *proc, pdf_obj *name); + + /* EOD is used to signify end of data (before any finalise/close down/ + * automatically added gstate pops). 
*/ + void (*op_EOD)(fz_context *ctx, pdf_processor *proc); + + /* END is used to signify end of stream (finalise and close down) */ + void (*op_END)(fz_context *ctx, pdf_processor *proc); + + /* interpreter state that persists across content streams */ + const char *usage; + int hidden; + + pdf_processor_requirements requirements; +}; + +typedef struct +{ + /* input */ + pdf_document *doc; + pdf_obj *rdb; + pdf_lexbuf *buf; + fz_cookie *cookie; + + /* state */ + int gstate; + int xbalance; + int in_text; + fz_rect d1_rect; + + /* stack */ + pdf_obj *obj; + char name[256]; + char string[256]; + size_t string_len; + int top; + float stack[32]; +} pdf_csi; + +void pdf_count_q_balance(fz_context *ctx, pdf_document *doc, pdf_obj *res, pdf_obj *stm, int *prepend, int *append); + +/* Functions to set up pdf_processor structures */ + +pdf_processor *pdf_new_run_processor(fz_context *ctx, pdf_document *doc, fz_device *dev, fz_matrix ctm, int struct_parent, const char *usage, pdf_gstate *gstate, fz_default_colorspaces *default_cs, fz_cookie *cookie, pdf_gstate *fill_gstate, pdf_gstate *stroke_gstate); + +/* + Create a buffer processor. + + This collects the incoming PDF operator stream into an fz_buffer. + + buffer: The (possibly empty) buffer to which operators will be + appended. + + ahxencode: If 0, then image streams will be sent as binary, + otherwise they will be asciihexencoded. + + newlines: If 0, then minimal spacing will be sent. If 1 + then newlines will be sent after every operator. +*/ +pdf_processor *pdf_new_buffer_processor(fz_context *ctx, fz_buffer *buffer, int ahxencode, int newlines); + +/* + Reopen a closed processor to be used again. + + This brings a processor back to life after a close. + Not all processors may support this, so this may throw + an exception. +*/ +void pdf_reset_processor(fz_context *ctx, pdf_processor *proc); + + +/* + Create an output processor. This + sends the incoming PDF operator stream to an fz_output stream. 
+ + out: The output stream to which operators will be sent. + + ahxencode: If 0, then image streams will be sent as binary, + otherwise they will be asciihexencoded. + + newlines: If 0, then minimal spacing will be sent. If 1 + then newlines will be sent after every operator. +*/ +pdf_processor *pdf_new_output_processor(fz_context *ctx, fz_output *out, int ahxencode, int newlines); + +typedef struct pdf_filter_options pdf_filter_options; + +/* + Create a filter processor. This filters the PDF operators + it is fed, and passes them down (with some changes) to the + child filter. + + chain: The child processor to which the filtered operators + will be fed. + + The options field contains a pointer to a structure + containing filter-specific options. +*/ +typedef pdf_processor *(pdf_filter_factory_fn)(fz_context *ctx, pdf_document *doc, pdf_processor *chain, int struct_parents, fz_matrix transform, pdf_filter_options *options, void *factory_options); + +/* + A pdf_filter_factory is a pdf_filter_factory_fn, plus the options + needed to instantiate it. +*/ +typedef struct +{ + pdf_filter_factory_fn *filter; + void *options; +} pdf_filter_factory; + +/* + recurse: Filter resources recursively. + + instance_forms: Always recurse on XObject Form resources, but will + create a new instance of each XObject Form that is used, filtered + individually. + + ascii: If true, escape all binary data in the output. + + no_update: If true, do not update the document at the end. + + opaque: Opaque value that is passed to the complete function. + + complete: A function called at the end of processing. + This allows the caller to insert some extra content after + all other content. + + filters: Pointer to an array of filter factory/options. + The array is terminated by an entry with a NULL factory pointer. + Operators will be fed into the filter generated from the first + factory function in the list, and from there go to the filter + generated from the second factory in the list etc. 
+ + newlines: If 0, then minimal whitespace will be produced. If 1, + then a newline will be sent after every operator. +*/ +struct pdf_filter_options +{ + int recurse; + int instance_forms; + int ascii; + int no_update; + + void *opaque; + void (*complete)(fz_context *ctx, fz_buffer *buffer, void *opaque); + + pdf_filter_factory *filters; + int newlines; +}; + +typedef enum +{ + FZ_CULL_PATH_DROP = 0, + FZ_CULL_PATH_FILL = 1, + FZ_CULL_PATH_STROKE = 2, + FZ_CULL_PATH_FILL_STROKE = 3, + FZ_CULL_CLIP_PATH_DROP = 4, + FZ_CULL_CLIP_PATH_FILL = 5, + FZ_CULL_CLIP_PATH_STROKE = 6, + FZ_CULL_CLIP_PATH_FILL_STROKE = 7, + FZ_CULL_GLYPH = 8, + FZ_CULL_IMAGE, + FZ_CULL_SHADING +} fz_cull_type; + +/* + image_filter: A function called to assess whether a given + image should be removed or not. + + text_filter: A function called to assess whether a given + character should be removed or not. + + after_text_object: A function called after each text object. + This allows the caller to insert some extra content if + desired. + + culler: A function called to see whether each object should + be culled or not. +*/ +typedef struct +{ + void *opaque; + fz_image *(*image_filter)(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect scissor); + int (*text_filter)(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox); + void (*after_text_object)(fz_context *ctx, void *opaque, pdf_document *doc, pdf_processor *chain, fz_matrix ctm); + int (*culler)(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type); +} +pdf_sanitize_filter_options; + +/* + A sanitize filter factory. + + sopts = pointer to pdf_sanitize_filter_options. + + The changes made by a filter generated from this are: + + * No operations are allowed to change the top level gstate. + Additional q/Q operators are inserted to prevent this. 
+ + * Repeated/unnecessary colour operators are removed (so, + for example, "0 0 0 rg 0 1 rg 0.5 g" would be sanitised to + "0.5 g") + + The intention of these changes is to provide a simpler, + but equivalent stream, repairing problems with mismatched + operators, maintaining structure (such as BMC, EMC calls) + and leaving the graphics state in a known (default) state + so that subsequent operations (such as synthesising new + operators to be appended to the stream) are easier. + + The net graphical effect of the filtered operator stream + should be identical to the incoming operator stream. +*/ +pdf_processor *pdf_new_sanitize_filter(fz_context *ctx, pdf_document *doc, pdf_processor *chain, int struct_parents, fz_matrix transform, pdf_filter_options *options, void *sopts); + +pdf_obj *pdf_filter_xobject_instance(fz_context *ctx, pdf_obj *old_xobj, pdf_obj *page_res, fz_matrix ctm, pdf_filter_options *options, pdf_cycle_list *cycle_up); + +void pdf_processor_push_resources(fz_context *ctx, pdf_processor *proc, pdf_obj *res); + +pdf_obj *pdf_processor_pop_resources(fz_context *ctx, pdf_processor *proc); + +/* + opaque: Opaque value that is passed to all the filter functions. + + color_rewrite: function pointer called to rewrite a color + On entry: + *cs = reference to a pdf object representing the colorspace. + + *n = number of color components + + color = *n color values. + + On exit: + *cs either the same (for no change in colorspace) or + updated to be a new one. Reference must be dropped, and + a new kept reference returned! + + *n = number of color components (maybe updated) + + color = *n color values (maybe updated) + + image_rewrite: function pointer called to rewrite an image + On entry: + *image = reference to an fz_image. + + On exit: + *image either the same (for no change) or updated + to be a new one. Reference must be dropped, and a + new kept reference returned. 
+ + shade_rewrite: function pointer called to rewrite a shade + + repeated_image_rewrite: If 0, then each image is rewritten only once. + Otherwise, it is called for every instance (useful if gathering + information about the ctm). +*/ +typedef struct +{ + void *opaque; + void (*color_rewrite)(fz_context *ctx, void *opaque, pdf_obj **cs, int *n, float color[FZ_MAX_COLORS]); + void (*image_rewrite)(fz_context *ctx, void *opaque, fz_image **image, fz_matrix ctm, pdf_obj *obj); + pdf_shade_recolorer *shade_rewrite; + int repeated_image_rewrite; +} pdf_color_filter_options; + +pdf_processor * +pdf_new_color_filter(fz_context *ctx, pdf_document *doc, pdf_processor *chain, int struct_parents, fz_matrix transform, pdf_filter_options *options, void *copts); + +/* + Functions to actually process annotations, glyphs and general stream objects. +*/ +void pdf_process_contents(fz_context *ctx, pdf_processor *proc, pdf_document *doc, pdf_obj *res, pdf_obj *stm, fz_cookie *cookie, pdf_obj **out_res); +void pdf_process_annot(fz_context *ctx, pdf_processor *proc, pdf_annot *annot, fz_cookie *cookie); +void pdf_process_glyph(fz_context *ctx, pdf_processor *proc, pdf_document *doc, pdf_obj *resources, fz_buffer *contents); + +/* + Function to process a contents stream without handling the resources. + The caller is responsible for pushing/popping the resources. 
+*/ +void pdf_process_raw_contents(fz_context *ctx, pdf_processor *proc, pdf_document *doc, pdf_obj *rdb, pdf_obj *stmobj, fz_cookie *cookie); + +/* Text handling helper functions */ +typedef struct +{ + float char_space; + float word_space; + float scale; + float leading; + pdf_font_desc *font; + fz_string *fontname; + float size; + int render; + float rise; +} pdf_text_state; + +typedef struct +{ + fz_text *text; + fz_rect text_bbox; + fz_matrix tlm; + fz_matrix tm; + int text_mode; + + int cid; + int gid; + fz_rect char_bbox; + pdf_font_desc *fontdesc; + float char_tx; + float char_ty; +} pdf_text_object_state; + +void pdf_tos_save(fz_context *ctx, pdf_text_object_state *tos, fz_matrix save[2]); +void pdf_tos_restore(fz_context *ctx, pdf_text_object_state *tos, fz_matrix save[2]); +fz_text *pdf_tos_get_text(fz_context *ctx, pdf_text_object_state *tos); +void pdf_tos_reset(fz_context *ctx, pdf_text_object_state *tos, int render); +int pdf_tos_make_trm(fz_context *ctx, pdf_text_object_state *tos, pdf_text_state *text, pdf_font_desc *fontdesc, int cid, fz_matrix *trm, float *adv); +void pdf_tos_move_after_char(fz_context *ctx, pdf_text_object_state *tos); +void pdf_tos_translate(pdf_text_object_state *tos, float tx, float ty); +void pdf_tos_set_matrix(pdf_text_object_state *tos, float a, float b, float c, float d, float e, float f); +void pdf_tos_newline(pdf_text_object_state *tos, float leading); + +#endif
