Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/include/mupdf/fitz/structured-text.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/include/mupdf/fitz/structured-text.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,736 @@
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
+#define MUPDF_FITZ_STRUCTURED_TEXT_H
+
+#include "mupdf/fitz/system.h"
+#include "mupdf/fitz/types.h"
+#include "mupdf/fitz/context.h"
+#include "mupdf/fitz/geometry.h"
+#include "mupdf/fitz/font.h"
+#include "mupdf/fitz/image.h"
+#include "mupdf/fitz/output.h"
+#include "mupdf/fitz/device.h"
+#include "mupdf/fitz/pool.h"
+
+/**
+	Simple text layout (for use with annotation editing primarily).
+
+	fz_layout_char is one character of a laid-out line. Characters
+	are chained into a singly linked list through 'next'.
+*/
+typedef struct fz_layout_char
+{
+	float x, advance; /* NOTE(review): presumably the position and advance width given to fz_add_layout_char as x and w - confirm */
+	const char *p; /* location in source text of character */
+	struct fz_layout_char *next; /* next character in the list */
+} fz_layout_char;
+
+typedef struct fz_layout_line
+{
+	float x, y, font_size; /* NOTE(review): presumably set from the x, y and h arguments of fz_add_layout_line - confirm */
+	const char *p; /* location in source text of start of line */
+	fz_layout_char *text; /* list of characters on this line, linked through fz_layout_char.next */
+	struct fz_layout_line *next; /* next line in the list */
+} fz_layout_line;
+
+typedef struct
+{
+	fz_pool *pool; /* allocation pool; created by fz_new_layout, freed by fz_drop_layout */
+	fz_matrix matrix; /* zeroed by fz_new_layout; NOTE(review): purpose is not documented in this header */
+	fz_matrix inv_matrix; /* zeroed by fz_new_layout; NOTE(review): presumably the inverse of 'matrix' - confirm */
+	fz_layout_line *head, **tailp; /* head: list of lines. tailp: NOTE(review): presumably the link that the next added line is stored through - confirm */
+	fz_layout_char **text_tailp; /* NOTE(review): presumably the link that the next added char is stored through - confirm */
+} fz_layout_block;
+
+/**
+	Create a new layout block, with new allocation pool, zero
+	matrices, and initialise linked pointers.
+
+	Returns the new layout block. Release it with fz_drop_layout.
+*/
+fz_layout_block *fz_new_layout(fz_context *ctx);
+
+/**
+	Drop layout block. Free the pool, and linked blocks.
+
+	block: The layout block to release, as created by
+	fz_new_layout.
+
+	Never throws exceptions.
+*/
+void fz_drop_layout(fz_context *ctx, fz_layout_block *block);
+
+/**
+	Add a new line to the end of the layout block.
+
+	block: The layout block to append the line to.
+
+	x, y: NOTE(review): presumably the position of the line,
+	stored in fz_layout_line.x and fz_layout_line.y - confirm.
+
+	h: NOTE(review): presumably stored as fz_layout_line.font_size
+	- confirm.
+
+	p: Location in the source text of the start of the line.
+*/
+void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p);
+
+/**
+	Add a new char to the line at the end of the layout block.
+
+	block: The layout block whose last line receives the char.
+
+	x: NOTE(review): presumably stored as fz_layout_char.x -
+	confirm.
+
+	w: NOTE(review): presumably stored as fz_layout_char.advance
+	- confirm.
+
+	p: Location in the source text of the character.
+*/
+void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p);
+
+/**
+	Text extraction device: Used for searching, format conversion etc.
+
+	(In development - Subject to change in future versions)
+
+	The structured text types are forward declared here; the
+	struct definitions follow later in this header.
+*/
+
+typedef struct fz_stext_char fz_stext_char;
+typedef struct fz_stext_line fz_stext_line;
+typedef struct fz_stext_block fz_stext_block;
+typedef struct fz_stext_struct fz_stext_struct;
+typedef struct fz_stext_grid_positions fz_stext_grid_positions;
+
+/**
+	FZ_STEXT_PRESERVE_LIGATURES: If this option is activated
+	ligatures are passed through to the application in their
+	original form. If this option is deactivated ligatures are
+	expanded into their constituent parts, e.g. the ligature ffi is
+	expanded into three separate characters f, f and i.
+
+	FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated
+	whitespace is passed through to the application in its original
+	form. If this option is deactivated any type of horizontal
+	whitespace (including horizontal tabs) will be replaced with
+	space characters of variable width.
+
+	FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images
+	will be stored in the structured text structure. The default is
+	to ignore all images.
+
+	FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try
+	to add missing space characters where there are large gaps
+	between characters.
+
+	FZ_STEXT_DEHYPHENATE: If this option is set, hyphens at the
+	end of a line will be removed and the lines will be merged.
+
+	FZ_STEXT_PRESERVE_SPANS: If this option is set, spans on the same line
+	will not be merged. Each line will thus be a span of text with the same
+	font, colour, and size.
+
+	FZ_STEXT_CLIP: If this option is set, characters that would be entirely
+	clipped away by the current clipping path (or, more accurately, the smallest
+	bbox that contains the current clipping path) will be ignored. The
+	clip path is guaranteed to be smaller than the page mediabox, hence
+	this option subsumes an older, now deprecated, FZ_STEXT_MEDIABOX_CLIP
+	option.
+
+	FZ_STEXT_CLIP_RECT: If this option is set, characters that would be entirely
+	clipped away by the specified 'clip' rectangle in the options struct
+	will be ignored. This enables content from specific subsections of pages to
+	be extracted.
+
+	FZ_STEXT_COLLECT_STRUCTURE: If this option is set, we will collect
+	the structure as specified using begin/end_structure calls. This will
+	change the returned stext structure from being a simple list of blocks
+	into effectively being a 'tree' that should be walked in depth-first
+	order.
+
+	FZ_STEXT_COLLECT_VECTORS: If this option is set, we will collect
+	details (currently just the bbox) of vector graphics. This is intended
+	to be of use in segmentation analysis.
+
+	FZ_STEXT_IGNORE_ACTUALTEXT: If this option is set, we will no longer
+	replace text by the ActualText replacement specified in the document.
+
+	FZ_STEXT_SEGMENT: If this option is set, we will attempt to segment
+	the page into different regions. This will deliberately not do anything
+	to pages with structure information present.
+
+	FZ_STEXT_PARAGRAPH_BREAK: If this option is set, we will break blocks
+	of text at what appear to be paragraph boundaries. This only works
+	for left-to-right, top-to-bottom paragraphs. Works best on a segmented
+	page.
+
+	FZ_STEXT_TABLE_HUNT: If this option is set, we will hunt for tables
+	within the stext. Details of the potential tables found will be
+	inserted into the stext for the caller to interpret. This will work
+	best on a segmented page.
+
+	FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE: If this option is set, then
+	in the event that we fail to find a unicode value for a given
+	character, we will instead return its CID in the unicode field. We
+	will set the FZ_STEXT_UNICODE_IS_CID bit in the char flags word to
+	indicate that this has happened.
+
+	FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE: If this option is set, then
+	in the event that we fail to find a unicode value for a given
+	character, we will instead return its glyph in the unicode field.
+	We will set the FZ_STEXT_UNICODE_IS_GID bit in the char flags word
+	to indicate that this has happened.
+
+	Setting both FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE and
+	FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE will give undefined behaviour.
+
+*/
+enum
+{
+	FZ_STEXT_PRESERVE_LIGATURES = 1, /* every option below is a distinct single bit (1<<0 up to 1<<19) */
+	FZ_STEXT_PRESERVE_WHITESPACE = 2,
+	FZ_STEXT_PRESERVE_IMAGES = 4,
+	FZ_STEXT_INHIBIT_SPACES = 8,
+	FZ_STEXT_DEHYPHENATE = 16,
+	FZ_STEXT_PRESERVE_SPANS = 32,
+	FZ_STEXT_CLIP = 64,
+	FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE = 128,
+	FZ_STEXT_COLLECT_STRUCTURE = 256,
+	FZ_STEXT_ACCURATE_BBOXES = 512, /* NOTE(review): has no entry in the option list above - document it there */
+	FZ_STEXT_COLLECT_VECTORS = 1024,
+	FZ_STEXT_IGNORE_ACTUALTEXT = 2048,
+	FZ_STEXT_SEGMENT = 4096,
+	FZ_STEXT_PARAGRAPH_BREAK = 8192,
+	FZ_STEXT_TABLE_HUNT = 16384,
+	FZ_STEXT_COLLECT_STYLES = 32768, /* NOTE(review): has no entry in the option list above - document it there */
+	FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE = 65536,
+	FZ_STEXT_CLIP_RECT = (1<<17),
+	FZ_STEXT_ACCURATE_ASCENDERS = (1<<18), /* NOTE(review): has no entry in the option list above - document it there */
+	FZ_STEXT_ACCURATE_SIDE_BEARINGS = (1<<19), /* NOTE(review): has no entry in the option list above - document it there */
+
+	/* An old, deprecated option. Kept as an alias with the same
+	 * value as FZ_STEXT_CLIP. */
+	FZ_STEXT_MEDIABOX_CLIP = FZ_STEXT_CLIP
+};
+
+/**
+ *	A note on stext's handling of structure.
+ *
+ *	A PDF document can contain a structure tree. This gives the
+ *	structure of a document in its entirety as a tree. e.g.
+ *
+ *	Tree			MCID	INDEX
+ *	-------------------------------------
+ *	DOC			0	0
+ *	 TOC			1	0
+ *	  TOC_ITEM		2	0
+ *	  TOC_ITEM		3	1
+ *	  TOC_ITEM		4	2
+ *	  ...
+ *	 STORY			100	1
+ *	  SECTION		101	0
+ *	   HEADING		102	0
+ *	   SUBSECTION		103	1
+ *	    PARAGRAPH		104	0
+ *	    PARAGRAPH		105	1
+ *	    PARAGRAPH		106	2
+ *	   SUBSECTION		107	2
+ *	    PARAGRAPH		108	0
+ *	    PARAGRAPH		109	1
+ *	    PARAGRAPH		110	2
+ *	   ...
+ *	  SECTION		200	1
+ *      ...
+ *
+ *	Each different section of the tree is identified as part of an
+ *	MCID by a number (this is a slight simplification, but makes the
+ *	explanation easier).
+ *
+ *	The PDF document contains markings that say "Entering MCID 0"
+ *	and "Leaving MCID 0". Any content within that region is therefore
+ *	identified as appearing in that particular structural region.
+ *
+ *	This means that content can be sent in the document in a different
+ *	order from that in which it appears 'logically' in the tree.
+ *
+ *	MuPDF converts this tree form into a nested series of calls to
+ *	begin_structure and end_structure.
+ *
+ *	For instance, if the document started out with MCID 100, then
+ *	we'd send:
+ *		begin_structure("DOC")
+ *		begin_structure("STORY")
+ *
+ *	The problem with this is that if we send:
+ *		begin_structure("DOC")
+ *		begin_structure("STORY")
+ *		begin_structure("SECTION")
+ *		begin_structure("SUBSECTION")
+ *
+ *	or
+ *		begin_structure("DOC")
+ *		begin_structure("STORY")
+ *		begin_structure("SECTION")
+ *		begin_structure("HEADING")
+ *
+ *	How do I know what order the SUBSECTION and HEADING should appear in?
+ *	Are they even in the same STORY? Or the same DOC?
+ *
+ *	Accordingly, every begin_structure is accompanied not only with the
+ *	node type, but with an index. The index is the number of this node
+ *	within this level of the tree. Hence:
+ *
+ *		begin_structure("DOC", 0)
+ *		begin_structure("STORY", 0)
+ *		begin_structure("SECTION", 0)
+ *		begin_structure("HEADING", 0)
+ *	and
+ *		begin_structure("DOC", 0)
+ *		begin_structure("STORY", 0)
+ *		begin_structure("SECTION", 0)
+ *		begin_structure("SUBSECTION", 1)
+ *
+ *	are now unambiguous in their describing of the tree.
+ *
+ *	MuPDF automatically sends the minimal end_structure/begin_structure
+ *	pairs to move us between nodes in the tree.
+ *
+ *	In order to accommodate this information within the structured text
+ *	data structures an additional block type is used. Previously a
+ *	"page" was just a list of blocks, either text or images. e.g.
+ *
+ *	[BLOCK:TEXT] <-> [BLOCK:IMG] <-> [BLOCK:TEXT] <-> [BLOCK:TEXT] ...
+ *
+ *	We now introduce a new type of block, STRUCT, that turns this into
+ *	a tree:
+ *
+ *	[BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ...
+ *	                      /|\
+ *	[STRUCT:TYPE=DOC] <----
+ *	    |
+ *	[BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ...
+ *	                      /|\
+ *	[STRUCT:TYPE=STORY] <--
+ *	    |
+ *	   ...
+ *
+ *	Rather than doing a simple linear traversal of the list to extract
+ *	the logical data, a caller now has to do a depth-first traversal.
+ */
+
+/**
+	A text page is a list of blocks, together with an overall
+	bounding box.
+
+	The list starts at first_block and is linked through the
+	prev/next pointers of fz_stext_block. Blocks of type
+	FZ_STEXT_BLOCK_STRUCT lead to nested lists of blocks, so
+	extracting the logical content takes a depth-first traversal
+	rather than a simple linear walk.
+*/
+typedef struct
+{
+	fz_pool *pool; /* NOTE(review): presumably the pool that the page contents are allocated from - confirm */
+	fz_rect mediabox; /* NOTE(review): presumably the mediabox given to fz_new_stext_page - confirm */
+	fz_stext_block *first_block; /* first block of the top level list */
+
+	/* The following fields are only of use to the routines that
+	 * build an fz_stext_page. They change during page construction
+	 * and their meaning is subject to change. These values should
+	 * not be used by anything outside of the stext device. */
+	fz_stext_block *last_block;
+	fz_stext_struct *last_struct;
+} fz_stext_page;
+
+enum
+{
+	FZ_STEXT_BLOCK_TEXT = 0, /* NOTE(review): these look like the values of fz_stext_block.type - confirm */
+	FZ_STEXT_BLOCK_IMAGE = 1,
+	FZ_STEXT_BLOCK_STRUCT = 2, /* the block kind that turns the block list into a tree (via u.s.down) */
+	FZ_STEXT_BLOCK_VECTOR = 3,
+	FZ_STEXT_BLOCK_GRID = 4
+};
+
+enum
+{
+	FZ_STEXT_TEXT_JUSTIFY_UNKNOWN = 0, /* NOTE(review): presumably the values kept in fz_stext_block.u.t.flags - confirm */
+	FZ_STEXT_TEXT_JUSTIFY_LEFT = 1,
+	FZ_STEXT_TEXT_JUSTIFY_CENTRE = 2,
+	FZ_STEXT_TEXT_JUSTIFY_RIGHT = 3,
+	FZ_STEXT_TEXT_JUSTIFY_FULL = 4,
+};
+
+enum
+{
+	/* Indicates that this vector came from a stroked
+	 * path.
+	 * (The values in this enum are distinct single bits.
+	 * NOTE(review): presumably combined in
+	 * fz_stext_block.u.v.flags - confirm.) */
+	FZ_STEXT_VECTOR_IS_STROKED = 1,
+
+	/* Indicates that this vector came from a rectangular
+	 * (axis-aligned) path (or path segment). */
+	FZ_STEXT_VECTOR_IS_RECTANGLE = 2,
+
+	/* Indicates that this vector came from a path
+	 * segment, and more segments from this same path are
+	 * still to come. */
+	FZ_STEXT_VECTOR_CONTINUES = 4
+};
+
+/**
+	A text block is a list of lines of text (typically a paragraph),
+	or an image. The union below also carries data for structure,
+	vector and grid blocks (see the FZ_STEXT_BLOCK_* values).
+
+	type: NOTE(review): presumably one of FZ_STEXT_BLOCK_*, telling
+	which member of the union 'u' is in use - confirm.
+
+	prev, next: Blocks form a doubly linked list.
+*/
+struct fz_stext_block
+{
+	int type;
+	fz_rect bbox;
+	union {
+		struct { fz_stext_line *first_line, *last_line; int flags;} t; /* text: list of lines. NOTE(review): flags presumably holds an FZ_STEXT_TEXT_JUSTIFY_* value - confirm */
+		struct { fz_matrix transform; fz_image *image; } i; /* image and its transform */
+		struct { fz_stext_struct *down; int index; } s; /* structure: down is the fz_stext_struct below this block; index is the number of the node within its level of the tree */
+		struct { uint32_t flags; uint32_t argb; } v; /* vector. NOTE(review): flags presumably holds FZ_STEXT_VECTOR_* bits - confirm */
+		struct { fz_stext_grid_positions *xs; fz_stext_grid_positions *ys; } b; /* NOTE(review): presumably the grid block data - confirm */
+	} u;
+	fz_stext_block *prev, *next;
+};
+
+/**
+	A text line is a list of characters that share a common baseline.
+
+	Lines are doubly linked through prev/next; the characters of a
+	line are singly linked through fz_stext_char.next.
+*/
+struct fz_stext_line
+{
+	int wmode; /* 0 for horizontal, 1 for vertical */
+	fz_point dir; /* normalized direction of baseline */
+	fz_rect bbox;
+	fz_stext_char *first_char, *last_char; /* first and last character of the line's character list */
+	fz_stext_line *prev, *next;
+};
+
+/**
+	A text char is a unicode character, the style in which it
+	appears, and the point at which it is positioned.
+*/
+struct fz_stext_char
+{
+	int c; /* unicode character value (the CID or glyph instead, when FZ_STEXT_UNICODE_IS_CID or FZ_STEXT_UNICODE_IS_GID is set in flags) */
+	uint16_t bidi; /* even for LTR, odd for RTL - probably only needs 8 bits? */
+	uint16_t flags; /* char flags word; holds FZ_STEXT_UNICODE_IS_CID / FZ_STEXT_UNICODE_IS_GID. NOTE(review): presumably the other per-char FZ_STEXT_* flags too - confirm */
+	uint32_t argb; /* sRGB hex color (alpha in top 8 bits, then r, then g, then b in low bits) */
+	fz_point origin;
+	fz_quad quad;
+	float size;
+	fz_font *font;
+	fz_stext_char *next; /* next character in the line */
+};
+
+enum
+{
+	FZ_STEXT_STRIKEOUT = 1, /* the values in this enum are distinct single bits. NOTE(review): presumably all of them live in fz_stext_char.flags - confirm */
+	FZ_STEXT_UNDERLINE = 2,
+	FZ_STEXT_SYNTHETIC = 4,
+	FZ_STEXT_BOLD = 8, /* Either real or 'fake' bold */
+	FZ_STEXT_FILLED = 16,
+	FZ_STEXT_STROKED = 32,
+	FZ_STEXT_CLIPPED = 64,
+	FZ_STEXT_UNICODE_IS_CID = 128, /* set when the unicode field holds a CID (see FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE) */
+	FZ_STEXT_UNICODE_IS_GID = 256, /* set when the unicode field holds a glyph (see FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE) */
+};
+
+/**
+	When we are collecting the structure information from
+	PDF structure trees/tags, we end up with a tree of
+	nodes. The structure should be walked in depth-first
+	traversal order to extract the content.
+
+	An fz_stext_struct pointer can be NULL to indicate that
+	we know there is a child there within the complete tree,
+	but we don't know what it is yet.
+
+	An fz_stext_struct is reached from a block of type
+	FZ_STEXT_BLOCK_STRUCT through u.s.down, and points back to
+	that block through 'up'.
+*/
+struct fz_stext_struct
+{
+	/* up points to the block that contains this fz_stext_struct. */
+	fz_stext_block *up;
+	/* parent points to the struct that has up as one of its children.
+	 * parent is useful for doing depth first traversal without having
+	 * to store the entire chain of structs in the iterator. */
+	fz_stext_struct *parent;
+
+	/* first_block points to the first child of this node (or NULL
+	 * if there are none). */
+	fz_stext_block *first_block;
+	/* last_block points to the last child of this node (or NULL
+	 * if there are none). */
+	fz_stext_block *last_block;
+
+	/* We have a set of 'standard' structure types. Every structure
+	 * element should correspond to one of these. */
+	fz_structure standard;
+	/* Documents can use their own non-standard structure types, which
+	 * are held as 'raw' strings.
+	 * NOTE(review): FZ_FLEXIBLE_ARRAY is defined elsewhere; raw
+	 * presumably extends past the end of the struct and so has to
+	 * stay the last member - confirm. */
+	char raw[FZ_FLEXIBLE_ARRAY];
+};
+
+/* An example to show how fz_stext_blocks and fz_stext_structs interact:
+ *
+ *         [fz_stext_page]
+ *             |
+ *  first_block|
+ *             |
+ *            \|/
+ *  [fz_stext_block:TEXT]<->[fz_stext_block:STRUCT]<->[fz_stext_block:IMG]
+ *                           u.s.down|   /|\
+ *                                   |    |
+ *                                  \|/   |up
+ *                             [fz_stext_struct]<---------.
+ *                                |       |               |
+ *                     first_block|       |last_block     |
+ *         _______________________|       |               |
+ *        |                               |               |
+ *        |                               |               |
+ *       \|/                             \|/              |
+ *  [fz_stext_block:...]<->...<->[fz_stext_block:STRUCT]  |
+ *                                  |  /|\                |
+ *                          u.s.down|   |up               |
+ *                                 \|/  |           parent|
+ *                               [fz_stext_struct]--------'
+ *                                  |   |
+ *                       first_block|   |last_block
+ *                                  :   :
+ */
+
+ struct fz_stext_grid_positions /* referenced as xs and ys from fz_stext_block.u.b */
+ {
+	int len; /* NOTE(review): presumably the number of entries in list - confirm */
+	int max_uncertainty; /* NOTE(review): presumably the largest 'uncertainty' among the entries - confirm */
+	struct {
+		int reinforcement; /* NOTE(review): undocumented - describe once confirmed against the code that builds grids */
+		float pos;
+		float min;
+		float max;
+		int uncertainty; /* NOTE(review): undocumented - describe once confirmed against the code that builds grids */
+	} list[FZ_FLEXIBLE_ARRAY]; /* trailing array; FZ_FLEXIBLE_ARRAY is defined elsewhere */
+ };
+
+FZ_DATA extern const char *fz_stext_options_usage; /* NOTE(review): presumably help text for the option string taken by fz_parse_stext_options - confirm */
+
+/**
+	Create an empty text page.
+
+	The text page is filled out by the text device to contain the
+	blocks and lines of text on the page.
+
+	mediabox: optional mediabox information.
+
+	Returns the new text page. Release it with fz_drop_stext_page,
+	which is declared directly below.
+*/
+fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox);
+void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
+
+/**
+	Output structured text to a file in HTML (visual) format.
+
+	out: The output to write to.
+
+	page: The text page to print.
+
+	id: NOTE(review): presumably a number identifying the page in
+	the generated markup - confirm.
+
+	NOTE(review): the header and trailer functions presumably
+	write the markup that goes before the first and after the last
+	page - confirm.
+*/
+void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
+void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out);
+void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out);
+
+/**
+	Output structured text to a file in XHTML (semantic) format.
+
+	out: The output to write to.
+
+	page: The text page to print.
+
+	id: NOTE(review): presumably a number identifying the page in
+	the generated markup - confirm.
+
+	NOTE(review): the header and trailer functions presumably
+	write the markup that goes before the first and after the last
+	page - confirm.
+*/
+void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
+void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out);
+void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out);
+
+/**
+	Output structured text to a file in XML format.
+
+	out: The output to write to.
+
+	page: The text page to print.
+
+	id: NOTE(review): presumably a number identifying the page in
+	the generated markup - confirm.
+*/
+void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
+
+/**
+	Output structured text to a file in JSON format.
+
+	out: The output to write to.
+
+	page: The text page to print.
+
+	scale: NOTE(review): not documented here; presumably a factor
+	applied to the coordinates that are written - confirm.
+*/
+void fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale);
+
+/**
+	Output structured text to a file in plain-text UTF-8 format.
+
+	out: The output to write to.
+
+	page: The text page to print.
+*/
+void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page);
+
+/**
+	Search for occurrence of 'needle' in text page.
+
+	Return the number of quads and store hit quads in the passed in
+	array.
+
+	text: The text page to search.
+
+	needle: The string to search for.
+
+	hit_mark: NOTE(review): not documented here - describe once
+	confirmed against the implementation.
+
+	hit_bbox: The array that receives the hit quads.
+
+	hit_max: NOTE(review): presumably the number of entries
+	available in hit_bbox - confirm.
+
+	NOTE: This is an experimental interface and subject to change
+	without notice.
+*/
+int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, int *hit_mark, fz_quad *hit_bbox, int hit_max);
+
+/**
+	Callback function for use in searching.
+
+	Called with the list of quads that correspond to a single hit.
+
+	opaque: NOTE(review): presumably the 'opaque' pointer given to
+	fz_search_stext_page_cb, handed through unchanged - confirm.
+
+	num_quads: The number of quads in hit_bbox.
+
+	hit_bbox: The quads of this hit.
+
+	The callback should return with 0 to continue the search, or 1 to abort it.
+	All other values are reserved at this point.
+*/
+typedef int (fz_search_callback_fn)(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox);
+
+/**
+	Search for occurrence of 'needle' in text page.
+
+	Call callback once for each hit. This callback will receive
+	(potentially) multiple quads for each hit.
+
+	text: The text page to search.
+
+	needle: The string to search for.
+
+	cb: The callback to call for each hit (see
+	fz_search_callback_fn).
+
+	opaque: NOTE(review): presumably passed to cb unchanged on
+	every call - confirm.
+
+	Returns the number of hits - note that this is potentially
+	different from (i.e. is not greater than) the number of quads
+	as returned by the non callback API.
+
+	NOTE: This is an experimental interface and subject to change
+	without notice.
+*/
+int fz_search_stext_page_cb(fz_context *ctx, fz_stext_page *text, const char *needle, fz_search_callback_fn *cb, void *opaque);
+
+
+/**
+	Return a list of quads to highlight lines inside the selection
+	points.
+
+	a, b: The two selection points.
+
+	quads: The array that receives the quads.
+
+	max_quads: NOTE(review): presumably the number of entries
+	available in quads - confirm.
+
+	NOTE(review): the int result is presumably the number of quads
+	stored - confirm.
+*/
+int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads);
+
+enum
+{
+	FZ_SELECT_CHARS, /* NOTE(review): presumably the values accepted as 'mode' by fz_snap_selection - confirm */
+	FZ_SELECT_WORDS,
+	FZ_SELECT_LINES,
+};
+
+fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode); /* NOTE(review): undocumented. ap and bp are passed by pointer, so presumably they are adjusted in place; mode is presumably one of FZ_SELECT_* - confirm */
+
+/**
+	Return a newly allocated UTF-8 string with the text for a given
+	selection.
+
+	a, b: The two points that delimit the selection.
+
+	crlf: If true, write "\r\n" style line endings (otherwise "\n"
+	only).
+
+	The returned string is newly allocated, so it is up to the
+	caller to free it (NOTE(review): presumably with fz_free -
+	confirm).
+*/
+char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf);
+
+/**
+	Return a newly allocated UTF-8 string with the text for a given
+	selection rectangle.
+
+	area: The selection rectangle.
+
+	crlf: If true, write "\r\n" style line endings (otherwise "\n"
+	only).
+
+	The returned string is newly allocated, so it is up to the
+	caller to free it (NOTE(review): presumably with fz_free -
+	confirm).
+*/
+char *fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf);
+
+/**
+	Options for creating structured text.
+
+	Passed to fz_new_stext_device and fz_open_reflowed_document;
+	filled in from a string by fz_parse_stext_options.
+*/
+typedef struct
+{
+	int flags; /* NOTE(review): presumably a combination of the FZ_STEXT_* option bits - confirm */
+	float scale; /* NOTE(review): not documented in this header */
+	fz_rect clip; /* the 'clip' rectangle that FZ_STEXT_CLIP_RECT refers to */
+} fz_stext_options;
+
+/**
+	Parse stext device options from a comma separated key-value
+	string.
+
+	opts: The options struct to work on.
+
+	string: The comma separated key-value string to parse.
+
+	NOTE(review): the result is presumably the 'opts' pointer that
+	was passed in - confirm.
+*/
+fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
+
+/**
+	Perform segmentation analysis on an (unstructured) page to look for
+	recursive subdivisions.
+
+	Essentially this code attempts to split the page horizontally and/or
+	vertically repeatedly into smaller and smaller "segments" (divisions).
+
+	page: The text page to analyse.
+
+	Returns 0 if no changes were made to the document.
+	NOTE(review): presumably non-zero when the page was changed -
+	confirm.
+
+	This is experimental code, and may change (or be removed) in future
+	versions!
+*/
+int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page);
+
+/**
+	Attempt to break paragraphs at plausible places.
+
+	page: The text page to process.
+
+	NOTE(review): presumably the operation that the
+	FZ_STEXT_PARAGRAPH_BREAK option requests - confirm.
+*/
+void fz_paragraph_break(fz_context *ctx, fz_stext_page *page);
+
+/**
+	Hunt for possible tables on a page, and update the stext with
+	information.
+
+	page: The text page to search and update.
+
+	NOTE(review): presumably the operation that the
+	FZ_STEXT_TABLE_HUNT option requests - confirm.
+*/
+void fz_table_hunt(fz_context *ctx, fz_stext_page *page);
+
+/**
+	Interpret the bounded contents of a given stext page as
+	a table.
+
+	The page contents will be rewritten to contain a Table
+	structure with the identified content in it.
+
+	This uses the same logic as for fz_table_hunt, without the
+	actual hunting. fz_table_hunt hunts to find possible bounds
+	for multiple tables on the page; this routine just finds a
+	single table contained within the given rectangle.
+
+	page: The text page to rewrite.
+
+	bounds: The rectangle whose contents are interpreted as a
+	single table.
+
+	Returns the stext_block list that contains the content of
+	the table.
+*/
+fz_stext_block *
+fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds);
+
+/**
+	Create a device to extract the text on a page.
+
+	Gather the text on a page into blocks and lines.
+
+	The reading order is taken from the order the text is drawn in
+	the source file, so may not be accurate.
+
+	page: The text page to which content should be added. This will
+	usually be a newly created (empty) text page, but it can be one
+	containing data already (for example when merging multiple
+	pages, or watermarking).
+
+	options: Options to configure the stext device (see
+	fz_stext_options).
+
+	Returns the newly created device.
+*/
+fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
+
+/**
+	Create a device to OCR the text on the page.
+
+	Renders the page internally to a bitmap that is then OCRd. Text
+	is then forwarded onto the target device.
+
+	target: The target device to receive the OCRd text.
+
+	ctm: The transform to apply to the mediabox to get the size for
+	the rendered page image. Also used to calculate the resolution
+	for the page image. In general, this will be the same as the CTM
+	that you pass to fz_run_page (or fz_run_display_list) to feed
+	this device.
+
+	mediabox: The mediabox (in points). Combined with the CTM to get
+	the bounds of the pixmap used internally for the rendered page
+	image.
+
+	with_list: If with_list is false, then all non-text operations
+	are forwarded instantly to the target device. This results in
+	the target device seeing all NON-text operations, followed by
+	all the text operations (derived from OCR).
+
+	If with_list is true, then all the marking operations are
+	collated into a display list which is then replayed to the
+	target device at the end.
+
+	language: NULL (for "eng"), or a pointer to a string to describe
+	the languages/scripts that should be used for OCR (e.g.
+	"eng,ara").
+
+	datadir: NULL (for ""), or a pointer to a path string otherwise
+	provided to Tesseract in the TESSDATA_PREFIX environment variable.
+
+	progress: NULL, or function to be called periodically to indicate
+	progress. Return 0 to continue, or 1 to cancel. progress_arg is
+	returned as the void *. The int is a value between 0 and 100 to
+	indicate progress.
+
+	progress_arg: A void * value to be parroted back to the progress
+	function.
+*/
+fz_device *fz_new_ocr_device(fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language,
+			const char *datadir, int (*progress)(fz_context *, void *, int), void *progress_arg);
+
+fz_document *fz_open_reflowed_document(fz_context *ctx, fz_document *underdoc, const fz_stext_options *opts); /* NOTE(review): undocumented. Presumably opens a document that presents underdoc reflowed, using the given stext options - confirm */
+
+
+#endif
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children