diff mupdf-source/include/mupdf/fitz/text.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/include/mupdf/fitz/text.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,210 @@
+// Copyright (C) 2004-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#ifndef MUPDF_FITZ_TEXT_H
+#define MUPDF_FITZ_TEXT_H
+
+#include "mupdf/fitz/system.h"
+#include "mupdf/fitz/context.h"
+#include "mupdf/fitz/font.h"
+#include "mupdf/fitz/path.h"
+#include "mupdf/fitz/bidi.h"
+
+/**
+	Text buffer.
+
+	The trm field contains the a, b, c and d coefficients.
+	The e and f coefficients come from the individual elements,
+	together they form the transform matrix for the glyph.
+
+	Glyphs are referenced by glyph ID.
+	The Unicode text equivalent is stored next to it in the
+	same item (see the gid and ucs fields of fz_text_item).
+*/
+
+typedef struct
+{
+	float x, y; /* per-item position; presumably the e and f of the glyph matrix (see the note above) -- confirm */
+	float adv; /* advance width given by input format */
+	int gid; /* glyph ID; -1 for one gid to many ucs mappings */
+	int ucs; /* Unicode value; -1 for one ucs to many gid mappings */
+	int cid; /* CID for CJK fonts, raw character code for other fonts; or unicode for non-PDF formats. */
+} fz_text_item; /* one entry of the items array held by an fz_text_span */
+
+#define FZ_LANG_TAG2(c1,c2) (((c1)-'a'+1) + (((c2)-'a'+1)*27)) /* pack two lowercase letters as base-27 digits (a=1 .. z=26), first letter lowest; arguments are parenthesized so expression arguments expand safely */
+#define FZ_LANG_TAG3(c1,c2,c3) (((c1)-'a'+1) + (((c2)-'a'+1)*27) + (((c3)-'a'+1)*27*27)) /* three-letter variant; the largest value ('z','z','z' = 19682) still fits in 15 bits */
+
+typedef enum
+{
+	FZ_LANG_UNSET = 0, /* language not known; the 0 that the text API docs ask for in that case */
+	FZ_LANG_ur = FZ_LANG_TAG2('u','r'), /* two-letter codes are packed with FZ_LANG_TAG2 */
+	FZ_LANG_urd = FZ_LANG_TAG3('u','r','d'), /* three-letter codes are packed with FZ_LANG_TAG3 */
+	FZ_LANG_ko = FZ_LANG_TAG2('k','o'),
+	FZ_LANG_ja = FZ_LANG_TAG2('j','a'),
+	FZ_LANG_zh = FZ_LANG_TAG2('z','h'),
+	FZ_LANG_zh_Hans = FZ_LANG_TAG3('z','h','s'), /* encoded as the three letters "zhs" */
+	FZ_LANG_zh_Hant = FZ_LANG_TAG3('z','h','t'), /* encoded as the three letters "zht" */
+} fz_text_language; /* packed language tag; fz_text_span stores it in a 15 bit field */
+
+typedef struct fz_text_span
+{
+	fz_font *font;			/* font for this span; presumably shared by all of its items -- confirm */
+	fz_matrix trm;			/* supplies a, b, c and d; e and f come from the individual items */
+	unsigned wmode : 1;		/* 0 horizontal, 1 vertical */
+	unsigned bidi_level : 7;	/* The bidirectional level of text */
+	unsigned markup_dir : 2;	/* The direction of text as marked in the original document */
+	unsigned language : 15;		/* The language as marked in the original document (15 bit fz_text_language code) */
+	int len, cap;			/* presumably number of items in use and allocated capacity of items -- confirm */
+	fz_text_item *items;		/* array of glyph entries belonging to this span */
+	struct fz_text_span *next;	/* following span in the list reached from fz_text head/tail */
+} fz_text_span;
+
+typedef struct
+{
+	int refs; /* reference count, adjusted by fz_keep_text and fz_drop_text */
+	fz_text_span *head, *tail; /* ends of the span list that is linked through fz_text_span.next */
+} fz_text; /* reference counted container for a list of text spans */
+
+/**
+	Create a new empty fz_text object.
+	Pair with fz_drop_text, which frees the object once its reference count reaches zero.
+	Throws exception on failure to allocate.
+*/
+fz_text *fz_new_text(fz_context *ctx);
+
+/**
+	Increment the reference count for the text object. The same
+	pointer is returned, without the const qualifier the
+	parameter carries.
+	Never throws exceptions.
+*/
+fz_text *fz_keep_text(fz_context *ctx, const fz_text *text);
+
+/**
+	Decrement the reference count for the text object. When the
+	reference count hits zero, the text object is freed, so the
+	pointer must not be used after the caller's last drop.
+	Never throws exceptions.
+*/
+void fz_drop_text(fz_context *ctx, const fz_text *text);
+
+/**
+	Add a glyph/unicode value to a text object.
+
+	text: Text object to add to.
+
+	font: The font the glyph should be added in.
+
+	trm: The transform to use for the glyph.
+
+	adv: (fz_show_glyph_aux only) The advance width for the glyph.
+
+	glyph: The glyph id to add.
+
+	unicode: The unicode character for the glyph.
+
+	cid: (fz_show_glyph_aux only) The CJK CID value or raw character code.
+
+	wmode: 1 for vertical mode, 0 for horizontal.
+
+	bidi_level: The bidirectional level for this glyph.
+
+	markup_dir: The direction of the text as specified in the markup.
+
+	language (named lang in fz_show_glyph_aux): The language in use (if known, 0 otherwise), e.g. FZ_LANG_zh_Hans.
+
+	Throws exception on failure to allocate.
+*/
+void fz_show_glyph(fz_context *ctx, fz_text *text, fz_font *font, fz_matrix trm, int glyph, int unicode, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language);
+void fz_show_glyph_aux(fz_context *ctx, fz_text *text, fz_font *font, fz_matrix trm, float adv, int glyph, int unicode, int cid, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language lang);
+
+/**
+	Add a UTF8 string to a text object.
+
+	text: Text object to add to.
+
+	font: The font the string should be added in.
+
+	trm: The transform to use.
+
+	s: The utf-8 string to add.
+
+	wmode: 1 for vertical mode, 0 for horizontal.
+
+	bidi_level: The bidirectional level for the text of the string.
+
+	markup_dir: The direction of the text as specified in the markup.
+
+	language: The language in use (if known, 0 otherwise)
+	(e.g. FZ_LANG_zh_Hans).
+
+	Returns the transform updated with the advance width of the
+	string.
+*/
+fz_matrix fz_show_string(fz_context *ctx, fz_text *text, fz_font *font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language);
+
+/**
+	Measure the advance width of a UTF8 string should it be added to a text object.
+	Takes no text object; returns a matrix, presumably trm advanced past the string as fz_show_string does (to be confirmed).
+	This uses the same layout algorithms as fz_show_string, and can be used
+	to calculate text alignment adjustments.
+*/
+fz_matrix
+fz_measure_string(fz_context *ctx, fz_font *user_font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language);
+
+/**
+	Find the bounds of a given text object.
+
+	text: The text object to find the bounds of.
+
+	stroke: Pointer to the stroke attributes (for stroked
+	text), or NULL (for filled text).
+
+	ctm: The matrix in use.
+
+	Returns the bounding box for the text object. The rectangle
+	is returned by value; the function takes no output pointer,
+	and text and stroke are both passed as pointers to const
+	data.
+*/
+fz_rect fz_bound_text(fz_context *ctx, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm);
+
+/**
+	Convert ISO 639 (639-{1,2,3,5}) language specification
+	strings losslessly to a 15 bit fz_text_language code.
+
+	No validation is carried out. Obviously invalid (out
+	of spec) codes will be mapped to FZ_LANG_UNSET, but
+	well-formed (but undefined) codes will be blithely
+	accepted. Inverse of fz_string_from_text_language.
+*/
+fz_text_language fz_text_language_from_string(const char *str);
+
+/**
+	Recover ISO 639 (639-{1,2,3,5}) language specification
+	strings losslessly from a 15 bit fz_text_language code.
+	str: caller-provided buffer (declared char[8]); presumably also the return value -- confirm.
+	No validation is carried out. See the note on fz_text_language_from_string.
+*/
+char *fz_string_from_text_language(char str[8], fz_text_language lang);
+
+#endif