view mupdf-source/include/mupdf/fitz/text.h @ 40:aa33339d6b8a upstream

ADD: MuPDF v1.26.10: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.5.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Oct 2025 11:31:38 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

// Copyright (C) 2004-2024 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.

#ifndef MUPDF_FITZ_TEXT_H
#define MUPDF_FITZ_TEXT_H

#include "mupdf/fitz/system.h"
#include "mupdf/fitz/context.h"
#include "mupdf/fitz/font.h"
#include "mupdf/fitz/path.h"
#include "mupdf/fitz/bidi.h"

/**
	Text buffer.

	The trm field contains the a, b, c and d coefficients.
	The e and f coefficients come from the individual elements,
	together they form the transform matrix for the glyph.

	Glyphs are referenced by glyph ID.
	The Unicode text equivalent is kept in a separate array
	with indexes into the glyph array.
*/

typedef struct
{
	float x, y;
	float adv; /* advance width given by input format */
	int gid; /* -1 for one gid to many ucs mappings */
	int ucs; /* -1 for one ucs to many gid mappings */
	int cid; /* CID for CJK fonts, raw character code for other fonts; or unicode for non-PDF formats. */
} fz_text_item;

#define FZ_LANG_TAG2(c1,c2) ((c1-'a'+1) + ((c2-'a'+1)*27))
#define FZ_LANG_TAG3(c1,c2,c3) ((c1-'a'+1) + ((c2-'a'+1)*27) + ((c3-'a'+1)*27*27))

typedef enum
{
	FZ_LANG_UNSET = 0,
	FZ_LANG_ur = FZ_LANG_TAG2('u','r'),
	FZ_LANG_urd = FZ_LANG_TAG3('u','r','d'),
	FZ_LANG_ko = FZ_LANG_TAG2('k','o'),
	FZ_LANG_ja = FZ_LANG_TAG2('j','a'),
	FZ_LANG_zh = FZ_LANG_TAG2('z','h'),
	FZ_LANG_zh_Hans = FZ_LANG_TAG3('z','h','s'),
	FZ_LANG_zh_Hant = FZ_LANG_TAG3('z','h','t'),
} fz_text_language;

typedef struct fz_text_span
{
	fz_font *font;
	fz_matrix trm;
	unsigned wmode : 1;		/* 0 horizontal, 1 vertical */
	unsigned bidi_level : 7;	/* The bidirectional level of text */
	unsigned markup_dir : 2;	/* The direction of text as marked in the original document */
	unsigned language : 15;		/* The language as marked in the original document */
	int len, cap;
	fz_text_item *items;
	struct fz_text_span *next;
} fz_text_span;

typedef struct
{
	int refs;
	fz_text_span *head, *tail;
} fz_text;

/**
	Create a new empty fz_text object.

	Throws exception on failure to allocate.
*/
fz_text *fz_new_text(fz_context *ctx);

/**
	Increment the reference count for the text object. The same
	pointer is returned.

	Never throws exceptions.
*/
fz_text *fz_keep_text(fz_context *ctx, const fz_text *text);

/**
	Decrement the reference count for the text object. When the
	reference count hits zero, the text object is freed.

	Never throws exceptions.
*/
void fz_drop_text(fz_context *ctx, const fz_text *text);

/**
	Add a glyph/unicode value to a text object.

	text: Text object to add to.

	font: The font the glyph should be added in.

	trm: The transform to use for the glyph.

	glyph: The glyph id to add.

	unicode: The unicode character for the glyph.

	cid: The CJK CID value or raw character code.

	wmode: 1 for vertical mode, 0 for horizontal.

	bidi_level: The bidirectional level for this glyph.

	markup_dir: The direction of the text as specified in the
	markup.

	language: The language in use (if known, 0 otherwise)
	(e.g. FZ_LANG_zh_Hans).

	Throws exception on failure to allocate.
*/
void fz_show_glyph(fz_context *ctx, fz_text *text, fz_font *font, fz_matrix trm, int glyph, int unicode, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language);
void fz_show_glyph_aux(fz_context *ctx, fz_text *text, fz_font *font, fz_matrix trm, float adv, int glyph, int unicode, int cid, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language lang);

/**
	Add a UTF8 string to a text object.

	text: Text object to add to.

	font: The font the string should be added in.

	trm: The transform to use.

	s: The utf-8 string to add.

	wmode: 1 for vertical mode, 0 for horizontal.

	bidi_level: The bidirectional level for this glyph.

	markup_dir: The direction of the text as specified in the markup.

	language: The language in use (if known, 0 otherwise)
		(e.g. FZ_LANG_zh_Hans).

	Returns the transform updated with the advance width of the
	string.
*/
fz_matrix fz_show_string(fz_context *ctx, fz_text *text, fz_font *font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language);

/**
	Measure the advance width of a UTF8 string should it be added to a text object.

	This uses the same layout algorithms as fz_show_string, and can be used
	to calculate text alignment adjustments.
*/
fz_matrix
fz_measure_string(fz_context *ctx, fz_font *user_font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language);

/**
	Find the bounds of a given text object.

	text: The text object to find the bounds of.

	stroke: Pointer to the stroke attributes (for stroked
	text), or NULL (for filled text).

	ctm: The matrix in use.

	r: pointer to storage for the bounds.

	Returns a pointer to r, which is updated to contain the
	bounding box for the text object.
*/
fz_rect fz_bound_text(fz_context *ctx, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm);

/**
	Convert ISO 639 (639-{1,2,3,5}) language specification
	strings losslessly to a 15 bit fz_text_language code.

	No validation is carried out. Obviously invalid (out
	of spec) codes will be mapped to FZ_LANG_UNSET, but
	well-formed (but undefined) codes will be blithely
	accepted.
*/
fz_text_language fz_text_language_from_string(const char *str);

/**
	Recover ISO 639 (639-{1,2,3,5}) language specification
	strings losslessly from a 15 bit fz_text_language code.

	No validation is carried out. See note above.
*/
char *fz_string_from_text_language(char str[8], fz_text_language lang);

#endif