Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/fitz/bidi.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/fitz/bidi.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,589 @@ +/* + * Bidirectional text processing. + * + * Processes unicode text by arranging the characters into an order suitable + * for display. E.g. Hebrew text will be arranged from right-to-left and + * any English within the text will remain in the left-to-right order. + * Characters such as parenthesis will be substituted for their mirrored + * equivalents if they are part of text which must be reversed. + * + * This is an implementation of the unicode Bidirectional Algorithm which + * can be found here: http://www.unicode.org/reports/tr9/ and is based + * on the reference implementation of the algorithm found on that page. + * + * For a nice overview of how it works, read this... + * http://www.w3.org/TR/REC-html40/struct/dirlang.html + * + * Extracted from the SmartOffice code, where it was modified by Ian + * Beveridge. + * + * Copyright (C) Picsel, 2004. All Rights Reserved. + */ + +/* + * Original copyright notice from unicode reference implementation. + * ---------------------------------------------------------------- + * Written by: Asmus Freytag + * C++ and Windows dependencies removed, and + * command line interface added by: Rick McGowan + * + * Copyright (C) 1999, ASMUS, Inc. All Rights Reserved + */ + +/* + * Includes... + */ + +#include "mupdf/fitz.h" +#include "mupdf/ucdn.h" +#include "bidi-imp.h" /* standard bidi code interface */ +#include <assert.h> + +/* + * Macros... + */ + +#define ODD(x) ((x) & 1) + +#define REPLACEABLE_TYPE(t) ( \ + ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \ + ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \ + ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) ) + +#ifdef DEBUG_BIDI_VERBOSE +#define DBUGVF(params) do { fz_warn params; } while (0) +#else +#define DBUGVF(params) do {} while (0) +#endif + +#ifdef DEBUG_BIDI_OUTLINE +#define DBUGH(params) do { fz_warn params; } while (0) +#else +#define DBUGH(params) do {} while (0) +#endif + +#define UNICODE_EOS 0 +#define UNICODE_DIGIT_ZERO 0x0030 +#define UNICODE_DIGIT_NINE 0x0039 +#define UNICODE_SUPERSCRIPT_TWO 0x00B2 +#define UNICODE_SUPERSCRIPT_THREE 0x00B3 +#define UNICODE_SUPERSCRIPT_ONE 0x00B9 +#define UNICODE_RTL_START 0x0590 +#define UNICODE_RTL_END 0x07BF +#define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660 +#define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669 +#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0 +#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9 +#define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C +#define UNICODE_SUPERSCRIPT_ZERO 0x2070 +#define UNICODE_SUPERSCRIPT_FOUR 0x2074 +#define UNICODE_SUPERSCRIPT_NINE 0x2079 +#define UNICODE_SUBSCRIPT_ZERO 0x2080 +#define UNICODE_SUBSCRIPT_NINE 0x2089 +#define UNICODE_CIRCLED_DIGIT_ONE 0x2460 +#define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B +#define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA +#define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10 +#define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19 + +#ifndef TRUE +#define TRUE (1) +#endif +#ifndef FALSE +#define FALSE (0) +#endif + +/* + * Enumerations... + */ + +#ifdef DEBUG_BIDI_VERBOSE +/* display support: */ +static const char char_from_types[] = +{ + ' ', /* ON */ + '>', /* L */ + '<', /* R */ + '9', /* AN */ + '1', /* EN */ + 'a', /* AL */ + '@', /* NSM */ + '.', /* CS */ + ',', /* ES */ + '$', /* ET */ + ':', /* BN */ + 'X', /* S */ + '_', /* WS */ + 'B', /* B */ + '+', /* RLO */ + '+', /* RLE */ + '+', /* LRO */ + '+', /* LRE */ + '-', /* PDF */ + '=' /* LS */ +}; +#endif + +/* + * Functions and static functions... + */ + +/* UCDN uses a different ordering than Bidi does. We cannot + * change to the UCDN ordering, as the bidi-std.c code relies + * on the exact ordering (at least that N = ON = 0). We + * therefore map between the two using this small table. It + * also takes care of fudging LRI, RLI, FSI and PDI, that this + * code does not currently support. */ +static const uint8_t ucdn_to_bidi[] = +{ + BDI_L, /* UCDN_BIDI_CLASS_L = 0 */ + BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */ + BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */ + BDI_R, /* UCDN_BIDI_CLASS_R = 3 */ + BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */ + BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */ + BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */ + BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */ + BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */ + BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */ + BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */ + BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */ + BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */ + BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */ + BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */ + BDI_B, /* UCDN_BIDI_CLASS_B = 15 */ + BDI_S, /* UCDN_BIDI_CLASS_S = 16 */ + BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */ + BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */ + BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */ + BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */ + BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */ + BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */ +}; + +#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)]) + +/* Return a direction for white-space on the second pass of the algorithm. */ +static fz_bidi_chartype class_from_ch_n(uint32_t ch) +{ + fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch); + if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS) + return BDI_N; + return from_ch_ws; +} + +/* Split fragments into single scripts (or punctuation + single script) */ +static void +split_at_script(const uint32_t *fragment, + size_t fragment_len, + int level, + void *arg, + fz_bidi_fragment_fn *callback) +{ + int script = UCDN_SCRIPT_COMMON; + size_t script_start, i; + + script_start = 0; + for (i = 0; i < fragment_len; i++) + { + int s = ucdn_get_script(fragment[i]); + if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED) + { + /* Punctuation etc. This is fine. */ + } + else if (s == script) + { + /* Same script. Still fine. */ + } + else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED) + { + /* First non punctuation thing. Set the script. */ + script = s; + } + else + { + /* Change of script. Break the fragment. */ + (*callback)(&fragment[script_start], i - script_start, level, script, arg); + script_start = i; + script = s; + } + } + if (script_start != fragment_len) + { + (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg); + } +} + +/* Determines the character classes for all following + * passes of the algorithm. A character class is basically the type of Bidi + * behaviour that the character exhibits. + */ +static void +classify_characters(const uint32_t *text, + fz_bidi_chartype *types, + size_t len, + fz_bidi_flags flags) +{ + size_t i; + + if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0) + { + for (i = 0; i < len; i++) + { + types[i] = class_from_ch_ws(text[i]); + } + } + else + { +#ifdef DEBUG_BIDI_VERBOSE + fprintf(stderr, "Text: "); + for (i = 0; i < len; i++) + { + /* So that we can actually sort of read the debug string, any + * non-ascii characters are replaced with a 1-digit hash + * value from 0-9, making non-english characters appear + * as numbers + */ + fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ? + text[i] : text[i] % 9 + '0'); + } + fprintf(stderr, "\nTypes: "); +#endif + for (i = 0; i < len; i++) + { + types[i] = class_from_ch_n(text[i]); +#ifdef DEBUG_BIDI_VERBOSE + fprintf(stderr, "%c", char_from_types[(int)types[i]]); +#endif + } +#ifdef DEBUG_BIDI_VERBOSE + fprintf(stderr, "\n"); +#endif + } +} + +/* Determines the base level of the text. + * Implements rule P2 of the Unicode Bidi Algorithm. + * Note: Ignores explicit embeddings + */ +static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) + { + switch (types[i]) + { + /* strong left */ + case BDI_L: + return FZ_BIDI_LTR; + + /* strong right */ + case BDI_R: + case BDI_AL: + return FZ_BIDI_RTL; + } + } + return FZ_BIDI_LTR; +} + +static fz_bidi_direction direction_from_type(fz_bidi_chartype type) +{ + switch (type) + { + case BDI_L: + case BDI_EN: + return FZ_BIDI_LTR; + + case BDI_R: + case BDI_AL: + return FZ_BIDI_RTL; + + default: + return FZ_BIDI_NEUTRAL; + } +} + +static void +classify_quoted_blocks(const uint32_t *text, + fz_bidi_chartype *types, + size_t len) +{ + size_t i; + int inQuote = FALSE; + int pdfNeeded = FALSE; + int ltrFound = FALSE; + int rtlFound = FALSE; + + /* Only do anything special here if there is mixed content + * (LTR *and* RTL) in the text. + */ + for (i = 0; i < len; i++) + { + switch (direction_from_type(types[i])) + { + case FZ_BIDI_LTR: + ltrFound = TRUE; + break; + + case FZ_BIDI_RTL: + rtlFound = TRUE; + break; + + default: + break; + } + } + + /* Only make any changes if *both* LTR and RTL characters exist + * in this text. + */ + if (!ltrFound || !rtlFound) + { + return; + } + + for (i = 0; i < len; i++) + { + if (text[i]=='"') + { + /* If we're already in a quote then terminate it, + * else start a new block. + */ + if (inQuote) + { + inQuote = FALSE; + if (pdfNeeded) + { + pdfNeeded = FALSE; + types[i] = BDI_PDF; + } + } + else + { + size_t j; + int done = FALSE; + + inQuote = TRUE; + + /* Find the first strong right or left type and + * use that to determine whether we should classify + * the quote as LRE or RLE. Or neither, if we + * hit another quote before any strongly-directional + * character. + */ + for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j) + { + switch(types[j]) + { + case BDI_RLE: + case BDI_LRE: + done = TRUE; + break; + + case BDI_L: + case BDI_EN: + types[i] = BDI_LRE; + pdfNeeded = TRUE; + done = TRUE; + break; + + case BDI_R: + case BDI_AL: + types[i] = BDI_RLE; + pdfNeeded = TRUE; + done = TRUE; + break; + + default: + break; + } + } + } + } + } +} + +/* Creates a buffer with an embedding level for every character in the + * given text. Also determines the base level and returns it in + * *baseDir if *baseDir does not initially contain a valid direction. + */ +static fz_bidi_level * +create_levels(fz_context *ctx, + const uint32_t *text, + size_t len, + fz_bidi_direction *baseDir, + int resolveWhiteSpace, + int flags) +{ + fz_bidi_level *levels, *plevels; + fz_bidi_chartype *types = NULL; + fz_bidi_chartype *ptypes; + fz_bidi_level baseLevel; + const uint32_t *ptext; + size_t plen, remaining; + + levels = Memento_label(fz_malloc(ctx, len * sizeof(*levels)), "bidi_levels"); + + fz_var(types); + + fz_try(ctx) + { + types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype)); + + classify_characters(text, types, len, flags); + + if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL) + { + /* Derive the base level from the text and + * update *baseDir in case the caller wants to know. + */ + baseLevel = base_level_from_text(types, len); + *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR; + } + else + { + baseLevel = (fz_bidi_level)*baseDir; + } + + { + /* Replace tab with base direction, i.e. make tab appear as + * 'strong left' if the base direction is left-to-right and + * 'strong right' if base direction is right-to-left. This + * allows Layout to implicitly treat tabs as 'segment separators'. + */ + size_t i; + + for (i = 0u; i < len; i++) + { + if (text[i]=='\t') + { + types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L; + } + } + } + + /* Look for quotation marks. Classify them as RLE or LRE + * or leave them alone, depending on what follows them. + */ + classify_quoted_blocks(text, types, len); + + /* Work one paragraph at a time. */ + plevels = levels; + ptypes = types; + ptext = text; + remaining = len; + while (remaining) + { + plen = fz_bidi_resolve_paragraphs(ptypes, remaining); + + /* Work out the levels and character types... */ + (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0); + fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen); + fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen); + fz_bidi_resolve_implicit(ptypes, plevels, plen); + + classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE); + + if (resolveWhiteSpace) + { + /* resolve whitespace */ + fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen); + } + + plevels += plen; + ptypes += plen; + ptext += plen; + remaining -= plen; + } + + /* The levels buffer now has odd and even numbers indicating + * rtl or ltr characters, respectively. + */ +#ifdef DEBUG_BIDI_VERBOSE + fprintf(stderr, "Levels: "); + { + size_t i; + for (i = 0; i < len; i++) + { + fprintf(stderr, "%d", levels[i]>9?0:levels[i]); + } + fprintf(stderr, "\n"); + } +#endif + } + fz_always(ctx) + { + fz_free(ctx, types); + } + fz_catch(ctx) + { + fz_free(ctx, levels); + fz_rethrow(ctx); + } + return levels; +} + +/* Partitions the given character sequence into one or more unidirectional + * fragments and invokes the given callback function for each fragment. + */ +void fz_bidi_fragment_text(fz_context *ctx, + const uint32_t *text, + size_t textlen, + fz_bidi_direction *baseDir, + fz_bidi_fragment_fn *callback, + void *arg, + int flags) +{ + size_t startOfFragment; + size_t i; + fz_bidi_level *levels; + + if (text == NULL || callback == NULL || textlen == 0) + return; + + DBUGH((ctx, "fz_bidi_fragment_text('%S', len = %d)\n", text, textlen)); + + levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags); + + /* We now have an array with an embedding level + * for each character in text. + */ + assert(levels != NULL); + + fz_try(ctx) + { + startOfFragment = 0; + for (i = 1; i < textlen; i++) + { + if (levels[i] != levels[i-1]) + { + /* We've gone past the end of the fragment. + * Create a text object for it, then start + * a new fragment. + */ + split_at_script(&text[startOfFragment], + i - startOfFragment, + levels[startOfFragment], + arg, + callback); + startOfFragment = i; + } + } + /* Now i == textlen. Deal with the final (or maybe only) fragment. */ + /* otherwise create 1 fragment */ + split_at_script(&text[startOfFragment], + i - startOfFragment, + levels[startOfFragment], + arg, + callback); + } + fz_always(ctx) + { + fz_free(ctx, levels); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +}
