comparison mupdf-source/source/fitz/bidi-std.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Extracted from Bidi.cpp - version 26
2
3 // Reference implementation for Unicode Bidirectional Algorithm
4
5 // Bidi include file
6 #include "mupdf/fitz.h"
7 #include "bidi-imp.h"
8
9 #include <assert.h>
10
11 #ifndef TRUE
12 #define TRUE (1)
13 #endif
14 #ifndef FALSE
15 #define FALSE (0)
16 #endif
17
18 /*------------------------------------------------------------------------
19 File: Bidi.Cpp
20
21 Description
22 -----------
23
24 Sample Implementation of the Unicode Bidirectional Algorithm as it
25 was revised by Revision 5 of the Unicode Technical Report # 9
26 (1999-8-17)
27
28 Verified for changes to the algorithm up through Unicode 5.2.0 (2009).
29
30 This implementation is organized into several passes, each implemen-
31 ting one or more of the rules of the Unicode Bidi Algorithm. The
32 resolution of Weak Types and of Neutrals each use a state table
33 approach.
34
35 Both a printf based interface and a Windows DlgProc are provided for
36 interactive testing.
37
38 A stress harness comparing this implementation (v24) to a Java based
39 implementation was used by Doug Felt to verify that the two
40 implementations produce identical results for all strings up to six
41 bidi classes and stochastic strings up to length 20.
42
43 Version 26 was verified by the author against the Unicode 5.2.0
44 file BidiTest.txt, which provides an exhaustive text of strings of
45 length 4 or less, but covers some important cases where the language
46 in UAX#9 had been clarified.
47
48 To see this code running in an actual Windows program,
49 download the free Unibook utility from http://unicode.org/unibook
50 The bidi demo is executed from the tools menu. It is build from
51 this source file.
52
53 Build Notes
54 -----------
55
56 To compile the sample implementation please set the #define
57 directives above so the correct headers get included. Not all the
58 files are needed for all purposes. For the commandline version
59 only bidi.h and bidi.cpp are needed.
60
61 The Win32 version is provided as a dialog procedure. To use as a
62 standalone program compile with the lbmain.cpp file. If all you
63 need is the ability to run the code "as is", you can instead download
64 the unibook utility from http://uincode.org/unibook/ which contains
65 the bidi demo compiled from this source file.
66
67 This code uses an extension to C++ that gives variables declared in
68 a for() statement function the same scope as the for() statement.
69 If your compiler does not support this extension, you may need to
70 move the declaration, e.g. int ich = 0; in front of the for statement.
71
72 Implementation Note
73 -------------------
74
75 NOTE: The Unicode Bidirectional Algorithm removes all explicit
76 formatting codes in rule X9, but states that this can be
77 simulated by conformant implementations. This implementation
78 attempts to demonstrate such a simulation
79
80 To demonstrate this, the current implementation does the
81 following:
82
83 in resolveExplicit()
84 - change LRE, LRO, RLE, RLO, PDF to BN
85 - assign nested levels to BN
86
87 in resolveWeak and resolveNeutrals
88 - assign L and R to BN's where they exist in place of
89 sor and eor by changing the last BN in front of a
90 level change to a strong type
91 - skip over BN's for the purpose of determining actions
92 - include BN in the count of deferred runs
93 which will resolve some of them to EN, AN and N
94
95 in resolveWhiteSpace
96 - set the level of any surviving BN to the base level,
97 or the level of the preceding character
98 - include LRE,LRO, RLE, RLO, PDF and BN in the count
99 whitespace to be reset
100
101 This will result in the same order for non-BN characters as
102 if the BN characters had been removed.
103
104 The clean() function can be used to remove boundary marks for
105 verification purposes.
106
107 Notation
108 --------
109 Pointer variables generally start with the letter p
110 Counter variables generally start with the letter c
111 Index variables generally start with the letter i
112 Boolean variables generally start with the letter f
113
114 The enumerated bidirectional types have the same name as in the
115 description for the Unicode Bidirectional Algorithm
116
117 Using this code outside a demo context
118 --------------------------------------
119
120 The way the functions are broken down in this demo code is based
121 on the needs of the demo to show the evolution in internal state
122 as the algorithm proceeds. This obscures how the algorithm would
123 be used in practice. These are the steps:
124
125 1. Allocate a pair of arrays large enough to hold bidi class
126 and calculated levels (one for each input character)
127
128 2. Provide your own function to assign directional types (bidi
129 classes) corresponding to each character in the input,
130 conflating ON, WS, S to N. Unlike the classify function in this
131 demo, the input would be actual Unicode characters.
132
133 3. Process the first paragraph by calling BidiParagraph. That
134 function changes B into BN and returns a length including the
135 paragraph separator. (The iteration over multiple paragraphs
136 is left as exercise for the reader).
137
138 4. Assign directional types again, but now assign specific types
139 to whitespace characters.
140
141 5. Instead of reordering the input in place it is often desirable
142 to calculate an array of offsets indicating the reordering.
143 To that end, allocate such an array here and use it instead
144 of the input array in the next step.
145
146 6. Resolve and reorder the lines by calling BidiLines. That
147 function 'breaks' lines on LS characters. Provide an optional
148 array of flags indicating the location of other line breaks as
149 needed.
150
151 Update History
152 --------------
153 Version 24 is the initial published and verified version of this
154 reference implementation. Version 25 and its updates fix various
155 minor issues with the scaffolding used for demonstrating the
156 algorithm using pseudo-alphabets from the command line or dialog
157 box. No changes to the implementation of the actual bidi algorithm
158 are made in any of the minor updates to version 25. Version 26
159 also makes no change to the actual algorithm but was verified
160 against the official BidiTest.txt file for Unicode 5.2.0.
161
162 - updated pseudo-alphabet
163
164 - Last Revised 12-10-99 (25)
165
166 - enable demo mode for release builds - no other changes
167
168 - Last Revised 12-10-00 (25a)
169
170 - fix regression in pseudo alphabet use for Windows UI
171
172 - Last Revised 02-01-01 (25b)
173
174 - fixed a few comments, renamed a variable
175
176 - Last Revised 03-04-01 (25c)
177
178 - make base level settable, enable mirror by default,
179 fix dialog size
180
181 - Last Revised 06-02-01 (25e)
182
183 - fixed some comments
184
185 - Last Revised 09-29-01 (25f)
186
187 - fixed classification for LS,RLM,LRM in pseudo alphabet,
188 focus issues in UI, regression fix to commandline from 25(e)
189 fix DEMO switch
190
191 - Last Revised 11-07-01 (25g)
192
193 - fixed classification for plus/minus in pseudo alphabet
194 to track changes made in Unicode 4.0.1
195
196 - Last Revised 12-03-04 (25h)
197
198 - now compiles as dialog-only program for WINDOWS_UI==1
199 using new bidimain.cpp
200
201 - Last Revised 12-02-07 (25i)
202
203 - cleaned up whitespace and indenting in the source,
204 fixed two comments (table headers)
205
206 - Last Revised 15-03-07 (25j)
207
208 - named enumerations
209
210 - Last Revised 30-05-07 (25k)
211
212 - added usage notes, minor edits to comments, indentation, etc
213 throughout. Added the bidiParagraph function. Checked against
214 changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No
215 changes needed to this implementation to match the values in
216 the BidiTest.txt file in the Unicode Character Database.
217 Minor fixes to dialog/windows proc, updated preprocessor directives.
218
219 - Last Revised 03-08-09 (26)
220
221 Credits:
222 -------
223 Written by: Asmus Freytag
224 Command line interface by: Rick McGowan
225 Verification (v24): Doug Felt
226
227 Disclaimer and legal rights:
228 ---------------------------
229 Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved.
230 Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
231
232 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
233 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
234 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
235 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
236 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
237 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
238 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
239 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE.
240
241 The file bid.rc is included in the software covered by the above.
242 ------------------------------------------------------------------------*/
243
244 /* === HELPER FUNCTIONS AND DECLARATIONS ================================= */
245
246 #define odd(x) ((x) & 1)
247
248 /*----------------------------------------------------------------------
249 The following array maps character codes to types for the purpose of
250 this sample implementation. The legend string gives a human readable
251 explanation of the pseudo alphabet.
252
253 For simplicity, characters entered by buttons are given a 1:1 mapping
254 between their type and pseudo character value. Pseudo characters that
255 can be typed from the keyboard are explained in the legend string.
256
257 Use the Unicode Character Database for the real values in real use.
258 ---------------------------------------------------------------------*/
259
260 enum
261 {
262 chLS = 0x15
263 };
264
265 #if 0
266 static const fz_bidi_chartype types_from_char[] =
267 {
268 // 0 1 2 3 4 5 6 7 8 9 a b c d e f
269 BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_S, BDI_B, BDI_S, BDI_WS, BDI_B, BDI_BN, BDI_BN, /*00-0f*/
270 BDI_LRO,BDI_LRE,BDI_PDF,BDI_RLO,BDI_RLE,BDI_WS, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_B, BDI_B, BDI_B, BDI_S, /*10-1f*/
271 BDI_WS, BDI_ON, BDI_ON, BDI_ET, BDI_ET, BDI_ET, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ES, BDI_CS, BDI_ES, BDI_CS, BDI_ES, /*20-2f*/
272 BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_AN, BDI_AN, BDI_AN, BDI_AN, BDI_CS, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, /*30-3f*/
273 BDI_ON, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, /*40-4f*/
274 BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_LRE,BDI_ON, BDI_RLE,BDI_PDF,BDI_S, /*50-5f*/
275 BDI_NSM,BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, /*60-6f*/
276 BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_LRO,BDI_B, BDI_RLO,BDI_BN, BDI_ON, /*70-7f*/
277 };
278 #endif
279
280 /***************************************
281 Reverse, human readable reference:
282
283 LRM: 0x4
284 RLM: 0x5
285 L: 0x16,a-z
286 LRE: 0x11,[
287 LRO: 0x10,{
288 R: 0x17,G-Z
289 AL: A-F
290 RLE: 0x14,]
291 RLO: 0x13,}
292 PDF: 0x12,^
293 EN: 0-5
294 ES: /,+,[hyphen]
295 ET: #,$,%
296 AN: 6-9
297 CS: [comma],.,:
298 NSM: `
299 BN: 0x0-0x8,0xe,0xf,0x18-0x1b,~
300 B: 0xa,0xd,0x1c-0x1e,|
301 S: 0x9,0xb,0x1f,_
302 WS: 0xc,0x15,[space]
303 ON: !,",&,',(,),*,;,<,=,>,?,@,\,0x7f
304 ****************************************/
305
306 // === HELPER FUNCTIONS ================================================
307
308 #ifdef BIDI_LINE_AT_A_TIME
309 // reverse cch characters
310 static
311 void reverse(uint32_t *psz, int cch)
312 {
313 uint32_t ch_temp;
314 int ich;
315
316 for (ich = 0; ich < --cch; ich++)
317 {
318 ch_temp = psz[ich];
319 psz[ich] = psz[cch];
320 psz[cch] = ch_temp;
321 }
322 }
323 #endif
324
325 // Set a run of cval values at locations all prior to, but not including
326 // iStart, to the new value nval.
327 static
328 void set_deferred_run(fz_bidi_chartype *pval, size_t cval, size_t iStart, fz_bidi_chartype nval)
329 {
330 size_t i;
331
332 for (i = iStart; i > iStart - cval; )
333 {
334 pval[--i] = nval;
335 }
336 }
337
338 static
339 void set_deferred_level_run(fz_bidi_level *pval, size_t cval, size_t iStart, fz_bidi_level nval)
340 {
341 size_t i;
342
343 for (i = iStart; i > iStart - cval; )
344 {
345 pval[--i] = nval;
346 }
347 }
348
349 // === ASSIGNING BIDI CLASSES ============================================
350
351 // === THE PARAGRAPH LEVEL ===============================================
352
353 /*------------------------------------------------------------------------
354 Function: resolve_paragraphs
355
356 Resolves the input strings into blocks over which the algorithm
357 is then applied.
358
359 Implements Rule P1 of the Unicode Bidi Algorithm
360
361 Input: Text string
362 Character count
363
364 Output: revised character count
365
366 Note: This is a very simplistic function. In effect it restricts
367 the action of the algorithm to the first paragraph in the input
368 where a paragraph ends at the end of the first block separator
369 or at the end of the input text.
370
371 ------------------------------------------------------------------------*/
372 size_t fz_bidi_resolve_paragraphs(fz_bidi_chartype *types, size_t cch)
373 {
374 size_t ich;
375
376 // skip characters not of type B
377 for(ich = 0; ich < cch && types[ich] != BDI_B; ich++)
378 ;
379 // stop after first B, make it a BN for use in the next steps
380 if (ich < cch && types[ich] == BDI_B)
381 types[ich++] = BDI_BN;
382 return ich;
383 }
384
385 #if 0
386 /*------------------------------------------------------------------------
387 Function: base_level
388
389 Determines the base level
390
391 Implements rule P2 of the Unicode Bidi Algorithm.
392
393 Input: Array of directional classes
394 Character count
395
396 Note: Ignores explicit embeddings
397 ------------------------------------------------------------------------*/
398 static int base_level(const fz_bidi_chartype *pcls, int cch)
399 {
400 int ich;
401
402 for (ich = 0; ich < cch; ich++)
403 {
404 switch (pcls[ich])
405 {
406 // strong left
407 case BDI_L:
408 return 0;
409
410 // strong right
411 case BDI_R:
412 case BDI_AL:
413 return 1;
414 }
415 }
416 return 0;
417 }
418 #endif
419
420 //====== RESOLVE EXPLICIT ================================================
421
422 static fz_bidi_level greater_even(fz_bidi_level i)
423 {
424 return odd(i) ? i + 1 : i + 2;
425 }
426
427 static fz_bidi_level greater_odd(fz_bidi_level i)
428 {
429 return odd(i) ? i + 2 : i + 1;
430 }
431
432 static fz_bidi_chartype embedding_direction(fz_bidi_chartype level)
433 {
434 return odd(level) ? BDI_R : BDI_L;
435 }
436
437 /*------------------------------------------------------------------------
438 Function: resolveExplicit
439
440 Recursively resolves explicit embedding levels and overrides.
441 Implements rules X1-X9, of the Unicode Bidirectional Algorithm.
442
443 Input: Base embedding level and direction
444 Character count
445
446 Output: Array of embedding levels
447 Caller must allocate (one level per input character)
448
449 In/Out: Array of direction classes
450
451 Note: The function uses two simple counters to keep track of
452 matching explicit codes and PDF. Use the default argument for
453 the outermost call. The nesting counter counts the recursion
454 depth and not the embedding level.
455 ------------------------------------------------------------------------*/
456 size_t fz_bidi_resolve_explicit(fz_bidi_level level, fz_bidi_chartype dir, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch,
457 fz_bidi_level n_nest)
458 {
459 size_t ich;
460
461 // always called with a valid nesting level
462 // nesting levels are != embedding levels
463 int nLastValid = n_nest;
464
465 // check input values
466 assert(n_nest >= 0 && level >= 0 && level <= BIDI_LEVEL_MAX);
467
468 // process the text
469 for (ich = 0; ich < cch; ich++)
470 {
471 fz_bidi_chartype cls = pcls[ich];
472 switch (cls)
473 {
474 case BDI_LRO:
475 case BDI_LRE:
476 n_nest++;
477 if (greater_even(level) <= BIDI_LEVEL_MAX)
478 {
479 plevel[ich] = greater_even(level);
480 pcls[ich] = BDI_BN;
481 ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L),
482 &pcls[ich+1], &plevel[ich+1],
483 cch - (ich+1), n_nest);
484 n_nest--;
485 continue;
486 }
487 cls = pcls[ich] = BDI_BN;
488 break;
489
490 case BDI_RLO:
491 case BDI_RLE:
492 n_nest++;
493 if (greater_odd(level) <= BIDI_LEVEL_MAX)
494 {
495 plevel[ich] = greater_odd(level);
496 pcls[ich] = BDI_BN;
497 ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R),
498 &pcls[ich+1], &plevel[ich+1],
499 cch - (ich+1), n_nest);
500 n_nest--;
501 continue;
502 }
503 cls = pcls[ich] = BDI_BN;
504 break;
505
506 case BDI_PDF:
507 cls = pcls[ich] = BDI_BN;
508 if (n_nest)
509 {
510 if (nLastValid < n_nest)
511 {
512 n_nest--;
513 }
514 else
515 {
516 cch = ich; // break the loop, but complete body
517 }
518 }
519 break;
520 }
521
522 // Apply the override
523 if (dir != BDI_N)
524 {
525 cls = dir;
526 }
527 plevel[ich] = level;
528 if (pcls[ich] != BDI_BN)
529 pcls[ich] = cls;
530 }
531
532 return ich;
533 }
534
535 // === RESOLVE WEAK TYPES ================================================
536
537 enum bidi_state // possible states
538 {
539 xa, // arabic letter
540 xr, // right letter
541 xl, // left letter
542
543 ao, // arabic lett. foll by ON
544 ro, // right lett. foll by ON
545 lo, // left lett. foll by ON
546
547 rt, // ET following R
548 lt, // ET following L
549
550 cn, // EN, AN following AL
551 ra, // arabic number foll R
552 re, // european number foll R
553 la, // arabic number foll L
554 le, // european number foll L
555
556 ac, // CS following cn
557 rc, // CS following ra
558 rs, // CS,ES following re
559 lc, // CS following la
560 ls, // CS,ES following le
561
562 ret, // ET following re
563 let // ET following le
564 } ;
565
566 static const unsigned char state_weak[][10] =
567 {
568 // N, L, R, AN, EN, AL,NSM, CS, ES, ET,
569 /*xa*/ { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter */
570 /*xr*/ { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter */
571 /*xl*/ { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter */
572
573 /*ao*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* arabic lett. foll by ON*/
574 /*ro*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* right lett. foll by ON */
575 /*lo*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* left lett. foll by ON */
576
577 /*rt*/ { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R */
578 /*lt*/ { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L */
579
580 /*cn*/ { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL */
581 /*ra*/ { ro, xl, xr, ra, re, xa, ra, rc, ro, rt }, /* arabic number foll R */
582 /*re*/ { ro, xl, xr, ra, re, xa, re, rs, rs,ret }, /* european number foll R */
583 /*la*/ { lo, xl, xr, la, le, xa, la, lc, lo, lt }, /* arabic number foll L */
584 /*le*/ { lo, xl, xr, la, le, xa, le, ls, ls,let }, /* european number foll L */
585
586 /*ac*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn */
587 /*rc*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra */
588 /*rs*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re */
589 /*lc*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la */
590 /*ls*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le */
591
592 /*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re */
593 /*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let } /* ET following le */
594
595 };
596
597 enum bidi_action // possible actions
598 {
599 // primitives
600 IX = 0x100, // increment
601 XX = 0xF, // no-op
602
603 // actions
604 xxx = (XX << 4) + XX, // no-op
605 xIx = IX + xxx, // increment run
606 xxN = (XX << 4) + BDI_ON, // set current to N
607 xxE = (XX << 4) + BDI_EN, // set current to EN
608 xxA = (XX << 4) + BDI_AN, // set current to AN
609 xxR = (XX << 4) + BDI_R, // set current to R
610 xxL = (XX << 4) + BDI_L, // set current to L
611 Nxx = (BDI_ON << 4) + 0xF, // set run to neutral
612 Axx = (BDI_AN << 4) + 0xF, // set run to AN
613 ExE = (BDI_EN << 4) + BDI_EN, // set run to EN, set current to EN
614 NIx = (BDI_ON << 4) + 0xF + IX, // set run to N, increment
615 NxN = (BDI_ON << 4) + BDI_ON, // set run to N, set current to N
616 NxR = (BDI_ON << 4) + BDI_R, // set run to N, set current to R
617 NxE = (BDI_ON << 4) + BDI_EN, // set run to N, set current to EN
618
619 AxA = (BDI_AN << 4) + BDI_AN, // set run to AN, set current to AN
620 NxL = (BDI_ON << 4) + BDI_L, // set run to N, set current to L
621 LxL = (BDI_L << 4) + BDI_L // set run to L, set current to L
622 };
623
624 typedef uint16_t fz_bidi_action;
625
626 static const fz_bidi_action action_weak[][10] =
627 {
628 // N,.. L, R, AN, EN, AL, NSM, CS,..ES, ET,
629 /*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter */
630 /*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right letter */
631 /*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter */
632
633 /*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON */
634 /*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON */
635 /*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON */
636
637 /*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R */
638 /*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L */
639
640 /*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following AL */
641 /*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R */
642 /*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R */
643 /*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L */
644 /*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L */
645
646 /*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn */
647 /*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra */
648 /*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re */
649 /*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la */
650 /*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le */
651
652 /*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re */
653 /*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL } /* ET following le */
654 };
655
656 static
657 fz_bidi_chartype get_deferred_type(fz_bidi_action action)
658 {
659 return (action >> 4) & 0xF;
660 }
661
662 static
663 fz_bidi_chartype get_resolved_type(fz_bidi_action action)
664 {
665 return action & 0xF;
666 }
667
668 /* Note on action table:
669
670 States can be of two kinds:
671 - Immediate Resolution State, where each input token
672 is resolved as soon as it is seen. These states have
673 only single action codes (xxN) or the no-op (xxx)
674 for static input tokens.
675 - Deferred Resolution State, where input tokens either
676 either extend the run (xIx) or resolve its Type (e.g. Nxx).
677
678 Input classes are of three kinds
679 - Static Input Token, where the class of the token remains
680 unchanged on output (AN, L, N, R)
681 - Replaced Input Token, where the class of the token is
682 always replaced on output (AL, BDI_BN, NSM, CS, ES, ET)
683 - Conditional Input Token, where the class of the token is
684 changed on output in some but not all cases (EN)
685
686 Where tokens are subject to change, a double action
687 (e.g. NxA, or NxN) is _required_ after deferred states,
688 resolving both the deferred state and changing the current token.
689
690 These properties of the table are verified by assertions below.
691 This code is needed only during debugging and maintenance
692 */
693
694 /*------------------------------------------------------------------------
695 Function: resolveWeak
696
697 Resolves the directionality of numeric and other weak character types
698
699 Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm.
700
701 Input: Array of embedding levels
702 Character count
703
704 In/Out: Array of directional classes
705
706 Note: On input only these directional classes are expected
707 AL, HL, R, L, ON, BDI_BN, NSM, AN, EN, ES, ET, CS,
708 ------------------------------------------------------------------------*/
709 void fz_bidi_resolve_weak(fz_context *ctx, fz_bidi_level baselevel, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
710 {
711 int state = odd(baselevel) ? xr : xl;
712 fz_bidi_chartype cls;
713 size_t ich;
714 fz_bidi_action action;
715 fz_bidi_chartype cls_run;
716 fz_bidi_chartype cls_new;
717
718 fz_bidi_level level = baselevel;
719
720 size_t cch_run = 0;
721
722 for (ich = 0; ich < cch; ich++)
723 {
724 if (pcls[ich] > BDI_BN) {
725 fz_warn(ctx, "error: pcls[%zu] > BN (%d)\n", ich, pcls[ich]);
726 }
727
728 // ignore boundary neutrals
729 if (pcls[ich] == BDI_BN)
730 {
731 // must flatten levels unless at a level change;
732 plevel[ich] = level;
733
734 // lookahead for level changes
735 if (ich + 1 == cch && level != baselevel)
736 {
737 // have to fixup last BN before end of the loop, since
738 // its fix-upped value will be needed below the assert
739 pcls[ich] = embedding_direction(level);
740 }
741 else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN)
742 {
743 // fixup LAST BN in front / after a level run to make
744 // it act like the SOR/EOR in rule X10
745 int newlevel = plevel[ich+1];
746 if (level > newlevel) {
747 newlevel = level;
748 }
749 plevel[ich] = newlevel;
750
751 // must match assigned level
752 pcls[ich] = embedding_direction(newlevel);
753 level = plevel[ich+1];
754 }
755 else
756 {
757 // don't interrupt runs
758 if (cch_run)
759 {
760 cch_run++;
761 }
762 continue;
763 }
764 }
765
766 assert(pcls[ich] <= BDI_BN);
767 cls = pcls[ich];
768
769 action = action_weak[state][cls];
770
771 // resolve the directionality for deferred runs
772 cls_run = get_deferred_type(action);
773 if (cls_run != XX)
774 {
775 set_deferred_run(pcls, cch_run, ich, cls_run);
776 cch_run = 0;
777 }
778
779 // resolve the directionality class at the current location
780 cls_new = get_resolved_type(action);
781 if (cls_new != XX)
782 pcls[ich] = cls_new;
783
784 // increment a deferred run
785 if (IX & action)
786 cch_run++;
787
788 state = state_weak[state][cls];
789 }
790
791 // resolve any deferred runs
792 // use the direction of the current level to emulate PDF
793 cls = embedding_direction(level);
794
795 // resolve the directionality for deferred runs
796 cls_run = get_deferred_type(action_weak[state][cls]);
797 if (cls_run != XX)
798 set_deferred_run(pcls, cch_run, ich, cls_run);
799 }
800
801 // === RESOLVE NEUTRAL TYPES ==============================================
802
803 // action values
804 enum neutral_action
805 {
806 // action to resolve previous input
807 nL = BDI_L, // resolve EN to L
808 En = 3 << 4, // resolve neutrals run to embedding level direction
809 Rn = BDI_R << 4, // resolve neutrals run to strong right
810 Ln = BDI_L << 4, // resolved neutrals run to strong left
811 In = (1<<8), // increment count of deferred neutrals
812 LnL = (1<<4)+BDI_L // set run and EN to L
813 };
814
815 static fz_bidi_chartype
816 get_deferred_neutrals(fz_bidi_action action, fz_bidi_level level)
817 {
818 action = (action >> 4) & 0xF;
819 if (action == (En >> 4))
820 return embedding_direction(level);
821 else
822 return action;
823 }
824
825 static fz_bidi_chartype get_resolved_neutrals(fz_bidi_action action)
826 {
827 action = action & 0xF;
828
829 /* RJW: The original code does:
830 * if (action == In)
831 * return 0;
832 * else
833 * return action;
834 * BUT, this can never be true, as In == 256.
835 * This upsets coverity. We fix this here, but include this note
836 * so that we understand what changed in case we ever update to
837 * a newer release of the bidirectional code.
838 */
839 return action;
840 }
841
842 // state values
843 enum neutral_state
844 {
845 // new temporary class
846 r, // R and characters resolved to R
847 l, // L and characters resolved to L
848 rn, // N preceded by right
849 ln, // N preceded by left
850 a, // AN preceded by left (the abbrev 'la' is used up above)
851 na // N preceded by a
852 } ;
853
854 /*------------------------------------------------------------------------
855 Notes:
856
857 By rule W7, whenever a EN is 'dominated' by an L (including start of
858 run with embedding direction = L) it is resolved to, and further treated
859 as L.
860
861 This leads to the need for 'a' and 'na' states.
862 ------------------------------------------------------------------------*/
863
864 static const int action_neutrals[][5] =
865 {
866 // N, L, R, AN, EN, = cls
867 // state =
868 {In, 0, 0, 0, 0}, // r right
869 {In, 0, 0, 0, BDI_L}, // l left
870
871 {In, En, Rn, Rn, Rn}, // rn N preceded by right
872 {In, Ln, En, En, LnL}, // ln N preceded by left
873
874 {In, 0, 0, 0, BDI_L}, // a AN preceded by left
875 {In, En, Rn, Rn, En} // na N preceded by a
876 } ;
877
878 static const int state_neutrals[][5] =
879 {
880 // N, L, R, AN, EN = cls
881 // state =
882 {rn, l, r, r, r}, // r right
883 {ln, l, r, a, l}, // l left
884
885 {rn, l, r, r, r}, // rn N preceded by right
886 {ln, l, r, a, l}, // ln N preceded by left
887
888 {na, l, r, a, l}, // a AN preceded by left
889 {na, l, r, a, l} // na N preceded by la
890 } ;
891
892 /*------------------------------------------------------------------------
893 Function: resolveNeutrals
894
895 Resolves the directionality of neutral character types.
896
897 Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm.
898
899 Input: Array of embedding levels
900 Character count
901 Baselevel
902
903 In/Out: Array of directional classes
904
905 Note: On input only these directional classes are expected
906 R, L, N, AN, EN and BN
907
908 W8 resolves a number of ENs to L
909 ------------------------------------------------------------------------*/
910 void fz_bidi_resolve_neutrals(fz_bidi_level baselevel, fz_bidi_chartype *pcls, const fz_bidi_level *plevel, size_t cch)
911 {
912 // the state at the start of text depends on the base level
913 int state = odd(baselevel) ? r : l;
914 fz_bidi_chartype cls;
915 size_t ich;
916 fz_bidi_chartype cls_run;
917
918 size_t cch_run = 0;
919 fz_bidi_level level = baselevel;
920
921 for (ich = 0; ich < cch; ich++)
922 {
923 int action;
924 fz_bidi_chartype cls_new;
925
926 // ignore boundary neutrals
927 if (pcls[ich] == BDI_BN)
928 {
929 // include in the count for a deferred run
930 if (cch_run)
931 cch_run++;
932
933 // skip any further processing
934 continue;
935 }
936
937 assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed"
938 cls = pcls[ich];
939
940 action = action_neutrals[state][cls];
941
942 // resolve the directionality for deferred runs
943 cls_run = get_deferred_neutrals(action, level);
944 if (cls_run != BDI_N)
945 {
946 set_deferred_run(pcls, cch_run, ich, cls_run);
947 cch_run = 0;
948 }
949
950 // resolve the directionality class at the current location
951 cls_new = get_resolved_neutrals(action);
952 if (cls_new != BDI_N)
953 pcls[ich] = cls_new;
954
955 if (In & action)
956 cch_run++;
957
958 state = state_neutrals[state][cls];
959 level = plevel[ich];
960 }
961
962 // resolve any deferred runs
963 cls = embedding_direction(level); // eor has type of current level
964
965 // resolve the directionality for deferred runs
966 cls_run = get_deferred_neutrals(action_neutrals[state][cls], level);
967 if (cls_run != BDI_N)
968 set_deferred_run(pcls, cch_run, ich, cls_run);
969 }
970
971 // === RESOLVE IMPLICITLY =================================================
972
973 /*------------------------------------------------------------------------
974 Function: resolveImplicit
975
976 Recursively resolves implicit embedding levels.
977 Implements rules I1 and I2 of the Unicode Bidirectional Algorithm.
978
979 Input: Array of direction classes
980 Character count
981 Base level
982
983 In/Out: Array of embedding levels
984
985 Note: levels may exceed 15 on output.
986 Accepted subset of direction classes
987 R, L, AN, EN
988 ------------------------------------------------------------------------*/
989 static const fz_bidi_level add_level[][4] =
990 {
991 // L, R, AN, EN = cls
992 // level =
993 /* even */ { 0, 1, 2, 2 }, // EVEN
994 /* odd */ { 1, 0, 1, 1 } // ODD
995
996 };
997
998 void fz_bidi_resolve_implicit(const fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
999 {
1000 size_t ich;
1001
1002 for (ich = 0; ich < cch; ich++)
1003 {
1004 // cannot resolve bn here, since some bn were resolved to strong
1005 // types in resolveWeak. To remove these we need the original
1006 // types, which are available again in resolveWhiteSpace
1007 if (pcls[ich] == BDI_BN)
1008 {
1009 continue;
1010 }
1011 assert(pcls[ich] > 0); // "No Neutrals allowed to survive here."
1012 assert(pcls[ich] < 5); // "Out of range."
1013 plevel[ich] += add_level[odd(plevel[ich])][pcls[ich] - 1];
1014 }
1015 }
1016
1017 #if 0
1018 // === REORDER ===========================================================
1019 /*------------------------------------------------------------------------
1020 Function: resolve_lines
1021
1022 Breaks a paragraph into lines
1023
1024 Input: Character count
1025 Array of line break flags
1026 In/Out: Array of characters
1027
1028 Returns the count of characters on the first line
1029
1030 Note: This function only breaks lines at hard line breaks. Other
1031 line breaks can be passed in. If pbrk[n] is true, then a break
1032 occurs after the character in psz_input[n]. Breaks before the first
1033 character are not allowed.
1034 ------------------------------------------------------------------------*/
1035 static int resolve_lines(uint32_t *psz_input, int *pbrk, int cch)
1036 {
1037 int ich;
1038
1039 // skip characters not of type LS
1040 for(ich = 0; ich < cch; ich++)
1041 {
1042 if (psz_input[ich] == chLS || (pbrk && pbrk[ich]))
1043 {
1044 ich++;
1045 break;
1046 }
1047 }
1048
1049 return ich;
1050 }
1051 #endif
1052
1053 /*------------------------------------------------------------------------
1054 Function: fz_bidi_resolve_whitespace
1055
1056 Resolves levels for WS and S
1057 Implements rule L1 of the Unicode bidi Algorithm.
1058
1059 Input: Base embedding level
1060 Character count
1061 Array of direction classes (for one line of text)
1062
1063 In/Out: Array of embedding levels (for one line of text)
1064
1065 Note: this should be applied a line at a time. The default driver
1066 code supplied in this file assumes a single line of text; for
1067 a real implementation, cch and the initial pointer values
1068 would have to be adjusted.
1069 ------------------------------------------------------------------------*/
1070 void fz_bidi_resolve_whitespace(fz_bidi_level baselevel, const fz_bidi_chartype *pcls, fz_bidi_level *plevel,
1071 size_t cch)
1072 {
1073 size_t cchrun = 0;
1074 fz_bidi_level oldlevel = baselevel;
1075 size_t ich;
1076
1077 for (ich = 0; ich < cch; ich++)
1078 {
1079 switch(pcls[ich])
1080 {
1081 default:
1082 cchrun = 0; // any other character breaks the run
1083 break;
1084 case BDI_WS:
1085 cchrun++;
1086 break;
1087
1088 case BDI_RLE:
1089 case BDI_LRE:
1090 case BDI_LRO:
1091 case BDI_RLO:
1092 case BDI_PDF:
1093 case BDI_BN:
1094 plevel[ich] = oldlevel;
1095 cchrun++;
1096 break;
1097
1098 case BDI_S:
1099 case BDI_B:
1100 // reset levels for WS before eot
1101 set_deferred_level_run(plevel, cchrun, ich, baselevel);
1102 cchrun = 0;
1103 plevel[ich] = baselevel;
1104 break;
1105 }
1106 oldlevel = plevel[ich];
1107 }
1108 // reset level before eot
1109 set_deferred_level_run(plevel, cchrun, ich, baselevel);
1110 }
1111
1112 #ifdef BIDI_LINE_AT_A_TIME
1113 /*------------------------------------------------------------------------
1114 Functions: reorder/reorderLevel
1115
1116 Recursively reorders the display string
1117 "From the highest level down, reverse all characters at that level and
1118 higher, down to the lowest odd level"
1119
1120 Implements rule L2 of the Unicode bidi Algorithm.
1121
1122 Input: Array of embedding levels
1123 Character count
1124 Flag enabling reversal (set to false by initial caller)
1125
1126 In/Out: Text to reorder
1127
1128 Note: levels may exceed 15 resp. 61 on input.
1129
1130 Rule L3 - reorder combining marks is not implemented here
1131 Rule L4 - glyph mirroring is implemented as a display option below
1132
1133 Note: this should be applied a line at a time
1134 -------------------------------------------------------------------------*/
1135 static int reorderLevel(fz_bidi_level level, uint32_t *psz_text, const fz_bidi_level *plevel, int cch,
1136 int f_reverse)
1137 {
1138 int ich;
1139
1140 // true as soon as first odd level encountered
1141 f_reverse = f_reverse || odd(level);
1142
1143 for (ich = 0; ich < cch; ich++)
1144 {
1145 if (plevel[ich] < level)
1146 {
1147 break;
1148 }
1149 else if (plevel[ich] > level)
1150 {
1151 ich += reorderLevel(level + 1, psz_text + ich, plevel + ich,
1152 cch - ich, f_reverse) - 1;
1153 }
1154 }
1155 if (f_reverse)
1156 {
1157 reverse(psz_text, ich);
1158 }
1159 return ich;
1160 }
1161
1162 int Bidi_reorder(fz_bidi_level baselevel, uint32_t *psz_text, const fz_bidi_level *plevel, int cch)
1163 {
1164 int ich = 0;
1165
1166 while (ich < cch)
1167 {
1168 ich += reorderLevel(baselevel, psz_text + ich, plevel + ich,
1169 cch - ich, FALSE);
1170 }
1171 return ich;
1172 }
1173 #endif