Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/mujs/utf.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/mujs/utf.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,285 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE + * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#include <stdlib.h> +#include <string.h> + +#include "utf.h" +#include "utfdata.h" + +#define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) + +typedef unsigned char uchar; + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror +}; + +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + int l; + + /* overlong null character */ + if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) { + *rune = 0; + 
return 2; + } + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetochar(char *str, const Rune *rune) +{ + int c = *rune; + + /* overlong null character */ + if (c == 0) { + str[0] = (char)0xc0; + str[1] = (char)0x80; + return 2; + } + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 00080-007FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * three character sequence + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(int c) +{ + Rune rune; + 
char str[10]; + + rune = c; + return runetochar(str, &rune); +} + +static const Rune * +ucd_bsearch(Rune c, const Rune *t, int n, int ne) +{ + const Rune *p; + int m; + + while(n > 1) { + m = n/2; + p = t + m*ne; + if(c >= p[0]) { + t = p; + n = n-m; + } else + n = m; + } + if(n && c >= t[0]) + return t; + return 0; +} + +Rune +tolowerrune(Rune c) +{ + const Rune *p; + + p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3); + if(p && c >= p[0] && c <= p[1]) + return c + p[2]; + p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2); + if(p && c == p[0]) + return c + p[1]; + return c; +} + +Rune +toupperrune(Rune c) +{ + const Rune *p; + + p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3); + if(p && c >= p[0] && c <= p[1]) + return c + p[2]; + p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2); + if(p && c == p[0]) + return c + p[1]; + return c; +} + +int +islowerrune(Rune c) +{ + const Rune *p; + + p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3); + if(p && c >= p[0] && c <= p[1]) + return 1; + p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2); + if(p && c == p[0]) + return 1; + return 0; +} + +int +isupperrune(Rune c) +{ + const Rune *p; + + p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3); + if(p && c >= p[0] && c <= p[1]) + return 1; + p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2); + if(p && c == p[0]) + return 1; + return 0; +} + +int +isalpharune(Rune c) +{ + const Rune *p; + + p = ucd_bsearch(c, ucd_alpha2, nelem(ucd_alpha2)/2, 2); + if(p && c >= p[0] && c <= p[1]) + return 1; + p = ucd_bsearch(c, ucd_alpha1, nelem(ucd_alpha1), 1); + if(p && c == p[0]) + return 1; + return 0; +}
