Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/api/pdfrenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: pdfrenderer.cpp | |
| 3 // Description: PDF rendering interface to inject into TessBaseAPI | |
| 4 // | |
| 5 // (C) Copyright 2011, Google Inc. | |
| 6 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 // you may not use this file except in compliance with the License. | |
| 8 // You may obtain a copy of the License at | |
| 9 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 // | |
| 16 /////////////////////////////////////////////////////////////////////// | |
| 17 | |
| 18 // Include automatically generated configuration file if running autoconf. | |
| 19 #ifdef HAVE_CONFIG_H | |
| 20 # include "config_auto.h" | |
| 21 #endif | |
| 22 | |
| 23 #include "pdf_ttf.h" | |
| 24 #include "tprintf.h" | |
| 25 #include "helpers.h" // for Swap, copy_string | |
| 26 | |
| 27 #include <allheaders.h> | |
| 28 #include <tesseract/baseapi.h> | |
| 29 #include <tesseract/publictypes.h> // for PTIsTextType() | |
| 30 #include <tesseract/renderer.h> | |
| 31 #include <cmath> | |
| 32 #include <cstring> | |
| 33 #include <fstream> // for std::ifstream | |
| 34 #include <locale> // for std::locale::classic | |
| 35 #include <memory> // std::unique_ptr | |
| 36 #include <sstream> // for std::stringstream | |
| 37 #include <string_view> | |
| 38 | |
| 39 using namespace std::literals; | |
| 40 | |
// Build-mode switches for the PDF writer.
// Builds without NDEBUG enable DEBUG_PDF, which makes the renderer emit
// extra line breaks in the page content stream. DEBUG_PDF in turn enables
// NO_PDF_COMPRESSION, which writes the CIDToGIDMap stream uncompressed.
#if !defined(NDEBUG)
#  define DEBUG_PDF
#endif
#if defined(DEBUG_PDF)
#  define NO_PDF_COMPRESSION
#endif
| 47 | |
| 48 /* | |
| 49 | |
| 50 Design notes from Ken Sharp, with light editing. | |
| 51 | |
| 52 We think one solution is a font with a single glyph (.notdef) and a | |
| 53 CIDToGIDMap which maps all the CIDs to 0. That map would then be | |
| 54 stored as a stream in the PDF file, and when Flate compressed should | |
| 55 be pretty small. The font, of course, will be approximately the same | |
| 56 size as the one you currently use. | |
| 57 | |
| 58 I'm working on such a font now, the CIDToGIDMap is trivial, you just | |
| 59 create a stream object which contains 128k bytes (2 bytes per possible | |
| 60 CID and your CIDs range from 0 to 65535) and where you currently have | |
| 61 "/CIDToGIDMap /Identity" you would have "/CIDToGIDMap <object> 0 R". | |
| 62 | |
| 63 Note that if, in future, you were to use a different (ie not 2 byte) | |
| 64 CMap for character codes you could trivially extend the CIDToGIDMap. | |
| 65 | |
| 66 The following is an explanation of how some of the font stuff works, | |
| 67 this may be too simple for you in which case please accept my | |
| 68 apologies, it's hard to know how much knowledge someone has. You can | |
| 69 skip all this anyway, it's just for information. | |
| 70 | |
| 71 The font embedded in a PDF file is usually intended just to be | |
| 72 rendered, but extensions allow for at least some ability to locate (or | |
| 73 copy) text from a document. This isn't something which was an original | |
| 74 goal of the PDF format, but its been retro-fitted, presumably due to | |
| 75 popular demand. | |
| 76 | |
| 77 To do this reliably the PDF file must contain a ToUnicode CMap, a | |
| 78 device for mapping character codes to Unicode code points. If one of | |
| 79 these is present, then this will be used to convert the character | |
| 80 codes into Unicode values. If its not present then the reader will | |
| 81 fall back through a series of heuristics to try and guess the | |
| 82 result. This is, as you would expect, prone to failure. | |
| 83 | |
| 84 This doesn't concern you of course, since you always write a ToUnicode | |
| 85 CMap, so because you are writing the text in text rendering mode 3 it | |
| 86 would seem that you don't really need to worry about this, but in the | |
| 87 PDF spec you cannot have an isolated ToUnicode CMap, it has to be | |
| 88 attached to a font, so in order to get even copy/paste to work you | |
| 89 need to define a font. | |
| 90 | |
| 91 This is what leads to problems, tools like pdfwrite assume that they | |
| 92 are going to be able to (or even have to) modify the font entries, so | |
| 93 they require that the font being embedded be valid, and to be honest | |
| 94 the font Tesseract embeds isn't valid (for this purpose). | |
| 95 | |
| 96 | |
| 97 To see why, let's look at how text is specified in a PDF file: | |
| 98 | |
| 99 (Test) Tj | |
| 100 | |
| 101 Now that looks like text but actually it isn't. Each of those bytes is | |
| 102 a 'character code'. When it comes to rendering the text a complex | |
| 103 sequence of events takes place, which converts the character code into | |
| 104 'something' which the font understands. Its entirely possible via | |
| 105 character mappings to have that text render as 'Sftu' | |
| 106 | |
| 107 For simple fonts (PostScript type 1), we use the character code as the | |
| 108 index into an Encoding array (256 elements), each element of which is | |
| 109 a glyph name, so this gives us a glyph name. We then consult the | |
| 110 CharStrings dictionary in the font, that's a complex object which | |
| 111 contains pairs of keys and values, you can use the key to retrieve a | |
| 112 given value. So we have a glyph name, we then use that as the key to | |
| 113 the dictionary and retrieve the associated value. For a type 1 font, | |
| 114 the value is a glyph program that describes how to draw the glyph. | |
| 115 | |
| 116 For CIDFonts, its a little more complicated. Because CIDFonts can be | |
| 117 large, using a glyph name as the key is unreasonable (it would also | |
| 118 lead to unfeasibly large Encoding arrays), so instead we use a 'CID' | |
| 119 as the key. CIDs are just numbers. | |
| 120 | |
| 121 But.... We don't use the character code as the CID. What we do is use | |
| 122 a CMap to convert the character code into a CID. We then use the CID | |
| 123 to key the CharStrings dictionary and proceed as before. So the 'CMap' | |
| 124 is the equivalent of the Encoding array, but its a more compact and | |
| 125 flexible representation. | |
| 126 | |
| 127 Note that you have to use the CMap just to find out how many bytes | |
| 128 constitute a character code, and it can be variable. For example you | |
| 129 can say if the first byte is 0x00->0x7f then its just one byte, if its | |
| 130 0x80->0xf0 then its 2 bytes and if its 0xf0->0xff then its 3 bytes. I | |
| 131 have seen CMaps defining character codes up to 5 bytes wide. | |
| 132 | |
| 133 Now that's fine for 'PostScript' CIDFonts, but its not sufficient for | |
| 134 TrueType CIDFonts. The thing is that TrueType fonts are accessed using | |
| 135 a Glyph ID (GID) (and the LOCA table) which may well not be anything | |
| 136 like the CID. So for this case PDF includes a CIDToGIDMap. That maps | |
| 137 the CIDs to GIDs, and we can then use the GID to get the glyph | |
| 138 description from the GLYF table of the font. | |
| 139 | |
| 140 So for a TrueType CIDFont, character-code->CID->GID->glyf-program. | |
| 141 | |
| 142 Looking at the PDF file I was supplied with we see that it contains | |
| 143 text like : | |
| 144 | |
| 145 <0x0075> Tj | |
| 146 | |
| 147 So we start by taking the character code (117) and look it up in the | |
| 148 CMap. Well you don't supply a CMap, you just use the Identity-H one | |
| 149 which is predefined. So character code 117 maps to CID 117. Then we | |
| 150 use the CIDToGIDMap, again you don't supply one, you just use the | |
| 151 predefined 'Identity' map. So CID 117 maps to GID 117. But the font we | |
| 152 were supplied with only contains 116 glyphs. | |
| 153 | |
| 154 Now for Latin that's not a huge problem, you can just supply a bigger | |
| 155 font. But for more complex languages that *is* going to be more of a | |
| 156 problem. Either you need to supply a font which contains glyphs for | |
| 157 all the possible CID->GID mappings, or we need to think laterally. | |
| 158 | |
| 159 Our solution using a TrueType CIDFont is to intervene at the | |
| 160 CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a | |
| 161 font with just one glyph, the .notdef glyph at GID 0. This is what I'm | |
| 162 looking into now. | |
| 163 | |
| 164 It would also be possible to have a 'PostScript' (ie type 1 outlines) | |
| 165 CIDFont which contained 1 glyph, and a CMap which mapped all character | |
| 166 codes to CID 0. The effect would be the same. | |
| 167 | |
| 168 Its possible (I haven't checked) that the PostScript CIDFont and | |
| 169 associated CMap would be smaller than the TrueType font and associated | |
| 170 CIDToGIDMap. | |
| 171 | |
| 172 --- in a followup --- | |
| 173 | |
| 174 OK there is a small problem there, if I use GID 0 then Acrobat gets | |
| 175 upset about it and complains it cannot extract the font. If I set the | |
| 176 CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally | |
| 177 mad...... | |
| 178 | |
| 179 */ | |
| 180 | |
| 181 namespace tesseract { | |
| 182 | |
// Glyph width divisor: the font's default width (/DW) and the FontBBox
// width are both written as 1000 / kCharWidth, so a 10 pt font has a
// nominal character width of 5 pt.
static constexpr int kCharWidth = 2;

// Size of the scratch buffer that receives one codepoint written the PDF
// way as hex digits (e.g. "0063" for the letter 'c'). A single codepoint
// must never need more bytes than this.
static constexpr int kMaxBytesPerCodepoint = 20;
| 190 | |
| 191 /********************************************************************** | |
| 192 * PDF Renderer interface implementation | |
| 193 **********************************************************************/ | |
| 194 TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly) | |
| 195 : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) { | |
| 196 obj_ = 0; | |
| 197 textonly_ = textonly; | |
| 198 offsets_.push_back(0); | |
| 199 } | |
| 200 | |
| 201 void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) { | |
| 202 offsets_.push_back(objectsize + offsets_.back()); | |
| 203 obj_++; | |
| 204 } | |
| 205 | |
| 206 void TessPDFRenderer::AppendPDFObject(const char *data) { | |
| 207 AppendPDFObjectDIY(strlen(data)); | |
| 208 AppendString(data); | |
| 209 } | |
| 210 | |
// Rounds x to three decimal places. Used for numbers that are streamed
// into HOCR or PDF output, where scientific notation must be avoided and
// three decimals are all that is needed. A result of negative zero is
// returned as plain zero.
static double prec(double x) {
  constexpr double kScale = 1000.0;
  const double rounded = std::round(x * kScale) / kScale;
  // The comparison is true for both +0.0 and -0.0; returning a literal 0
  // drops the sign so "-0" is never printed.
  return (rounded == 0) ? 0 : rounded;
}
| 222 | |
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
// The deltas are widened to long before they are subtracted and squared,
// so the arithmetic is not carried out in int (which overflows once a
// delta exceeds roughly 46340 when int is 32 bits wide).
static long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
| 226 | |
| 227 // Viewers like evince can get really confused during copy-paste when | |
| 228 // the baseline wanders around. So I've decided to project every word | |
| 229 // onto the (straight) line baseline. All numbers are in the native | |
| 230 // PDF coordinate system, which has the origin in the bottom left and | |
| 231 // the unit is points, which is 1/72 inch. Tesseract reports baselines | |
| 232 // left-to-right no matter what the reading order is. We need the | |
| 233 // word baseline in reading order, so we do that conversion here. Returns | |
| 234 // the word's baseline origin and length. | |
| 235 static void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1, | |
| 236 int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, | |
| 237 int line_y2, double *x0, double *y0, double *length) { | |
| 238 if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { | |
| 239 std::swap(word_x1, word_x2); | |
| 240 std::swap(word_y1, word_y2); | |
| 241 } | |
| 242 double word_length; | |
| 243 double x, y; | |
| 244 { | |
| 245 double l2 = dist2(line_x1, line_y1, line_x2, line_y2); | |
| 246 if (l2 == 0) { | |
| 247 x = line_x1; | |
| 248 y = line_y1; | |
| 249 } else { | |
| 250 int px = word_x1; | |
| 251 int py = word_y1; | |
| 252 double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2; | |
| 253 x = line_x2 + t * (line_x2 - line_x1); | |
| 254 y = line_y2 + t * (line_y2 - line_y1); | |
| 255 } | |
| 256 word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2))); | |
| 257 word_length = word_length * 72.0 / ppi; | |
| 258 x = x * 72 / ppi; | |
| 259 y = height - (y * 72.0 / ppi); | |
| 260 } | |
| 261 *x0 = x; | |
| 262 *y0 = y; | |
| 263 *length = word_length; | |
| 264 } | |
| 265 | |
| 266 // Compute coefficients for an affine matrix describing the rotation | |
| 267 // of the text. If the text is right-to-left such as Arabic or Hebrew, | |
| 268 // we reflect over the Y-axis. This matrix will set the coordinate | |
| 269 // system for placing text in the PDF file. | |
| 270 // | |
| 271 // RTL | |
| 272 // [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ] | |
| 273 // [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ] | |
| 274 static void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, | |
| 275 double *a, double *b, double *c, double *d) { | |
| 276 double theta = | |
| 277 atan2(static_cast<double>(line_y1 - line_y2), static_cast<double>(line_x2 - line_x1)); | |
| 278 *a = cos(theta); | |
| 279 *b = sin(theta); | |
| 280 *c = -sin(theta); | |
| 281 *d = cos(theta); | |
| 282 switch (writing_direction) { | |
| 283 case WRITING_DIRECTION_RIGHT_TO_LEFT: | |
| 284 *a = -*a; | |
| 285 *b = -*b; | |
| 286 break; | |
| 287 case WRITING_DIRECTION_TOP_TO_BOTTOM: | |
| 288 // TODO(jbreiden) Consider using the vertical PDF writing mode. | |
| 289 break; | |
| 290 default: | |
| 291 break; | |
| 292 } | |
| 293 } | |
| 294 | |
// Some PDF viewers (for example 'Preview' on the Mac) select and highlight
// text better on a perfectly flat baseline than on a very slightly tilted
// one. The baseline (x1, y1)-(x2, y2) is copied to the output points; when
// its rise is under 2 points while its run exceeds 2 points (at the given
// ppi), both y values are replaced by their midpoint. The threshold is
// large enough to absorb noise yet small enough that lines probably won't
// cross each other on a page tilted at almost exactly the threshold.
static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,
                         int *line_x2, int *line_y2) {
  const int rise = std::abs(y2 - y1) * 72;
  const int run = std::abs(x2 - x1) * 72;
  const int limit = 2 * ppi;
  const bool nearly_flat = (rise < limit) && (limit < run);

  *line_x1 = x1;
  *line_x2 = x2;
  if (nearly_flat) {
    const int mid_y = (y1 + y2) / 2;
    *line_y1 = mid_y;
    *line_y2 = mid_y;
  } else {
    *line_y1 = y1;
    *line_y2 = y2;
  }
}
| 314 | |
| 315 static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) { | |
| 316 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) { | |
| 317 tprintf("Dropping invalid codepoint %d\n", code); | |
| 318 return false; | |
| 319 } | |
| 320 if (code < 0x10000) { | |
| 321 snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code); | |
| 322 } else { | |
| 323 int a = code - 0x010000; | |
| 324 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800; | |
| 325 int low_surrogate = (0x03FF & a) + 0xDC00; | |
| 326 snprintf(utf16, kMaxBytesPerCodepoint, "%04X%04X", high_surrogate, low_surrogate); | |
| 327 } | |
| 328 return true; | |
| 329 } | |
| 330 | |
| 331 char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) { | |
| 332 double ppi = api->GetSourceYResolution(); | |
| 333 | |
| 334 // These initial conditions are all arbitrary and will be overwritten | |
| 335 double old_x = 0.0, old_y = 0.0; | |
| 336 int old_fontsize = 0; | |
| 337 tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; | |
| 338 bool new_block = true; | |
| 339 int fontsize = 0; | |
| 340 double a = 1; | |
| 341 double b = 0; | |
| 342 double c = 0; | |
| 343 double d = 1; | |
| 344 | |
| 345 std::stringstream pdf_str; | |
| 346 // Use "C" locale (needed for double values prec()). | |
| 347 pdf_str.imbue(std::locale::classic()); | |
| 348 // Use 8 digits for double values. | |
| 349 pdf_str.precision(8); | |
| 350 | |
| 351 // TODO(jbreiden) This marries the text and image together. | |
| 352 // Slightly cleaner from an abstraction standpoint if this were to | |
| 353 // live inside a separate text object. | |
| 354 pdf_str << "q " << prec(width) << " 0 0 " << prec(height) << " 0 0 cm"; | |
| 355 if (!textonly_) { | |
| 356 pdf_str << " /Im1 Do"; | |
| 357 } | |
| 358 pdf_str << " Q\n"; | |
| 359 | |
| 360 int line_x1 = 0; | |
| 361 int line_y1 = 0; | |
| 362 int line_x2 = 0; | |
| 363 int line_y2 = 0; | |
| 364 | |
| 365 const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator()); | |
| 366 while (!res_it->Empty(RIL_BLOCK)) { | |
| 367 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { | |
| 368 auto block_type = res_it->BlockType(); | |
| 369 if (!PTIsTextType(block_type)) { | |
| 370 // ignore non-text blocks | |
| 371 res_it->Next(RIL_BLOCK); | |
| 372 continue; | |
| 373 } | |
| 374 pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink | |
| 375 old_fontsize = 0; // Every block will declare its fontsize | |
| 376 new_block = true; // Every block will declare its affine matrix | |
| 377 } | |
| 378 | |
| 379 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { | |
| 380 int x1, y1, x2, y2; | |
| 381 res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); | |
| 382 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2); | |
| 383 } | |
| 384 | |
| 385 if (res_it->Empty(RIL_WORD)) { | |
| 386 res_it->Next(RIL_WORD); | |
| 387 continue; | |
| 388 } | |
| 389 | |
| 390 // Writing direction changes at a per-word granularity | |
| 391 tesseract::WritingDirection writing_direction; | |
| 392 { | |
| 393 tesseract::Orientation orientation; | |
| 394 tesseract::TextlineOrder textline_order; | |
| 395 float deskew_angle; | |
| 396 res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle); | |
| 397 if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) { | |
| 398 switch (res_it->WordDirection()) { | |
| 399 case DIR_LEFT_TO_RIGHT: | |
| 400 writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; | |
| 401 break; | |
| 402 case DIR_RIGHT_TO_LEFT: | |
| 403 writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT; | |
| 404 break; | |
| 405 default: | |
| 406 writing_direction = old_writing_direction; | |
| 407 } | |
| 408 } | |
| 409 } | |
| 410 | |
| 411 // Where is word origin and how long is it? | |
| 412 double x, y, word_length; | |
| 413 { | |
| 414 int word_x1, word_y1, word_x2, word_y2; | |
| 415 res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); | |
| 416 GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1, | |
| 417 line_y1, line_x2, line_y2, &x, &y, &word_length); | |
| 418 } | |
| 419 | |
| 420 if (writing_direction != old_writing_direction || new_block) { | |
| 421 AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d); | |
| 422 pdf_str << " " << prec(a) // . This affine matrix | |
| 423 << " " << prec(b) // . sets the coordinate | |
| 424 << " " << prec(c) // . system for all | |
| 425 << " " << prec(d) // . text that follows. | |
| 426 << " " << prec(x) // . | |
| 427 << " " << prec(y) // . | |
| 428 << (" Tm "); // Place cursor absolutely | |
| 429 new_block = false; | |
| 430 } else { | |
| 431 double dx = x - old_x; | |
| 432 double dy = y - old_y; | |
| 433 pdf_str << " " << prec(dx * a + dy * b) << " " << prec(dx * c + dy * d) | |
| 434 << (" Td "); // Relative moveto | |
| 435 } | |
| 436 old_x = x; | |
| 437 old_y = y; | |
| 438 old_writing_direction = writing_direction; | |
| 439 | |
| 440 // Adjust font size on a per word granularity. Pay attention to | |
| 441 // fontsize, old_fontsize, and pdf_str. We've found that for | |
| 442 // in Arabic, Tesseract will happily return a fontsize of zero, | |
| 443 // so we make up a default number to protect ourselves. | |
| 444 { | |
| 445 bool bold, italic, underlined, monospace, serif, smallcaps; | |
| 446 int font_id; | |
| 447 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, | |
| 448 &fontsize, &font_id); | |
| 449 const int kDefaultFontsize = 8; | |
| 450 if (fontsize <= 0) { | |
| 451 fontsize = kDefaultFontsize; | |
| 452 } | |
| 453 if (fontsize != old_fontsize) { | |
| 454 pdf_str << "/f-0-0 " << fontsize << " Tf "; | |
| 455 old_fontsize = fontsize; | |
| 456 #ifdef DEBUG_PDF | |
| 457 pdf_str << "\n"; | |
| 458 #endif | |
| 459 } | |
| 460 } | |
| 461 | |
| 462 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); | |
| 463 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); | |
| 464 std::string pdf_word; | |
| 465 int pdf_word_len = 0; | |
| 466 do { | |
| 467 const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); | |
| 468 if (grapheme && grapheme[0] != '\0') { | |
| 469 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get()); | |
| 470 char utf16[kMaxBytesPerCodepoint]; | |
| 471 for (char32 code : unicodes) { | |
| 472 if (CodepointToUtf16be(code, utf16)) { | |
| 473 pdf_word += utf16; | |
| 474 pdf_word_len++; | |
| 475 } | |
| 476 } | |
| 477 } | |
| 478 res_it->Next(RIL_SYMBOL); | |
| 479 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); | |
| 480 if (res_it->IsAtBeginningOf(RIL_WORD)) { | |
| 481 pdf_word += "0020"; | |
| 482 } | |
| 483 if (word_length > 0 && pdf_word_len > 0) { | |
| 484 double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len)); | |
| 485 pdf_str << h_stretch << " Tz"; // horizontal stretch | |
| 486 pdf_str | |
| 487 << " [ <" << pdf_word // UTF-16BE representation | |
| 488 << "> ] TJ"; // show the text | |
| 489 #ifdef DEBUG_PDF | |
| 490 pdf_str << "\n"; | |
| 491 #endif | |
| 492 } | |
| 493 if (last_word_in_line) { | |
| 494 pdf_str << " \n"; | |
| 495 } | |
| 496 if (last_word_in_block) { | |
| 497 pdf_str << "ET\n"; // end the text object | |
| 498 } | |
| 499 } | |
| 500 return copy_string(pdf_str.str()); | |
| 501 } | |
| 502 | |
| 503 bool TessPDFRenderer::BeginDocumentHandler() { | |
| 504 AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n"); | |
| 505 | |
| 506 // CATALOG | |
| 507 AppendPDFObject( | |
| 508 "1 0 obj\n" | |
| 509 "<<\n" | |
| 510 " /Type /Catalog\n" | |
| 511 " /Pages 2 0 R\n" | |
| 512 ">>\nendobj\n"); | |
| 513 | |
| 514 // We are reserving object #2 for the /Pages | |
| 515 // object, which I am going to create and write | |
| 516 // at the end of the PDF file. | |
| 517 AppendPDFObject(""); | |
| 518 | |
| 519 // TYPE0 FONT | |
| 520 AppendPDFObject( | |
| 521 "3 0 obj\n" | |
| 522 "<<\n" | |
| 523 " /BaseFont /GlyphLessFont\n" | |
| 524 " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font | |
| 525 " /Encoding /Identity-H\n" | |
| 526 " /Subtype /Type0\n" | |
| 527 " /ToUnicode 6 0 R\n" // ToUnicode | |
| 528 " /Type /Font\n" | |
| 529 ">>\n" | |
| 530 "endobj\n"); | |
| 531 | |
| 532 // CIDFONTTYPE2 | |
| 533 std::stringstream stream; | |
| 534 // Use "C" locale (needed for int values larger than 999). | |
| 535 stream.imbue(std::locale::classic()); | |
| 536 stream << "4 0 obj\n" | |
| 537 "<<\n" | |
| 538 " /BaseFont /GlyphLessFont\n" | |
| 539 " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap | |
| 540 " /CIDSystemInfo\n" | |
| 541 " <<\n" | |
| 542 " /Ordering (Identity)\n" | |
| 543 " /Registry (Adobe)\n" | |
| 544 " /Supplement 0\n" | |
| 545 " >>\n" | |
| 546 " /FontDescriptor 7 0 R\n" // Font descriptor | |
| 547 " /Subtype /CIDFontType2\n" | |
| 548 " /Type /Font\n" | |
| 549 " /DW " | |
| 550 << (1000 / kCharWidth) | |
| 551 << "\n" | |
| 552 ">>\n" | |
| 553 "endobj\n"; | |
| 554 AppendPDFObject(stream.str().c_str()); | |
| 555 | |
| 556 // CIDTOGIDMAP | |
| 557 const int kCIDToGIDMapSize = 2 * (1 << 16); | |
| 558 const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]); | |
| 559 for (int i = 0; i < kCIDToGIDMapSize; i++) { | |
| 560 cidtogidmap[i] = (i % 2) ? 1 : 0; | |
| 561 } | |
| 562 size_t len = kCIDToGIDMapSize; | |
| 563 #ifndef NO_PDF_COMPRESSION | |
| 564 auto comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len); | |
| 565 #endif | |
| 566 stream.str(""); | |
| 567 stream << "5 0 obj\n" | |
| 568 "<<\n" | |
| 569 " /Length " | |
| 570 << len | |
| 571 << "" | |
| 572 #ifndef NO_PDF_COMPRESSION | |
| 573 " /Filter /FlateDecode" | |
| 574 #endif | |
| 575 "\n" | |
| 576 ">>\n" | |
| 577 "stream\n" | |
| 578 ; | |
| 579 AppendString(stream.str().c_str()); | |
| 580 long objsize = stream.str().size(); | |
| 581 #ifndef NO_PDF_COMPRESSION | |
| 582 AppendData(reinterpret_cast<char *>(comp), len); | |
| 583 #else | |
| 584 AppendData(reinterpret_cast<char *>(cidtogidmap.get()), len); | |
| 585 #endif | |
| 586 objsize += len; | |
| 587 #ifndef NO_PDF_COMPRESSION | |
| 588 lept_free(comp); | |
| 589 #endif | |
| 590 objsize += AppendData("endstream\n"sv); | |
| 591 objsize += AppendData("endobj\n"sv); | |
| 592 AppendPDFObjectDIY(objsize); | |
| 593 | |
| 594 const char stream2[] = | |
| 595 "/CIDInit /ProcSet findresource begin\n" | |
| 596 "12 dict begin\n" | |
| 597 "begincmap\n" | |
| 598 "/CIDSystemInfo\n" | |
| 599 "<<\n" | |
| 600 " /Registry (Adobe)\n" | |
| 601 " /Ordering (UCS)\n" | |
| 602 " /Supplement 0\n" | |
| 603 ">> def\n" | |
| 604 "/CMapName /Adobe-Identify-UCS def\n" | |
| 605 "/CMapType 2 def\n" | |
| 606 "1 begincodespacerange\n" | |
| 607 "<0000> <FFFF>\n" | |
| 608 "endcodespacerange\n" | |
| 609 "1 beginbfrange\n" | |
| 610 "<0000> <FFFF> <0000>\n" | |
| 611 "endbfrange\n" | |
| 612 "endcmap\n" | |
| 613 "CMapName currentdict /CMap defineresource pop\n" | |
| 614 "end\n" | |
| 615 "end\n"; | |
| 616 | |
| 617 // TOUNICODE | |
| 618 stream.str(""); | |
| 619 stream << "6 0 obj\n" | |
| 620 "<< /Length " | |
| 621 << (sizeof(stream2) - 1) | |
| 622 << " >>\n" | |
| 623 "stream\n" | |
| 624 << stream2 | |
| 625 << "endstream\n" | |
| 626 "endobj\n"; | |
| 627 AppendPDFObject(stream.str().c_str()); | |
| 628 | |
| 629 // FONT DESCRIPTOR | |
| 630 stream.str(""); | |
| 631 stream << "7 0 obj\n" | |
| 632 "<<\n" | |
| 633 " /Ascent 1000\n" | |
| 634 " /CapHeight 1000\n" | |
| 635 " /Descent -1\n" // Spec says must be negative | |
| 636 " /Flags 5\n" // FixedPitch + Symbolic | |
| 637 " /FontBBox [ 0 0 " | |
| 638 << (1000 / kCharWidth) | |
| 639 << " 1000 ]\n" | |
| 640 " /FontFile2 8 0 R\n" | |
| 641 " /FontName /GlyphLessFont\n" | |
| 642 " /ItalicAngle 0\n" | |
| 643 " /StemV 80\n" | |
| 644 " /Type /FontDescriptor\n" | |
| 645 ">>\n" | |
| 646 "endobj\n"; | |
| 647 AppendPDFObject(stream.str().c_str()); | |
| 648 | |
| 649 stream.str(""); | |
| 650 stream << datadir_.c_str() << "/pdf.ttf"; | |
| 651 const uint8_t *font; | |
| 652 std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary); | |
| 653 std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {}); | |
| 654 auto size = buffer.size(); | |
| 655 if (size) { | |
| 656 font = buffer.data(); | |
| 657 } else { | |
| 658 #if !defined(NDEBUG) | |
| 659 tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str()); | |
| 660 #endif | |
| 661 font = pdf_ttf; | |
| 662 size = sizeof(pdf_ttf); | |
| 663 } | |
| 664 | |
| 665 // FONTFILE2 | |
| 666 stream.str(""); | |
| 667 stream << "8 0 obj\n" | |
| 668 "<<\n" | |
| 669 " /Length " | |
| 670 << size | |
| 671 << "\n" | |
| 672 " /Length1 " | |
| 673 << size | |
| 674 << "\n" | |
| 675 ">>\n" | |
| 676 "stream\n"; | |
| 677 AppendString(stream.str().c_str()); | |
| 678 objsize = stream.str().size(); | |
| 679 AppendData(reinterpret_cast<const char *>(font), size); | |
| 680 objsize += size; | |
| 681 objsize += AppendData("endstream\n"sv); | |
| 682 objsize += AppendData("endobj\n"sv); | |
| 683 AppendPDFObjectDIY(objsize); | |
| 684 return true; | |
| 685 } | |
| 686 | |
| 687 bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum, | |
| 688 char **pdf_object, long int *pdf_object_size, | |
| 689 const int jpg_quality) { | |
| 690 if (!pdf_object_size || !pdf_object) { | |
| 691 return false; | |
| 692 } | |
| 693 *pdf_object = nullptr; | |
| 694 *pdf_object_size = 0; | |
| 695 if (!filename && !pix) { | |
| 696 return false; | |
| 697 } | |
| 698 | |
| 699 L_Compressed_Data *cid = nullptr; | |
| 700 auto sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid); | |
| 701 | |
| 702 if (sad || !cid) { | |
| 703 l_CIDataDestroy(&cid); | |
| 704 return false; | |
| 705 } | |
| 706 | |
| 707 const char *group4 = ""; | |
| 708 const char *filter; | |
| 709 switch (cid->type) { | |
| 710 case L_FLATE_ENCODE: | |
| 711 filter = "/FlateDecode"; | |
| 712 break; | |
| 713 case L_JPEG_ENCODE: | |
| 714 filter = "/DCTDecode"; | |
| 715 break; | |
| 716 case L_G4_ENCODE: | |
| 717 filter = "/CCITTFaxDecode"; | |
| 718 group4 = " /K -1\n"; | |
| 719 break; | |
| 720 case L_JP2K_ENCODE: | |
| 721 filter = "/JPXDecode"; | |
| 722 break; | |
| 723 default: | |
| 724 l_CIDataDestroy(&cid); | |
| 725 return false; | |
| 726 } | |
| 727 | |
| 728 // Maybe someday we will accept RGBA but today is not that day. | |
| 729 // It requires creating an /SMask for the alpha channel. | |
| 730 // http://stackoverflow.com/questions/14220221 | |
| 731 std::stringstream colorspace; | |
| 732 // Use "C" locale (needed for int values larger than 999). | |
| 733 colorspace.imbue(std::locale::classic()); | |
| 734 if (cid->ncolors > 0) { | |
| 735 colorspace << " /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) << " " | |
| 736 << cid->cmapdatahex << " ]\n"; | |
| 737 } else { | |
| 738 switch (cid->spp) { | |
| 739 case 1: | |
| 740 if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) { | |
| 741 colorspace.str( | |
| 742 " /ColorSpace /DeviceGray\n" | |
| 743 " /Decode [1 0]\n"); | |
| 744 } else { | |
| 745 colorspace.str(" /ColorSpace /DeviceGray\n"); | |
| 746 } | |
| 747 break; | |
| 748 case 3: | |
| 749 colorspace.str(" /ColorSpace /DeviceRGB\n"); | |
| 750 break; | |
| 751 default: | |
| 752 l_CIDataDestroy(&cid); | |
| 753 return false; | |
| 754 } | |
| 755 } | |
| 756 | |
| 757 int predictor = (cid->predictor) ? 14 : 1; | |
| 758 | |
| 759 // IMAGE | |
| 760 std::stringstream b1; | |
| 761 // Use "C" locale (needed for int values larger than 999). | |
| 762 b1.imbue(std::locale::classic()); | |
| 763 b1 << objnum | |
| 764 << " 0 obj\n" | |
| 765 "<<\n" | |
| 766 " /Length " | |
| 767 << cid->nbytescomp | |
| 768 << "\n" | |
| 769 " /Subtype /Image\n"; | |
| 770 | |
| 771 std::stringstream b2; | |
| 772 // Use "C" locale (needed for int values larger than 999). | |
| 773 b2.imbue(std::locale::classic()); | |
| 774 b2 << " /Width " << cid->w | |
| 775 << "\n" | |
| 776 " /Height " | |
| 777 << cid->h | |
| 778 << "\n" | |
| 779 " /BitsPerComponent " | |
| 780 << cid->bps | |
| 781 << "\n" | |
| 782 " /Filter " | |
| 783 << filter | |
| 784 << "\n" | |
| 785 " /DecodeParms\n" | |
| 786 " <<\n" | |
| 787 " /Predictor " | |
| 788 << predictor | |
| 789 << "\n" | |
| 790 " /Colors " | |
| 791 << cid->spp << "\n" | |
| 792 << group4 << " /Columns " << cid->w | |
| 793 << "\n" | |
| 794 " /BitsPerComponent " | |
| 795 << cid->bps | |
| 796 << "\n" | |
| 797 " >>\n" | |
| 798 ">>\n" | |
| 799 "stream\n"; | |
| 800 | |
| 801 const char *b3 = | |
| 802 "endstream\n" | |
| 803 "endobj\n"; | |
| 804 | |
| 805 size_t b1_len = b1.str().size(); | |
| 806 size_t b2_len = b2.str().size(); | |
| 807 size_t b3_len = strlen(b3); | |
| 808 size_t colorspace_len = colorspace.str().size(); | |
| 809 | |
| 810 *pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len; | |
| 811 *pdf_object = new char[*pdf_object_size]; | |
| 812 | |
| 813 char *p = *pdf_object; | |
| 814 memcpy(p, b1.str().c_str(), b1_len); | |
| 815 p += b1_len; | |
| 816 memcpy(p, colorspace.str().c_str(), colorspace_len); | |
| 817 p += colorspace_len; | |
| 818 memcpy(p, b2.str().c_str(), b2_len); | |
| 819 p += b2_len; | |
| 820 memcpy(p, cid->datacomp, cid->nbytescomp); | |
| 821 p += cid->nbytescomp; | |
| 822 memcpy(p, b3, b3_len); | |
| 823 l_CIDataDestroy(&cid); | |
| 824 return true; | |
| 825 } | |
| 826 | |
| 827 bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) { | |
| 828 Pix *pix = api->GetInputImage(); | |
| 829 const char *filename = api->GetInputName(); | |
| 830 int ppi = api->GetSourceYResolution(); | |
| 831 if (!pix || ppi <= 0) { | |
| 832 return false; | |
| 833 } | |
| 834 double width = pixGetWidth(pix) * 72.0 / ppi; | |
| 835 double height = pixGetHeight(pix) * 72.0 / ppi; | |
| 836 | |
| 837 std::stringstream xobject; | |
| 838 // Use "C" locale (needed for int values larger than 999). | |
| 839 xobject.imbue(std::locale::classic()); | |
| 840 if (!textonly_) { | |
| 841 xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n"; | |
| 842 } | |
| 843 | |
| 844 // PAGE | |
| 845 std::stringstream stream; | |
| 846 // Use "C" locale (needed for double values width and height). | |
| 847 stream.imbue(std::locale::classic()); | |
| 848 stream.precision(2); | |
| 849 stream << std::fixed << obj_ | |
| 850 << " 0 obj\n" | |
| 851 "<<\n" | |
| 852 " /Type /Page\n" | |
| 853 " /Parent 2 0 R\n" // Pages object | |
| 854 " /MediaBox [0 0 " | |
| 855 << width << " " << height | |
| 856 << "]\n" | |
| 857 " /Contents " | |
| 858 << (obj_ + 1) | |
| 859 << " 0 R\n" // Contents object | |
| 860 " /Resources\n" | |
| 861 " <<\n" | |
| 862 " " | |
| 863 << xobject.str() << // Image object | |
| 864 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" | |
| 865 " /Font << /f-0-0 3 0 R >>\n" // Type0 Font | |
| 866 " >>\n" | |
| 867 ">>\n" | |
| 868 "endobj\n"; | |
| 869 pages_.push_back(obj_); | |
| 870 AppendPDFObject(stream.str().c_str()); | |
| 871 | |
| 872 // CONTENTS | |
| 873 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height)); | |
| 874 const size_t pdftext_len = strlen(pdftext.get()); | |
| 875 size_t len = pdftext_len; | |
| 876 #ifndef NO_PDF_COMPRESSION | |
| 877 auto comp_pdftext = zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len); | |
| 878 #endif | |
| 879 stream.str(""); | |
| 880 stream << obj_ | |
| 881 << " 0 obj\n" | |
| 882 "<<\n" | |
| 883 " /Length " | |
| 884 << len | |
| 885 << "" | |
| 886 #ifndef NO_PDF_COMPRESSION | |
| 887 " /Filter /FlateDecode" | |
| 888 #endif | |
| 889 "\n" | |
| 890 ">>\n" | |
| 891 "stream\n" | |
| 892 ; | |
| 893 AppendString(stream.str().c_str()); | |
| 894 long objsize = stream.str().size(); | |
| 895 #ifndef NO_PDF_COMPRESSION | |
| 896 AppendData(reinterpret_cast<char *>(comp_pdftext), len); | |
| 897 #else | |
| 898 AppendData(reinterpret_cast<char *>(pdftext.get()), len); | |
| 899 #endif | |
| 900 objsize += len; | |
| 901 #ifndef NO_PDF_COMPRESSION | |
| 902 lept_free(comp_pdftext); | |
| 903 #endif | |
| 904 objsize += AppendData("endstream\n"sv); | |
| 905 objsize += AppendData("endobj\n"sv); | |
| 906 AppendPDFObjectDIY(objsize); | |
| 907 | |
| 908 if (!textonly_) { | |
| 909 char *pdf_object = nullptr; | |
| 910 int jpg_quality; | |
| 911 api->GetIntVariable("jpg_quality", &jpg_quality); | |
| 912 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) { | |
| 913 return false; | |
| 914 } | |
| 915 AppendData(pdf_object, objsize); | |
| 916 AppendPDFObjectDIY(objsize); | |
| 917 delete[] pdf_object; | |
| 918 } | |
| 919 return true; | |
| 920 } | |
| 921 | |
| 922 bool TessPDFRenderer::EndDocumentHandler() { | |
| 923 // We reserved the /Pages object number early, so that the /Page | |
| 924 // objects could refer to their parent. We finally have enough | |
| 925 // information to go fill it in. Using lower level calls to manipulate | |
| 926 // the offset record in two spots, because we are placing objects | |
| 927 // out of order in the file. | |
| 928 | |
| 929 // PAGES | |
| 930 const long int kPagesObjectNumber = 2; | |
| 931 offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1 | |
| 932 std::stringstream stream; | |
| 933 // Use "C" locale (needed for int values larger than 999). | |
| 934 stream.imbue(std::locale::classic()); | |
| 935 stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ "; | |
| 936 AppendString(stream.str().c_str()); | |
| 937 size_t pages_objsize = stream.str().size(); | |
| 938 for (const auto &page : pages_) { | |
| 939 stream.str(""); | |
| 940 stream << page << " 0 R "; | |
| 941 AppendString(stream.str().c_str()); | |
| 942 pages_objsize += stream.str().size(); | |
| 943 } | |
| 944 stream.str(""); | |
| 945 stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n"; | |
| 946 AppendString(stream.str().c_str()); | |
| 947 pages_objsize += stream.str().size(); | |
| 948 offsets_.back() += pages_objsize; // manipulation #2 | |
| 949 | |
| 950 // INFO | |
| 951 std::string utf16_title = "FEFF"; // byte_order_marker | |
| 952 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title()); | |
| 953 char utf16[kMaxBytesPerCodepoint]; | |
| 954 for (char32 code : unicodes) { | |
| 955 if (CodepointToUtf16be(code, utf16)) { | |
| 956 utf16_title += utf16; | |
| 957 } | |
| 958 } | |
| 959 | |
| 960 char *datestr = l_getFormattedDate(); | |
| 961 stream.str(""); | |
| 962 stream << obj_ | |
| 963 << " 0 obj\n" | |
| 964 "<<\n" | |
| 965 " /Producer (Tesseract " | |
| 966 << tesseract::TessBaseAPI::Version() | |
| 967 << ")\n" | |
| 968 " /CreationDate (D:" | |
| 969 << datestr | |
| 970 << ")\n" | |
| 971 " /Title <" | |
| 972 << utf16_title.c_str() | |
| 973 << ">\n" | |
| 974 ">>\n" | |
| 975 "endobj\n"; | |
| 976 lept_free(datestr); | |
| 977 AppendPDFObject(stream.str().c_str()); | |
| 978 stream.str(""); | |
| 979 stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n"; | |
| 980 AppendString(stream.str().c_str()); | |
| 981 for (int i = 1; i < obj_; i++) { | |
| 982 stream.str(""); | |
| 983 stream.width(10); | |
| 984 stream.fill('0'); | |
| 985 stream << offsets_[i] << " 00000 n \n"; | |
| 986 AppendString(stream.str().c_str()); | |
| 987 } | |
| 988 stream.str(""); | |
| 989 stream << "trailer\n<<\n /Size " << obj_ | |
| 990 << "\n" | |
| 991 " /Root 1 0 R\n" // catalog | |
| 992 " /Info " | |
| 993 << (obj_ - 1) | |
| 994 << " 0 R\n" // info | |
| 995 ">>\nstartxref\n" | |
| 996 << offsets_.back() << "\n%%EOF\n"; | |
| 997 AppendString(stream.str().c_str()); | |
| 998 return true; | |
| 999 } | |
| 1000 } // namespace tesseract |
