Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: hocrrenderer.cpp | |
| 3 * Description: Simple API for calling tesseract. | |
| 4 * Author: Ray Smith (original code from baseapi.cpp) | |
| 5 * Author: Stefan Weil (moved to separate file and cleaned code) | |
| 6 * | |
| 7 * (C) Copyright 2006, Google Inc. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 **********************************************************************/ | |
| 19 | |
| 20 #include <tesseract/baseapi.h> // for TessBaseAPI | |
| 21 #include <locale> // for std::locale::classic | |
| 22 #include <memory> // for std::unique_ptr | |
| 23 #include <sstream> // for std::stringstream | |
| 24 #include <tesseract/renderer.h> | |
| 25 #include "helpers.h" // for copy_string | |
| 26 #include "tesseractclass.h" // for Tesseract | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 /** | |
| 31 * Gets the block orientation at the current iterator position. | |
| 32 */ | |
| 33 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) { | |
| 34 tesseract::Orientation orientation; | |
| 35 tesseract::WritingDirection writing_direction; | |
| 36 tesseract::TextlineOrder textline_order; | |
| 37 float deskew_angle; | |
| 38 it->Orientation(&orientation, &writing_direction, &textline_order, | |
| 39 &deskew_angle); | |
| 40 return orientation; | |
| 41 } | |
| 42 | |
| 43 /** | |
| 44 * Fits a line to the baseline at the given level, and appends its coefficients | |
| 45 * to the hOCR string. | |
| 46 * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for | |
| 47 * rotated textlines. For this reason, on textlines that are not upright, this | |
| 48 * method currently only inserts a 'textangle' property to indicate the rotation | |
| 49 * direction and does not add any baseline information to the hocr string. | |
| 50 */ | |
| 51 static void AddBaselineCoordsTohOCR(const PageIterator *it, | |
| 52 PageIteratorLevel level, | |
| 53 std::stringstream &hocr_str) { | |
| 54 tesseract::Orientation orientation = GetBlockTextOrientation(it); | |
| 55 if (orientation != ORIENTATION_PAGE_UP) { | |
| 56 hocr_str << "; textangle " << 360 - orientation * 90; | |
| 57 return; | |
| 58 } | |
| 59 | |
| 60 int left, top, right, bottom; | |
| 61 it->BoundingBox(level, &left, &top, &right, &bottom); | |
| 62 | |
| 63 // Try to get the baseline coordinates at this level. | |
| 64 int x1, y1, x2, y2; | |
| 65 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) { | |
| 66 return; | |
| 67 } | |
| 68 // Following the description of this field of the hOCR spec, we convert the | |
| 69 // baseline coordinates so that "the bottom left of the bounding box is the | |
| 70 // origin". | |
| 71 x1 -= left; | |
| 72 x2 -= left; | |
| 73 y1 -= bottom; | |
| 74 y2 -= bottom; | |
| 75 | |
| 76 // Now fit a line through the points so we can extract coefficients for the | |
| 77 // equation: y = p1 x + p0 | |
| 78 if (x1 == x2) { | |
| 79 // Problem computing the polynomial coefficients. | |
| 80 return; | |
| 81 } | |
| 82 double p1 = (y2 - y1) / static_cast<double>(x2 - x1); | |
| 83 double p0 = y1 - p1 * x1; | |
| 84 | |
| 85 hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " | |
| 86 << round(p0 * 1000.0) / 1000.0; | |
| 87 } | |
| 88 | |
| 89 static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level, | |
| 90 std::stringstream &hocr_str) { | |
| 91 int left, top, right, bottom; | |
| 92 it->BoundingBox(level, &left, &top, &right, &bottom); | |
| 93 // This is the only place we use double quotes instead of single quotes, | |
| 94 // but it may too late to change for consistency | |
| 95 hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " | |
| 96 << bottom; | |
| 97 // Add baseline coordinates & heights for textlines only. | |
| 98 if (level == RIL_TEXTLINE) { | |
| 99 AddBaselineCoordsTohOCR(it, level, hocr_str); | |
| 100 // add custom height measures | |
| 101 float row_height, descenders, ascenders; // row attributes | |
| 102 it->RowAttributes(&row_height, &descenders, &ascenders); | |
| 103 // TODO(rays): Do we want to limit these to a single decimal place? | |
| 104 hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders | |
| 105 << "; x_ascenders " << ascenders; | |
| 106 } | |
| 107 hocr_str << "\">"; | |
| 108 } | |
| 109 | |
| 110 /** | |
| 111 * Make a HTML-formatted string with hOCR markup from the internal | |
| 112 * data structures. | |
| 113 * page_number is 0-based but will appear in the output as 1-based. | |
| 114 * Image name/input_file_ can be set by SetInputName before calling | |
| 115 * GetHOCRText | |
| 116 * STL removed from original patch submission and refactored by rays. | |
| 117 * Returned string must be freed with the delete [] operator. | |
| 118 */ | |
| 119 char *TessBaseAPI::GetHOCRText(int page_number) { | |
| 120 return GetHOCRText(nullptr, page_number); | |
| 121 } | |
| 122 | |
| 123 /** | |
| 124 * Make a HTML-formatted string with hOCR markup from the internal | |
| 125 * data structures. | |
| 126 * page_number is 0-based but will appear in the output as 1-based. | |
| 127 * Image name/input_file_ can be set by SetInputName before calling | |
| 128 * GetHOCRText | |
| 129 * STL removed from original patch submission and refactored by rays. | |
| 130 * Returned string must be freed with the delete [] operator. | |
| 131 */ | |
| 132 char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { | |
| 133 if (tesseract_ == nullptr || | |
| 134 (page_res_ == nullptr && Recognize(monitor) < 0)) { | |
| 135 return nullptr; | |
| 136 } | |
| 137 | |
| 138 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1; | |
| 139 int page_id = page_number + 1; // hOCR uses 1-based page numbers. | |
| 140 bool para_is_ltr = true; // Default direction is LTR | |
| 141 const char *paragraph_lang = nullptr; | |
| 142 bool font_info = false; | |
| 143 bool hocr_boxes = false; | |
| 144 GetBoolVariable("hocr_font_info", &font_info); | |
| 145 GetBoolVariable("hocr_char_boxes", &hocr_boxes); | |
| 146 | |
| 147 if (input_file_.empty()) { | |
| 148 SetInputName(nullptr); | |
| 149 } | |
| 150 | |
| 151 std::stringstream hocr_str; | |
| 152 // Use "C" locale (needed for double values x_size and x_descenders). | |
| 153 hocr_str.imbue(std::locale::classic()); | |
| 154 // Use 8 digits for double values. | |
| 155 hocr_str.precision(8); | |
| 156 hocr_str << " <div class='ocr_page'" | |
| 157 << " id='" | |
| 158 << "page_" << page_id << "'" | |
| 159 << " title='image \""; | |
| 160 if (!input_file_.empty()) { | |
| 161 hocr_str << HOcrEscape(input_file_.c_str()); | |
| 162 } else { | |
| 163 hocr_str << "unknown"; | |
| 164 } | |
| 165 | |
| 166 hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " | |
| 167 << rect_width_ << " " << rect_height_ << "; ppageno " << page_number | |
| 168 << "; scan_res " << GetSourceYResolution() << " " | |
| 169 << GetSourceYResolution() << "'>\n"; | |
| 170 | |
| 171 std::unique_ptr<ResultIterator> res_it(GetIterator()); | |
| 172 while (!res_it->Empty(RIL_BLOCK)) { | |
| 173 int left, top, right, bottom; | |
| 174 auto block_type = res_it->BlockType(); | |
| 175 switch (block_type) { | |
| 176 case PT_FLOWING_IMAGE: | |
| 177 case PT_HEADING_IMAGE: | |
| 178 case PT_PULLOUT_IMAGE: { | |
| 179 // Handle all kinds of images. | |
| 180 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); | |
| 181 hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_' | |
| 182 << bcnt++ << "' title=\"bbox " << left << " " << top << " " | |
| 183 << right << " " << bottom << "\"></div>\n"; | |
| 184 res_it->Next(RIL_BLOCK); | |
| 185 continue; | |
| 186 } | |
| 187 case PT_HORZ_LINE: | |
| 188 case PT_VERT_LINE: | |
| 189 // Handle horizontal and vertical lines. | |
| 190 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); | |
| 191 hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_' | |
| 192 << bcnt++ << "' title=\"bbox " << left << " " << top << " " | |
| 193 << right << " " << bottom << "\"></div>\n"; | |
| 194 res_it->Next(RIL_BLOCK); | |
| 195 continue; | |
| 196 case PT_NOISE: | |
| 197 tprintf("TODO: Please report image which triggers the noise case.\n"); | |
| 198 ASSERT_HOST(false); | |
| 199 default: | |
| 200 break; | |
| 201 } | |
| 202 | |
| 203 if (res_it->Empty(RIL_WORD)) { | |
| 204 res_it->Next(RIL_WORD); | |
| 205 continue; | |
| 206 } | |
| 207 | |
| 208 // Open any new block/paragraph/textline. | |
| 209 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { | |
| 210 para_is_ltr = true; // reset to default direction | |
| 211 hocr_str << " <div class='ocr_carea'" | |
| 212 << " id='" | |
| 213 << "block_" << page_id << "_" << bcnt << "'"; | |
| 214 AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str); | |
| 215 } | |
| 216 if (res_it->IsAtBeginningOf(RIL_PARA)) { | |
| 217 hocr_str << "\n <p class='ocr_par'"; | |
| 218 para_is_ltr = res_it->ParagraphIsLtr(); | |
| 219 if (!para_is_ltr) { | |
| 220 hocr_str << " dir='rtl'"; | |
| 221 } | |
| 222 hocr_str << " id='" | |
| 223 << "par_" << page_id << "_" << pcnt << "'"; | |
| 224 paragraph_lang = res_it->WordRecognitionLanguage(); | |
| 225 if (paragraph_lang) { | |
| 226 hocr_str << " lang='" << paragraph_lang << "'"; | |
| 227 } | |
| 228 AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str); | |
| 229 } | |
| 230 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { | |
| 231 hocr_str << "\n <span class='"; | |
| 232 switch (block_type) { | |
| 233 case PT_HEADING_TEXT: | |
| 234 hocr_str << "ocr_header"; | |
| 235 break; | |
| 236 case PT_PULLOUT_TEXT: | |
| 237 hocr_str << "ocr_textfloat"; | |
| 238 break; | |
| 239 case PT_CAPTION_TEXT: | |
| 240 hocr_str << "ocr_caption"; | |
| 241 break; | |
| 242 case PT_FLOWING_IMAGE: | |
| 243 case PT_HEADING_IMAGE: | |
| 244 case PT_PULLOUT_IMAGE: | |
| 245 ASSERT_HOST(false); | |
| 246 break; | |
| 247 default: | |
| 248 hocr_str << "ocr_line"; | |
| 249 } | |
| 250 hocr_str << "' id='" | |
| 251 << "line_" << page_id << "_" << lcnt << "'"; | |
| 252 AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str); | |
| 253 } | |
| 254 | |
| 255 // Now, process the word... | |
| 256 int32_t lstm_choice_mode = tesseract_->lstm_choice_mode; | |
| 257 std::vector<std::vector<std::vector<std::pair<const char *, float>>>> | |
| 258 *rawTimestepMap = nullptr; | |
| 259 std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr; | |
| 260 if (lstm_choice_mode) { | |
| 261 CTCMap = res_it->GetBestLSTMSymbolChoices(); | |
| 262 rawTimestepMap = res_it->GetRawLSTMTimesteps(); | |
| 263 } | |
| 264 hocr_str << "\n <span class='ocrx_word'" | |
| 265 << " id='" | |
| 266 << "word_" << page_id << "_" << wcnt << "'"; | |
| 267 bool bold, italic, underlined, monospace, serif, smallcaps; | |
| 268 int pointsize, font_id; | |
| 269 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); | |
| 270 const char *font_name = | |
| 271 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, | |
| 272 &serif, &smallcaps, &pointsize, &font_id); | |
| 273 hocr_str << " title='bbox " << left << " " << top << " " << right << " " | |
| 274 << bottom << "; x_wconf " | |
| 275 << static_cast<int>(res_it->Confidence(RIL_WORD)); | |
| 276 if (font_info) { | |
| 277 if (font_name) { | |
| 278 hocr_str << "; x_font " << HOcrEscape(font_name).c_str(); | |
| 279 } | |
| 280 hocr_str << "; x_fsize " << pointsize; | |
| 281 } | |
| 282 hocr_str << "'"; | |
| 283 const char *lang = res_it->WordRecognitionLanguage(); | |
| 284 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { | |
| 285 hocr_str << " lang='" << lang << "'"; | |
| 286 } | |
| 287 switch (res_it->WordDirection()) { | |
| 288 // Only emit direction if different from current paragraph direction | |
| 289 case DIR_LEFT_TO_RIGHT: | |
| 290 if (!para_is_ltr) { | |
| 291 hocr_str << " dir='ltr'"; | |
| 292 } | |
| 293 break; | |
| 294 case DIR_RIGHT_TO_LEFT: | |
| 295 if (para_is_ltr) { | |
| 296 hocr_str << " dir='rtl'"; | |
| 297 } | |
| 298 break; | |
| 299 case DIR_MIX: | |
| 300 case DIR_NEUTRAL: | |
| 301 default: // Do nothing. | |
| 302 break; | |
| 303 } | |
| 304 hocr_str << ">"; | |
| 305 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); | |
| 306 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); | |
| 307 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); | |
| 308 if (bold) { | |
| 309 hocr_str << "<strong>"; | |
| 310 } | |
| 311 if (italic) { | |
| 312 hocr_str << "<em>"; | |
| 313 } | |
| 314 do { | |
| 315 const std::unique_ptr<const char[]> grapheme( | |
| 316 res_it->GetUTF8Text(RIL_SYMBOL)); | |
| 317 if (grapheme && grapheme[0] != 0) { | |
| 318 if (hocr_boxes) { | |
| 319 res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); | |
| 320 hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " | |
| 321 << left << " " << top << " " << right << " " << bottom | |
| 322 << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>"; | |
| 323 } | |
| 324 hocr_str << HOcrEscape(grapheme.get()).c_str(); | |
| 325 if (hocr_boxes) { | |
| 326 hocr_str << "</span>"; | |
| 327 tesseract::ChoiceIterator ci(*res_it); | |
| 328 if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) { | |
| 329 std::vector<std::vector<std::pair<const char *, float>>> *symbol = | |
| 330 ci.Timesteps(); | |
| 331 hocr_str << "\n <span class='ocr_symbol'" | |
| 332 << " id='" | |
| 333 << "symbol_" << page_id << "_" << wcnt << "_" << scnt | |
| 334 << "'>"; | |
| 335 for (const auto ×tep : *symbol) { | |
| 336 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 337 << " id='" | |
| 338 << "timestep" << page_id << "_" << wcnt << "_" << tcnt | |
| 339 << "'>"; | |
| 340 for (auto conf : timestep) { | |
| 341 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 342 << " id='" | |
| 343 << "choice_" << page_id << "_" << wcnt << "_" << ccnt | |
| 344 << "'" | |
| 345 << " title='x_confs " << int(conf.second * 100) << "'>" | |
| 346 << HOcrEscape(conf.first).c_str() << "</span>"; | |
| 347 ++ccnt; | |
| 348 } | |
| 349 hocr_str << "</span>"; | |
| 350 ++tcnt; | |
| 351 } | |
| 352 hocr_str << "\n </span>"; | |
| 353 ++scnt; | |
| 354 } else if (lstm_choice_mode == 2) { | |
| 355 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 356 << " id='" | |
| 357 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt | |
| 358 << "'>"; | |
| 359 do { | |
| 360 const char *choice = ci.GetUTF8Text(); | |
| 361 float choiceconf = ci.Confidence(); | |
| 362 if (choice != nullptr) { | |
| 363 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 364 << " id='" | |
| 365 << "choice_" << page_id << "_" << wcnt << "_" << ccnt | |
| 366 << "'" | |
| 367 << " title='x_confs " << choiceconf << "'>" | |
| 368 << HOcrEscape(choice).c_str() << "</span>"; | |
| 369 ccnt++; | |
| 370 } | |
| 371 } while (ci.Next()); | |
| 372 hocr_str << "\n </span>"; | |
| 373 tcnt++; | |
| 374 } | |
| 375 } | |
| 376 } | |
| 377 res_it->Next(RIL_SYMBOL); | |
| 378 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); | |
| 379 if (italic) { | |
| 380 hocr_str << "</em>"; | |
| 381 } | |
| 382 if (bold) { | |
| 383 hocr_str << "</strong>"; | |
| 384 } | |
| 385 // If the lstm choice mode is required it is added here | |
| 386 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) { | |
| 387 for (const auto &symbol : *rawTimestepMap) { | |
| 388 hocr_str << "\n <span class='ocr_symbol'" | |
| 389 << " id='" | |
| 390 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>"; | |
| 391 for (const auto ×tep : symbol) { | |
| 392 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 393 << " id='" | |
| 394 << "timestep" << page_id << "_" << wcnt << "_" << tcnt | |
| 395 << "'>"; | |
| 396 for (auto &&conf : timestep) { | |
| 397 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 398 << " id='" | |
| 399 << "choice_" << page_id << "_" << wcnt << "_" << ccnt | |
| 400 << "'" | |
| 401 << " title='x_confs " << int(conf.second * 100) << "'>" | |
| 402 << HOcrEscape(conf.first).c_str() << "</span>"; | |
| 403 ++ccnt; | |
| 404 } | |
| 405 hocr_str << "</span>"; | |
| 406 ++tcnt; | |
| 407 } | |
| 408 hocr_str << "</span>"; | |
| 409 ++scnt; | |
| 410 } | |
| 411 } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) { | |
| 412 for (const auto ×tep : *CTCMap) { | |
| 413 if (timestep.size() > 0) { | |
| 414 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 415 << " id='" | |
| 416 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt | |
| 417 << "'>"; | |
| 418 for (auto &j : timestep) { | |
| 419 float conf = 100 - tesseract_->lstm_rating_coefficient * j.second; | |
| 420 if (conf < 0.0f) { | |
| 421 conf = 0.0f; | |
| 422 } | |
| 423 if (conf > 100.0f) { | |
| 424 conf = 100.0f; | |
| 425 } | |
| 426 hocr_str << "\n <span class='ocrx_cinfo'" | |
| 427 << " id='" | |
| 428 << "choice_" << page_id << "_" << wcnt << "_" << ccnt | |
| 429 << "'" | |
| 430 << " title='x_confs " << conf << "'>" | |
| 431 << HOcrEscape(j.first).c_str() << "</span>"; | |
| 432 ccnt++; | |
| 433 } | |
| 434 hocr_str << "</span>"; | |
| 435 tcnt++; | |
| 436 } | |
| 437 } | |
| 438 } | |
| 439 // Close ocrx_word. | |
| 440 if (hocr_boxes || lstm_choice_mode > 0) { | |
| 441 hocr_str << "\n "; | |
| 442 } | |
| 443 hocr_str << "</span>"; | |
| 444 tcnt = 1; | |
| 445 ccnt = 1; | |
| 446 wcnt++; | |
| 447 // Close any ending block/paragraph/textline. | |
| 448 if (last_word_in_line) { | |
| 449 hocr_str << "\n </span>"; | |
| 450 lcnt++; | |
| 451 } | |
| 452 if (last_word_in_para) { | |
| 453 hocr_str << "\n </p>\n"; | |
| 454 pcnt++; | |
| 455 para_is_ltr = true; // back to default direction | |
| 456 } | |
| 457 if (last_word_in_block) { | |
| 458 hocr_str << " </div>\n"; | |
| 459 bcnt++; | |
| 460 } | |
| 461 } | |
| 462 hocr_str << " </div>\n"; | |
| 463 | |
| 464 return copy_string(hocr_str.str()); | |
| 465 } | |
| 466 | |
| 467 /********************************************************************** | |
| 468 * HOcr Text Renderer interface implementation | |
| 469 **********************************************************************/ | |
| 470 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase) | |
| 471 : TessResultRenderer(outputbase, "hocr") { | |
| 472 font_info_ = false; | |
| 473 } | |
| 474 | |
| 475 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) | |
| 476 : TessResultRenderer(outputbase, "hocr") { | |
| 477 font_info_ = font_info; | |
| 478 } | |
| 479 | |
| 480 bool TessHOcrRenderer::BeginDocumentHandler() { | |
| 481 AppendString( | |
| 482 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" | |
| 483 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" | |
| 484 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" | |
| 485 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" " | |
| 486 "lang=\"en\">\n <head>\n <title>"); | |
| 487 AppendString(title()); | |
| 488 AppendString( | |
| 489 "</title>\n" | |
| 490 " <meta http-equiv=\"Content-Type\" content=\"text/html;" | |
| 491 "charset=utf-8\"/>\n" | |
| 492 " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR | |
| 493 "' />\n" | |
| 494 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par" | |
| 495 " ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf"); | |
| 496 if (font_info_) { | |
| 497 AppendString(" ocrp_font ocrp_fsize"); | |
| 498 } | |
| 499 AppendString( | |
| 500 "'/>\n" | |
| 501 " </head>\n" | |
| 502 " <body>\n"); | |
| 503 | |
| 504 return true; | |
| 505 } | |
| 506 | |
| 507 bool TessHOcrRenderer::EndDocumentHandler() { | |
| 508 AppendString(" </body>\n</html>\n"); | |
| 509 | |
| 510 return true; | |
| 511 } | |
| 512 | |
| 513 bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) { | |
| 514 const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum())); | |
| 515 if (hocr == nullptr) { | |
| 516 return false; | |
| 517 } | |
| 518 | |
| 519 AppendString(hocr.get()); | |
| 520 | |
| 521 return true; | |
| 522 } | |
| 523 | |
| 524 } // namespace tesseract |
