Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/api/altorenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // File: altorenderer.cpp | |
| 2 // Description: ALTO rendering interface | |
| 3 // Author: Jake Sebright | |
| 4 | |
| 5 // (C) Copyright 2018 | |
| 6 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 // you may not use this file except in compliance with the License. | |
| 8 // You may obtain a copy of the License at | |
| 9 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 | |
| 16 #include "errcode.h" // for ASSERT_HOST | |
| 17 #include "helpers.h" // for copy_string | |
| 18 #include "tprintf.h" // for tprintf | |
| 19 | |
| 20 #include <tesseract/baseapi.h> | |
| 21 #include <tesseract/renderer.h> | |
| 22 | |
| 23 #include <memory> | |
| 24 #include <sstream> // for std::stringstream | |
| 25 | |
| 26 namespace tesseract { | |
| 27 | |
| 28 /// Add coordinates to specified TextBlock, TextLine or String bounding box. | |
| 29 /// Add word confidence if adding to a String bounding box. | |
| 30 /// | |
| 31 static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, | |
| 32 std::stringstream &alto_str) { | |
| 33 int left, top, right, bottom; | |
| 34 it->BoundingBox(level, &left, &top, &right, &bottom); | |
| 35 | |
| 36 int hpos = left; | |
| 37 int vpos = top; | |
| 38 int height = bottom - top; | |
| 39 int width = right - left; | |
| 40 | |
| 41 alto_str << " HPOS=\"" << hpos << "\""; | |
| 42 alto_str << " VPOS=\"" << vpos << "\""; | |
| 43 alto_str << " WIDTH=\"" << width << "\""; | |
| 44 alto_str << " HEIGHT=\"" << height << "\""; | |
| 45 | |
| 46 if (level == RIL_WORD) { | |
| 47 int wc = it->Confidence(RIL_WORD); | |
| 48 alto_str << " WC=\"0." << wc << "\""; | |
| 49 } else { | |
| 50 alto_str << ">"; | |
| 51 } | |
| 52 } | |
| 53 | |
| 54 /// | |
| 55 /// Append the ALTO XML for the beginning of the document | |
| 56 /// | |
| 57 bool TessAltoRenderer::BeginDocumentHandler() { | |
| 58 // Delay the XML output because we need the name of the image file. | |
| 59 begin_document = true; | |
| 60 return true; | |
| 61 } | |
| 62 | |
| 63 /// | |
| 64 /// Append the ALTO XML for the layout of the image | |
| 65 /// | |
| 66 bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) { | |
| 67 if (begin_document) { | |
| 68 AppendString( | |
| 69 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" | |
| 70 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" " | |
| 71 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " | |
| 72 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# " | |
| 73 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n" | |
| 74 "\t<Description>\n" | |
| 75 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n" | |
| 76 "\t\t<sourceImageInformation>\n" | |
| 77 "\t\t\t<fileName>"); | |
| 78 | |
| 79 AppendString(api->GetInputName()); | |
| 80 | |
| 81 AppendString( | |
| 82 "</fileName>\n" | |
| 83 "\t\t</sourceImageInformation>\n" | |
| 84 "\t\t<OCRProcessing ID=\"OCR_0\">\n" | |
| 85 "\t\t\t<ocrProcessingStep>\n" | |
| 86 "\t\t\t\t<processingSoftware>\n" | |
| 87 "\t\t\t\t\t<softwareName>tesseract "); | |
| 88 AppendString(TessBaseAPI::Version()); | |
| 89 AppendString( | |
| 90 "</softwareName>\n" | |
| 91 "\t\t\t\t</processingSoftware>\n" | |
| 92 "\t\t\t</ocrProcessingStep>\n" | |
| 93 "\t\t</OCRProcessing>\n" | |
| 94 "\t</Description>\n" | |
| 95 "\t<Layout>\n"); | |
| 96 begin_document = false; | |
| 97 } | |
| 98 | |
| 99 const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum())); | |
| 100 if (text == nullptr) { | |
| 101 return false; | |
| 102 } | |
| 103 | |
| 104 AppendString(text.get()); | |
| 105 | |
| 106 return true; | |
| 107 } | |
| 108 | |
| 109 /// | |
| 110 /// Append the ALTO XML for the end of the document | |
| 111 /// | |
| 112 bool TessAltoRenderer::EndDocumentHandler() { | |
| 113 AppendString("\t</Layout>\n</alto>\n"); | |
| 114 | |
| 115 return true; | |
| 116 } | |
| 117 | |
| 118 TessAltoRenderer::TessAltoRenderer(const char *outputbase) | |
| 119 : TessResultRenderer(outputbase, "xml"), | |
| 120 begin_document(false) {} | |
| 121 | |
| 122 /// | |
| 123 /// Make an XML-formatted string with ALTO markup from the internal | |
| 124 /// data structures. | |
| 125 /// | |
| 126 char *TessBaseAPI::GetAltoText(int page_number) { | |
| 127 return GetAltoText(nullptr, page_number); | |
| 128 } | |
| 129 | |
| 130 /// | |
| 131 /// Make an XML-formatted string with ALTO markup from the internal | |
| 132 /// data structures. | |
| 133 /// | |
| 134 char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { | |
| 135 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) { | |
| 136 return nullptr; | |
| 137 } | |
| 138 | |
| 139 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0; | |
| 140 | |
| 141 if (input_file_.empty()) { | |
| 142 SetInputName(nullptr); | |
| 143 } | |
| 144 | |
| 145 std::stringstream alto_str; | |
| 146 // Use "C" locale (needed for int values larger than 999). | |
| 147 alto_str.imbue(std::locale::classic()); | |
| 148 alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_ | |
| 149 << "\" PHYSICAL_IMG_NR=\"" << page_number << "\"" | |
| 150 << " ID=\"page_" << page_number << "\">\n" | |
| 151 << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\"" | |
| 152 << " WIDTH=\"" << rect_width_ << "\"" | |
| 153 << " HEIGHT=\"" << rect_height_ << "\">\n"; | |
| 154 | |
| 155 std::unique_ptr<ResultIterator> res_it(GetIterator()); | |
| 156 while (!res_it->Empty(RIL_BLOCK)) { | |
| 157 if (res_it->Empty(RIL_WORD)) { | |
| 158 res_it->Next(RIL_WORD); | |
| 159 continue; | |
| 160 } | |
| 161 | |
| 162 int left, top, right, bottom; | |
| 163 auto block_type = res_it->BlockType(); | |
| 164 | |
| 165 switch (block_type) { | |
| 166 case PT_FLOWING_IMAGE: | |
| 167 case PT_HEADING_IMAGE: | |
| 168 case PT_PULLOUT_IMAGE: { | |
| 169 // Handle all kinds of images. | |
| 170 // TODO: optionally add TYPE, for example TYPE="photo". | |
| 171 alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\""; | |
| 172 AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str); | |
| 173 alto_str << "</Illustration>\n"; | |
| 174 res_it->Next(RIL_BLOCK); | |
| 175 continue; | |
| 176 } | |
| 177 case PT_HORZ_LINE: | |
| 178 case PT_VERT_LINE: | |
| 179 // Handle horizontal and vertical lines. | |
| 180 alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\""; | |
| 181 AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str); | |
| 182 alto_str << "</GraphicalElement >\n"; | |
| 183 res_it->Next(RIL_BLOCK); | |
| 184 continue; | |
| 185 case PT_NOISE: | |
| 186 tprintf("TODO: Please report image which triggers the noise case.\n"); | |
| 187 ASSERT_HOST(false); | |
| 188 default: | |
| 189 break; | |
| 190 } | |
| 191 | |
| 192 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { | |
| 193 alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\""; | |
| 194 AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str); | |
| 195 alto_str << "\n"; | |
| 196 } | |
| 197 | |
| 198 if (res_it->IsAtBeginningOf(RIL_PARA)) { | |
| 199 alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\""; | |
| 200 AddBoxToAlto(res_it.get(), RIL_PARA, alto_str); | |
| 201 alto_str << "\n"; | |
| 202 } | |
| 203 | |
| 204 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { | |
| 205 alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\""; | |
| 206 AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str); | |
| 207 alto_str << "\n"; | |
| 208 } | |
| 209 | |
| 210 alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\""; | |
| 211 AddBoxToAlto(res_it.get(), RIL_WORD, alto_str); | |
| 212 alto_str << " CONTENT=\""; | |
| 213 | |
| 214 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); | |
| 215 bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); | |
| 216 bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); | |
| 217 | |
| 218 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); | |
| 219 | |
| 220 do { | |
| 221 const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); | |
| 222 if (grapheme && grapheme[0] != 0) { | |
| 223 alto_str << HOcrEscape(grapheme.get()).c_str(); | |
| 224 } | |
| 225 res_it->Next(RIL_SYMBOL); | |
| 226 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); | |
| 227 | |
| 228 alto_str << "\"/>"; | |
| 229 | |
| 230 wcnt++; | |
| 231 | |
| 232 if (last_word_in_line) { | |
| 233 alto_str << "\n\t\t\t\t\t\t</TextLine>\n"; | |
| 234 lcnt++; | |
| 235 } else { | |
| 236 int hpos = right; | |
| 237 int vpos = top; | |
| 238 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); | |
| 239 int width = left - hpos; | |
| 240 alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos | |
| 241 << "\"/>\n"; | |
| 242 } | |
| 243 | |
| 244 if (last_word_in_tblock) { | |
| 245 alto_str << "\t\t\t\t\t</TextBlock>\n"; | |
| 246 tcnt++; | |
| 247 } | |
| 248 | |
| 249 if (last_word_in_cblock) { | |
| 250 alto_str << "\t\t\t\t</ComposedBlock>\n"; | |
| 251 bcnt++; | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 alto_str << "\t\t\t</PrintSpace>\n" | |
| 256 << "\t\t</Page>\n"; | |
| 257 | |
| 258 return copy_string(alto_str.str()); | |
| 259 } | |
| 260 | |
| 261 } // namespace tesseract |
