Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/output-pdfocr.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 | |
| 25 #include <assert.h> | |
| 26 #include <string.h> | |
| 27 #include <limits.h> | |
| 28 | |
| 29 #ifdef OCR_DISABLED | |
| 30 | |
/* In non-OCR builds, we need to define this otherwise SWIG Python gets SEGV
	when it attempts to import mupdf.py and _mupdf.py.
	The usage text is deliberately empty here because no PDFOCR options
	are available when OCR_DISABLED is defined. */
const char *fz_pdfocr_write_options_usage = "";
| 34 | |
| 35 #else | |
| 36 | |
| 37 #include "tessocr.h" | |
| 38 | |
/* Human readable summary of the option keys understood by
 * fz_parse_pdfocr_options() (compression, strip-height, ocr-language,
 * ocr-datadir, skew and skew-border). */
const char *fz_pdfocr_write_options_usage =
	"PDFOCR output options:\n"
	"\tcompression=none: No compression (default)\n"
	"\tcompression=flate: Flate compression\n"
	"\tstrip-height=N: Strip height (default 0=fullpage)\n"
	"\tocr-language=<lang>: OCR language (default=eng)\n"
	"\tocr-datadir=<datadir>: OCR data path (default=rely on TESSDATA_PREFIX)\n"
	"\tskew=none,auto,<angle>: Whether to skew correct (default=none).\n"
	"\tskew-border=increase,maintain,decrease: Size change for border pixels (default=increase).\n"
	"\n";
| 49 | |
/* PDF object 3: the Type0 font "GlyphLessFont" that pages reference as /F0.
 * It points at the descendant font (object 4) and the ToUnicode CMap
 * (object 6). Emitted verbatim by pdfocr_write_header(). */
static const char funky_font[] =
	"3 0 obj\n<</BaseFont/GlyphLessFont/DescendantFonts[4 0 R]"
	"/Encoding/Identity-H/Subtype/Type0/ToUnicode 6 0 R/Type/Font"
	">>\nendobj\n";
| 54 | |
/* PDF object 4: the CIDFontType2 descendant of object 3. It references the
 * CIDToGIDMap stream (object 5) and the font descriptor (object 7), and
 * gives every glyph a default width (/DW) of 500. */
static const char funky_font2[] =
	"4 0 obj\n"
	"<</BaseFont/GlyphLessFont/CIDToGIDMap 5 0 R"
	"/CIDSystemInfo<</Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
	"/FontDescriptor 7 0 R/Subtype/CIDFontType2/Type/Font/DW 500>>"
	"\nendobj\n";
| 61 | |
/* PDF object 5: the CIDToGIDMap stream referenced from object 4.
 * The payload is 210 bytes of Flate-compressed binary data (matching the
 * declared /Length); it must be kept byte-for-byte. Because it contains
 * NUL bytes it is always written using sizeof()-1, never strlen(). */
static const char funky_font3[] =
	"5 0 obj\n<</Length 210/Filter/FlateDecode>>\nstream\n"
	"\x78\x9c\xec\xc2\x01\x09\x00\x00\x00\x02\xa0\xfa\x7f\xba\x21\x89"
	"\xa6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x80\x7b\x03\x00\x00\xff\xff\xec\xc2\x01\x0d\x00\x00\x00\xc2\x20"
	"\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xec\xc2\x01\x0d\x00"
	"\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xed"
	"\xc2\x01\x0d\x00\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\xff"
	"\x00\x10"
	"\nendstream\nendobj\n";
| 79 | |
/* PDF object 6: the ToUnicode CMap referenced from object 3. It maps every
 * two byte code <0000>..<FFFF> onto the identical Unicode value.
 *
 * The stream dictionary declares /Length 353, so the text between
 * "stream\n" and "endstream" must be exactly 353 bytes. That only holds
 * when the three CIDSystemInfo entries are indented by two spaces; with a
 * single space the body is 350 bytes and the declared length is wrong. */
static const char funky_font4[] =
	"6 0 obj\n<</Length 353>>\nstream\n"
	"/CIDInit /ProcSet findresource begin\n"
	"12 dict begin\n"
	"begincmap\n"
	"/CIDSystemInfo\n"
	"<<\n"
	"  /Registry (Adobe)\n"
	"  /Ordering (UCS)\n"
	"  /Supplement 0\n"
	">> def\n"
	"/CMapName /Adobe-Identity-UCS def\n"
	"/CMapType 2 def\n"
	"1 begincodespacerange\n"
	"<0000> <FFFF>\n"
	"endcodespacerange\n"
	"1 beginbfrange\n"
	"<0000> <FFFF> <0000>\n"
	"endbfrange\n"
	"endcmap\n"
	"CMapName currentdict /CMap defineresource pop\n"
	"end\n"
	"end\n"
	"endstream\n"
	"endobj\n";
| 105 | |
/* PDF object 7: the FontDescriptor for GlyphLessFont. Its /FontFile2 entry
 * points at the embedded font program in object 8. */
static const char funky_font5[] =
	"7 0 obj\n"
	"<</Ascent 1000/CapHeight 1000/Descent -1/Flags 5"
	"/FontBBox[0 0 500 1000]/FontFile2 8 0 R/FontName/GlyphLessFont"
	"/ItalicAngle 0/StemV 80/Type/FontDescriptor>>\nendobj\n";
| 111 | |
/* PDF object 8: the embedded font program referenced as /FontFile2 by
 * object 7. The payload is 572 bytes of uncompressed binary data (matching
 * /Length and /Length1) and must be kept byte-for-byte. It contains NUL
 * bytes, so it is always written using sizeof()-1, never strlen(). */
static const char funky_font6[] =
	"8 0 obj\n<</Length 572/Length1 572>>\nstream\n"
	"\x00\x01\x00\x00\x00\x0a\x00\x80\x00\x03\x00\x20\x4f\x53\x2f\x32"
	"\x56\xde\xc8\x94\x00\x00\x01\x28\x00\x00\x00\x60\x63\x6d\x61\x70"
	"\x00\x0a\x00\x34\x00\x00\x01\x90\x00\x00\x00\x1e\x67\x6c\x79\x66"
	"\x15\x22\x41\x24\x00\x00\x01\xb8\x00\x00\x00\x18\x68\x65\x61\x64"
	"\x0b\x78\xf1\x65\x00\x00\x00\xac\x00\x00\x00\x36\x68\x68\x65\x61"
	"\x0c\x02\x04\x02\x00\x00\x00\xe4\x00\x00\x00\x24\x68\x6d\x74\x78"
	"\x04\x00\x00\x00\x00\x00\x01\x88\x00\x00\x00\x08\x6c\x6f\x63\x61"
	"\x00\x0c\x00\x00\x00\x00\x01\xb0\x00\x00\x00\x06\x6d\x61\x78\x70"
	"\x00\x04\x00\x05\x00\x00\x01\x08\x00\x00\x00\x20\x6e\x61\x6d\x65"
	"\xf2\xeb\x16\xda\x00\x00\x01\xd0\x00\x00\x00\x4b\x70\x6f\x73\x74"
	"\x00\x01\x00\x01\x00\x00\x02\x1c\x00\x00\x00\x20\x00\x01\x00\x00"
	"\x00\x01\x00\x00\xb0\x94\x71\x10\x5f\x0f\x3c\xf5\x04\x07\x08\x00"
	"\x00\x00\x00\x00\xcf\x9a\xfc\x6e\x00\x00\x00\x00\xd4\xc3\xa7\xf2"
	"\x00\x00\x00\x00\x04\x00\x08\x00\x00\x00\x00\x10\x00\x02\x00\x00"
	"\x00\x00\x00\x00\x00\x01\x00\x00\x08\x00\xff\xff\x00\x00\x04\x00"
	"\x00\x00\x00\x00\x04\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x02\x00\x01\x00\x00\x00\x02\x00\x04"
	"\x00\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x01\x90\x00\x05"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x01\x00\x01\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x47\x4f\x4f\x47\x00\x40\x00\x00\x00\x00\x00\x01\xff\xff"
	"\x00\x00\x00\x01\x00\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00"
	"\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00\x14\x00\x03\x00\x00"
	"\x00\x00\x00\x14\x00\x06\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x00\x04\x00"
	"\x08\x00\x00\x03\x00\x00\x31\x21\x11\x21\x04\x00\xfc\x00\x08\x00"
	"\x00\x00\x00\x03\x00\x2a\x00\x00\x00\x03\x00\x00\x00\x05\x00\x16"
	"\x00\x00\x00\x01\x00\x00\x00\x00\x00\x05\x00\x0b\x00\x16\x00\x03"
	"\x00\x01\x04\x09\x00\x05\x00\x16\x00\x00\x00\x56\x00\x65\x00\x72"
	"\x00\x73\x00\x69\x00\x6f\x00\x6e\x00\x20\x00\x31\x00\x2e\x00\x30"
	"\x56\x65\x72\x73\x69\x6f\x6e\x20\x31\x2e\x30\x00\x00\x01\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\nendstream\nendobj\n";
| 151 | |
| 152 #endif | |
| 153 | |
| 154 fz_pdfocr_options * | |
| 155 fz_parse_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, const char *args) | |
| 156 { | |
| 157 #ifdef OCR_DISABLED | |
| 158 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 159 #else | |
| 160 const char *val; | |
| 161 | |
| 162 memset(opts, 0, sizeof *opts); | |
| 163 | |
| 164 if (fz_has_option(ctx, args, "compression", &val)) | |
| 165 { | |
| 166 if (fz_option_eq(val, "none")) | |
| 167 opts->compress = 0; | |
| 168 else if (fz_option_eq(val, "flate")) | |
| 169 opts->compress = 1; | |
| 170 else | |
| 171 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported PDFOCR compression %s (none, or flate only)", val); | |
| 172 } | |
| 173 if (fz_has_option(ctx, args, "strip-height", &val)) | |
| 174 { | |
| 175 int i = fz_atoi(val); | |
| 176 if (i <= 0) | |
| 177 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported PDFOCR strip height %d (suggest 0)", i); | |
| 178 opts->strip_height = i; | |
| 179 } | |
| 180 if (fz_has_option(ctx, args, "ocr-language", &val)) | |
| 181 { | |
| 182 fz_copy_option(ctx, val, opts->language, nelem(opts->language)); | |
| 183 } | |
| 184 if (fz_has_option(ctx, args, "ocr-datadir", &val)) | |
| 185 { | |
| 186 fz_copy_option(ctx, val, opts->datadir, nelem(opts->datadir)); | |
| 187 } | |
| 188 if (fz_has_option(ctx, args, "skew", &val)) | |
| 189 { | |
| 190 if (fz_option_eq(val, "auto")) | |
| 191 opts->skew_correct = 1; | |
| 192 else | |
| 193 { | |
| 194 opts->skew_correct = 2; | |
| 195 opts->skew_angle = fz_atof(val); | |
| 196 } | |
| 197 } | |
| 198 if (fz_has_option(ctx, args, "skew-border", &val)) | |
| 199 { | |
| 200 if (fz_option_eq(val, "increase")) | |
| 201 opts->skew_border = 0; | |
| 202 else if (fz_option_eq(val, "maintain")) | |
| 203 opts->skew_border = 1; | |
| 204 else if (fz_option_eq(val, "decrease")) | |
| 205 opts->skew_border = 2; | |
| 206 else | |
| 207 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported skew-border option"); | |
| 208 } | |
| 209 | |
| 210 return opts; | |
| 211 #endif | |
| 212 } | |
| 213 | |
| 214 void | |
| 215 fz_write_pixmap_as_pdfocr(fz_context *ctx, fz_output *out, const fz_pixmap *pixmap, const fz_pdfocr_options *pdfocr) | |
| 216 { | |
| 217 #ifdef OCR_DISABLED | |
| 218 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 219 #else | |
| 220 fz_band_writer *writer; | |
| 221 | |
| 222 if (!pixmap || !out) | |
| 223 return; | |
| 224 | |
| 225 writer = fz_new_pdfocr_band_writer(ctx, out, pdfocr); | |
| 226 fz_try(ctx) | |
| 227 { | |
| 228 fz_write_header(ctx, writer, pixmap->w, pixmap->h, pixmap->n, pixmap->alpha, pixmap->xres, pixmap->yres, 0, pixmap->colorspace, pixmap->seps); | |
| 229 fz_write_band(ctx, writer, pixmap->stride, pixmap->h, pixmap->samples); | |
| 230 fz_close_band_writer(ctx, writer); | |
| 231 } | |
| 232 fz_always(ctx) | |
| 233 fz_drop_band_writer(ctx, writer); | |
| 234 fz_catch(ctx) | |
| 235 fz_rethrow(ctx); | |
| 236 #endif | |
| 237 } | |
| 238 | |
| 239 #ifndef OCR_DISABLED | |
/* State for the PDFOCR band writer. 'super' must remain the first member:
 * the band callbacks cast the fz_band_writer pointer straight to this type. */
typedef struct pdfocr_band_writer_s
{
	fz_band_writer super;
	fz_pdfocr_options options;

	/* The actual output size */
	int deskewed_w;
	int deskewed_h;

	/* Object bookkeeping: new_obj() returns obj_num and then increments it,
	 * after recording the object's file offset in xref[]. xref_max is the
	 * number of entries allocated in xref. */
	int obj_num;
	int xref_max;
	int64_t *xref;
	/* Page bookkeeping: 'pages' counts the headers written so far,
	 * page_obj[] holds the object number recorded for each page, and
	 * page_max is the number of entries allocated in page_obj. */
	int pages;
	int page_max;
	int *page_obj;
	/* stripbuf holds one strip of image rows awaiting output. compbuf is
	 * the deflate destination used when options.compress is set; complen
	 * is its size as given by fz_deflate_bound(). */
	unsigned char *stripbuf;
	unsigned char *compbuf;
	size_t complen;

	/* Only allocated when options.skew_correct is set: bands are collected
	 * here and processed later by do_skew_correct(). */
	fz_pixmap *skew_bitmap;

	/* NOTE(review): presumably the OCR engine handle; it is not used in
	 * the part of the file visible here. */
	void *tessapi;
	/* One byte per pixel copy of the page for OCR; its width is the page
	 * width rounded up to a multiple of 4. */
	fz_pixmap *ocrbitmap;

	/* Optional progress callback; see pdfocr_progress(). */
	fz_pdfocr_progress_fn *progress;
	void *progress_arg;
} pdfocr_band_writer;
| 267 | |
| 268 static int | |
| 269 new_obj(fz_context *ctx, pdfocr_band_writer *writer) | |
| 270 { | |
| 271 int64_t pos = fz_tell_output(ctx, writer->super.out); | |
| 272 | |
| 273 if (writer->obj_num >= writer->xref_max) | |
| 274 { | |
| 275 int new_max = writer->xref_max * 2; | |
| 276 if (new_max < writer->obj_num + 8) | |
| 277 new_max = writer->obj_num + 8; | |
| 278 writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t); | |
| 279 writer->xref_max = new_max; | |
| 280 } | |
| 281 | |
| 282 writer->xref[writer->obj_num] = pos; | |
| 283 | |
| 284 return writer->obj_num++; | |
| 285 } | |
| 286 | |
| 287 static void | |
| 288 post_skew_write_header(fz_context *ctx, pdfocr_band_writer *writer, int w, int h) | |
| 289 { | |
| 290 fz_output *out = writer->super.out; | |
| 291 int xres = writer->super.xres; | |
| 292 int yres = writer->super.yres; | |
| 293 int sh = writer->options.strip_height; | |
| 294 int n = writer->super.n; | |
| 295 int strips; | |
| 296 int i; | |
| 297 | |
| 298 if (sh == 0) | |
| 299 sh = h; | |
| 300 assert(sh != 0 && "pdfocr_write_header() should not be given zero height input."); | |
| 301 strips = (h + sh-1)/sh; | |
| 302 | |
| 303 writer->deskewed_w = w; | |
| 304 writer->deskewed_h = h; | |
| 305 | |
| 306 writer->stripbuf = Memento_label(fz_malloc(ctx, (size_t)w * sh * n), "pdfocr_stripbuf"); | |
| 307 writer->complen = fz_deflate_bound(ctx, (size_t)w * sh * n); | |
| 308 writer->compbuf = Memento_label(fz_malloc(ctx, writer->complen), "pdfocr_compbuf"); | |
| 309 | |
| 310 /* Always round the width of ocrbitmap up to a multiple of 4. */ | |
| 311 writer->ocrbitmap = fz_new_pixmap(ctx, NULL, (w+3)&~3, h, NULL, 0); | |
| 312 fz_set_pixmap_resolution(ctx, writer->ocrbitmap, xres, yres); | |
| 313 | |
| 314 /* Send the Page Object */ | |
| 315 fz_write_printf(ctx, out, "%d 0 obj\n<</Type/Page/Parent 2 0 R/Resources<</XObject<<", new_obj(ctx, writer)); | |
| 316 for (i = 0; i < strips; i++) | |
| 317 fz_write_printf(ctx, out, "/I%d %d 0 R", i, writer->obj_num + i); | |
| 318 fz_write_printf(ctx, out, ">>/Font<</F0 3 0 R>>>>/MediaBox[0 0 %g %g]/Contents %d 0 R>>\nendobj\n", | |
| 319 w * 72.0f / xres, h * 72.0f / yres, writer->obj_num + strips); | |
| 320 } | |
| 321 | |
| 322 static void | |
| 323 pdfocr_write_header(fz_context *ctx, fz_band_writer *writer_, fz_colorspace *cs) | |
| 324 { | |
| 325 pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; | |
| 326 fz_output *out = writer->super.out; | |
| 327 int w = writer->super.w; | |
| 328 int h = writer->super.h; | |
| 329 int n = writer->super.n; | |
| 330 int s = writer->super.s; | |
| 331 int a = writer->super.alpha; | |
| 332 int sh = writer->options.strip_height; | |
| 333 | |
| 334 if (sh == 0) | |
| 335 sh = h; | |
| 336 assert(sh != 0 && "pdfocr_write_header() should not be given zero height input."); | |
| 337 | |
| 338 if (a != 0) | |
| 339 fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR cannot write alpha channel"); | |
| 340 if (s != 0) | |
| 341 fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR cannot write spot colors"); | |
| 342 if (n != 3 && n != 1) | |
| 343 fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR expected to be Grayscale or RGB"); | |
| 344 | |
| 345 fz_free(ctx, writer->stripbuf); | |
| 346 writer->stripbuf = NULL; | |
| 347 fz_free(ctx, writer->compbuf); | |
| 348 writer->compbuf = NULL; | |
| 349 fz_drop_pixmap(ctx, writer->ocrbitmap); | |
| 350 writer->ocrbitmap = NULL; | |
| 351 | |
| 352 /* Send the file header on the first page */ | |
| 353 if (writer->pages == 0) | |
| 354 { | |
| 355 fz_write_string(ctx, out, "%PDF-1.4\n%PDFOCR-1.0\n"); | |
| 356 | |
| 357 if (writer->xref_max < 9) | |
| 358 { | |
| 359 int new_max = 9; | |
| 360 writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t); | |
| 361 writer->xref_max = new_max; | |
| 362 } | |
| 363 writer->xref[3] = fz_tell_output(ctx, out); | |
| 364 fz_write_data(ctx, out, funky_font, sizeof(funky_font)-1); | |
| 365 writer->xref[4] = fz_tell_output(ctx, out); | |
| 366 fz_write_data(ctx, out, funky_font2, sizeof(funky_font2)-1); | |
| 367 writer->xref[5] = fz_tell_output(ctx, out); | |
| 368 fz_write_data(ctx, out, funky_font3, sizeof(funky_font3)-1); | |
| 369 writer->xref[6] = fz_tell_output(ctx, out); | |
| 370 fz_write_data(ctx, out, funky_font4, sizeof(funky_font4)-1); | |
| 371 writer->xref[7] = fz_tell_output(ctx, out); | |
| 372 fz_write_data(ctx, out, funky_font5, sizeof(funky_font5)-1); | |
| 373 writer->xref[8] = fz_tell_output(ctx, out); | |
| 374 fz_write_data(ctx, out, funky_font6, sizeof(funky_font6)-1); | |
| 375 } | |
| 376 | |
| 377 if (writer->page_max <= writer->pages) | |
| 378 { | |
| 379 int new_max = writer->page_max * 2; | |
| 380 if (new_max == 0) | |
| 381 new_max = writer->pages + 8; | |
| 382 writer->page_obj = fz_realloc_array(ctx, writer->page_obj, new_max, int); | |
| 383 writer->page_max = new_max; | |
| 384 } | |
| 385 writer->page_obj[writer->pages] = writer->obj_num; | |
| 386 writer->pages++; | |
| 387 | |
| 388 if (writer->options.skew_correct) | |
| 389 writer->skew_bitmap = fz_new_pixmap(ctx, n == 3 ? fz_device_rgb(ctx) : fz_device_gray(ctx), w, h, NULL, 0); | |
| 390 else | |
| 391 post_skew_write_header(ctx, writer, w, h); | |
| 392 } | |
| 393 | |
| 394 static void | |
| 395 flush_strip(fz_context *ctx, pdfocr_band_writer *writer, int fill) | |
| 396 { | |
| 397 unsigned char *data = writer->stripbuf; | |
| 398 fz_output *out = writer->super.out; | |
| 399 int w = writer->deskewed_w; | |
| 400 int n = writer->super.n; | |
| 401 size_t len = (size_t)w*n*fill; | |
| 402 | |
| 403 /* Buffer is full, compress it and write it. */ | |
| 404 if (writer->options.compress) | |
| 405 { | |
| 406 size_t destLen = writer->complen; | |
| 407 fz_deflate(ctx, writer->compbuf, &destLen, data, len, FZ_DEFLATE_DEFAULT); | |
| 408 len = destLen; | |
| 409 data = writer->compbuf; | |
| 410 } | |
| 411 fz_write_printf(ctx, out, "%d 0 obj\n<</Width %d/ColorSpace/Device%s/Height %d%s/Subtype/Image", | |
| 412 new_obj(ctx, writer), w, n == 1 ? "Gray" : "RGB", fill, writer->options.compress ? "/Filter/FlateDecode" : ""); | |
| 413 fz_write_printf(ctx, out, "/Length %zd/Type/XObject/BitsPerComponent 8>>\nstream\n", len); | |
| 414 fz_write_data(ctx, out, data, len); | |
| 415 fz_write_string(ctx, out, "\nendstream\nendobj\n"); | |
| 416 } | |
| 417 | |
| 418 static void | |
| 419 post_skew_write_band(fz_context *ctx, pdfocr_band_writer *writer, int stride, int band_start, int band_height, const unsigned char *sp) | |
| 420 { | |
| 421 int w = writer->deskewed_w; | |
| 422 int h = writer->deskewed_h; | |
| 423 int n = writer->super.n; | |
| 424 int x, y; | |
| 425 int sh = writer->options.strip_height; | |
| 426 int line; | |
| 427 unsigned char *d; | |
| 428 | |
| 429 if (sh == 0) | |
| 430 sh = h; | |
| 431 | |
| 432 for (line = 0; line < band_height; line++) | |
| 433 { | |
| 434 int dstline = (band_start+line) % sh; | |
| 435 memcpy(writer->stripbuf + (size_t)w*n*dstline, | |
| 436 sp + (size_t)line * w * n, | |
| 437 (size_t)w * n); | |
| 438 if (dstline+1 == sh) | |
| 439 flush_strip(ctx, writer, dstline+1); | |
| 440 } | |
| 441 if (band_start + band_height == h && h % sh != 0) | |
| 442 flush_strip(ctx, writer, h % sh); | |
| 443 | |
| 444 /* Copy strip to ocrbitmap, converting if required. */ | |
| 445 d = writer->ocrbitmap->samples; | |
| 446 d += band_start*w; | |
| 447 if (n == 1) | |
| 448 { | |
| 449 for (y = band_height; y > 0; y--) | |
| 450 { | |
| 451 memcpy(d, sp, w); | |
| 452 if (writer->ocrbitmap->w - w) | |
| 453 memset(d + w, 0, writer->ocrbitmap->w - w); | |
| 454 d += writer->ocrbitmap->w; | |
| 455 } | |
| 456 } | |
| 457 else | |
| 458 { | |
| 459 for (y = band_height; y > 0; y--) | |
| 460 { | |
| 461 for (x = w; x > 0; x--) | |
| 462 { | |
| 463 *d++ = (sp[0] + 2*sp[1] + sp[2] + 2)>>2; | |
| 464 sp += 3; | |
| 465 } | |
| 466 for (x = writer->ocrbitmap->w - w; x > 0; x--) | |
| 467 *d++ = 0; | |
| 468 } | |
| 469 } | |
| 470 } | |
| 471 | |
| 472 static void | |
| 473 pdfocr_write_band(fz_context *ctx, fz_band_writer *writer_, int stride, int band_start, int band_height, const unsigned char *sp) | |
| 474 { | |
| 475 pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; | |
| 476 fz_output *out = writer->super.out; | |
| 477 int w = writer->super.w; | |
| 478 int n = writer->super.n; | |
| 479 unsigned char *d; | |
| 480 | |
| 481 if (!out) | |
| 482 return; | |
| 483 | |
| 484 if (writer->skew_bitmap) | |
| 485 { | |
| 486 d = writer->skew_bitmap->samples; | |
| 487 d += band_start*w*n; | |
| 488 memcpy(d, sp, w*n*band_height); | |
| 489 } | |
| 490 else | |
| 491 post_skew_write_band(ctx, writer, stride, band_start, band_height, sp); | |
| 492 } | |
| 493 | |
/* Bit flags recording which directions of character-to-character motion
 * have been seen within a word; they are OR-ed together into
 * word_t.dirn, char_callback_data_t.word_dirn and .line_dirn. */
enum
{
	WORD_CONTAINS_L2R = 1,	/* left to right */
	WORD_CONTAINS_R2L = 2,	/* right to left */
	WORD_CONTAINS_T2B = 4,	/* top to bottom */
	WORD_CONTAINS_B2T = 8	/* bottom to top */
};
| 501 | |
/* A completed word waiting in the current line's singly linked list.
 * Allocated by queue_word() with room for 'len' characters and freed by
 * flush_words(). */
typedef struct word_t
{
	struct word_t *next;
	float bbox[4];	/* x0, y0, x1, y1 as computed by char_callback() */
	int dirn;	/* combination of WORD_CONTAINS_* flags */
	int len;	/* number of entries in chars[] */
	int chars[FZ_FLEXIBLE_ARRAY];
} word_t;
| 510 | |
/* Working state shared between char_callback(), queue_word() and
 * flush_words() while the text layer of one page is being built. */
typedef struct
{
	fz_buffer *buf;	/* receives the generated text operators */
	pdfocr_band_writer *writer;

	/* We collate the current word into the following fields: */
	int word_max;	/* allocated size of word_chars */
	int word_len;	/* characters collected so far */
	int *word_chars;
	float word_bbox[4];
	int word_dirn;	/* WORD_CONTAINS_* flags seen in this word */
	/* Stored by char_callback() when the first character of a word
	 * arrives; later characters are compared against it. */
	int word_prev_char_bbox[4];

	/* When we finish a word, we try to add it to the line. If the
	 * word fits onto the end of the existing line, great. If not,
	 * we flush the entire line, and start a new one just with the
	 * new word. This enables us to output a whole line at once,
	 * which is beneficial to avoid jittering the font sizes
	 * up/down, which looks bad when we try to select text in the
	 * produced PDF. */
	word_t *line;	/* head of the list of queued words */
	word_t **line_tail;	/* where the next word gets linked in */
	float line_bbox[4];	/* union of the queued words' bboxes */
	int line_dirn;	/* union of the queued words' dirn flags */

	/* The font size, horizontal scale and text position most recently
	 * written to buf, so unchanged values are not emitted again. */
	float cur_size;
	float cur_scale;
	float tx, ty;
} char_callback_data_t;
| 540 | |
| 541 static void | |
| 542 flush_words(fz_context *ctx, char_callback_data_t *cb) | |
| 543 { | |
| 544 float size; | |
| 545 | |
| 546 if (cb->line == NULL) | |
| 547 return; | |
| 548 | |
| 549 if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0) | |
| 550 { | |
| 551 /* Vertical line */ | |
| 552 } | |
| 553 else | |
| 554 { | |
| 555 /* Horizontal line */ | |
| 556 size = cb->line_bbox[3] - cb->line_bbox[1]; | |
| 557 | |
| 558 if (size != 0 && size != cb->cur_size) | |
| 559 { | |
| 560 fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size); | |
| 561 cb->cur_size = size; | |
| 562 } | |
| 563 /* Guard against division by 0. This makes no difference to the | |
| 564 * actual calculation as if size is 0, word->bbox[2] == word->bbox[0] | |
| 565 * too. */ | |
| 566 if (size == 0) | |
| 567 size = 1; | |
| 568 } | |
| 569 | |
| 570 while (cb->line) | |
| 571 { | |
| 572 word_t *word = cb->line; | |
| 573 float x, y; | |
| 574 int i, len = word->len; | |
| 575 float scale; | |
| 576 | |
| 577 if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0) | |
| 578 { | |
| 579 /* Contains vertical text. */ | |
| 580 size = (word->bbox[3] - word->bbox[1]) / len; | |
| 581 if (size == 0) | |
| 582 size = 1; | |
| 583 if (size != cb->cur_size) | |
| 584 { | |
| 585 fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size); | |
| 586 cb->cur_size = size; | |
| 587 } | |
| 588 | |
| 589 /* Set the scale so that our glyphs fill the line bbox. */ | |
| 590 scale = (cb->line_bbox[2] - cb->line_bbox[0]) / size * 200; | |
| 591 if (scale != 0) | |
| 592 { | |
| 593 float letter_height = (word->bbox[3] - word->bbox[1]) / len; | |
| 594 | |
| 595 if (scale != cb->cur_scale) | |
| 596 { | |
| 597 fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale); | |
| 598 cb->cur_scale = scale; | |
| 599 } | |
| 600 | |
| 601 for (i = 0; i < len; i++) | |
| 602 { | |
| 603 x = word->bbox[0]; | |
| 604 y = word->bbox[1] + letter_height * i; | |
| 605 fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty); | |
| 606 cb->tx = x; | |
| 607 cb->ty = y; | |
| 608 | |
| 609 fz_append_printf(ctx, cb->buf, "<%04x>Tj\n", word->chars[i]); | |
| 610 } | |
| 611 } | |
| 612 } | |
| 613 else | |
| 614 { | |
| 615 scale = (word->bbox[2] - word->bbox[0]) / size / len * 200; | |
| 616 if (scale != 0) | |
| 617 { | |
| 618 if (scale != cb->cur_scale) | |
| 619 { | |
| 620 fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale); | |
| 621 cb->cur_scale = scale; | |
| 622 } | |
| 623 | |
| 624 if ((word->dirn & (WORD_CONTAINS_R2L | WORD_CONTAINS_L2R)) == WORD_CONTAINS_R2L) | |
| 625 { | |
| 626 /* Purely R2L text */ | |
| 627 x = word->bbox[0]; | |
| 628 y = cb->line_bbox[1]; | |
| 629 fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty); | |
| 630 cb->tx = x; | |
| 631 cb->ty = y; | |
| 632 | |
| 633 /* Tesseract has sent us R2L text in R2L order (i.e. in Logical order). | |
| 634 * We want to output it in that same logical order, but PDF operators | |
| 635 * all move the point as if outputting L2R. We can either reverse the | |
| 636 * order of chars (bad, because of cut/paste) or we can perform | |
| 637 * gymnastics with the position. We opt for the latter. */ | |
| 638 fz_append_printf(ctx, cb->buf, "["); | |
| 639 for (i = 0; i < len; i++) | |
| 640 { | |
| 641 if (i == 0) | |
| 642 { | |
| 643 if (len > 1) | |
| 644 fz_append_printf(ctx, cb->buf, "%d", -500*(len-1)); | |
| 645 } | |
| 646 else | |
| 647 fz_append_printf(ctx, cb->buf, "%d", 1000); | |
| 648 fz_append_printf(ctx, cb->buf, "<%04x>", word->chars[i]); | |
| 649 } | |
| 650 fz_append_printf(ctx, cb->buf, "]TJ\n"); | |
| 651 } | |
| 652 else | |
| 653 { | |
| 654 /* L2R (or mixed) text */ | |
| 655 x = word->bbox[0]; | |
| 656 y = cb->line_bbox[1]; | |
| 657 fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty); | |
| 658 cb->tx = x; | |
| 659 cb->ty = y; | |
| 660 | |
| 661 fz_append_printf(ctx, cb->buf, "<"); | |
| 662 for (i = 0; i < len; i++) | |
| 663 fz_append_printf(ctx, cb->buf, "%04x", word->chars[i]); | |
| 664 fz_append_printf(ctx, cb->buf, ">Tj\n"); | |
| 665 } | |
| 666 } | |
| 667 } | |
| 668 | |
| 669 cb->line = word->next; | |
| 670 fz_free(ctx, word); | |
| 671 } | |
| 672 | |
| 673 cb->line_tail = &cb->line; | |
| 674 cb->line = NULL; | |
| 675 cb->line_dirn = 0; | |
| 676 } | |
| 677 | |
| 678 static void | |
| 679 queue_word(fz_context *ctx, char_callback_data_t *cb) | |
| 680 { | |
| 681 word_t *word; | |
| 682 int line_is_v, line_is_h, word_is_v, word_is_h; | |
| 683 | |
| 684 if (cb->word_len == 0) | |
| 685 return; | |
| 686 | |
| 687 word = fz_malloc_flexible(ctx, word_t, chars, cb->word_len); | |
| 688 word->next = NULL; | |
| 689 word->len = cb->word_len; | |
| 690 memcpy(word->bbox, cb->word_bbox, 4*sizeof(float)); | |
| 691 memcpy(word->chars, cb->word_chars, cb->word_len * sizeof(int)); | |
| 692 cb->word_len = 0; | |
| 693 | |
| 694 line_is_v = !!(cb->line_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B)); | |
| 695 word_is_v = !!(cb->word_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B)); | |
| 696 line_is_h = !!(cb->line_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L)); | |
| 697 word_is_h = !!(cb->word_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L)); | |
| 698 | |
| 699 word->dirn = cb->word_dirn; | |
| 700 cb->word_dirn = 0; | |
| 701 | |
| 702 /* Can we put the new word onto the end of the existing line? */ | |
| 703 if (cb->line != NULL && | |
| 704 !line_is_v && !word_is_v && | |
| 705 word->bbox[1] <= cb->line_bbox[3] && | |
| 706 word->bbox[3] >= cb->line_bbox[1] && | |
| 707 (word->bbox[0] >= cb->line_bbox[2] || word->bbox[2] <= cb->line_bbox[0])) | |
| 708 { | |
| 709 /* Can append (horizontal motion). */ | |
| 710 if (word->bbox[0] < cb->line_bbox[0]) | |
| 711 cb->line_bbox[0] = word->bbox[0]; | |
| 712 if (word->bbox[1] < cb->line_bbox[1]) | |
| 713 cb->line_bbox[1] = word->bbox[1]; | |
| 714 if (word->bbox[2] > cb->line_bbox[2]) | |
| 715 cb->line_bbox[2] = word->bbox[2]; | |
| 716 if (word->bbox[3] > cb->line_bbox[3]) | |
| 717 cb->line_bbox[3] = word->bbox[3]; | |
| 718 } | |
| 719 else if (cb->line != NULL && | |
| 720 !line_is_h && !word_is_h && | |
| 721 word->bbox[0] <= cb->line_bbox[2] && | |
| 722 word->bbox[2] >= cb->line_bbox[0] && | |
| 723 (word->bbox[1] >= cb->line_bbox[3] || word->bbox[3] <= cb->line_bbox[1])) | |
| 724 { | |
| 725 /* Can append (vertical motion). */ | |
| 726 if (!word_is_v) | |
| 727 word->dirn |= WORD_CONTAINS_T2B; | |
| 728 if (word->bbox[0] < cb->line_bbox[0]) | |
| 729 cb->line_bbox[0] = word->bbox[0]; | |
| 730 if (word->bbox[1] < cb->line_bbox[1]) | |
| 731 cb->line_bbox[1] = word->bbox[1]; | |
| 732 if (word->bbox[2] > cb->line_bbox[2]) | |
| 733 cb->line_bbox[2] = word->bbox[2]; | |
| 734 if (word->bbox[3] > cb->line_bbox[3]) | |
| 735 cb->line_bbox[3] = word->bbox[3]; | |
| 736 } | |
| 737 else | |
| 738 { | |
| 739 fz_try(ctx) | |
| 740 flush_words(ctx, cb); | |
| 741 fz_catch(ctx) | |
| 742 { | |
| 743 fz_free(ctx, word); | |
| 744 fz_rethrow(ctx); | |
| 745 } | |
| 746 memcpy(cb->line_bbox, word->bbox, 4*sizeof(float)); | |
| 747 } | |
| 748 | |
| 749 *cb->line_tail = word; | |
| 750 cb->line_tail = &word->next; | |
| 751 cb->line_dirn |= word->dirn; | |
| 752 } | |
| 753 | |
| 754 static void | |
| 755 char_callback(fz_context *ctx, void *arg, int unicode, | |
| 756 const char *font_name, | |
| 757 const int *line_bbox, const int *word_bbox, | |
| 758 const int *char_bbox, int pointsize) | |
| 759 { | |
| 760 char_callback_data_t *cb = (char_callback_data_t *)arg; | |
| 761 pdfocr_band_writer *writer = cb->writer; | |
| 762 float bbox[4]; | |
| 763 | |
| 764 bbox[0] = word_bbox[0] * 72.0f / cb->writer->ocrbitmap->xres; | |
| 765 bbox[3] = (writer->ocrbitmap->h - 1 - word_bbox[1]) * 72.0f / cb->writer->ocrbitmap->yres; | |
| 766 bbox[2] = word_bbox[2] * 72.0f / cb->writer->ocrbitmap->yres; | |
| 767 bbox[1] = (writer->ocrbitmap->h - 1 - word_bbox[3]) * 72.0f / cb->writer->ocrbitmap->yres; | |
| 768 | |
| 769 if (bbox[0] != cb->word_bbox[0] || | |
| 770 bbox[1] != cb->word_bbox[1] || | |
| 771 bbox[2] != cb->word_bbox[2] || | |
| 772 bbox[3] != cb->word_bbox[3]) | |
| 773 { | |
| 774 queue_word(ctx, cb); | |
| 775 memcpy(cb->word_bbox, bbox, 4 * sizeof(float)); | |
| 776 } | |
| 777 | |
| 778 if (cb->word_len == 0) | |
| 779 { | |
| 780 cb->word_dirn = 0; | |
| 781 memcpy(cb->word_prev_char_bbox, char_bbox, 4 * sizeof(int)); | |
| 782 } | |
| 783 else | |
| 784 { | |
| 785 int ox = cb->word_prev_char_bbox[0] + cb->word_prev_char_bbox[2]; | |
| 786 int oy = cb->word_prev_char_bbox[1] + cb->word_prev_char_bbox[3]; | |
| 787 int x = char_bbox[0] + char_bbox[2] - ox; | |
| 788 int y = char_bbox[1] + char_bbox[3] - oy; | |
| 789 int ax = x < 0 ? -x : x; | |
| 790 int ay = y < 0 ? -y : y; | |
| 791 if (ax > ay) | |
| 792 { | |
| 793 if (x > 0) | |
| 794 cb->word_dirn |= WORD_CONTAINS_L2R; | |
| 795 else if (x < 0) | |
| 796 cb->word_dirn |= WORD_CONTAINS_R2L; | |
| 797 } | |
| 798 else if (ay < ax) | |
| 799 { | |
| 800 if (y > 0) | |
| 801 cb->word_dirn |= WORD_CONTAINS_T2B; | |
| 802 else if (y < 0) | |
| 803 cb->word_dirn |= WORD_CONTAINS_B2T; | |
| 804 } | |
| 805 } | |
| 806 | |
| 807 if (cb->word_max == cb->word_len) | |
| 808 { | |
| 809 int newmax = cb->word_max * 2; | |
| 810 if (newmax == 0) | |
| 811 newmax = 16; | |
| 812 cb->word_chars = fz_realloc_array(ctx, cb->word_chars, newmax, int); | |
| 813 cb->word_max = newmax; | |
| 814 } | |
| 815 | |
| 816 cb->word_chars[cb->word_len++] = unicode; | |
| 817 } | |
| 818 | |
| 819 static int | |
| 820 pdfocr_progress(fz_context *ctx, void *arg, int prog) | |
| 821 { | |
| 822 char_callback_data_t *cb = (char_callback_data_t *)arg; | |
| 823 pdfocr_band_writer *writer = cb->writer; | |
| 824 | |
| 825 if (writer->progress == NULL) | |
| 826 return 0; | |
| 827 | |
| 828 return writer->progress(ctx, writer->progress_arg, writer->pages - 1, prog); | |
| 829 } | |
| 830 | |
| 831 static void | |
| 832 do_skew_correct(fz_context *ctx, pdfocr_band_writer *writer) | |
| 833 { | |
| 834 fz_pixmap *deskewed; | |
| 835 | |
| 836 if (writer->options.skew_correct == 1) | |
| 837 writer->options.skew_angle = fz_detect_skew(ctx, writer->skew_bitmap); | |
| 838 | |
| 839 deskewed = fz_deskew_pixmap(ctx, writer->skew_bitmap, writer->options.skew_angle, writer->options.skew_border); | |
| 840 | |
| 841 fz_try(ctx) | |
| 842 { | |
| 843 post_skew_write_header(ctx, writer, deskewed->w, deskewed->h); | |
| 844 post_skew_write_band(ctx, writer, deskewed->stride, 0, deskewed->h, deskewed->samples); | |
| 845 } | |
| 846 fz_always(ctx) | |
| 847 fz_drop_pixmap(ctx, deskewed); | |
| 848 fz_catch(ctx) | |
| 849 fz_rethrow(ctx); | |
| 850 } | |
| 851 | |
| 852 static void | |
| 853 pdfocr_write_trailer(fz_context *ctx, fz_band_writer *writer_) | |
| 854 { | |
| 855 pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; | |
| 856 fz_output *out = writer->super.out; | |
| 857 int xres = writer->super.xres; | |
| 858 int yres = writer->super.yres; | |
| 859 int sh = writer->options.strip_height; | |
| 860 int strips; | |
| 861 int w, h, i; | |
| 862 size_t len; | |
| 863 unsigned char *data; | |
| 864 fz_buffer *buf = NULL; | |
| 865 char_callback_data_t cb = { NULL }; | |
| 866 | |
| 867 if (writer->options.skew_correct) | |
| 868 do_skew_correct(ctx, writer); | |
| 869 | |
| 870 w = writer->deskewed_w; | |
| 871 h = writer->deskewed_h; | |
| 872 if (sh == 0) | |
| 873 sh = h; | |
| 874 strips = (h + sh-1)/sh; | |
| 875 | |
| 876 /* Send the Page contents */ | |
| 877 /* We need the length to this, so write to a buffer first */ | |
| 878 fz_var(buf); | |
| 879 fz_var(cb); | |
| 880 fz_try(ctx) | |
| 881 { | |
| 882 cb.writer = writer; | |
| 883 cb.buf = buf = fz_new_buffer(ctx, 0); | |
| 884 cb.line_tail = &cb.line; | |
| 885 cb.word_dirn = 0; | |
| 886 cb.line_dirn = 0; | |
| 887 fz_append_printf(ctx, buf, "q\n%g 0 0 %g 0 0 cm\n", 72.0f/xres, 72.0f/yres); | |
| 888 for (i = 0; i < strips; i++) | |
| 889 { | |
| 890 int at = h - (i+1)*sh; | |
| 891 int this_sh = sh; | |
| 892 if (at < 0) | |
| 893 { | |
| 894 this_sh += at; | |
| 895 at = 0; | |
| 896 } | |
| 897 fz_append_printf(ctx, buf, "/P <</MCID 0>> BDC\nq\n%d 0 0 %d 0 %d cm\n/I%d Do\nQ\n", | |
| 898 w, this_sh, at, i); | |
| 899 } | |
| 900 | |
| 901 fz_append_printf(ctx, buf, "Q\nBT\n3 Tr\n"); | |
| 902 | |
| 903 ocr_recognise(ctx, writer->tessapi, writer->ocrbitmap, char_callback, pdfocr_progress, &cb); | |
| 904 queue_word(ctx, &cb); | |
| 905 flush_words(ctx, &cb); | |
| 906 fz_append_printf(ctx, buf, "ET\n"); | |
| 907 | |
| 908 len = fz_buffer_storage(ctx, buf, &data); | |
| 909 fz_write_printf(ctx, out, "%d 0 obj\n<</Length %zd>>\nstream\n", new_obj(ctx, writer), len); | |
| 910 fz_write_data(ctx, out, data, len); | |
| 911 fz_drop_buffer(ctx, buf); | |
| 912 buf = NULL; | |
| 913 fz_write_string(ctx, out, "\nendstream\nendobj\n"); | |
| 914 } | |
| 915 fz_always(ctx) | |
| 916 { | |
| 917 fz_free(ctx, cb.word_chars); | |
| 918 } | |
| 919 fz_catch(ctx) | |
| 920 { | |
| 921 fz_drop_buffer(ctx, buf); | |
| 922 fz_rethrow(ctx); | |
| 923 } | |
| 924 } | |
| 925 | |
| 926 static void | |
| 927 pdfocr_close_band_writer(fz_context *ctx, fz_band_writer *writer_) | |
| 928 { | |
| 929 pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; | |
| 930 fz_output *out = writer->super.out; | |
| 931 int i; | |
| 932 | |
| 933 /* We actually do the trailer writing in the close */ | |
| 934 if (writer->xref_max > 2) | |
| 935 { | |
| 936 int64_t t_pos; | |
| 937 | |
| 938 /* Catalog */ | |
| 939 writer->xref[1] = fz_tell_output(ctx, out); | |
| 940 fz_write_printf(ctx, out, "1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n"); | |
| 941 | |
| 942 /* Page table */ | |
| 943 writer->xref[2] = fz_tell_output(ctx, out); | |
| 944 fz_write_printf(ctx, out, "2 0 obj\n<</Count %d/Kids[", writer->pages); | |
| 945 | |
| 946 for (i = 0; i < writer->pages; i++) | |
| 947 { | |
| 948 if (i > 0) | |
| 949 fz_write_byte(ctx, out, ' '); | |
| 950 fz_write_printf(ctx, out, "%d 0 R", writer->page_obj[i]); | |
| 951 } | |
| 952 fz_write_string(ctx, out, "]/Type/Pages>>\nendobj\n"); | |
| 953 | |
| 954 /* Xref */ | |
| 955 t_pos = fz_tell_output(ctx, out); | |
| 956 fz_write_printf(ctx, out, "xref\n0 %d\n0000000000 65535 f \n", writer->obj_num); | |
| 957 for (i = 1; i < writer->obj_num; i++) | |
| 958 fz_write_printf(ctx, out, "%010ld 00000 n \n", writer->xref[i]); | |
| 959 fz_write_printf(ctx, out, "trailer\n<</Size %d/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n", writer->obj_num, t_pos); | |
| 960 } | |
| 961 } | |
| 962 | |
| 963 static void | |
| 964 pdfocr_drop_band_writer(fz_context *ctx, fz_band_writer *writer_) | |
| 965 { | |
| 966 pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; | |
| 967 fz_free(ctx, writer->stripbuf); | |
| 968 fz_free(ctx, writer->compbuf); | |
| 969 fz_free(ctx, writer->page_obj); | |
| 970 fz_free(ctx, writer->xref); | |
| 971 fz_drop_pixmap(ctx, writer->ocrbitmap); | |
| 972 ocr_fin(ctx, writer->tessapi); | |
| 973 } | |
| 974 #endif | |
| 975 | |
| 976 fz_band_writer *fz_new_pdfocr_band_writer(fz_context *ctx, fz_output *out, const fz_pdfocr_options *options) | |
| 977 { | |
| 978 #ifdef OCR_DISABLED | |
| 979 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 980 #else | |
| 981 pdfocr_band_writer *writer = fz_new_band_writer(ctx, pdfocr_band_writer, out); | |
| 982 | |
| 983 writer->super.header = pdfocr_write_header; | |
| 984 writer->super.band = pdfocr_write_band; | |
| 985 writer->super.trailer = pdfocr_write_trailer; | |
| 986 writer->super.close = pdfocr_close_band_writer; | |
| 987 writer->super.drop = pdfocr_drop_band_writer; | |
| 988 | |
| 989 if (options) | |
| 990 writer->options = *options; | |
| 991 else | |
| 992 memset(&writer->options, 0, sizeof(writer->options)); | |
| 993 | |
| 994 /* Objects: | |
| 995 * 1 reserved for catalog | |
| 996 * 2 for pages tree | |
| 997 * 3 font | |
| 998 * 4 cidfont | |
| 999 * 5 cid to gid map | |
| 1000 * 6 tounicode | |
| 1001 * 7 font descriptor | |
| 1002 * 8 font file | |
| 1003 */ | |
| 1004 writer->obj_num = 9; | |
| 1005 | |
| 1006 fz_try(ctx) | |
| 1007 { | |
| 1008 writer->tessapi = ocr_init(ctx, writer->options.language, writer->options.datadir); | |
| 1009 } | |
| 1010 fz_catch(ctx) | |
| 1011 { | |
| 1012 fz_drop_band_writer(ctx, &writer->super); | |
| 1013 fz_rethrow(ctx); | |
| 1014 } | |
| 1015 | |
| 1016 return &writer->super; | |
| 1017 #endif | |
| 1018 } | |
| 1019 | |
| 1020 void | |
| 1021 fz_pdfocr_band_writer_set_progress(fz_context *ctx, fz_band_writer *writer_, fz_pdfocr_progress_fn *progress, void *progress_arg) | |
| 1022 { | |
| 1023 #ifdef OCR_DISABLED | |
| 1024 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 1025 #else | |
| 1026 pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; | |
| 1027 if (writer == NULL) | |
| 1028 return; | |
| 1029 if (writer->super.header != pdfocr_write_header) | |
| 1030 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Not a pdfocr band writer!"); | |
| 1031 | |
| 1032 writer->progress = progress; | |
| 1033 writer->progress_arg = progress_arg; | |
| 1034 #endif | |
| 1035 } | |
| 1036 | |
| 1037 void | |
| 1038 fz_save_pixmap_as_pdfocr(fz_context *ctx, fz_pixmap *pixmap, char *filename, int append, const fz_pdfocr_options *pdfocr) | |
| 1039 { | |
| 1040 #ifdef OCR_DISABLED | |
| 1041 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 1042 #else | |
| 1043 fz_output *out = fz_new_output_with_path(ctx, filename, append); | |
| 1044 fz_try(ctx) | |
| 1045 { | |
| 1046 fz_write_pixmap_as_pdfocr(ctx, out, pixmap, pdfocr); | |
| 1047 fz_close_output(ctx, out); | |
| 1048 } | |
| 1049 fz_always(ctx) | |
| 1050 fz_drop_output(ctx, out); | |
| 1051 fz_catch(ctx) | |
| 1052 fz_rethrow(ctx); | |
| 1053 #endif | |
| 1054 } | |
| 1055 | |
| 1056 /* High-level document writer interface */ | |
| 1057 | |
| 1058 #ifndef OCR_DISABLED | |
/* State for the document-writer front end that renders each page to a
 * pixmap and feeds it to a pdfocr band writer. */
typedef struct
{
	fz_document_writer super; /* base; must stay first, pointers are cast to and from it */
	fz_draw_options draw; /* filled in by fz_parse_draw_options */
	fz_pdfocr_options pdfocr; /* filled in by fz_parse_pdfocr_options */
	fz_pixmap *pixmap; /* render target of the page in progress; NULL between pages */
	fz_band_writer *bander; /* pdfocr band writer that receives each rendered page */
	fz_output *out; /* destination; closed in close, dropped in drop */
	int pagenum; /* page counter handed to fz_write_header, incremented per page */
} fz_pdfocr_writer;
| 1069 | |
| 1070 static fz_device * | |
| 1071 pdfocr_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) | |
| 1072 { | |
| 1073 fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; | |
| 1074 return fz_new_draw_device_with_options(ctx, &wri->draw, mediabox, &wri->pixmap); | |
| 1075 } | |
| 1076 | |
| 1077 static void | |
| 1078 pdfocr_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) | |
| 1079 { | |
| 1080 fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; | |
| 1081 fz_pixmap *pix = wri->pixmap; | |
| 1082 | |
| 1083 fz_try(ctx) | |
| 1084 { | |
| 1085 fz_close_device(ctx, dev); | |
| 1086 fz_write_header(ctx, wri->bander, pix->w, pix->h, pix->n, pix->alpha, pix->xres, pix->yres, wri->pagenum++, pix->colorspace, pix->seps); | |
| 1087 fz_write_band(ctx, wri->bander, pix->stride, pix->h, pix->samples); | |
| 1088 } | |
| 1089 fz_always(ctx) | |
| 1090 { | |
| 1091 fz_drop_device(ctx, dev); | |
| 1092 fz_drop_pixmap(ctx, pix); | |
| 1093 wri->pixmap = NULL; | |
| 1094 } | |
| 1095 fz_catch(ctx) | |
| 1096 fz_rethrow(ctx); | |
| 1097 } | |
| 1098 | |
| 1099 static void | |
| 1100 pdfocr_close_writer(fz_context *ctx, fz_document_writer *wri_) | |
| 1101 { | |
| 1102 fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; | |
| 1103 | |
| 1104 fz_close_band_writer(ctx, wri->bander); | |
| 1105 fz_close_output(ctx, wri->out); | |
| 1106 } | |
| 1107 | |
| 1108 static void | |
| 1109 pdfocr_drop_writer(fz_context *ctx, fz_document_writer *wri_) | |
| 1110 { | |
| 1111 fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; | |
| 1112 | |
| 1113 fz_drop_pixmap(ctx, wri->pixmap); | |
| 1114 fz_drop_band_writer(ctx, wri->bander); | |
| 1115 fz_drop_output(ctx, wri->out); | |
| 1116 } | |
| 1117 #endif | |
| 1118 | |
| 1119 fz_document_writer * | |
| 1120 fz_new_pdfocr_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 1121 { | |
| 1122 #ifdef OCR_DISABLED | |
| 1123 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 1124 #else | |
| 1125 fz_pdfocr_writer *wri = NULL; | |
| 1126 | |
| 1127 fz_var(wri); | |
| 1128 | |
| 1129 fz_try(ctx) | |
| 1130 { | |
| 1131 wri = fz_new_derived_document_writer(ctx, fz_pdfocr_writer, pdfocr_begin_page, pdfocr_end_page, pdfocr_close_writer, pdfocr_drop_writer); | |
| 1132 fz_parse_draw_options(ctx, &wri->draw, options); | |
| 1133 fz_parse_pdfocr_options(ctx, &wri->pdfocr, options); | |
| 1134 wri->out = out; | |
| 1135 wri->bander = fz_new_pdfocr_band_writer(ctx, wri->out, &wri->pdfocr); | |
| 1136 } | |
| 1137 fz_catch(ctx) | |
| 1138 { | |
| 1139 fz_drop_output(ctx, out); | |
| 1140 fz_free(ctx, wri); | |
| 1141 fz_rethrow(ctx); | |
| 1142 } | |
| 1143 | |
| 1144 return (fz_document_writer*)wri; | |
| 1145 #endif | |
| 1146 } | |
| 1147 | |
| 1148 fz_document_writer * | |
| 1149 fz_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options) | |
| 1150 { | |
| 1151 #ifdef OCR_DISABLED | |
| 1152 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 1153 #else | |
| 1154 fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.pdfocr", 0); | |
| 1155 return fz_new_pdfocr_writer_with_output(ctx, out, options); | |
| 1156 #endif | |
| 1157 } | |
| 1158 | |
| 1159 void | |
| 1160 fz_pdfocr_writer_set_progress(fz_context *ctx, fz_document_writer *writer, fz_pdfocr_progress_fn *progress, void *progress_arg) | |
| 1161 { | |
| 1162 #ifdef OCR_DISABLED | |
| 1163 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); | |
| 1164 #else | |
| 1165 fz_pdfocr_writer *wri = (fz_pdfocr_writer *)writer; | |
| 1166 if (!writer) | |
| 1167 return; | |
| 1168 if (writer->begin_page != pdfocr_begin_page) | |
| 1169 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Not a pdfocr writer!"); | |
| 1170 fz_pdfocr_band_writer_set_progress(ctx, wri->bander, progress, progress_arg); | |
| 1171 #endif | |
| 1172 } |
