Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/output-csv.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 // Copyright (C) 2024-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 | |
| 25 #include <zlib.h> | |
| 26 | |
| 27 #include <limits.h> | |
| 28 | |
| 29 typedef struct | |
| 30 { | |
| 31 fz_document_writer super; | |
| 32 int count; | |
| 33 fz_stext_page *page; | |
| 34 fz_output *out; | |
| 35 fz_stext_options options; | |
| 36 int pagenum; | |
| 37 } fz_csv_writer; | |
| 38 | |
| 39 static fz_device * | |
| 40 csv_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) | |
| 41 { | |
| 42 fz_csv_writer *wri = (fz_csv_writer*)wri_; | |
| 43 wri->page = fz_new_stext_page(ctx, mediabox); | |
| 44 wri->options.flags |= FZ_STEXT_COLLECT_VECTORS; | |
| 45 wri->options.flags |= FZ_STEXT_ACCURATE_BBOXES; | |
| 46 wri->options.flags |= FZ_STEXT_SEGMENT; | |
| 47 wri->options.flags |= FZ_STEXT_TABLE_HUNT; | |
| 48 return fz_new_stext_device(ctx, wri->page, &wri->options); | |
| 49 } | |
| 50 | |
/* Whitespace bookkeeping shared by all the text lines of one table cell. */
typedef struct
{
	/* While non-zero, spaces that are encountered are not counted. It is
	 * cleared as soon as a non-space character is seen. */
	int leading;
	/* Number of spaces seen since the last character written; a single
	 * space is emitted in their place before the next non-space one. */
	int spaces;
} space_data;
| 56 | |
| 57 static void | |
| 58 output_line(fz_context *ctx, fz_output *out, fz_stext_line *line, space_data *sd) | |
| 59 { | |
| 60 for (; line != NULL; line = line->next) | |
| 61 { | |
| 62 fz_stext_char *ch; | |
| 63 | |
| 64 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 65 { | |
| 66 if (ch->c == ' ') | |
| 67 { | |
| 68 if (!sd->leading) | |
| 69 sd->spaces++; | |
| 70 continue; | |
| 71 } | |
| 72 sd->leading = 0; | |
| 73 /* Compact all runs of spaces to single ones. */ | |
| 74 if (sd->spaces > 0) | |
| 75 { | |
| 76 fz_write_printf(ctx, out, " "); | |
| 77 sd->spaces = 0; | |
| 78 } | |
| 79 if (ch->c == '\"') | |
| 80 { | |
| 81 fz_write_printf(ctx, out, "\"\""); | |
| 82 } | |
| 83 else | |
| 84 { | |
| 85 fz_write_printf(ctx, out, "%C", ch->c); | |
| 86 } | |
| 87 } | |
| 88 } | |
| 89 } | |
| 90 | |
| 91 static fz_rect | |
| 92 whitespaceless_bbox(fz_context *ctx, fz_stext_block *block) | |
| 93 { | |
| 94 fz_rect r = fz_empty_rect; | |
| 95 fz_stext_line *line; | |
| 96 fz_stext_char *ch; | |
| 97 | |
| 98 for (; block != NULL; block = block->next) | |
| 99 { | |
| 100 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 101 { | |
| 102 if (block->u.s.down) | |
| 103 r = fz_union_rect(r, whitespaceless_bbox(ctx, block->u.s.down->first_block)); | |
| 104 continue; | |
| 105 } | |
| 106 if (block->type != FZ_STEXT_BLOCK_TEXT) | |
| 107 { | |
| 108 r = fz_union_rect(r, block->bbox); | |
| 109 continue; | |
| 110 } | |
| 111 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 112 { | |
| 113 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 114 { | |
| 115 if (ch->c != ' ') | |
| 116 r = fz_union_rect(r, fz_rect_from_quad(ch->quad)); | |
| 117 } | |
| 118 } | |
| 119 } | |
| 120 | |
| 121 return r; | |
| 122 } | |
| 123 | |
| 124 static void | |
| 125 output_td_contents(fz_context *ctx, fz_output *out, fz_stext_block *block, space_data *sd) | |
| 126 { | |
| 127 for (; block != NULL; block = block->next) | |
| 128 { | |
| 129 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 130 { | |
| 131 if (block->u.s.down) | |
| 132 output_td_contents(ctx, out, block->u.s.down->first_block, sd); | |
| 133 continue; | |
| 134 } | |
| 135 if (block->type == FZ_STEXT_BLOCK_TEXT) | |
| 136 output_line(ctx, out, block->u.t.first_line, sd); | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 /* We have output up to and including position *pos on entry to this function. | |
| 141 * We preserve that on output. */ | |
| 142 static void | |
| 143 output_td(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, int *pos, fz_stext_block *block) | |
| 144 { | |
| 145 int x0, x1; | |
| 146 space_data sd = { 0 }; | |
| 147 fz_rect r = whitespaceless_bbox(ctx, block); | |
| 148 | |
| 149 if (fz_is_empty_rect(r)) | |
| 150 return; | |
| 151 | |
| 152 if (block && grid) | |
| 153 { | |
| 154 | |
| 155 for (x0 = 0; x0 < grid->u.b.xs->len; x0++) | |
| 156 if (r.x0 < grid->u.b.xs->list[x0].pos) | |
| 157 break; | |
| 158 for (x1 = x0; x1 < grid->u.b.xs->len; x1++) | |
| 159 if (r.x1 <= grid->u.b.xs->list[x1].pos) | |
| 160 break; | |
| 161 x0--; | |
| 162 x1--; | |
| 163 } | |
| 164 else | |
| 165 x0 = *pos+1, x1 = *pos+1; | |
| 166 | |
| 167 /* Send enough , to get us to the right position. */ | |
| 168 while (*pos < x0) | |
| 169 { | |
| 170 if (*pos >= 0) | |
| 171 fz_write_printf(ctx, wri->out, ","); | |
| 172 *pos = (*pos)+1; | |
| 173 } | |
| 174 | |
| 175 fz_write_printf(ctx, wri->out, "\""); | |
| 176 output_td_contents(ctx, wri->out, block, &sd); | |
| 177 fz_write_printf(ctx, wri->out, "\""); | |
| 178 | |
| 179 /* Send any extra , to allow for colspans */ | |
| 180 while (*pos < x1) | |
| 181 { | |
| 182 fz_write_printf(ctx, wri->out, ","); | |
| 183 *pos = (*pos)+1; | |
| 184 } | |
| 185 } | |
| 186 | |
| 187 static void | |
| 188 output_tr(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, fz_stext_block *block) | |
| 189 { | |
| 190 int pos = -1; | |
| 191 | |
| 192 for (; block != NULL; block = block->next) | |
| 193 { | |
| 194 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 195 { | |
| 196 if (!block->u.s.down) | |
| 197 continue; | |
| 198 if (block->u.s.down->standard == FZ_STRUCTURE_TD) | |
| 199 output_td(ctx, wri, grid, &pos, block->u.s.down->first_block); | |
| 200 } | |
| 201 } | |
| 202 | |
| 203 if (pos != -1) | |
| 204 fz_write_printf(ctx, wri->out, "\n"); | |
| 205 } | |
| 206 | |
| 207 static void | |
| 208 output_table(fz_context *ctx, fz_csv_writer *wri, fz_rect bbox, fz_stext_block *first) | |
| 209 { | |
| 210 fz_stext_block *block; | |
| 211 fz_stext_block *grid = NULL; | |
| 212 int rows = 0; | |
| 213 | |
| 214 fz_try(ctx) | |
| 215 { | |
| 216 /* First, walk to find the div positions */ | |
| 217 for (block = first; block != NULL; block = block->next) | |
| 218 { | |
| 219 if (block->type == FZ_STEXT_BLOCK_GRID) | |
| 220 { | |
| 221 grid = block; | |
| 222 break; | |
| 223 } | |
| 224 } | |
| 225 | |
| 226 /* Then, count the rows */ | |
| 227 for (block = first; block != NULL; block = block->next) | |
| 228 { | |
| 229 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down != NULL && block->u.s.down->standard == FZ_STRUCTURE_TR) | |
| 230 rows++; | |
| 231 } | |
| 232 | |
| 233 fz_write_printf(ctx, wri->out, "Table %d,%d,%d,%g,%g,%g,%g\n", | |
| 234 wri->count++, | |
| 235 rows, | |
| 236 wri->pagenum, | |
| 237 bbox.x0, bbox.y0, bbox.x1, bbox.y1); | |
| 238 | |
| 239 /* Then do the output */ | |
| 240 for (block = first; block != NULL; block = block->next) | |
| 241 { | |
| 242 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 243 { | |
| 244 if (!block->u.s.down) | |
| 245 continue; | |
| 246 if (block->u.s.down->standard == FZ_STRUCTURE_TR) | |
| 247 output_tr(ctx, wri, grid, block->u.s.down->first_block); | |
| 248 } | |
| 249 } | |
| 250 } | |
| 251 fz_catch(ctx) | |
| 252 fz_rethrow(ctx); | |
| 253 } | |
| 254 | |
| 255 static void | |
| 256 output_tables(fz_context *ctx, fz_csv_writer *wri, fz_stext_page *page, fz_stext_block *block) | |
| 257 { | |
| 258 for (; block; block = block->next) | |
| 259 { | |
| 260 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 261 { | |
| 262 if (!block->u.s.down) | |
| 263 continue; | |
| 264 if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) | |
| 265 output_table(ctx, wri, block->bbox, block->u.s.down->first_block); | |
| 266 else | |
| 267 output_tables(ctx, wri, page, block->u.s.down->first_block); | |
| 268 } | |
| 269 } | |
| 270 } | |
| 271 | |
| 272 static void | |
| 273 csv_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) | |
| 274 { | |
| 275 fz_csv_writer *wri = (fz_csv_writer*)wri_; | |
| 276 | |
| 277 fz_try(ctx) | |
| 278 { | |
| 279 fz_close_device(ctx, dev); | |
| 280 | |
| 281 /* Output UTF-8 BOM */ | |
| 282 fz_write_printf(ctx, wri->out, "%C", 0xFEFF); | |
| 283 | |
| 284 output_tables(ctx, wri, wri->page, wri->page->first_block); | |
| 285 wri->pagenum++; | |
| 286 } | |
| 287 fz_always(ctx) | |
| 288 { | |
| 289 fz_drop_device(ctx, dev); | |
| 290 } | |
| 291 fz_catch(ctx) | |
| 292 fz_rethrow(ctx); | |
| 293 } | |
| 294 | |
| 295 static void | |
| 296 csv_close_writer(fz_context *ctx, fz_document_writer *wri_) | |
| 297 { | |
| 298 fz_csv_writer *wri = (fz_csv_writer*)wri_; | |
| 299 fz_close_output(ctx, wri->out); | |
| 300 } | |
| 301 | |
| 302 static void | |
| 303 csv_drop_writer(fz_context *ctx, fz_document_writer *wri_) | |
| 304 { | |
| 305 fz_csv_writer *wri = (fz_csv_writer*)wri_; | |
| 306 fz_drop_output(ctx, wri->out); | |
| 307 } | |
| 308 | |
| 309 fz_document_writer * | |
| 310 fz_new_csv_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 311 { | |
| 312 fz_csv_writer *wri = NULL; | |
| 313 | |
| 314 fz_var(wri); | |
| 315 fz_var(out); | |
| 316 | |
| 317 fz_try(ctx) | |
| 318 { | |
| 319 wri = fz_new_derived_document_writer(ctx, fz_csv_writer, csv_begin_page, csv_end_page, csv_close_writer, csv_drop_writer); | |
| 320 fz_parse_stext_options(ctx, &wri->options, options); | |
| 321 wri->out = out; | |
| 322 } | |
| 323 fz_catch(ctx) | |
| 324 { | |
| 325 fz_drop_output(ctx, out); | |
| 326 fz_free(ctx, wri); | |
| 327 fz_rethrow(ctx); | |
| 328 } | |
| 329 return (fz_document_writer*)wri; | |
| 330 } | |
| 331 | |
| 332 fz_document_writer * | |
| 333 fz_new_csv_writer(fz_context *ctx, const char *path, const char *options) | |
| 334 { | |
| 335 fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.csv", 0); | |
| 336 fz_document_writer *wri = NULL; | |
| 337 fz_try(ctx) | |
| 338 wri = fz_new_csv_writer_with_output(ctx, out, options); | |
| 339 fz_catch(ctx) | |
| 340 { | |
| 341 fz_drop_output(ctx, out); | |
| 342 fz_rethrow(ctx); | |
| 343 } | |
| 344 return wri; | |
| 345 } |
