comparison mupdf-source/source/fitz/output-csv.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 // Copyright (C) 2024-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24
25 #include <zlib.h>
26
27 #include <limits.h>
28
29 typedef struct
30 {
31 fz_document_writer super;
32 int count;
33 fz_stext_page *page;
34 fz_output *out;
35 fz_stext_options options;
36 int pagenum;
37 } fz_csv_writer;
38
39 static fz_device *
40 csv_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
41 {
42 fz_csv_writer *wri = (fz_csv_writer*)wri_;
43 wri->page = fz_new_stext_page(ctx, mediabox);
44 wri->options.flags |= FZ_STEXT_COLLECT_VECTORS;
45 wri->options.flags |= FZ_STEXT_ACCURATE_BBOXES;
46 wri->options.flags |= FZ_STEXT_SEGMENT;
47 wri->options.flags |= FZ_STEXT_TABLE_HUNT;
48 return fz_new_stext_device(ctx, wri->page, &wri->options);
49 }
50
/*
 * Whitespace state shared by all the text written into one table cell.
 */
typedef struct
{
	/* While non-zero, spaces are discarded instead of being counted.
	 * Cleared as soon as a non-space character is written. */
	int leading;

	/* Number of spaces seen since the last character that was written.
	 * Any non-zero count is emitted as a single space. */
	int spaces;
} space_data;
56
57 static void
58 output_line(fz_context *ctx, fz_output *out, fz_stext_line *line, space_data *sd)
59 {
60 for (; line != NULL; line = line->next)
61 {
62 fz_stext_char *ch;
63
64 for (ch = line->first_char; ch != NULL; ch = ch->next)
65 {
66 if (ch->c == ' ')
67 {
68 if (!sd->leading)
69 sd->spaces++;
70 continue;
71 }
72 sd->leading = 0;
73 /* Compact all runs of spaces to single ones. */
74 if (sd->spaces > 0)
75 {
76 fz_write_printf(ctx, out, " ");
77 sd->spaces = 0;
78 }
79 if (ch->c == '\"')
80 {
81 fz_write_printf(ctx, out, "\"\"");
82 }
83 else
84 {
85 fz_write_printf(ctx, out, "%C", ch->c);
86 }
87 }
88 }
89 }
90
91 static fz_rect
92 whitespaceless_bbox(fz_context *ctx, fz_stext_block *block)
93 {
94 fz_rect r = fz_empty_rect;
95 fz_stext_line *line;
96 fz_stext_char *ch;
97
98 for (; block != NULL; block = block->next)
99 {
100 if (block->type == FZ_STEXT_BLOCK_STRUCT)
101 {
102 if (block->u.s.down)
103 r = fz_union_rect(r, whitespaceless_bbox(ctx, block->u.s.down->first_block));
104 continue;
105 }
106 if (block->type != FZ_STEXT_BLOCK_TEXT)
107 {
108 r = fz_union_rect(r, block->bbox);
109 continue;
110 }
111 for (line = block->u.t.first_line; line != NULL; line = line->next)
112 {
113 for (ch = line->first_char; ch != NULL; ch = ch->next)
114 {
115 if (ch->c != ' ')
116 r = fz_union_rect(r, fz_rect_from_quad(ch->quad));
117 }
118 }
119 }
120
121 return r;
122 }
123
124 static void
125 output_td_contents(fz_context *ctx, fz_output *out, fz_stext_block *block, space_data *sd)
126 {
127 for (; block != NULL; block = block->next)
128 {
129 if (block->type == FZ_STEXT_BLOCK_STRUCT)
130 {
131 if (block->u.s.down)
132 output_td_contents(ctx, out, block->u.s.down->first_block, sd);
133 continue;
134 }
135 if (block->type == FZ_STEXT_BLOCK_TEXT)
136 output_line(ctx, out, block->u.t.first_line, sd);
137 }
138 }
139
140 /* We have output up to and including position *pos on entry to this function.
141 * We preserve that on output. */
142 static void
143 output_td(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, int *pos, fz_stext_block *block)
144 {
145 int x0, x1;
146 space_data sd = { 0 };
147 fz_rect r = whitespaceless_bbox(ctx, block);
148
149 if (fz_is_empty_rect(r))
150 return;
151
152 if (block && grid)
153 {
154
155 for (x0 = 0; x0 < grid->u.b.xs->len; x0++)
156 if (r.x0 < grid->u.b.xs->list[x0].pos)
157 break;
158 for (x1 = x0; x1 < grid->u.b.xs->len; x1++)
159 if (r.x1 <= grid->u.b.xs->list[x1].pos)
160 break;
161 x0--;
162 x1--;
163 }
164 else
165 x0 = *pos+1, x1 = *pos+1;
166
167 /* Send enough , to get us to the right position. */
168 while (*pos < x0)
169 {
170 if (*pos >= 0)
171 fz_write_printf(ctx, wri->out, ",");
172 *pos = (*pos)+1;
173 }
174
175 fz_write_printf(ctx, wri->out, "\"");
176 output_td_contents(ctx, wri->out, block, &sd);
177 fz_write_printf(ctx, wri->out, "\"");
178
179 /* Send any extra , to allow for colspans */
180 while (*pos < x1)
181 {
182 fz_write_printf(ctx, wri->out, ",");
183 *pos = (*pos)+1;
184 }
185 }
186
187 static void
188 output_tr(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, fz_stext_block *block)
189 {
190 int pos = -1;
191
192 for (; block != NULL; block = block->next)
193 {
194 if (block->type == FZ_STEXT_BLOCK_STRUCT)
195 {
196 if (!block->u.s.down)
197 continue;
198 if (block->u.s.down->standard == FZ_STRUCTURE_TD)
199 output_td(ctx, wri, grid, &pos, block->u.s.down->first_block);
200 }
201 }
202
203 if (pos != -1)
204 fz_write_printf(ctx, wri->out, "\n");
205 }
206
207 static void
208 output_table(fz_context *ctx, fz_csv_writer *wri, fz_rect bbox, fz_stext_block *first)
209 {
210 fz_stext_block *block;
211 fz_stext_block *grid = NULL;
212 int rows = 0;
213
214 fz_try(ctx)
215 {
216 /* First, walk to find the div positions */
217 for (block = first; block != NULL; block = block->next)
218 {
219 if (block->type == FZ_STEXT_BLOCK_GRID)
220 {
221 grid = block;
222 break;
223 }
224 }
225
226 /* Then, count the rows */
227 for (block = first; block != NULL; block = block->next)
228 {
229 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down != NULL && block->u.s.down->standard == FZ_STRUCTURE_TR)
230 rows++;
231 }
232
233 fz_write_printf(ctx, wri->out, "Table %d,%d,%d,%g,%g,%g,%g\n",
234 wri->count++,
235 rows,
236 wri->pagenum,
237 bbox.x0, bbox.y0, bbox.x1, bbox.y1);
238
239 /* Then do the output */
240 for (block = first; block != NULL; block = block->next)
241 {
242 if (block->type == FZ_STEXT_BLOCK_STRUCT)
243 {
244 if (!block->u.s.down)
245 continue;
246 if (block->u.s.down->standard == FZ_STRUCTURE_TR)
247 output_tr(ctx, wri, grid, block->u.s.down->first_block);
248 }
249 }
250 }
251 fz_catch(ctx)
252 fz_rethrow(ctx);
253 }
254
255 static void
256 output_tables(fz_context *ctx, fz_csv_writer *wri, fz_stext_page *page, fz_stext_block *block)
257 {
258 for (; block; block = block->next)
259 {
260 if (block->type == FZ_STEXT_BLOCK_STRUCT)
261 {
262 if (!block->u.s.down)
263 continue;
264 if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
265 output_table(ctx, wri, block->bbox, block->u.s.down->first_block);
266 else
267 output_tables(ctx, wri, page, block->u.s.down->first_block);
268 }
269 }
270 }
271
272 static void
273 csv_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
274 {
275 fz_csv_writer *wri = (fz_csv_writer*)wri_;
276
277 fz_try(ctx)
278 {
279 fz_close_device(ctx, dev);
280
281 /* Output UTF-8 BOM */
282 fz_write_printf(ctx, wri->out, "%C", 0xFEFF);
283
284 output_tables(ctx, wri, wri->page, wri->page->first_block);
285 wri->pagenum++;
286 }
287 fz_always(ctx)
288 {
289 fz_drop_device(ctx, dev);
290 }
291 fz_catch(ctx)
292 fz_rethrow(ctx);
293 }
294
295 static void
296 csv_close_writer(fz_context *ctx, fz_document_writer *wri_)
297 {
298 fz_csv_writer *wri = (fz_csv_writer*)wri_;
299 fz_close_output(ctx, wri->out);
300 }
301
302 static void
303 csv_drop_writer(fz_context *ctx, fz_document_writer *wri_)
304 {
305 fz_csv_writer *wri = (fz_csv_writer*)wri_;
306 fz_drop_output(ctx, wri->out);
307 }
308
309 fz_document_writer *
310 fz_new_csv_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
311 {
312 fz_csv_writer *wri = NULL;
313
314 fz_var(wri);
315 fz_var(out);
316
317 fz_try(ctx)
318 {
319 wri = fz_new_derived_document_writer(ctx, fz_csv_writer, csv_begin_page, csv_end_page, csv_close_writer, csv_drop_writer);
320 fz_parse_stext_options(ctx, &wri->options, options);
321 wri->out = out;
322 }
323 fz_catch(ctx)
324 {
325 fz_drop_output(ctx, out);
326 fz_free(ctx, wri);
327 fz_rethrow(ctx);
328 }
329 return (fz_document_writer*)wri;
330 }
331
332 fz_document_writer *
333 fz_new_csv_writer(fz_context *ctx, const char *path, const char *options)
334 {
335 fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.csv", 0);
336 fz_document_writer *wri = NULL;
337 fz_try(ctx)
338 wri = fz_new_csv_writer_with_output(ctx, out, options);
339 fz_catch(ctx)
340 {
341 fz_drop_output(ctx, out);
342 fz_rethrow(ctx);
343 }
344 return wri;
345 }