diff mupdf-source/source/fitz/output-csv.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/fitz/output-csv.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,345 @@
+// Copyright (C) 2024-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+
+#include <zlib.h>
+
+#include <limits.h>
+
+typedef struct /* Per-writer state for the CSV table writer. */
+{
+	fz_document_writer super; /* Base writer; callbacks cast the fz_document_writer pointer they receive back to this struct. */
+	int count; /* Table number; post-incremented each time a "Table ..." header line is written. */
+	fz_stext_page *page; /* Structured text page created when a page is begun and scanned for tables when it ends. */
+	fz_output *out; /* Stream that all CSV text is written to. */
+	fz_stext_options options; /* Options parsed from the caller's option string; extra flags are OR'ed in when a page is begun. */
+	int pagenum; /* Page number reported in table headers; incremented after each page's tables are written. */
+} fz_csv_writer;
+
+/*
+ * Begin a page: create the structured text page that will collect the
+ * page contents, and return a structured text device that fills it.
+ *
+ * The vector-collection, accurate-bbox, segmentation and table-hunt
+ * flags are OR'ed into the writer's stored options on every call, in
+ * addition to whatever the caller's option string selected.
+ */
+static fz_device *
+csv_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
+{
+	fz_csv_writer *wri = (fz_csv_writer*)wri_;
+
+	/* Release any page still held from an earlier begin_page so that
+	 * overwriting the pointer below cannot leak it. Dropping NULL is
+	 * harmless. */
+	fz_drop_stext_page(ctx, wri->page);
+	wri->page = NULL;
+
+	wri->page = fz_new_stext_page(ctx, mediabox);
+	wri->options.flags |=
+		FZ_STEXT_COLLECT_VECTORS |
+		FZ_STEXT_ACCURATE_BBOXES |
+		FZ_STEXT_SEGMENT |
+		FZ_STEXT_TABLE_HUNT;
+
+	return fz_new_stext_device(ctx, wri->page, &wri->options);
+}
+
+typedef struct /* Space-collapsing state shared by every line and block written for one cell. */
+{
+	int leading; /* While non-zero, spaces are not counted; cleared as soon as a non-space character is seen. */
+	int spaces; /* Spaces counted since the last character written; flushed as one space before the next non-space character. */
+} space_data;
+
+/*
+ * Write the text of a chain of lines (the given line and every line
+ * linked after it) as the inside of a double-quoted CSV field.
+ *
+ * Spaces are never written directly. They are counted in sd (unless
+ * sd->leading is set) and a single space is written just before the
+ * next non-space character, so runs of spaces collapse to one and
+ * spaces with nothing after them are dropped. A double quote character
+ * is written twice; every other character is written with %C.
+ *
+ * sd is updated in place so that the state carries over to later calls.
+ */
+static void
+output_line(fz_context *ctx, fz_output *out, fz_stext_line *line, space_data *sd)
+{
+	fz_stext_char *ch;
+
+	while (line != NULL)
+	{
+		for (ch = line->first_char; ch != NULL; ch = ch->next)
+		{
+			if (ch->c != ' ')
+			{
+				sd->leading = 0;
+
+				/* Flush any pending run of spaces as one space. */
+				if (sd->spaces > 0)
+				{
+					fz_write_printf(ctx, out, " ");
+					sd->spaces = 0;
+				}
+
+				/* Quotes inside a quoted field are doubled. */
+				if (ch->c == '\"')
+					fz_write_printf(ctx, out, "\"\"");
+				else
+					fz_write_printf(ctx, out, "%C", ch->c);
+			}
+			else if (!sd->leading)
+			{
+				sd->spaces++;
+			}
+		}
+		line = line->next;
+	}
+}
+
+/*
+ * Compute the union of the bounding boxes of everything in a chain of
+ * blocks (the given block and every block linked after it), ignoring
+ * space characters.
+ *
+ * Structure blocks are descended into recursively; text blocks
+ * contribute the quad of each non-space character; every other block
+ * type contributes its whole bbox. Returns fz_empty_rect unchanged if
+ * nothing contributes.
+ */
+static fz_rect
+whitespaceless_bbox(fz_context *ctx, fz_stext_block *block)
+{
+	fz_rect bounds = fz_empty_rect;
+
+	while (block != NULL)
+	{
+		switch (block->type)
+		{
+		case FZ_STEXT_BLOCK_STRUCT:
+			if (block->u.s.down)
+				bounds = fz_union_rect(bounds, whitespaceless_bbox(ctx, block->u.s.down->first_block));
+			break;
+
+		case FZ_STEXT_BLOCK_TEXT:
+		{
+			fz_stext_line *ln;
+			fz_stext_char *c;
+
+			for (ln = block->u.t.first_line; ln != NULL; ln = ln->next)
+			{
+				for (c = ln->first_char; c != NULL; c = c->next)
+				{
+					if (c->c == ' ')
+						continue;
+					bounds = fz_union_rect(bounds, fz_rect_from_quad(c->quad));
+				}
+			}
+			break;
+		}
+
+		default:
+			bounds = fz_union_rect(bounds, block->bbox);
+			break;
+		}
+
+		block = block->next;
+	}
+
+	return bounds;
+}
+
+/*
+ * Write the text found in a chain of blocks (the given block and every
+ * block linked after it) via output_line(), descending recursively
+ * into structure blocks. Blocks that are neither structure nor text
+ * blocks are ignored. sd carries the space-collapsing state through.
+ */
+static void
+output_td_contents(fz_context *ctx, fz_output *out, fz_stext_block *block, space_data *sd)
+{
+	while (block != NULL)
+	{
+		if (block->type == FZ_STEXT_BLOCK_TEXT)
+		{
+			output_line(ctx, out, block->u.t.first_line, sd);
+		}
+		else if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
+		{
+			output_td_contents(ctx, out, block->u.s.down->first_block, sd);
+		}
+		block = block->next;
+	}
+}
+
+/*
+ * Write one table cell (the chain of blocks starting at block) as a
+ * double-quoted CSV field, with the commas needed to put it in the
+ * right column.
+ *
+ * On entry *pos is the index of the last column already written in
+ * this row (-1 if none); on exit it is the index of the last column
+ * this cell accounts for, so the same invariant holds for the next
+ * call.
+ *
+ * When a grid block is supplied, the cell's column range is found by
+ * comparing the left and right edges of its non-space bounding box
+ * with the grid's x positions. Without a grid the cell simply goes in
+ * the next column. A cell whose non-space bounding box is empty writes
+ * nothing and leaves *pos untouched.
+ */
+static void
+output_td(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, int *pos, fz_stext_block *block)
+{
+	/* Start with "leading" set so that spaces before the first visible
+	 * character of the cell are discarded, matching the way trailing
+	 * spaces are already dropped. With a zero-initialised state the
+	 * flag would never be set and would have no effect. */
+	space_data sd = { 1, 0 };
+	fz_rect r = whitespaceless_bbox(ctx, block);
+	int first, last;
+
+	if (fz_is_empty_rect(r))
+		return;
+
+	if (block == NULL || grid == NULL)
+	{
+		/* No grid to consult: occupy exactly the next column. */
+		first = *pos + 1;
+		last = first;
+	}
+	else
+	{
+		/* Find the first grid position strictly right of the left
+		 * edge, then the first one at or right of the right edge. */
+		first = 0;
+		while (first < grid->u.b.xs->len && !(r.x0 < grid->u.b.xs->list[first].pos))
+			first++;
+		last = first;
+		while (last < grid->u.b.xs->len && !(r.x1 <= grid->u.b.xs->list[last].pos))
+			last++;
+
+		/* Step back from the bounding position to the column index. */
+		first--;
+		last--;
+	}
+
+	/* Send enough , to get us to the right position. No comma is
+	 * written before the very first column of the row. */
+	while (*pos < first)
+	{
+		if (*pos >= 0)
+			fz_write_printf(ctx, wri->out, ",");
+		*pos = (*pos)+1;
+	}
+
+	fz_write_printf(ctx, wri->out, "\"");
+	output_td_contents(ctx, wri->out, block, &sd);
+	fz_write_printf(ctx, wri->out, "\"");
+
+	/* Send any extra , to allow for colspans */
+	while (*pos < last)
+	{
+		fz_write_printf(ctx, wri->out, ",");
+		*pos = (*pos)+1;
+	}
+}
+
+/*
+ * Write one table row. Every structure block in the chain starting at
+ * block whose structure element is a TD is written as a cell with
+ * output_td(); everything else is skipped. A newline ends the row,
+ * but only if at least one cell advanced the column position.
+ */
+static void
+output_tr(fz_context *ctx, fz_csv_writer *wri, fz_stext_block *grid, fz_stext_block *block)
+{
+	/* Index of the last column written; -1 means none yet. */
+	int column = -1;
+
+	while (block != NULL)
+	{
+		if (block->type == FZ_STEXT_BLOCK_STRUCT &&
+			block->u.s.down &&
+			block->u.s.down->standard == FZ_STRUCTURE_TD)
+		{
+			output_td(ctx, wri, grid, &column, block->u.s.down->first_block);
+		}
+		block = block->next;
+	}
+
+	if (column != -1)
+		fz_write_printf(ctx, wri->out, "\n");
+}
+
+/*
+ * Write one table: a header line followed by one CSV line per row.
+ *
+ * The header line is "Table" plus the table number, then the number of
+ * rows, the page number and the four coordinates of bbox, separated by
+ * commas. The table number is taken from wri->count, which is
+ * incremented.
+ *
+ * first is the first block below the TABLE structure element. The
+ * first grid block found in that chain (if any) is handed to the row
+ * writer to position cells in columns.
+ *
+ * Nothing here needs cleaning up on error, so no fz_try is used and
+ * any exception propagates straight to the caller.
+ */
+static void
+output_table(fz_context *ctx, fz_csv_writer *wri, fz_rect bbox, fz_stext_block *first)
+{
+	fz_stext_block *block;
+	fz_stext_block *grid = NULL;
+	int rows = 0;
+
+	/* Single pass: remember the first grid block and count the rows. */
+	for (block = first; block != NULL; block = block->next)
+	{
+		if (block->type == FZ_STEXT_BLOCK_GRID)
+		{
+			if (grid == NULL)
+				grid = block;
+		}
+		else if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down != NULL && block->u.s.down->standard == FZ_STRUCTURE_TR)
+		{
+			rows++;
+		}
+	}
+
+	fz_write_printf(ctx, wri->out, "Table %d,%d,%d,%g,%g,%g,%g\n",
+		wri->count++,
+		rows,
+		wri->pagenum,
+		bbox.x0, bbox.y0, bbox.x1, bbox.y1);
+
+	/* Then do the output, one line per TR element. */
+	for (block = first; block != NULL; block = block->next)
+	{
+		if (block->type != FZ_STEXT_BLOCK_STRUCT || !block->u.s.down)
+			continue;
+		if (block->u.s.down->standard == FZ_STRUCTURE_TR)
+			output_tr(ctx, wri, grid, block->u.s.down->first_block);
+	}
+}
+
+/*
+ * Walk a chain of blocks looking for tables. A structure block whose
+ * element is a TABLE is written with output_table(), using the
+ * block's bbox; any other structure block is searched recursively.
+ * Non-structure blocks, and structure blocks with nothing below them,
+ * are skipped. page is only passed along to the recursive calls.
+ */
+static void
+output_tables(fz_context *ctx, fz_csv_writer *wri, fz_stext_page *page, fz_stext_block *block)
+{
+	while (block)
+	{
+		if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
+		{
+			if (block->u.s.down->standard != FZ_STRUCTURE_TABLE)
+				output_tables(ctx, wri, page, block->u.s.down->first_block);
+			else
+				output_table(ctx, wri, block->bbox, block->u.s.down->first_block);
+		}
+		block = block->next;
+	}
+}
+
+/*
+ * End a page: close the device, write every table found on the
+ * structured text page, and advance the page counter.
+ *
+ * The device and the structured text page are both released whether or
+ * not an error occurs; any error is then rethrown to the caller.
+ */
+static void
+csv_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
+{
+	fz_csv_writer *wri = (fz_csv_writer*)wri_;
+
+	fz_try(ctx)
+	{
+		fz_close_device(ctx, dev);
+
+		/* Output UTF-8 BOM. A byte order mark only belongs at the very
+		 * start of the stream, so write it for the first page only;
+		 * otherwise it would turn up in the middle of the file ahead
+		 * of every later page's tables. */
+		if (wri->pagenum == 0)
+			fz_write_printf(ctx, wri->out, "%C", 0xFEFF);
+
+		output_tables(ctx, wri, wri->page, wri->page->first_block);
+		wri->pagenum++;
+	}
+	fz_always(ctx)
+	{
+		fz_drop_device(ctx, dev);
+		/* The page is finished with once its tables are written;
+		 * drop it here so it is not leaked. */
+		fz_drop_stext_page(ctx, wri->page);
+		wri->page = NULL;
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+/*
+ * Close callback for the writer: closes the output stream the CSV text
+ * was written to. The stream itself is released by the drop callback.
+ */
+static void
+csv_close_writer(fz_context *ctx, fz_document_writer *wri_)
+{
+	fz_csv_writer *csv = (fz_csv_writer*)wri_;
+
+	fz_close_output(ctx, csv->out);
+}
+
+/*
+ * Drop callback for the writer: releases the resources the writer
+ * holds, namely any structured text page that is still pending (for
+ * example if the writer is dropped between begin_page and end_page)
+ * and the output stream.
+ */
+static void
+csv_drop_writer(fz_context *ctx, fz_document_writer *wri_)
+{
+	fz_csv_writer *wri = (fz_csv_writer*)wri_;
+
+	/* Dropping NULL is harmless, so no check is needed. */
+	fz_drop_stext_page(ctx, wri->page);
+	wri->page = NULL;
+	fz_drop_output(ctx, wri->out);
+}
+
+/*
+ * Create a CSV document writer that writes to an existing output
+ * stream.
+ *
+ * options is parsed as structured text options. Ownership of out
+ * passes to this function: on success it is stored in the writer (and
+ * dropped by the writer's drop callback); on failure it is dropped
+ * here, the partially built writer is freed, and the error is
+ * rethrown.
+ */
+fz_document_writer *
+fz_new_csv_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
+{
+	fz_csv_writer *writer = NULL;
+
+	fz_var(out);
+	fz_var(writer);
+
+	fz_try(ctx)
+	{
+		writer = fz_new_derived_document_writer(ctx, fz_csv_writer, csv_begin_page, csv_end_page, csv_close_writer, csv_drop_writer);
+		fz_parse_stext_options(ctx, &writer->options, options);
+		writer->out = out;
+	}
+	fz_catch(ctx)
+	{
+		/* We own out, so release it along with the writer. */
+		fz_drop_output(ctx, out);
+		fz_free(ctx, writer);
+		fz_rethrow(ctx);
+	}
+
+	return (fz_document_writer*)writer;
+}
+
+/*
+ * Create a CSV document writer that writes to a file.
+ *
+ * path is the file to create; if it is NULL, "out.csv" is used.
+ * options is passed on to fz_new_csv_writer_with_output().
+ */
+fz_document_writer *
+fz_new_csv_writer(fz_context *ctx, const char *path, const char *options)
+{
+	fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.csv", 0);
+
+	/* fz_new_csv_writer_with_output() takes ownership of out and drops
+	 * it itself if it fails. Catching the error here and dropping out
+	 * again would free it twice, so just let any error propagate. */
+	return fz_new_csv_writer_with_output(ctx, out, options);
+}