Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/extract/src/json.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/extract/src/json.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,169 @@
+/* These extract_json_*() functions generate json content data. */
+
+#include "extract/extract.h"
+
+#include "astring.h"
+#include "document.h"
+#include "html.h"
+#include "mem.h"
+#include "memento.h"
+#include "outf.h"
+#include "sys.h"
+#include "text.h"
+#include "zip.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sys/stat.h>
+
+
+
+static int osp(extract_alloc_t *alloc, extract_astring_t *content, structure_t *structure)
+{
+	if (structure->parent)
+	{
+		if (osp(alloc, content, structure->parent) ||
+			extract_astring_catc(alloc, content, '\\'))
+			return -1;
+	}
+
+	if (structure->uid != 0)
+	{
+		if (extract_astring_catf(alloc, content, "%s[%d]", extract_struct_string(structure->type), structure->uid))
+			return -1;
+	}
+	else
+	{
+		if (extract_astring_catf(alloc, content, "%s", extract_struct_string(structure->type)))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int output_structure_path(extract_alloc_t *alloc, extract_astring_t *content, structure_t *structure)
+{
+	if (structure == NULL)
+		return 0;
+
+	if (extract_astring_cat(alloc, content, ",\n\"Path\" : \"") ||
+		osp(alloc, content, structure) ||
+		extract_astring_cat(alloc, content, "\""))
+		return -1;
+	return 0;
+}
+
+static int flush(extract_alloc_t *alloc, extract_astring_t *content, span_t *span, structure_t *structure, extract_astring_t *text, rect_t *bbox)
+{
+	if (span == NULL)
+		return 0;
+	if (content->chars_num)
+		if (extract_astring_cat(alloc, content, ",\n"))
+			return -1;
+	if (extract_astring_catf(alloc, content, "{\n\"Bounds\": [ %f, %f, %f, %f ],\n\"Text\": \"", bbox->min.x, bbox->min.y, bbox->max.x, bbox->max.y) ||
+		extract_astring_catl(alloc, content, text->chars, text->chars_num) ||
+		extract_astring_catf(alloc, content, "\",\n\"Font\": { \"family_name\": \"%s\" },\n\"TextSize\": %g", span->font_name, extract_font_size(&span->ctm)))
+		return -1;
+	if (output_structure_path(alloc, content, structure))
+		return -1;
+	if (extract_astring_cat(alloc, content, "\n}"))
+		return -1;
+	extract_astring_free(alloc, text);
+	*bbox = extract_rect_empty;
+
+	return 0;
+}
+
+int extract_document_to_json_content(
+		extract_alloc_t   *alloc,
+		document_t        *document,
+		int                rotation,
+		int                images,
+		extract_astring_t *content)
+{
+	int ret = -1;
+	int n;
+	content_tree_iterator cti;
+	extract_astring_t text;
+
+	(void) rotation;
+	(void) images;
+
+	extract_astring_init(&text);
+	//extract_astring_cat(alloc, content, "<html>\n");
+	//extract_astring_cat(alloc, content, "<body>\n");
+
+	/* Write paragraphs into <content>. */
+	for (n=0; n<document->pages_num; ++n)
+	{
+		int              i;
+		extract_page_t  *page     = document->pages[n];
+		subpage_t      **psubpage = page->subpages;
+
+		for (i=0; i<page->subpages_num; ++i)
+		{
+			content_t *cont;
+			structure_t *structure = NULL;
+			span_t *last_span = NULL;
+			rect_t bbox = extract_rect_empty;
+
+			for (cont = content_tree_iterator_init(&cti, &psubpage[i]->content); cont != NULL; cont = content_tree_iterator_next(&cti))
+			{
+				switch (cont->type)
+				{
+				case content_span:
+				{
+					int j;
+					span_t *span = (span_t *)cont;
+					if (last_span &&
+						(structure != span->structure ||
+						 last_span->flags.font_bold != span->flags.font_bold ||
+						 last_span->flags.font_italic != span->flags.font_italic ||
+						 last_span->flags.wmode != span->flags.wmode ||
+						 strcmp(last_span->font_name, span->font_name)))
+					{
+						// flush stored text.
+						flush(alloc, content, last_span, structure, &text, &bbox);
+					}
+					last_span = span;
+					structure = span->structure;
+					for (j = 0; j < span->chars_num; j++)
+					{
+						if (span->chars[j].ucs == (unsigned int)-1)
+							continue;
+						if (extract_astring_catc_unicode(alloc, &text, span->chars[j].ucs, 1, 0, 0, 0))
+							goto end;
+						bbox = extract_rect_union(bbox, span->chars[j].bbox);
+					}
+					break;
+				}
+				case content_image:
+				case content_table:
+				case content_block:
+				case content_line:
+				case content_paragraph:
+					 // Nothing to do for lines and paragraphs as they just enclose spans.
+					 // Nothing to do for the others for now.
+					 break;
+				default:
+					 assert("This should never happen\n" == NULL);
+					 break;
+				}
+			}
+			flush(alloc, content, last_span, structure, &text, &bbox);
+		}
+	}
+
+	ret = 0;
+end:
+
+	extract_astring_free(alloc, &text);
+
+	return ret;
+}
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children