annotate tests/test_textextract.py @ 46:7ee69f120f19 default tip

>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Oct 2025 17:17:30 +0200
parents a6bc019ac0b2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
1 """
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
2 Extract page text in various formats.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
3 No checks performed - just contribute to code coverage.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
4 """
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
5 import os
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
6 import platform
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
7 import sys
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
8 import textwrap
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
9
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
10 import pymupdf
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
11
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
12 import gentle_compare
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
13
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
14
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
15 pymupdfdir = os.path.abspath(f'{__file__}/../..')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
16 scriptdir = f'{pymupdfdir}/tests'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
17 filename = os.path.join(scriptdir, "resources", "symbol-list.pdf")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
18
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
19
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
20 def test_extract1():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
21 doc = pymupdf.open(filename)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
22 page = doc[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
23 text = page.get_text("text")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
24 blocks = page.get_text("blocks")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
25 words = page.get_text("words")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
26 d1 = page.get_text("dict")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
27 d2 = page.get_text("json")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
28 d3 = page.get_text("rawdict")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
29 d3 = page.get_text("rawjson")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
30 text = page.get_text("html")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
31 text = page.get_text("xhtml")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
32 text = page.get_text("xml")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
33 rects = pymupdf.get_highlight_selection(page, start=page.rect.tl, stop=page.rect.br)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
34 text = pymupdf.ConversionHeader("xml")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
35 text = pymupdf.ConversionTrailer("xml")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
36
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
37 def _test_extract2():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
38 import sys
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
39 import time
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
40 path = f'{scriptdir}/../../PyMuPDF-performance/adobe.pdf'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
41 if not os.path.exists(path):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
42 print(f'test_extract2(): not running because does not exist: {path}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
43 return
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
44 doc = pymupdf.open( path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
45 for opt in (
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
46 'dict',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
47 'dict2',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
48 'text',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
49 'blocks',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
50 'words',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
51 'html',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
52 'xhtml',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
53 'xml',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
54 'json',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
55 'rawdict',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
56 'rawjson',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
57 ):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
58 for flags in None, pymupdf.TEXTFLAGS_TEXT:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
59 t0 = time.time()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
60 for page in doc:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
61 page.get_text(opt, flags=flags)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
62 t = time.time() - t0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
63 print(f't={t:.02f}: opt={opt} flags={flags}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
64 sys.stdout.flush()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
65
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
66 def _test_extract3():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
67 import sys
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
68 import time
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
69 path = f'{scriptdir}/../../PyMuPDF-performance/adobe.pdf'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
70 if not os.path.exists(path):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
71 print(f'test_extract3(): not running because does not exist: {path}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
72 return
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
73 doc = pymupdf.open( path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
74 t0 = time.time()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
75 for page in doc:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
76 page.get_text('json')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
77 t = time.time() - t0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
78 print(f't={t}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
79 sys.stdout.flush()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
80
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
81 def test_extract4():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
82 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
83 Rebased-specific.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
84 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
85 if not hasattr(pymupdf, 'mupdf'):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
86 return
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
87 path = f'{pymupdfdir}/tests/resources/2.pdf'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
88 document = pymupdf.open(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
89 page = document[4]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
90
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
91 out = 'test_stext.html'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
92 text = page.get_text('html')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
93 with open(out, 'w') as f:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
94 f.write(text)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
95 print(f'Have written to: {out}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
96
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
97 out = 'test_extract.html'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
98 writer = pymupdf.mupdf.FzDocumentWriter(
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
99 out,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
100 'html',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
101 pymupdf.mupdf.FzDocumentWriter.OutputType_DOCX,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
102 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
103 device = pymupdf.mupdf.fz_begin_page(writer, pymupdf.mupdf.fz_bound_page(page))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
104 pymupdf.mupdf.fz_run_page(page, device, pymupdf.mupdf.FzMatrix(), pymupdf.mupdf.FzCookie())
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
105 pymupdf.mupdf.fz_end_page(writer)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
106 pymupdf.mupdf.fz_close_document_writer(writer)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
107 print(f'Have written to: {out}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
108
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
109 def get_text(page, space_guess):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
110 buffer_ = pymupdf.mupdf.FzBuffer( 10)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
111 out = pymupdf.mupdf.FzOutput( buffer_)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
112 writer = pymupdf.mupdf.FzDocumentWriter(
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
113 out,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
114 'text,space-guess={space_guess}',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
115 pymupdf.mupdf.FzDocumentWriter.OutputType_DOCX,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
116 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
117 device = pymupdf.mupdf.fz_begin_page(writer, pymupdf.mupdf.fz_bound_page(page))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
118 pymupdf.mupdf.fz_run_page(page, device, pymupdf.mupdf.FzMatrix(), pymupdf.mupdf.FzCookie())
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
119 pymupdf.mupdf.fz_end_page(writer)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
120 pymupdf.mupdf.fz_close_document_writer(writer)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
121 text = buffer_.fz_buffer_extract()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
122 text = text.decode('utf8')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
123 n = text.count(' ')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
124 print(f'{space_guess=}: {n=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
125 return text, n
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
126 page = document[4]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
127 text0, n0 = get_text(page, 0)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
128 text1, n1 = get_text(page, 0.5)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
129 text2, n2 = get_text(page, 0.001)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
130 text2, n2 = get_text(page, 0.1)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
131 text2, n2 = get_text(page, 0.3)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
132 text2, n2 = get_text(page, 0.9)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
133 text2, n2 = get_text(page, 5.9)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
134 assert text1 == text0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
135
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
136 def test_2954():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
137 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
138 Check handling of unknown unicode characters, issue #2954, fixed in
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
139 mupdf-1.23.9 with addition of FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
140 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
141 path = os.path.abspath(f'{__file__}/../../tests/resources/test_2954.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
142 flags0 = (0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
143 | pymupdf.TEXT_PRESERVE_WHITESPACE
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
144 | pymupdf.TEXT_PRESERVE_LIGATURES
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
145 | pymupdf.TEXT_MEDIABOX_CLIP
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
146 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
147
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
148 document = pymupdf.Document(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
149
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
150 expected_good = (
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
151 "IT-204-IP (2021) Page 3 of 5\nNYPA2514 12/06/21\nPartner's share of \n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
152 " modifications (see instructions)\n20\n State additions\nNumber\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
153 "A ' Total amount\nB '\n State allocated amount\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
154 "EA '\n20a\nEA '\n20b\nEA '\n20c\nEA '\n20d\nEA '\n20e\nEA '\n20f\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
155 "Total addition modifications (total of column A, lines 20a through 20f)\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
156 ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . \n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
157 "21\n21\n22\n State subtractions\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
158 "Number\nA ' Total amount\nB '\n State allocated amount\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
159 "ES '\n22a\nES '\n22b\nES '\n22c\nES '\n22d\nES '\n22e\nES '\n22f\n23\n23\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
160 "Total subtraction modifications (total of column A, lines 22a through 22f). . . . . . . . . . . . . . . . . . . . . . . . . . . . \n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
161 "Additions to itemized deductions\n24\nAmount\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
162 "Letter\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
163 "24a\n24b\n24c\n24d\n24e\n24f\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
164 "Total additions to itemized deductions (add lines 24a through 24f)\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
165 ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . \n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
166 "25\n25\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
167 "Subtractions from itemized deductions\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
168 "26\nLetter\nAmount\n26a\n26b\n26c\n26d\n26e\n26f\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
169 "Total subtractions from itemized deductions (add lines 26a through 26f) . . . . . . . . . . . . . . . . . . . . . . . . . . . . \n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
170 "27\n27\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
171 "This line intentionally left blank. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . \n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
172 "28\n28\n118003213032\n"
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
173 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
174
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
175 def check_good(text):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
176 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
177 Returns true if `text` is approximately the same as `expected_good`.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
178
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
179 2024-01-09: MuPDF master and 1.23.x give slightly different 'good'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
180 output, differing in a missing newline. So we compare without newlines.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
181 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
182 return text.replace('\n', '') == expected_good.replace('\n', '')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
183
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
184 n_fffd_good = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
185 n_fffd_bad = 749
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
186
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
187 def get(flags=None):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
188 text = [page.get_text(flags=flags) for page in document]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
189 assert len(text) == 1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
190 text = text[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
191 n_fffd = text.count(chr(0xfffd))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
192 if 0:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
193 # This print() fails on Windows with UnicodeEncodeError.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
194 print(f'{flags=} {n_fffd=} {text=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
195 return text, n_fffd
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
196
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
197 text_none, n_fffd_none = get()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
198 text_0, n_fffd_0 = get(flags0)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
199
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
200 text_1, n_fffd_1 = get(flags0 | pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
201
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
202 assert n_fffd_none == n_fffd_good
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
203 assert n_fffd_0 == n_fffd_bad
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
204 assert n_fffd_1 == n_fffd_good
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
205
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
206 assert check_good(text_none)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
207 assert not check_good(text_0)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
208 assert check_good(text_1)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
209
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
210
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
211 def test_3027():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
212 path = path = f'{pymupdfdir}/tests/resources/2.pdf'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
213 doc = pymupdf.open(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
214 page = doc[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
215 textpage = page.get_textpage()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
216 pymupdf.utils.get_text(page=page, option="dict", textpage=textpage)["blocks"]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
217
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
218
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
219 def test_3186():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
220
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
221 # codespell:ignore-begin
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
222 texts_expected = [
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
223 "Assicurazione sulla vita di tipo Unit Linked\nDocumento informativo precontrattuale aggiuntivo\nper i prodotti d\x00investimento assicurativi\n(DIP aggiuntivo IBIP)\nImpresa: AXA MPS Financial DAC \nProdotto: Progetto Protetto New - Global Dividends\nContratto Unit linked (Ramo III)\nData di realizzazione: Aprile 2023\nIl presente documento contiene informazioni aggiuntive e complementari rispetto a quelle presenti nel documento \ncontenente le informazioni chiave per i prodotti di investimento assicurativi (KID) per aiutare il potenziale \ncontraente a capire più nel dettaglio le caratteristiche del prodotto, gli obblighi contrattuali e la situazione \npatrimoniale dell\x00impresa.\nIl Contraente deve prendere visione delle condizioni d\x00assicurazione prima della sottoscrizione del Contratto.\nAXA MPS Financial DAC, Wolfe Tone House, Wolfe Tone Street, Dublin, DO1 HP90, Irlanda; Tel: 00353-1-6439100; \nsito internet: www.axa-mpsfinancial.ie; e-mail: supporto@axa-mpsfinancial.ie;\nAXA MPS Financial DAC, società del Gruppo Assicurativo AXA Italia, iscritta nell\x00Albo delle Imprese di assicurazione \ncon il numero II.00234. \nLa Compagnia mette a disposizione dei clienti i seguenti recapiti per richiedere eventuali informazioni sia in merito alla \nCompagnia sia in relazione al contratto proposto: Tel: 00353-1-6439100; sito internet: www.axa-mpsfinancial.ie; \ne-mail: supporto@axa-mpsfinancial.ie;\nAXA MPS Financial DAC è un\x00impresa di assicurazione di diritto Irlandese, Sede legale 33 Sir John Rogerson's Quay, \nDublino D02 XK09 Irlanda. L\x00Impresa di Assicurazione è stata autorizzata all\x00esercizio dell\x00attività assicurativa con \nprovvedimento n. C33602 emesso dalla Central Bank of Ireland (l\x00Autorità di vigilanza irlandese) in data 14/05/1999 \ned è iscritta in Irlanda presso il Companies Registration Office (registered nr. 293822). \nLa Compagnia opera in Italia esclusivamente in regime di libera prestazione di servizi ai sensi dell\x00art. 24 del D. Lgs. \n07/09/2005, n. 209 e può investire in attivi non consentiti dalla normativa italiana in materia di assicurazione sulla \nvita, ma in conformità con la normativa irlandese di riferimento in quanto soggetta al controllo della Central Bank of \nIreland.\nCon riferimento all\x00ultimo bilancio d\x00esercizio (esercizio 2021) redatto ai sensi dei principi contabili vigenti, il patrimonio \nnetto di AXA MPS Financial DAC ammonta a 139,6 milioni di euro di cui 635 mila euro di capitale sociale interamente \nversato e 138,9 milioni di euro di riserve patrimoniali compreso il risultato di esercizio.\nAl 31 dicembre 2021 il Requisito patrimoniale di solvibilità è pari a 90 milioni di euro (Solvency Capital Requirement, \nSCR). Sulla base delle valutazioni effettuate della Compagnia coerentemente con gli esistenti dettami regolamentari, il \nRequisito patrimoniale minimo al 31 dicembre 2021 ammonta a 40 milioni di euro (Minimum Capital Requirement, \nMCR).\nL'indice di solvibilità di AXA MPS Financial DAC, ovvero l'indice che rappresenta il rapporto tra l'ammontare del margine \ndi solvibilità disponibile e l'ammontare del margine di solvibilità richiesto dalla normativa vigente, e relativo all'ultimo \nbilancio approvato, è pari al 304% (solvency ratio). L'importo dei fondi propri ammissibili a copertura dei requisiti \npatrimoniali è pari a 276 milioni di euro (Eligible Own Funds, EOF).\nPer informazioni patrimoniali sulla società è possibile consultare il sito: www.axa-mpsfinancial.ie/chi-siamo\nSi rinvia alla relazione sulla solvibilità e sulla condizione finanziaria dell\x00impresa (SFCR) disponibile sul sito internet \ndella Compagnia al seguente link www.axa-mpsfinancial.ie/comunicazioni \nAl contratto si applica la legge italiana\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 1 di 9\n",
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
224 "Quali sono le prestazioni?\nIl contratto prevede le seguenti prestazioni:\na)Prestazioni in caso di vita dell'assicurato\nPrestazione in caso di Riscatto Totale e parziale\nA condizione che siano trascorsi almeno 30 giorni dalla Data di Decorrenza (conclusione del Contratto) e fino all\x00ultimo \nGiorno Lavorativo della terzultima settimana precedente la data di scadenza, il Contraente può riscuotere, interamente \no parzialmente, il Valore di Riscatto. In caso di Riscatto totale, la liquidazione del Valore di Riscatto pone fine al \nContratto con effetto dalla data di ricezione della richiesta.\nIl Contraente ha inoltre la facoltà di esercitare parzialmente il diritto di Riscatto, nella misura minima di 500,00 euro, \nda esercitarsi con le stesse modalità previste per il Riscatto totale. In questo caso, il Contratto rimane in vigore per \nl\x00ammontare residuo, a condizione che il Controvalore delle Quote residue del Contratto non sia inferiore a 1.000,00 \neuro.\nb) Prestazione a Scadenza\nAlla data di scadenza, sempre che l\x00Assicurato sia in vita, l\x00Impresa di Assicurazione corrisponderà agli aventi diritto un \nammontare risultante dal Controvalore delle Quote collegate al Contratto alla scadenza, calcolato come prodotto tra il \nValore Unitario della Quota (rilevato in corrispondenza della data di scadenza) e il numero delle Quote attribuite al \nContratto alla medesima data.\nc) Prestazione in corso di Contratto\nPurché l\x00assicurato sia in vita, nel corso della durata del Contratto, il Fondo Interno mira alla corresponsione di due \nPrestazioni Periodiche. Le prestazioni saranno pari all\x00ammontare risultante dalla moltiplicazione tra il numero di Quote \nassegnate al Contratto il primo giorno Lavorativo della settimana successiva alla Data di Riferimento e 2,50% del \nValore Unitario della Quota registrato alla Data di istituzione del Fondo Interno.\nLe prestazioni verranno liquidate entro trenta giorni dalle Date di Riferimento.\nData di Riferimento\n 1° Prestazione Periodica\n24/04/2024\n 2° Prestazione Periodica\n23/04/2025\nLa corresponsione delle Prestazioni Periodiche non è collegata alla performance positiva o ai ricavi incassati dal Fondo \nInterno, pertanto, la corresponsione potrebbe comportare una riduzione del Controvalore delle Quote senza comportare \nalcuna riduzione del numero di Quote assegnate al Contratto.\nd) Prestazione assicurativa principale in caso di decesso dell'Assicurato\nIn caso di decesso dell\x00Assicurato nel corso della durata contrattuale, è previsto il pagamento ai Beneficiari di un \nimporto pari al Controvalore delle Quote attribuite al Contratto, calcolato come prodotto tra il Valore Unitario della \nQuota rilevato alla Data di Valorizzazione della settimana successiva alla data in cui la notifica di decesso \ndell\x00Assicurato perviene all\x00Impresa di Assicurazione e il numero delle Quote attribuite al Contratto alla medesima data, \nmaggiorato di una percentuale pari allo 0,1%.\nQualora il capitale così determinato fosse inferiore al Premio pagato, sarà liquidato un ulteriore importo pari alla \ndifferenza tra il Premio pagato, al netto della parte di Premio riferita a eventuali Riscatti parziali e l\x00importo caso morte \ncome sopra determinato. Tale importo non potrà essere in ogni caso superiore al 5% del Premio pagato.\nOpzioni contrattuali\nIl Contratto non prevede opzioni contrattuali.\nFondi Assicurativi\nLe prestazioni di cui sopra sono collegate, in base all\x00allocazione del premio come descritto alla sezione \x01Quando e \ncome devo pagare?\x02, al valore delle quote del Fondo Interno denominato PP27 Global Dividends.\nil Fondo interno mira al raggiungimento di un Obiettivo di Protezione del Valore Unitario di Quota, tramite il \nconseguimento di un Valore Unitario di Quota a scadenza almeno pari al 100% del valore di quota registrato alla Data \ndi istituzione dal Fondo Interno.\nIl regolamento di gestione del Fondo Interno è disponibile sul sito dell\x00Impresa di Assicurazione \nwww.axa-mpsfinancial.ie dove puo essere acquisito su supporto duraturo.\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 2 di 9\n",
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
225 'Che cosa NON è assicurato\nRischi esclusi\nIl rischio di decesso dell\x00Assicurato è coperto qualunque sia la causa, senza limiti territoriali e senza \ntenere conto dei cambiamenti di professione dell\x00Assicurato, ad eccezione dei seguenti casi:\n\x03 il decesso, entro i primi sette anni dalla data di decorrenza del Contratto, dovuto alla sindrome da \nimmunodeficienza acquisita (AIDS) o ad altra patologia ad essa associata;\n\x03 dolo del Contraente o del Beneficiario;\n\x03 partecipazione attiva dell\x00Assicurato a delitti dolosi;\n\x03 partecipazione dell\x00Assicurato a fatti di guerra, salvo che non derivi da obblighi verso lo Stato \nItaliano: in questo caso la garanzia può essere prestata su richiesta del Contraente, alle condizioni \nstabilite dal competente Ministero;\n\x03 incidente di volo, se l\x00Assicurato viaggia a bordo di un aeromobile non autorizzato al volo o con \npilota non titolare di brevetto idoneo e, in ogni caso, se viaggia in qualità di membro \ndell\x00equipaggio;\n\x03 suicidio, se avviene nei primi due anni dalla Data di Decorrenza del Contratto\nCi sono limiti di copertura?\nNon vi sono ulteriori informazioni rispetto al contenuto del KID.\nChe obblighi ho? Quali obblighi ha l\x00Impresa?\nCosa fare in caso \ndi evento?\nDenuncia\nCon riferimento alla liquidazione delle prestazioni dedotte in Contratto, il Contraente o, se del caso, \nil Beneficiario e il Referente Terzo, sono tenuti a recarsi presso la sede dell\x00intermediario presso il \nquale il Contratto è stato sottoscritto ovvero a inviare preventivamente, a mezzo di lettera \nraccomandata con avviso di ricevimento al seguente recapito:\n\x03 AXA MPS Financial DAC\n\x03 Wolfe Tone House, Wolfe Tone Street,\n\x03 Dublin, DO1 HP90 - Ireland\n\x03 Numero Verde: 800.231.187\n\x03 email: supporto@axa-mpsfinancial.ie\ni documenti di seguito elencati per ciascuna prestazione, al fine di consentire all\x00Impresa di \nAssicurazione di verificare l\x00effettiva esistenza dell\x00obbligo di pagamento.\nin caso di Riscatto totale, il Contraente deve inviare all\x00Impresa di Assicurazione:\n\x04 la richiesta di Riscatto totale firmata dal Contraente, indicando il conto corrente su cui il \npagamento deve essere effettuato. Nel caso il conto corrente sia intestato a persona diversa dal \nContraente o dai beneficiari o sia cointestato, il Contraente deve fornire anche I documenti del \ncointestatario e specificare la relazione con il terzo il cui conto viene indicato.\n\x04 copia di un valido documento di identità del Contraente o di un documento attestante i poteri di \nlegale rappresentante, nel caso in cui il Contraente sia una persona giuridica;\nin caso di Riscatto parziale, il Contraente deve inviare all\x00Impresa di Assicurazione:\n\x04 la richiesta di Riscatto parziale firmata dal Contraente, contenente l\x00indicazione dei Fondi \nInterni/OICR che intende riscattare e il relativo ammontare non ché l\x00indicazione del conto corrente \nbancario sul quale effettuare il pagamento;\n\x04 copia di un valido documento di identità del Contraente, o di un documento attestante i poteri di \nlegale rappresentante, nel caso in cui il Contraente sia una persona giuridica.\nIn caso di richiesta di Riscatto totale o parziale non corredata dalla sopra elencata documentazione, \nl\x00Impresa di Assicurazione effettuerà il disinvestimento delle Quote collegate al Contratto alla data \ndi ricezione della relativa richiesta. L\x00Impresa di Assicurazione provvederà tuttavia alla liquidazione \ndelle somme unicamente al momento di ricezione della documentazione mancante, prive degli \neventuali interessi che dovessero maturare;\nIn caso di decesso dell\x00Assicurato, il Beneficiario/i o il Referente Terzo deve inviare all\x00Impresa di \nAssicurazione:\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 3 di 9\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
226 '\x04 la richiesta di pagamento sottoscritta da tutti i Beneficiari, con l\x00indicazione del conto corrente \nbancario sul quale effettuare il pagamento; Nel caso il conto corrente sia intestato a persona \ndiversa dal Contraente o dai beneficiari o sia cointestato, il Contraente deve fornire anche I \ndocumenti del cointestatario e specificare la relazione con il terzo il cui conto viene indicato.\n\x04 copia di un valido documento d\x00identità dei Beneficiari o di un documento attestante i poteri di \nlegale rappresentante, nel caso in cui il Beneficiario sia una persona giuridica;\n\x04 il certificato di morte dell\x00Assicurato;\n\x04 la relazione medica sulle cause del decesso;\n\x04 copia autenticata del testamento accompagnato da dichiarazione sostitutiva di atto di notorietà \ncon l\x00indicazione (i) della circostanza che il testamento è l\x00ultimo da considerarsi valido e non è \nstato impugnato e (ii) degli eredi testamentari, le relative età e capacità\ndi agire;\n\x04 in assenza di testamento, atto notorio (o dichiarazione sostitutiva di atto di notorietà) attestante \nche il decesso è avvenuto senza lasciare testamento e che non vi sono altri soggetti cui la legge \nriconosce diritti o quote di eredità;\n\x04 decreto del Giudice Tutelare nel caso di Beneficiari di minore età, con l\x00indicazione della persona \ndesignata alla riscossione;\n\x04 copia del Questionario KYC.\nPrescrizione: Alla data di redazione del presente documento, i diritti dei beneficiari dei contratti di \nassicurazione sulla vita si prescrivono nel termine di dieci anni dal giorno in cui si è verificato il fatto \nsu cui il diritto si fonda. Decorso tale termine e senza che la Compagnia abbia ricevuto alcuna \ncomunicazione e/o disposizione, gli importi derivanti dal contratto saranno devoluti al Fondo \ncostitutivo presso il Ministero dell\x00Economia e delle Finanze \x01depositi dormienti\x02.\nErogazione della prestazione\nL\x00Impresa di Assicurazione esegue il pagamento entro trenta giorni dal ricevimento della \ndocumentazione completa all\x00indirizzo sopra indicato.\n \nLe dichiarazioni del Contraente, e dell\x00Assicurato se diverso dal Contraente, devono essere esatte e \nveritiere. In caso di dichiarazioni inesatte o reticenti relative a circostanze tali che l\x00Impresa di \nAssicurazione non avrebbe dato il suo consenso, non lo avrebbe dato alle medesime condizioni se \navesse conosciuto il vero stato delle cose, l\x00Impresa di Assicurazione ha diritto a:\na) in caso di dolo o colpa grave:\n\x04 impugnare il Contratto dichiarando al Contraente di voler esercitare tale diritto entro tre mesi dal \ngiorno in cui ha conosciuto l\x00inesattezza della dichiarazione o le reticenze;\n\x04 trattenere il Premio relativo al periodo di assicurazione in corso al momento dell\x00impugnazione e, \nin ogni caso, il Premio corrispondente al primo anno;\n\x04 restituire, in caso di decesso dell\x00Assicurato, solo il Controvalore delle Quote acquisite al \nmomento del decesso, se l\x00evento si verifica prima che sia decorso il termine dianzi indicato per \nl\x00impugnazione;\nb) ove non sussista dolo o colpa grave:\n\x04 recedere dal Contratto, mediante dichiarazione da farsi al Contraente entro tre mesi dal giorno in \ncui ha conosciuto l\x00inesattezza della dichiarazione o le reticenze;\n\x04 se il decesso si verifica prima che l\x00inesattezza della dichiarazione o la reticenza sia conosciuta \ndall\x00Impresa di Assicurazione, o prima che l\x00Impresa abbia dichiarato di recedere dal Contratto, di \nridurre la somma dovuta in proporzione alla differenza tra il Premio convenuto e quello che sarebbe \nstato applicato se si fosse conosciuto il vero stato delle cose.\nIl Contraente è tenuto a inoltrare per iscritto alla Compagnia (posta ordinaria e mail) eventuali \ncomunicazioni inerenti:\n-modifiche dell\x00indirizzo presso il quale intende ricevere le comunicazioni relative al contratto;\n-variazione della residenza Europea nel corso della durata del contratto, presso altro Paese \nmembro della Unione Europea;\n-variazione degli estremi di conto corrente bancario.\nIn tal caso è necessario inoltrare la richiesta attraverso l\x00invio del modulo del mandato, compilato e \nsottoscritto dal contraente, reperibile nella sezione \x01comunicazioni\x02 sul sito internet della \ncompagnia all\x00indirizzo www.axa-mpsfinancial.ie\nFATCA (Foreign Account Tax Compliance Act) e CRS (Common Standard Reporting)\nLa normativa denominata rispettivamente FATCA (Foreign Account Tax Compliance Act - \nIntergovernmental Agreement sottoscritto tra Italia e Stati Uniti in data 10 gennaio 2014 e Legge n. \n95 del 18 giugno 2015) e CRS (Common Reporting Standard - Decreto Ministeriale del 28 \ndicembre 2015) impone agli operatori commerciali, al fine di contrastare la frode fiscale e \nl\x00evasione fiscale transfrontaliera, di eseguire la puntuale identificazione della propria clientela al \nfine di determinarne l\x00effettivo status di contribuente estero.\nDichiarazioni \ninesatte o \nreticenti\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 4 di 9\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
227 "I dati anagrafici e patrimoniali dei Contraenti identificati come fiscalmente residenti negli USA e/o \nin uno o più Paesi aderenti al CRS, dovranno essere trasmessi all\x00autorità fiscale locale, tramite \nl\x00Agenzia delle Entrate.\nL\x00identificazione avviene in fase di stipula del contratto e deve essere ripetuta in caso di \ncambiamento delle condizioni originarie durante tutta la sua durata, mediante l\x00acquisizione di \nautocertificazione rilasciata dai Contraenti. Ogni contraente è tenuto a comunicare \ntempestivamente eventuali variazioni rispetto a quanto dichiarato o rilevato in fase di sottoscrizione \ndel contratto di assicurazione. La Società si riserva inoltre di verificare i dati raccolti e di richiedere \nulteriori informazioni. In caso di autocertificazione che risulti compilata parzialmente o in maniera \nerrata, nonché in caso di mancata/non corretta comunicazione dei propri dati anagrafici, la società \nqualora abbia rilevato indizi di americanità e/o residenze fiscali estere nelle informazioni in suo \npossesso, assocerà al cliente la condizione di contribuente estero, provvedendo alla comunicazione \ndovuta.\nAntiriciclaggio\nIl Contraente è tenuto a fornire alla Compagnia tutte le informazioni necessarie al fine \ndell\x00assolvimento dell\x00adeguata verifica ai fini antiriciclaggio. Qualora la Compagnia, in ragione \ndella mancata collaborazione del Contraente, non sia in grado di portare a compimento l\x00adeguata \nverifica, la stessa non potrà concludere il Contratto o dovrà porre fine allo stesso. In tali ipotesi le \nsomme dovute al Contraente dovranno essere allo stesso versate mediante bonifico a valere un \nconto corrente intestato al Contraente stesso. In tali ipotesi le disponibilità finanziarie \neventualmente già acquisite dalla Compagnia dovranno essere restituite al Contraente liquidando il \nrelativo importo tramite bonifico bancario su un conto corrente bancario indicato dal Contraente e \nallo stesso intestato.\nIn nessun caso l'Impresa di Assicurazione sarà tenuta a fornire alcuna copertura assicurativa, \nsoddisfare richieste di risarcimento o garantire alcuna indennità in virtù del presente contratto, \nqualora tale copertura, pagamento o indennità possa esporla a divieti, sanzioni economiche o \nrestrizioni ai sensi di Risoluzioni delle Nazioni Unite o sanzioni economiche o commerciali, leggi o \nnorme dell\x00Unione Europea, del Regno Unito o degli Stati Uniti d\x00America, ove applicabili in Italia.\nQuando e come devo pagare?\nPremio\nIl Contratto prevede il pagamento di un Premio Unico il cui ammontare minimo è pari a 2.500,00 \neuro, incrementabile di importo pari o in multiplo di 50,00 euro, da corrispondersi in un\x00unica \nsoluzione prima della conclusione del Contratto.\nNon è prevista la possibilità di effettuare versamenti aggiuntivi successivi.\nIl versamento del Premio Unico può essere effettuato mediante addebito su conto corrente \nbancario, indicato nel Modulo di Proposta, previa autorizzazione del titolare del conto corrente.\nIl pagamento dei Premio Unico può essere eseguito mediante addebito su conto corrente bancario, \nprevia autorizzazione, intestato al Contraente oppure tramite bonifico bancario sul conto corrente \ndell\x00Impresa di Assicurazione.\nRimborso\nIl rimborso del Premio Versato è previsto nel caso in cui il Contraente decida di revocare la proposta \nfinché il contratto non è concluso.\nSconti\nAl verificarsi di condizioni particolari ed eccezionali che potrebbero riguardare \x03 a titolo \nesemplificativo ma non esaustivo \x03 il Contraente e la relativa situazione assicurativo/finanziaria, \nl\x00ammontare del Premio pagato e gli investimenti selezionati dal Contraente, l\x00Impresa di \nAssicurazione si riserva la facoltà di applicare sconti sugli oneri previsti dal contratto, concordando \ntale agevolazione con il Contraente.\nQuando comincia la copertura e quando finisce?\nDurata\nIl Contratto ha una durata massima pari a 5 anni 11 mesi e 27 giorni, sino alla data di scadenza \n(11/04/2029, la \x01data di scadenza\x02).\nSospensione\nNon sono possibili delle sospensioni della copertura assicurativa\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 5 di 9\n",
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
228 'Come posso revocare la proposta, recedere dal contratto o risolvere il contratto? \nRevoca\nLa Proposta di assicurazione può essere revocata fino alle ore 24:00 del giorno in cui il Contratto è \nconcluso. In tal caso, l\x00Impresa di Assicurazione restituirà al Contraente il Premio pagato entro \ntrenta giorni dal ricevimento della comunicazione di Revoca.\nRecesso\nIl Contraente può recedere dal Contratto entro trenta giorni dalla sua conclusione. Il Recesso dovrà \nessere comunicato all\x00Impresa di Assicurazione mediante lettera raccomandata con avviso di \nricevimento.\nL\x00Impresa di Assicurazione, entro trenta giorni dal ricevimento della comunicazione relativa al \nRecesso, rimborserà al Contraente il Controvalore delle Quote attribuite al Contratto alla data di \nricevimento della richiesta di recesso incrementato dai caricamenti, ove previsti, e dedotte \neventuali agevolazioni.\nRisoluzione\nLa risoluzione del contratto è prevista tramite la richiesta di riscatto totale esercitabile in qualsiasi \nmomento della durata contrattuale\nSono previsti riscatti o riduzioni? Si\n no\nValori di\nriscatto e\nriduzione\nA condizione che siano trascorsi almeno 30 giorni dalla Data di Decorrenza (conclusione del \nContratto) e fino all\x00ultimo Giorno Lavorativo della terzultima settimana precedente la data di \nscadenza, il Contraente può riscuotere, interamente o parzialmente, il Valore di Riscatto. In caso di \nRiscatto totale, la liquidazione del Valore di Riscatto pone fine al Contratto con effetto dalla data di \nricezione della richiesta.\nL\x00importo che sarà corrisposto al Contraente in caso di Riscatto sarà pari al Controvalore delle \nQuote del Fondo Interno attribuite al Contratto alla data di Riscatto, al netto dei costi di Riscatto.\nIn caso di Riscatto, ai fini del calcolo del Valore Unitario della Quota, si farà riferimento alla Data di \nValorizzazione della settimana successiva alla data in cui la comunicazione di Riscatto del \nContraente perviene all\x00Impresa di Assicurazione, corredata di tutta la documentazione, al netto dei \ncosti di Riscatto, salvo il verificarsi di Eventi di Turbativa.\nIl Contraente assume il rischio connesso all\x00andamento negativo del valore delle Quote e, pertanto, \nesiste la possibilità di ricevere un ammontare inferiore all\x00investimento finanziario.\nIn caso di Riscatto del Contratto (totale o parziale), l\x00Impresa di Assicurazione non offre alcuna \ngaranzia finanziaria di rendimento minimo e pertanto il Contraente sopporta il rischio di ottenere un \nValore Unitario di Quota inferiore al 100% del Valore Unitario di Quota del Fondo Interno registrato \nalla Data di Istituzione in considerazione dei rischi connessi alla fluttuazione del valore di mercato \ndegli attivi in cui investe, direttamente o indirettamente, il Fondo Interno.\nRichiesta di\ninformazioni\nPer eventuali richieste di informazioni sul valore di riscatto, il Contraente può rivolgersi alla \nCompagnia AXA MPS Financial DAC \x03 Wolfe Tone House, Wolfe Tone Street, Dublin, DO1 HP90 \x03 \nIreland, Numero Verde 800.231.187, e-mail: supporto@axa-mpsfinancial.ie\nA chi è rivolto questo prodotto?\nL\x00investitore al dettaglio a cui è destinato il prodotto varia in funzione dell\x00opzione di investimento sottostante e \nillustrata nel relativo KID.\nIl prodotto è indirizzato a Contraenti persone fisiche e persone giuridiche a condizione che il Contraente (persona fisica) \ne l\x00Assicurato, al momento della sottoscrizione stessa, abbiano un\x00età compresa tra i 18 anni e i 85 anni.\nQuali costi devo sostenere?\nPer l\x00informativa dettagliata sui costi fare riferimento alle indicazioni del KID.\nIn aggiunta rispetto alle informazioni del KID , indicare i seguenti costi a carico del contraente.\nSpese di emissione:\nIl Contratto prevede una spesa fissa di emissione pari a 25 Euro.\nLa deduzione di tale importo avverrà contestualmente alla deduzione del premio.\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 6 di 9\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
229 "L\x00obiettivo di protezione è da considerarsi al netto delle spese di emissione.\nCosti per riscatto\nIl Riscatto (totale e parziale) prevede un costo che varia in funzione della data di richiesta e secondo le percentuali di \nseguito indicate:\n1°Anno 5,00%; 2°Anno 3,50%; 3°Anno 2,00%; dal quarto anno in poi 0%;\nCosti di intermediazione\nla quota parte massima percepita dall\x00intermediario con riferimento all\x00intero flusso commissionale relativo al prodotto \nè pari al 35,17%.\nQuali sono i rischi e qual è il potenziale rendimento?\nSia con riferimento alla prestazione in caso di vita dell\x00assicurato, sia con riferimento al capitale caso morte riferito ai \nFondi Assicurativi Interni, la Compagnia non presta alcuna garanzia di rendimento minimo o di conservazione del \ncapitale. Pertanto il controvalore della prestazione della Compagnia potrebbe essere inferiore all\x00importo dei premi \nversati, in considerazione dei rischi connessi alla fluttuazione del valore di mercato degli attivi in cui investe, \ndirettamente o indirettamente il Fondo Interno.\nCOME POSSO PRESENTARE I RECLAMI E RISOLVERE LE CONTROVERSIE?\nAll\x00IVASS\nNel caso in cui il reclamo presentato all\x00impesa assicuratrice abbia esito insoddisfacente o risposta \ntardiva, è possibile rivolgersi all\x00IVASS, Via del Quirinale, 21 - 00187 Roma, fax 06.42133206, Info \nsu: www.ivass.it.\nEventuali reclami potranno inoltre essere indirizzati all\x00Autorità Irlandese competente al seguente \nindirizzo:\nFinancial Services Ombudsman 3rd Floor, Lincoln House, Lincoln Place, Dublin 2, D02 VH29 \x03 \nIreland\nPRIMA DI RICORRERE ALL\x00AUTORITÀ GIUDIZIARIA è possibile, in alcuni casi necessario, \navvalersi di sistemi alternativi di risoluzione delle controversie, quali:\nMediazione\nInterpellando un Organismo di Mediazione tra quelli presenti nell'elenco del Ministero della \nGiustizia, consultabile sul sito www.giustizia.it (Legge 9/8/2013, n.98)\nNegoziazione \nassistita\nTramite richiesta del proprio avvocato all\x00impresa\nAltri Sistemi \nalternative di \nrisoluzione delle \ncontroversie\nEventuali reclami relativi ad un contratto o servizio assicurativo nei confronti dell'Impresa di \nassicurazione o dell'Intermediario assicurativo con cui si entra in contatto, nonché qualsiasi \nrichiesta di informazioni, devono essere preliminarmente presentati per iscritto (posta, email) ad \nAXA MPS Financial DAC - Ufficio Reclami secondo seguenti modalità:\nEmail: reclami@axa-mpsfinancial.ie\nPosta: AXA MPS Financial DAC - Ufficio Reclami\nWolfe Tone House, Wolfe Tone Street,\nDublin DO1 HP90 - Ireland\nNumero Verde 800.231.187\navendo cura di indicare:\n-nome, cognome, indirizzo completo e recapito telefonico del reclamante;\n-numero della polizza e nominativo del contraente;\n-breve ed esaustiva descrizione del motivo di lamentela;\n-ogni altra indicazione e documento utile per descrivere le circostanze.\nSarà cura della Compagnia fornire risposta entro 45 giorni dalla data di ricevimento del reclamo, \ncome previsto dalla normativa vigente.\nNel caso di mancato o parziale accoglimento del reclamo, nella risposta verrà fornita una chiara \nspiegazione della posizione assunta dalla Compagnia in relazione al reclamo stesso ovvero della \nsua mancata risposta.\nQualora il reclamante non abbia ricevuto risposta oppure ritenga la stessa non soddisfacente, \nprima di rivolgersi all'Autorità Giudiziaria, può scrivere all'IVASS (Via del Quirinale, 21 - 00187 \nRoma; fax 06.42.133.745 o 06.42.133.353, ivass@pec.ivass.it) fornendo copia del reclamo già \nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 7 di 9\n",
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
230 "inoltrato all'impresa ed il relativo riscontro anche utilizzando il modello presente nel sito dell'IVASS \nalla sezione per il Consumatore - come presentare un reclamo.\nEventuali reclami potranno inoltre essere indirizzati all'Autorità Irlandese competente al seguente \nindirizzo:\nFinancial Services Ombudsman\n3rd Floor, Lincoln House,\nLincoln Place, Dublin 2, D02 VH29 Ireland\nIl reclamante può ricorrere ai sistemi alternativi per la risoluzione delle controversie previsti a livello \nnormativo o convenzionale, quali:\n\x04 Mediazione: (Decreto Legislativo n.28/2010 e ss.mm.) puo' essere avviata presentando istanza \nad un Organismo di Mediazione tra quelle presenti nell'elenco del Ministero della Giustizia, \nconsultabile sul sito www.giustizia.it. La legge ne prevede l'obbligatorieta' nel caso in cui si intenda \nesercitare in giudizio i propri diritti in materia di contratti assicurativi o finanziari e di risarcimento \nda responsabilita' medica e sanitaria, costituendo condizione di procedibilita' della domanda.\n\x04 Negoziazione Assistita: (Legge n.162/2014) tramite richiesta del proprio Avvocato all'Impresa. E' \nun accordo mediante il quale le parti convengono di cooperare in buona fede e con lealta' per \nrisolvere in via amichevole la controversia tramite l'assistenza di avvocati. Fine del procedimento e' \nla composizione bonaria della lite, con la sottoscrizione delle parti - assistite dai rispettivi difensori - \ndi un accordo detto convenzione di negoziazione. Viene prevista la sua obbligatorieta' nel caso in \ncui si intenda esercitare in giudizio i propri diritti per ogni controversia in materia di risarcimento del \ndanno da circolazione di veicoli e natanti, ovverosia e' condizione di procedibilita' per l'eventuale \ngiudizio civile. Invece e' facoltativa per ogni altra controversia in materia di risarcimenti o di contratti \nassicurativi o finanziari.\nIn caso di controversia relativa alla determinazione dei danni si puo' ricorrere alla perizia \ncontrattuale prevista dalle Condizioni di Assicurazione per la risoluzione di tale tipologia di \ncontroversie. L'istanza di attivazione della perizia contrattuale dovra' essere indirizzata alla \nCompagnia all' indirizzo\nAXA MPS Financial DAC \nWolfe Tone House, Wolfe Tone Street\nDublin DO1 HP90 - Ireland\nPer maggiori informazioni si rimanda a quanto presente nell'area Reclami del sito \nwww.axa-mpsfinancial.ie. \nPer la risoluzione delle liti transfrontaliere è possibile presentare reclamo all'IVASS o direttamente \nal sistema estero http://ec.europa.eu/internal_market/fin-net/members_en.htm competente \nchiedendo l'attivazione della procedura FIN-NET.\nEventuali reclami relativi la mancata osservanza da parte della Compagnia, degli intermediari e dei \nperiti assicurativi, delle disposizioni del Codice delle assicurazioni, delle relative norme di \nattuazione nonché delle norme sulla commercializzazione a distanza dei prodotti assicurativi \npossono essere presentati direttamente all'IVASS, secondo le modalità sopra indicate.\nSi ricorda che resta salva la facoltà di adire l'autorità giudiziaria.\nREGIME FISCALE\nTrattamento \nfiscale applicabile \nal contratto\nLe seguenti informazioni sintetizzano alcuni aspetti del regime fiscale applicabile al Contratto, ai \nsensi della legislazione tributaria italiana e della prassi vigente alla data di pubblicazione del \npresente documento, fermo restando che le stesse rimangono soggette a possibili cambiamenti che \npotrebbero avere altresì effetti retroattivi. Quanto segue non intende rappresentare un\x00analisi \nesauriente di tutte le conseguenze fiscali del Contratto. I Contraenti sono tenuti a consultare i loro \nconsulenti in merito al regime fiscale proprio del Contratto.\nTasse e imposte\nLe imposte e tasse presenti e future applicabili per legge al Contratto sono a carico del Contraente \no dei Beneficiari e aventi diritto e non è prevista la corresponsione al Contraente di alcuna somma \naggiuntiva volta a compensare eventuali riduzioni dei pagamenti relativi al Contratto.\nTassazione delle somme corrisposte a soggetti non esercenti attività d\x00impresa\n1. In caso di decesso dell\x00Assicurato\nLe somme corrisposte dall\x00Impresa di Assicurazione in caso di decesso dell\x00Assicurato non sono \nsoggette a tassazione IRPEF in capo al percettore e sono esenti dall\x00imposta sulle successioni. Si \nricorda tuttavia che, per effetto della legge 23 dicembre 2014 n. 190 (c.d.\x02Legge di Stabilità\x02), i \nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 8 di 9\n",
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
231 'capitali percepiti in caso di morte, a decorrere dal 1 gennaio 2015, in dipendenza di contratti di \nassicurazione sulla vita, a copertura del rischio demografico, sono esenti dall\x00imposta sul reddito \ndelle persone fisiche.\n2. In caso di Riscatto totale o di Riscatto parziale.\nLe somme corrisposte dall\x05Impresa di Assicurazione in caso di Riscatto totale sono soggette ad \nun\x00imposta sostitutiva dell\x00imposta sui redditi nella misura prevista di volta in volta dalla legge. Tale \nimposta, al momento della redazione del presente documento, è pari al 26% sulla differenza \n(plusvalenza) tra il capitale maturato e l\x00ammontare dei premi versati (al netto di eventuali riscatti \nparziali), con l\x00eccezione dei proventi riferibili ai titoli di stato italiani ed equiparati (Paesi facenti \nparte della white list), per i quali l\x00imposta è pari al 12,5%.\nIn caso di Riscatto parziale, ai fini del computo del reddito di capitale da assoggettare alla predetta \nimposta sostitutiva, l\x00ammontare dei premi va rettificato in funzione del rapporto tra il capitale \nerogato ed il valore economico della polizza alla data del Riscatto parziale.\n3. In caso di Recesso\nLe somme corrisposte in caso di Recesso sono soggette all\x00imposta sostitutiva delle imposte sui \nredditi nella misura e con gli stessi criteri indicati per il Riscatto totale del Contratto.\nTassazione delle somme corrisposte a soggetti esercenti attività d\x00impresa\nLe somme corrisposte a soggetti che esercitano l\x00attività d\x00impresa non costituiscono redditi di \ncapitale, bensì redditi d\x00impresa. Su tali somme l\x00Impresa non applica l\x00imposta sostitutiva di cui \nall\x00art. 26-ter del D.P.R. 29 settembre 1973, n. 600.\nSe le somme sono corrisposte a persone fisiche o enti non commerciali in relazione a contratti \nstipulati nell\x00ambito dell\x00attività commerciale, l\x00Impresa non applica l\x00imposta sostitutiva, qualora gli \ninteressati presentino una dichiarazione in merito alla sussistenza di tale requisito.\nL\x00IMPRESA HA L\x00OBBLIGO DI TRASMETTERTI, ENTRO IL 31 MAGGIO DI OGNI ANNO, IL DOCUMENTO \nUNICO DI RENDICONTAZIONE ANNUALE DELLA TUA POSIZIONE ASSICURATIVA\nPER QUESTO CONTRATTO L\x00IMPRESA NON DISPONE DI UN\x00AREA INTERNET DISPOSITIVA RISERVATA \nAL CONTRAENTE (c.d. HOME INSURANCE), PERTANTO DOPO LA SOTTOSCRIZIONE NON POTRAI \nGESTIRE TELEMATICAMENTE IL CONTRATTO MEDESIMO.\nDIP aggiuntivo IBIP - Progetto Protetto New - Global Dividends - Pag. 9 di 9\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
232 ]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
233 # codespell:ignore-end
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
234
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
235 path = os.path.abspath(f'{__file__}/../../tests/resources/test_3186.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
236 fitz_doc = pymupdf.open(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
237 texts = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
238 for page in fitz_doc:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
239 t = page.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
240 texts.append(t)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
241 assert texts == texts_expected, f'Unexpected output: {texts=}'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
242
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
243
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
244 def test_3197():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
245 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
246 MuPDF's ActualText support fixes handling of test_3197.pdf.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
247 '''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
248 path = os.path.abspath(f'{__file__}/../../tests/resources/test_3197.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
249
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
250 text_utf8_expected = [
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
251 b'NYSE - Nasdaq Real Time Price \xe2\x80\xa2 USD\nFord Motor Company (F)\n12.14 -0.11 (-0.90%)\nAt close: 4:00 PM EST\nAfter hours: 7:43 PM EST\nAll numbers in thousands\nAnnual\nQuarterly\nDownload\nSummary\nNews\nChart\nConversations\nStatistics\nHistorical Data\nProfile\nFinancials\nAnalysis\nOptions\nHolders\nSustainability\nInsights\nFollow\n12.15 +0.01 (+0.08%)\nIncome Statement\nBalance Sheet\nCash Flow\nSearch for news, symbols or companies\nNews\nFinance\nSports\nSign in\nMy Portfolio\nNews\nMarkets\nSectors\nScreeners\nPersonal Finance\nVideos\nFinance Plus\nBack to classic\nMore\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
252 b'Related Tickers\nTTM\n12/31/2023\n12/31/2022\n12/31/2021\n12/31/2020\n14,918,000\n14,918,000\n6,853,000\n15,787,000\n24,269,000\n-17,628,000\n-17,628,000\n-4,347,000\n2,745,000\n-18,615,000\n2,584,000\n2,584,000\n2,511,000\n-23,498,000\n2,315,000\n25,110,000\n25,110,000\n25,340,000\n20,737,000\n25,935,000\n-8,236,000\n-8,236,000\n-6,866,000\n-6,227,000\n-5,742,000\n51,659,000\n51,659,000\n45,470,000\n27,901,000\n65,900,000\n-41,965,000\n-41,965,000\n-45,655,000\n-54,164,000\n-60,514,000\n-335,000\n-335,000\n-484,000\n--\n--\n6,682,000\n6,682,000\n-13,000\n9,560,000\n18,527,000\n \nYahoo Finance Plus Essential\naccess required.\nUnlock Access\nBreakdown\nOperating Cash\nFlow\nInvesting Cash\nFlow\nFinancing Cash\nFlow\nEnd Cash Position\nCapital Expenditure\nIssuance of Debt\nRepayment of Debt\nRepurchase of\nCapital Stock\nFree Cash Flow\n12/31/2020 - 6/1/1972\nGM\nGeneral Motors Compa\xe2\x80\xa6\n39.49 +1.23%\n\xc2\xa0\nRIVN\nRivian Automotive, Inc.\n15.39 -3.15%\n\xc2\xa0\nNIO\nNIO Inc.\n5.97 +0.17%\n\xc2\xa0\nSTLA\nStellantis N.V.\n25.63 +0.91%\n\xc2\xa0\nLCID\nLucid Group, Inc.\n3.7000 +0.54%\n\xc2\xa0\nTSLA\nTesla, Inc.\n194.77 +0.52%\n\xc2\xa0\nTM\nToyota Motor Corporati\xe2\x80\xa6\n227.09 +0.14%\n\xc2\xa0\nXPEV\nXPeng Inc.\n9.08 +0.89%\n\xc2\xa0\nFSR\nFisker Inc.\n0.5579 -11.46%\n\xc2\xa0\nCopyright \xc2\xa9 2024 Yahoo.\nAll rights reserved.\nPOPULAR QUOTES\nTesla\nDAX Index\nKOSPI\nDow Jones\nS&P BSE SENSEX\nSPDR S&P 500 ETF Trust\nEXPLORE MORE\nCredit Score Management\nHousing Market\nActive vs. Passive Investing\nShort Selling\nToday\xe2\x80\x99s Mortgage Rates\nHow Much Mortgage Can You Afford\nABOUT\nData Disclaimer\nHelp\nSuggestions\nSitemap\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
253 ]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
254
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
255 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
256 for i, page in enumerate(document):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
257 text = page.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
258 #print(f'{i=}:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
259 text_utf8 = text.encode('utf8')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
260 #print(f' {text_utf8=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
261 #print(f' {text_utf8_expected[i]=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
262 assert text_utf8 == text_utf8_expected[i]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
263
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
264
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
265 def test_document_text():
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
266 if os.environ.get('PYODIDE_ROOT'):
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
267 print('test_document_text(): not running on Pyodide - multiprocessing not available.')
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
268 return
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
269
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
270 import platform
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
271 import time
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
272
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
273 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
274 concurrency = None
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
275
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
276 def llen(texts):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
277 l = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
278 for text in texts:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
279 l += len(text) if isinstance(text, str) else text
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
280 return l
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
281
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
282 results = dict()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
283 _stats = 1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
284
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
285 print('')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
286 method = 'single'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
287 t = time.time()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
288 document = pymupdf.Document(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
289 texts0 = pymupdf.get_text(path, _stats=_stats)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
290 t0 = time.time() - t
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
291 print(f'{method}: {t0=} {llen(texts0)=}', flush=1)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
292
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
293 # Dummy run seems to avoid misleading stats with slow first run.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
294 method = 'mp'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
295 texts = pymupdf.get_text(path, concurrency=concurrency, method=method, _stats=_stats)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
296
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
297 method = 'mp'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
298 t = time.time()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
299 texts = pymupdf.get_text(path, concurrency=concurrency, method=method, _stats=_stats)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
300 t = time.time() - t
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
301 print(f'{method}: {concurrency=} {t=} ({t0/t:.2f}x) {llen(texts)=}', flush=1)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
302 assert texts == texts0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
303
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
304 if platform.system() != 'Windows':
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
305 method = 'fork'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
306 t = time.time()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
307 texts = pymupdf.get_text(path, concurrency=concurrency, method='fork', _stats=_stats)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
308 t = time.time() - t
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
309 print(f'{method}: {concurrency=} {t=} ({t0/t:.2f}x) {llen(texts)=}', flush=1)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
310 assert texts == texts0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
311
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
312 if _stats:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
313 pymupdf._log_items_clear()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
314
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
315
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
316 def test_4524():
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
317 if os.environ.get('PYODIDE_ROOT'):
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
318 print('test_4524(): not running on Pyodide - multiprocessing not available.')
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
319 return
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
320 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
321 print('')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
322 document = pymupdf.Document(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
323 texts_single = pymupdf.get_text(path, method='single', pages=[1, 3, 5])
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
324 texts_mp = pymupdf.get_text(path, method='mp', pages=[1, 3, 5])
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
325 print(f'{len(texts_single)=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
326 print(f'{len(texts_mp)=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
327 assert texts_mp == texts_single
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
328
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
329
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
330 def test_3594():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
331 verbose = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
332 print()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
333 d = pymupdf.open(os.path.abspath(f'{__file__}/../../tests/resources/test_3594.pdf'))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
334 for i, p in enumerate(d.pages()):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
335 text = p.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
336 print(f'Page {i}:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
337 if verbose:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
338 for line in text.split('\n'):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
339 print(f' {line!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
340 print('='*40)
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
341 wt = pymupdf.TOOLS.mupdf_warnings()
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
342 if pymupdf.mupdf_version_tuple < (1, 26, 8):
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
343 assert not wt
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
344 else:
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
345 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 2 times...'
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
346
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
347
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
348 def test_3687():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
349 path1 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687.epub'))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
350 path2 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687-3.epub'))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
351 for path in path1, path2:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
352 print(f'Looking at {path=}.')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
353 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
354 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
355 text = page.get_text("text")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
356 print(f'{text=!s}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
357 wt = pymupdf.TOOLS.mupdf_warnings()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
358 print(f'{wt=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
359 assert wt == 'unknown epub version: 3.0'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
360
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
361 def test_3705():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
362 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3705.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
363 def get_all_page_from_pdf(document, last_page=None):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
364 if last_page:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
365 document.select(list(range(0, last_page)))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
366 if document.page_count > 30:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
367 document.select(list(range(0, 30)))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
368 return iter(page for page in document)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
369
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
370 filename = os.path.basename(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
371
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
372 doc = pymupdf.open(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
373 texts0 = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
374 for i, page in enumerate(get_all_page_from_pdf(doc)):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
375 text = page.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
376 print(i, text)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
377 texts0.append(text)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
378
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
379 texts1 = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
380 doc = pymupdf.open(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
381 for page in doc:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
382 if page.number >= 30: # leave the iterator immediately
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
383 break
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
384 text = page.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
385 texts1.append(text)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
386
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
387 assert texts1 == texts0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
388
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
389 wt = pymupdf.TOOLS.mupdf_warnings()
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
390 if pymupdf.mupdf_version_tuple >= (1, 27):
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
391 expected = 'format error: No common ancestor in structure tree\nstructure tree broken, assume tree is missing'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
392 expected = '\n'.join([expected] * 56)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
393 assert wt == expected
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
394 elif pymupdf.mupdf_version_tuple >= (1, 26, 8):
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
395 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 7684 times...'
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
396 else:
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
397 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
398
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
399 def test_3650():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
400 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
401 doc = pymupdf.Document(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
402 blocks = doc[0].get_text("blocks")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
403 t = [block[4] for block in blocks]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
404 print(f'{t=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
405 assert t == [
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
406 'RECUEIL DES ACTES ADMINISTRATIFS\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
407 'n° 78 du 28 avril 2023\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
408 ]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
409
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
410 def test_4026():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
411 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4026.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
412 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
413 page = document[4]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
414 blocks = page.get_text('blocks')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
415 for i, block in enumerate(blocks):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
416 print(f'block {i}: {block}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
417 assert len(blocks) == 5
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
418
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
419 def test_3725():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
420 # This currently just shows the extracted text. We don't check it is as expected.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
421 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3725.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
422 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
423 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
424 text = page.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
425 if 0:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
426 print(textwrap.indent(text, ' '))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
427
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
428 def test_4147():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
429 print()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
430 items = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
431 for expect_visible, path in (
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
432 (False, os.path.normpath(f'{__file__}/../../tests/resources/test_4147.pdf')),
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
433 (True, os.path.normpath(f'{__file__}/../../tests/resources/symbol-list.pdf')),
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
434 ):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
435 print(f'{expect_visible=} {path=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
436 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
437 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
438 text = page.get_text('rawdict')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
439 for block in text['blocks']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
440 if block['type'] == 0:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
441 #print(f' block')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
442 for line in block['lines']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
443 #print(f' line')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
444 for span in line['spans']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
445 #print(f' span')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
446 if pymupdf.mupdf_version_tuple >= (1, 25, 2):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
447 #print(f' span: {span["flags"]=:#x} {span["char_flags"]=:#x}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
448 if expect_visible:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
449 assert span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
450 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
451 assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
452 assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_STROKED)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
453 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
454 #print(f' span: {span["flags"]=:#x}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
455 assert 'char_flags' not in span
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
456 # Check commit `add 'bidi' to span dict, add 'synthetic' to char dict.`
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
457 assert span['bidi'] == 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
458 for ch in span['chars']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
459 assert isinstance(ch['synthetic'], bool)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
460
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
461
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
462 def test_4139():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
463 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4139.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
464 flags = (0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
465 | pymupdf.TEXT_PRESERVE_IMAGES
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
466 | pymupdf.TEXT_PRESERVE_WHITESPACE
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
467 | pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
468 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
469 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
470 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
471 dicts = page.get_text('dict', flags=flags, sort=True)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
472 seen = set()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
473 for b_ctr, b in enumerate(dicts['blocks']):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
474 for l_ctr, l in enumerate(b.get('lines', [])):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
475 for s_ctr, s in enumerate(l['spans']):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
476 color = s.get('color')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
477 if color is not None and color not in seen:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
478 seen.add(color)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
479 print(f"B{b_ctr}.L{l_ctr}.S{s_ctr}: {color=} {hex(color)=} {s=}")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
480 assert color == 0, f'{s=}'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
481 assert s['alpha'] == 255
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
482
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
483
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
484 def test_4245():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
485 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4245.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
486 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
487 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
488 regions = page.search_for('Bart Simpson')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
489 print(f'{regions=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
490 page.add_highlight_annot(regions)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
491 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
492 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
493 regions = page.search_for('Bart Simpson')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
494 for region in regions:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
495 highlight = page.add_highlight_annot(region)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
496 highlight.update()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
497 pixmap = page.get_pixmap()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
498 path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4245_out.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
499 pixmap.save(path_out)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
500
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
501 path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4245_expected.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
502 rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
503 pixmap_diff = gentle_compare.pixmaps_diff(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
504 path_diff = os.path.normpath(f'{__file__}/../../tests/resources/test_4245_diff.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
505 pixmap_diff.save(path_diff)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
506 print(f'{rms=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
507 if pymupdf.mupdf_version_tuple < (1, 25, 5):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
508 # Prior to fix for mupdf bug 708274.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
509 assert 0.1 < rms < 0.2
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
510 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
511 assert rms < 0.01
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
512
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
513
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
514 def test_4180():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
515 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4180.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
516 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
517 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
518 regions = page.search_for('Reference is made')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
519 for region in regions:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
520 page.add_redact_annot(region, fill=(0, 0, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
521 page.apply_redactions()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
522 pixmap = page.get_pixmap()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
523 path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4180_out.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
524 pixmap.save(path_out)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
525
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
526 path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4180_expected.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
527 rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
528 pixmap_diff = gentle_compare.pixmaps_diff(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
529 path_diff = os.path.normpath(f'{__file__}/../../tests/resources/test_4180_diff.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
530 pixmap_diff.save(path_diff)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
531 print(f'{rms=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
532 if pymupdf.mupdf_version_tuple < (1, 25, 5):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
533 # Prior to fix for mupdf bug 708274.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
534 assert 0.2 < rms < 0.3
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
535 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
536 assert rms < 0.01
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
537
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
538
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
539 def test_4182():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
540 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4182.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
541 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
542 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
543 dict_ = page.get_text('dict')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
544 linelist = []
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
545 for block in dict_['blocks']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
546 if block['type'] == 0:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
547 paranum = block['number']
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
548 if 'lines' in block:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
549 for line in block.get('lines', ()):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
550 for span in line['spans']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
551 if span['text'].strip():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
552 page.draw_rect(span['bbox'], color=(1, 0, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
553 linelist.append([paranum, span['bbox'], repr(span['text'])])
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
554 pixmap = page.get_pixmap()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
555 path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4182_out.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
556 pixmap.save(path_out)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
557 if platform.system() != 'Windows': # Output on Windows can fail due to non-utf8 stdout.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
558 for l in linelist:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
559 print(l)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
560 path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4182_expected.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
561 pixmap_diff = gentle_compare.pixmaps_diff(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
562 path_diff = os.path.normpath(f'{__file__}/../../tests/resources/test_4182_diff.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
563 pixmap_diff.save(path_diff)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
564 rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
565 print(f'{rms=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
566 if pymupdf.mupdf_version_tuple < (1, 25, 5):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
567 # Prior to fix for mupdf bug 708274.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
568 assert 3 < rms < 3.5
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
569 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
570 assert rms < 0.01
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
571
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
572
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
573 def test_4179():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
574 if os.environ.get('PYMUPDF_USE_EXTRA') == '0':
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
575 # Looks like Python code doesn't behave same as C++, probably because
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
576 # of the code not being correct for Python's native unicode strings.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
577 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
578 print(f'test_4179(): Not running with PYMUPDF_USE_EXTRA=0 because known to fail.')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
579 return
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
580 # We check that using TEXT_ACCURATE_BBOXES gives the correct boxes. But
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
581 # this also requires that we disable PyMuPDF quad corrections.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
582 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
583 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4179.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
584
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
585 # Disable anti-aliasing to avoid our drawing of multiple identical bboxes
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
586 # (from normal/accurate bboxes) giving slightly different results.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
587 aa = pymupdf.mupdf.fz_aa_level()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
588 uqc = pymupdf._globals.skip_quad_corrections
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
589 pymupdf.TOOLS.set_aa_level(0)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
590 pymupdf.TOOLS.unset_quad_corrections(True)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
591 assert pymupdf._globals.skip_quad_corrections
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
592 try:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
593 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
594 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
595
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
596 char_sqrt = b'\xe2\x88\x9a'.decode()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
597
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
598 # Search with defaults.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
599 bboxes_search = page.search_for(char_sqrt)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
600 assert len(bboxes_search) == 1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
601 print(f'bboxes_search[0]:\n {bboxes_search[0]!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
602 page.draw_rect(bboxes_search[0], color=(1, 0, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
603 rms = gentle_compare.rms(bboxes_search[0], (250.0489959716797, 91.93604278564453, 258.34783935546875, 101.34073638916016))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
604 assert rms < 0.01
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
605
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
606 # Search with TEXT_ACCURATE_BBOXES.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
607 bboxes_search_accurate = page.search_for(
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
608 char_sqrt,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
609 flags = (0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
610 | pymupdf.TEXT_DEHYPHENATE
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
611 | pymupdf.TEXT_PRESERVE_WHITESPACE
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
612 | pymupdf.TEXT_PRESERVE_LIGATURES
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
613 | pymupdf.TEXT_MEDIABOX_CLIP
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
614 | pymupdf.TEXT_ACCURATE_BBOXES
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
615 ),
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
616 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
617 assert len(bboxes_search_accurate) == 1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
618 print(f'bboxes_search_accurate[0]\n {bboxes_search_accurate[0]!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
619 page.draw_rect(bboxes_search_accurate[0], color=(0, 1, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
620 rms = gentle_compare.rms(bboxes_search_accurate[0], (250.0489959716797, 99.00948333740234, 258.34783935546875, 108.97208404541016))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
621 assert rms < 0.01
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
622
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
623 # Iterate with TEXT_ACCURATE_BBOXES.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
624 bboxes_iterate_accurate = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
625 dict_ = page.get_text(
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
626 'rawdict',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
627 flags = pymupdf.TEXT_ACCURATE_BBOXES,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
628 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
629 linelist = []
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
630 for block in dict_['blocks']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
631 if block['type'] == 0:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
632 if 'lines' in block:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
633 for line in block.get('lines', ()):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
634 for span in line['spans']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
635 for ch in span['chars']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
636 if ch['c'] == char_sqrt:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
637 bbox_iterate_accurate = ch['bbox']
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
638 bboxes_iterate_accurate.append(bbox_iterate_accurate)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
639 print(f'bbox_iterate_accurate:\n {bbox_iterate_accurate!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
640 page.draw_rect(bbox_iterate_accurate, color=(0, 0, 1))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
641
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
642 assert bboxes_search_accurate != bboxes_search
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
643 assert bboxes_iterate_accurate == bboxes_search_accurate
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
644 pixmap = page.get_pixmap()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
645
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
646 path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4179_out.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
647 pixmap.save(path_out)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
648 path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4179_expected.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
649 rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
650 pixmap_diff = gentle_compare.pixmaps_diff(path_expected, pixmap)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
651 path_out_diff = os.path.normpath(f'{__file__}/../../tests/resources/test_4179_diff.png')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
652 pixmap_diff.save(path_out_diff)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
653 print(f'Have saved to: {path_out_diff=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
654 print(f'{rms=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
655 if pymupdf.mupdf_version_tuple < (1, 25, 5):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
656 # Prior to fix for mupdf bug 708274, our rects are rendered slightly incorrectly.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
657 assert 3.5 < rms < 4.5
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
658 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
659 assert rms < 0.01
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
660
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
661 finally:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
662 pymupdf.TOOLS.set_aa_level(aa)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
663 pymupdf.TOOLS.unset_quad_corrections(uqc)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
664
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
665
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
666 def test_extendable_textpage():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
667
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
668 # 2025-01-28:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
669 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
670 # We can create a pdf with two pages whose text is adjacent when stitched
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
671 # together vertically:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
672 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
673 # Page 1:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
674 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
675 # aaaa
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
676 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
677 # bbbb
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
678 # cccc
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
679 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
680 # dddd
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
681 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
682 # Page 2:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
683 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
684 # eeee
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
685 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
686 # ffff
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
687 # gggg
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
688 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
689 # hhhh
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
690 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
691 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
692 # Create a textpage for both of these pages. Then when extracting text,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
693 # we need to get (specifically the `dddd` and `eeee` sequences need to be
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
694 # treated as the same block):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
695 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
696 # aaaa
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
697 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
698 # bbbb
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
699 # cccc
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
700 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
701 # dddd
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
702 # eeee
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
703 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
704 # ffff
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
705 # gggg
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
706 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
707 # hhhh
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
708 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
709 print()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
710
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
711 path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
712 with pymupdf.open(filetype='pdf') as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
713 document.new_page()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
714 document.new_page()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
715 page0 = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
716 page1 = document[1]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
717 y = 100
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
718 line_height = 9.6
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
719 for i in range(4):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
720 page0.insert_text((100, y+line_height), 'abcd'[i] * 16)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
721 page1.insert_text((100, y+line_height), 'efgh'[i] * 16)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
722 y += line_height
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
723 if i%2 == 0:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
724 y += line_height
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
725 rect = pymupdf.mupdf.FzRect(100, 100, 200, y)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
726 document[0].draw_rect(rect, (1, 0, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
727 document[1].draw_rect(rect, (1, 0, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
728 document.save(path)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
729
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
730 # Create a stext page for the text regions in both pages of our document,
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
731 # using direct calls to MuPDF.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
732 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
733
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
734 with pymupdf.Document(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
735
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
736 # Notes:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
737 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
738 # We need to reuse the stext device for second page. Otherwise if we
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
739 # create a new device, the first text in second page will always be in
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
740 # a new block, because pen position for new device is (0, 0) and this
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
741 # will usually be treated as a paragraph gap to the first text.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
742 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
743 # At the moment we use infinite mediabox when creating the
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
744 # fz_stext_page. I don't know what a non-infinite mediabox would be
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
745 # useful for.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
746 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
747 # FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
748 # to modify it to be in stext pagae coordinates (i.e. adding ctm.f
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
749 # to y0 and y1) when we append the second page. But it's internal
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
750 # data and there's no api to modify it. So for now we don't specify
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
751 # FZ_STEXT_CLIP_RECT when creating the stext device, so we always
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
752 # include each page's entire contents.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
753 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
754
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
755 # We use our knowledge of the text rect in each page to manipulate ctm
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
756 # so that the stext contains text starting at (0, 0) and extending
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
757 # downwards.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
758 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
759 y = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
760 cookie = pymupdf.mupdf.FzCookie()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
761
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
762 stext_page = pymupdf.mupdf.FzStextPage(
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
763 pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE), # mediabox
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
764 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
765 stext_options = pymupdf.mupdf.FzStextOptions()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
766 #stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
767 #stext_options.clip = rect.internal()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
768 device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
769
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
770 # Add first page to stext_page at (0, y), and update <y> for the next
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
771 # page.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
772 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
773 ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
774 pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
775 y += rect.y1 - rect.y0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
776
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
777 # Add second page to stext_page at (0, y), and update <y> for the next
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
778 # page.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
779 page = document[1]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
780 ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
781 pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
782 y += rect.y1 - rect.y0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
783
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
784 # We've finished adding text to stext_page.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
785 pymupdf.mupdf.fz_close_device(device)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
786
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
787 # Create a pymupdf.TextPage() for <stext_page> so we can use
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
788 # text_page.extractDICT() etc.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
789 text_page = pymupdf.TextPage(stext_page)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
790
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
791 # Read text from stext_page using text_page.extractDICT().
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
792 print(f'Using text_page.extractDICT().')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
793 print(f'{text_page.this.m_internal.mediabox=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
794 d = text_page.extractDICT(sort=True)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
795 y0_prev = None
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
796 pno = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
797 ydelta = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
798 for block in d['blocks']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
799 print(f'block {block["bbox"]=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
800 for line in block['lines']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
801 print(f' line {line["bbox"]=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
802 for span in line['spans']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
803 print(f' span {span["bbox"]=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
804 bbox = span['bbox']
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
805 x0, y0, x1, y1 = bbox
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
806 dy = y0 - y0_prev if y0_prev else 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
807 y0_prev = y0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
808 print(f' {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
809 if 'eee' in span['text']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
810 pno = 1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
811 ydelta = rect.y1 - rect.y0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
812 y0 -= ydelta
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
813 y1 -= ydelta
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
814 # Debugging - add green lines on original document
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
815 # translating final blocks info into original coors.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
816 document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
817
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
818 print('\n\n')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
819
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
820 print(f'Using text_page.extractText()')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
821 text = text_page.extractText(True)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
822 print(f'{text}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
823
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
824 print('\n\n')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
825 print(f'Using extractBLOCKS')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
826 text = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
827 for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
828 print(f'block:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
829 print(f' bbox={x0, y0, x1, y1} {no=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
830 print(f' {line=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
831 text.append(line)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
832
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
833 print("\n\n")
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
834 print(f'extractBLOCKS joined by newlines:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
835 print('\n'.join(text))
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
836
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
837 # This checks that lines before/after pages break are treated as a
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
838 # single paragraph.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
839 assert text == [
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
840 'aaaaaaaaaaaaaaaa\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
841 'bbbbbbbbbbbbbbbb\ncccccccccccccccc\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
842 'dddddddddddddddd\neeeeeeeeeeeeeeee\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
843 'ffffffffffffffff\ngggggggggggggggg\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
844 'hhhhhhhhhhhhhhhh\n',
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
845 ]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
846
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
847 path3 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage3.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
848 document.save(path3)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
849
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
850
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
851 def test_4363():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
852 print()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
853 print(f'{pymupdf.version=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
854 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4363.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
855 n = 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
856 texts = list()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
857 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
858 assert len(document) == 1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
859 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
860 t = page.search_for('tour')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
861 print(f'{t=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
862 n += len(t)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
863 text = page.get_text()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
864 texts.append(text)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
865 print(f'{n=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
866 print(f'{len(texts)=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
867 text = texts[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
868 print('text:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
869 print(f'{text=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
870 text_expected = (
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
871 'Deal Roadshow SiteTour\n'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
872 'We know your process. We know your standard.\n'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
873 'Professional Site Tour Video Productions for the Capital Markets.\n'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
874 '1\n'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
875 )
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
876 if text != text_expected:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
877 print(f'Expected:\n {text_expected!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
878 print(f'Found:\n {text!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
879 assert 0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
880
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
881
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
882 def test_4546():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
883 # This issue will not be fixed (in mupdf) because the test input is faulty.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
884 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
885 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4546.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
886 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
887 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
888 text = page.get_text()[:200]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
889
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
890 # We can't actually test with 1.23.5 because it uses `fitz.` not `pymupdf.`.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
891 expected_1_23_5 = b'JOB No.: \nShipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\n\xe5\x9d\x80) \nSINORICH TRANSPORT LIMITED\nADD:7C,WEST BLDG.,ZHONGQU\nMANSION,211 ZHONGSHAN\nRD. SHANTOU,515041 CN\nTEL:0754-88570001 FAX:0754-88572709\nS/O No. '.decode()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
892
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
893 # This output is different from expected_1_23_5.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
894 expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode()
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
895
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
896 # This output is different from either of the two expected strings.
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
897 expected_mupdf_1_27_0 = b'JOB No.: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88572702 \n \xe6\x93\x8d\xe4\xbd\x9c\xe9\x83\xa8: \n88570008 \n88570004 \n \xe6\x96\x87\xe4\xbb\xb6\xe9\x83\xa8: \n88570003\n \nNotify Party(complete name and address, '.decode()
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
898
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
899 print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
900 print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
901
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
902 print(f'{pymupdf.version=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
903 print(f'text is:\n{textwrap.indent(text, " ")}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
904 print(f'{text=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
905 print(f'{text.encode()=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
906
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
907 wt = pymupdf.TOOLS.mupdf_warnings()
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
908 if pymupdf.mupdf_version_tuple >= (1, 26, 8):
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
909 assert text == expected_mupdf_1_27_0
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
910 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 120 times...'
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
911 elif pymupdf.mupdf_version_tuple >= (1, 26, 1):
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
912 assert text == expected_mupdf_1_26_1
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
913 assert not wt
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
914 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
915 print(f'No expected output for {pymupdf.mupdf_version_tuple=}')
39
a6bc019ac0b2 ADD: PyMuPDF v1.26.5: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents: 1
diff changeset
916 assert not wt
1
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
917
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
918
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
919 def test_4503():
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
920 # Check detection of strikeout text. Behaviour is improved with
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
921 # mupdf>=1.26.2, and fixed with mupdf>=1.26.3.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
922 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
923 path = os.path.normpath(f'{__file__}/../../tests/resources/test_4503.pdf')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
924 span_0 = None
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
925 text_0 = None
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
926 print()
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
927 print(f'{pymupdf.mupdf_version_tuple=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
928 with pymupdf.open(path) as document:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
929 page = document[0]
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
930 # Specify TEXT_COLLECT_STYLES so we collect char_flags, which contains
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
931 # FZ_STEXT_STRIKEOUT etc.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
932 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
933 text = page.get_text('rawdict', flags=pymupdf.TEXTFLAGS_RAWDICT | pymupdf.TEXT_COLLECT_STYLES)
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
934 for i, block in enumerate(text['blocks']):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
935 print(f'block {i}:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
936 for j, line in enumerate(block['lines']):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
937 print(f' line {j}:')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
938 for k, span in enumerate(line['spans']):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
939 text = ''
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
940 for char in span['chars']:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
941 text += char['c']
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
942 print(f' span {k}: {span["flags"]=:#x} {span["char_flags"]=:#x}: {text!r}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
943 if 'the right to request the state to review' in text:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
944 span_0 = span
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
945 text_0 = text
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
946 assert span_0
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
947 #print(f'{span_0=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
948 print(f'{span_0["flags"]=:#x}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
949 print(f'{span_0["char_flags"]=:#x}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
950 print(f'{text_0=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
951 strikeout = span_0['char_flags'] & pymupdf.mupdf.FZ_STEXT_STRIKEOUT
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
952 print(f'{strikeout=}')
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
953
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
954 if pymupdf.mupdf_version_tuple >= (1, 26, 3):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
955 assert strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be set in {span_0["char_flags"]=:#x}.'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
956 assert text_0 == 'the right to request the state to review and, if appropriate,'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
957 elif pymupdf.mupdf_version_tuple >= (1, 26, 2):
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
958 # 2025-06-09: This is still incorrect - the span should include the
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
959 # following text 'and, if appropriate,'. It looks like following spans
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
960 # are:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
961 # strikeout=0: 'and, '
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
962 # strikeout=1: 'if '
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
963 # strikeout=0: 'appropri'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
964 # strikeout=1: 'ate,'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
965 #
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
966 assert strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be set in {span_0["char_flags"]=:#x}.'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
967 assert text_0 == 'the right to request the state to review '
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
968 else:
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
969 # Expecting the bug.
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
970 assert not strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be unset in {span_0["char_flags"]=:#x}.'
1d09e1dec1d9 ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff changeset
971 assert text_0 == 'notice the right to request the state to review and, if appropriate,'