comparison tests/test_tesseract.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children a6bc019ac0b2
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 import os
2 import platform
3 import textwrap
4
5 import pymupdf
6
def test_tesseract():
    '''
    Check that MuPDF has been built with Tesseract support.

    By default we don't supply a valid `tessdata` directory, and just assert
    that attempting to use Tesseract raises the expected error (receiving
    that specific error shows that MuPDF is built with Tesseract support).

    But if TESSDATA_PREFIX is set in the environment, we assert that
    page.get_textpage_ocr() succeeds instead.
    '''
    path = os.path.abspath(f'{__file__}/../resources/2.pdf')

    # Work out which error text (and, for the rebased implementation, which
    # exception class) a failed OCR initialisation is expected to produce.
    rebased = hasattr(pymupdf, 'mupdf')
    if rebased:
        # The error text changed in MuPDF 1.25.4.
        if pymupdf.mupdf_version_tuple < (1, 25, 4):
            tail = 'OCR initialisation failed'
        else:
            tail = 'Tesseract language initialisation failed'
        e_expected = f'code=3: {tail}'
        if platform.system() == 'OpenBSD':
            # 2023-12-12: For some reason the SWIG catch code only catches
            # the exception as FzErrorBase.
            e_expected_type = pymupdf.mupdf.FzErrorBase
            print('OpenBSD workaround - expecting FzErrorBase, not FzErrorLibrary.')
        else:
            e_expected_type = pymupdf.mupdf.FzErrorLibrary
    else:
        # classic.
        e_expected = 'OCR initialisation failed'
        e_expected_type = None

    # Use a context manager, like the other tests in this file, so the
    # document is closed even when an assertion below fails.
    with pymupdf.open(path) as doc:
        page = doc[5]

        if os.environ.get('TESSDATA_PREFIX'):
            # Real tessdata is available, so OCR is expected to work.
            page.get_textpage_ocr(full=True)
            print('test_tesseract(): page.get_textpage_ocr() succeeded')
            return

        try:
            # Deliberately bogus tessdata directory; this must fail.
            page.get_textpage_ocr(full=True, tessdata='/foo/bar')
        except Exception as e:
            e_text = str(e)
            print('Received exception as expected.')
            print(f'{type(e)=}')
            print(f'{e_text=}')
            assert e_text == e_expected, f'Unexpected exception: {e_text!r}'
            if e_expected_type is not None:
                print(f'{e_expected_type=}')
                # Exact class match is intended, so compare by identity.
                assert type(e) is e_expected_type, f'{type(e)=} != {e_expected_type=}.'
        else:
            assert 0, f'Expected exception {e_expected!r}'

        if rebased:
            # Before 1.25.4 the failure also leaves a warning behind; from
            # 1.25.4 onwards no warning text is expected.
            wt = pymupdf.TOOLS.mupdf_warnings()
            if pymupdf.mupdf_version_tuple < (1, 25, 4):
                assert wt == (
                    'UNHANDLED EXCEPTION!\n'
                    'library error: Tesseract initialisation failed'
                )
            else:
                assert not wt
67
68
def test_3842b():
    # Verify that Tesseract fails when it is asked for a bogus language.
    #
    # Tesseract appears to print diagnostics of its own while this runs.
    #
    pdf_path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf')
    not_installed = 'No tessdata specified and Tesseract is not installed'
    with pymupdf.open(pdf_path) as pdf:
        ocr_page = pdf[6]
        try:
            ocr_result = ocr_page.get_textpage_ocr(flags=0, full=False, language='qwerty')
        except Exception as e:
            print(f'test_3842b(): received exception: {e}')
            message = str(e)
            if not_installed in message:
                # Tesseract is unavailable here; nothing more to verify.
                return
            if pymupdf.mupdf_version_tuple >= (1, 25, 4):
                assert 'Tesseract language initialisation failed' in message
                return
            # Older MuPDF: different error text, plus the same warning
            # recorded twice.
            assert 'OCR initialisation failed' in message
            wt = pymupdf.TOOLS.mupdf_warnings()
            wt_expected = (
                'UNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed\n'
                'UNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed'
            )
            assert wt == wt_expected, f'Unexpected {wt=}'
91
92
def test_3842():
    '''
    Run partial-page OCR on page 6 of test_3842.pdf and compare the text.

    If OCR cannot be initialised (Tesseract not installed, or its language
    data cannot be loaded) the exception is accepted and nothing further
    is checked; any other exception text fails the test.
    '''
    pdf_path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf')
    tolerated = (
        'No tessdata specified and Tesseract is not installed',
        'Tesseract language initialisation failed',
    )
    with pymupdf.open(pdf_path) as pdf:
        page = pdf[6]
        try:
            partial_tp = page.get_textpage_ocr(flags=0, full=False)
        except Exception as e:
            print(f'test_3842(): received exception: {e}', flush=True)
            if not any(fragment in str(e) for fragment in tolerated):
                assert 0, f'Unexpected exception text: {str(e)=}'
            return

        text = page.get_text(textpage=partial_tp)
        print()
        print(text)
        print(f'text:\n{text!r}')

        # 2024-11-29: This is the current incorrect output. Underscores stand
        # in for lines made up entirely of whitespace, because
        # textwrap.dedent() would otherwise turn them into empty lines.
        #
        # NOTE(review): the lines below are reproduced as they appear in the
        # reviewed copy; confirm against upstream that no trailing whitespace
        # was lost.
        dedented = textwrap.dedent('''
                NIST SP 800-223
                _
                High-Performance Computing Security
                February 2024
                _
                __
                iii
                Table of Contents
                1. Introduction ...................................................................................................................................1
                2. HPC System Reference Architecture and Main Components ............................................................2
                2.1.1. Components of the High-Performance Computing Zone ............................................................. 3
                2.1.2. Components of the Data Storage Zone ........................................................................................ 4
                2.1.3. Parallel File System ....................................................................................................................... 4
                2.1.4. Archival and Campaign Storage .................................................................................................... 5
                2.1.5. Burst Buffer .................................................................................................................................. 5
                2.1.6. Components of the Access Zone .................................................................................................. 6
                2.1.7. Components of the Management Zone ....................................................................................... 6
                2.1.8. General Architecture and Characteristics .................................................................................... 6
                2.1.9. Basic Services ................................................................................................................................ 7
                2.1.10. Configuration Management ....................................................................................................... 7
                2.1.11. HPC Scheduler and Workflow Management .............................................................................. 7
                2.1.12. HPC Software .............................................................................................................................. 8
                2.1.13. User Software ............................................................................................................................. 8
                2.1.14. Site-Provided Software and Vendor Software ........................................................................... 8
                2.1.15. Containerized Software in HPC .................................................................................................. 9
                3. HPC Threat Analysis...................................................................................................................... 10
                3.2.1. Access Zone Threats ................................................................................................................... 11
                3.2.2. Management Zone Threats ........................................................................................................ 11
                3.2.3. High-Performance Computing Zone Threats .............................................................................. 12
                3.2.4. Data Storage Zone Threats ......................................................................................................... 12
                4. HPC Security Posture, Challenges, and Recommendations ............................................................. 14
                5. Conclusions .................................................................................................................................. 19
                ''')
        # Drop the newline that follows the opening quotes, then restore the
        # whitespace-only lines.
        text_expected = dedented[1:].replace('_', ' ')
        print(f'text_expected:\n{text_expected!r}')
        assert text == text_expected