Mercurial > hgrepos > Python2 > PyMuPDF
comparison tests/test_tesseract.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children | a6bc019ac0b2 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 1:1d09e1dec1d9 |
|---|---|
| 1 import os | |
| 2 import platform | |
| 3 import textwrap | |
| 4 | |
| 5 import pymupdf | |
| 6 | |
| 7 def test_tesseract(): | |
| 8 ''' | |
| 9 This checks that MuPDF has been built with tesseract support. | |
| 10 | |
| 11 By default we don't supply a valid `tessdata` directory, and just assert | |
| 12 that attempting to use Tesseract raises the expected error (which checks | |
| 13 that MuPDF is built with Tesseract support). | |
| 14 | |
| 15 But if TESSDATA_PREFIX is set in the environment, we assert that | |
| 16 FzPage.get_textpage_ocr() succeeds. | |
| 17 ''' | |
| 18 path = os.path.abspath( f'{__file__}/../resources/2.pdf') | |
| 19 doc = pymupdf.open( path) | |
| 20 page = doc[5] | |
| 21 if hasattr(pymupdf, 'mupdf'): | |
| 22 # rebased. | |
| 23 if pymupdf.mupdf_version_tuple < (1, 25, 4): | |
| 24 tail = 'OCR initialisation failed' | |
| 25 else: | |
| 26 tail = 'Tesseract language initialisation failed' | |
| 27 e_expected = f'code=3: {tail}' | |
| 28 if platform.system() == 'OpenBSD': | |
| 29 # 2023-12-12: For some reason the SWIG catch code only catches | |
| 30 # the exception as FzErrorBase. | |
| 31 e_expected_type = pymupdf.mupdf.FzErrorBase | |
| 32 print(f'OpenBSD workaround - expecting FzErrorBase, not FzErrorLibrary.') | |
| 33 else: | |
| 34 e_expected_type = pymupdf.mupdf.FzErrorLibrary | |
| 35 else: | |
| 36 # classic. | |
| 37 e_expected = 'OCR initialisation failed' | |
| 38 e_expected_type = None | |
| 39 tessdata_prefix = os.environ.get('TESSDATA_PREFIX') | |
| 40 if tessdata_prefix: | |
| 41 tp = page.get_textpage_ocr(full=True) | |
| 42 print(f'test_tesseract(): page.get_textpage_ocr() succeeded') | |
| 43 else: | |
| 44 try: | |
| 45 tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar') | |
| 46 except Exception as e: | |
| 47 e_text = str(e) | |
| 48 print(f'Received exception as expected.') | |
| 49 print(f'{type(e)=}') | |
| 50 print(f'{e_text=}') | |
| 51 assert e_text == e_expected, f'Unexpected exception: {e_text!r}' | |
| 52 if e_expected_type: | |
| 53 print(f'{e_expected_type=}') | |
| 54 assert type(e) == e_expected_type, f'{type(e)=} != {e_expected_type=}.' | |
| 55 else: | |
| 56 assert 0, f'Expected exception {e_expected!r}' | |
| 57 rebased = hasattr(pymupdf, 'mupdf') | |
| 58 if rebased: | |
| 59 wt = pymupdf.TOOLS.mupdf_warnings() | |
| 60 if pymupdf.mupdf_version_tuple < (1, 25, 4): | |
| 61 assert wt == ( | |
| 62 'UNHANDLED EXCEPTION!\n' | |
| 63 'library error: Tesseract initialisation failed' | |
| 64 ) | |
| 65 else: | |
| 66 assert not wt | |
| 67 | |
| 68 | |
| 69 def test_3842b(): | |
| 70 # Check Tesseract failure when given a bogus language. | |
| 71 # | |
| 72 # Note that Tesseract seems to output its own diagnostics. | |
| 73 # | |
| 74 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf') | |
| 75 with pymupdf.open(path) as document: | |
| 76 page = document[6] | |
| 77 try: | |
| 78 partial_tp = page.get_textpage_ocr(flags=0, full=False, language='qwerty') | |
| 79 except Exception as e: | |
| 80 print(f'test_3842b(): received exception: {e}') | |
| 81 if 'No tessdata specified and Tesseract is not installed' in str(e): | |
| 82 pass | |
| 83 else: | |
| 84 if pymupdf.mupdf_version_tuple < (1, 25, 4): | |
| 85 assert 'OCR initialisation failed' in str(e) | |
| 86 wt = pymupdf.TOOLS.mupdf_warnings() | |
| 87 assert wt == 'UNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed\nUNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed', \ | |
| 88 f'Unexpected {wt=}' | |
| 89 else: | |
| 90 assert 'Tesseract language initialisation failed' in str(e) | |
| 91 | |
| 92 | |
| 93 def test_3842(): | |
| 94 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf') | |
| 95 with pymupdf.open(path) as document: | |
| 96 page = document[6] | |
| 97 try: | |
| 98 partial_tp = page.get_textpage_ocr(flags=0, full=False) | |
| 99 except Exception as e: | |
| 100 print(f'test_3842(): received exception: {e}', flush=1) | |
| 101 if 'No tessdata specified and Tesseract is not installed' in str(e): | |
| 102 pass | |
| 103 elif 'Tesseract language initialisation failed' in str(e): | |
| 104 pass | |
| 105 else: | |
| 106 assert 0, f'Unexpected exception text: {str(e)=}' | |
| 107 else: | |
| 108 text = page.get_text(textpage=partial_tp) | |
| 109 print() | |
| 110 print(text) | |
| 111 print(f'text:\n{text!r}') | |
| 112 | |
| 113 # 2024-11-29: This is the current incorrect output. We use | |
| 114 # underscores for lines containing entirely whitespace (which | |
| 115 # textwrap.dedent() unfortunately replaces with empty lines). | |
| 116 text_expected = textwrap.dedent(''' | |
| 117 NIST SP 800-223 | |
| 118 _ | |
| 119 High-Performance Computing Security | |
| 120 February 2024 | |
| 121 _ | |
| 122 __ | |
| 123 iii | |
| 124 Table of Contents | |
| 125 1. Introduction ...................................................................................................................................1 | |
| 126 2. HPC System Reference Architecture and Main Components ............................................................2 | |
| 127 2.1.1. Components of the High-Performance Computing Zone ............................................................. 3 | |
| 128 2.1.2. Components of the Data Storage Zone ........................................................................................ 4 | |
| 129 2.1.3. Parallel File System ....................................................................................................................... 4 | |
| 130 2.1.4. Archival and Campaign Storage .................................................................................................... 5 | |
| 131 2.1.5. Burst Buffer .................................................................................................................................. 5 | |
| 132 2.1.6. Components of the Access Zone .................................................................................................. 6 | |
| 133 2.1.7. Components of the Management Zone ....................................................................................... 6 | |
| 134 2.1.8. General Architecture and Characteristics .................................................................................... 6 | |
| 135 2.1.9. Basic Services ................................................................................................................................ 7 | |
| 136 2.1.10. Configuration Management ....................................................................................................... 7 | |
| 137 2.1.11. HPC Scheduler and Workflow Management .............................................................................. 7 | |
| 138 2.1.12. HPC Software .............................................................................................................................. 8 | |
| 139 2.1.13. User Software ............................................................................................................................. 8 | |
| 140 2.1.14. Site-Provided Software and Vendor Software ........................................................................... 8 | |
| 141 2.1.15. Containerized Software in HPC .................................................................................................. 9 | |
| 142 3. HPC Threat Analysis...................................................................................................................... 10 | |
| 143 3.2.1. Access Zone Threats ................................................................................................................... 11 | |
| 144 3.2.2. Management Zone Threats ........................................................................................................ 11 | |
| 145 3.2.3. High-Performance Computing Zone Threats .............................................................................. 12 | |
| 146 3.2.4. Data Storage Zone Threats ......................................................................................................... 12 | |
| 147 4. HPC Security Posture, Challenges, and Recommendations ............................................................. 14 | |
| 148 5. Conclusions .................................................................................................................................. 19 | |
| 149 ''', | |
| 150 )[1:].replace('_', ' ') | |
| 151 print(f'text_expected:\n{text_expected!r}') | |
| 152 assert text == text_expected |
