Mercurial > hgrepos > Python2 > PyMuPDF
comparison tests/test_textextract.py @ 39:a6bc019ac0b2 upstream
ADD: PyMuPDF v1.26.5: the original sdist.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 11:19:58 +0200 |
| parents | 1d09e1dec1d9 |
| children |
comparison
equal
deleted
inserted
replaced
| 2:b50eed0cc0ef | 39:a6bc019ac0b2 |
|---|---|
| 261 #print(f' {text_utf8_expected[i]=}') | 261 #print(f' {text_utf8_expected[i]=}') |
| 262 assert text_utf8 == text_utf8_expected[i] | 262 assert text_utf8 == text_utf8_expected[i] |
| 263 | 263 |
| 264 | 264 |
| 265 def test_document_text(): | 265 def test_document_text(): |
| 266 if os.environ.get('PYODIDE_ROOT'): | |
| 267 print('test_document_text(): not running on Pyodide - multiprocessing not available.') | |
| 268 return | |
| 269 | |
| 266 import platform | 270 import platform |
| 267 import time | 271 import time |
| 268 | 272 |
| 269 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') | 273 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') |
| 270 concurrency = None | 274 concurrency = None |
| 308 if _stats: | 312 if _stats: |
| 309 pymupdf._log_items_clear() | 313 pymupdf._log_items_clear() |
| 310 | 314 |
| 311 | 315 |
| 312 def test_4524(): | 316 def test_4524(): |
| 317 if os.environ.get('PYODIDE_ROOT'): | |
| 318 print('test_4524(): not running on Pyodide - multiprocessing not available.') | |
| 319 return | |
| 313 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') | 320 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') |
| 314 print('') | 321 print('') |
| 315 document = pymupdf.Document(path) | 322 document = pymupdf.Document(path) |
| 316 texts_single = pymupdf.get_text(path, method='single', pages=[1, 3, 5]) | 323 texts_single = pymupdf.get_text(path, method='single', pages=[1, 3, 5]) |
| 317 texts_mp = pymupdf.get_text(path, method='mp', pages=[1, 3, 5]) | 324 texts_mp = pymupdf.get_text(path, method='mp', pages=[1, 3, 5]) |
| 329 print(f'Page {i}:') | 336 print(f'Page {i}:') |
| 330 if verbose: | 337 if verbose: |
| 331 for line in text.split('\n'): | 338 for line in text.split('\n'): |
| 332 print(f' {line!r}') | 339 print(f' {line!r}') |
| 333 print('='*40) | 340 print('='*40) |
| 341 wt = pymupdf.TOOLS.mupdf_warnings() | |
| 342 if pymupdf.mupdf_version_tuple < (1, 26, 8): | |
| 343 assert not wt | |
| 344 else: | |
| 345 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 2 times...' | |
| 334 | 346 |
| 335 | 347 |
| 336 def test_3687(): | 348 def test_3687(): |
| 337 path1 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687.epub')) | 349 path1 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687.epub')) |
| 338 path2 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687-3.epub')) | 350 path2 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687-3.epub')) |
| 373 texts1.append(text) | 385 texts1.append(text) |
| 374 | 386 |
| 375 assert texts1 == texts0 | 387 assert texts1 == texts0 |
| 376 | 388 |
| 377 wt = pymupdf.TOOLS.mupdf_warnings() | 389 wt = pymupdf.TOOLS.mupdf_warnings() |
| 378 if pymupdf.mupdf_version_tuple < (1, 27): | 390 if pymupdf.mupdf_version_tuple >= (1, 27): |
| 379 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...' | |
| 380 else: | |
| 381 expected = 'format error: No common ancestor in structure tree\nstructure tree broken, assume tree is missing' | 391 expected = 'format error: No common ancestor in structure tree\nstructure tree broken, assume tree is missing' |
| 382 expected = '\n'.join([expected] * 56) | 392 expected = '\n'.join([expected] * 56) |
| 383 assert wt == expected | 393 assert wt == expected |
| 394 elif pymupdf.mupdf_version_tuple >= (1, 26, 8): | |
| 395 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 7684 times...' | |
| 396 else: | |
| 397 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...' | |
| 384 | 398 |
| 385 def test_3650(): | 399 def test_3650(): |
| 386 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf') | 400 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf') |
| 387 doc = pymupdf.Document(path) | 401 doc = pymupdf.Document(path) |
| 388 blocks = doc[0].get_text("blocks") | 402 blocks = doc[0].get_text("blocks") |
| 876 # We can't actually test with 1.23.5 because it uses `fitz.` not `pymupdf.`. | 890 # We can't actually test with 1.23.5 because it uses `fitz.` not `pymupdf.`. |
| 877 expected_1_23_5 = b'JOB No.: \nShipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\n\xe5\x9d\x80) \nSINORICH TRANSPORT LIMITED\nADD:7C,WEST BLDG.,ZHONGQU\nMANSION,211 ZHONGSHAN\nRD. SHANTOU,515041 CN\nTEL:0754-88570001 FAX:0754-88572709\nS/O No. '.decode() | 891 expected_1_23_5 = b'JOB No.: \nShipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\n\xe5\x9d\x80) \nSINORICH TRANSPORT LIMITED\nADD:7C,WEST BLDG.,ZHONGQU\nMANSION,211 ZHONGSHAN\nRD. SHANTOU,515041 CN\nTEL:0754-88570001 FAX:0754-88572709\nS/O No. '.decode() |
| 878 | 892 |
| 879 # This output is different from expected_1_23_5. | 893 # This output is different from expected_1_23_5. |
| 880 expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode() | 894 expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode() |
| 895 | |
| 896 # This output is different from either of the two expected strings. | |
| 897 expected_mupdf_1_27_0 = b'JOB No.: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88572702 \n \xe6\x93\x8d\xe4\xbd\x9c\xe9\x83\xa8: \n88570008 \n88570004 \n \xe6\x96\x87\xe4\xbb\xb6\xe9\x83\xa8: \n88570003\n \nNotify Party(complete name and address, '.decode() | |
| 881 | 898 |
| 882 print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}') | 899 print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}') |
| 883 print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}') | 900 print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}') |
| 884 | 901 |
| 885 print(f'{pymupdf.version=}') | 902 print(f'{pymupdf.version=}') |
| 886 print(f'text is:\n{textwrap.indent(text, " ")}') | 903 print(f'text is:\n{textwrap.indent(text, " ")}') |
| 887 print(f'{text=}') | 904 print(f'{text=}') |
| 888 print(f'{text.encode()=}') | 905 print(f'{text.encode()=}') |
| 889 | 906 |
| 890 if pymupdf.mupdf_version_tuple >= (1, 26, 1): | 907 wt = pymupdf.TOOLS.mupdf_warnings() |
| 908 if pymupdf.mupdf_version_tuple >= (1, 26, 8): | |
| 909 assert text == expected_mupdf_1_27_0 | |
| 910 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 120 times...' | |
| 911 elif pymupdf.mupdf_version_tuple >= (1, 26, 1): | |
| 891 assert text == expected_mupdf_1_26_1 | 912 assert text == expected_mupdf_1_26_1 |
| 913 assert not wt | |
| 892 else: | 914 else: |
| 893 print(f'No expected output for {pymupdf.mupdf_version_tuple=}') | 915 print(f'No expected output for {pymupdf.mupdf_version_tuple=}') |
| 916 assert not wt | |
| 894 | 917 |
| 895 | 918 |
| 896 def test_4503(): | 919 def test_4503(): |
| 897 # Check detection of strikeout text. Behaviour is improved with | 920 # Check detection of strikeout text. Behaviour is improved with |
| 898 # mupdf>=1.26.2, and fixed with mupdf>=1.26.3. | 921 # mupdf>=1.26.2, and fixed with mupdf>=1.26.3. |
