comparison tests/test_textextract.py @ 39:a6bc019ac0b2 upstream

ADD: PyMuPDF v1.26.5: the original sdist.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Oct 2025 11:19:58 +0200
parents 1d09e1dec1d9
children
comparison
equal deleted inserted replaced
2:b50eed0cc0ef 39:a6bc019ac0b2
261 #print(f' {text_utf8_expected[i]=}') 261 #print(f' {text_utf8_expected[i]=}')
262 assert text_utf8 == text_utf8_expected[i] 262 assert text_utf8 == text_utf8_expected[i]
263 263
264 264
265 def test_document_text(): 265 def test_document_text():
266 if os.environ.get('PYODIDE_ROOT'):
267 print('test_document_text(): not running on Pyodide - multiprocessing not available.')
268 return
269
266 import platform 270 import platform
267 import time 271 import time
268 272
269 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') 273 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf')
270 concurrency = None 274 concurrency = None
308 if _stats: 312 if _stats:
309 pymupdf._log_items_clear() 313 pymupdf._log_items_clear()
310 314
311 315
312 def test_4524(): 316 def test_4524():
317 if os.environ.get('PYODIDE_ROOT'):
318 print('test_4524(): not running on Pyodide - multiprocessing not available.')
319 return
313 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') 320 path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf')
314 print('') 321 print('')
315 document = pymupdf.Document(path) 322 document = pymupdf.Document(path)
316 texts_single = pymupdf.get_text(path, method='single', pages=[1, 3, 5]) 323 texts_single = pymupdf.get_text(path, method='single', pages=[1, 3, 5])
317 texts_mp = pymupdf.get_text(path, method='mp', pages=[1, 3, 5]) 324 texts_mp = pymupdf.get_text(path, method='mp', pages=[1, 3, 5])
329 print(f'Page {i}:') 336 print(f'Page {i}:')
330 if verbose: 337 if verbose:
331 for line in text.split('\n'): 338 for line in text.split('\n'):
332 print(f' {line!r}') 339 print(f' {line!r}')
333 print('='*40) 340 print('='*40)
341 wt = pymupdf.TOOLS.mupdf_warnings()
342 if pymupdf.mupdf_version_tuple < (1, 26, 8):
343 assert not wt
344 else:
345 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 2 times...'
334 346
335 347
336 def test_3687(): 348 def test_3687():
337 path1 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687.epub')) 349 path1 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687.epub'))
338 path2 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687-3.epub')) 350 path2 = pymupdf.open(os.path.normpath(f'{__file__}/../../tests/resources/test_3687-3.epub'))
373 texts1.append(text) 385 texts1.append(text)
374 386
375 assert texts1 == texts0 387 assert texts1 == texts0
376 388
377 wt = pymupdf.TOOLS.mupdf_warnings() 389 wt = pymupdf.TOOLS.mupdf_warnings()
378 if pymupdf.mupdf_version_tuple < (1, 27): 390 if pymupdf.mupdf_version_tuple >= (1, 27):
379 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
380 else:
381 expected = 'format error: No common ancestor in structure tree\nstructure tree broken, assume tree is missing' 391 expected = 'format error: No common ancestor in structure tree\nstructure tree broken, assume tree is missing'
382 expected = '\n'.join([expected] * 56) 392 expected = '\n'.join([expected] * 56)
383 assert wt == expected 393 assert wt == expected
394 elif pymupdf.mupdf_version_tuple >= (1, 26, 8):
395 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 7684 times...'
396 else:
397 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
384 398
385 def test_3650(): 399 def test_3650():
386 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf') 400 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf')
387 doc = pymupdf.Document(path) 401 doc = pymupdf.Document(path)
388 blocks = doc[0].get_text("blocks") 402 blocks = doc[0].get_text("blocks")
876 # We can't actually test with 1.23.5 because it uses `fitz.` not `pymupdf.`. 890 # We can't actually test with 1.23.5 because it uses `fitz.` not `pymupdf.`.
877 expected_1_23_5 = b'JOB No.: \nShipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\n\xe5\x9d\x80) \nSINORICH TRANSPORT LIMITED\nADD:7C,WEST BLDG.,ZHONGQU\nMANSION,211 ZHONGSHAN\nRD. SHANTOU,515041 CN\nTEL:0754-88570001 FAX:0754-88572709\nS/O No. '.decode() 891 expected_1_23_5 = b'JOB No.: \nShipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\n\xe5\x9d\x80) \nSINORICH TRANSPORT LIMITED\nADD:7C,WEST BLDG.,ZHONGQU\nMANSION,211 ZHONGSHAN\nRD. SHANTOU,515041 CN\nTEL:0754-88570001 FAX:0754-88572709\nS/O No. '.decode()
878 892
879 # This output is different from expected_1_23_5. 893 # This output is different from expected_1_23_5.
880 expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode() 894 expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode()
895
896 # This output is different from either of the two expected strings.
897 expected_mupdf_1_27_0 = b'JOB No.: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88572702 \n \xe6\x93\x8d\xe4\xbd\x9c\xe9\x83\xa8: \n88570008 \n88570004 \n \xe6\x96\x87\xe4\xbb\xb6\xe9\x83\xa8: \n88570003\n \nNotify Party(complete name and address, '.decode()
881 898
882 print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}') 899 print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}')
883 print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}') 900 print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}')
884 901
885 print(f'{pymupdf.version=}') 902 print(f'{pymupdf.version=}')
886 print(f'text is:\n{textwrap.indent(text, " ")}') 903 print(f'text is:\n{textwrap.indent(text, " ")}')
887 print(f'{text=}') 904 print(f'{text=}')
888 print(f'{text.encode()=}') 905 print(f'{text.encode()=}')
889 906
890 if pymupdf.mupdf_version_tuple >= (1, 26, 1): 907 wt = pymupdf.TOOLS.mupdf_warnings()
908 if pymupdf.mupdf_version_tuple >= (1, 26, 8):
909 assert text == expected_mupdf_1_27_0
910 assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 120 times...'
911 elif pymupdf.mupdf_version_tuple >= (1, 26, 1):
891 assert text == expected_mupdf_1_26_1 912 assert text == expected_mupdf_1_26_1
913 assert not wt
892 else: 914 else:
893 print(f'No expected output for {pymupdf.mupdf_version_tuple=}') 915 print(f'No expected output for {pymupdf.mupdf_version_tuple=}')
916 assert not wt
894 917
895 918
896 def test_4503(): 919 def test_4503():
897 # Check detection of strikeout text. Behaviour is improved with 920 # Check detection of strikeout text. Behaviour is improved with
898 # mupdf>=1.26.2, and fixed with mupdf>=1.26.3. 921 # mupdf>=1.26.2, and fixed with mupdf>=1.26.3.