comparison tests/test_textsearch.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 """
2 "test_search1":
3 Search for some text on a PDF page, and compare content of returned hit
4 rectangle with the searched text.
5
6 "test_search2":
7 Text search with 'clip' parameter - clip rectangle contains two occurrences
8 of searched text. Confirm search locations are inside clip.
9 """
10
11 import os
12
13 import pymupdf
14
# Absolute directory of this test module; the sample PDFs used below are
# addressed relative to its "resources" subdirectory.
scriptdir = os.path.abspath(os.path.dirname(__file__))
filename1, filename2, filename3 = (
    os.path.join(scriptdir, "resources", pdf_name)
    for pdf_name in ("2.pdf", "github_sample.pdf", "text-find-ligatures.pdf")
)
19
20
def test_search1():
    """Search a page for a word and check each hit rectangle's text.

    Opens ``filename1``, searches its first page for ``"mupdf"`` and
    asserts that at least one hit is returned and that the text extracted
    from every hit rectangle contains the needle (compared in lower case).
    """
    needle = "mupdf"
    # Use the document as a context manager so it is closed even when an
    # assertion fails.
    with pymupdf.open(filename1) as doc:
        page = doc[0]
        rlist = page.search_for(needle)
        assert rlist, f"no occurrence of {needle!r} found"
        for rect in rlist:
            # Lower-case the extracted text before comparing with the needle.
            assert needle in page.get_textbox(rect).lower()
29
30
def test_search2():
    """Search with a ``clip`` rectangle and check the hits lie inside it.

    Opens ``filename2``, searches its first page for ``"the"`` restricted
    to a fixed clip rectangle, and asserts that exactly two hits are
    returned and that each of them is contained in the clip.
    """
    needle = "the"
    clip = pymupdf.Rect(
        40.5, 228.31436157226562, 346.5226135253906, 239.5338592529297
    )
    # Use the document as a context manager so it is closed even when an
    # assertion fails.
    with pymupdf.open(filename2) as doc:
        page = doc[0]
        rl = page.search_for(needle, clip=clip)
        assert len(rl) == 2, f"expected 2 hits inside clip, got {len(rl)}"
        for r in rl:
            assert r in clip
40
41
def test_search3():
    """Ensure we find text whether or not it contains ligatures.

    Opens ``filename3`` and searches its first page for ``"flag"`` twice:
    once with ``TEXTFLAGS_SEARCH`` (expecting two hits) and once with
    ``TEXT_PRESERVE_LIGATURES`` added to the flags (expecting one hit).
    """
    needle = "flag"
    # Use the document as a context manager so it is closed even when an
    # assertion fails.
    with pymupdf.open(filename3) as doc:
        page = doc[0]
        hits = page.search_for(needle, flags=pymupdf.TEXTFLAGS_SEARCH)
        assert len(hits) == 2  # all occurrences found
        hits = page.search_for(
            needle,
            flags=pymupdf.TEXTFLAGS_SEARCH | pymupdf.TEXT_PRESERVE_LIGATURES,
        )
        assert len(hits) == 1  # only found text without ligatures