comparison tests/test_word_delimiters.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 import pymupdf
2 import string
3
4
def test_delimiters():
    """Test changing word delimiting characters.

    Inserts one line of text into a fresh page and extracts its words three
    times: with the default delimiters, with all ASCII punctuation added as
    delimiters, and with the defaults again. The last extraction checks that
    passing ``delimiters`` affects only that single call.
    """
    text = "word1,word2 - word3. word4?word5."

    # Use the document as a context manager so it is closed deterministically,
    # even when one of the assertions below fails.
    with pymupdf.open() as doc:
        page = doc.new_page()
        page.insert_text((50, 50), text)

        # Standard words extraction:
        # only spaces and line breaks start a new word.
        # Item 4 of each extracted entry is the word string compared below.
        words0 = [w[4] for w in page.get_text("words")]
        assert words0 == ["word1,word2", "-", "word3.", "word4?word5."]

        # Extract words again, this time also splitting at every punctuation
        # character; the punctuation itself must not appear in the result.
        words1 = [
            w[4] for w in page.get_text("words", delimiters=string.punctuation)
        ]
        assert words0 != words1
        assert " ".join(words1) == "word1 word2 word3 word4 word5"

        # Confirm we get the old extraction again: the custom delimiters
        # must not persist beyond the call that specified them.
        assert [w[4] for w in page.get_text("words")] == words0