Mercurial > hgrepos > Python2 > PyMuPDF
annotate tests/test_word_delimiters.py @ 46:7ee69f120f19 default tip
>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:30 +0200 |
| parents | 1d09e1dec1d9 |
| children |
| rev | line source |
|---|---|
|
1
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
1 import pymupdf |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
2 import string |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
3 |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
4 |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
5 def test_delimiters(): |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
6 """Test changing word delimiting characters.""" |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
7 doc = pymupdf.open() |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
8 page = doc.new_page() |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
9 text = "word1,word2 - word3. word4?word5." |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
10 page.insert_text((50, 50), text) |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
11 |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
12 # Standard words extraction: |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
13 # only spaces and line breaks start a new word |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
14 words0 = [w[4] for w in page.get_text("words")] |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
15 assert words0 == ["word1,word2", "-", "word3.", "word4?word5."] |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
16 |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
17 # extract words again |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
18 words1 = [w[4] for w in page.get_text("words", delimiters=string.punctuation)] |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
19 assert words0 != words1 |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
20 assert " ".join(words1) == "word1 word2 word3 word4 word5" |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
21 |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
22 # confirm we will be getting old extraction |
|
1d09e1dec1d9
ADD: PyMuPDF v1.26.4: the original sdist.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
23 assert [w[4] for w in page.get_text("words")] == words0 |
