comparison mupdf-source/thirdparty/tesseract/src/ccstruct/werd.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: werd.h
3 * Description: Code for the WERD class.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #ifndef WERD_H
20 #define WERD_H
21
22 #include "elst2.h"
23 #include "params.h"
24 #include "stepblob.h"
25
26 #include <bitset>
27
28 namespace tesseract {
29
// Bit positions of the per-word flags. Each enumerator is used as an index
// into the flag set read and written by WERD::flag() / WERD::set_flag().
enum WERD_FLAGS {
  W_SEGMENTED = 0,          ///< word is correctly segmented
  W_ITALIC = 1,             ///< text is italic
  W_BOLD = 2,               ///< text is bold
  W_BOL = 3,                ///< word is at the start of a line
  W_EOL = 4,                ///< word is at the end of a line
  W_NORMALIZED = 5,         ///< flags (NOTE(review): meaning not stated here)
  W_SCRIPT_HAS_XHEIGHT = 6, ///< the x-height concept makes sense for the script
  W_SCRIPT_IS_LATIN = 7,    ///< special-case Latin for y. splitting
  W_DONT_CHOP = 8,          ///< fixed pitch chopped
  W_REP_CHAR = 9,           ///< repeated character
  W_FUZZY_SP = 10,          ///< fuzzy space
  W_FUZZY_NON = 11,         ///< fuzzy nonspace
  W_INVERSE = 12            ///< white on black
};
45
// Bit-number allocations for the display flags of a word.
enum DISPLAY_FLAGS {
  DF_BOX = 0,          ///< bounding box
  DF_TEXT = 1,         ///< correct ASCII text
  DF_POLYGONAL = 2,    ///< polygonal approximation
  DF_EDGE_STEP = 3,    ///< edge steps
  DF_BN_POLYGONAL = 4, ///< BL-normalised polygonal approximation
  DF_BLAMER = 5        ///< blamer information
};
55
56 class ROW; // forward decl
57
58 class TESS_API WERD : public ELIST2_LINK {
59 public:
60 WERD() = default;
61 // WERD constructed with:
62 // blob_list - blobs of the word (we take this list's contents)
63 // blanks - number of blanks before the word
64 // text - correct text (outlives WERD)
65 WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text);
66
67 // WERD constructed from:
68 // blob_list - blobs in the word
69 // clone - werd to clone flags, etc from.
70 WERD(C_BLOB_LIST *blob_list, WERD *clone);
71
72 // Construct a WERD from a single_blob and clone the flags from this.
73 // W_BOL and W_EOL flags are set according to the given values.
74 WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob);
75
76 ~WERD() = default;
77
78 // assignment
79 WERD &operator=(const WERD &source);
80
81 // This method returns a new werd constructed using the blobs in the input
82 // all_blobs list, which correspond to the blobs in this werd object. The
83 // blobs used to construct the new word are consumed and removed from the
84 // input all_blobs list.
85 // Returns nullptr if the word couldn't be constructed.
86 // Returns original blobs for which no matches were found in the output list
87 // orphan_blobs (appends).
88 WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs);
89
90 // Accessors for reject / DUFF blobs in various formats
91 C_BLOB_LIST *rej_cblob_list() { // compact format
92 return &rej_cblobs;
93 }
94
95 // Accessors for good blobs in various formats.
96 C_BLOB_LIST *cblob_list() { // get compact blobs
97 return &cblobs;
98 }
99
100 uint8_t space() const { // access function
101 return blanks;
102 }
103 void set_blanks(uint8_t new_blanks) {
104 blanks = new_blanks;
105 }
106 int script_id() const {
107 return script_id_;
108 }
109 void set_script_id(int id) {
110 script_id_ = id;
111 }
112
113 // Returns the (default) bounding box including all the dots.
114 TBOX bounding_box() const; // compute bounding box
115 // Returns the bounding box including the desired combination of upper and
116 // lower noise/diacritic elements.
117 TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
118 // Returns the bounding box of only the good blobs.
119 TBOX true_bounding_box() const;
120
121 const char *text() const {
122 return correct.c_str();
123 }
124 void set_text(const char *new_text) {
125 correct = new_text;
126 }
127
128 bool flag(WERD_FLAGS mask) const {
129 return flags[mask];
130 }
131 void set_flag(WERD_FLAGS mask, bool value) {
132 flags.set(mask, value);
133 }
134
135 bool display_flag(uint8_t flag) const {
136 return disp_flags[flag];
137 }
138 void set_display_flag(uint8_t flag, bool value) {
139 disp_flags.set(flag, value);
140 }
141
142 WERD *shallow_copy(); // shallow copy word
143
144 // reposition word by vector
145 void move(const ICOORD vec);
146
147 // join other's blobs onto this werd, emptying out other.
148 void join_on(WERD *other);
149
150 // copy other's blobs onto this word, leaving other intact.
151 void copy_on(WERD *other);
152
153 // tprintf word metadata (but not blob innards)
154 void print() const;
155
156 #ifndef GRAPHICS_DISABLED
157 // plot word on window in a uniform colour
158 void plot(ScrollView *window, ScrollView::Color colour);
159
160 // Get the next color in the (looping) rainbow.
161 static ScrollView::Color NextColor(ScrollView::Color colour);
162
163 // plot word on window in a rainbow of colours
164 void plot(ScrollView *window);
165
166 // plot rejected blobs in a rainbow of colours
167 void plot_rej_blobs(ScrollView *window);
168 #endif // !GRAPHICS_DISABLED
169
170 // Removes noise from the word by moving small outlines to the rej_cblobs
171 // list, based on the size_threshold.
172 void CleanNoise(float size_threshold);
173
174 // Extracts all the noise outlines and stuffs the pointers into the given
175 // vector of outlines. Afterwards, the outlines vector owns the pointers.
176 void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
177 // Adds the selected outlines to the indcated real blobs, and puts the rest
178 // back in rej_cblobs where they came from. Where the target_blobs entry is
179 // nullptr, a run of wanted outlines is put into a single new blob.
180 // Ownership of the outlines is transferred back to the word. (Hence
181 // vector and not PointerVector.)
182 // Returns true if any new blob was added to the start of the word, which
183 // suggests that it might need joining to the word before it, and likewise
184 // sets make_next_word_fuzzy true if any new blob was added to the end.
185 bool AddSelectedOutlines(const std::vector<bool> &wanted,
186 const std::vector<C_BLOB *> &target_blobs,
187 const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
188
189 private:
190 uint8_t blanks = 0; // no of blanks
191 std::bitset<16> flags; // flags about word
192 std::bitset<16> disp_flags; // display flags
193 int16_t script_id_ = 0; // From unicharset.
194 std::string correct; // correct text
195 C_BLOB_LIST cblobs; // compacted blobs
196 C_BLOB_LIST rej_cblobs; // DUFF blobs
197 };
198
199 ELIST2IZEH(WERD)
200
201 } // namespace tesseract
202
203 #include "ocrrow.h" // placed here due to
204
205 namespace tesseract {
206
// Comparison callback with the signature expected by qsort(3): orders words
// by increasing left edge.
int word_comparator(const void *word1p, const void *word2p);
209
210 } // namespace tesseract
211
212 #endif