Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/arch/intsimdmatrix.h @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: intsimdmatrix.h | |
| 3 // Description: Base class for 8-bit int SIMD matrix multipliers. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2017, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 /////////////////////////////////////////////////////////////////////// | |
| 17 | |
| 18 #ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_ | |
| 19 #define TESSERACT_ARCH_INTSIMDMATRIX_H_ | |
| 20 | |
| 21 #include <tesseract/export.h> | |
| 22 | |
| 23 #include <cstdint> | |
| 24 #include <vector> | |
| 25 | |
| 26 #include "tesstypes.h" | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 template <class T> | |
| 31 class GENERIC_2D_ARRAY; | |
| 32 | |
| 33 // Base class for a SIMD function to multiply a matrix by a vector, with sources | |
| 34 // of 8-bit signed integer, and result in a TFloat, after appropriate scaling. | |
| 35 // Assumes a specific method of multiplication that can be applied to any size | |
| 36 // and number of SIMD registers as follows: | |
| 37 // int32_t results are computed with num_outputs_per_register_ in each of | |
| 38 // max_output_registers_ result registers, repeatedly until it would make too | |
| 39 // many results, then the number of registers is halved, and so on down to a | |
| 40 // single result register. The last calculation only outputs the required number | |
| 41 // of results instead of writing beyond the bounds. Eg: matrix has 75 outputs, | |
| 42 // num_outputs_per_register_ = 4, and max_output_registers_ = 8, | |
| 43 // Step 1: 8x4=32 results are computed, | |
| 44 // Step 2: 8x4=32 again, total 64, | |
| 45 // Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72, | |
| 46 // Step 4: 1x3, total 75. | |
| 47 // Each step above is computed using a PartialFunc, which runs over the input | |
| 48 // vector once. The input is read one registerful of num_inputs_per_register_ | |
| 49 // at a time (presumably 4x num_outputs_per_register_ since they are int8_t) | |
| 50 // so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_. | |
| 51 // Since it is slow (on Intel at least) to horizontally add in a register, | |
| 52 // provision is made to process num_inputs_per_group_ inputs at a time, with | |
| 53 // the group being replicated num_input_groups_ times and multiplied by a | |
| 54 // num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix. | |
| 55 // This is most convenient if num_inputs_per_group_ is 4, and the product | |
| 56 // sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent | |
| 57 // results in the process, but it doesn't have to be implemented that way. | |
| 58 // The weights are re-ordered by Init() to be used sequentially by the above | |
| 59 // algorithm, followed by the biases, so they can be added at the end. | |
| 60 // The base class computes the base C++ implementation. | |
| 61 // NOTE that, although the subclasses execute on different SIMD hardware, no | |
| 62 // virtual methods are needed, as the constructor sets up everything that | |
| 63 // is required to allow the base class implementation to do all the work. | |
| 64 struct TESS_API IntSimdMatrix { | |
| 65 // Computes a reshaped copy of the weight matrix w; shaped_w and rounded_num_out are non-const references (presumably the outputs). | |
| 66 void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w, | |
| 67 int32_t &rounded_num_out) const; | |
| 68 | |
| 69 // Rounds the size up to a multiple of num_inputs_per_register_ (the input register size in int8_t). | |
| 70 int RoundInputs(int size) const { | |
| 71 return Roundup(size, num_inputs_per_register_); | |
| 72 } | |
| 73 // Rounds the size up to a multiple of num_outputs_per_register_ (the output register size in int32_t). | |
| 74 int RoundOutputs(int size) const { | |
| 75 return Roundup(size, num_outputs_per_register_); | |
| 76 } | |
| 77 | |
| 78 // Computes matrix.vector v = Wu. | |
| 79 // u is of size W.dim2() - 1 and the output v is of size W.dim1(). | |
| 80 // u is imagined to have an extra element at the end with value 1, to | |
| 81 // implement the bias, but it doesn't actually have it. | |
| 82 // Computes the base C++ implementation (static, so it reads none of the per-instance register sizes). | |
| 83 static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales, | |
| 84 const int8_t *u, TFloat *v); | |
| 85 | |
| 86 // Rounds the input up to a multiple of the given factor. factor must be non-zero (it is a divisor); intended for non-negative input. | |
| 87 static int Roundup(int input, int factor) { | |
| 88 return (input + factor - 1) / factor * factor; | |
| 89 } | |
| 90 | |
| 91 // Computes matrix.vector v = Wu. | |
| 92 // u is of size W.dim2() - 1 and the output v is of size W.dim1(). | |
| 93 // u is imagined to have an extra element at the end with value 1, to | |
| 94 // implement the bias, but it doesn't actually have it. | |
| 95 // Uses an optimized implementation with partial funcs. | |
| 96 // NOTE: The size of the input vector (u) must be padded using | |
| 97 // RoundInputs above. | |
| 98 // The input will be over-read to the extent of the padding. There are no | |
| 99 // alignment requirements. | |
| 100 using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *, | |
| 101 TFloat *); | |
| 102 MatrixDotVectorFunction matrixDotVectorFunction; | |
| 103 | |
| 104 // Number of 32 bit outputs held in each register. | |
| 105 int num_outputs_per_register_; | |
| 106 // Maximum number of registers that we will use to hold outputs. | |
| 107 int max_output_registers_; | |
| 108 // Number of 8 bit inputs in the inputs register. | |
| 109 int num_inputs_per_register_; | |
| 110 // Number of inputs in each weight group. | |
| 111 int num_inputs_per_group_; | |
| 112 // The number of groups of inputs to be broadcast is not stored as a member; it is derived as: | |
| 113 // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_ | |
| 114 | |
| 115 static const IntSimdMatrix *intSimdMatrix; | |
| 116 // Only available with NEON. | |
| 117 static const IntSimdMatrix intSimdMatrixNEON; | |
| 118 // Only available with RVV. | |
| 119 static const IntSimdMatrix intSimdMatrixRVV; | |
| 120 // Only available with AVX2 / AVX / FMA / SSE. | |
| 121 static const IntSimdMatrix intSimdMatrixAVX2; | |
| 122 static const IntSimdMatrix intSimdMatrixSSE; | |
| 123 }; | |
| 124 | |
| 125 } // namespace tesseract | |
| 126 | |
| 127 #endif // TESSERACT_ARCH_INTSIMDMATRIX_H_ |
