Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/lstm/networkio.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line source
///////////////////////////////////////////////////////////////////////
// File:        networkio.cpp
// Description: Network input/output data, allowing float/int implementations.
// Author:      Ray Smith
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#include "networkio.h"
#include <cfloat> // for FLT_MAX
#include <cmath>

#include <allheaders.h>
#include "functions.h"
#include "statistc.h"
#include "tprintf.h"

namespace tesseract {

// Minimum value to output for certainty.
const float kMinCertainty = -20.0f;
// Probability corresponding to kMinCertainty.
const float kMinProb = std::exp(kMinCertainty);

// Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim.
// The stride map is reset to a default-constructed StrideMap, int_mode_ is
// set from int_mode, and only the array that matches the mode is resized
// (i_ additionally receives the padding reported by GetPadding).
// NOTE(review): ResizeNoInit presumably leaves the contents uninitialized -
// confirm against GENERIC_2D_ARRAY.
void NetworkIO::Resize2d(bool int_mode, int width, int num_features) {
  stride_map_ = StrideMap();
  int_mode_ = int_mode;
  if (int_mode_) {
    i_.ResizeNoInit(width, num_features, GetPadding(num_features));
  } else {
    f_.ResizeNoInit(width, num_features);
  }
}

// Resizes to a specific stride_map.
// Stores a copy of stride_map, sets int_mode_, resizes the array matching the
// mode to stride_map.Width() x num_features, and finally zeroes the elements
// that do not correspond to valid image positions (ZeroInvalidElements).
void NetworkIO::ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features) {
  // If this method crashes with this == nullptr,
  // it most likely got here through an uninitialized scratch element,
  // ie call NetworkScratch::IO::Resizexxx() not NetworkIO::Resizexxx()!!
  stride_map_ = stride_map;
  int_mode_ = int_mode;
  if (int_mode_) {
    i_.ResizeNoInit(stride_map.Width(), num_features, GetPadding(num_features));
  } else {
    f_.ResizeNoInit(stride_map.Width(), num_features);
  }
  ZeroInvalidElements();
}

// Shrinks image size by x_scale,y_scale, and use given number of features.
// Works on a copy of src's stride map, so src is not modified; the int/float
// mode is inherited from src.
void NetworkIO::ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features) {
  StrideMap stride_map = src.stride_map_;
  stride_map.ScaleXY(x_scale, y_scale);
  ResizeToMap(src.int_mode_, stride_map, num_features);
}

// Resizes to just 1 x-coord, whatever the input.
// Works on a copy of src's stride map, so src is not modified; the int/float
// mode is inherited from src.
void NetworkIO::ResizeXTo1(const NetworkIO &src, int num_features) {
  StrideMap stride_map = src.stride_map_;
  stride_map.ReduceWidthTo1();
  ResizeToMap(src.int_mode_, stride_map, num_features);
}

// Initialize all the array to zero.
void NetworkIO::Zero() {
  int width = Width();
  // Zero out the everything. Column-by-column in case it is aligned.
  for (int t = 0; t < width; ++t) {
    ZeroTimeStep(t);
  }
}

// Initializes to zero all elements of the array that do not correspond to
// valid image positions. (If a batch of different-sized images are packed
// together, then there will be padding pixels.)
// For every batch entry the valid extent is taken from
// MaxIndexOfDim(FD_WIDTH/FD_HEIGHT) + 1 and compared with the full size of
// the stride map; anything beyond the valid extent is zeroed.
void NetworkIO::ZeroInvalidElements() {
  int num_features = NumFeatures();
  int full_width = stride_map_.Size(FD_WIDTH);
  int full_height = stride_map_.Size(FD_HEIGHT);
  StrideMap::Index b_index(stride_map_);
  do {
    // First invalid x for this batch entry.
    int end_x = b_index.MaxIndexOfDim(FD_WIDTH) + 1;
    if (end_x < full_width) {
      // The width is small, so fill for every valid y.
      StrideMap::Index y_index(b_index);
      // Number of elements from end_x to the end of one row.
      int fill_size = num_features * (full_width - end_x);
      do {
        StrideMap::Index z_index(y_index);
        z_index.AddOffset(end_x, FD_WIDTH);
        if (int_mode_) {
          ZeroVector(fill_size, i_[z_index.t()]);
        } else {
          ZeroVector(fill_size, f_[z_index.t()]);
        }
      } while (y_index.AddOffset(1, FD_HEIGHT));
    }
    // First invalid y for this batch entry.
    int end_y = b_index.MaxIndexOfDim(FD_HEIGHT) + 1;
    if (end_y < full_height) {
      // The height is small, so fill in the space in one go.
      // NOTE(review): a single ZeroVector call over full rows assumes the
      // remaining rows of this batch entry are contiguous in memory -
      // confirm against StrideMap/GENERIC_2D_ARRAY layout.
      StrideMap::Index y_index(b_index);
      y_index.AddOffset(end_y, FD_HEIGHT);
      int fill_size = num_features * full_width * (full_height - end_y);
      if (int_mode_) {
        ZeroVector(fill_size, i_[y_index.t()]);
      } else {
        ZeroVector(fill_size, f_[y_index.t()]);
      }
    }
  } while (b_index.AddOffset(1, FD_BATCH));
}

// Helper computes a black point and white point to contrast-enhance an image.
// The computation is based on the assumption that the image is of a single line
// of text, so a horizontal line through the middle of the image passes through
// at least some of it, so local minima and maxima are a good proxy for black
// and white pixel samples.
// Only the row at height / 2 is sampled, and only when the image is at least
// 3 pixels wide. If no local minimum (maximum) was found, a single sample of
// 0 (255) is used instead. *black is the 0.25 quantile of the minima and
// *white the 0.75 quantile of the maxima, as returned by STATS::ile.
static void ComputeBlackWhite(Image pix, float *black, float *white) {
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  STATS mins(0, 255), maxes(0, 255);
  if (width >= 3) {
    int y = height / 2;
    l_uint32 *line = pixGetData(pix) + pixGetWpl(pix) * y;
    // Slide a 3-pixel window (prev, curr, next) along the row.
    int prev = GET_DATA_BYTE(line, 0);
    int curr = GET_DATA_BYTE(line, 1);
    for (int x = 1; x + 1 < width; ++x) {
      int next = GET_DATA_BYTE(line, x + 1);
      // A plateau edge counts too: strict on one side, non-strict on the other.
      if ((curr < prev && curr <= next) || (curr <= prev && curr < next)) {
        // Local minimum.
        mins.add(curr, 1);
      }
      if ((curr > prev && curr >= next) || (curr >= prev && curr > next)) {
        // Local maximum.
        maxes.add(curr, 1);
      }
      prev = curr;
      curr = next;
    }
  }
  // Fall back to the extremes of the 8-bit range when nothing was sampled.
  if (mins.get_total() == 0) {
    mins.add(0, 1);
  }
  if (maxes.get_total() == 0) {
    maxes.add(255, 1);
  }
  *black = mins.ile(0.25);
  *white = maxes.ile(0.75);
}

// Sets up the array from the given image, using the currently set int_mode_.
// If the image width doesn't match the shape, the image is truncated or padded
// with noise to match.
// Convenience wrapper: forwards to FromPixes with a one-element vector.
void NetworkIO::FromPix(const StaticShape &shape, const Image pix, TRand *randomizer) {
  std::vector<Image> pixes(1, pix);
  FromPixes(shape, pixes, randomizer);
}

// Sets up the array from the given set of images, using the currently set
// int_mode_. If the image width doesn't match the shape, the images are
// truncated or padded with noise to match.
void NetworkIO::FromPixes(const StaticShape &shape, const std::vector<Image> &pixes, TRand *randomizer) { int target_height = shape.height(); int target_width = shape.width(); std::vector<std::pair<int, int>> h_w_pairs; for (auto &&pix : pixes) { Image var_pix = pix; int width = pixGetWidth(var_pix); if (target_width != 0) { width = target_width; } int height = pixGetHeight(var_pix); if (target_height != 0) { height = target_height; } h_w_pairs.emplace_back(height, width); } stride_map_.SetStride(h_w_pairs); ResizeToMap(int_mode(), stride_map_, shape.depth()); // Iterate over the images again to copy the data. for (size_t b = 0; b < pixes.size(); ++b) { Image pix = pixes[b]; float black = 0.0f, white = 255.0f; if (shape.depth() != 3) { ComputeBlackWhite(pix, &black, &white); } float contrast = (white - black) / 2.0f; if (contrast <= 0.0f) { contrast = 1.0f; } if (shape.height() == 1) { Copy1DGreyImage(b, pix, black, contrast, randomizer); } else { Copy2DImage(b, pix, black, contrast, randomizer); } } } // Copies the given pix to *this at the given batch index, stretching and // clipping the pixel values so that [black, black + 2*contrast] maps to the // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. // This is a 2-d operation in the sense that the output depth is the number // of input channels, the height is the height of the image, and the width // is the width of the image, or truncated/padded with noise if the width // is a fixed size. 
void NetworkIO::Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer) { int width = pixGetWidth(pix); int height = pixGetHeight(pix); int wpl = pixGetWpl(pix); StrideMap::Index index(stride_map_); index.AddOffset(batch, FD_BATCH); int t = index.t(); int target_height = stride_map_.Size(FD_HEIGHT); int target_width = stride_map_.Size(FD_WIDTH); int num_features = NumFeatures(); bool color = num_features == 3; if (width > target_width) { width = target_width; } uint32_t *line = pixGetData(pix); for (int y = 0; y < target_height; ++y, line += wpl) { int x = 0; if (y < height) { for (x = 0; x < width; ++x, ++t) { if (color) { int f = 0; for (int c = COLOR_RED; c <= COLOR_BLUE; ++c) { int pixel = GET_DATA_BYTE(line + x, c); SetPixel(t, f++, pixel, black, contrast); } } else { int pixel = GET_DATA_BYTE(line, x); SetPixel(t, 0, pixel, black, contrast); } } } for (; x < target_width; ++x) { Randomize(t++, 0, num_features, randomizer); } } } // Copies the given pix to *this at the given batch index, as Copy2DImage // above, except that the output depth is the height of the input image, the // output height is 1, and the output width as for Copy2DImage. // The image is thus treated as a 1-d set of vertical pixel strips. 
void NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contrast, TRand *randomizer) { int width = pixGetWidth(pix); int height = pixGetHeight(pix); ASSERT_HOST(height == NumFeatures()); int wpl = pixGetWpl(pix); StrideMap::Index index(stride_map_); index.AddOffset(batch, FD_BATCH); int t = index.t(); int target_width = stride_map_.Size(FD_WIDTH); if (width > target_width) { width = target_width; } int x; for (x = 0; x < width; ++x, ++t) { for (int y = 0; y < height; ++y) { uint32_t *line = pixGetData(pix) + wpl * y; int pixel = GET_DATA_BYTE(line, x); SetPixel(t, y, pixel, black, contrast); } } for (; x < target_width; ++x) { Randomize(t++, 0, height, randomizer); } } // Helper stores the pixel value in i_ or f_ according to int_mode_. // t: is the index from the StrideMap corresponding to the current // [batch,y,x] position // f: is the index into the depth/channel // pixel: the value of the pixel from the image (in one channel) // black: the pixel value to map to the lowest of the range of *this // contrast: the range of pixel values to stretch to half the range of *this. void NetworkIO::SetPixel(int t, int f, int pixel, float black, float contrast) { float float_pixel = (pixel - black) / contrast - 1.0f; if (int_mode_) { i_[t][f] = ClipToRange<int>(IntCastRounded((INT8_MAX + 1) * float_pixel), -INT8_MAX, INT8_MAX); } else { f_[t][f] = float_pixel; } } // Converts the array to a Pix. Must be pixDestroyed after use. Image NetworkIO::ToPix() const { // Count the width of the image, and find the max multiplication factor. int im_width = stride_map_.Size(FD_WIDTH); int im_height = stride_map_.Size(FD_HEIGHT); int num_features = NumFeatures(); int feature_factor = 1; if (num_features == 3) { // Special hack for color. 
num_features = 1; feature_factor = 3; } Image pix = pixCreate(im_width, im_height * num_features, 32); StrideMap::Index index(stride_map_); do { int im_x = index.index(FD_WIDTH); int top_im_y = index.index(FD_HEIGHT); int im_y = top_im_y; int t = index.t(); if (int_mode_) { const int8_t *features = i_[t]; for (int y = 0; y < num_features; ++y, im_y += im_height) { int pixel = features[y * feature_factor]; // 1 or 2 features use greyscale. int red = ClipToRange<int>(pixel + 128, 0, 255); int green = red, blue = red; if (feature_factor == 3) { // With 3 features assume RGB color. green = ClipToRange<int>(features[y * feature_factor + 1] + 128, 0, 255); blue = ClipToRange<int>(features[y * feature_factor + 2] + 128, 0, 255); } else if (num_features > 3) { // More than 3 features use false yellow/blue color, assuming a signed // input in the range [-1,1]. red = abs(pixel) * 2; if (pixel >= 0) { green = red; blue = 0; } else { blue = red; green = red = 0; } } pixSetPixel(pix, im_x, im_y, (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT)); } } else { const float *features = f_[t]; for (int y = 0; y < num_features; ++y, im_y += im_height) { float pixel = features[y * feature_factor]; // 1 or 2 features use greyscale. int red = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255); int green = red, blue = red; if (feature_factor == 3) { // With 3 features assume RGB color. pixel = features[y * feature_factor + 1]; green = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255); pixel = features[y * feature_factor + 2]; blue = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255); } else if (num_features > 3) { // More than 3 features use false yellow/blue color, assuming a signed // input in the range [-1,1]. 
red = ClipToRange<int>(IntCastRounded(std::fabs(pixel) * 255), 0, 255); if (pixel >= 0) { green = red; blue = 0; } else { blue = red; green = red = 0; } } pixSetPixel(pix, im_x, im_y, (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT)); } } } while (index.Increment()); return pix; } // Prints the first and last num timesteps of the array for each feature. void NetworkIO::Print(int num) const { int num_features = NumFeatures(); for (int y = 0; y < num_features; ++y) { for (int t = 0; t < Width(); ++t) { if (num == 0 || t < num || t + num >= Width()) { if (int_mode_) { tprintf(" %g", static_cast<float>(i_[t][y]) / INT8_MAX); } else { tprintf(" %g", f_[t][y]); } } } tprintf("\n"); } } // Copies a single time step from src. void NetworkIO::CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t) { ASSERT_HOST(int_mode_ == src.int_mode_); if (int_mode_) { memcpy(i_[dest_t], src.i_[src_t], i_.dim2() * sizeof(i_[0][0])); } else { memcpy(f_[dest_t], src.f_[src_t], f_.dim2() * sizeof(f_[0][0])); } } // Copies a part of single time step from src. void NetworkIO::CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO &src, int src_t, int src_offset) { ASSERT_HOST(int_mode_ == src.int_mode_); if (int_mode_) { memcpy(i_[dest_t] + dest_offset, src.i_[src_t] + src_offset, num_features * sizeof(i_[0][0])); } else { memcpy(f_[dest_t] + dest_offset, src.f_[src_t] + src_offset, num_features * sizeof(f_[0][0])); } } // Sets the given range to random values. void NetworkIO::Randomize(int t, int offset, int num_features, TRand *randomizer) { if (int_mode_) { int8_t *line = i_[t] + offset; for (int i = 0; i < num_features; ++i) { line[i] = IntCastRounded(randomizer->SignedRand(INT8_MAX)); } } else { // float mode. float *line = f_[t] + offset; for (int i = 0; i < num_features; ++i) { line[i] = randomizer->SignedRand(1.0); } } } // Helper returns the label and score of the best choice over a range. 
int NetworkIO::BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating, float *certainty) const { if (t_end <= t_start) { return -1; } int max_char = -1; float min_score = 0.0f; for (int c = 0; c < NumFeatures(); ++c) { if (c == not_this || c == null_ch) { continue; } ScoresOverRange(t_start, t_end, c, null_ch, rating, certainty); if (max_char < 0 || *rating < min_score) { min_score = *rating; max_char = c; } } ScoresOverRange(t_start, t_end, max_char, null_ch, rating, certainty); return max_char; } // Helper returns the rating and certainty of the choice over a range in output. void NetworkIO::ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating, float *certainty) const { ASSERT_HOST(!int_mode_); *rating = 0.0f; *certainty = 0.0f; if (t_end <= t_start || t_end <= 0) { return; } float ratings[3] = {0.0f, 0.0f, 0.0f}; float certs[3] = {0.0f, 0.0f, 0.0f}; for (int t = t_start; t < t_end; ++t) { const float *line = f_[t]; float score = ProbToCertainty(line[choice]); float zero = ProbToCertainty(line[null_ch]); if (t == t_start) { ratings[2] = FLT_MAX; ratings[1] = -score; certs[1] = score; } else { for (int i = 2; i >= 1; --i) { if (ratings[i] > ratings[i - 1]) { ratings[i] = ratings[i - 1]; certs[i] = certs[i - 1]; } } ratings[2] -= zero; if (zero < certs[2]) { certs[2] = zero; } ratings[1] -= score; if (score < certs[1]) { certs[1] = score; } } ratings[0] -= zero; if (zero < certs[0]) { certs[0] = zero; } } int best_i = ratings[2] < ratings[1] ? 2 : 1; *rating = ratings[best_i] + t_end - t_start; *certainty = certs[best_i]; } // Returns the index (label) of the best value at the given timestep, // excluding not_this and not_that, and if not null, sets the score to the // log of the corresponding value. 
int NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const { ASSERT_HOST(!int_mode_); int best_index = -1; float best_score = -FLT_MAX; const float *line = f_[t]; for (int i = 0; i < f_.dim2(); ++i) { if (line[i] > best_score && i != not_this && i != not_that) { best_score = line[i]; best_index = i; } } if (score != nullptr) { *score = ProbToCertainty(best_score); } return best_index; } // Returns the best start position out of [start, end) (into which all labels // must fit) to obtain the highest cumulative score for the given labels. int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const { int length = labels.size(); int last_start = end - length; int best_start = -1; TFloat best_score = 0; for (int s = start; s <= last_start; ++s) { TFloat score = ScoreOfLabels(labels, s); if (score > best_score || best_start < 0) { best_score = score; best_start = s; } } return best_start; } // Returns the cumulative score of the given labels starting at start, and // using one label per time-step. TFloat NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const { int length = labels.size(); TFloat score = 0; for (int i = 0; i < length; ++i) { score += f_(start + i, labels[i]); } return score; } // Helper function sets all the outputs for a single timestep, such that // label has value ok_score, and the other labels share 1 - ok_score. void NetworkIO::SetActivations(int t, int label, float ok_score) { ASSERT_HOST(!int_mode_); int num_classes = NumFeatures(); float bad_score = (1.0f - ok_score) / (num_classes - 1); float *targets = f_[t]; for (int i = 0; i < num_classes; ++i) { targets[i] = bad_score; } targets[label] = ok_score; } // Modifies the values, only if needed, so that the given label is // the winner at the given time step t. void NetworkIO::EnsureBestLabel(int t, int label) { ASSERT_HOST(!int_mode_); if (BestLabel(t, nullptr) != label) { // Output value needs enhancing. 
Third all the other elements and add the // remainder to best_label. int num_classes = NumFeatures(); float *targets = f_[t]; for (int c = 0; c < num_classes; ++c) { if (c == label) { targets[c] += (1.0 - targets[c]) * (2 / 3.0); } else { targets[c] /= 3.0; } } } } // Helper function converts prob to certainty taking the minimum into account. /* static */ float NetworkIO::ProbToCertainty(float prob) { return prob > kMinProb ? std::log(prob) : kMinCertainty; } // Returns true if there is any bad value that is suspiciously like a GT // error. Assuming that *this is the difference(gradient) between target // and forward output, returns true if there is a large negative value // (correcting a very confident output) for which there is no corresponding // positive value in an adjacent timestep for the same feature index. This // allows the box-truthed samples to make fine adjustments to position while // stopping other disagreements of confident output with ground truth. bool NetworkIO::AnySuspiciousTruth(float confidence_thr) const { int num_features = NumFeatures(); for (int t = 0; t < Width(); ++t) { const float *features = f_[t]; for (int y = 0; y < num_features; ++y) { float grad = features[y]; if (grad < -confidence_thr) { // Correcting strong output. Check for movement. if ((t == 0 || f_[t - 1][y] < confidence_thr / 2) && (t + 1 == Width() || f_[t + 1][y] < confidence_thr / 2)) { return true; // No strong positive on either side. } } } } return false; } // Reads a single timestep to floats in the range [-1, 1]. void NetworkIO::ReadTimeStep(int t, TFloat *output) const { if (int_mode_) { const int8_t *line = i_[t]; for (int i = 0; i < i_.dim2(); ++i) { output[i] = static_cast<TFloat>(line[i]) / INT8_MAX; } } else { const float *line = f_[t]; for (int i = 0; i < f_.dim2(); ++i) { output[i] = static_cast<TFloat>(line[i]); } } } // Adds a single timestep to floats. 
void NetworkIO::AddTimeStep(int t, TFloat *inout) const { int num_features = NumFeatures(); if (int_mode_) { const int8_t *line = i_[t]; for (int i = 0; i < num_features; ++i) { inout[i] += static_cast<TFloat>(line[i]) / INT8_MAX; } } else { const float *line = f_[t]; for (int i = 0; i < num_features; ++i) { inout[i] += line[i]; } } } // Adds part of a single timestep to floats. void NetworkIO::AddTimeStepPart(int t, int offset, int num_features, float *inout) const { if (int_mode_) { const int8_t *line = i_[t] + offset; for (int i = 0; i < num_features; ++i) { inout[i] += static_cast<float>(line[i]) / INT8_MAX; } } else { const float *line = f_[t] + offset; for (int i = 0; i < num_features; ++i) { inout[i] += line[i]; } } } // Writes a single timestep from floats in the range [-1, 1]. void NetworkIO::WriteTimeStep(int t, const TFloat *input) { WriteTimeStepPart(t, 0, NumFeatures(), input); } // Writes a single timestep from floats in the range [-1, 1] writing only // num_features elements of input to (*this)[t], starting at offset. void NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input) { if (int_mode_) { int8_t *line = i_[t] + offset; for (int i = 0; i < num_features; ++i) { line[i] = ClipToRange<int>(IntCastRounded(input[i] * INT8_MAX), -INT8_MAX, INT8_MAX); } } else { float *line = f_[t] + offset; for (int i = 0; i < num_features; ++i) { line[i] = static_cast<float>(input[i]); } } } // Maxpools a single time step from src. 
void NetworkIO::MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line) { ASSERT_HOST(int_mode_ == src.int_mode_); if (int_mode_) { int dim = i_.dim2(); int8_t *dest_line = i_[dest_t]; const int8_t *src_line = src.i_[src_t]; for (int i = 0; i < dim; ++i) { if (dest_line[i] < src_line[i]) { dest_line[i] = src_line[i]; max_line[i] = src_t; } } } else { int dim = f_.dim2(); float *dest_line = f_[dest_t]; const float *src_line = src.f_[src_t]; for (int i = 0; i < dim; ++i) { if (dest_line[i] < src_line[i]) { dest_line[i] = src_line[i]; max_line[i] = src_t; } } } } // Runs maxpool backward, using maxes to index timesteps in *this. void NetworkIO::MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes) { ASSERT_HOST(!int_mode_); Zero(); StrideMap::Index index(fwd.stride_map_); do { int t = index.t(); const int *max_line = maxes[t]; const float *fwd_line = fwd.f_[t]; int num_features = fwd.f_.dim2(); for (int i = 0; i < num_features; ++i) { f_[max_line[i]][i] = fwd_line[i]; } } while (index.Increment()); } // Returns the min over time of the maxes over features of the outputs. float NetworkIO::MinOfMaxes() const { float min_max = 0.0f; int width = Width(); int num_features = NumFeatures(); for (int t = 0; t < width; ++t) { float max_value = -FLT_MAX; if (int_mode_) { const int8_t *column = i_[t]; for (int i = 0; i < num_features; ++i) { if (column[i] > max_value) { max_value = column[i]; } } } else { const float *column = f_[t]; for (int i = 0; i < num_features; ++i) { if (column[i] > max_value) { max_value = column[i]; } } } if (t == 0 || max_value < min_max) { min_max = max_value; } } return min_max; } // Computes combined results for a combiner that chooses between an existing // input and itself, with an additional output to indicate the choice. 
// combiner_output must have exactly one more feature than base_output; that
// extra, last feature is the weight given to the base output. *this is resized
// like base_output and every element becomes
//   base * base_weight + combiner * (1 - base_weight).
// In int mode the weight is the last feature scaled by 1 / INT8_MAX and the
// result is rounded with IntCastRounded.
void NetworkIO::CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output) {
  // Number of outputs from base and final result.
  int no = base_output.NumFeatures();
  ASSERT_HOST(combiner_output.NumFeatures() == no + 1);
  Resize(base_output, no);
  int width = Width();
  if (int_mode_) {
    // Number of outputs from base and final result.
    for (int t = 0; t < width; ++t) {
      int8_t *out_line = i_[t];
      const int8_t *base_line = base_output.i_[t];
      const int8_t *comb_line = combiner_output.i_[t];
      // The extra combiner output selects between base and combiner.
      float base_weight = static_cast<float>(comb_line[no]) / INT8_MAX;
      float boost_weight = 1.0f - base_weight;
      for (int i = 0; i < no; ++i) {
        out_line[i] = IntCastRounded(base_line[i] * base_weight + comb_line[i] * boost_weight);
      }
    }
  } else {
    for (int t = 0; t < width; ++t) {
      float *out_line = f_[t];
      const float *base_line = base_output.f_[t];
      const float *comb_line = combiner_output.f_[t];
      // The extra combiner output selects between base and combiner.
      float base_weight = comb_line[no];
      float boost_weight = 1.0f - base_weight;
      for (int i = 0; i < no; ++i) {
        out_line[i] = base_line[i] * base_weight + comb_line[i] * boost_weight;
      }
    }
  }
}

// Computes deltas for a combiner that chooses between 2 sets of inputs.
// On entry *this holds the combiner's forward output (no + 1 features, the
// last being the base weight); on exit it holds the deltas, computed in place.
// fwd_deltas and base_output must both have no features. Requires float mode.
void NetworkIO::ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output) {
  ASSERT_HOST(!int_mode_);
  // Compute the deltas for the combiner.
  int width = Width();
  int no = NumFeatures() - 1;
  ASSERT_HOST(fwd_deltas.NumFeatures() == no);
  ASSERT_HOST(base_output.NumFeatures() == no);
  // Number of outputs from base and final result.
  for (int t = 0; t < width; ++t) {
    const float *delta_line = fwd_deltas.f_[t];
    const float *base_line = base_output.f_[t];
    float *comb_line = f_[t];
    // Read the weight before comb_line[no] is overwritten below.
    float base_weight = comb_line[no];
    float boost_weight = 1.0f - base_weight;
    float max_base_delta = 0.0;
    for (int i = 0; i < no; ++i) {
      // What did the combiner actually produce?
      float output = base_line[i] * base_weight + comb_line[i] * boost_weight;
      // Reconstruct the target from the delta.
      float comb_target = delta_line[i] + output;
      // comb_line[i] is still the forward output here; replace it by its delta.
      comb_line[i] = comb_target - comb_line[i];
      // Track how far the base output is from the target.
      float base_delta = std::fabs(comb_target - base_line[i]);
      if (base_delta > max_base_delta) {
        max_base_delta = base_delta;
      }
    }
    if (max_base_delta >= 0.5) {
      // The base network got it wrong. The combiner should output the right
      // answer and 0 for the base network.
      comb_line[no] = 0.0 - base_weight;
    } else {
      // The base network was right. The combiner should flag that.
      for (int i = 0; i < no; ++i) {
        // All other targets are 0.
        if (comb_line[i] > 0.0) {
          comb_line[i] -= 1.0;
        }
      }
      comb_line[no] = 1.0 - base_weight;
    }
  }
}

// Copies the array checking that the types match.
// NOTE(review): only f_ is copied, even when both objects are in int mode;
// i_ and stride_map_ are left untouched. Confirm whether callers only ever
// use this on float data or whether i_ should be copied as well.
void NetworkIO::CopyAll(const NetworkIO &src) {
  ASSERT_HOST(src.int_mode_ == int_mode_);
  f_ = src.f_;
}

// Checks that both are floats and adds the src array to *this.
void NetworkIO::AddAllToFloat(const NetworkIO &src) {
  ASSERT_HOST(!int_mode_);
  ASSERT_HOST(!src.int_mode_);
  f_ += src.f_;
}

// Subtracts the array from a float array. src must also be float.
void NetworkIO::SubtractAllFromFloat(const NetworkIO &src) {
  ASSERT_HOST(!int_mode_);
  ASSERT_HOST(!src.int_mode_);
  f_ -= src.f_;
}

// Copies src to *this, with maxabs normalization to match scale.
// Every element of src is multiplied by MaxAbs(scale) / MaxAbs(src), so that
// the result has the same maximum absolute value as scale. If src is all
// zero, f_.Clear() is called instead. All three objects must be in float
// mode and both maxima must be finite.
// NOTE(review): *this is not resized here; it presumably must already be at
// least as large as src - confirm against callers.
void NetworkIO::CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale) {
  ASSERT_HOST(!int_mode_);
  ASSERT_HOST(!src.int_mode_);
  ASSERT_HOST(!scale.int_mode_);
  float src_max = src.f_.MaxAbs();
  ASSERT_HOST(std::isfinite(src_max));
  float scale_max = scale.f_.MaxAbs();
  ASSERT_HOST(std::isfinite(scale_max));
  if (src_max > 0.0f) {
    float factor = scale_max / src_max;
    for (int t = 0; t < src.Width(); ++t) {
      const float *src_ptr = src.f_[t];
      float *dest_ptr = f_[t];
      for (int i = 0; i < src.f_.dim2(); ++i) {
        dest_ptr[i] = src_ptr[i] * factor;
      }
    }
  } else {
    // Avoid dividing by zero when src contains nothing but zeros.
    f_.Clear();
  }
}

// Copies src to *this with independent reversal of the y dimension.
void NetworkIO::CopyWithYReversal(const NetworkIO &src) { int num_features = src.NumFeatures(); Resize(src, num_features); StrideMap::Index b_index(src.stride_map_); do { int width = b_index.MaxIndexOfDim(FD_WIDTH) + 1; StrideMap::Index fwd_index(b_index); StrideMap::Index rev_index(b_index); rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_HEIGHT), FD_HEIGHT); do { int fwd_t = fwd_index.t(); int rev_t = rev_index.t(); for (int x = 0; x < width; ++x) { CopyTimeStepFrom(rev_t++, src, fwd_t++); } } while (fwd_index.AddOffset(1, FD_HEIGHT) && rev_index.AddOffset(-1, FD_HEIGHT)); } while (b_index.AddOffset(1, FD_BATCH)); } // Copies src to *this with independent reversal of the x dimension. void NetworkIO::CopyWithXReversal(const NetworkIO &src) { int num_features = src.NumFeatures(); Resize(src, num_features); StrideMap::Index b_index(src.stride_map_); do { StrideMap::Index y_index(b_index); do { StrideMap::Index fwd_index(y_index); StrideMap::Index rev_index(y_index); rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_WIDTH), FD_WIDTH); do { CopyTimeStepFrom(rev_index.t(), src, fwd_index.t()); } while (fwd_index.AddOffset(1, FD_WIDTH) && rev_index.AddOffset(-1, FD_WIDTH)); } while (y_index.AddOffset(1, FD_HEIGHT)); } while (b_index.AddOffset(1, FD_BATCH)); } // Copies src to *this with independent transpose of the x and y dimensions. 
void NetworkIO::CopyWithXYTranspose(const NetworkIO &src) { int num_features = src.NumFeatures(); stride_map_ = src.stride_map_; stride_map_.TransposeXY(); ResizeToMap(src.int_mode(), stride_map_, num_features); StrideMap::Index src_b_index(src.stride_map_); StrideMap::Index dest_b_index(stride_map_); do { StrideMap::Index src_y_index(src_b_index); StrideMap::Index dest_x_index(dest_b_index); do { StrideMap::Index src_x_index(src_y_index); StrideMap::Index dest_y_index(dest_x_index); do { CopyTimeStepFrom(dest_y_index.t(), src, src_x_index.t()); } while (src_x_index.AddOffset(1, FD_WIDTH) && dest_y_index.AddOffset(1, FD_HEIGHT)); } while (src_y_index.AddOffset(1, FD_HEIGHT) && dest_x_index.AddOffset(1, FD_WIDTH)); } while (src_b_index.AddOffset(1, FD_BATCH) && dest_b_index.AddOffset(1, FD_BATCH)); } // Copies src to *this, at the given feature_offset, returning the total // feature offset after the copy. Multiple calls will stack outputs from // multiple sources in feature space. int NetworkIO::CopyPacking(const NetworkIO &src, int feature_offset) { ASSERT_HOST(int_mode_ == src.int_mode_); int width = src.Width(); ASSERT_HOST(width <= Width()); int num_features = src.NumFeatures(); ASSERT_HOST(num_features + feature_offset <= NumFeatures()); if (int_mode_) { for (int t = 0; t < width; ++t) { memcpy(i_[t] + feature_offset, src.i_[t], num_features * sizeof(i_[t][0])); } for (int t = width; t < i_.dim1(); ++t) { memset(i_[t], 0, num_features * sizeof(i_[t][0])); } } else { for (int t = 0; t < width; ++t) { memcpy(f_[t] + feature_offset, src.f_[t], num_features * sizeof(f_[t][0])); } for (int t = width; t < f_.dim1(); ++t) { memset(f_[t], 0, num_features * sizeof(f_[t][0])); } } return num_features + feature_offset; } // Opposite of CopyPacking, fills *this with a part of src, starting at // feature_offset, and picking num_features. 
void NetworkIO::CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features) { Resize(src, num_features); int width = src.Width(); ASSERT_HOST(num_features + feature_offset <= src.NumFeatures()); if (int_mode_) { for (int t = 0; t < width; ++t) { memcpy(i_[t], src.i_[t] + feature_offset, num_features * sizeof(i_[t][0])); } } else { for (int t = 0; t < width; ++t) { memcpy(f_[t], src.f_[t] + feature_offset, num_features * sizeof(f_[t][0])); } } } // Transposes the float part of *this into dest. void NetworkIO::Transpose(TransposedArray *dest) const { int width = Width(); dest->ResizeNoInit(NumFeatures(), width); for (int t = 0; t < width; ++t) { dest->WriteStrided(t, f_[t]); } } // Clips the content of a single time-step to +/-range. void NetworkIO::ClipVector(int t, float range) { ASSERT_HOST(!int_mode_); float *v = f_[t]; int dim = f_.dim2(); for (int i = 0; i < dim; ++i) { v[i] = ClipToRange<float>(v[i], -range, range); } } // Returns the padding required for the given number of features in order // for the SIMD operations to be safe. /* static */ int NetworkIO::GetPadding(int num_features) { int padding = 0; if (IntSimdMatrix::intSimdMatrix) { padding = IntSimdMatrix::intSimdMatrix->RoundInputs(num_features) - num_features; } return padding; } } // namespace tesseract.
