view mupdf-source/thirdparty/tesseract/src/classify/adaptive.cpp @ 21:2f43e400f144

Provide an "all" target to build both the sdist and the wheel
author Franz Glasner <fzglas.hg@dom66.de>
date Fri, 19 Sep 2025 10:28:53 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

/******************************************************************************
 ** Filename:    adaptive.c
 ** Purpose:     Adaptive matcher.
 ** Author:      Dan Johnson
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/

#include "adaptive.h"

#include "classify.h"

#include <cassert>
#include <cstdio>

namespace tesseract {

/*----------------------------------------------------------------------------
              Public Code
----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/**
 * This routine adds a new adapted class to an existing
 * set of adapted templates.
 *
 * @param Templates set of templates to add new class to
 * @param Class new class to add to templates
 * @param ClassId class id to associate with new class
 *
 * @note Globals: none
 */
void AddAdaptedClass(ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_CLASS_STRUCT *Class, CLASS_ID ClassId) {
  assert(Templates != nullptr);
  assert(Class != nullptr);
  assert(LegalClassId(ClassId));
  assert(UnusedClassIdIn(Templates->Templates, ClassId));
  assert(Class->NumPermConfigs == 0);

  auto IntClass = new INT_CLASS_STRUCT(1, 1);
  AddIntClass(Templates->Templates, ClassId, IntClass);

  assert(Templates->Class[ClassId] == nullptr);
  Templates->Class[ClassId] = Class;

} /* AddAdaptedClass */

/*---------------------------------------------------------------------------*/

PERM_CONFIG_STRUCT::~PERM_CONFIG_STRUCT() {
  delete[] Ambigs;
}

ADAPT_CLASS_STRUCT::ADAPT_CLASS_STRUCT() :
  NumPermConfigs(0),
  MaxNumTimesSeen(0),
  PermProtos(NewBitVector(MAX_NUM_PROTOS)),
  PermConfigs(NewBitVector(MAX_NUM_CONFIGS)),
  TempProtos(NIL_LIST) {
  zero_all_bits(PermProtos, WordsInVectorOfSize(MAX_NUM_PROTOS));
  zero_all_bits(PermConfigs, WordsInVectorOfSize(MAX_NUM_CONFIGS));

  for (int i = 0; i < MAX_NUM_CONFIGS; i++) {
    TempConfigFor(this, i) = nullptr;
  }
}

ADAPT_CLASS_STRUCT::~ADAPT_CLASS_STRUCT() {
  for (int i = 0; i < MAX_NUM_CONFIGS; i++) {
    if (ConfigIsPermanent(this, i) && PermConfigFor(this, i) != nullptr) {
      delete PermConfigFor(this, i);
    } else if (!ConfigIsPermanent(this, i) && TempConfigFor(this, i) != nullptr) {
      delete TempConfigFor(this, i);
    }
  }
  FreeBitVector(PermProtos);
  FreeBitVector(PermConfigs);
  auto list = TempProtos;
  while (list != nullptr) {
    delete reinterpret_cast<TEMP_PROTO_STRUCT *>(list->node);
    list = pop(list);
  }
}

/// Constructor for adapted templates.
/// Add an empty class for each char in unicharset to the newly created templates.
ADAPT_TEMPLATES_STRUCT::ADAPT_TEMPLATES_STRUCT(UNICHARSET &unicharset) {
  Templates = new INT_TEMPLATES_STRUCT;
  NumPermClasses = 0;
  NumNonEmptyClasses = 0;

  /* Insert an empty class for each unichar id in unicharset */
  for (unsigned i = 0; i < MAX_NUM_CLASSES; i++) {
    Class[i] = nullptr;
    if (i < unicharset.size()) {
      AddAdaptedClass(this, new ADAPT_CLASS_STRUCT, i);
    }
  }
}

ADAPT_TEMPLATES_STRUCT::~ADAPT_TEMPLATES_STRUCT() {
  for (unsigned i = 0; i < (Templates)->NumClasses; i++) {
    delete Class[i];
  }
  delete Templates;
}

// Returns FontinfoId of the given config of the given adapted class.
int Classify::GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId) {
  return (ConfigIsPermanent(Class, ConfigId) ? PermConfigFor(Class, ConfigId)->FontinfoId
                                             : TempConfigFor(Class, ConfigId)->FontinfoId);
}

/// This constructor allocates and returns a new temporary config.
///
/// @param MaxProtoId  max id of any proto in new config
/// @param FontinfoId font information from pre-trained templates
TEMP_CONFIG_STRUCT::TEMP_CONFIG_STRUCT(int maxProtoId, int fontinfoId) :
  NumTimesSeen(1),
  ProtoVectorSize(WordsInVectorOfSize(maxProtoId + 1)),
  MaxProtoId(maxProtoId),
  Protos(NewBitVector(maxProtoId + 1)),
  FontinfoId(fontinfoId) {
  zero_all_bits(Protos, ProtoVectorSize);
}

TEMP_CONFIG_STRUCT::~TEMP_CONFIG_STRUCT() {
  FreeBitVector(Protos);
}

/*---------------------------------------------------------------------------*/
/**
 * This routine prints a summary of the adapted templates
 *  in Templates to File.
 *
 * @param File    open text file to print Templates to
 * @param Templates adapted templates to print to File
 *
 * @note Globals: none
 */
void Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) {
  INT_CLASS_STRUCT *IClass;
  ADAPT_CLASS_STRUCT *AClass;

  fprintf(File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
  fprintf(File, "Num classes = %d;  Num permanent classes = %d\n\n", Templates->NumNonEmptyClasses,
          Templates->NumPermClasses);
  fprintf(File, "   Id  NC NPC  NP NPP\n");
  fprintf(File, "------------------------\n");

  for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
    IClass = Templates->Templates->Class[i];
    AClass = Templates->Class[i];
    if (!IsEmptyAdaptedClass(AClass)) {
      fprintf(File, "%5u  %s %3d %3d %3d %3zd\n", i, unicharset.id_to_unichar(i), IClass->NumConfigs,
              AClass->NumPermConfigs, IClass->NumProtos,
              IClass->NumProtos - AClass->TempProtos->size());
    }
  }
  fprintf(File, "\n");

} /* PrintAdaptedTemplates */

/*---------------------------------------------------------------------------*/
/**
 * Read an adapted class description from file and return
 * a ptr to the adapted class.
 *
 * @param fp open file to read adapted class from
 * @return Ptr to new adapted class.
 *
 * @note Globals: none
 */
ADAPT_CLASS_STRUCT *ReadAdaptedClass(TFile *fp) {
  int NumTempProtos;
  int NumConfigs;
  int i;
  ADAPT_CLASS_STRUCT *Class;

  /* first read high level adapted class structure */
  Class = new ADAPT_CLASS_STRUCT;
  fp->FRead(Class, sizeof(ADAPT_CLASS_STRUCT), 1);

  /* then read in the definitions of the permanent protos and configs */
  Class->PermProtos = NewBitVector(MAX_NUM_PROTOS);
  Class->PermConfigs = NewBitVector(MAX_NUM_CONFIGS);
  fp->FRead(Class->PermProtos, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_PROTOS));
  fp->FRead(Class->PermConfigs, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_CONFIGS));

  /* then read in the list of temporary protos */
  fp->FRead(&NumTempProtos, sizeof(int), 1);
  Class->TempProtos = NIL_LIST;
  for (i = 0; i < NumTempProtos; i++) {
    auto TempProto = new TEMP_PROTO_STRUCT;
    fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1);
    Class->TempProtos = push_last(Class->TempProtos, TempProto);
  }

  /* then read in the adapted configs */
  fp->FRead(&NumConfigs, sizeof(int), 1);
  for (i = 0; i < NumConfigs; i++) {
    if (test_bit(Class->PermConfigs, i)) {
      Class->Config[i].Perm = ReadPermConfig(fp);
    } else {
      Class->Config[i].Temp = ReadTempConfig(fp);
    }
  }

  return (Class);

} /* ReadAdaptedClass */

/*---------------------------------------------------------------------------*/
/**
 * Read a set of adapted templates from file and return
 * a ptr to the templates.
 *
 * @param fp open text file to read adapted templates from
 * @return Ptr to adapted templates read from file.
 *
 * @note Globals: none
 */
ADAPT_TEMPLATES_STRUCT *Classify::ReadAdaptedTemplates(TFile *fp) {
  auto Templates = new ADAPT_TEMPLATES_STRUCT;

  /* first read the high level adaptive template struct */
  fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);

  /* then read in the basic integer templates */
  Templates->Templates = ReadIntTemplates(fp);

  /* then read in the adaptive info for each class */
  for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
    Templates->Class[i] = ReadAdaptedClass(fp);
  }
  return (Templates);

} /* ReadAdaptedTemplates */

/*---------------------------------------------------------------------------*/
/**
 * Read a permanent configuration description from file
 * and return a ptr to it.
 *
 * @param fp open file to read permanent config from
 * @return Ptr to new permanent configuration description.
 *
 * @note Globals: none
 */
PERM_CONFIG_STRUCT *ReadPermConfig(TFile *fp) {
  auto Config = new PERM_CONFIG_STRUCT;
  uint8_t NumAmbigs;
  fp->FRead(&NumAmbigs, sizeof(NumAmbigs), 1);
  Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1];
  fp->FRead(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs);
  Config->Ambigs[NumAmbigs] = -1;
  fp->FRead(&(Config->FontinfoId), sizeof(int), 1);

  return (Config);

} /* ReadPermConfig */

/*---------------------------------------------------------------------------*/
/**
 * Read a temporary configuration description from file
 * and return a ptr to it.
 *
 * @param fp open file to read temporary config from
 * @return Ptr to new temporary configuration description.
 *
 * @note Globals: none
 */
TEMP_CONFIG_STRUCT *ReadTempConfig(TFile *fp) {
  auto Config = new TEMP_CONFIG_STRUCT;
  fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1);

  Config->Protos = NewBitVector(Config->ProtoVectorSize * BITSINLONG);
  fp->FRead(Config->Protos, sizeof(uint32_t), Config->ProtoVectorSize);

  return (Config);

} /* ReadTempConfig */

/*---------------------------------------------------------------------------*/
/**
 * This routine writes a binary representation of Class
 * to File.
 *
 * @param File    open file to write Class to
 * @param Class   adapted class to write to File
 * @param NumConfigs  number of configs in Class
 *
 * @note Globals: none
 */
void WriteAdaptedClass(FILE *File, ADAPT_CLASS_STRUCT *Class, int NumConfigs) {
  /* first write high level adapted class structure */
  fwrite(Class, sizeof(ADAPT_CLASS_STRUCT), 1, File);

  /* then write out the definitions of the permanent protos and configs */
  fwrite(Class->PermProtos, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_PROTOS), File);
  fwrite(Class->PermConfigs, sizeof(uint32_t), WordsInVectorOfSize(MAX_NUM_CONFIGS), File);

  /* then write out the list of temporary protos */
  uint32_t NumTempProtos = Class->TempProtos->size();
  fwrite(&NumTempProtos, sizeof(NumTempProtos), 1, File);
  auto TempProtos = Class->TempProtos;
  iterate(TempProtos) {
    void *proto = TempProtos->node;
    fwrite(proto, sizeof(TEMP_PROTO_STRUCT), 1, File);
  }

  /* then write out the adapted configs */
  fwrite(&NumConfigs, sizeof(int), 1, File);
  for (int i = 0; i < NumConfigs; i++) {
    if (test_bit(Class->PermConfigs, i)) {
      WritePermConfig(File, Class->Config[i].Perm);
    } else {
      WriteTempConfig(File, Class->Config[i].Temp);
    }
  }

} /* WriteAdaptedClass */

/*---------------------------------------------------------------------------*/
/**
 * This routine saves Templates to File in a binary format.
 *
 * @param File    open text file to write Templates to
 * @param Templates set of adapted templates to write to File
 *
 * @note Globals: none
 */
void Classify::WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) {
  /* first write the high level adaptive template struct */
  fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);

  /* then write out the basic integer templates */
  WriteIntTemplates(File, Templates->Templates, unicharset);

  /* then write out the adaptive info for each class */
  for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
    WriteAdaptedClass(File, Templates->Class[i], Templates->Templates->Class[i]->NumConfigs);
  }
} /* WriteAdaptedTemplates */

/*---------------------------------------------------------------------------*/
/**
 * This routine writes a binary representation of a
 * permanent configuration to File.
 *
 * @param File  open file to write Config to
 * @param Config  permanent config to write to File
 *
 * @note Globals: none
 */
void WritePermConfig(FILE *File, PERM_CONFIG_STRUCT *Config) {
  uint8_t NumAmbigs = 0;

  assert(Config != nullptr);
  while (Config->Ambigs[NumAmbigs] > 0) {
    ++NumAmbigs;
  }

  fwrite(&NumAmbigs, sizeof(uint8_t), 1, File);
  fwrite(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs, File);
  fwrite(&(Config->FontinfoId), sizeof(int), 1, File);
} /* WritePermConfig */

/*---------------------------------------------------------------------------*/
/**
 * This routine writes a binary representation of a
 * temporary configuration to File.
 *
 * @param File  open file to write Config to
 * @param Config  temporary config to write to File
 *
 * @note Globals: none
 */
void WriteTempConfig(FILE *File, TEMP_CONFIG_STRUCT *Config) {
  assert(Config != nullptr);

  fwrite(Config, sizeof(TEMP_CONFIG_STRUCT), 1, File);
  fwrite(Config->Protos, sizeof(uint32_t), Config->ProtoVectorSize, File);

} /* WriteTempConfig */

} // namespace tesseract