Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/leptonica/src/dnahash_remnant.c.notused @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/leptonica/src/dnahash_remnant.c.notused	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,949 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*!
+ * \file dnahash_remnant.c.notused
+ * <pre>
+ *
+ *                                     NOTE
+ *       ==================================================================
+ *       This code has been retired from the library.  It is just
+ *       documentation.  It contains dnahash functionality that is no
+ *       longer in use.  The only current use of dnahash (in dnahash.c)
+ *       is for fast template lookup in the jbig2 classifier (jbclass.c).
+ *
+ *       The functions in this file hash strings, points and doubles.
+ *       Most of them have been replaced by analogous ones using the more
+ *       general hashmap.  They are saved here for pedagogical purposes;
+ *       mostly, to show how misguided one can be trying to implement a
+ *       general hashing function using only an array of doubles.  (Yes, it can
+ *       be done, but it's a lot of work for relatively little functionality.)
+ *
+ *       The new hashmap (in hashmap.c) has a much simpler lookup and
+ *       add mechanism.  It also has a rehashing function to allow the hash
+ *       array to grow as items are added.  Unlike the simple dnahash,
+ *       the hashmap stores the key in the hashitem, so it doesn't require
+ *       an auxiliary array to do lookup with checking and creating.
+ *       (It does however require the auxiliary array to retrieve the
+ *       original data, which is not stored in the hashitem.  This may
+ *       be changed in the future.)
+ *       ==================================================================
+ *
+ *      DnaHash: Accessors
+ *          l_int32      l_dnaHashGetCount()
+ *          l_int32      l_dnaHashGetTotalCount()
+ *
+ *      Set operations on dna
+ *          L_DNAHASH   *l_dnaHashCreateFromDna()
+ *          l_int32      l_dnaRemoveDupsByHash()
+ *          l_int32      l_dnaMakeHistoByHash()
+ *          L_DNA       *l_dnaIntersectionByHash()
+ *          l_int32      l_dnaFindValByHash()
+ *
+ *      Set operations on pta
+ *           PTA        *ptaUnionByHash()
+ *           l_int32     ptaRemoveDupsByHash()
+ *           PTA        *ptaIntersectionByHash();
+ *           l_int32     ptaFindPtByHash()
+ *           L_DNAHASH  *l_dnaHashCreateFromPta()
+ *
+ *      Set operations on sarray
+ *          l_int32     sarrayRemoveDupsByHash()
+ *          SARRAY     *sarrayIntersectionByHash()
+ *          l_int32     sarrayFindStringByHash()
+ *          L_DNAHASH  *l_dnaHashCreateFromSarray()
+ *
+ *    (1) The DnaHash is an array of Dna.  It can be used for fast
+ *        storage and lookup for sets and maps.  If the set or map
+ *        is on a Dna itself, the hash can be a simple casting from
+ *        a double to a l_uint64. For a string or a (x,y) point, we
+ *        use a hash to a l_uint64.  The result of the hash is
+ *        a "key", which is then used with the mod function to select
+ *        which Dna array is to be used.
+ *    (2) The number of arrays in a DnaHash is a prime number.
+ *        If there are N items, we set up the DnaHash array to have
+ *        approximately N/20 Dna, so the average size of these arrays
+ *        will be about 20 when fully populated.  The number 20 was
+ *        found empirically to be in a broad maximum of efficiency.
+ *    (3) Note that the word "hash" is overloaded.  There are actually
+ *        two hashing steps: the first hashes the object to a l_uint64,
+ *        called the "key", and the second uses the mod function to
+ *        "hash" the "key" to the index of a particular Dna in the
+ *        DnaHash array.
+ *    (4) Insertion and lookup time for DnaHash is O(1).  Hash collisions
+ *        into the dna array are expected (we might choose to have an average
+ *        of 20 for each key).  This can be contrasted with using rbtree
+ *        for sets and maps, where insertion and lookup are O(logN).
+ *    (5) Hash functions that map points and strings to l_uint64 are
+ *        given in utils1.c.
+ *    (6) This is a very simple implementation, that expects that you
+ *        know approximately (i.e., within a factor of 2 or 3) how many
+ *        items are to be stored when you initialize the DnaHash.
+ *        It cannot modify the size of the Dna array as the occupation grows.
+ * </pre>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config_auto.h>
+#endif  /* HAVE_CONFIG_H */
+
+#include "allheaders.h"
+
+/*--------------------------------------------------------------------------*
+ *                   Dna hash: Accessors and modifiers                      *
+ *--------------------------------------------------------------------------*/
+/*!
+ * \brief   l_dnaHashGetCount()
+ *
+ * \param[in]    dahash
+ * \return  nbuckets allocated, or 0 on error
+ */
+l_int32
+l_dnaHashGetCount(L_DNAHASH  *dahash)
+{
+
+    if (!dahash)
+        return ERROR_INT("dahash not defined", __func__, 0);
+    return dahash->nbuckets;
+}
+
+
+/*!
+ * \brief   l_dnaHashGetTotalCount()
+ *
+ * \param[in]    dahash
+ * \return  n number of numbers in all dna, or 0 on error
+ */
+l_int32
+l_dnaHashGetTotalCount(L_DNAHASH  *dahash)
+{
+l_int32  i, n;
+L_DNA   *da;
+
+    if (!dahash)
+        return ERROR_INT("dahash not defined", __func__, 0);
+
+    for (i = 0, n = 0; i < dahash->nbuckets; i++) {
+        da = l_dnaHashGetDna(dahash, i, L_NOCOPY);
+        if (da)
+            n += l_dnaGetCount(da);
+    }
+
+    return n;
+}
+
+
+/*--------------------------------------------------------------------------*
+ *                   Set operations on dna using hashing                    *
+ *--------------------------------------------------------------------------*/
+/*!
+ * \brief   l_dnaHashCreateFromDna()
+ *
+ * \param[in]    da
+ * \return  dahash if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The values stored in the %dahash are indices into %da;
+ *          %dahash has no use without %da.
+ * </pre>
+ */
+L_DNAHASH *
+l_dnaHashCreateFromDna(L_DNA  *da)
+{
+l_int32     i, n;
+l_uint32    nsize;
+l_uint64    key;
+l_float64   val;
+L_DNAHASH  *dahash;
+
+    if (!da)
+        return (L_DNAHASH *)ERROR_PTR("da not defined", __func__, NULL);
+
+    n = l_dnaGetCount(da);
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+
+    dahash = l_dnaHashCreate(nsize, 8);
+    for (i = 0; i < n; i++) {
+        l_dnaGetDValue(da, i, &val);
+        l_dnaHashAdd(dahash, (l_uint64)val, (l_float64)i);
+    }
+
+    return dahash;
+}
+
+
+/*!
+ * \brief   l_dnaRemoveDupsByHash()
+ *
+ * \param[in]    das
+ * \param[out]   pdad      hash set
+ * \param[out]   pdahash   [optional] dnahash used for lookup
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Generates a dna with unique values.
+ *      (2) The dnahash is built up with dad to assure uniqueness.
+ *          It can be used to find if an element is in the set:
+ *              l_dnaFindValByHash(dad, dahash, val, &index)
+ * </pre>
+ */
+l_ok
+l_dnaRemoveDupsByHash(L_DNA       *das,
+                      L_DNA      **pdad,
+                      L_DNAHASH  **pdahash)
+{
+l_int32     i, n, index, items;
+l_uint32    nsize;
+l_uint64    key;
+l_float64   val;
+L_DNA      *dad;
+L_DNAHASH  *dahash;
+
+    if (pdahash) *pdahash = NULL;
+    if (!pdad)
+        return ERROR_INT("&dad not defined", __func__, 1);
+    *pdad = NULL;
+    if (!das)
+        return ERROR_INT("das not defined", __func__, 1);
+
+    n = l_dnaGetCount(das);
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+    dahash = l_dnaHashCreate(nsize, 8);
+    dad = l_dnaCreate(n);
+    *pdad = dad;
+    for (i = 0, items = 0; i < n; i++) {
+        l_dnaGetDValue(das, i, &val);
+        l_dnaFindValByHash(dad, dahash, val, &index);
+        if (index < 0) {  /* not found */
+            l_dnaHashAdd(dahash, (l_uint64)val, (l_float64)items);
+            l_dnaAddNumber(dad, val);
+            items++;
+        }
+    }
+
+    if (pdahash)
+        *pdahash = dahash;
+    else
+        l_dnaHashDestroy(&dahash);
+    return 0;
+}
+
+
+/*!
+ * \brief   l_dnaMakeHistoByHash()
+ *
+ * \param[in]    das
+ * \param[out]   pdahash    hash map: val --> index
+ * \param[out]   pdav       [optional] array of values: index --> val
+ * \param[out]   pdac       [optional] histo array of counts: index --> count
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Generates and returns a dna of occurrences (histogram),
+ *          an aligned dna of values, and an associated hashmap.
+ *          The hashmap takes %dav and a value, and points into the
+ *          histogram in %dac.
+ *      (2) The dna of values, %dav, is aligned with the histogram %dac,
+ *          and is needed for fast lookup.  It is a hash set, because
+ *          the values are unique.
+ *      (3) If you only need to make a histogram and get the number of
+ *          non-zero entries, here are two methods:
+ *          (a) l_dnaMakeHistoByHash(da, &dahash, NULL, NULL);
+ *              count = l_dnaHashGetTotalCount(dahash);
+ *          (b) l_dnaRemoveDupsByHash(da, &da_nodups, NULL);
+                count = l_dnaGetCount(da_nodups);
+ *      (4) Lookup is simple:
+ *              l_dnaFindValByHash(dav, dahash, val, &index);
+ *              if (index >= 0)
+ *                  l_dnaGetIValue(dac, index, &icount);
+ *              else
+ *                  icount = 0;
+ * </pre>
+ */
+l_ok
+l_dnaMakeHistoByHash(L_DNA       *das,
+                     L_DNAHASH  **pdahash,
+                     L_DNA      **pdav,
+                     L_DNA      **pdac)
+{
+l_int32     i, n, nitems, index, count;
+l_uint32    nsize;
+l_uint64    key;
+l_float64   val;
+L_DNA      *dac, *dav;
+L_DNAHASH  *dahash;
+
+    if (pdahash) *pdahash = NULL;
+    if (pdac) *pdac = NULL;
+    if (pdav) *pdav = NULL;
+    if (!pdahash)
+        return ERROR_INT("&dahash not defined", __func__, 1);
+    if (!das)
+        return ERROR_INT("das not defined", __func__, 1);
+    if ((n = l_dnaGetCount(das)) == 0)
+        return ERROR_INT("no data in das", __func__, 1);
+
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+    dahash = l_dnaHashCreate(nsize, 8);
+    dac = l_dnaCreate(n);  /* histogram */
+    dav = l_dnaCreate(n);  /* the values */
+    for (i = 0, nitems = 0; i < n; i++) {
+        l_dnaGetDValue(das, i, &val);
+            /* Is this value already stored in dav? */
+        l_dnaFindValByHash(dav, dahash, val, &index);
+        if (index >= 0) {  /* found */
+            l_dnaGetIValue(dac, (l_float64)index, &count);
+            l_dnaSetValue(dac, (l_float64)index, count + 1);
+        } else {  /* not found */
+            l_dnaHashAdd(dahash, (l_uint64)val, (l_float64)nitems);
+            l_dnaAddNumber(dav, val);
+            l_dnaAddNumber(dac, 1);
+            nitems++;
+        }
+    }
+
+    *pdahash = dahash;
+    if (pdac)
+        *pdac = dac;
+    else
+        l_dnaDestroy(&dac);
+    if (pdav)
+        *pdav = dav;
+    else
+        l_dnaDestroy(&dav);
+    return 0;
+}
+
+
+/*!
+ * \brief   l_dnaIntersectionByHash()
+ *
+ * \param[in]    da1, da2
+ * \return  dad   intersection of the number arrays, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This uses the same method for building the intersection set
+ *          as ptaIntersectionByHash() and sarrayIntersectionByHash().
+ * </pre>
+ */
+L_DNA *
+l_dnaIntersectionByHash(L_DNA  *da1,
+                        L_DNA  *da2)
+{
+l_int32     n1, n2, nsmall, nbuckets, i, index1, index2;
+l_uint32    nsize2;
+l_uint64    key;
+l_float64   val;
+L_DNAHASH  *dahash1, *dahash2;
+L_DNA      *da_small, *da_big, *dad;
+
+    if (!da1)
+        return (L_DNA *)ERROR_PTR("da1 not defined", __func__, NULL);
+    if (!da2)
+        return (L_DNA *)ERROR_PTR("da2 not defined", __func__, NULL);
+
+        /* Put the elements of the biggest array into a dnahash */
+    n1 = l_dnaGetCount(da1);
+    n2 = l_dnaGetCount(da2);
+    da_small = (n1 < n2) ? da1 : da2;   /* do not destroy da_small */
+    da_big = (n1 < n2) ? da2 : da1;   /* do not destroy da_big */
+    dahash1 = l_dnaHashCreateFromDna(da_big);
+
+        /* Build up the intersection of numbers.  Add to %dad
+         * if the number is in da_big (using dahash1) but hasn't
+         * yet been seen in the traversal of da_small (using dahash2). */
+    dad = l_dnaCreate(0);
+    nsmall = l_dnaGetCount(da_small);
+    findNextLargerPrime(nsmall / 20, &nsize2);  /* buckets in hash table */
+    dahash2 = l_dnaHashCreate(nsize2, 0);
+    nbuckets = l_dnaHashGetCount(dahash2);
+    for (i = 0; i < nsmall; i++) {
+        l_dnaGetDValue(da_small, i, &val);
+        l_dnaFindValByHash(da_big, dahash1, val, &index1);
+        if (index1 >= 0) {  /* found */
+            l_dnaFindValByHash(da_small, dahash2, val, &index2);
+            if (index2 == -1) {  /* not found */
+                l_dnaAddNumber(dad, val);
+                l_dnaHashAdd(dahash2, (l_uint64)val, (l_float64)i);
+            }
+        }
+    }
+
+    l_dnaHashDestroy(&dahash1);
+    l_dnaHashDestroy(&dahash2);
+    return dad;
+}
+
+
+/*!
+ * \brief   l_dnaFindValByHash()
+ *
+ * \param[in]    da
+ * \param[in]    dahash    containing indices into %da
+ * \param[in]    val       searching for this number in %da
+ * \param[out]   pindex    index into da if found; -1 otherwise
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Algo: hash %val into a key; hash the key to get the dna
+ *                in %dahash (that holds indices into %da); traverse
+ *                the dna of indices looking for %val in %da.
+ * </pre>
+ */
+l_ok
+l_dnaFindValByHash(L_DNA      *da,
+                   L_DNAHASH  *dahash,
+                   l_float64   val,
+                   l_int32    *pindex)
+{
+l_int32    i, nbuckets, nvals, indexval;
+l_float64  vali;
+l_uint64   key;
+L_DNA     *da1;
+
+    if (!pindex)
+        return ERROR_INT("&index not defined", __func__, 1);
+    *pindex = -1;
+    if (!da)
+        return ERROR_INT("da not defined", __func__, 1);
+    if (!dahash)
+        return ERROR_INT("dahash not defined", __func__, 1);
+
+    nbuckets = l_dnaHashGetCount(dahash);
+    da1 = l_dnaHashGetDna(dahash, (l_uint64)val, L_NOCOPY);
+    if (!da1) return 0;
+
+        /* Run through da1, looking for this %val */
+    nvals = l_dnaGetCount(da1);
+    for (i = 0; i < nvals; i++) {
+        l_dnaGetIValue(da1, i, &indexval);
+        l_dnaGetDValue(da, indexval, &vali);
+        if (val == vali) {
+            *pindex = indexval;
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                Set operations on points using hashing               *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   ptaUnionByHash()
+ *
+ * \param[in]    pta1, pta2
+ * \return  ptad with the union of the set of points, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is faster than ptaUnionByAset(), because the
+ *          bucket lookup is O(n).  It should be used if the pts are
+ *          integers (e.g., representing pixel positions).
+ * </pre>
+ */
+PTA *
+ptaUnionByHash(PTA  *pta1,
+               PTA  *pta2)
+{
+PTA  *pta3, *ptad;
+
+    if (!pta1)
+        return (PTA *)ERROR_PTR("pta1 not defined", __func__, NULL);
+    if (!pta2)
+        return (PTA *)ERROR_PTR("pta2 not defined", __func__, NULL);
+
+        /* Join */
+    pta3 = ptaCopy(pta1);
+    ptaJoin(pta3, pta2, 0, -1);
+
+        /* Eliminate duplicates */
+    ptaRemoveDupsByHash(pta3, &ptad, NULL);
+    ptaDestroy(&pta3);
+    return ptad;
+}
+
+
+/*!
+ * \brief   ptaRemoveDupsByHash()
+ *
+ * \param[in]    ptas      assumed to be integer values
+ * \param[out]   pptad     unique set of pts; duplicates removed
+ * \param[out]   pdahash   [optional] dnahash used for lookup
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Generates a pta with unique values.
+ *      (2) The dnahash is built up with ptad to assure uniqueness.
+ *          It can be used to find if a point is in the set:
+ *              ptaFindPtByHash(ptad, dahash, x, y, &index)
+ *      (3) The hash of the (x,y) location is simple and fast.  It scales
+ *          up with the number of buckets to insure a fairly random
+ *          bucket selection for adjacent points.
+ *      (4) A Dna is used rather than a Numa because we need accurate
+ *          representation of 32-bit integers that are indices into ptas.
+ *          Integer --> float --> integer conversion makes errors for
+ *          integers larger than 10M.
+ *      (5) This is faster than ptaRemoveDupsByAset(), because the
+ *          bucket lookup is O(n), although there is a double-loop
+ *          lookup within the dna in each bucket.
+ * </pre>
+ */
+l_ok
+ptaRemoveDupsByHash(PTA         *ptas,
+                    PTA        **pptad,
+                    L_DNAHASH  **pdahash)
+{
+l_int32     i, n, index, items, x, y;
+l_uint32    nsize;
+l_uint64    key;
+PTA        *ptad;
+L_DNAHASH  *dahash;
+
+    if (pdahash) *pdahash = NULL;
+    if (!pptad)
+        return ERROR_INT("&ptad not defined", __func__, 1);
+    *pptad = NULL;
+    if (!ptas)
+        return ERROR_INT("ptas not defined", __func__, 1);
+
+    n = ptaGetCount(ptas);
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+    dahash = l_dnaHashCreate(nsize, 8);
+    ptad = ptaCreate(n);
+    *pptad = ptad;
+    for (i = 0, items = 0; i < n; i++) {
+        ptaGetIPt(ptas, i, &x, &y);
+        ptaFindPtByHash(ptad, dahash, x, y, &index);
+        if (index < 0) {  /* not found */
+            l_hashPtToUint64(x, y, &key);
+            l_dnaHashAdd(dahash, key, (l_float64)items);
+            ptaAddPt(ptad, x, y);
+            items++;
+        }
+    }
+
+    if (pdahash)
+        *pdahash = dahash;
+    else
+        l_dnaHashDestroy(&dahash);
+    return 0;
+}
+
+
+/*!
+ * \brief   ptaIntersectionByHash()
+ *
+ * \param[in]    pta1, pta2
+ * \return  ptad intersection of the point sets, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is faster than ptaIntersectionByAset(), because the
+ *          bucket lookup is O(n).  It should be used if the pts are
+ *          integers (e.g., representing pixel positions).
+ * </pre>
+ */
+PTA *
+ptaIntersectionByHash(PTA  *pta1,
+                      PTA  *pta2)
+{
+l_int32     n1, n2, nsmall, i, x, y, index1, index2;
+l_uint32    nsize2;
+l_uint64    key;
+L_DNAHASH  *dahash1, *dahash2;
+PTA        *pta_small, *pta_big, *ptad;
+
+    if (!pta1)
+        return (PTA *)ERROR_PTR("pta1 not defined", __func__, NULL);
+    if (!pta2)
+        return (PTA *)ERROR_PTR("pta2 not defined", __func__, NULL);
+
+        /* Put the elements of the biggest pta into a dnahash */
+    n1 = ptaGetCount(pta1);
+    n2 = ptaGetCount(pta2);
+    pta_small = (n1 < n2) ? pta1 : pta2;   /* do not destroy pta_small */
+    pta_big = (n1 < n2) ? pta2 : pta1;   /* do not destroy pta_big */
+    dahash1 = l_dnaHashCreateFromPta(pta_big);
+
+        /* Build up the intersection of points.  Add to ptad
+         * if the point is in pta_big (using dahash1) but hasn't
+         * yet been seen in the traversal of pta_small (using dahash2). */
+    ptad = ptaCreate(0);
+    nsmall = ptaGetCount(pta_small);
+    findNextLargerPrime(nsmall / 20, &nsize2);  /* buckets in hash table */
+    dahash2 = l_dnaHashCreate(nsize2, 0);
+    for (i = 0; i < nsmall; i++) {
+        ptaGetIPt(pta_small, i, &x, &y);
+        ptaFindPtByHash(pta_big, dahash1, x, y, &index1);
+        if (index1 >= 0) {  /* found */
+            ptaFindPtByHash(pta_small, dahash2, x, y, &index2);
+            if (index2 == -1) {  /* not found */
+                ptaAddPt(ptad, x, y);
+                l_hashPtToUint64(x, y, &key);
+                l_dnaHashAdd(dahash2, key, (l_float64)i);
+            }
+        }
+    }
+
+    l_dnaHashDestroy(&dahash1);
+    l_dnaHashDestroy(&dahash2);
+    return ptad;
+}
+
+
+/*!
+ * \brief   ptaFindPtByHash()
+ *
+ * \param[in]    pta
+ * \param[in]    dahash     built from pta
+ * \param[in]    x, y       arbitrary points
+ * \param[out]   pindex     index into pta if (x,y) is in pta; -1 otherwise
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Fast lookup in dnaHash associated with a pta, to see if a
+ *          random point (x,y) is already stored in the hash table.
+ *      (2) We use a strong hash function to minimize the chance that
+ *          two different points hash to the same key value.
+ *      (3) We select the number of buckets to be about 5% of the size
+ *          of the input %pta, so that when fully populated, each
+ *          bucket (dna) will have about 20 entries, each being an index
+ *          into %pta.  In lookup, after hashing to the key, and then
+ *          again to the bucket, we traverse the bucket (dna), using the
+ *          index into %pta to check if the point (x,y) has been found before.
+ * </pre>
+ */
+l_ok
+ptaFindPtByHash(PTA        *pta,
+                L_DNAHASH  *dahash,
+                l_int32     x,
+                l_int32     y,
+                l_int32    *pindex)
+{
+l_int32   i, nvals, index, xi, yi;
+l_uint64  key;
+L_DNA    *da;
+
+    if (!pindex)
+        return ERROR_INT("&index not defined", __func__, 1);
+    *pindex = -1;
+    if (!pta)
+        return ERROR_INT("pta not defined", __func__, 1);
+    if (!dahash)
+        return ERROR_INT("dahash not defined", __func__, 1);
+
+    l_hashPtToUint64(x, y, &key);
+    da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
+    if (!da) return 0;
+
+        /* Run through the da, looking for this point */
+    nvals = l_dnaGetCount(da);
+    for (i = 0; i < nvals; i++) {
+        l_dnaGetIValue(da, i, &index);
+        ptaGetIPt(pta, index, &xi, &yi);
+        if (x == xi && y == yi) {
+            *pindex = index;
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+
+/*!
+ * \brief   l_dnaHashCreateFromPta()
+ *
+ * \param[in]    pta
+ * \return  dahash, or NULL on error
+ */
+L_DNAHASH *
+l_dnaHashCreateFromPta(PTA  *pta)
+{
+l_int32     i, n, x, y;
+l_uint32    nsize;
+l_uint64    key;
+L_DNAHASH  *dahash;
+
+    if (!pta)
+        return (L_DNAHASH *)ERROR_PTR("pta not defined", __func__, NULL);
+
+        /* Build up dnaHash of indices, hashed by a key that is
+         * a large linear combination of x and y values designed to
+         * randomize the key.  Having about 20 pts in each bucket is
+         * roughly optimal for speed for large sets. */
+    n = ptaGetCount(pta);
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+
+        /* Add each point, using the hash as key and the index into
+         * %ptas as the value.  Storing the index enables operations
+         * that check for duplicates. */
+    dahash = l_dnaHashCreate(nsize, 8);
+    for (i = 0; i < n; i++) {
+        ptaGetIPt(pta, i, &x, &y);
+        l_hashPtToUint64(x, y, &key);
+        l_dnaHashAdd(dahash, key, (l_float64)i);
+    }
+
+    return dahash;
+}
+
+
+/*----------------------------------------------------------------------*
+ *               Set operations on sarray using hashing                 *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayRemoveDupsByHash()
+ *
+ * \param[in]    sas
+ * \param[out]   psad      unique set of strings; duplicates removed
+ * \param[out]   pdahash   [optional] dnahash used for lookup
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Generates a sarray with unique values.
+ *      (2) The dnahash is built up with sad to assure uniqueness.
+ *          It can be used to find if a string is in the set:
+ *              sarrayFindValByHash(sad, dahash, str, &index)
+ *      (3) The hash of the string location is simple and fast.  It scales
+ *          up with the number of buckets to insure a fairly random
+ *          bucket selection input strings.
+ *      (4) This is faster than sarrayRemoveDupsByAset(), because the
+ *          bucket lookup is O(n), although there is a double-loop
+ *          lookup within the dna in each bucket.
+ * </pre>
+ */
+l_ok
+sarrayRemoveDupsByHash(SARRAY      *sas,
+                       SARRAY     **psad,
+                       L_DNAHASH  **pdahash)
+{
+char       *str;
+l_int32     i, n, index, items;
+l_uint32    nsize;
+l_uint64    key;
+SARRAY     *sad;
+L_DNAHASH  *dahash;
+
+    if (pdahash) *pdahash = NULL;
+    if (!psad)
+        return ERROR_INT("&sad not defined", __func__, 1);
+    *psad = NULL;
+    if (!sas)
+        return ERROR_INT("sas not defined", __func__, 1);
+
+    n = sarrayGetCount(sas);
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+    dahash = l_dnaHashCreate(nsize, 8);
+    sad = sarrayCreate(n);
+    *psad = sad;
+    for (i = 0, items = 0; i < n; i++) {
+        str = sarrayGetString(sas, i, L_NOCOPY);
+        sarrayFindStringByHash(sad, dahash, str, &index);
+        if (index < 0) {  /* not found */
+            l_hashStringToUint64(str, &key);
+            l_dnaHashAdd(dahash, key, (l_float64)items);
+            sarrayAddString(sad, str, L_COPY);
+            items++;
+        }
+    }
+
+    if (pdahash)
+        *pdahash = dahash;
+    else
+        l_dnaHashDestroy(&dahash);
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayIntersectionByHash()
+ *
+ * \param[in]    sa1, sa2
+ * \return  sad  intersection of the strings, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is faster than sarrayIntersectionByAset(), because the
+ *          bucket lookup is O(n).
+ * </pre>
+ */
+SARRAY *
+sarrayIntersectionByHash(SARRAY  *sa1,
+                         SARRAY  *sa2)
+{
+char       *str;
+l_int32     n1, n2, nsmall, i, index1, index2;
+l_uint32    nsize2;
+l_uint64    key;
+L_DNAHASH  *dahash1, *dahash2;
+SARRAY     *sa_small, *sa_big, *sad;
+
+    if (!sa1)
+        return (SARRAY *)ERROR_PTR("sa1 not defined", __func__, NULL);
+    if (!sa2)
+        return (SARRAY *)ERROR_PTR("sa2 not defined", __func__, NULL);
+
+        /* Put the elements of the biggest sarray into a dnahash */
+    n1 = sarrayGetCount(sa1);
+    n2 = sarrayGetCount(sa2);
+    sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */
+    sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */
+    dahash1 = l_dnaHashCreateFromSarray(sa_big);
+
+        /* Build up the intersection of strings.  Add to %sad
+         * if the string is in sa_big (using dahash1) but hasn't
+         * yet been seen in the traversal of sa_small (using dahash2). */
+    sad = sarrayCreate(0);
+    nsmall = sarrayGetCount(sa_small);
+    findNextLargerPrime(nsmall / 20, &nsize2);  /* buckets in hash table */
+    dahash2 = l_dnaHashCreate(nsize2, 0);
+    for (i = 0; i < nsmall; i++) {
+        str = sarrayGetString(sa_small, i, L_NOCOPY);
+        sarrayFindStringByHash(sa_big, dahash1, str, &index1);
+        if (index1 >= 0) {
+            sarrayFindStringByHash(sa_small, dahash2, str, &index2);
+            if (index2 == -1) {
+                sarrayAddString(sad, str, L_COPY);
+                l_hashStringToUint64(str, &key);
+                l_dnaHashAdd(dahash2, key, (l_float64)i);
+            }
+        }
+    }
+
+    l_dnaHashDestroy(&dahash1);
+    l_dnaHashDestroy(&dahash2);
+    return sad;
+}
+
+
+/*!
+ * \brief   sarrayFindStringByHash()
+ *
+ * \param[in]    sa
+ * \param[in]    dahash   built from sa
+ * \param[in]    str      arbitrary string
+ * \param[out]   pindex   index into %sa if %str is in %sa; -1 otherwise
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Fast lookup in dnaHash associated with a sarray, to see if a
+ *          random string %str is already stored in the hash table.
+ *      (2) We use a strong hash function to minimize the chance that
+ *          two different strings hash to the same key value.
+ *      (3) We select the number of buckets to be about 5% of the size
+ *          of the input sarray, so that when fully populated, each
+ *          bucket (dna) will have about 20 entries, each being an index
+ *          into sa.  In lookup, after hashing to the key, and then
+ *          again to the bucket, we traverse the bucket (dna), using the
+ *          index into sa to check if %str has been found before.
+ * </pre>
+ */
+l_ok
+sarrayFindStringByHash(SARRAY      *sa,
+                       L_DNAHASH   *dahash,
+                       const char  *str,
+                       l_int32     *pindex)
+{
+char     *stri;
+l_int32   i, nvals, index;
+l_uint64  key;
+L_DNA    *da;
+
+    if (!pindex)
+        return ERROR_INT("&index not defined", __func__, 1);
+    *pindex = -1;
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!dahash)
+        return ERROR_INT("dahash not defined", __func__, 1);
+
+    l_hashStringToUint64(str, &key);
+    da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
+    if (!da) return 0;
+
+        /* Run through the da, looking for this string */
+    nvals = l_dnaGetCount(da);
+    for (i = 0; i < nvals; i++) {
+        l_dnaGetIValue(da, i, &index);
+        stri = sarrayGetString(sa, index, L_NOCOPY);
+        if (!strcmp(str, stri)) {  /* duplicate */
+            *pindex = index;
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+
+/*!
+ * \brief   l_dnaHashCreateFromSarray()
+ *
+ * \param[in]    sa
+ * \return  dahash, or NULL on error
+ */
+L_DNAHASH *
+l_dnaHashCreateFromSarray(SARRAY  *sa)
+{
+char       *str;
+l_int32     i, n;
+l_uint32    nsize;
+l_uint64    key;
+L_DNAHASH  *dahash;
+
+        /* Build up dnaHash of indices, hashed by a 64-bit key that
+         * should randomize the lower bits used in bucket selection.
+         * Having about 20 pts in each bucket is roughly optimal. */
+    n = sarrayGetCount(sa);
+    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
+/*    lept_stderr("Prime used: %d\n", nsize); */
+
+        /* Add each string, using the hash as key and the index into %sa
+         * as the value.  Storing the index enables operations that check
+         * for duplicates.  */
+    dahash = l_dnaHashCreate(nsize, 8);
+    for (i = 0; i < n; i++) {
+        str = sarrayGetString(sa, i, L_NOCOPY);
+        l_hashStringToUint64(str, &key);
+        l_dnaHashAdd(dahash, key, (l_float64)i);
+    }
+
+    return dahash;
+}
+
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children