view cutils/util/__init__.py @ 380:58552d3d1766

treesum: begin unittests for treesums and .treesum files
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 12 May 2025 15:33:16 +0200
parents 6d7659a709f2
children
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2020-2025 Franz Glasner
# SPDX-License-Identifier: BSD-3-Clause
# :-
r"""Utility package.

"""

from __future__ import print_function, absolute_import


# Public API of this package: the names exported by a star-import.
# NOTE(review): `get_crc` is defined in this module but is not listed
# here -- confirm whether that omission is intentional.
__all__ = ["PY2",
           "PY35",
           "n", "b", "u",
           "parse_grouped_decimal_number",
           "normalize_filename",
           "escape_for_output",
           "argv2algo",
           "algotag2algotype",
           "algotag2digest_size",
           "get_blake2b",
           "get_blake2b_256",
           "get_blake2s",
           "default_algotag",
           "fsencode",
           "interpolate_bytes",
           ]


import argparse
import hashlib
import os
import sys


# Interpreter flags that select the compatibility implementations below
PY2 = sys.version_info < (3,)
PY35 = sys.version_info >= (3, 5)


if not PY2:

    # Translation table for `str.translate`: every grouping character is
    # mapped to `None`, i.e. it is removed from the string
    NORMALIZATION_DELETE_CHARS = dict.fromkeys(map(ord, u"., '_"))

    def n(s, encoding="ascii"):
        """Return `s` as native string: bytes-like input is decoded with
        `encoding`, everything else is passed through unchanged"""
        return s.decode(encoding) if isinstance(s, (bytes, bytearray)) else s

    def b(s, encoding="ascii", errors="strict"):
        """Return `s` as bytes: text input is encoded with `encoding` and
        `errors`, everything else is passed through unchanged"""
        return s.encode(encoding, errors) if isinstance(s, str) else s

    # The native string type is the Unicode string type here
    u = n

    def parse_grouped_decimal_number(s):
        """Parse a decimal integer that may contain the grouping
        characters ``.``, ``,``, space, ``'`` and ``_``"""
        digits = n(s).translate(NORMALIZATION_DELETE_CHARS)
        return int(digits, 10)

else:

    def n(s, encoding="ascii"):
        """Return `s` as native string: Unicode input is encoded with
        `encoding`, everything else is passed through unchanged"""
        if not isinstance(s, unicode):   # noqa: F821 undefined name 'unicode'
            return s
        return s.encode(encoding)

    def b(s, encoding="ascii", errors="strict"):
        """Return `s` as bytes: Unicode input is encoded with `encoding`
        and `errors`, everything else is passed through unchanged"""
        if not isinstance(s, unicode):   # noqa: F821 undefined name 'unicode'
            return s
        return s.encode(encoding, errors)

    def u(s, encoding="ascii"):
        """Return `s` as Unicode string: native (byte) strings are decoded
        with `encoding`, everything else is passed through unchanged"""
        if not isinstance(s, str):
            return s
        return s.decode(encoding)

    def parse_grouped_decimal_number(s):
        """Parse a decimal integer that may contain the grouping
        characters ``.``, ``,``, space, ``'`` and ``_``"""
        digits = n(s).translate(None, "., '_")
        return int(digits, 10)


def escape_for_output(what):
    """Make `what` safe for line and/or column-oriented output files.

    A backslash is doubled; newline, carriage return and tab are replaced
    by the literal escape sequences ``\\x0a``, ``\\x0d`` and ``\\x09``.

    :param what: the :class:`bytes` or text string to escape
    :return: the escaped value with the same type as `what`

    """
    if isinstance(what, bytes):
        substitutions = (
            (b'\\', b"\\\\"),
            (b'\n', b"\\x0a"),
            (b'\r', b"\\x0d"),
            (b'\t', b"\\x09"),
        )
    else:
        substitutions = (
            (u'\\', u"\\\\"),
            (u'\n', u"\\x0a"),
            (u'\r', u"\\x0d"),
            (u'\t', u"\\x09"),
        )
    escaped = what
    # The backslash must be handled first: the other replacements
    # introduce new backslashes that must not be doubled again
    for raw, replacement in substitutions:
        escaped = escaped.replace(raw, replacement)
    return escaped


def default_algotag():
    """Return the tag of the "best" default digest algorithm.

    The decision is based on what :mod:`hashlib` reports as available.
    The preference order is BLAKE2b-256, then SHA256; SHA1 is the last
    resort.

    A possibly installed :mod:`pyblake2` is not taken into account.

    :return: one of ``"BLAKE2b-256"``, ``"SHA256"`` or ``"SHA1"``
    :rtype: str

    """
    # Python <2.7.9 has no `algorithms_available`: use `algorithms` there
    # and an empty list if neither attribute exists
    available = getattr(hashlib,
                        "algorithms_available",
                        getattr(hashlib, "algorithms", []))
    preferences = (
        ("blake2b", "BLAKE2b-256"),
        ("sha256", "SHA256"),
    )
    for hashlib_name, tag in preferences:
        if hashlib_name in available:
            return tag
    return "SHA1"


def get_blake2b():
    """Return the BLAKE2b constructor.

    :mod:`hashlib` is preferred; if it has no `blake2b` the
    :mod:`pyblake2` module is imported and its implementation is returned.

    :raises ImportError: if neither implementation is available

    """
    factory = getattr(hashlib, "blake2b", None)
    if factory is None:
        import pyblake2
        factory = pyblake2.blake2b
    return factory


def get_blake2b_256():
    """Get the factory for blake2b-256.

    The implementation is taken from :mod:`hashlib` if it provides
    `blake2b`; otherwise :mod:`pyblake2` is imported.

    :return: a callable that creates a BLAKE2b object with a fixed
             `digest_size` of 32 bytes; like the constructors in
             :mod:`hashlib` it optionally accepts the initial data to hash
    :raises ImportError: if neither implementation is available

    """
    try:
        blake2b = hashlib.blake2b
    except AttributeError:
        import pyblake2
        blake2b = pyblake2.blake2b

    def _get_blake(data=b""):
        # Accept optional initial data so that this factory can be used
        # interchangeably with the plain hashlib constructors
        return blake2b(data, digest_size=32)

    return _get_blake


def get_blake2s():
    """Return the BLAKE2s constructor.

    :mod:`hashlib` is preferred; if it has no `blake2s` the
    :mod:`pyblake2` module is imported and its implementation is returned.

    :raises ImportError: if neither implementation is available

    """
    factory = getattr(hashlib, "blake2s", None)
    if factory is None:
        import pyblake2
        factory = pyblake2.blake2s
    return factory


def get_crc(name):
    """Return a factory for the predefined CRC called `name`.

    The sibling package `crcmod` is imported when this function is called,
    not when the module is loaded.

    :param name: the CRC name that is passed to `PredefinedCrc`
    :return: a callable without parameters that yields a new
             `PredefinedCrc` object for `name` on every call

    """
    from ..crcmod.predefined import PredefinedCrc

    def _new_crc():
        return PredefinedCrc(name)

    return _new_crc


def argv2algo(s):
    """Translate an algorithm specifier from the command line into the
    digest type/factory and the algorithm tag that is used in the output.

    :param str s: the specifier from the command line; every algorithm
                  tag is accepted too (for proper round-tripping)
    :return: the internal digest specification
    :rtype: a tuple (digest_type_or_factory, name_in_output)
    :raises argparse.ArgumentTypeError: for unrecognized algorithms or names

    The specifier is lower-cased first: matching is case-insensitive.

    """
    spec = s.lower()

    # (accepted spellings, attribute name in hashlib, output tag)
    hashlib_algos = (
        (("1", "sha1"), "sha1", "SHA1"),
        (("224", "sha224"), "sha224", "SHA224"),
        (("256", "sha256"), "sha256", "SHA256"),
        (("384", "sha384"), "sha384", "SHA384"),
        (("512", "sha512"), "sha512", "SHA512"),
        (("3-224", "sha3-224"), "sha3_224", "SHA3-224"),
        (("3-256", "sha3-256"), "sha3_256", "SHA3-256"),
        (("3-384", "sha3-384"), "sha3_384", "SHA3-384"),
        (("3", "3-512", "sha3-512"), "sha3_512", "SHA3-512"),
        (("md5",), "md5", "MD5"),
    )
    for spellings, attr, tag in hashlib_algos:
        if spec in spellings:
            # Looked up only on a match: the attribute may not exist on
            # every Python version
            return (getattr(hashlib, attr), tag)

    # The BLAKE2 family is resolved by the helper functions
    if spec in ("blake2b", "blake2b-512", "blake2", "blake2-512"):
        return (get_blake2b(), "BLAKE2b")
    if spec in ("blake2s", "blake2s-256"):
        return (get_blake2s(), "BLAKE2s")
    if spec in ("blake2-256", "blake2b-256"):
        return (get_blake2b_256(), "BLAKE2b-256")

    # (accepted spellings, name given to get_crc(), output tag)
    crc_algos = (
        (("crc24", "crc-24",
          "crc24-openpgp", "crc-24-openpgp"),
         "crc-24", "CRC-24"),
        (("crc32", "crc-32",
          "crc32-pkzip", "crc-32-pkzip",
          "crc32-iso", "crc-32-iso",
          "crc32-iso-hdlc", "crc-32-iso-hdlc"),
         "crc-32", "CRC-32-ISO"),
        (("crc32-posix", "crc-32-posix",
          "crc32-cksum", "crc-32-cksum",
          "posix"),
         "posix", "CRC-32-POSIX"),
        (("crc64", "crc-64",
          "crc64-iso", "crc-64-iso"),
         "crc-64", "CRC-64-ISO"),
        (("crc64-2", "crc-64-2",
          "crc64-iso-2", "crc-64-iso-2",
          "crc64-mcrc64", "crc-64-mcrc64"),
         "crc-64-2", "CRC-64-ISO-2"),
        (("crc64-ecma", "crc-64-ecma"),
         "crc-64-ecma", "CRC-64-ECMA"),
        (("crc64-xz", "crc-64-xz",
          "crc64-go-ecma", "crc-64-go-ecma"),
         "crc-64-xz", "CRC-64-XZ"),
        (("crc64-go", "crc-64-go",
          "crc64-go-iso", "crc-64-go-iso"),
         "crc-64-go", "CRC-64-GO-ISO"),
        (("crc64-redis", "crc-64-redis"),
         "crc-64-redis", "CRC-64-REDIS"),
    )
    for spellings, crc_name, tag in crc_algos:
        if spec in spellings:
            return (get_crc(crc_name), tag)

    raise argparse.ArgumentTypeError(
        "`{}' is not a recognized algorithm".format(spec))


def algotag2algotype(s):
    """Map the algorithm tag of a BSD-style digest file to the type/factory
    of the corresponding algorithm.

    :param str s: the tag (i.e. normalized name) of the algorithm
    :return: the digest type or factory for `s`
    :raises ValueError: on unknown and/or unhandled algorithms
    :raises ImportError: if a module that is required to handle given
                         specifier `s` is not available (e.g. BLAKE2b on
                         Python 2)

    Tags are matched case-sensitively.

    """
    # (tag, attribute name in hashlib)
    hashlib_tags = (
        # Standard in Python2.7
        ("MD5", "md5"),
        ("SHA1", "sha1"),
        ("SHA224", "sha224"),
        ("SHA256", "sha256"),
        ("SHA384", "sha384"),
        ("SHA512", "sha512"),
        # Available in Python 3.6+
        ("SHA3-224", "sha3_224"),
        ("SHA3-256", "sha3_256"),
        ("SHA3-384", "sha3_384"),
        ("SHA3-512", "sha3_512"),
    )
    for tag, attr in hashlib_tags:
        if s == tag:
            return getattr(hashlib, attr)

    # Available in Python 3.6+ or if pyblake2 is installed; the variants
    # without a dash are accepted for compatibility with openssl dgst
    if s in ("BLAKE2b", "BLAKE2b-512", "BLAKE2b512"):
        return get_blake2b()
    if s in ("BLAKE2s", "BLAKE2s-256", "BLAKE2s256"):
        return get_blake2s()
    if s in ("BLAKE2b-256", "BLAKE2b256"):
        return get_blake2b_256()

    # Vendored in cutils.crcmod: (tag, name given to get_crc())
    crc_tags = (
        ("CRC-24", "crc-24"),
        ("CRC-32-ISO", "crc-32"),
        ("CRC-32-POSIX", "posix"),
        ("CRC-64-ISO", "crc-64"),
        ("CRC-64-ISO-2", "crc-64-2"),
        ("CRC-64-ECMA", "crc-64-ecma"),
        ("CRC-64-XZ", "crc-64-xz"),
        ("CRC-64-GO-ISO", "crc-64-go"),
        ("CRC-64-REDIS", "crc-64-redis"),
    )
    for tag, crc_name in crc_tags:
        if s == tag:
            return get_crc(crc_name)

    raise ValueError("unknown algorithm: {}".format(s))


def algotag2digest_size(s):
    """Get the `digest_size` in bytes from given algorithm specifier `s`.

    The size is normally read from a freshly created digest object.  If
    the algorithm cannot be instantiated because its implementation is
    missing, a small static database of digest sizes for algorithms that
    are not available by default in older Python versions is consulted.

    :param str s: the algorithm tag as accepted by
                  :func:`algotag2algotype`
    :return: the size of the digest in bytes
    :rtype: int
    :raises ValueError: on unknown and/or unhandled algorithms
    :raises ImportError: if a module that is required to handle given
                         specifier `s` is not available and `s` is not in
                         the static database
    :raises AttributeError: if :mod:`hashlib` has no implementation for
                            `s` and `s` is not in the static database

    All string comparisons are case-sensitive.

    """
    try:
        dgst = algotag2algotype(s)()
        return dgst.digest_size
    except (ImportError, AttributeError):
        # ImportError: pyblake2 is needed but not installed.
        # AttributeError: hashlib has no such constructor (e.g. the SHA3
        # family on older Python versions).
        sz = {
            "SHA3-224": 28,
            "SHA3-256": 32,
            "SHA3-384": 48,
            "SHA3-512": 64,
            "BLAKE2b": 64,
            "BLAKE2b-512": 64,
            "BLAKE2b512": 64,
            "BLAKE2b-256": 32,
            "BLAKE2b256": 32,
            "BLAKE2s": 32,
            "BLAKE2s-256": 32,
            "BLAKE2s256": 32,
        }.get(s, None)
        if not sz:
            # Not in the static database: propagate the original error
            raise
        return sz


def normalize_filename(filename, strip_dot_slashes=False):
    """Normalize the path separators in `filename` to forward slashes.

    :param filename: the :class:`bytes` or text filename
    :param bool strip_dot_slashes: if true also remove every leading
                                   ``./``, collapse every ``/./`` into
                                   ``/`` and remove every trailing ``/.``
    :return: the normalized filename with the same type as `filename`

    """
    # Choose path fragments that match the type of `filename`
    if isinstance(filename, bytes):
        backslash, slash, dot = b"\\", b"/", b"."
    else:
        backslash, slash, dot = u"\\", u"/", u"."

    normalized = filename.replace(backslash, slash)
    if not strip_dot_slashes:
        return normalized

    leading = dot + slash
    inner = slash + dot + slash
    trailing = slash + dot

    while normalized.startswith(leading):
        normalized = normalized[2:]
    # Replacing one occurrence at a time also handles adjacent /./ cases
    while inner in normalized:
        normalized = normalized.replace(inner, slash, 1)
    while normalized.endswith(trailing):
        normalized = normalized[:-2]
    return normalized


def fsencode(what):
    """A somewhat compatibility function for :func:`os.fsencode`.

    If `what` is of type :class:`bytes` no :func:`os.fsencode` is required
    and it is returned unchanged.

    On Python versions without :func:`os.fsencode` (Python 2) a text
    string is encoded with the filesystem encoding, or with the default
    encoding if no filesystem encoding is reported.

    :param what: the filename as :class:`bytes` or as text
    :return: the filename as :class:`bytes`
    :rtype: bytes

    """
    if isinstance(what, bytes):
        return what
    native_fsencode = getattr(os, "fsencode", None)
    if native_fsencode is not None:
        return native_fsencode(what)
    # Python 2: there is no os.fsencode, and sys.getfilesystemencoding()
    # may return None
    encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
    return what.encode(encoding)


def interpolate_bytes(formatstr, *values):
    """Interpolate byte strings also on Python 3.4.

    :param bytes formatstr: the printf-style format
    :param values: params for interpolation: may *not* contain Unicode strings
    :returns: the formatted octets
    :rtype: bytes
    :raises TypeError: if `values` contains a Unicode string; on
                       Python 3.5+ this check is left to the native
                       ``%`` operator of :class:`bytes`

    """
    assert isinstance(formatstr, bytes)
    # Python 3.5+ knows how to interpolate byte strings natively
    if PY35:
        return formatstr % values
    if PY2:
        # Python 2 can interpolate natively too, but it would silently
        # accept Unicode values: reject them explicitly to honour the
        # documented contract.
        for v in values:
            if isinstance(v, unicode):  # noqa: F821  undefined name 'unicode'
                raise TypeError(
                    "unicode values not supported when interpolating"
                    " into bytes")
        return formatstr % values
    # Python 3 before 3.5: workaround with a Latin-1 dance.
    # Do not automatically convert text (unicode) string values into bytes.
    for v in values:
        if isinstance(v, str):
            raise TypeError(
                "unicode (native string) values not supported when"
                " interpolating into bytes")
    tvalues = tuple(
        v.decode("latin1") if isinstance(v, bytes) else v
        for v in values)
    return (formatstr.decode("latin1") % tvalues).encode("latin1")