view cutils/util/__init__.py @ 177:089c40240061

Add an alternate implementation for generating directory tree digests: - Do not use something like os.walk() but use os.scandir() directly. - Recursively generate the subdirectory digests only when needed and in the right order. This fixes that the order of subdirectories in the output did not match the application order of its directory digests. The new implementation also should make filtering (that will be implemented later) easier. NOTE: The tree digests of the old and the new implementation are identical.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Jan 2025 17:41:28 +0100
parents e081b6ee5570
children 6154b8e4ba94
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# :Copyright: (c) 2020-2025 Franz Glasner
# :License:   BSD-3-Clause
# :-
r"""Utility package.

"""

# Public API of this package: the names exported by a star-import.
# Every public helper defined in this module is listed here.
__all__ = ["PY2",
           "PY35",
           "normalize_filename",
           "argv2algo",
           "algotag2algotype",
           "get_blake2b",
           "get_blake2b_256",
           "get_blake2s",
           "default_algotag",
           "fsencode",
           "interpolate_bytes",
           ]


import argparse
import hashlib
import os
import sys


# True when running on a Python 2 interpreter
PY2 = sys.version_info < (3,)
# True when running on Python 3.5 or any later version
PY35 = sys.version_info >= (3, 5)


def default_algotag():
    """Return the tag of the "best" digest algorithm that is available.

    Only :mod:`hashlib` is consulted; a possibly installed :mod:`pyblake2`
    is not taken into account.

    The preference order is BLAKE2b-256, then SHA256 and finally SHA1
    as the fallback that is returned without any further check.

    """
    available = hashlib.algorithms_available
    # (name as known to hashlib, tag to return) -- most preferred first
    preferences = (
        ("blake2b", "BLAKE2b-256"),
        ("sha256", "SHA256"),
    )
    for hashlib_name, tag in preferences:
        if hashlib_name in available:
            return tag
    return "SHA1"


def get_blake2b():
    """Return the blake2b factory.

    Use :mod:`hashlib` if it provides ``blake2b``; otherwise fall back
    to :mod:`pyblake2`, which is imported only in that case.

    """
    factory = getattr(hashlib, "blake2b", None)
    if factory is None:
        import pyblake2
        factory = pyblake2.blake2b
    return factory


def get_blake2b_256():
    """Return a factory for blake2b with a 256 bit (32 byte) digest.

    The returned callable takes no arguments and yields a new hash object
    on every call.  The blake2b implementation comes from :mod:`hashlib`
    if available, otherwise from :mod:`pyblake2`.

    """
    try:
        blake2b = hashlib.blake2b
    except AttributeError:
        import pyblake2
        blake2b = pyblake2.blake2b

    def _get_blake():
        return blake2b(digest_size=32)

    return _get_blake


def get_blake2s():
    """Return the blake2s factory.

    Use :mod:`hashlib` if it provides ``blake2s``; otherwise fall back
    to :mod:`pyblake2`, which is imported only in that case.

    """
    factory = getattr(hashlib, "blake2s", None)
    if factory is None:
        import pyblake2
        factory = pyblake2.blake2s
    return factory


def argv2algo(s):
    """Translate an algorithm name given on the command line.

    :param str s: the specifier from the command line; every algorithm tag
                  used for output is accepted too (for proper
                  round-tripping)
    :return: the internal digest specification
    :rtype: a tuple (digest_type_or_factory, name_in_output)
    :raises argparse.ArgumentTypeError: for unrecognized algorithms or names

    The specifier is matched case-insensitively.

    """
    key = s.lower()
    # (accepted spellings, attribute name in hashlib, tag used in output).
    # The hashlib attribute is looked up only for the entry that matches.
    hashlib_algos = (
        (("1", "sha1"), "sha1", "SHA1"),
        (("224", "sha224"), "sha224", "SHA224"),
        (("256", "sha256"), "sha256", "SHA256"),
        (("384", "sha384"), "sha384", "SHA384"),
        (("512", "sha512"), "sha512", "SHA512"),
        (("3-224", "sha3-224"), "sha3_224", "SHA3-224"),
        (("3-256", "sha3-256"), "sha3_256", "SHA3-256"),
        (("3-384", "sha3-384"), "sha3_384", "SHA3-384"),
        (("3", "3-512", "sha3-512"), "sha3_512", "SHA3-512"),
        (("md5",), "md5", "MD5"),
    )
    for spellings, attrname, tag in hashlib_algos:
        if key in spellings:
            return (getattr(hashlib, attrname), tag)
    # The BLAKE2 family is resolved by helpers of this module
    if key in ("blake2b", "blake2b-512", "blake2", "blake2-512"):
        return (get_blake2b(), "BLAKE2b")
    if key in ("blake2s", "blake2s-256"):
        return (get_blake2s(), "BLAKE2s")
    if key in ("blake2-256", "blake2b-256"):
        return (get_blake2b_256(), "BLAKE2b-256")
    # NOTE: the message shows the lower-cased specifier
    raise argparse.ArgumentTypeError(
        "`{}' is not a recognized algorithm".format(key))


def algotag2algotype(s):
    """Map an algorithm tag from a BSD-style digest file to the
    type/factory of the corresponding algorithm.

    :param str s: the tag (i.e. the normalized name) of the algorithm
    :return: the digest type or factory for `s`
    :raises ValueError: on unknown and/or unhandled algorithms

    The tag is matched case-sensitively.

    """
    # (tag, attribute name in hashlib).  The attribute is looked up only
    # for the entry that matches.
    hashlib_tags = (
        ("SHA1", "sha1"),
        ("SHA224", "sha224"),
        ("SHA256", "sha256"),
        ("SHA384", "sha384"),
        ("SHA512", "sha512"),
        ("SHA3-224", "sha3_224"),
        ("SHA3-256", "sha3_256"),
        ("SHA3-384", "sha3_384"),
        ("SHA3-512", "sha3_512"),
        ("MD5", "md5"),
    )
    for tag, attrname in hashlib_tags:
        if s == tag:
            return getattr(hashlib, attrname)
    # BLAKE2 tags: the variants without a dash are compat for openssl dgst
    if s in ("BLAKE2b", "BLAKE2b-512", "BLAKE2b512"):
        return get_blake2b()
    if s in ("BLAKE2s", "BLAKE2s-256", "BLAKE2s256"):
        return get_blake2s()
    if s in ("BLAKE2b-256", "BLAKE2b256"):
        return get_blake2b_256()
    raise ValueError("unknown algorithm: {}".format(s))


def normalize_filename(filename, strip_leading_dot_slash=False):
    """Normalize the path separators in `filename`.

    :param filename: the filename as :class:`bytes` or as text
    :param bool strip_leading_dot_slash: if true then remove every leading
                                         ``./`` after the separators have
                                         been normalized
    :return: `filename` with every backslash replaced by a forward slash;
             it has the same type as the input

    """
    if isinstance(filename, bytes):
        backslash, slash, dot_slash = b"\\", b"/", b"./"
    else:
        backslash, slash, dot_slash = u"\\", u"/", u"./"
    normalized = filename.replace(backslash, slash)
    if strip_leading_dot_slash:
        # A name may start with more than one "./"
        while normalized.startswith(dot_slash):
            normalized = normalized[len(dot_slash):]
    return normalized


def fsencode(what):
    """A thin compatibility wrapper for :func:`os.fsencode`.

    A `what` that is of type :class:`bytes` already is returned unchanged;
    everything else is handed over to :func:`os.fsencode`.

    """
    return what if isinstance(what, bytes) else os.fsencode(what)


def interpolate_bytes(formatstr, *values):
    """Interpolate byte strings also on Python 3.4.

    :param bytes formatstr: the format with printf-style conversions
    :param values: params for interpolation: may *not* contain Unicode strings
    :return: the formatted octets
    :rtype: bytes
    :raises TypeError: if the emulation for Python 3.0 - 3.4 is active and
                       one of the `values` is a Unicode string

    """
    assert isinstance(formatstr, bytes)
    # Python 3.5+ or Python2 know how to interpolate byte strings
    if PY35 or PY2:
        return formatstr % values
    # Only Python 3.0 - 3.4 gets here: emulate the bytes interpolation with
    # a Latin-1 dance.  Latin-1 maps every octet to exactly one code point
    # and back, so the round-trip is lossless.
    tvalues = []
    for v in values:
        if isinstance(v, str):
            # Reject text explicitly (not with an `assert`, which is removed
            # by -O); the native bytes interpolation raises TypeError too.
            raise TypeError(
                "Unicode strings are not allowed when interpolating bytes")
        tvalues.append(v.decode("latin1") if isinstance(v, bytes) else v)
    return (formatstr.decode("latin1") % tuple(tvalues)).encode("latin1")