view cutils/util/glob.py @ 323:48430941c18c

Adopt copyright and license wordings from https://reuse.software/faq/. While there normalize copyright years for every file to start with the file's addition to the project (i.e. with the year of file creation).
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 26 Mar 2025 18:42:23 +0100
parents 6d1add815d14
children
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2020-2025 Franz Glasner
# SPDX-License-Identifier: BSD-3-Clause
# :-
r"""Glob handling.

.. seealso::
   - https://docs.oracle.com/javase/8/docs/api/java/nio/file/FileSystem.html#getPathMatcher-java.lang.String-
   - https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob
   - Mercurial: :command:`hg help patterns`

The following rules are used to interpret glob patterns:

- The ``*`` character matches zero or more characters of a name
  component without crossing directory boundaries.

- The ``**`` characters match zero or more characters crossing
  directory boundaries.

- ``**/`` matches zero or more subdirectories; files do not match.

- The ``?`` character matches exactly one character of a name component.

- The backslash character (``\``) is used to escape characters that
  would otherwise be interpreted as special characters. The expression
  ``\\`` matches a single backslash and ``\{`` matches a left brace for
  example.

- The ``[ ]`` characters are a bracket expression that matches a single
  character of a name component out of a set of characters. For example,
  ``[abc]`` matches "``a``", "``b``", or "``c``". The hyphen (``-``) may
  be used to specify a range so ``[a-z]`` specifies a range that matches
  from "``a``" to "``z``" (inclusive). These forms can be mixed so
  ``[abce-g]`` matches "``a``", "``b``", "``c``", "``e``", "``f``" or
  "``g``".

  If the character after the ``[`` is a ``!`` then it is used for negation
  so ``[!a-c]`` matches any character except "``a``", "``b``", or "``c``".

  Within a bracket expression the ``*``, ``?`` and ``\`` characters match
  themselves.

  The ``-`` character matches itself if it is the first or last character
  within the brackets, or the first or last character after the ``!`` if
  negating.

  Also, the ``]`` character matches itself if it is the first character
  within the brackets, or the first character after the ``!`` if negating.

- The curly brace characters ``{ }`` denote a group of subpatterns, where
  the group matches if any subpattern in the group matches.

  The ``,`` character is used to separate the subpatterns. Groups can be
  nested.

- Leading period/dot characters in file names are treated as regular
  characters in match operations. For example, the ``*`` glob pattern
  matches the file name ``.login``.

- All other characters match themselves.

"""

from __future__ import print_function, absolute_import


__all__ = ["glob_to_regexp"]


import logging
import re

from . import PY2


def glob_to_regexp(globobj):
    """Convert a glob string to a regular expression string.

    The resulting regexp is *not* rooted.

    Syntax errors in the glob do not raise an exception. They are reported
    with :func:`logging.warning` and handled leniently:

    - A lone trailing backslash matches a literal backslash.
    - A bracket expression without its closing ``]`` disables the glob
      syntax for the rest of the pattern: the ``[`` and all characters
      following it match themselves.
    - Missing closing braces ``}`` are added at the end of the pattern.

    :param globobj: the pattern with glob syntax or an iterator over the
                    characters in such a pattern
    :type globobj: str or iterator over str
    :return: the regular expression that corresponds to the given glob
    :rtype: str

    """
    res = []
    grouplvl = 0         # support for nested pattern groups
    globiter = CharIter(globobj)
    for c in globiter:
        if c == '?':
            res.append(".")
        elif c == '*':
            if globiter.peek() == '*':
                # extended glob
                next(globiter)
                if globiter.peek() == '/':
                    # `**/': zero or more complete directory levels
                    next(globiter)
                    res.append("(?:.*/)?")
                else:
                    # `**': anything, directory separators included
                    res.append(".*")
            else:
                # `*': anything within a single name component
                res.append("[^/]*")
        elif c == '\\':
            try:
                res.append(re.escape(next(globiter)))
            except StopIteration:
                # XXX FIXME: or raise an exception with an invalid syntax
                logging.warning(
                    "lone trailing backslash in glob: %s", globobj)
                res.append("\\\\")
        elif c == '[':
            #
            # Two temp stores are needed because the bracket expression
            # may turn out to be unterminated:
            #
            # - `bres' holds the already translated regexp fragments
            # - `braw' holds the very same glob characters untranslated;
            #   they are needed for the literal fallback below because
            #   the fragments in `bres' are not the original text any more
            #   (e.g. `!' has become `^' and `]' has become `\]').
            #
            bres = []
            braw = []
            if globiter.peek() == '!':  # XXX FIXME: handle '^' also? see below!
                braw.append(next(globiter))
                bres.append("^")
            # A leading `]', `-' or `^' (after an optional `!') is a literal
            if globiter.peek() == ']':
                braw.append(next(globiter))
                bres.append("\\]")
            elif globiter.peek() == '-':
                braw.append(next(globiter))
                bres.append("\\-")
            elif globiter.peek() == '^':
                #
                # XXX FIXME: as an extension: FreeBSD /bin/sh handles this
                #            like `!'. Should we follow it?
                #
                braw.append(next(globiter))
                if len(bres) > 0 and bres[0] == '^':
                    # not the first char in the regexp set: needs no escape
                    bres.append("^")
                else:
                    bres.append("\\^")
            for c2 in globiter:
                if c2 == ']':
                    # normal and regular break
                    if bres[-1] == '-':
                        # a trailing `-' is a literal and not a range
                        bres.insert(-1, "\\")
                    res.append("[")
                    res.extend(bres)
                    res.append("]")
                    break
                braw.append(c2)
                if c2 == '\\':
                    bres.append("\\\\")
                else:
                    bres.append(c2)    # no escaping needed
            else:
                # no trailing `]' char
                logging.warning(
                    "missing trailing bracket `]' in this glob: %s", globobj)
                #
                # FreeBSD's /bin/sh handles this like putting the given pattern
                # into single quotes -- effectively disabling any glob syntax.
                # We do this here also.
                #
                # Escape the *raw* glob characters: escaping the translated
                # fragments would yield a regexp that does not match the
                # text of the glob (`[!a' would match `[^a' instead).
                #
                res.append("\\[")
                res.append(re.escape("".join(braw)))
        elif c == '{':
            grouplvl += 1
            res.append("(?:")
        elif grouplvl > 0 and c == '}':
            grouplvl -= 1
            res.append(")")
        elif grouplvl > 0 and c == ',':
            # `,' is a separator within a group only
            res.append("|")
        else:
            res.append(re.escape(c))
    if grouplvl > 0:
        if grouplvl > 1:
            logging.warning("missing braces `}' in this glob: %s", globobj)
        else:
            logging.warning("missing brace `}' in this glob: %s", globobj)
        # close all the groups that are still open
        while grouplvl > 0:
            # XXX FIXME: what about trailing `|' chars
            grouplvl -= 1
            res.append(")")
    return "".join(res)


class CharIter(object):

    """Character iterator with a lookahead of one item.

    Wraps a byte string, a unicode string or an already existing iterator
    and hands out its items one at a time. :meth:`peek` allows to look at
    the upcoming item without consuming it.

    On Python3 an :class:`int` item (as yielded when iterating over
    :class:`bytes`) is always converted into a :class:`bytes` object of
    length one.

    """

    __slots__ = ("_it", "_nch")

    def __init__(self, w):
        # The conditional expression is evaluated lazily: `unicode' is
        # looked up on Python2 only.
        stringtypes = (bytes, unicode) if PY2 else (bytes, str)  # noqa: F821 undefined name
        # Strings are iterated over; everything else is used as given
        self._it = iter(w) if isinstance(w, stringtypes) else w
        # The item fetched by `peek()' but not yet consumed
        self._nch = None

    def __iter__(self):
        return self

    def __next__(self):
        # Hand out a formerly peeked item first
        pending, self._nch = self._nch, None
        if pending is not None:
            return pending
        item = next(self._it)
        if isinstance(item, int):
            return bytes((item,))
        return item

    if PY2:
        next = __next__

    def peek(self):
        """Return the upcoming character without consuming it.

        Return `None` if the iterator is exhausted.

        """
        if self._nch is None:
            upcoming = next(self._it, None)
            if isinstance(upcoming, int):
                upcoming = bytes((upcoming,))
            self._nch = upcoming
        return self._nch