Mercurial > hgrepos > Python > apps > py-cutils
view cutils/util/glob.py @ 369:04d7945ff4ae
treesum: Comment
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Fri, 11 Apr 2025 17:12:00 +0200 |
| parents | 48430941c18c |
| children |
line wrap: on
line source
# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2020-2025 Franz Glasner
# SPDX-License-Identifier: BSD-3-Clause
# :-
r"""Glob handling.

.. seealso::
   - https://docs.oracle.com/javase/8/docs/api/java/nio/file/FileSystem.html#getPathMatcher-java.lang.String-
   - https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob
   - Mercurial: :command:`hg help patterns`

The following rules are used to interpret glob patterns:

- The ``*`` character matches zero or more characters of a name
  component without crossing directory boundaries.

- The ``**`` characters match zero or more characters crossing
  directory boundaries.

- ``**/`` matches zero or more subdirectories; files do not match.

- The ``?`` character matches exactly one character of a name component.

- The backslash character (``\``) is used to escape characters that
  would otherwise be interpreted as special characters.
  The expression ``\\`` matches a single backslash and ``\{`` matches a
  left brace for example.

- The ``[ ]`` characters are a bracket expression that matches a single
  character of a name component out of a set of characters.
  For example, ``[abc]`` matches "``a``", "``b``", or "``c``".
  The hyphen (``-``) may be used to specify a range so ``[a-z]``
  specifies a range that matches from "``a``" to "``z``" (inclusive).
  These forms can be mixed so ``[abce-g]`` matches "``a``", "``b``",
  "``c``", "``e``", "``f``" or "``g``".
  If the character after the ``[`` is a ``!`` then it is used for
  negation so ``[!a-c]`` matches any character except "``a``", "``b``",
  or "``c``".

  Within a bracket expression the ``*``, ``?`` and ``\`` characters
  match themselves.
  The ``-`` character matches itself if it is the first or last
  character within the brackets, or the first or last character after
  the ``!`` if negating.

  Also, the ``]`` character matches itself if it is the first character
  within the brackets, or the first character after the ``!`` if
  negating.

- The curly brace characters ``{ }`` denote a group of subpatterns,
  where the group matches if any subpattern in the group matches.
  The ``,`` character is used to separate the subpatterns.
  Groups can be nested.

- Leading period/dot characters in file name are treated as regular
  characters in match operations.
  For example, the ``*`` glob pattern matches file name ``.login``.

- All other characters match themselves.

"""

from __future__ import print_function, absolute_import


__all__ = ["glob_to_regexp"]


import logging
import re

from . import PY2


def glob_to_regexp(globobj):
    """Convert a glob string to a regular expression string.

    The resulting regexp is *not* rooted.

    Malformed patterns do not raise; a warning is logged instead and
    the pattern is repaired as follows:

    - a lone trailing backslash matches a literal backslash
    - a ``[`` without a closing ``]`` and everything that follows it
      is matched literally
    - missing closing braces ``}`` are added at the end

    :param globobj: the pattern with glob syntax or an iterator over the
                    characters in such a pattern
    :type globobj: str or iterator over str
    :return: the regular expression source
    :rtype: str

    """
    res = []
    grouplvl = 0       # support for nested pattern groups
    globiter = CharIter(globobj)
    for c in globiter:
        if c == '?':
            # NOTE(review): the regexp `.' also matches `/' -- confirm that
            # this is intended with regard to "of a name component".
            res.append(".")
        elif c == '*':
            if globiter.peek() == '*':
                # extended glob
                next(globiter)
                if globiter.peek() == '/':
                    next(globiter)
                    res.append("(?:.*/)?")
                else:
                    res.append(".*")
            else:
                res.append("[^/]*")
        elif c == '\\':
            try:
                res.append(re.escape(next(globiter)))
            except StopIteration:
                # XXX FIXME: or raise an exception with an invalid syntax
                logging.warning(
                    "lone trailing backslash in glob: %s", globobj)
                res.append("\\\\")
        elif c == '[':
            #
            # `bres' collects the regexp fragments of the set and `raw' the
            # very same characters exactly as they appear in the glob.
            # Both are temporary stores because the closing `]' may be
            # missing: then the *raw* characters must be emitted as literal
            # text. Escaping the already translated fragments instead would
            # turn e.g. the glob `[!a' into a match for `[^a'.
            #
            bres = []
            raw = []
            if globiter.peek() == '!':    # XXX FIXME: handle '^' also? see below!
                raw.append(next(globiter))
                bres.append("^")
            first = globiter.peek()
            if first == ']':
                # a leading `]' is a member of the set
                raw.append(next(globiter))
                bres.append("\\]")
            elif first == '-':
                # a leading `-' is a member of the set
                raw.append(next(globiter))
                bres.append("\\-")
            elif first == '^':
                #
                # XXX FIXME: as an extension: FreeBSD /bin/sh handles this
                #            like `!'. Should we follow it?
                #
                raw.append(next(globiter))
                if bres and bres[0] == '^':
                    # not at the start of the regexp set: is literal already
                    bres.append("^")
                else:
                    bres.append("\\^")
            for c2 in globiter:
                if c2 == ']':
                    # normal and regular break
                    # (`bres' is never empty here: a `]' directly after
                    # `[' or `[!' has been consumed above)
                    if bres[-1] == '-':
                        # a trailing `-' is a member of the set
                        bres.insert(-1, "\\")
                    res.append("[")
                    res.extend(bres)
                    res.append("]")
                    break
                raw.append(c2)
                if c2 == '\\':
                    bres.append("\\\\")
                else:
                    bres.append(c2)    # no escaping needed
            else:
                # no trailing `]' char
                logging.warning(
                    "missing trailing bracket `]' in this glob: %s", globobj)
                #
                # FreeBSD's /bin/sh handles this like putting the given pattern
                # into single quotes -- effectively disabling any glob syntax.
                # We do this here also.
                #
                res.append("\\[")
                res.append(re.escape("".join(raw)))
        elif c == '{':
            grouplvl += 1
            res.append("(?:")
        elif grouplvl > 0 and c == '}':
            grouplvl -= 1
            res.append(")")
        elif grouplvl > 0 and c == ',':
            res.append("|")
        else:
            res.append(re.escape(c))
    if grouplvl > 0:
        if grouplvl > 1:
            logging.warning("missing braces `}' in this glob: %s", globobj)
        else:
            logging.warning("missing brace `}' in this glob: %s", globobj)
        while grouplvl > 0:
            # XXX FIXME: what about trailing `|' chars
            grouplvl -= 1
            res.append(")")
    return "".join(res)


class CharIter(object):

    """Iterator over byte or unicode strings with peek support.

    On Python3 always yields an octet of :class:`bytes` instead of
    :class:`int`s if the iterator iterates over :class:`bytes`.

    :param w: a string, or an already existing iterator which is used
              as given

    """

    # _it:  the underlying iterator
    # _nch: the item fetched in advance by `peek()` or `None`
    __slots__ = ("_it", "_nch")

    def __init__(self, w):
        self._nch = None
        if PY2:
            string_types = (bytes, unicode)   # noqa: F821 undefined name
        else:
            string_types = (bytes, str)
        # Only strings are wrapped; everything else is taken as it is.
        self._it = iter(w) if isinstance(w, string_types) else w

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next character.

        A character that has been peeked before is handed out first.

        :raises StopIteration: if the underlying iterator is exhausted

        """
        if self._nch is not None:
            c = self._nch
            self._nch = None
            return c
        c = next(self._it)
        # iterating over `bytes` yields `int` on Python3
        return bytes((c,)) if isinstance(c, int) else c

    if PY2:
        next = __next__

    def peek(self):
        """Peek the next character.

        The character is not consumed: it is returned again by the next
        call to `peek()` or by the next iteration step.

        Return `None` if the iterator is exhausted.

        """
        if self._nch is not None:
            return self._nch
        self._nch = next(self._it, None)
        if isinstance(self._nch, int):
            self._nch = bytes((self._nch,))
        return self._nch
