Mercurial > hgrepos > Python > apps > py-cutils
diff cutils/util/glob.py @ 297:141a3aa0b403
First version of converting a glob-style pattern to a regex
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Tue, 04 Mar 2025 01:52:18 +0100 |
| parents | ca293f708cb4 |
| children | 16a5c337fcb9 |
line wrap: on
line diff
__all__ = ["glob_to_regexp"]


import logging
import re

from . import PY2


def _bracket_to_regexp(globiter):
    r"""Translate a glob bracket expression whose ``[`` is already consumed.

    Consumes characters from `globiter` up to and including the closing
    ``]`` -- or up to the end of the input if there is no closing ``]``.

    :param globiter: the character iterator of the pattern; it must offer
                     a non-consuming ``peek()`` and be usable with
                     :func:`next` and in ``for`` loops
    :returns: a tuple ``(closed, text)``: if `closed` is true `text` is the
              complete regex character set including its brackets; if it
              is false `text` holds the *raw* (untranslated) glob
              characters that have been consumed after the ``[``
    :rtype: tuple(bool, str)

    """
    items = []      # regex fragments of the character set
    raw = []        # the glob characters exactly as they were consumed

    lead = globiter.peek()
    if lead == '!':     # XXX FIXME: handle '^' also? see below!
        raw.append(next(globiter))
        items.append("^")
        lead = globiter.peek()
    if lead == ']':
        # a `]' directly after `[' or `[!' matches itself
        raw.append(next(globiter))
        items.append("\\]")
    elif lead == '-':
        # a leading `-' is a literal hyphen and not a range operator
        raw.append(next(globiter))
        items.append("\\-")
    elif lead == '^':
        #
        # XXX FIXME: as an extension: FreeBSD /bin/sh handles this
        #            like `!'. Should we follow it?
        #
        raw.append(next(globiter))
        # After a negating `^' a second `^' is literal already; at the very
        # start of the set it must be escaped to not negate the set.
        items.append("^" if items else "\\^")

    for ch in globiter:
        if ch == ']':
            # normal and regular end: make a trailing `-' a literal
            if items and items[-1] == '-':
                items.insert(-1, "\\")
            return True, "[" + "".join(items) + "]"
        raw.append(ch)
        # A backslash is an ordinary character here; everything else is
        # taken over verbatim (so `-' between characters forms a range).
        items.append("\\\\" if ch == '\\' else ch)
    return False, "".join(raw)


def glob_to_regexp(globobj):
    r"""Convert a glob string to a regular expression string.

    The resulting regexp is *not* rooted.

    Translation rules as implemented here:

    - ``?`` becomes ``.``
    - ``*`` becomes ``[^/]*``, ``**`` becomes ``.*`` and ``**/`` becomes
      ``(?:.*/)?``
    - ``\`` makes the following character a literal; a lone trailing
      backslash is logged as warning and matches a literal backslash
    - ``[...]`` becomes a regex character set: a leading ``!`` negates;
      ``]``, ``-`` or ``^`` directly after ``[`` (or after ``[!``) match
      themselves; a ``-`` directly before the closing ``]`` matches
      itself; a backslash matches itself.  If the closing ``]`` is missing
      a warning is logged and the ``[`` together with all the following
      text is matched literally.
    - ``{``, ``,`` and ``}`` form a -- possibly nested -- group of
      alternatives ``(?:...|...)``; outside of a group ``,`` and ``}``
      are literals.  Unclosed groups are logged as warning and closed
      at the end of the pattern.
    - every other character is escaped with :func:`re.escape`

    :param globobj: the pattern with glob syntax or an iterator over the
                    characters in such a pattern
    :type globobj: str or iterator over str
    :returns: the regular expression source
    :rtype: str

    """
    res = []
    grouplvl = 0       # support for nested pattern groups
    # CharIter is defined elsewhere in this module; this code relies on its
    # `peek()` lookahead and on nested loops sharing one iteration state.
    globiter = CharIter(globobj)
    for c in globiter:
        if c == '?':
            # NOTE(review): unlike `*' this also matches `/' -- confirm
            # that this is intended.
            res.append(".")
        elif c == '*':
            if globiter.peek() == '*':
                # extended glob
                next(globiter)
                if globiter.peek() == '/':
                    next(globiter)
                    res.append("(?:.*/)?")
                else:
                    res.append(".*")
            else:
                res.append("[^/]*")
        elif c == '\\':
            try:
                res.append(re.escape(next(globiter)))
            except StopIteration:
                # XXX FIXME: or raise an exception with an invalid syntax
                logging.warning(
                    "lone trailing backslash in glob: %s", globobj)
                res.append("\\\\")
        elif c == '[':
            closed, text = _bracket_to_regexp(globiter)
            if closed:
                res.append(text)
            else:
                # no trailing `]' char
                logging.warning(
                    "missing trailing bracket `]' in this glob: %s", globobj)
                #
                # FreeBSD's /bin/sh handles this like putting the given
                # pattern into single quotes -- effectively disabling any
                # glob syntax.  We do this here also.
                #
                # `text' holds the raw glob characters here: escaping the
                # already translated regex fragments instead would yield a
                # pattern that matches e.g. `[^abc' for the glob `[!abc'.
                #
                res.append("\\[")
                res.append(re.escape(text))
        elif c == '{':
            grouplvl += 1
            res.append("(?:")
        elif grouplvl > 0 and c == '}':
            grouplvl -= 1
            res.append(")")
        elif grouplvl > 0 and c == ',':
            res.append("|")
        else:
            res.append(re.escape(c))
    if grouplvl > 0:
        if grouplvl > 1:
            logging.warning("missing braces `}' in this glob: %s", globobj)
        else:
            logging.warning("missing brace `}' in this glob: %s", globobj)
        # XXX FIXME: what about trailing `|' chars
        res.append(")" * grouplvl)
    return "".join(res)
