diff cutils/util/glob.py @ 297:141a3aa0b403

First version of converting a glob-style pattern to a regex
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 04 Mar 2025 01:52:18 +0100
parents ca293f708cb4
children 16a5c337fcb9
line wrap: on
line diff
--- a/cutils/util/glob.py	Sun Mar 02 22:54:40 2025 +0100
+++ b/cutils/util/glob.py	Tue Mar 04 01:52:18 2025 +0100
@@ -27,7 +27,7 @@
 - The ``[ ]`` characters are a bracket expression that match a single
   character of a name component out of a set of characters. For example,
   ``[abc]`` matches "``a``", "``b``", or "``c``". The hyphen (``-``) may
-  be used to specify a range so ``[a-z]`` specifies a range that matches
+  be used to specify a range so ``[a-z]`` specifies a range that matches
   from "``a``" to "``z``" (inclusive). These forms can be mixed so
   ``[abce-g]`` matches "``a``", "``b``", "``c``", "``e``", "``f``" or
   "``g``".
@@ -45,10 +45,10 @@
   Also, the ``]`` character matches itself if it is the first character
   within the brackets, or the first character after the ``!`` if negating.
 
-- The ``{ }`` characters are a group of subpatterns, where the group matches
-  if any subpattern in the group matches.
+- The curly brace characters ``{ }`` denote a group of subpatterns, where
+  the group matches if any subpattern in the group matches.
 
-  The ``,`` character is used to separate the subpatterns. Groups cannot be
+  The ``,`` character is used to separate the subpatterns. Groups can be
   nested.
 
 - Leading period/dot characters in file name are treated as regular characters
@@ -65,11 +65,112 @@
 __all__ = ["glob_to_regexp"]
 
 
+import logging
+import re
+
 from . import PY2
 
 
-def glob_to_regexp(g):
-    pass
+def glob_to_regexp(globobj):
+    """Convert a glob string to a regular expression string.
+
+    The resulting regexp is *not* rooted.
+
+    Syntax problems in the pattern do not raise an exception. They are
+    reported with :func:`logging.warning` and handled leniently:
+
+    - a lone trailing backslash matches a literal backslash
+    - a bracket expression without a closing ``]`` is matched literally,
+      i.e. the opening ``[`` and all characters that follow it
+    - missing closing braces ``}`` are supplied at the end of the pattern
+
+    :param globobj: the pattern with glob syntax or an iterator over the
+                    characters in such a pattern
+    :type globobj: str or iterator over str
+    :return: the source of a regular expression equivalent to the pattern
+    :rtype: str
+
+    """
+    res = []
+    grouplvl = 0         # support for nested pattern groups
+    globiter = CharIter(globobj)
+    for c in globiter:
+        if c == '?':
+            res.append(".")
+        elif c == '*':
+            if globiter.peek() == '*':
+                # extended glob: `**' may cross directory boundaries
+                next(globiter)
+                if globiter.peek() == '/':
+                    # `**/' makes the whole directory prefix optional
+                    next(globiter)
+                    res.append("(?:.*/)?")
+                else:
+                    res.append(".*")
+            else:
+                # a single `*' never crosses a `/'
+                res.append("[^/]*")
+        elif c == '\\':
+            try:
+                res.append(re.escape(next(globiter)))
+            except StopIteration:
+                # XXX FIXME: or raise an exception with an invalid syntax
+                logging.warning(
+                    "lone trailing backslash in glob: %s", globobj)
+                res.append("\\\\")
+        elif c == '[':
+            #
+            # `bres' collects the regex fragments of the bracket expression;
+            # it is a temp store because the expression may turn out to be
+            # unterminated. `braw' collects the very same characters as they
+            # were given in the glob: the literal fallback below must escape
+            # the original text and not the already translated fragments.
+            #
+            bres = []
+            braw = []
+            if globiter.peek() == '!':  # XXX FIXME: handle '^' also? see below!
+                next(globiter)
+                bres.append("^")
+                braw.append("!")
+            if globiter.peek() == ']':
+                # a leading `]' is a literal member of the set
+                next(globiter)
+                bres.append("\\]")
+                braw.append("]")
+            elif globiter.peek() == '-':
+                # a leading `-' is a literal member of the set
+                next(globiter)
+                bres.append("\\-")
+                braw.append("-")
+            elif globiter.peek() == '^':
+                #
+                # XXX FIXME: as an extension: FreeBSD /bin/sh handles this
+                #            like `!'. Should we follow it?
+                #
+                next(globiter)
+                if bres and bres[0] == '^':
+                    # already negated: a second `^' is literal in a regex
+                    bres.append("^")
+                else:
+                    # would negate the regex set if not escaped
+                    bres.append("\\^")
+                braw.append("^")
+            for c2 in globiter:
+                if c2 == ']':
+                    # normal and regular break
+                    #
+                    # `bres' cannot be empty here: a `]' directly after the
+                    # opening bracket (or after `!') is consumed above.
+                    #
+                    if bres[-1] == '-':
+                        # a trailing `-' is literal and not a range
+                        bres.insert(-1, "\\")
+                    res.append("[")
+                    res.extend(bres)
+                    res.append("]")
+                    break
+                braw.append(c2)
+                if c2 == '\\':
+                    bres.append("\\\\")
+                else:
+                    bres.append(c2)    # no escaping needed
+            else:
+                # no trailing `]' char
+                logging.warning(
+                    "missing trailing bracket `]' in this glob: %s", globobj)
+                #
+                # FreeBSD's /bin/sh handles this like putting the given pattern
+                # into single quotes -- effectively disabling any glob syntax.
+                # We do this here also: match the original characters
+                # literally.
+                #
+                res.append("\\[")
+                res.append(re.escape("".join(braw)))
+        elif c == '{':
+            grouplvl += 1
+            res.append("(?:")
+        elif grouplvl > 0 and c == '}':
+            grouplvl -= 1
+            res.append(")")
+        elif grouplvl > 0 and c == ',':
+            res.append("|")
+        else:
+            # also reached for `}' and `,' outside of any group
+            res.append(re.escape(c))
+    if grouplvl > 0:
+        if grouplvl > 1:
+            logging.warning("missing braces `}' in this glob: %s", globobj)
+        else:
+            logging.warning("missing brace `}' in this glob: %s", globobj)
+        while grouplvl > 0:
+            # XXX FIXME: what about trailing `|' chars
+            grouplvl -= 1
+            res.append(")")
+    return "".join(res)
 
 
 class CharIter(object):