changeset 297:141a3aa0b403

First version of converting a glob-style pattern to a regex
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 04 Mar 2025 01:52:18 +0100
parents ca293f708cb4
children 16a5c337fcb9
files cutils/util/glob.py tests/test_match.py
diffstat 2 files changed, 204 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/cutils/util/glob.py	Sun Mar 02 22:54:40 2025 +0100
+++ b/cutils/util/glob.py	Tue Mar 04 01:52:18 2025 +0100
@@ -27,7 +27,7 @@
 - The ``[ ]`` characters are a bracket expression that match a single
   character of a name component out of a set of characters. For example,
   ``[abc]`` matches "``a``", "``b``", or "``c``". The hyphen (``-``) may
-  be used to specify a range so ``[a-z]`` specifies a range that matches
+  be used to specify a range so ``[a-z]`` specifies a range that matches
   from "``a``" to "``z``" (inclusive). These forms can be mixed so
   ``[abce-g]`` matches "``a``", "``b``", "``c``", "``ey", "``f``" or
   "``g``".
@@ -45,10 +45,10 @@
   Also, the ``]`` character matches itself if it is the first character
   within the brackets, or the first character after the ``!`` if negating.
 
-- The ``{ }`` characters are a group of subpatterns, where the group matches
-  if any subpattern in the group matches.
+- The curly brace characters ``{ }`` denote a group of subpatterns, where
+  the group matches if any subpattern in the group matches.
 
-  The ``,`` character is used to separate the subpatterns. Groups cannot be
+  The ``,`` character is used to separate the subpatterns. Groups can be
   nested.
 
 - Leading period/dot characters in file name are treated as regular characters
@@ -65,11 +65,112 @@
 __all__ = ["glob_to_regexp"]
 
 
+import logging
+import re
+
 from . import PY2
 
 
-def glob_to_regexp(g):
-    pass
+def glob_to_regexp(globobj):
+    """Convert a glob string to a regular expression string.
+
+    The resulting regexp is *not* rooted.
+
+    :param globobj: the pattern with glob syntax or an iterator over the
+                    characters in such a pattern
+    :type globobj: str or iterator over str
+
+    """
+    res = []
+    grouplvl = 0         # support for nested pattern groups
+    globiter = CharIter(globobj)
+    for c in globiter:
+        if c == '?':
+            res.append(".")
+        elif c == '*':
+            if globiter.peek() == '*':
+                # extended glob
+                next(globiter)
+                if globiter.peek() == '/':
+                    next(globiter)
+                    res.append("(?:.*/)?")
+                else:
+                    res.append(".*")
+            else:
+                res.append("[^/]*")
+        elif c == '\\':
+            try:
+                res.append(re.escape(next(globiter)))
+            except StopIteration:
+                # XXX FIXME: or raise an exception because of invalid syntax?
+                logging.warning(
+                    "lone trailing backslash in glob: %s", globobj)
+                res.append("\\\\")
+        elif c == '[':
+            bres = []                 # need a temp store because of errors
+            if globiter.peek() == '!':  # XXX FIXME: handle '^' also? see below!
+                next(globiter)
+                bres.append("^")
+            if globiter.peek() == ']':
+                next(globiter)
+                bres.append("\\]")
+            elif globiter.peek() == '-':
+                next(globiter)
+                bres.append("\\-")
+            elif globiter.peek() == '^':
+                #
+                # XXX FIXME: as an extension: FreeBSD /bin/sh handles this
+                #            like `!'. Should we follow it?
+                #
+                next(globiter)
+                if len(bres) > 0 and bres[0] == '^':
+                    bres.append("^")
+                else:
+                    bres.append("\\^")
+            for c2 in globiter:
+                if c2 == ']':
+                    # normal and regular break
+                    if bres[-1] == '-':
+                        bres.insert(-1, "\\")
+                    res.append("[")
+                    res.extend(bres)
+                    res.append("]")
+                    break
+                if c2 == '\\':
+                    bres.append("\\\\")
+                else:
+                    bres.append(c2)    # no escaping needed
+            else:
+                # no trailing `]' char
+                logging.warning(
+                    "missing trailing bracket `]' in this glob: %s", globobj)
+                #
+                # FreeBSD's /bin/sh handles this like putting the given pattern
+                # into single quotes -- effectively disabling any glob syntax.
+                # We do this here also.
+                #
+                res.append("\\[")
+                res.append(re.escape("".join(bres)))
+        elif c == '{':
+            grouplvl += 1
+            res.append("(?:")
+        elif grouplvl > 0 and c == '}':
+            grouplvl -= 1
+            res.append(")")
+        elif grouplvl > 0 and c == ',':
+            res.append("|")
+        else:
+            res.append(re.escape(c))
+    if grouplvl > 0:
+        if grouplvl > 1:
+            logging.warning("missing braces `}' in this glob: %s", globobj)
+        else:
+            logging.warning("missing brace `}' in this glob: %s", globobj)
+        while grouplvl > 0:
+            # XXX FIXME: what about trailing `|' chars
+            grouplvl -= 1
+            res.append(")")
+    return "".join(res)
 
 
 class CharIter(object):
--- a/tests/test_match.py	Sun Mar 02 22:54:40 2025 +0100
+++ b/tests/test_match.py	Tue Mar 04 01:52:18 2025 +0100
@@ -10,7 +10,8 @@
 import sys
 import unittest
 
-from cutils.util.glob import CharIter
+from cutils.util import PY2
+from cutils.util.glob import CharIter, glob_to_regexp
 
 
 class TestCharIter(unittest.TestCase):
@@ -63,5 +64,100 @@
         self.assertIsNone(it.peek())
 
 
+class TestGlobToRegexp(unittest.TestCase):
+
+    def test_empty(self):
+        self.assertEqual("", glob_to_regexp(""))
+
+    def test_question_mark(self):
+        self.assertEqual(".", glob_to_regexp("?"))
+
+    def test_single_star(self):
+        self.assertEqual("[^/]*", glob_to_regexp("*"))
+
+    def test_double_star(self):
+        self.assertEqual(".*", glob_to_regexp("**"))
+
+    def test_double_star_slash(self):
+        self.assertEqual("(?:.*/)?", glob_to_regexp("**/"))
+
+    def test_double_star_in_between(self):
+        if PY2:
+            # Python 2's re.escape() escapes all non-alphanumeric characters
+            self.assertEqual("part1\\/(?:.*/)?part2",
+                             glob_to_regexp("part1/**/part2"))
+        else:
+
+            self.assertEqual("part1/(?:.*/)?part2",
+                             glob_to_regexp("part1/**/part2"))
+
+    def test_double_start_in_between2(self):
+        if PY2:
+            # Python 2's re.escape() escapes all non-alphanumeric characters
+            self.assertEqual("part1\\/.*\\.py", glob_to_regexp("part1/**.py"))
+        else:
+            self.assertEqual("part1/.*\\.py", glob_to_regexp("part1/**.py"))
+
+    def test_bracket_simple(self):
+        self.assertEqual("[abc]", glob_to_regexp("[abc]"))
+
+    def test_bracket_simple_range(self):
+        self.assertEqual("[a-c]", glob_to_regexp("[a-c]"))
+
+    def test_bracket_with_special_chars(self):
+        self.assertEqual("[x*?!^]", glob_to_regexp("[x*?!^]"))
+
+    def test_bracket_simple_range_with_escape(self):
+        self.assertEqual("[\\\\-c]", glob_to_regexp("[\\-c]"))
+
+    def test_bracket_not_closed(self):
+        self.assertEqual("\\[a", glob_to_regexp("[a"))
+
+    def test_bracket_not_closed_escapes(self):
+        self.assertEqual("\\[a\\*\\?", glob_to_regexp("[a*?"))
+
+    def test_bracket_with_dash_as_first_character(self):
+        self.assertEqual("[\\-a]", glob_to_regexp("[-a]"))
+
+    def test_bracket_with_dash_as_last_character(self):
+        self.assertEqual("[a\\-]", glob_to_regexp("[a-]"))
+
+    def test_bracket_with_closing_bracket(self):
+        self.assertEqual("[\\]a]", glob_to_regexp("[]a]"))
+
+    def test_bracket_with_caret_as_first_character(self):
+        self.assertEqual("[\\^a]", glob_to_regexp("[^a]"))
+
+    def test_bracket_negating_with_dash_as_first_character(self):
+        self.assertEqual("[^\\-a]", glob_to_regexp("[!-a]"))
+
+    def test_bracket_negating_with_dash_as_last_character(self):
+        self.assertEqual("[^a\\-]", glob_to_regexp("[!a-]"))
+
+    def test_bracket_negating_with_closing_bracket(self):
+        self.assertEqual("[^\\]a]", glob_to_regexp("[!]a]"))
+
+    def test_bracket_negating_with_caret_as_first_character(self):
+        self.assertEqual("[^^a]", glob_to_regexp("[!^a]"))
+
+    def test_simple_escapes(self):
+        for c in "\\()[]{}*.?":
+            self.assertEqual("\\"+c, glob_to_regexp("\\"+c))
+
+    def test_simple_escapes_last_backslash(self):
+        self.assertEqual("\\\\", glob_to_regexp("\\"))
+
+    def test_auto_escapes(self):
+        for c in "*.?":
+            self.assertEqual("\\"+c, glob_to_regexp("\\"+c))
+
+    def test_group_simple(self):
+        self.assertEqual("(?:abc|def)", glob_to_regexp("{abc,def}"))
+
+    def test_group_complex_nested(self):
+        self.assertEqual("(?:abc|(?:[ABQ-Z]|[^A][^/]*))",
+                         glob_to_regexp("{abc,{[ABQ-Z],[!A]*}}"))
+
+
 if __name__ == "__main__":
     sys.exit(unittest.main())