# HG changeset patch # User Franz Glasner # Date 1741049538 -3600 # Node ID 141a3aa0b403849efbab7fca9962e4be0eaf43fe # Parent ca293f708cb4dcb65812242e79eae988a8b5d242 First version of converting a glob-style pattern to a regex diff -r ca293f708cb4 -r 141a3aa0b403 cutils/util/glob.py --- a/cutils/util/glob.py Sun Mar 02 22:54:40 2025 +0100 +++ b/cutils/util/glob.py Tue Mar 04 01:52:18 2025 +0100 @@ -27,7 +27,7 @@ - The ``[ ]`` characters are a bracket expression that match a single character of a name component out of a set of characters. For example, ``[abc]`` matches "``a``", "``b``", or "``c``". The hyphen (``-``) may - be used to specify a range so ``[a-z]`` specifies a range that matches + be used to specify a range so ``'[^/]*'[a-z]`` specifies a range that matches from "``a``" to "``z``" (inclusive). These forms can be mixed so ``[abce-g]`` matches "``a``", "``b``", "``c``", "``ey", "``f``" or "``g``". @@ -45,10 +45,10 @@ Also, the ``]`` character matches itself if it is the first character within the brackets, or the first character after the ``!`` if negating. -- The ``{ }`` characters are a group of subpatterns, where the group matches - if any subpattern in the group matches. +- The curly brace characters ``{ }`` denote a group of subpatterns, where + the group matches if any subpattern in the group matches. - The ``,`` character is used to separate the subpatterns. Groups cannot be + The ``,`` character is used to separate the subpatterns. Groups can be nested. - Leading period/dot characters in file name are treated as regular characters @@ -65,11 +65,112 @@ __all__ = ["glob_to_regexp"] +import logging +import re + from . import PY2 -def glob_to_regexp(g): - pass +def glob_to_regexp(globobj): + """Convert a glob string to a regular expression string. + + The resulting regexp is *not* rooted. + + :param globobj: the pattern with glob syntax or an iterator over the + characters in such a pattern + :type globobj: str or iterator over str + + """ + res = [] + grouplvl = 0 # support for nested pattern groups + globiter = CharIter(globobj) + for c in globiter: + if c == '?': + res.append(".") + elif c == '*': + if globiter.peek() == '*': + # extended glob + next(globiter) + if globiter.peek() == '/': + next(globiter) + res.append("(?:.*/)?") + else: + res.append(".*") + else: + res.append("[^/]*") + elif c == '\\': + try: + res.append(re.escape(next(globiter))) + except StopIteration: + # XXX FIXME: or raise an exception with an invalid syntax + logging.warning( + "lone trailing backslash in glob: %s", globobj) + res.append("\\\\") + elif c == '[': + bres = [] # need a temp store because of errors + if globiter.peek() == '!': # XXX FIXME: handle '^' also? see below! + next(globiter) + bres.append("^") + if globiter.peek() == ']': + next(globiter) + bres.append("\\]") + elif globiter.peek() == '-': + next(globiter) + bres.append("\\-") + elif globiter.peek() == '^': + # + # XXX FIXME: as an extension: FreeBSD /bin/sh handles this + # like `!'. Should we follow it? + # + next(globiter) + if len(bres) > 0 and bres[0] == '^': + bres.append("^") + else: + bres.append("\\^") + for c2 in globiter: + if c2 == ']': + # normal and regular break + if bres[-1] == '-': + bres.insert(-1, "\\") + res.append("[") + res.extend(bres) + res.append("]") + break + if c2 == '\\': + bres.append("\\\\") + else: + bres.append(c2) # no escaping needed + else: + # no trailing `]' char + logging.warning( + "missing trailing bracket `]' in this glob: %s", globobj) + # + # FreeBSD's /bin/sh handles this like putting the given pattern + # into single quotes -- effectively disabling any glob syntax. + # We do this here also. + # + res.append("\\[") + res.append(re.escape("".join(bres))) + elif c == '{': + grouplvl += 1 + res.append("(?:") + elif grouplvl > 0 and c == '}': + grouplvl -= 1 + res.append(")") + elif grouplvl > 0 and c == ',': + res.append("|") + else: + res.append(re.escape(c)) + if grouplvl > 0: + if grouplvl > 1: + logging.warning("missing braces `}' in this glob: %s", globobj) + else: + logging.warning("missing brace `}' in this glob: %s", globobj) + while grouplvl > 0: + # XXX FIXME: what about trailing `|' chars + grouplvl -= 1 + res.append(")") + return "".join(res) class CharIter(object): diff -r ca293f708cb4 -r 141a3aa0b403 tests/test_match.py --- a/tests/test_match.py Sun Mar 02 22:54:40 2025 +0100 +++ b/tests/test_match.py Tue Mar 04 01:52:18 2025 +0100 @@ -10,7 +10,8 @@ import sys import unittest -from cutils.util.glob import CharIter +from cutils.util import PY2 +from cutils.util.glob import CharIter, glob_to_regexp class TestCharIter(unittest.TestCase): @@ -63,5 +64,100 @@ self.assertIsNone(it.peek()) +class TestGlobToRegexp(unittest.TestCase): + + def test_empty(self): + self.assertEqual("", glob_to_regexp("")) + + def test_question_mark(self): + self.assertEqual(".", glob_to_regexp("?")) + + def test_single_star(self): + self.assertEqual("[^/]*", glob_to_regexp("*")) + + def test_double_star(self): + self.assertEqual(".*", glob_to_regexp("**")) + + def test_double_star_slash(self): + self.assertEqual("(?:.*/)?", glob_to_regexp("**/")) + + def test_double_star_in_between(self): + if PY2: + # Python 2 escapes all alnum characters in re.escape() + self.assertEqual("part1\\/(?:.*/)?part2", + glob_to_regexp("part1/**/part2")) + else: + + self.assertEqual("part1/(?:.*/)?part2", + glob_to_regexp("part1/**/part2")) + + def test_double_start_in_between2(self): + if PY2: + # Python 2 escapes all alnum characters in re.escape() + self.assertEqual("part1\\/.*\\.py", glob_to_regexp("part1/**.py")) + else: + self.assertEqual("part1/.*\\.py", glob_to_regexp("part1/**.py")) + + def test_bracket_simple(self): + self.assertEqual("[abc]", glob_to_regexp("[abc]")) + + def test_bracket_simple_range(self): + self.assertEqual("[a-c]", glob_to_regexp("[a-c]")) + + def test_bracket_with_special_chars(self): + self.assertEqual("[x*?!^]", glob_to_regexp("[x*?!^]")) + + def test_bracket_simple_range_with_escape(self): + self.assertEqual("[\\\\-c]", glob_to_regexp("[\\-c]")) + + def test_bracket_not_closed(self): + self.assertEqual("\\[a", glob_to_regexp("[a")) + + def test_bracket_not_closed_escapes(self): + self.assertEqual("\\[a\\*\\?", glob_to_regexp("[a*?")) + + def test_bracket_with_dash_as_first_character(self): + self.assertEqual("[\\-a]", glob_to_regexp("[-a]")) + + def test_bracket_with_dash_as_last_character(self): + self.assertEqual("[a\\-]", glob_to_regexp("[a-]")) + + def test_bracket_with_closing_bracket(self): + self.assertEqual("[\\]a]", glob_to_regexp("[]a]")) + + def test_bracket_with_caret_as_first_character(self): + self.assertEqual("[\\^a]", glob_to_regexp("[^a]")) + + def test_bracket_negating_with_dash_as_first_character(self): + self.assertEqual("[^\\-a]", glob_to_regexp("[!-a]")) + + def test_bracket_negating_with_dash_as_last_character(self): + self.assertEqual("[^a\\-]", glob_to_regexp("[!a-]")) + + def test_bracket_negating_with_closing_bracket(self): + self.assertEqual("[^\\]a]", glob_to_regexp("[!]a]")) + + def test_bracket_negating_with_caret_as_first_character(self): + self.assertEqual("[^^a]", glob_to_regexp("[!^a]")) + + def test_simple_escapes(self): + for c in "\\()[]{}*.?": + self.assertEqual("\\"+c, glob_to_regexp("\\"+c)) + + def test_simple_escapes_last_backslash(self): + self.assertEqual("\\\\", glob_to_regexp("\\")) + + def test_auto_escapes(self): + for c in "*.?": + self.assertEqual("\\"+c, glob_to_regexp("\\"+c)) + + def test_group_simple(self): + self.assertEqual("(?:abc|def)", glob_to_regexp("{abc,def}")) + + def test_group_complex_nested(self): + self.assertEqual("(?:abc|(?:[ABQ-Z]|[^A][^/]*))", + glob_to_regexp("{abc,{[ABQ-Z],[!A]*}}")) + + if __name__ == "__main__": sys.exit(unittest.main())