comparison cutils/util/glob.py @ 297:141a3aa0b403

First version of converting a glob-style pattern to a regex
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 04 Mar 2025 01:52:18 +0100
parents ca293f708cb4
children 16a5c337fcb9
comparison
equal deleted inserted replaced
296:ca293f708cb4 297:141a3aa0b403
25 example. 25 example.
26 26
27 - The ``[ ]`` characters are a bracket expression that match a single 27 - The ``[ ]`` characters are a bracket expression that match a single
28 character of a name component out of a set of characters. For example, 28 character of a name component out of a set of characters. For example,
29 ``[abc]`` matches "``a``", "``b``", or "``c``". The hyphen (``-``) may 29 ``[abc]`` matches "``a``", "``b``", or "``c``". The hyphen (``-``) may
30 be used to specify a range so ``[a-z]`` specifies a range that matches 30 be used to specify a range so ``[a-z]`` specifies a range that matches
31 from "``a``" to "``z``" (inclusive). These forms can be mixed so 31 from "``a``" to "``z``" (inclusive). These forms can be mixed so
32 ``[abce-g]`` matches "``a``", "``b``", "``c``", "``e``", "``f``" or 32 ``[abce-g]`` matches "``a``", "``b``", "``c``", "``e``", "``f``" or
33 "``g``". 33 "``g``".
34 34
35 If the character after the ``[`` is a ``!`` then it is used for negation 35 If the character after the ``[`` is a ``!`` then it is used for negation
43 negating. 43 negating.
44 44
45 Also, the ``]`` character matches itself if it is the first character 45 Also, the ``]`` character matches itself if it is the first character
46 within the brackets, or the first character after the ``!`` if negating. 46 within the brackets, or the first character after the ``!`` if negating.
47 47
48 - The ``{ }`` characters are a group of subpatterns, where the group matches 48 - The curly brace characters ``{ }`` denote a group of subpatterns, where
49 if any subpattern in the group matches. 49 the group matches if any subpattern in the group matches.
50 50
51 The ``,`` character is used to separate the subpatterns. Groups cannot be 51 The ``,`` character is used to separate the subpatterns. Groups can be
52 nested. 52 nested.
53 53
54 - Leading period/dot characters in file name are treated as regular characters 54 - Leading period/dot characters in file name are treated as regular characters
55 in match operations. For example, the ``*`` glob pattern matches file name 55 in match operations. For example, the ``*`` glob pattern matches file name
56 ``.login``. 56 ``.login``.
63 63
64 64
65 __all__ = ["glob_to_regexp"] 65 __all__ = ["glob_to_regexp"]
66 66
67 67
68 import logging
69 import re
70
68 from . import PY2 71 from . import PY2
69 72
70 73
def glob_to_regexp(globobj):
    """Convert a glob string to a regular expression string.

    The resulting regexp is *not* rooted.

    Translation rules implemented here:

    - ``?`` becomes ``.``
    - ``*`` becomes ``[^/]*``
    - ``**`` becomes ``.*`` and ``**/`` becomes ``(?:.*/)?``
    - a backslash quotes the following character
    - ``[...]`` becomes a regex character class; a leading ``!`` negates
    - ``{a,b}`` becomes a non-capturing alternation; groups may be nested

    Malformed patterns do not raise.  A warning is logged and the input is
    handled leniently: a lone trailing backslash matches a backslash, a
    bracket expression without closing ``]`` is taken literally up to the
    end of the pattern, and unclosed ``{`` groups are closed implicitly.

    :param globobj: the pattern with glob syntax or an iterator over the
                    characters in such a pattern
    :type globobj: str or iterator over str
    :return: the source text of the equivalent regular expression
    :rtype: str

    """
    res = []
    grouplvl = 0        # support for nested pattern groups
    globiter = CharIter(globobj)
    for c in globiter:
        if c == '?':
            res.append(".")
        elif c == '*':
            if globiter.peek() == '*':
                # extended glob
                next(globiter)
                if globiter.peek() == '/':
                    next(globiter)
                    res.append("(?:.*/)?")
                else:
                    res.append(".*")
            else:
                res.append("[^/]*")
        elif c == '\\':
            try:
                res.append(re.escape(next(globiter)))
            except StopIteration:
                # XXX FIXME: or raise an exception with an invalid syntax
                logging.warning(
                    "lone trailing backslash in glob: %s", globobj)
                res.append("\\\\")
        elif c == '[':
            # `bres' collects the translated character class.  It is only
            # copied into `res' once the closing `]' has been seen.
            bres = []
            # `raw' collects the characters exactly as they were consumed.
            # It is needed for the error fallback below, which must quote
            # the original text and not the already translated fragments.
            raw = []
            if globiter.peek() == '!':  # XXX FIXME: handle '^' also? see below!
                next(globiter)
                raw.append("!")
                bres.append("^")
            if globiter.peek() == ']':
                # a `]' in first position is a literal member of the set
                next(globiter)
                raw.append("]")
                bres.append("\\]")
            elif globiter.peek() == '-':
                # a `-' in first position is a literal, not a range
                next(globiter)
                raw.append("-")
                bres.append("\\-")
            elif globiter.peek() == '^':
                #
                # XXX FIXME: as an extension: FreeBSD /bin/sh handles this
                #            like `!'. Should we follow it?
                #
                next(globiter)
                raw.append("^")
                if bres and bres[0] == '^':
                    # not in first position of the regex class: no quoting
                    bres.append("^")
                else:
                    bres.append("\\^")
            for c2 in globiter:
                if c2 == ']':
                    # normal and regular break
                    if bres and bres[-1] == '-':
                        # a trailing `-' is a literal: quote it
                        bres.insert(-1, "\\")
                    res.append("[")
                    res.extend(bres)
                    res.append("]")
                    break
                raw.append(c2)
                if c2 == '\\':
                    bres.append("\\\\")
                else:
                    bres.append(c2)   # no escaping needed
            else:
                # no trailing `]' char
                logging.warning(
                    "missing trailing bracket `]' in this glob: %s", globobj)
                #
                # FreeBSD's /bin/sh handles this like putting the given pattern
                # into single quotes -- effectively disabling any glob syntax.
                # We do this here also.
                #
                # Escape the raw input here: `bres' already contains regex
                # syntax (e.g. `^' for `!' or `\]' for `]') and escaping it
                # again would match a text different from the given one.
                #
                res.append("\\[")
                res.append(re.escape("".join(raw)))
        elif c == '{':
            grouplvl += 1
            res.append("(?:")
        elif grouplvl > 0 and c == '}':
            grouplvl -= 1
            res.append(")")
        elif grouplvl > 0 and c == ',':
            res.append("|")
        else:
            res.append(re.escape(c))
    if grouplvl > 0:
        if grouplvl > 1:
            logging.warning("missing braces `}' in this glob: %s", globobj)
        else:
            logging.warning("missing brace `}' in this glob: %s", globobj)
    while grouplvl > 0:
        # XXX FIXME: what about trailing `|' chars
        grouplvl -= 1
        res.append(")")
    return "".join(res)
73 174
74 175
75 class CharIter(object): 176 class CharIter(object):
76 177
77 """Iterator over byte or unicode strings with peek support. 178 """Iterator over byte or unicode strings with peek support.