Mercurial > hgrepos > Python > libs > pygments-lexer-pseudocode2
changeset 105:cec52d83869a
Handle many more characters from the Unicode character set in expressions.
While there: FIX: Add forgotten Punctuation characters `?' and `@'.
While there: Allow escaping the single and double quotes that normally
start a string (e.g. for expressions like f', the first derivative of f).
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 04 May 2026 16:30:36 +0200 |
| parents | ffe6ea2cf69b |
| children | f6b46a379aba |
| files | pygments_lexer_pseudocode2/algpseudocode.py pygments_lexer_pseudocode2/uniprops.py tests/test_algpseudo.py |
| diffstat | 3 files changed, 294 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/pygments_lexer_pseudocode2/algpseudocode.py Mon May 04 16:23:18 2026 +0200 +++ b/pygments_lexer_pseudocode2/algpseudocode.py Mon May 04 16:30:36 2026 +0200 @@ -26,7 +26,7 @@ # from pygments_lexer_pseudocode2.bases import LexBase from pygments_lexer_pseudocode2.utils import REVERSED_STANDARD_TYPES - +from pygments_lexer_pseudocode2 import uniprops # # As in the local imports: use an explicit name because __name__ is @@ -127,6 +127,18 @@ "TSTATE": SYMBOL_TEXTSTATEMENT, "TEXTBLOCK": SYMBOL_TEXTSTATEMENT, "TBLOCK": SYMBOL_TEXTSTATEMENT, + "<-": "←", + "->": "→", + "=>": "⇒", + "<=": "≤", + ">=": "≥", + "<>": "≠", + "!=": "≠", + ":=": "∶=", # "≔" not recognizable + "=:": "=∶", # "≕", not recognizable + "<=>": "⇔", + "<->": "↔", + "?=": "≟", } def op_translate(toktype): @@ -296,6 +308,8 @@ r")\b", bygroups(op_translate(Keyword))), include("expr"), + include("unicode-separators"), + include("unicode-other"), (r"[^\S\n]+", Text), (r".", Generic.Error), # tolerance for errors ], @@ -318,7 +332,10 @@ (r"\\", op_opt_ignore_or_fixed(Name.Entity, "\\")), ], "expr": [ - include("punctuation"), + include("math-symbols"), # must be before punctuation + include("ascii-punctuation"), + include("unicode-punctuation"), + include("escaped-string-start"), include("py-strings"), include("py-numbers"), (r"(?i)\\text[ \t]*\{", LexBase.op_ignore, "text-in-expr"), @@ -330,7 +347,10 @@ include("py-name"), ], "expr-in-braces": [ - include("punctuation-in-braces"), + include("math-symbols"), # must be before punctuation + include("ascii-punctuation-in-braces"), + include("unicode-punctuation"), + include("escaped-string-start"), include("py-strings"), include("py-numbers"), (r"(?i)\\text[ \t]*\{", LexBase.op_ignore, "text-in-expr"), @@ -347,6 +367,8 @@ include("expr-in-braces"), (r"\\\\", LexBase.op_fixed(Text, "\\")), (r"\\", LexBase.op_fixed(Text, "\\")), + include("unicode-separators"), + include("unicode-other"), (r"[^\S\n]+", Text), (r".", Generic.Error), # tolerance for errors ], @@ 
-385,6 +407,11 @@ suffix=r"\b"), Name.Builtin), ], + "math-symbols": [ + (r"<=>|<->|<-|->|=>|<=|>=|<>|!=|:=|=:|\?=", op_symbol(Operator)), + (r"[!&<>=+\-*/%|~]", Operator), # ASCII + (u"[%s]" % (uniprops.Sm,), Operator), # other Unicode + ], "word-operators": [ (words(("IN", "In", "in", "IS", "Is", "is", @@ -405,13 +432,37 @@ suffix=r"\b"), Keyword.Constant), ], - "punctuation": [ - (r"[{}:(),;[\]]", Punctuation), + "ascii-punctuation": [ + (r"[{}:(),;[\]?@]", Punctuation), + ], + "ascii-punctuation-in-braces": [ + # + # Like "punctuation" but needs an escaped curly brace for } because + # a single closing curly brace pops the current state here. + # + (r"\\\}", LexBase.op_fixed(Punctuation, "}")), + (r"[{:(),;[\]?@]", Punctuation), + ], + "unicode-separators": [ + (u"[%s]" % (uniprops.Zl,), Whitespace), + (u"[%s]" % (uniprops.Zp,), Whitespace), + (u"[%s]" % (uniprops.Zs,), Whitespace), ], - "punctuation-in-braces": [ - # like "punctuation" but needs an escaped curly brace for } - (r"\\\}", LexBase.op_fixed(Punctuation, "}")), - (r"[{:(),;[\]]", Punctuation), + "unicode-punctuation": [ + (u"[%s]" % (uniprops.Pc,), Punctuation), + (u"[%s]" % (uniprops.Pd,), Punctuation), + (u"[%s]" % (uniprops.Ps,), Punctuation), + (u"[%s]" % (uniprops.Pe,), Punctuation), + (u"[%s]" % (uniprops.Pi,), Punctuation), + (u"[%s]" % (uniprops.Pf,), Punctuation), + (u"[%s]" % (uniprops.Po,), Punctuation), + ], + "unicode-other": [ + (u"[%s]" % (uniprops.Sc,), Text), # Currency + (u"[%s]" % (uniprops.So,), Text), # Other symbols + ], + "escaped-string-start": [ + (r"""\\(['"])""", bygroups(Punctuation)), ], "explicit-tokentype": [ # All these REs are CASE-SENSITIVE!
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pygments_lexer_pseudocode2/uniprops.py Mon May 04 16:30:36 2026 +0200 @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# :- +# SPDX-FileCopyrightText: © 2026 Franz Glasner +# SPDX-License-Identifier: MIT +# :- +r"""A somewhat changed variant of :mod:`pygments.unistring`. + +We handle ASCII characters mostly ourself. + +""" + +__all__ = [] + + +import pygments.unistring + + +def _remove_ascii(s): + """Remove the characters in the ASCII range from `s` and return the + adjusted string. + + Assumes that in `s` the ASCII chars are sorted before the Unicode + codepoints as in :mod:`pygments.unistring`. + + """ + idx = 0 + while ord(s[idx]) < 0x80: + idx += 1 + if idx > 0: + return s[idx:] + else: + # nothing changed + return s + + +Pc = _remove_ascii(pygments.unistring.Pc) +Pd = _remove_ascii(pygments.unistring.Pd) +Pe = _remove_ascii(pygments.unistring.Pe) +Ps = _remove_ascii(pygments.unistring.Ps) +Pi = _remove_ascii(pygments.unistring.Pi) +Pf = _remove_ascii(pygments.unistring.Pf) +Po = _remove_ascii(pygments.unistring.Po) +Sc = _remove_ascii(pygments.unistring.Sc) +So = _remove_ascii(pygments.unistring.So) +Sm = _remove_ascii(pygments.unistring.Sm) +Zl = _remove_ascii(pygments.unistring.Zl) +Zp = _remove_ascii(pygments.unistring.Zp) +Zs = _remove_ascii(pygments.unistring.Zs)
--- a/tests/test_algpseudo.py Mon May 04 16:23:18 2026 +0200 +++ b/tests/test_algpseudo.py Mon May 04 16:30:36 2026 +0200 @@ -152,6 +152,15 @@ ], pygments.lex("\\PROC {the name}", self.lexer)) + def test_proc_with_symbols_in_name(self): + self.assertTokenStreamEqualComplete( + [("Keyword", "PROCEDURE"), + ("Text.Whitespace", " "), + ("Name.Entity", "the name sqrt ! <="), + ("Text.Whitespace", "\n"), + ], + pygments.lex("\\PROC {the name sqrt ! <=}", self.lexer)) + def test_proc_with_escape_in_name(self): self.assertTokenStreamEqualComplete( [("Keyword", "PROCEDURE"), @@ -191,6 +200,20 @@ ], pygments.lex("\\END-PROC {the procedure name}", self.lexer)) + def test_endproc_with_entityname_3(self): + self.assertTokenStreamEqualComplete( + [("Keyword", "END OF PROCEDURE"), + ("Text.Whitespace", " "), + ("Name.Entity", "the procedure name with pow and symbols ! <= "), + ("Name.Entity", "}"), + ("Name.Entity", "<-"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + "\\END-PROC" + " {the procedure name with pow and symbols ! 
<= \\}<-}", + self.lexer)) + def test_proc_de(self): lexer = pygments.lexers.load_lexer_from_file( ALGLEXERFILENAME, "AlgPseudocodeLexer_DE") @@ -379,9 +402,11 @@ ("Punctuation", ";"), ("Punctuation", "["), ("Punctuation", "]"), + ("Punctuation", "?"), + ("Punctuation", "@"), ("Text.Whitespace", "\n"), ], - pygments.lex(r"{}:(),;[]", self.lexer)) + pygments.lex(r"{}:(),;[]?@", self.lexer)) def test_block_empty(self): self.assertTokenStreamEqualComplete( @@ -564,17 +589,16 @@ ], pygments.lex("\\tt-o/\n\\tt-o// ", self.lexer)) - @unittest.skipIf(sys.version_info[0] <= 2, "Unicode issue on Python 2") def test_explicit_tokentype_with_remark(self): self.assertTokenStreamEqualComplete( - [("Operator", "∈ ∌"), + [("Operator", u"∈ ∌"), ("Text", " "), ("Comment.Single", "▷"), - ("Comment.Single", " ∈ ∌ as (ordinary) operators"), + ("Comment.Single", u" ∈ ∌ as (ordinary) operators"), ("Text.Whitespace", "\n"), ], pygments.lex( - r"""\ttx-o<∈ ∌> \rem ∈ ∌ as (ordinary) operators""", + u"""\\ttx-o<∈ ∌> \\rem ∈ ∌ as (ordinary) operators""", self.lexer)) def test_explicit_tokentype_with_remark_2(self): @@ -589,10 +613,9 @@ r"""\ttx-o<new_operator> \rem a (synthesized) operator""", self.lexer)) - @unittest.skipIf(sys.version_info[0] <= 2, "Unicode issue on Python 2") def test_explicit_tokentype_with_possibly_conflicting_parens(self): self.assertTokenStreamEqualComplete( - [("Name.Function", "∈_∌"), + [("Name.Function", u"∈_∌"), ("Punctuation", "("), ("Name.Entity", "p1"), ("Punctuation", ","), @@ -602,7 +625,7 @@ ("Text.Whitespace", "\n"), ], pygments.lex( - r"""\ttx-nf<∈_∌>(p1, p2)""", + u"""\\ttx-nf<∈_∌>(p1, p2)""", self.lexer)) def test_explicit_tokentype_with_possibly_conflicting_parens_2(self): @@ -673,6 +696,161 @@ r""" \end fn {The End of the Next Function} """, self.lexer)) + def test_unicode_math_operator(self): + self.assertTokenStreamEqualComplete( + [("Operator", u"∈"), + ("Text.Whitespace", "\n"), + ], + pygments.lex(u"∈", self.lexer)) + + def 
test_ascii_math_operator(self): + self.assertTokenStreamEqualComplete( + [("Operator", "="), + ("Operator", "!"), + ("Operator", "&"), + ("Operator", "<"), + ("Text", " "), + ("Operator", ">"), + ("Operator", "+"), + ("Operator", "-"), + ("Operator", "*"), + ("Operator", "/"), + ("Operator", "%"), + ("Operator", "|"), + ("Operator", "~"), + ("Text.Whitespace", "\n"), + ], + pygments.lex("=!&< >+-*/%|~", self.lexer)) + + @unittest.skipIf(sys.version_info[0] <= 2, "Unicode issues on Python 2") + def test_ascii_math_operator_with_replacements(self): + self.assertTokenStreamEqualComplete( + [("Operator", u"⇔"), + ("Text", " "), + ("Operator", u"↔"), + ("Text", " "), + ("Operator", u"←"), + ("Text", " "), + ("Operator", u"→"), + ("Text", " "), + ("Operator", u"⇒"), + ("Text", " "), + ("Operator", u"≤"), + ("Text", " "), + ("Operator", u"≥"), + ("Text", " "), + ("Operator", u"≠"), + ("Text", " "), + ("Operator", u"≠"), + ("Text", " "), + ("Operator", u"∶="), # u"≔"), + ("Text", " "), + ("Operator", u"=∶"), # u"≕"), + ("Text", " "), + ("Operator", u"≟"), + ("Text.Whitespace", "\n"), + ], + pygments.lex(u"<=> <-> <- -> => <= >= <> != := =: ?=", self.lexer)) + + def test_word_operators(self): + self.assertTokenStreamEqualComplete( + [("Operator.Word", "IN"), + ("Text", " "), + ("Operator.Word", "is"), + ("Text", " "), + ("Operator.Word", "And"), + ("Text", " "), + ("Operator.Word", "Or"), + ("Text", " "), + ("Operator.Word", "XOR"), + ("Text", " "), + ("Operator.Word", "not"), + ("Text.Whitespace", "\n"), + ], + pygments.lex("IN is And Or XOR not", self.lexer)) + + def test_keyword_constants(self): + self.assertTokenStreamEqualComplete( + [("Keyword.Constant", "true"), + ("Text", " "), + ("Keyword.Constant", "FALSE"), + ("Text", " "), + ("Keyword.Constant", "None"), + ("Text", " "), + ("Keyword.Constant", "nil"), + ("Text", " "), + ("Keyword.Constant", "NULL"), + ("Text", " "), + ("Keyword.Constant", "Empty"), + ("Text.Whitespace", "\n"), + ], + pygments.lex("true FALSE 
None nil NULL Empty", self.lexer)) + + def test_math_builtins(self): + self.assertTokenStreamEqualComplete( + [("Name.Builtin", "sqrt"), + ("Punctuation", "("), + ("Name.Entity", "Foo"), + ("Punctuation", ")"), + ("Punctuation", ";"), + ("Text.Whitespace", "\n"), + ], + pygments.lex("sqrt(Foo);", self.lexer)) + + def test_math_builtins_nested(self): + self.assertTokenStreamEqualComplete( + [("Name.Builtin", "pow"), + ("Punctuation", "("), + ("Number.Integer", "2"), + ("Punctuation", ","), + ("Text", " "), + ("Number.Integer", "8"), + ("Punctuation", ")"), + ("Punctuation", ";"), + ("Text.Whitespace", "\n"), + ], + pygments.lex("\\text{\\expr{pow(2, 8);}}", self.lexer)) + + def test_math_letters(self): + self.assertTokenStreamEqualComplete( + [("Name.Entity", "a"), + ("Text", " "), + ("Operator.Word", "in"), + ("Text", " "), + ("Name.Entity", u"ℂ"), + ("Text.Whitespace", "\n"), + ], + pygments.lex(u"a in ℂ", self.lexer)) + + def test_other_symbols(self): + self.assertTokenStreamEqualComplete( + [("Name.Entity", "b"), + ("Text", " "), + ("Operator.Word", "in"), + ("Text", " "), + ("Text", u"℀"), + ("Text.Whitespace", "\n"), + ], + pygments.lex(u"b in ℀", self.lexer)) + + def test_escaped_string_start_1(self): + self.assertTokenStreamEqualComplete( + [("Text", "flow "), + ("Name.Entity", "f"), + ("Punctuation", "'"), + ("Text.Whitespace", "\n"), + ], + pygments.lex(r"\TEXT{flow \expr{f\'}}", self.lexer)) + + def test_escaped_string_start_2(self): + self.assertTokenStreamEqualComplete( + [("Text", "flow "), + ("Name.Entity", "f"), + ("Punctuation", '"'), + ("Text.Whitespace", "\n"), + ], + pygments.lex(r'\TEXT{flow \expr{f\"}}', self.lexer)) + class PygmentizeCompletely(unittest.TestCase):
