changeset 105:cec52d83869a

Handle many more characters from the Unicode character set in expressions. While there: FIX: Add forgotten Punctuation characters `?' and `@'. While there: Allow the escaping of single and double quotes that normally start a string (e.g. for expressions like f', where f' is the first derivative of f).
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 04 May 2026 16:30:36 +0200
parents ffe6ea2cf69b
children f6b46a379aba
files pygments_lexer_pseudocode2/algpseudocode.py pygments_lexer_pseudocode2/uniprops.py tests/test_algpseudo.py
diffstat 3 files changed, 294 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/pygments_lexer_pseudocode2/algpseudocode.py	Mon May 04 16:23:18 2026 +0200
+++ b/pygments_lexer_pseudocode2/algpseudocode.py	Mon May 04 16:30:36 2026 +0200
@@ -26,7 +26,7 @@
 #
 from pygments_lexer_pseudocode2.bases import LexBase
 from pygments_lexer_pseudocode2.utils import REVERSED_STANDARD_TYPES
-
+from pygments_lexer_pseudocode2 import uniprops
 
 #
 # As in the local imports: use an explicit name because __name__ is
@@ -127,6 +127,18 @@
         "TSTATE": SYMBOL_TEXTSTATEMENT,
         "TEXTBLOCK": SYMBOL_TEXTSTATEMENT,
         "TBLOCK": SYMBOL_TEXTSTATEMENT,
+        "<-": "←",
+        "->": "→",
+        "=>": "⇒",
+        "<=": "≤",
+        ">=": "≥",
+        "<>": "≠",
+        "!=": "≠",
+        ":=": "∶=",  # "≔"   not recognizable
+        "=:": "=∶",  # "≕",  not recognizable
+        "<=>": "⇔",
+        "<->": "↔",
+        "?=": "≟",
     }
 
     def op_translate(toktype):
@@ -296,6 +308,8 @@
              r")\b",
              bygroups(op_translate(Keyword))),
             include("expr"),
+            include("unicode-separators"),
+            include("unicode-other"),
             (r"[^\S\n]+", Text),
             (r".", Generic.Error),     # tolerance for errors
         ],
@@ -318,7 +332,10 @@
             (r"\\", op_opt_ignore_or_fixed(Name.Entity, "\\")),
         ],
         "expr": [
-            include("punctuation"),
+            include("math-symbols"),          # must be before punctuation
+            include("ascii-punctuation"),
+            include("unicode-punctuation"),
+            include("escaped-string-start"),
             include("py-strings"),
             include("py-numbers"),
             (r"(?i)\\text[ \t]*\{", LexBase.op_ignore, "text-in-expr"),
@@ -330,7 +347,10 @@
             include("py-name"),
         ],
         "expr-in-braces": [
-            include("punctuation-in-braces"),
+            include("math-symbols"),          # must be before punctuation
+            include("ascii-punctuation-in-braces"),
+            include("unicode-punctuation"),
+            include("escaped-string-start"),
             include("py-strings"),
             include("py-numbers"),
             (r"(?i)\\text[ \t]*\{", LexBase.op_ignore, "text-in-expr"),
@@ -347,6 +367,8 @@
             include("expr-in-braces"),
             (r"\\\\", LexBase.op_fixed(Text, "\\")),
             (r"\\", LexBase.op_fixed(Text, "\\")),
+            include("unicode-separators"),
+            include("unicode-other"),
             (r"[^\S\n]+", Text),
             (r".", Generic.Error),     # tolerance for errors
         ],
@@ -385,6 +407,11 @@
                    suffix=r"\b"),
              Name.Builtin),
         ],
+        "math-symbols": [
+            (r"<=>|<->|<-|->|=>|<=|>=|<>|!=|:=|=:|\?=", op_symbol(Operator)),
+            (r"[!&<>=+\-*/%|~]", Operator),         # ASCII
+            (u"[%s]" % (uniprops.Sm,), Operator),   # other Unicode
+        ],
         "word-operators": [
             (words(("IN", "In", "in",
                     "IS", "Is", "is",
@@ -405,13 +432,37 @@
                    suffix=r"\b"),
              Keyword.Constant),
         ],
-        "punctuation": [
-            (r"[{}:(),;[\]]", Punctuation),
+        "ascii-punctuation": [
+            (r"[{}:(),;[\]?@]", Punctuation),
+        ],
+        "ascii-punctuation-in-braces": [
+            #
+            # Like "ascii-punctuation" but needs an escaped curly brace for }
+            # because a single closing curly brace pops the current state here.
+            #
+            (r"\\\}", LexBase.op_fixed(Punctuation, "}")),
+            (r"[{:(),;[\]?@]", Punctuation),
+        ],
+        "unicode-separators": [
+            (u"[%s]" % (uniprops.Zl,), Whitespace),
+            (u"[%s]" % (uniprops.Zp,), Whitespace),
+            (u"[%s]" % (uniprops.Zs,), Whitespace),
         ],
-        "punctuation-in-braces": [
-            # like "punctuation" but needs an escaped curly brace for }
-            (r"\\\}", LexBase.op_fixed(Punctuation, "}")),
-            (r"[{:(),;[\]]", Punctuation),
+        "unicode-punctuation": [
+            (u"[%s]" % (uniprops.Pc,), Punctuation),
+            (u"[%s]" % (uniprops.Pd,), Punctuation),
+            (u"[%s]" % (uniprops.Ps,), Punctuation),
+            (u"[%s]" % (uniprops.Pe,), Punctuation),
+            (u"[%s]" % (uniprops.Pi,), Punctuation),
+            (u"[%s]" % (uniprops.Pf,), Punctuation),
+            (u"[%s]" % (uniprops.Po,), Punctuation),
+        ],
+        "unicode-other": [
+            (u"[%s]" % (uniprops.Sc,), Text),    # Currency
+            (u"[%s]" % (uniprops.So,), Text),    # Other symbols
+        ],
+        "escaped-string-start": [
+            (r"""\\(['"])""", bygroups(Punctuation)),
         ],
         "explicit-tokentype": [
             # All these REs are CASE-SENSITIVE!
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pygments_lexer_pseudocode2/uniprops.py	Mon May 04 16:30:36 2026 +0200
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# :-
+# SPDX-FileCopyrightText: © 2026 Franz Glasner
+# SPDX-License-Identifier: MIT
+# :-
+r"""A somewhat changed variant of :mod:`pygments.unistring`.
+
+We handle ASCII characters mostly ourselves.
+
+"""
+
+__all__ = []
+
+
+import pygments.unistring
+
+
+def _remove_ascii(s):
+    """Remove the characters in the ASCII range from `s` and return the
+    adjusted string.
+
+    Assumes that in `s` the ASCII chars are sorted before the Unicode
+    codepoints as in :mod:`pygments.unistring`.  An empty or all-ASCII
+    `s` yields an empty string instead of raising :exc:`IndexError`.
+
+    """
+    idx = 0
+    end = len(s)
+    # Bound the scan by the length: the unbounded loop ran off the end
+    # of `s` when it held no character outside the ASCII range.
+    while idx < end and ord(s[idx]) < 0x80:
+        idx += 1
+    return s[idx:]
+
+
+Pc = _remove_ascii(pygments.unistring.Pc)
+Pd = _remove_ascii(pygments.unistring.Pd)
+Pe = _remove_ascii(pygments.unistring.Pe)
+Ps = _remove_ascii(pygments.unistring.Ps)
+Pi = _remove_ascii(pygments.unistring.Pi)
+Pf = _remove_ascii(pygments.unistring.Pf)
+Po = _remove_ascii(pygments.unistring.Po)
+Sc = _remove_ascii(pygments.unistring.Sc)
+So = _remove_ascii(pygments.unistring.So)
+Sm = _remove_ascii(pygments.unistring.Sm)
+Zl = _remove_ascii(pygments.unistring.Zl)
+Zp = _remove_ascii(pygments.unistring.Zp)
+Zs = _remove_ascii(pygments.unistring.Zs)
--- a/tests/test_algpseudo.py	Mon May 04 16:23:18 2026 +0200
+++ b/tests/test_algpseudo.py	Mon May 04 16:30:36 2026 +0200
@@ -152,6 +152,15 @@
              ],
             pygments.lex("\\PROC {the name}", self.lexer))
 
+    def test_proc_with_symbols_in_name(self):
+        self.assertTokenStreamEqualComplete(
+            [("Keyword", "PROCEDURE"),
+             ("Text.Whitespace", " "),
+             ("Name.Entity", "the name sqrt ! <="),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex("\\PROC {the name sqrt ! <=}", self.lexer))
+
     def test_proc_with_escape_in_name(self):
         self.assertTokenStreamEqualComplete(
             [("Keyword", "PROCEDURE"),
@@ -191,6 +200,20 @@
              ],
             pygments.lex("\\END-PROC {the procedure name}", self.lexer))
 
+    def test_endproc_with_entityname_3(self):
+        self.assertTokenStreamEqualComplete(
+            [("Keyword", "END OF PROCEDURE"),
+             ("Text.Whitespace", " "),
+             ("Name.Entity", "the procedure name with pow and symbols ! <= "),
+             ("Name.Entity", "}"),
+             ("Name.Entity", "<-"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                "\\END-PROC"
+                " {the procedure name with pow and symbols ! <= \\}<-}",
+                self.lexer))
+
     def test_proc_de(self):
         lexer = pygments.lexers.load_lexer_from_file(
             ALGLEXERFILENAME, "AlgPseudocodeLexer_DE")
@@ -379,9 +402,11 @@
              ("Punctuation", ";"),
              ("Punctuation", "["),
              ("Punctuation", "]"),
+             ("Punctuation", "?"),
+             ("Punctuation", "@"),
              ("Text.Whitespace", "\n"),
              ],
-            pygments.lex(r"{}:(),;[]", self.lexer))
+            pygments.lex(r"{}:(),;[]?@", self.lexer))
 
     def test_block_empty(self):
         self.assertTokenStreamEqualComplete(
@@ -564,17 +589,16 @@
              ],
             pygments.lex("\\tt-o/\n\\tt-o// ", self.lexer))
 
-    @unittest.skipIf(sys.version_info[0] <= 2, "Unicode issue on Python 2")
     def test_explicit_tokentype_with_remark(self):
         self.assertTokenStreamEqualComplete(
-            [("Operator", "∈ ∌"),
+            [("Operator", u"∈ ∌"),
              ("Text", "    "),
              ("Comment.Single", "▷"),
-             ("Comment.Single", " ∈ ∌ as (ordinary) operators"),
+             ("Comment.Single", u" ∈ ∌ as (ordinary) operators"),
              ("Text.Whitespace", "\n"),
              ],
             pygments.lex(
-                r"""\ttx-o<∈ ∌>    \rem ∈ ∌ as (ordinary) operators""",
+                u"""\\ttx-o<∈ ∌>    \\rem ∈ ∌ as (ordinary) operators""",
                 self.lexer))
 
     def test_explicit_tokentype_with_remark_2(self):
@@ -589,10 +613,9 @@
                 r"""\ttx-o<new_operator>  \rem a (synthesized) operator""",
                 self.lexer))
 
-    @unittest.skipIf(sys.version_info[0] <= 2, "Unicode issue on Python 2")
     def test_explicit_tokentype_with_possibly_conflicting_parens(self):
         self.assertTokenStreamEqualComplete(
-            [("Name.Function", "∈_∌"),
+            [("Name.Function", u"∈_∌"),
              ("Punctuation", "("),
              ("Name.Entity", "p1"),
              ("Punctuation", ","),
@@ -602,7 +625,7 @@
              ("Text.Whitespace", "\n"),
              ],
             pygments.lex(
-                r"""\ttx-nf<∈_∌>(p1, p2)""",
+                u"""\\ttx-nf<∈_∌>(p1, p2)""",
                 self.lexer))
 
     def test_explicit_tokentype_with_possibly_conflicting_parens_2(self):
@@ -673,6 +696,161 @@
                 r"""  \end fn {The End of the Next Function} """,
                 self.lexer))
 
+    def test_unicode_math_operator(self):
+        self.assertTokenStreamEqualComplete(
+            [("Operator", u"∈"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(u"∈", self.lexer))
+
+    def test_ascii_math_operator(self):
+        self.assertTokenStreamEqualComplete(
+            [("Operator", "="),
+             ("Operator", "!"),
+             ("Operator", "&"),
+             ("Operator", "<"),
+             ("Text", " "),
+             ("Operator", ">"),
+             ("Operator", "+"),
+             ("Operator", "-"),
+             ("Operator", "*"),
+             ("Operator", "/"),
+             ("Operator", "%"),
+             ("Operator", "|"),
+             ("Operator", "~"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex("=!&< >+-*/%|~", self.lexer))
+
+    @unittest.skipIf(sys.version_info[0] <= 2, "Unicode issues on Python 2")
+    def test_ascii_math_operator_with_replacements(self):
+        self.assertTokenStreamEqualComplete(
+            [("Operator", u"⇔"),
+             ("Text", " "),
+             ("Operator", u"↔"),
+             ("Text", " "),
+             ("Operator", u"←"),
+             ("Text", " "),
+             ("Operator", u"→"),
+             ("Text", " "),
+             ("Operator", u"⇒"),
+             ("Text", " "),
+             ("Operator", u"≤"),
+             ("Text", " "),
+             ("Operator", u"≥"),
+             ("Text", " "),
+             ("Operator", u"≠"),
+             ("Text", " "),
+             ("Operator", u"≠"),
+             ("Text", " "),
+             ("Operator", u"∶="),   # u"≔"),
+             ("Text", " "),
+             ("Operator", u"=∶"),   # u"≕"),
+             ("Text", " "),
+             ("Operator", u"≟"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(u"<=> <-> <- -> => <= >= <> != := =: ?=", self.lexer))
+
+    def test_word_operators(self):
+        self.assertTokenStreamEqualComplete(
+            [("Operator.Word", "IN"),
+             ("Text", " "),
+             ("Operator.Word", "is"),
+             ("Text", " "),
+             ("Operator.Word", "And"),
+             ("Text", " "),
+             ("Operator.Word", "Or"),
+             ("Text", " "),
+             ("Operator.Word", "XOR"),
+             ("Text", " "),
+             ("Operator.Word", "not"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex("IN is And Or XOR not", self.lexer))
+
+    def test_keyword_constants(self):
+        self.assertTokenStreamEqualComplete(
+            [("Keyword.Constant", "true"),
+             ("Text", " "),
+             ("Keyword.Constant", "FALSE"),
+             ("Text", " "),
+             ("Keyword.Constant", "None"),
+             ("Text", " "),
+             ("Keyword.Constant", "nil"),
+             ("Text", " "),
+             ("Keyword.Constant", "NULL"),
+             ("Text", " "),
+             ("Keyword.Constant", "Empty"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex("true FALSE None nil NULL Empty", self.lexer))
+
+    def test_math_builtins(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Builtin", "sqrt"),
+             ("Punctuation", "("),
+             ("Name.Entity", "Foo"),
+             ("Punctuation", ")"),
+             ("Punctuation", ";"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex("sqrt(Foo);", self.lexer))
+
+    def test_math_builtins_nested(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Builtin", "pow"),
+             ("Punctuation", "("),
+             ("Number.Integer", "2"),
+             ("Punctuation", ","),
+             ("Text", " "),
+             ("Number.Integer", "8"),
+             ("Punctuation", ")"),
+             ("Punctuation", ";"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex("\\text{\\expr{pow(2, 8);}}", self.lexer))
+
+    def test_math_letters(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Entity", "a"),
+             ("Text", " "),
+             ("Operator.Word", "in"),
+             ("Text", " "),
+             ("Name.Entity", u"ℂ"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(u"a in ℂ", self.lexer))
+
+    def test_other_symbols(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Entity", "b"),
+             ("Text", " "),
+             ("Operator.Word", "in"),
+             ("Text", " "),
+             ("Text", u"℀"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(u"b in ℀", self.lexer))
+
+    def test_escaped_string_start_1(self):
+        self.assertTokenStreamEqualComplete(
+            [("Text", "flow "),
+             ("Name.Entity", "f"),
+             ("Punctuation", "'"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(r"\TEXT{flow \expr{f\'}}", self.lexer))
+
+    def test_escaped_string_start_2(self):
+        self.assertTokenStreamEqualComplete(
+            [("Text", "flow "),
+             ("Name.Entity", "f"),
+             ("Punctuation", '"'),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(r'\TEXT{flow \expr{f\"}}', self.lexer))
+
 
 class PygmentizeCompletely(unittest.TestCase):