changeset 285:afbca50b7dc1

Implement an alternate syntax for "Explicit Token Types". Uses a generic two-argument syntax and allows escaping of characters using the common escaping rules. For this to work the AlgPseudocodeLexer is now based on Pygments' ExtendedRegexLexer instead of RegexLexer.
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 20 May 2026 20:35:37 +0200
parents 1683a10eabb2
children 051c8877ee22
files docs/lexer-algpseudocode.rst pygments_lexer_pseudocode2/lexers/algpseudocode.py pygments_lexer_pseudocode2/lexers/bases.py tests/test_algpseudo.py
diffstat 4 files changed, 189 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/docs/lexer-algpseudocode.rst	Wed May 20 20:32:42 2026 +0200
+++ b/docs/lexer-algpseudocode.rst	Wed May 20 20:35:37 2026 +0200
@@ -457,6 +457,81 @@
 They allow to handle keywords and operators that are not recognized by default.
 And they allow the user to explicitely highlight some input text at low-level.
 
+.. note:: Explicit token types are **case-sensitive**.
+
+.. note:: Explicit token types work in all `expression` and `text` contexts.
+
+.. note:: Nested explicit token types are *not supported*.
+
+
+Current and Recommended Syntax
+------------------------------
+
+The current and recommended use is the ``\ttX{ARG1}{ARG2}`` command.
+
+This command has two required parameters:
+
+#. The content of the first argument `ARG1` must be a `value` in the
+   :py:data:`pygments.token.STANDARD_TYPES` dict.
+   Its corresponding token type (the associated `key` in this dictionary)
+   will be used as token type for the token.
+
+#. The content of the second argument will be given the token type
+   selected by the first argument.
+
+   Standard `Escaping Rules`_ apply to this argument!
+
+Examples:
+
+.. code-block:: algpseudocode
+
+   \text{• \\ttX{kc\}{C\}}      \ttX{kc}{C}         \rem C as Keyword.Constant
+   \text{• \\ttX{ow\}{∈\}}      \ttX{ow}{∈}         \rem ∈ as Operator.Word
+   \text{• \\ttX{kc\}{A Constant Keyword\}}  \ttX{kc}{A Constant Keyword}  \rem An explicit Keyword.Constant
+   \text{• \\ttX{nv\}{A Variable Name\}}     \ttX{nv}{A Variable Name}     \rem An explicit Name.Variable
+   \text{• \\ttX{ni\}{An Entity*Name\}}      \ttX{ni}{An Entity*Name}      \rem An explicit Name.Entity
+   \text{• \\ttX{k\}{∈ ∌\}}     \ttX{k}{∈ ∌}       \rem ∈ and ∌ as (ordinary) Keywords
+   \text{• \\ttX{o\}{∈ ∌\}}     \ttX{o}{∈ ∌}       \rem ∈ and ∌ as (ordinary) Operators
+     /*
+      * The line below has ∈_∌ as (peculiar) function name.
+      * Their params are automatic (i.e. a normal expression).
+      */
+   \text{• \\ttX{nf\}{∈_∌\}(p1, p2)}             \ttX{nf}{∈_∌}(p1, p2)
+     /*
+      * The line below has ∈_∌ as (peculiar) decorator name (as used in Python).
+      * Their params are automatic (i.e. a normal expression).
+      */
+   \text{• \\ttX{nd\}{∈_∌\}(p1, p2)}             \ttX{nd}{∈_∌}(p1, p2)
+     /*
+      * Normal emphasis ("strong")
+      */
+   \text{• \\ttX{gs\}{this is strong\}}          \ttX{gs}{this is strong}
+     /*
+      * A strong emphasis.
+      * Note that the braces are the only delimiters in this syntax!
+      */
+   \text{• \\ttX{ges\}{A Strong Emphasis!\}}     \ttX{ges}{A Strong Emphasis!}
+     /*
+      * Escaping is allowed and needed for the closing brace!
+      * The example token type is a "String".
+      */
+   \text{• \\ttX{s\}{Escaping brace \\\} and backslash \\\\!\}}   \ttX{s}{Escaping brace \} and backslash \\!}
+     /*
+      * This is a non-existing token type: you get some generic error markup
+      * with a Generic.Error token and no expansion.
+      */
+   \text{• \\ttX{NON_EXISTING\}{∈_∌\}(p1, p2)}   \ttX{NON_EXISTING}{∈_∌}(p1, p2)
+
+
+
+Old Syntax (Deprecated)
+-----------------------
+
+.. deprecated:: 3.0
+   Use `Current and Recommended Syntax`_ instead.
+
+.. note:: Mind the lower-case ``x`` in ``\ttx-``!
+
 `XX` represents a `value` in the :py:data:`pygments.token.STANDARD_TYPES`
 dict.
 Its corresponding token type (the associated `key` in this `dict`) is
@@ -520,10 +595,6 @@
       */
    \text{• \\ttx-NON-EXISTING?∈_∌?(p1, p2)}      \ttx-NON_EXISTING?∈_∌?(p1, p2)
 
-.. note:: Explicit token types are **case-sensitive**.
-
-.. note:: Explicit token types work in all `expression` and `text` contexts.
-
 
 .. _escaping-rules:
 
@@ -540,7 +611,8 @@
 argument content.
 
 A single backslash yields a :py:class:`pygments.token.Token.Generic.Error`
-token when in `default` and `expression` states.
+token when in `default` and `expression` states
+(and also in `Explicit Token Types`_).
 Contrary---in `text` contexts a single backslash character that does not
 introduce a command yields a normal text token.
 
--- a/pygments_lexer_pseudocode2/lexers/algpseudocode.py	Wed May 20 20:32:42 2026 +0200
+++ b/pygments_lexer_pseudocode2/lexers/algpseudocode.py	Wed May 20 20:35:37 2026 +0200
@@ -235,6 +235,26 @@
         if ctx:
             ctx.pos = match.end()
 
+    def op_explicit_tokentype_ex_start(lexer, match, ctx):
+        needed_css = match.group("type")
+        ctx.explicit_token_type = REVERSED_STANDARD_TYPES.get(needed_css, None)
+        if ctx.explicit_token_type is None:
+            # Be more error friendly
+            ctx.explicit_token_type = Generic.Error
+            _logger.warning("Unhandled explicit token type: %s", match.group())
+            yield match.start(), ctx.explicit_token_type, match.group()
+        ctx.pos = match.end()
+
+    def op_explicit_tokentype_ex_value(lexer, match, ctx):
+        yield match.start(), ctx.explicit_token_type, match.group(1)
+        ctx.pos = match.end()
+
+    def op_explicit_tokentype_ex_end(lexer, match, ctx):
+        if ctx.explicit_token_type is Generic.Error:
+            yield match.start(), ctx.explicit_token_type, match.group()
+        ctx.pos = match.end()
+        ctx.explicit_token_type = None
+
     tokens = {
         "root": [
             (r"\n", Whitespace),
@@ -494,6 +514,17 @@
         "explicit-tokentype": [
             # All these REs are CASE-SENSITIVE!
 
+            #
+            # New extended (more flexible, allows escaping)
+            #
+            (r"""\\ttX[ \t]*\{(?P<type>[^}]+)\}[ \t]*\{""",
+             op_explicit_tokentype_ex_start,
+             "extended-explicit-tokentype"),
+
+            #
+            # Old variants
+            #
+
             # Multiple characters possible, but no escaping!
             (r"""\\ttx\-(?P<type>[a-zA-Z0-9_-]+?)"""
              r"""(?P<sep>[/?.,:;%|=*+!\\$~"'#@_-])"""
@@ -512,6 +543,13 @@
             (r"\\tt-(?P<type>[^/]+?)/(?P<characters>(?:.|\n))",
              op_explicit_tokentype),
         ],
+        "extended-explicit-tokentype": [
+            (r"([^\\}]+)", op_explicit_tokentype_ex_value),
+            (r"\}", op_explicit_tokentype_ex_end, "#pop"),
+            (r"\\(\})", op_explicit_tokentype_ex_value),
+            (r"\\(\\)", op_explicit_tokentype_ex_value),
+            (r"\\", LexBase.op_fixed(Generic.Error, "\\")),   # weak error
+        ],
     }
 
     def __init__(self, **options):
--- a/pygments_lexer_pseudocode2/lexers/bases.py	Wed May 20 20:32:42 2026 +0200
+++ b/pygments_lexer_pseudocode2/lexers/bases.py	Wed May 20 20:35:37 2026 +0200
@@ -11,7 +11,7 @@
 import sys
 
 from pygments import unistring
-from pygments.lexer import RegexLexer, combined, bygroups, include
+from pygments.lexer import ExtendedRegexLexer, combined, bygroups, include
 from pygments.token import (Comment, Error, Name, Number, Other, String)
 
 
@@ -73,7 +73,7 @@
 # SPDX-SnippetEnd
 
 
-class LexBase(RegexLexer):
+class LexBase(ExtendedRegexLexer):
 
     """A base that defines some common lexer states.
 
--- a/tests/test_algpseudo.py	Wed May 20 20:32:42 2026 +0200
+++ b/tests/test_algpseudo.py	Wed May 20 20:35:37 2026 +0200
@@ -750,6 +750,78 @@
                 r"""\ttx-non-existing[a_Decorator]""",
                 self.lexer))
 
+    def test_extended_explicit_tokentype_empty(self):
+        self.assertTokenStreamEqualComplete(
+            [("Text.Whitespace", "\n")],
+            pygments.lex(
+                r"""\ttX{nd}{}""", self.lexer))
+
+    def test_extended_explicit_tokentype_simple(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Decorator", "simple\nline 2"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                """\\ttX{nd}{simple\nline 2}""", self.lexer))
+
+    def test_extended_explicit_tokentype_escaped_brace(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Decorator", "simple"),
+             ("Name.Decorator", "}"),
+             ("Name.Decorator", "part 2"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                """\\ttX{nd}{simple\\}part 2}""", self.lexer))
+
+    def test_extended_explicit_tokentype_escaped_backslash(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Decorator", "simple"),
+             ("Name.Decorator", "\\"),
+             ("Name.Decorator", "part 2"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                """\\ttX{nd}{simple\\\\part 2}""", self.lexer))
+
+    def test_extended_explicit_tokentype_single_backslash(self):
+        self.assertTokenStreamEqualComplete(
+            [("Name.Variable", "simple"),
+             ("Generic.Error", "\\"),
+             ("Name.Variable", "part 2"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                """\\ttX{nv}{simple\\part 2}""", self.lexer))
+
+    def test_extended_explicit_tokentype_non_existing_type(self):
+        self.assertTokenStreamEqualComplete(
+            [("Generic.Error", u"\\ttX{NON_EXISTING}{"),
+             ("Generic.Error", u"∈_∌"),
+             ("Generic.Error", u"}"),
+             ("Punctuation", "("),
+             ("Name.Entity", "p1"),
+             ("Punctuation", ","),
+             ("Text", " "),
+             ("Name.Entity", "p2"),
+             ("Punctuation", ")"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                u"\\ttX{NON_EXISTING}{∈_∌}(p1, p2)", self.lexer))
+
+    def test_just_braces_in_expressions(self):
+        self.assertTokenStreamEqualComplete(
+            [("Punctuation", "{"),
+             ("Name.Entity", "foo"),
+             ("Text", " "),
+             ("Name.Entity", "bar"),
+             ("Punctuation", "}"),
+             ("Text.Whitespace", "\n"),
+             ],
+            pygments.lex(
+                r"{foo bar}", self.lexer))
+
     def test_end_combinations(self):
         self.assertTokenStreamEqualComplete(
             [("Keyword", "BEGIN"),