# HG changeset patch # User Franz Glasner # Date 1779302137 -7200 # Node ID afbca50b7dc1205828f26dc6536d0910f622c8b1 # Parent 1683a10eabb2d27bc45bd2d2dd367676e3f4b767 Implement an alternate syntax for "Explicit Token Types". Uses a generic two-argument syntax and allows escaping of characters using the common excaping rules. For this to work the AlgPseudocodeLexer is now based on Pygment's ExtendedRegexLexer instead of RegexLexer. diff -r 1683a10eabb2 -r afbca50b7dc1 docs/lexer-algpseudocode.rst --- a/docs/lexer-algpseudocode.rst Wed May 20 20:32:42 2026 +0200 +++ b/docs/lexer-algpseudocode.rst Wed May 20 20:35:37 2026 +0200 @@ -457,6 +457,81 @@ They allow to handle keywords and operators that are not recognized by default. And they allow the user to explicitely highlight some input text at low-level. +.. note:: Explicit token types are **case-sensitive**. + +.. note:: Explicit token types work in all `expression` and `text` contexts. + +.. note:: Nested explicit token types are *not supported*. + + +Current and Recommended Syntax +------------------------------ + +The current and recommended use is the ``\ttX{ARG1}{ARG2}`` command. + +This command has two required parameters: + +#. The content of the first argument `ARG1` must be a `value` in the + :py:data:`pygments.token.STANDARD_TYPES` dict. + Its corresponding token type (the associated `key` in this dictionary) + will be used as token type for the token. + +#. The content of the second argument will be given the token type of + the first parameter. + + Standard `Escaping Rules`_ apply to this argument! + +Examples: + +.. 
code-block:: algpseudocode + + \text{• \\ttX{kc\}{C\}} \ttX{kc}{C} \rem C as Keyword.Constant + \text{• \\ttX{ow\}{∈\}} \ttX{ow}{∈} \rem ∈ as Operator.Word + \text{• \\ttX{kc\}{A Constant Keyword\}} \ttX{kc}{A Constant Keyword} \rem An explicit Keyword.Constant + \text{• \\ttX{nv\}{A Variable Name\}} \ttX{nv}{A Variable Name} \rem An explicit Name.Variable + \text{• \\ttX{ni\}{An Entity*Name\}} \ttX{ni}{An Entity*Name} \rem An explicit Name.Entity + \text{• \\ttX{k\}{∈ ∌\}} \ttX{k}{∈ ∌} \rem ∈ and ∌ as (ordinary) Keywords + \text{• \\ttX{o\}{∈ ∌\}} \ttX{o}{∈ ∌} \rem ∈ and ∌ as (ordinary) Operators + /* + * The line below has ∈_∌ as (peculiar) function name. + * Its params are automatic (i.e. a normal expression). + */ + \text{• \\ttX{nf\}{∈_∌\}(p1, p2)} \ttX{nf}{∈_∌}(p1, p2) + /* + * The line below has ∈_∌ as (peculiar) decorator name (as used in Python). + * Its params are automatic (i.e. a normal expression). + */ + \text{• \\ttX{nd\}{∈_∌\}(p1, p2)} \ttX{nd}{∈_∌}(p1, p2) + /* + * Normal emphasis ("strong") + */ + \text{• \\ttX{gs\}{this is strong\}} \ttX{gs}{this is strong} + /* + * A strong emphasis. + * Note that braces are the only valid delimiters! + */ + \text{• \\ttX{ges\}{A Strong Emphasis!\}} \ttX{ges}{A Strong Emphasis!} + /* + * Escaping is allowed and needed for the closing brace! + * The example token type is a "String". + */ + \text{• \\ttX{s\}{Escaping brace \\\} and backslash \\\\!\}} \ttX{s}{Escaping brace \} and backslash \\!} + /* + * This is a non-existing token type: you get some generic error markup + * with a Generic.Error token and no expansion. + */ + \text{• \\ttX{NON-EXISTING\}{∈_∌\}(p1, p2)} \ttX{NON_EXISTING}{∈_∌}(p1, p2) + + + +Old Syntax (Deprecated) +----------------------- + +.. deprecated:: 3.0 + Use `Current and Recommended Syntax`_ instead. + +.. note:: The lower-case ``x`` in ``\ttx-``! + `XX` represents a `value` in the :py:data:`pygments.token.STANDARD_TYPES` dict. 
Its corresponding token type (the associated `key` in this `dict`) is @@ -520,10 +595,6 @@ */ \text{• \\ttx-NON-EXISTING?∈_∌?(p1, p2)} \ttx-NON_EXISTING?∈_∌?(p1, p2) -.. note:: Explicit token types are **case-sensitive**. - -.. note:: Explicit token types work in all `expression` and `text` contexts. - .. _escaping-rules: @@ -540,7 +611,8 @@ argument content. A single backslash yields a :py:class:`pygments.token.Token.Generic.Error` -token when in `default` and `expression` states. +token when in `default` and `expression` states +(and also in `Explicit Token Types`_). Contrary---in `text` contexts a single backslash character that does not introduce a command yields a normal text token. diff -r 1683a10eabb2 -r afbca50b7dc1 pygments_lexer_pseudocode2/lexers/algpseudocode.py --- a/pygments_lexer_pseudocode2/lexers/algpseudocode.py Wed May 20 20:32:42 2026 +0200 +++ b/pygments_lexer_pseudocode2/lexers/algpseudocode.py Wed May 20 20:35:37 2026 +0200 @@ -235,6 +235,26 @@ if ctx: ctx.pos = match.end() + def op_explicit_tokentype_ex_start(lexer, match, ctx): + needed_css = match.group("type") + ctx.explicit_token_type = REVERSED_STANDARD_TYPES.get(needed_css, None) + if ctx.explicit_token_type is None: + # Be more error friendly + ctx.explicit_token_type = Generic.Error + _logger.warning("Unhandled explicit token type: %s", match.group()) + yield match.start(), ctx.explicit_token_type, match.group() + ctx.pos = match.end() + + def op_explicit_tokentype_ex_value(lexer, match, ctx): + yield match.start(), ctx.explicit_token_type, match.group(1) + ctx.pos = match.end() + + def op_explicit_tokentype_ex_end(lexer, match, ctx): + if ctx.explicit_token_type is Generic.Error: + yield match.start(), ctx.explicit_token_type, match.group() + ctx.pos = match.end() + ctx.explicit_token_type = None + tokens = { "root": [ (r"\n", Whitespace), @@ -494,6 +514,17 @@ "explicit-tokentype": [ # All these REs are CASE-SENSITIVE! 
+ # + # New extended (more flexible, allows escaping) + # + (r"""\\ttX[ \t]*\{(?P<type>[^}]+)\}[ \t]*\{""", + op_explicit_tokentype_ex_start, + "extended-explicit-tokentype"), + + # + # Old variants + # + # Multiple characters possible, but no escaping! (r"""\\ttx\-(?P[a-zA-Z0-9_-]+?)""" r"""(?P[/?.,:;%|=*+!\\$~"'#@_-])""" @@ -512,6 +543,13 @@ (r"\\tt-(?P[^/]+?)/(?P(?:.|\n))", op_explicit_tokentype), ], + "extended-explicit-tokentype": [ + (r"([^\\}]+)", op_explicit_tokentype_ex_value), + (r"\}", op_explicit_tokentype_ex_end, "#pop"), + (r"\\(\})", op_explicit_tokentype_ex_value), + (r"\\(\\)", op_explicit_tokentype_ex_value), + (r"\\", LexBase.op_fixed(Generic.Error, "\\")), # weak error + ], } def __init__(self, **options): diff -r 1683a10eabb2 -r afbca50b7dc1 pygments_lexer_pseudocode2/lexers/bases.py --- a/pygments_lexer_pseudocode2/lexers/bases.py Wed May 20 20:32:42 2026 +0200 +++ b/pygments_lexer_pseudocode2/lexers/bases.py Wed May 20 20:35:37 2026 +0200 @@ -11,7 +11,7 @@ import sys from pygments import unistring -from pygments.lexer import RegexLexer, combined, bygroups, include +from pygments.lexer import ExtendedRegexLexer, combined, bygroups, include from pygments.token import (Comment, Error, Name, Number, Other, String) @@ -73,7 +73,7 @@ # SPDX-SnippetEnd -class LexBase(RegexLexer): +class LexBase(ExtendedRegexLexer): """A base that defines some common lexer states. 
diff -r 1683a10eabb2 -r afbca50b7dc1 tests/test_algpseudo.py --- a/tests/test_algpseudo.py Wed May 20 20:32:42 2026 +0200 +++ b/tests/test_algpseudo.py Wed May 20 20:35:37 2026 +0200 @@ -750,6 +750,78 @@ r"""\ttx-non-existing[a_Decorator]""", self.lexer)) + def test_extended_explicit_tokentype_empty(self): + self.assertTokenStreamEqualComplete( + [("Text.Whitespace", "\n")], + pygments.lex( + r"""\ttX{nd}{}""", self.lexer)) + + def test_extended_explicit_tokentype_simple(self): + self.assertTokenStreamEqualComplete( + [("Name.Decorator", "simple\nline 2"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + """\\ttX{nd}{simple\nline 2}""", self.lexer)) + + def test_extended_explicit_tokentype_escaped_brace(self): + self.assertTokenStreamEqualComplete( + [("Name.Decorator", "simple"), + ("Name.Decorator", "}"), + ("Name.Decorator", "part 2"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + """\\ttX{nd}{simple\\}part 2}""", self.lexer)) + + def test_extended_explicit_tokentype_escaped_backslash(self): + self.assertTokenStreamEqualComplete( + [("Name.Decorator", "simple"), + ("Name.Decorator", "\\"), + ("Name.Decorator", "part 2"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + """\\ttX{nd}{simple\\\\part 2}""", self.lexer)) + + def test_extended_explicit_tokentype_single_backslash(self): + self.assertTokenStreamEqualComplete( + [("Name.Variable", "simple"), + ("Generic.Error", "\\"), + ("Name.Variable", "part 2"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + """\\ttX{nv}{simple\\part 2}""", self.lexer)) + + def test_extended_explicit_tokentype_non_existing_type(self): + self.assertTokenStreamEqualComplete( + [("Generic.Error", u"\\ttX{NON_EXISTING}{"), + ("Generic.Error", u"∈_∌"), + ("Generic.Error", u"}"), + ("Punctuation", "("), + ("Name.Entity", "p1"), + ("Punctuation", ","), + ("Text", " "), + ("Name.Entity", "p2"), + ("Punctuation", ")"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + u"\\ttX{NON_EXISTING}{∈_∌}(p1, p2)", self.lexer)) + + def 
test_just_braces_in_expressions(self): + self.assertTokenStreamEqualComplete( + [("Punctuation", "{"), + ("Name.Entity", "foo"), + ("Text", " "), + ("Name.Entity", "bar"), + ("Punctuation", "}"), + ("Text.Whitespace", "\n"), + ], + pygments.lex( + r"{foo bar}", self.lexer)) + def test_end_combinations(self): self.assertTokenStreamEqualComplete( [("Keyword", "BEGIN"),