view pygments_lexer_pseudocode2/algpseudocode.py @ 111:d6f3a1d1bedd

Some more keywords (\FROM, \TO, \IN)
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 05 May 2026 17:37:58 +0200
parents 49e5aa89095f
children ec8767cc5493
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2026 Franz Glasner
# SPDX-License-Identifier: MIT
# :-
r"""A pseudocode lexer along the lines of CTAN's algpseudocode or
algpseudocodex.

"""

# Public API: the default (English) lexer plus its German and French variants.
__all__ = ["AlgPseudocodeLexer",
           "AlgPseudocodeLexer_DE", "AlgPseudocodeLexer_FR"]


import logging
import re

import pygments.util
from pygments.lexer import bygroups, include, words
from pygments.token import (Comment, Generic, Keyword, Name, Operator,
                            Punctuation, Text, Whitespace)

#
# Relative imports do not work with pygments.lexers.load_lexer_from_file()
# in all of our supported Python releases.
#
from pygments_lexer_pseudocode2.bases import LexBase
from pygments_lexer_pseudocode2.utils import REVERSED_STANDARD_TYPES
from pygments_lexer_pseudocode2 import uniprops

#
# Same reasoning as for the absolute local imports above: the logger name
# is spelled out explicitly instead of using __name__, because __name__ is
# __builtins__ (i.e. not this module's dotted name) in that loading scenario.
#
_logger = logging.getLogger("pygments_lexer_pseudocode2.algpseudocode")


class AlgPseudocodeLexer(LexBase):

    """A pseudocode lexer along the lines of CTAN's algpseudocode or
    algpseudocodex.

    Some ideas (e.g. strings) are borrowed from Pygments' Python lexer.

    The lexer does not only classify its input, it also *rewrites* it:

    - keyword commands (``\\if``, ``\\while``, ``\\function{...}``, ...)
      are emitted with the text found in :attr:`TRANSLATIONS`,
    - end commands (``\\endif``, ``\\end function{...}``, ...) are emitted
      with the text found in :attr:`END_TRANSLATIONS` (or
      :attr:`DEFAULT_END_PREFIX` plus the upper-cased keyword as fallback),
    - statement/remark commands and ASCII operator digraphs are emitted
      with the replacement found in :attr:`SYMBOLS`.

    Subclasses provide other languages by overriding these tables.

    Lexer options:

    `no_end`
        Boolean (default ``False``).  If true the end keywords handled by
        the ``op_opt_*`` callbacks (including an optional entity name
        given in braces) produce no output at all.  A plain ``\\begin`` or
        ``\\end`` is never suppressed.

    """

    name = "AlgPseudocode"
    aliases = ["algpseudocode", "algpseudo"]
    filenames = ["*.algpseudo", "*.algpseudocode"]
    mimetypes = []
    flags = re.MULTILINE

    LANG = "en"
    # Upper-cased command name (without the backslash) -> emitted text
    TRANSLATIONS = {
        "PROG": "PROGRAM",
        "PROGRAM": "PROGRAM",
        "ALGO": "ALGORITHM",
        "ALGORITHM": "ALGORITHM",
        "PROC": "PROCEDURE",
        "PROCEDURE": "PROCEDURE",
        "FUNC": "FUNCTION",
        "FUNCTION": "FUNCTION",
        "FN": "FUNCTION",
        "CLASS": "CLASS",
        "INPUT": "Input:",
        "INPUTS": "Inputs:",
        "OUTPUT": "Output:",
        "OUTPUTS": "Outputs:",
        "RETURNS": "Returns:",
        "ENSURE": "Ensure:",
        "IS": "IS",
        "WITH": "WITH",
        "IF": "IF",
        "THEN": "THEN",
        "ELSE": "ELSE",
        "ELSEIF": "ELSE IF",
        "ELSIF": "ELSE IF",
        "ELIF": "ELSE IF",
        "DO": "DO",           # in WHILE ... DO
        "WHILE": "WHILE",
        "FOR": "FOR",
        "FORALL": "FOR ALL",
        "FROM": "FROM",
        "TO": "TO",
        "IN": "IN",           # as in FOR ... IN
        "STEP": "STEP",
        "LOOP": "LOOP",
        "REPEAT": "REPEAT",
        "UNTIL": "UNTIL",
        "RETURN": "RETURN",
        "BEGIN": "BEGIN",
        "END": "END",         # not in END_TRANSLATIONS
    }
    # Upper-cased keyword that follows "\end" -> emitted text
    END_TRANSLATIONS = {
        "PROG": "END OF PROGRAM",
        "PROGRAM": "END OF PROGRAM",
        "ALGO": "END OF ALGORITHM",
        "ALGORITHM": "END OF ALGORITHM",
        "PROC": "END OF PROCEDURE",
        "PROCEDURE": "END OF PROCEDURE",
        "FUNC": "END OF FUNCTION",
        "FUNCTION": "END OF FUNCTION",
        "FN": "END OF FUNCTION",
        "CLASS": "END OF CLASS",
        "IF": "END IF",
        "WHILE": "END WHILE",
        "FOR": "END FOR",
        "FORALL": "END FOR ALL",
        "LOOP": "END LOOP",
    }
    # Fallback for end keywords that are missing in END_TRANSLATIONS
    DEFAULT_END_PREFIX = "END OF "
    SYMBOL_REMARK = "▷"            # U+25B7: Unicode 1.0 (Geometric Shapes)
    # SYMBOL_REMARK = "▻"          # U+25BB: Unicode 1.0 (Geometric Shapes)
    SYMBOL_BLOCK = "◆"             # U+25C6: Unicode 1.0 (Geometric Shapes)
    # SYMBOL_BLOCK = "┃"           # U+2503: Unicode 1.0 (Box Drawing)
    # SYMBOL_BLOCK = "●"           # U+25CF: Unicode 1.0 (Geometric Shapes)
    SYMBOL_TEXTSTATEMENT = "▪"     # U+25AA: Unicode 1.0 (Geometric Shapes)
    # SYMBOL_TEXTSTATEMENT = "■"   # U+25A0: Unicode 1.0 (Geometric Shapes)
    # Upper-cased command name or ASCII operator -> emitted symbol
    SYMBOLS = {
        # Group REMARK
        "REMARK": SYMBOL_REMARK,
        "REM": SYMBOL_REMARK,
        # Group STATEMENT
        "STATEMENT": SYMBOL_BLOCK,
        "STATE": SYMBOL_BLOCK,
        "BLOCK": SYMBOL_BLOCK,
        # Group TEXTSTATEMENT
        "TEXTSTATEMENT": SYMBOL_TEXTSTATEMENT,
        "TEXTSTATE": SYMBOL_TEXTSTATEMENT,
        "TSTATEMENT": SYMBOL_TEXTSTATEMENT,
        "TSTATE": SYMBOL_TEXTSTATEMENT,
        "TEXTBLOCK": SYMBOL_TEXTSTATEMENT,
        "TBLOCK": SYMBOL_TEXTSTATEMENT,
        "<-": "←",
        "->": "→",
        "=>": "⇒",
        "<=": "≤",
        ">=": "≥",
        "<>": "≠",
        "!=": "≠",
        ":=": "∶=",  # "≔"   not recognizable
        "=:": "=∶",  # "≕",  not recognizable
        "<=>": "⇔",
        "<->": "↔",
        "?=": "≟",
    }

    #
    # Callback factories.
    #
    # These are plain functions that are called while the class body is
    # executed (see `tokens` below); they are not meant to be used as
    # instance methods.  Each returned callback has the signature
    # ``(lexer, match, ctx=None)``, yields ``(index, tokentype, value)``
    # tuples and -- if a context is given -- advances its position to the
    # end of the match.
    #

    def op_translate(toktype):
        """Return a callback that emits the upper-cased match translated
        by the lexer's `TRANSLATIONS` as a token of type `toktype`.

        Keywords without a translation are emitted upper-cased.

        """

        def _op_translate(lexer, match, ctx=None):
            kw = match.group().upper()
            yield match.start(), toktype, lexer.TRANSLATIONS.get(kw, kw)
            if ctx:
                ctx.pos = match.end()

        return _op_translate

    def op_opt_end_translate(toktype):
        """Return a callback that emits the upper-cased match translated
        by the lexer's `END_TRANSLATIONS` as a token of type `toktype`
        or -- if the lexer's `no_end` setting evals to ``True`` -- nothing.

        Keywords without a translation are emitted as the lexer's
        `DEFAULT_END_PREFIX` followed by the upper-cased keyword.

        """

        def _op_end_translate(lexer, match, ctx=None):
            if not lexer.no_end:
                kw = match.group().upper()
                yield (match.start(),
                       toktype,
                       lexer.END_TRANSLATIONS.get(
                           kw,
                           lexer.DEFAULT_END_PREFIX + kw))
            if ctx:
                ctx.pos = match.end()

        return _op_end_translate

    def op_opt_ignore(toktype):
        """Return a callback that emits the matched text unchanged as a
        token of type `toktype` or -- if the lexer's `no_end` setting
        evals to ``True`` -- nothing.

        """

        def _op_opt_ignore(lexer, match, ctx=None):
            if not lexer.no_end:
                yield match.start(), toktype, match.group()
            if ctx:
                ctx.pos = match.end()

        return _op_opt_ignore

    def op_opt_ignore_or_fixed(toktype, value):
        """Yield a fixed given token type and value or -- if the lexer's
        `no_end` setting evals to ``True`` nothing.

        """

        def _op_opt_ignore_or_fixed(lexer, match, ctx=None):
            if not lexer.no_end:
                yield match.start(), toktype, value
            if ctx:
                ctx.pos = match.end()

        return _op_opt_ignore_or_fixed

    def op_symbol(toktype):
        """Return a callback that emits the upper-cased match replaced by
        its entry in the lexer's `SYMBOLS` as a token of type `toktype`.

        Matches without an entry are emitted upper-cased.

        """

        def _op_symbol(lexer, match, ctx=None):
            kw = match.group().upper()
            yield match.start(), toktype, lexer.SYMBOLS.get(kw, kw)
            if ctx:
                ctx.pos = match.end()

        return _op_symbol

    def op_explicit_tokentype(lexer, match, ctx=None):
        """Callback for the explicit token type commands.

        The match needs the named groups ``type`` and ``character``.
        ``type`` is looked up in `REVERSED_STANDARD_TYPES`; on success the
        contents of ``character`` is emitted with the token type found.
        An unknown ``type`` is logged as a warning and the *complete*
        match is emitted as `Generic.Error`.

        """
        needed_css = match.group("type")
        toktype = REVERSED_STANDARD_TYPES.get(needed_css, None)
        if toktype is None:
            # Be more error friendly
            toktype = Generic.Error
            val = match.group()
            _logger.warning("Unhandled explicit token type: %s", val)
        else:
            val = match.group("character")
        yield match.start(), toktype, val
        if ctx:
            ctx.pos = match.end()

    #
    # NOTE(review): the states "py-strings", "py-numbers" and "py-name"
    # are included below but not defined here -- presumably they are
    # provided by LexBase; verify there.
    #
    tokens = {
        "root": [
            (r"\n", Whitespace),
            (r"/\*", Comment.Multiline, "multiline-nested-comment"),
            (r"(//|#).*$", Comment.Single),
            include("remark"),
            (r"(?i)\\(block|state(?:ment)?)[ \t]*(\{)",
             bygroups(op_symbol(Text), LexBase.op_fixed(Whitespace, " ")),
             "block-expr"),
            (r"(?i)\\("
             r"(?:textstate(?:ment)?)"
             r"|(?:tstate(?:ment)?)"
             r"|(?:textblock)"
             r"|(?:tblock)"
             r")[ \t]*(\{)",
             bygroups(op_symbol(Text), LexBase.op_fixed(Whitespace, " ")),
             "text-statement"),
            (r"(?i)\\("
             r"(?:input(?:s)?)"
             r"|(?:output(?:s)?)"
             r"|(?:ensure)"
             r"|(?:returns)"
             r")[ \t]*(\{)",
             bygroups(op_translate(Keyword),
                      LexBase.op_fixed(Whitespace, " ")),
             "text-statement"),
            (r"(?i)\\("
             r"(?:if)"
             r"|(?:then)"
             r"|(?:else)"
             r"|(?:el(?:s(?:e)?)?if)"
             r"|(?:do)"               # as in WHILE ... DO not DO ... UNTIL
             r"|(?:while)"
             r"|(?:forall)"
             r"|(?:for)"
             r"|(?:from)"
             r"|(?:to)"
             r"|(?:step)"
             r"|(?:in)"
             r"|(?:loop)"
             r"|(?:repeat)"
             r"|(?:until)"
             r"|(?:return)"
             r")\b",
             bygroups(op_translate(Keyword))),
            (r"\\\n", Text),
            (r"(?i)\\("
             r"(?:prog(?:ram)?)"
             r"|(?:algo(?:rithm)?)"
             r"|(?:proc(?:edure)?)"
             r"|(?:func(?:tion)?|(?:fn))"
             r"|(?:class)"
             r")[ \t]*(\{)",
             bygroups(op_translate(Keyword),
                      LexBase.op_fixed(Whitespace, " ")),
             "entity-name"),
            # ENDxxx keywords with optional entity name in two parts:
            #   1. with name
            (r"(?i)\\end(?:[_\-]|(?:[ \t]+))?("
             r"(?:prog(?:ram)?)"
             r"|(?:algo(?:rithm)?)"
             r"|(?:proc(?:edure)?)"
             r"|(?:func(?:tion)?)"
             r"|(?:fn)"
             r"|(?:class)"
             r")(?:[_\-]|(?:[\t ]+))?(\{)",
             bygroups(op_opt_end_translate(Keyword),
                      op_opt_ignore_or_fixed(Whitespace, " ")),
             "entity-name-end"),
            #   2. without name
            #   3. AND keywords that do not allow a param (e.g. endif)
            (r"(?i)\\end(?:[_\-]|(?:[ \t]+))?("
             r"(?:prog(?:ram)?)"
             r"|(?:algo(?:rithm)?)"
             r"|(?:proc(?:edure)?)"
             r"|(?:func(?:tion)?)"
             r"|(?:fn)"
             r"|(?:class)"
             r"|(?:if)"
             r"|(?:while)"
             r"|(?:for)"
             r"|(?:forall)"
             r"|(?:loop)"
             r")\b",
             bygroups(op_opt_end_translate(Keyword))),
            #
            # A single begin or end that is never suppressed because
            # it is supposed to be paired with begin
            #
            (r"(?i)\\(begin|end)\b",
             bygroups(op_translate(Keyword))),
            # Keywords
            (r"(?i)\\("
             r"(?:is)"
             r"|(?:with)"
             r")\b",
             bygroups(op_translate(Keyword))),
            include("expr"),
            include("unicode-separators"),
            include("unicode-other"),
            (r"[^\S\n]+", Text),
            (r".", Generic.Error),     # tolerance for errors
        ],
        "remark": [
            # The remark command is replaced by its symbol; the rest of
            # the line is the comment text.
            (r"(?i)\\(remark|rem)\b(.*)$",
             bygroups(op_symbol(Comment.Single), Comment.Single)),
        ],
        "entity-name": [      # may be multiline
            (r"[^\\}]+", Name.Entity),
            (r"\}", LexBase.op_ignore, "#pop"),
            (r"\\\}", LexBase.op_fixed(Name.Entity, "}")),
            (r"\\\\", LexBase.op_fixed(Name.Entity, "\\")),
            (r"\\", LexBase.op_fixed(Name.Entity, "\\")),
        ],
        "entity-name-end": [  # may be multiline -- suppressed if no_end
            (r"[^\\}]+", op_opt_ignore(Name.Entity)),
            (r"\}", LexBase.op_ignore, "#pop"),
            (r"\\\}", op_opt_ignore_or_fixed(Name.Entity, "}")),
            (r"\\\\", op_opt_ignore_or_fixed(Name.Entity, "\\")),
            (r"\\", op_opt_ignore_or_fixed(Name.Entity, "\\")),
        ],
        "expr": [
            include("math-symbols"),          # must be before punctuation
            include("ascii-punctuation"),
            include("unicode-punctuation"),
            include("escaped-string-start"),
            include("py-strings"),
            include("py-numbers"),
            (r"(?i)\\text[ \t]*\{", LexBase.op_ignore, "text-in-expr"),
            include("explicit-tokentype"),
            include("remark"),
            include("keyword-constants"),
            include("word-operators"),
            include("math-builtins"),
            include("py-name"),
        ],
        "expr-in-braces": [
            # Same as "expr" except for the handling of curly braces
            include("math-symbols"),          # must be before punctuation
            include("ascii-punctuation-in-braces"),
            include("unicode-punctuation"),
            include("escaped-string-start"),
            include("py-strings"),
            include("py-numbers"),
            (r"(?i)\\text[ \t]*\{", LexBase.op_ignore, "text-in-expr"),
            include("explicit-tokentype"),
            include("remark"),
            include("keyword-constants"),
            include("word-operators"),
            include("math-builtins"),
            include("py-name"),
        ],
        "block-expr": [      # somewhat similar to "root"
            (r"\}", LexBase.op_ignore, "#pop"),
            (r"\n", Whitespace),
            include("expr-in-braces"),
            (r"\\\\", LexBase.op_fixed(Text, "\\")),
            (r"\\", LexBase.op_fixed(Text, "\\")),
            include("unicode-separators"),
            include("unicode-other"),
            (r"[^\S\n]+", Text),
            (r".", Generic.Error),     # tolerance for errors
        ],
        "text-statement": [  # like block but default to text-mode
            (r"[^\\}\n]+", Text),
            (r"\}", LexBase.op_ignore, "#pop"),
            (r"\n", Whitespace),
            (r"\\\}", LexBase.op_fixed(Text, "}")),
            (r"(?i)\\expr(?:ession)?[ \t]*\{",
             LexBase.op_ignore,
             "block-expr"),
            include("explicit-tokentype"),
            include("remark"),
            (r"\\\\", LexBase.op_fixed(Text, "\\")),
            (r"\\", LexBase.op_fixed(Text, "\\")),
            (r".", Generic.Error),     # tolerance for errors
        ],
        "text-in-expr": [    # same rules as "text-statement"
            (r"[^\\}\n]+", Text),
            (r"\}", LexBase.op_ignore, "#pop"),
            (r"\n", Whitespace),
            (r"\\\}", LexBase.op_fixed(Text, "}")),
            (r"(?i)\\expr(?:ession)?[ \t]*\{",
             LexBase.op_ignore,
             "block-expr"),
            include("explicit-tokentype"),
            include("remark"),
            (r"\\\\", LexBase.op_fixed(Text, "\\")),
            (r"\\", LexBase.op_fixed(Text, "\\")),
            (r".", Generic.Error),     # tolerance for errors
        ],
        "math-builtins": [
            #
            # "arccos" is the spelling that is consistent with "arcsin"
            # and "arctan"; the formerly (only) recognized "arcos" is
            # kept for backward compatibility.
            #
            (words(("sqrt", "pow", "cos", "sin", "tan",
                    "arccos", "arcos", "arcsin",
                    "arctan", "arctan2", "mod", "exp", "ln", "log",
                    "min", "max"),
                   prefix=r"(?<!\.)",
                   suffix=r"\b"),
             Name.Builtin),
        ],
        "math-symbols": [
            # Multi-character operators first: they are replaced by symbols
            (r"<=>|<->|<-|->|=>|<=|>=|<>|!=|:=|=:|\?=", op_symbol(Operator)),
            (r"[!&<>=+\-*/%|~]", Operator),         # ASCII
            (u"[%s]" % (uniprops.Sm,), Operator),   # other Unicode
        ],
        "word-operators": [
            (words(("IN", "In", "in",
                    "IS", "Is", "is",
                    "AND", "And", "and",
                    "OR", "Or", "or",
                    "XOR", "Xor", "xor",
                    "NOT", "Not", "not"),
                   prefix=r"(?<!\.)",
                   suffix=r"\b"),
             Operator.Word),
        ],
        "keyword-constants": [
            (words(("True", "TRUE", "true", "False", "FALSE", "false",
                    "None", "NONE", "none", "Nil", "NIL", "nil",
                    "Null", "NULL", "null",
                    "Empty", "EMPTY", "empty"),
                   prefix=r"(?<!\.)",
                   suffix=r"\b"),
             Keyword.Constant),
        ],
        "ascii-punctuation": [
            (r"[{}:(),;[\]?@]", Punctuation),
        ],
        "ascii-punctuation-in-braces": [
            #
            # Like "ascii-punctuation" but needs an escaped curly brace
            # for } because a single closing curly brace pops the current
            # state here.
            #
            (r"\\\}", LexBase.op_fixed(Punctuation, "}")),
            (r"[{:(),;[\]?@]", Punctuation),
        ],
        "unicode-separators": [
            (u"[%s]" % (uniprops.Zl,), Whitespace),
            (u"[%s]" % (uniprops.Zp,), Whitespace),
            (u"[%s]" % (uniprops.Zs,), Whitespace),
        ],
        "unicode-punctuation": [
            (u"[%s]" % (uniprops.Pc,), Punctuation),
            (u"[%s]" % (uniprops.Pd,), Punctuation),
            (u"[%s]" % (uniprops.Ps,), Punctuation),
            (u"[%s]" % (uniprops.Pe,), Punctuation),
            (u"[%s]" % (uniprops.Pi,), Punctuation),
            (u"[%s]" % (uniprops.Pf,), Punctuation),
            (u"[%s]" % (uniprops.Po,), Punctuation),
        ],
        "unicode-other": [
            (u"[%s]" % (uniprops.Sc,), Text),    # Currency
            (u"[%s]" % (uniprops.So,), Text),    # Other symbols
        ],
        "escaped-string-start": [
            (r"""\\(['"])""", bygroups(Punctuation)),
        ],
        "explicit-tokentype": [
            # All these REs are CASE-SENSITIVE!

            # Multiple characters possible, but no escaping!
            (r"\\ttx\-(?P<type>[a-zA-Z0-9_-]+?)(?P<sep>[/:|=*+!\$~])"
             r"(?P<character>(.|\n)+?)(?P=sep)",
             op_explicit_tokentype),
            (r"\\ttx\-(?P<type>[a-zA-Z0-9_-]+?)\{(?P<character>[^}]+?)\}",
             op_explicit_tokentype),
            (r"\\ttx\-(?P<type>[a-zA-Z0-9_-]+?)\((?P<character>[^)]+?)\)",
             op_explicit_tokentype),
            (r"\\ttx\-(?P<type>[a-zA-Z0-9_-]+?)<(?P<character>[^>]+?)>",
             op_explicit_tokentype),
            (r"\\ttx\-(?P<type>[a-zA-Z0-9_-]+?)\[(?P<character>[^\]]+?)\]",
             op_explicit_tokentype),

            # Every character is possible: no escaping needed!
            (r"\\tt-(?P<type>[^/]+?)/(?P<character>(?:.|\n))",
             op_explicit_tokentype),
        ],
    }

    def __init__(self, **options):
        """Initialize the lexer.

        The boolean option `no_end` (default ``False``) is stored as
        ``self.no_end`` before all options are handed over to the base
        class.

        """
        val = pygments.util.get_bool_opt(options, "no_end", default=False)
        self.no_end = val
        LexBase.__init__(self, **options)


class AlgPseudocodeLexer_DE(AlgPseudocodeLexer):

    """German variant of :class:`AlgPseudocodeLexer`.

    Only the translation tables differ: the entity keywords (program,
    algorithm, procedure, function, class) and ``IS``/``WITH`` are emitted
    in German.  All other entries are inherited unchanged from the base
    class tables.

    """

    name = "AlgPseudocodeDE"
    aliases = ["algpseudocode-de", "algpseudo-de"]
    filenames = ["*.algpseudo-de", "*.algpseudocode-de"]

    LANG = "de"
    # Work on copies: the tables of the base class must not be modified
    TRANSLATIONS = AlgPseudocodeLexer.TRANSLATIONS.copy()
    TRANSLATIONS.update({
        "PROG": "PROGRAMM",
        "PROGRAM": "PROGRAMM",
        "ALGO": "ALGORITHMUS",
        "ALGORITHM": "ALGORITHMUS",
        "PROC": "PROZEDUR",
        "PROCEDURE": "PROZEDUR",
        "FUNC": "FUNKTION",
        "FUNCTION": "FUNKTION",
        "FN": "FUNKTION",
        "CLASS": "KLASSE",
        "IS": "IST",
        "WITH": "MIT",
    })
    END_TRANSLATIONS = AlgPseudocodeLexer.END_TRANSLATIONS.copy()
    END_TRANSLATIONS.update({
        # Aliases of the same keyword must yield the same text
        "PROG": "ENDE DES PROGRAMMS",
        "PROGRAM": "ENDE DES PROGRAMMS",
        "ALGO": "ENDE DES ALGORITHMUS",
        "ALGORITHM": "ENDE DES ALGORITHMUS",
        "PROC": "ENDE DER PROZEDUR",
        "PROCEDURE": "ENDE DER PROZEDUR",
        "FUNC": "ENDE DER FUNKTION",
        "FUNCTION": "ENDE DER FUNKTION",
        "FN": "ENDE DER FUNKTION",
        "CLASS": "ENDE DER KLASSE",
    })
    DEFAULT_END_PREFIX = "ENDE VON "


class AlgPseudocodeLexer_FR(AlgPseudocodeLexer):

    """French variant of :class:`AlgPseudocodeLexer`.

    Only the translation tables differ: the entity keywords (program,
    algorithm, procedure, function, class) and ``IS``/``WITH`` are emitted
    in French.  All other entries are inherited unchanged from the base
    class tables.

    """

    name = "AlgPseudocodeFR"
    aliases = ["algpseudocode-fr", "algpseudo-fr"]
    filenames = ["*.algpseudo-fr", "*.algpseudocode-fr"]

    LANG = "fr"
    # Work on copies: the tables of the base class must not be modified
    TRANSLATIONS = AlgPseudocodeLexer.TRANSLATIONS.copy()
    TRANSLATIONS.update({
        "PROG": "PROGRAMME",
        "PROGRAM": "PROGRAMME",
        "ALGO": "ALGORITHME",
        "ALGORITHM": "ALGORITHME",
        "PROC": "PROCÉDURE",
        "PROCEDURE": "PROCÉDURE",
        # Aliases of the same keyword must yield the same text
        "FUNC": "FONCTION",
        "FUNCTION": "FONCTION",
        "FN": "FONCTION",
        "CLASS": "CLASSE",
        "IS": "EST",
        "WITH": "AVEC",
    })
    END_TRANSLATIONS = AlgPseudocodeLexer.END_TRANSLATIONS.copy()
    END_TRANSLATIONS.update({
        "PROG": "FIN DE PROGRAMME",
        "PROGRAM": "FIN DE PROGRAMME",
        "ALGO": "FIN D'ALGORITHME",
        "ALGORITHM": "FIN D'ALGORITHME",
        "PROC": "FIN DE PROCÉDURE",
        "PROCEDURE": "FIN DE PROCÉDURE",
        "FUNC": "FIN DE FONCTION",
        "FUNCTION": "FIN DE FONCTION",
        "FN": "FIN DE FONCTION",
        "CLASS": "FIN DE CLASSE",
    })
    DEFAULT_END_PREFIX = "FIN DE "