view pygments_lexer_pseudocode2/bases.py @ 87:d8ca835c74ea

FIX: Erroneous parsing of \tt-XXX and \ttx-XXX: Need to restrict the tokentype group because otherwise it would match too much if some sort of braces are mixed on a single line.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 02 May 2026 10:07:59 +0200
parents cd79d2c76347
children e1663ac707b0
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2026 Franz Glasner
# SPDX-License-Identifier: MIT
# :-
r"""Some common bases for the lexers."""

# Names exported by ``from <this module> import *``.
__all__ = [
    "LexBase",
    "uni_name",
    "py_innerstring_rules",
    "py_name_rules",
]


import sys

from pygments import unistring
from pygments.lexer import RegexLexer, combined, bygroups, include
from pygments.token import (Comment, Error, Name, Number, Other, String)


# True when the running interpreter's major version is 2 or lower.
PY2 = sys.version_info < (3,)


#
# SPDX-SnippetBegin
# SPDX-License-Identifier: BSD-2-Clause
# SPDX-SnippetCopyrightText: Copyright 2006-2023 by the Pygments team
# SPDX-SnippetCopyrightText: Copyright 2026 by Franz Glasner
#

# Regex source matching one identifier: a single character out of
# `unistring.xid_start` followed by any number of characters out of
# `unistring.xid_continue`.
uni_name = "[%s][%s]*" % (unistring.xid_start, unistring.xid_continue)


# Version dependent replacement for the `staticmethod` decorator:
# PY3 allows no @staticmethod but PY2 needs it.
def _staticmethod(fn):
    """Return `fn` unchanged (the variant used when not on Python 2)."""
    return fn


if PY2:
    # On Python 2 the real builtin decorator is required.
    _staticmethod = staticmethod


def py_innerstring_rules(ttype):
    """Build the lexer rules for the inside of a Python-style string.

    Formatting placeholders are emitted as `String.Interpol`; all other
    matched text is emitted as `ttype`.

    Newlines are not matched by any of these rules: they are considered
    an error here and must be handled by the including state (the
    original note says: use "nl" state).

    :param ttype: the token type for ordinary string content
    :return: a new list of ``(regex, tokentype)`` rule tuples

    """
    # the old style '%s' % (...) string formatting (still valid in Py3)
    percent_placeholder = (
        r'%(\(\w+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?'
        '[hlL]?[E-GXc-giorsaux%]'
    )
    # the new style '{}'.format(...) string formatting
    brace_placeholder = (
        r'\{'
        r'((\w+)((\.\w+)|(\[[^\]]+\]))*)?'  # field name
        r'(\![sra])?'                       # conversion
        r'(\:(.?[<>=\^])?[-+ ]?#?0?(\d+)?,?(\.\d+)?[E-GXb-gnosx%]?)?'
        r'\}'
    )

    rules = [
        (percent_placeholder, String.Interpol),
        (brace_placeholder, String.Interpol),
    ]
    #
    # backslashes, quotes and formatting signs must be parsed
    # one at a time
    #
    rules.append((r'[^\\\'"%{\n]+', ttype))
    rules.append((r'[\'"\\]', ttype))
    # a formatting sign that was not consumed by a placeholder rule above
    rules.append((r'%|(\{{1,2})', ttype))
    return rules


def py_name_rules(ttype, deco_ttype=Name.Decorator):
    """Build the lexer rules for identifiers and decorator-like names.

    :param ttype: the token type for a plain identifier
    :param deco_ttype: the token type for an identifier that is directly
                       preceded by ``@`` (decorator syntax)
    :return: a new list with two ``(regex, tokentype)`` rule tuples,
             the decorator rule first

    """
    # We recognize decorator syntax here
    decorator_rule = (r'@' + uni_name, deco_ttype)
    identifier_rule = (uni_name, ttype)
    #
    # A lone ``@`` (Python's matrix multiplication operator) gets no rule
    # because it is not used here in pseudocode.
    #
    return [decorator_rule, identifier_rule]

# SPDX-SnippetEnd


class LexBase(RegexLexer):

    """A base that defines some common lexer states.

    It provides two helpers usable as rule callbacks (`op_ignore` and
    the factory `op_fixed`) and a `tokens` table with reusable states:
    the ``py-*`` states for Python-like numbers, strings and names and
    the state ``multiline-nested-comment`` for nestable ``/* ... */``
    comments.

    Default flags are not important.

    """

    def op_ignore(lexer, match, ctx=None):
        """Unconditionally ignore the match.

        No token is produced for the matched text.

        :param lexer: the calling lexer (not used here)
        :param match: the regex match object of the rule
        :param ctx: an optional context object; if given (and truthy) its
                    `pos` attribute is set to the end of the match

        """
        # The `yield` is never executed: it just makes this function
        # a generator.
        if False:
            yield match.start(), Other, ""
        if ctx:
            ctx.pos = match.end()

    # `_staticmethod` is the module-level helper: `staticmethod` on
    # Python 2 and a no-op decorator otherwise.
    @_staticmethod
    def op_fixed(toktype, value):
        """Unconditionally yield a given token type and value.

        :param toktype: the token type to emit
        :param value: the token text to emit instead of the matched text
        :return: a generator function with the signature
                 ``(lexer, match, ctx=None)`` that yields exactly one
                 token ``(match.start(), toktype, value)``

        """

        def _op_fixed(lexer, match, ctx=None):
            yield match.start(), toktype, value
            # Runs only when the generator is resumed after the token
            # has been consumed.
            if ctx:
                ctx.pos = match.end()

        return _op_fixed

    tokens = {
#
# These states are borrowed from Pygments' Python lexer.
# Their names have been prefixed with `py-'.
#
# SPDX-SnippetBegin
# SPDX-License-Identifier: BSD-2-Clause
# SPDX-SnippetCopyrightText: Copyright 2006-2023 by the Pygments team
# SPDX-SnippetCopyrightText: Copyright 2026 by Franz Glasner
#
        # Numeric literals; `_` is accepted between digits.
        # The float rules come first so that the plain integer rule
        # at the end does not match just the leading digits of a float.
        'py-numbers': [
            (r'(\d(?:_?\d)*\.(?:\d(?:_?\d)*)?|(?:\d(?:_?\d)*)?\.\d(?:_?\d)*)'
             r'([eE][+-]?\d(?:_?\d)*)?', Number.Float),
            (r'\d(?:_?\d)*[eE][+-]?\d(?:_?\d)*j?', Number.Float),
            (r'0[oO](?:_?[0-7])+', Number.Oct),
            (r'0[bB](?:_?[01])+', Number.Bin),
            (r'0[xX](?:_?[a-fA-F0-9])+', Number.Hex),
            (r'\d(?:_?\d)*', Number.Integer),
        ],
        # Start of a string literal: an optional/required prefix (emitted
        # as String.Affix) and the opening quote(s). The triple-quoted
        # variants are listed before the single-quoted ones.
        # Every rule enters a combined state made of an escape state and
        # the state for the body of that kind of string.
        'py-strings': [
            # non-raw strings
            ('([uU]?)(""")', bygroups(String.Affix, String.Double),
             combined('py-stringescape', 'py-tdqs')),
            ("([uU]?)(''')", bygroups(String.Affix, String.Single),
             combined('py-stringescape', 'py-tsqs')),
            ('([uU]?)(")', bygroups(String.Affix, String.Double),
             combined('py-stringescape', 'py-dqs')),
            ("([uU]?)(')", bygroups(String.Affix, String.Single),
             combined('py-stringescape', 'py-sqs')),
            # non-raw bytes
            ('([bB])(""")', bygroups(String.Affix, String.Double),
             combined('py-bytesescape', 'py-tdqs')),
            ("([bB])(''')", bygroups(String.Affix, String.Single),
             combined('py-bytesescape', 'py-tsqs')),
            ('([bB])(")', bygroups(String.Affix, String.Double),
             combined('py-bytesescape', 'py-dqs')),
            ("([bB])(')", bygroups(String.Affix, String.Single),
             combined('py-bytesescape', 'py-sqs')),
        ],
        # Escapes in text strings: \N{...}, \uXXXX and \UXXXXXXXX in
        # addition to all the escapes of 'py-bytesescape'.
        'py-stringescape': [
            (r'\\(N\{.*?\}|u[a-fA-F0-9]{4}|U[a-fA-F0-9]{8})', String.Escape),
            include('py-bytesescape')
        ],
        # Escapes common to text and bytes strings: single character
        # escapes, backslash-newline, \xHH and octal escapes.
        'py-bytesescape': [
            (r'\\([\\abfnrtv"\']|\n|x[a-fA-F0-9]{2}|[0-7]{1,3})',
             String.Escape)
        ],
        # Body of a "..." string: the closing quote leaves the state,
        # an unescaped newline is flagged as Error.
        'py-dqs': [
            (r'"', String.Double, '#pop'),
            (r'\\\\|\\"|\\\n', String.Escape),  # included here for raw strings
            include('py-strings-double'),
            (r'\n', Error),    # added by fag
        ],
        # Body of a '...' string: the closing quote leaves the state,
        # an unescaped newline is flagged as Error.
        'py-sqs': [
            (r"'", String.Single, '#pop'),
            (r"\\\\|\\'|\\\n", String.Escape),  # included here for raw strings
            include('py-strings-single'),
            (r'\n', Error),    # added by fag
        ],
        # Body of a """...""" string: newlines are part of the string.
        'py-tdqs': [
            (r'"""', String.Double, '#pop'),
            include('py-strings-double'),
            (r'\n', String.Double)
        ],
        # Body of a '''...''' string: newlines are part of the string.
        'py-tsqs': [
            (r"'''", String.Single, '#pop'),
            include('py-strings-single'),
            (r'\n', String.Single)
        ],
        # String content rules (formatting placeholders and ordinary text)
        # for both quote kinds
        'py-strings-single': py_innerstring_rules(String.Single),
        'py-strings-double': py_innerstring_rules(String.Double),
        # Identifiers are Name.Entity; `@name` gets the default decorator
        # token type of `py_name_rules`
        'py-name': py_name_rules(Name.Entity),
# SPDX-SnippetEnd
    # This snippet is from the Pygments' documentation "Write your own lexer"
    #
    # Nestable /* ... */ comments: every further `/*' pushes this state
    # again and every `*/' pops one level.
    # NOTE: The opening `/*' that enters this state is not matched here
    #       and must be provided by the rule that enters this state.
    'multiline-nested-comment': [
            (r'[^*/]+', Comment.Multiline),
            (r'/\*', Comment.Multiline, '#push'),
            (r'\*/', Comment.Multiline, '#pop'),
            # a lone `*' or `/' that is not part of a comment delimiter
            (r'[*/]', Comment.Multiline),
        ]
    }