view pygments_lexer_pseudocode2/bases.py @ 160:b4028838e0c8

Implement lexer option "prohibit_raiseonerror_filter". Sphinx raises by default when an Error token is seen (by means of the "raiseonerror" filter that is applied by default to lexers in Sphinx). This option skips this and allows error locations to be seen and highlighted properly. While there convert most Generic.Error tokens to Error tokens because now they can be handled by a lexer with "prohibit_raiseonerror_filter=True".
author Franz Glasner <fzglas.hg@dom66.de>
date Fri, 08 May 2026 17:46:28 +0200
parents e1663ac707b0
children
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2026 Franz Glasner
# SPDX-License-Identifier: MIT
# :-
r"""Some common bases for the lexers."""

__all__ = ["LexBase", "uni_name", "py_innerstring_rules", "py_name_rules"]


import sys

from pygments import unistring
from pygments.lexer import RegexLexer, combined, bygroups, include
from pygments.token import (Comment, Error, Name, Number, Other, String)


# True when the running interpreter has a major version of 2 or lower.
PY2 = sys.version_info < (3,)


#
# SPDX-SnippetBegin
# SPDX-License-Identifier: BSD-2-Clause
# SPDX-SnippetCopyrightText: Copyright 2006-2023 by the Pygments team
# SPDX-SnippetCopyrightText: Copyright 2026 by Franz Glasner
#

# Regex source for an identifier: one character out of `unistring.xid_start`
# followed by any number of characters out of `unistring.xid_continue`.
uni_name = "[" + unistring.xid_start + "][" + unistring.xid_continue + "]*"


"""PY3 allows no @staticmethod but PY2 needs it."""
# `_staticmethod` decorates helpers in class bodies that take no instance
# argument.  On Python 3 it is a no-op that hands the function back
# unchanged; on Python 2 it is the real `staticmethod`.
if not PY2:
    def _staticmethod(fn):
        """Return `fn` unchanged (identity decorator)."""
        return fn
else:
    _staticmethod = staticmethod


def py_innerstring_rules(ttype):
    """Build the lexer rules for the inside of a Python-like string.

    :param ttype: token type emitted for ordinary string content
    :return: a new list of ``(regex, tokentype)`` rules; interpolation
             markers are emitted as `String.Interpol`, everything else
             as `ttype`

    No rule matches a newline: newlines are an error here and must be
    handled by the including state (use "nl" state).

    """
    # The old style '%s' % (...) string formatting (still valid in Py3)
    percent_format = (
        r'%(\(\w+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?'
        '[hlL]?[E-GXc-giorsaux%]'
    )
    # The new style '{}'.format(...) string formatting
    brace_format = (
        r'\{'
        r'((\w+)((\.\w+)|(\[[^\]]+\]))*)?'  # field name
        r'(\![sra])?'                       # conversion
        r'(\:(.?[<>=\^])?[-+ ]?#?0?(\d+)?,?(\.\d+)?[E-GXb-gnosx%]?)?'
        r'\}'
    )
    # Backslashes, quotes and formatting signs must be parsed one at a
    # time, so plain text stops in front of each of them.
    plain_text = r'[^\\\'"%{\n]+'
    quote_or_backslash = r'[\'"\\]'
    # A formatting sign that did not start a valid format specification
    stray_format_sign = r'%|(\{{1,2})'

    return [
        (percent_format, String.Interpol),
        (brace_format, String.Interpol),
        (plain_text, ttype),
        (quote_or_backslash, ttype),
        (stray_format_sign, ttype),
    ]


def py_name_rules(ttype, deco_ttype=Name.Decorator):
    """Build the lexer rules for names and decorator-like names.

    :param ttype: token type emitted for a plain identifier
    :param deco_ttype: token type emitted for an identifier that is
                       directly preceded by ``@``
    :return: a new list with the decorator rule first, followed by the
             plain identifier rule

    A lone ``@`` (Python's matrix multiplication operator) is on purpose
    not handled here because it is not used in pseudocode.

    """
    # Decorator syntax is tried first
    decorator_rule = (r'@' + uni_name, deco_ttype)
    identifier_rule = (uni_name, ttype)
    return [decorator_rule, identifier_rule]

# SPDX-SnippetEnd


class LexBase(RegexLexer):

    """A base that defines some common lexer states.

    The `tokens` table provides number, string and name states adapted
    from the Pygments Python lexer (all prefixed with ``py-``) and two
    states for nested multi-line comments.  The rule callbacks
    `op_ignore` and `op_fixed` are provided as helpers for rules.

    Default flags are not important.

    """

    def op_ignore(lexer, match, ctx=None):
        """Unconditionally ignore the match.

        Rule callback that yields no token at all for the matched text.

        :param lexer: the lexer given to the callback (not used)
        :param match: the regex match object of the rule
        :param ctx: optional context object; if it is given (truthy) its
                    `pos` is set to the end of the match

        """
        # Never executed: the mere presence of `yield` turns this
        # function into a generator (one that produces nothing).
        if False:
            yield match.start(), Other, ""
        if ctx:
            ctx.pos = match.end()

    # `_staticmethod` is the module-level helper: `staticmethod` on
    # Python 2, an identity decorator otherwise.
    @_staticmethod
    def op_fixed(toktype, value):
        """Unconditionally yield a given token type and value.

        :param toktype: the token type to emit
        :param value: the token text to emit instead of the matched text
        :return: a rule callback that yields exactly one item
                 ``(match.start(), toktype, value)``

        """

        def _op_fixed(lexer, match, ctx=None):
            """Yield the fixed token; advance `ctx.pos` if `ctx` is given."""
            yield match.start(), toktype, value
            if ctx:
                ctx.pos = match.end()

        return _op_fixed

    # Lexer states shared with derived lexers.  Within a state the rules
    # are tried in the given order.
    tokens = {
#
# These states are borrowed from Pygments' Python lexer.
# Their names have been prefixed with `py-'.
#
# SPDX-SnippetBegin
# SPDX-License-Identifier: BSD-2-Clause
# SPDX-SnippetCopyrightText: Copyright 2006-2023 by the Pygments team
# SPDX-SnippetCopyrightText: Copyright 2026 by Franz Glasner
#
        # Numeric literals; `_` is accepted between digits.
        'py-numbers': [
            # floats with a decimal point and an optional exponent
            (r'(\d(?:_?\d)*\.(?:\d(?:_?\d)*)?|(?:\d(?:_?\d)*)?\.\d(?:_?\d)*)'
             r'([eE][+-]?\d(?:_?\d)*)?', Number.Float),
            # floats without a decimal point but with an exponent
            # (an optional trailing `j` is accepted)
            (r'\d(?:_?\d)*[eE][+-]?\d(?:_?\d)*j?', Number.Float),
            (r'0[oO](?:_?[0-7])+', Number.Oct),
            (r'0[bB](?:_?[01])+', Number.Bin),
            (r'0[xX](?:_?[a-fA-F0-9])+', Number.Hex),
            (r'\d(?:_?\d)*', Number.Integer),
        ],
        # String openers: an (optional) prefix is emitted as String.Affix,
        # then a combination of an escape state and a string body state
        # is entered.  Triple-quoted openers must come before the
        # single-quoted ones.
        'py-strings': [
            # non-raw strings
            ('([uU]?)(""")', bygroups(String.Affix, String.Double),
             combined('py-stringescape', 'py-tdqs')),
            ("([uU]?)(''')", bygroups(String.Affix, String.Single),
             combined('py-stringescape', 'py-tsqs')),
            ('([uU]?)(")', bygroups(String.Affix, String.Double),
             combined('py-stringescape', 'py-dqs')),
            ("([uU]?)(')", bygroups(String.Affix, String.Single),
             combined('py-stringescape', 'py-sqs')),
            # non-raw bytes
            ('([bB])(""")', bygroups(String.Affix, String.Double),
             combined('py-bytesescape', 'py-tdqs')),
            ("([bB])(''')", bygroups(String.Affix, String.Single),
             combined('py-bytesescape', 'py-tsqs')),
            ('([bB])(")', bygroups(String.Affix, String.Double),
             combined('py-bytesescape', 'py-dqs')),
            ("([bB])(')", bygroups(String.Affix, String.Single),
             combined('py-bytesescape', 'py-sqs')),
        ],
        # Escapes in text strings: \N{...}, \uXXXX, \UXXXXXXXX and all
        # the escapes that are valid in bytes.
        'py-stringescape': [
            (r'\\(N\{.*?\}|u[a-fA-F0-9]{4}|U[a-fA-F0-9]{8})', String.Escape),
            include('py-bytesescape')
        ],
        # Escapes in bytes: single character escapes, backslash-newline,
        # \xHH and octal escapes with one to three digits.
        'py-bytesescape': [
            (r'\\([\\abfnrtv"\']|\n|x[a-fA-F0-9]{2}|[0-7]{1,3})',
             String.Escape)
        ],
        # Body of a "..." string; left at the closing quote.
        'py-dqs': [
            (r'"', String.Double, '#pop'),
            (r'\\\\|\\"|\\\n', String.Escape),  # included here for raw strings
            include('py-strings-double'),
            # added by fag: a newline that is not part of an escape is
            # flagged as Error.
            # NOTE(review): this rule does not leave the string state --
            # confirm that this is intended.
            (r'\n', Error),    # added by fag
        ],
        # Body of a '...' string; left at the closing quote.
        'py-sqs': [
            (r"'", String.Single, '#pop'),
            (r"\\\\|\\'|\\\n", String.Escape),  # included here for raw strings
            include('py-strings-single'),
            # added by fag: a newline that is not part of an escape is
            # flagged as Error.
            # NOTE(review): this rule does not leave the string state --
            # confirm that this is intended.
            (r'\n', Error),    # added by fag
        ],
        # Body of a """...""" string; newlines are ordinary content.
        'py-tdqs': [
            (r'"""', String.Double, '#pop'),
            include('py-strings-double'),
            (r'\n', String.Double)
        ],
        # Body of a '''...''' string; newlines are ordinary content.
        'py-tsqs': [
            (r"'''", String.Single, '#pop'),
            include('py-strings-single'),
            (r'\n', String.Single)
        ],
        # String content rules (interpolation markers and plain text)
        'py-strings-single': py_innerstring_rules(String.Single),
        'py-strings-double': py_innerstring_rules(String.Double),
        # Identifiers are emitted as Name.Entity; identifiers with a
        # leading `@` get the default decorator token type.
        'py-name': py_name_rules(Name.Entity),
# SPDX-SnippetEnd
    # This snippet is from the Pygments' documentation "Write your own lexer"
    # Nested `/* ... */` comments: every opener pushes the state again,
    # every closer pops one level.
    'multiline-nested-comment': [
            (r'[^*/]+', Comment.Multiline),
            (r'/\*', Comment.Multiline, '#push'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[*/]', Comment.Multiline),
        ],
    # The same scheme for nested `(* ... *)` comments.
    'multiline-nested-comment-alt': [
            (r'[^*()]+', Comment.Multiline),
            (r'\(\*', Comment.Multiline, '#push'),
            (r'\*\)', Comment.Multiline, '#pop'),
            (r'[*()]', Comment.Multiline),
        ]
    }