view pygments_lexer_pseudocode2/bases.py @ 123:4d96ace53ba1

Make it work on Python2 too with all tests by explicitly declaring some strings to be Unicode strings. No tests need to be skipped on Python2 now.
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 06 May 2026 15:53:24 +0200
parents e1663ac707b0
children
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# SPDX-FileCopyrightText: © 2026 Franz Glasner
# SPDX-License-Identifier: MIT
# :-
r"""Some common bases for the lexers."""

# Names exported by ``from ... import *``: the lexer base class, the
# identifier regex and the two rule-list factories defined below.
__all__ = ["LexBase", "uni_name", "py_innerstring_rules", "py_name_rules"]


import sys

from pygments import unistring
from pygments.lexer import RegexLexer, combined, bygroups, include
from pygments.token import (Comment, Error, Name, Number, Other, String)


# True when running on Python 2 (or older), False on Python 3 and later.
PY2 = sys.version_info < (3,)


#
# SPDX-SnippetBegin
# SPDX-License-Identifier: BSD-2-Clause
# SPDX-SnippetCopyrightText: Copyright 2006-2023 by the Pygments team
# SPDX-SnippetCopyrightText: Copyright 2026 by Franz Glasner
#

# Regex for a name: one character out of `unistring.xid_start` followed by
# any number of characters out of `unistring.xid_continue`.
uni_name = "[" + unistring.xid_start + "][" + unistring.xid_continue + "]*"


"""PY3 allows no @staticmethod but PY2 needs it."""
if not PY2:
    def _staticmethod(fn):
        """Return `fn` unchanged (stand-in for `staticmethod` on PY3)."""
        return fn
else:
    # On Python 2 a plain function fetched from its class becomes an
    # unbound method; the real `staticmethod` keeps it callable as
    # ``Class.func(...)`` without an instance.
    _staticmethod = staticmethod


def py_innerstring_rules(ttype):
    """Return the lexer rules for the text inside a string literal.

    Formatting placeholders (``%``-style and ``{}``-style) are emitted as
    `String.Interpol`; all other matched text gets the token type `ttype`.
    A new list is built on every call.

    Newlines are not matched by any of these rules: they are an error
    here (use "nl" state).

    """
    # the old style '%s' % (...) string formatting (still valid in Py3)
    percent_placeholder = (
        r'%(\(\w+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?'
        '[hlL]?[E-GXc-giorsaux%]'
    )
    # the new style '{}'.format(...) string formatting
    brace_placeholder = (
        r'\{'
        r'((\w+)((\.\w+)|(\[[^\]]+\]))*)?'  # field name
        r'(\![sra])?'                       # conversion
        r'(\:(.?[<>=\^])?[-+ ]?#?0?(\d+)?,?(\.\d+)?[E-GXb-gnosx%]?)?'
        r'\}'
    )

    # Rule order is significant: the placeholders must be tried first.
    rules = [
        (percent_placeholder, String.Interpol),
        (brace_placeholder, String.Interpol),
    ]
    #
    # backslashes, quotes and formatting signs must be parsed
    # one at a time
    #
    rules.append((r'[^\\\'"%{\n]+', ttype))
    rules.append((r'[\'"\\]', ttype))
    # unhandled string formatting sign
    rules.append((r'%|(\{{1,2})', ttype))
    return rules


def py_name_rules(ttype, deco_ttype=Name.Decorator):
    """Return the lexer rules for names and decorators.

    A name (see `uni_name`) directly preceded by ``@`` is emitted as
    `deco_ttype`; a plain name is emitted as `ttype`.  A new list is
    built on every call.

    """
    # We recognize decorator syntax here
    decorator_rule = (r'@' + uni_name, deco_ttype)
    #
    # Python's new matrix multiplication operator:
    # not used here in pseudocode
    # (r'@', Operator),
    plain_name_rule = (uni_name, ttype)

    # The decorator rule has to come first, as in the original rule order.
    return [decorator_rule, plain_name_rule]

# SPDX-SnippetEnd


class LexBase(RegexLexer):

    """A base that defines some common lexer states.

    The `tokens` table holds states borrowed from the Pygments Python
    lexer (numbers, strings and names, all prefixed with ``py-``) and two
    states for nested multi-line comments (``/* ... */`` and
    ``(* ... *)``).

    `op_ignore` has the signature of a rule callback
    (``lexer, match[, ctx]``); `op_fixed` builds such a callback.

    Default flags are not important.

    """

    def op_ignore(lexer, match, ctx=None):
        """Unconditionally ignore the match.

        Produces no token at all.  `lexer` is not used; `match` is the
        regex match object.  If a (truthy) `ctx` is given, its `pos` is
        set to the end of the match.

        """
        # Never executed: the `yield` is only here to turn this function
        # into a generator, so that calling it returns an (empty) iterable.
        if False:
            yield match.start(), Other, ""
        if ctx:
            ctx.pos = match.end()

    # `_staticmethod` is the real `staticmethod` on Python 2 and a no-op on
    # Python 3 (see its definition at module level).
    @_staticmethod
    def op_fixed(toktype, value):
        """Unconditionally yield a given token type and value.

        Returns a callback with the signature ``(lexer, match, ctx=None)``
        that yields exactly one token: position ``match.start()``, type
        `toktype` and text `value`, whatever text was actually matched.
        If a (truthy) `ctx` is given, its `pos` is afterwards set to the
        end of the match.

        """

        def _op_fixed(lexer, match, ctx=None):
            yield match.start(), toktype, value
            if ctx:
                ctx.pos = match.end()

        return _op_fixed

    tokens = {
#
# These states are borrowed from the Pygments Python lexer.
# Their names have been prefixed with `py-'.
#
# SPDX-SnippetBegin
# SPDX-License-Identifier: BSD-2-Clause
# SPDX-SnippetCopyrightText: Copyright 2006-2023 by the Pygments team
# SPDX-SnippetCopyrightText: Copyright 2026 by Franz Glasner
#
        # Numeric literals; a single `_' is accepted between digits.
        'py-numbers': [
            # floats with a decimal point and an optional exponent
            (r'(\d(?:_?\d)*\.(?:\d(?:_?\d)*)?|(?:\d(?:_?\d)*)?\.\d(?:_?\d)*)'
             r'([eE][+-]?\d(?:_?\d)*)?', Number.Float),
            # floats with an exponent but no decimal point (optional `j')
            (r'\d(?:_?\d)*[eE][+-]?\d(?:_?\d)*j?', Number.Float),
            (r'0[oO](?:_?[0-7])+', Number.Oct),
            (r'0[bB](?:_?[01])+', Number.Bin),
            (r'0[xX](?:_?[a-fA-F0-9])+', Number.Hex),
            (r'\d(?:_?\d)*', Number.Integer),
        ],
        # Start of a string literal: the prefix is emitted as String.Affix,
        # the opening quote as String.Double/String.Single, and a state
        # combined from the escape rules and the matching body state is
        # entered.  The triple-quote rules are listed before the
        # single-quote rules of the same prefix.
        'py-strings': [
            # non-raw strings
            ('([uU]?)(""")', bygroups(String.Affix, String.Double),
             combined('py-stringescape', 'py-tdqs')),
            ("([uU]?)(''')", bygroups(String.Affix, String.Single),
             combined('py-stringescape', 'py-tsqs')),
            ('([uU]?)(")', bygroups(String.Affix, String.Double),
             combined('py-stringescape', 'py-dqs')),
            ("([uU]?)(')", bygroups(String.Affix, String.Single),
             combined('py-stringescape', 'py-sqs')),
            # non-raw bytes
            ('([bB])(""")', bygroups(String.Affix, String.Double),
             combined('py-bytesescape', 'py-tdqs')),
            ("([bB])(''')", bygroups(String.Affix, String.Single),
             combined('py-bytesescape', 'py-tsqs')),
            ('([bB])(")', bygroups(String.Affix, String.Double),
             combined('py-bytesescape', 'py-dqs')),
            ("([bB])(')", bygroups(String.Affix, String.Single),
             combined('py-bytesescape', 'py-sqs')),
        ],
        # Escapes for text strings: \N{...}, \uXXXX and \UXXXXXXXX in
        # addition to everything from `py-bytesescape'.
        'py-stringescape': [
            (r'\\(N\{.*?\}|u[a-fA-F0-9]{4}|U[a-fA-F0-9]{8})', String.Escape),
            include('py-bytesescape')
        ],
        # Escapes for bytes strings: single-character escapes, an escaped
        # newline, \xHH and one to three octal digits.
        'py-bytesescape': [
            (r'\\([\\abfnrtv"\']|\n|x[a-fA-F0-9]{2}|[0-7]{1,3})',
             String.Escape)
        ],
        # Body of a "..." string; the closing quote leaves the state.
        'py-dqs': [
            (r'"', String.Double, '#pop'),
            (r'\\\\|\\"|\\\n', String.Escape),  # included here for raw strings
            include('py-strings-double'),
            (r'\n', Error),    # added by fag: a newline not matched above
        ],
        # Body of a '...' string; the closing quote leaves the state.
        'py-sqs': [
            (r"'", String.Single, '#pop'),
            (r"\\\\|\\'|\\\n", String.Escape),  # included here for raw strings
            include('py-strings-single'),
            (r'\n', Error),    # added by fag: a newline not matched above
        ],
        # Body of a """...""" string; newlines are part of the string.
        'py-tdqs': [
            (r'"""', String.Double, '#pop'),
            include('py-strings-double'),
            (r'\n', String.Double)
        ],
        # Body of a '''...''' string; newlines are part of the string.
        'py-tsqs': [
            (r"'''", String.Single, '#pop'),
            include('py-strings-single'),
            (r'\n', String.Single)
        ],
        # Inner string text and formatting placeholders, one state per
        # quote kind (built by `py_innerstring_rules').
        'py-strings-single': py_innerstring_rules(String.Single),
        'py-strings-double': py_innerstring_rules(String.Double),
        # Names are emitted as Name.Entity; `@name' keeps the default
        # decorator token type of `py_name_rules'.
        'py-name': py_name_rules(Name.Entity),
# SPDX-SnippetEnd
    # This snippet is from the Pygments documentation "Write your own lexer"
    # Nested /* ... */ comments: an opener pushes this state again, a
    # closer pops one level.
    'multiline-nested-comment': [
            (r'[^*/]+', Comment.Multiline),
            (r'/\*', Comment.Multiline, '#push'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[*/]', Comment.Multiline),
        ],
    # The same scheme for nested (* ... *) comments.
    'multiline-nested-comment-alt': [
            (r'[^*()]+', Comment.Multiline),
            (r'\(\*', Comment.Multiline, '#push'),
            (r'\*\)', Comment.Multiline, '#pop'),
            (r'[*()]', Comment.Multiline),
        ]
    }