# Source code for mechanics_dsl.parser.tokens

"""
Token system for MechanicsDSL parser.

This module provides the tokenization layer that converts DSL source code
into a stream of tokens for parsing.

Classes:
    Token: Represents a token with position tracking for error messages.

Functions:
    tokenize: Convert source code string to list of tokens.

Example:
    >>> from mechanics_dsl.parser.tokens import tokenize
    >>> tokens = tokenize(r"\\system{pendulum}")
    >>> print(tokens[0])
    SYSTEM:\\system@1:1
"""

import re
from dataclasses import dataclass
from typing import List, Tuple

from ..utils import logger

# ============================================================================
# TOKEN TYPE DEFINITIONS
# ============================================================================

# Ordered list of (token name, regex) pairs.
#
# The pairs are joined into a single alternation of named groups, and the
# regex engine takes the FIRST alternative that matches at a position, so
# order is significant:
#   * specific backslash commands must precede the generic COMMAND fallback;
#   * RANGE_OP ("..") must precede DOT (".");
#   * WHITESPACE precedes NEWLINE, so NEWLINE can never actually match
#     (``\s+`` already consumes "\n"); the entry is kept so the set of
#     token names is unchanged.
#
# NOTE(review): keyword patterns carry no trailing boundary, so a command
# that merely starts with a keyword (e.g. ``\dots``) lexes as the keyword
# followed by an IDENT. Left as-is because downstream code may rely on it.
TOKEN_TYPES = [
    # Physics specific commands (order matters!)
    ("DOT_NOTATION", r"\\ddot|\\dot"),  # longer alternative first
    ("SYSTEM", r"\\system"),
    ("DEFVAR", r"\\defvar"),
    ("DEFINE", r"\\define"),
    ("LAGRANGIAN", r"\\lagrangian"),
    ("HAMILTONIAN", r"\\hamiltonian"),
    ("TRANSFORM", r"\\transform"),
    ("CONSTRAINT", r"\\constraint"),
    ("NONHOLONOMIC", r"\\nonholonomic"),
    ("FORCE", r"\\force"),
    ("DAMPING", r"\\damping"),
    ("RAYLEIGH", r"\\rayleigh"),
    ("INITIAL", r"\\initial"),
    ("SOLVE", r"\\solve"),
    ("ANIMATE", r"\\animate"),
    ("PLOT", r"\\plot"),
    ("PARAMETER", r"\\parameter"),
    ("EXPORT", r"\\export"),
    ("IMPORT", r"\\import"),
    ("EULER_ANGLES", r"\\euler"),
    ("QUATERNION", r"\\quaternion"),
    # Vector operations
    ("VEC", r"\\vec"),
    ("HAT", r"\\hat"),
    ("MAGNITUDE", r"\\mag|\\norm"),
    # Advanced math operators
    ("VECTOR_DOT", r"\\cdot"),
    ("VECTOR_CROSS", r"\\times|\\cross"),
    ("GRADIENT", r"\\nabla|\\grad"),
    ("DIVERGENCE", r"\\div"),
    ("CURL", r"\\curl"),
    ("LAPLACIAN", r"\\laplacian|\\Delta"),
    # Calculus
    ("PARTIAL", r"\\partial"),
    ("INTEGRAL", r"\\int"),
    ("OINT", r"\\oint"),
    ("SUM", r"\\sum"),
    ("LIMIT", r"\\lim"),
    ("FRAC", r"\\frac"),
    # Greek letters (comprehensive)
    (
        "GREEK_LETTER",
        r"\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\varepsilon|\\zeta|\\eta|\\theta|\\vartheta|\\iota|\\kappa|\\lambda|\\mu|\\nu|\\xi|\\omicron|\\pi|\\varpi|\\rho|\\varrho|\\sigma|\\varsigma|\\tau|\\upsilon|\\phi|\\varphi|\\chi|\\psi|\\omega",  # noqa: E501
    ),
    ("FLUID", r"\\fluid"),
    ("BOUNDARY", r"\\boundary"),
    ("REGION", r"\\region"),
    ("PARTICLE_MASS", r"\\particle_mass"),
    ("EOS", r"\\equation_of_state"),
    ("RANGE_OP", r"\.\."),
    # General commands
    ("COMMAND", r"\\[a-zA-Z_][a-zA-Z0-9_]*"),
    # Brackets and grouping
    ("LBRACE", r"\{"),
    ("RBRACE", r"\}"),
    ("LPAREN", r"\("),
    ("RPAREN", r"\)"),
    ("LBRACKET", r"\["),
    ("RBRACKET", r"\]"),
    # Mathematical operators
    ("PLUS", r"\+"),
    ("MINUS", r"-"),
    ("MULTIPLY", r"\*"),
    ("DIVIDE", r"/"),
    ("POWER", r"\^"),
    ("EQUALS", r"="),
    ("COMMA", r","),
    ("SEMICOLON", r";"),
    ("COLON", r":"),
    ("DOT", r"\."),
    ("UNDERSCORE", r"_"),
    ("PIPE", r"\|"),
    # Basic tokens
    # The decimal point is only consumed when it is NOT followed by another
    # dot; otherwise "0..10" would lex as NUMBER("0.") DOT NUMBER("10") and
    # RANGE_OP could never follow an integer literal.
    ("NUMBER", r"\d+(?:\.(?!\.)\d*)?(?:[eE][+-]?\d+)?"),
    ("IDENT", r"[a-zA-Z_][a-zA-Z0-9_]*"),
    ("WHITESPACE", r"\s+"),
    ("NEWLINE", r"\n"),  # shadowed by WHITESPACE above; never matches
    ("COMMENT", r"%.*"),
]

# Build the master lexer regex: every (name, pattern) pair becomes a named
# group and the groups are OR-ed together in TOKEN_TYPES order, so the name
# of the matching group identifies the token type.
token_regex = "|".join("(?P<%s>%s)" % entry for entry in TOKEN_TYPES)
token_pattern = re.compile(token_regex)


# ============================================================================
# TOKEN CLASS
# ============================================================================


@dataclass
class Token:
    """
    Token with position tracking for better error messages.

    Attributes:
        type: The token type (e.g., 'IDENT', 'NUMBER', 'LAGRANGIAN').
        value: The raw string value matched from source.
        position: Character position in source (0-indexed).
        line: Line number (1-indexed).
        column: Column number (1-indexed).

    Example:
        >>> token = Token('IDENT', 'theta', position=10, line=2, column=5)
        >>> print(token)
        IDENT:theta@2:5
    """

    type: str
    value: str
    position: int = 0
    line: int = 1
    column: int = 1

    def __repr__(self) -> str:
        # Compact "TYPE:value@line:column" form; `position` is deliberately
        # left out. Also used by str()/print() since no __str__ is defined.
        return f"{self.type}:{self.value}@{self.line}:{self.column}"
# ============================================================================
# TOKENIZER FUNCTION
# ============================================================================
def tokenize(source: str) -> List[Token]:
    """
    Tokenize DSL source code with position tracking.

    Converts a string of MechanicsDSL code into a list of tokens,
    excluding whitespace and comments. Unrecognized characters are
    reported as a single error rather than silently dropped.

    Args:
        source: DSL source code string.

    Returns:
        List of Token objects (excluding whitespace and comments).

    Raises:
        ValueError: If the source contains characters that do not match
            any token pattern (e.g. ``@``, ``$``, ``&``). The message lists
            at most the first five offenders with their line and column.

    Example:
        >>> tokens = tokenize(r"\\lagrangian{T - V}")
        >>> [t.type for t in tokens]
        ['LAGRANGIAN', 'LBRACE', 'IDENT', 'MINUS', 'IDENT', 'RBRACE']
    """
    tokens: List[Token] = []
    # (line, column, character) for every character no pattern matched.
    unknown: List[Tuple[int, int, str]] = []
    line = 1  # 1-indexed line of the current scan position
    line_start = 0  # absolute offset of the first character of `line`
    pos = 0  # absolute offset just past the previous match

    def _scan_unmatched(begin: int, end: int) -> None:
        """Record stray characters in ``source[begin:end]``, tracking newlines."""
        nonlocal line, line_start
        for index, ch in enumerate(source[begin:end], begin):
            if ch == "\n":
                line += 1
                line_start = index + 1
            elif not ch.isspace():
                unknown.append((line, index - line_start + 1, ch))

    for match in token_pattern.finditer(source):
        start = match.start()

        # finditer() silently skips text that matches nothing; whatever lies
        # between the previous match and this one is therefore unrecognized.
        _scan_unmatched(pos, start)

        kind = match.lastgroup
        value = match.group()
        column = start - line_start + 1

        if kind not in ("WHITESPACE", "COMMENT"):
            tokens.append(Token(kind, value, start, line, column))

        # A match may itself span newlines (WHITESPACE does); advance the
        # line bookkeeping only after the token's own position was recorded.
        newline_count = value.count("\n")
        if newline_count:
            line += newline_count
            line_start = start + value.rfind("\n") + 1

        pos = match.end()

    # Anything left after the final match.
    _scan_unmatched(pos, len(source))

    if unknown:
        head = unknown[:5]
        details = ", ".join(f"{ch!r} at line {ln}, col {col}" for ln, col, ch in head)
        more = "" if len(unknown) <= len(head) else f" (+{len(unknown) - len(head)} more)"
        raise ValueError(f"Unrecognized character(s): {details}{more}")

    logger.debug(f"Tokenized {len(tokens)} tokens from {line} lines")
    return tokens
# Public API of this module (the names exported by a star-import).
__all__ = [
    "TOKEN_TYPES",
    "token_pattern",
    "Token",
    "tokenize",
]