You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

159 lines
5.9 KiB

"""
Variable Wrapper Utility
Automatically wraps variables in HTML content with tooltip spans
"""
import re
import html
from typing import List, Tuple
from .symbol_loader import get_symbol_definitions
class VariableWrapper:
    """Wraps known variables in HTML content with tooltip markup.

    Symbols and tooltip texts are read from the object returned by
    ``get_symbol_definitions()``; only its ``get_all_symbols()`` and
    ``get_tooltip(symbol)`` methods are used here.
    """

    # Single letters that commonly appear in regular text.
    # These are only matched in specific mathematical contexts.
    _CONTEXT_ONLY_SYMBOLS = frozenset({
        'A', 'I', 'V', 'P', 'Q', 'R', 'L', 'C', 'E', 'B', 'G',
        'X', 'Y', 'Z', 'f', 'd', 'h',
    })
    # Very common English words that need extra-strict matching.
    _STRICT_CONTEXT_SYMBOLS = frozenset({'A', 'I'})

    # Markup that must never be searched for symbols: spans produced by an
    # earlier pass (or an earlier call), comments, declarations and tags
    # (including their attribute values). ``[\s\S]`` is used instead of
    # ``.`` so no regex flag leaks into the symbol patterns.
    _PROTECTED_MARKUP = (
        r'<span class="var-tooltip"[^>]*>[\s\S]*?</span>'
        r'|<!--[\s\S]*?-->'
        r'|<![^>]*>'
        r'|</?[A-Za-z][^>]*>'
    )

    # <code>/<pre> elements, with or without attributes, any letter case.
    _CODE_BLOCK_RE = re.compile(
        r'<(code|pre)\b[^>]*>.*?</\1\s*>',
        re.DOTALL | re.IGNORECASE,
    )
    _PLACEHOLDER_RE = re.compile(r'___CODE_BLOCK_([0-9]+)___')

    def __init__(self):
        """Initialize variable wrapper with symbol definitions."""
        self.symbols = get_symbol_definitions()
        self._build_patterns()

    def _build_patterns(self) -> None:
        """Build regex patterns for all known symbols.

        Fills ``self.patterns`` (multi-character and uncommon symbols) and
        ``self.context_patterns`` (single letters that are only matched in
        a mathematical context) with ``(regex, symbol)`` pairs.
        """
        # Longest first, so longer symbols are wrapped before shorter ones
        # that could be a part of them.
        symbols_list = sorted(
            self.symbols.get_all_symbols(),
            key=len,
            reverse=True,
        )

        self.patterns: List[Tuple[str, str]] = []
        self.context_patterns: List[Tuple[str, str]] = []  # Patterns requiring context

        for symbol in symbols_list:
            # Escape special regex characters
            escaped = re.escape(symbol)

            if symbol not in self._CONTEXT_ONLY_SYMBOLS:
                # Normal pattern: the symbol must not touch a word character
                # on either side (underscore counts as a word character).
                self.patterns.append((rf'(?<!\w)({escaped})(?!\w)', symbol))
                continue

            if symbol in self._STRICT_CONTEXT_SYMBOLS:
                # Extra restrictive for A, I: must directly follow one of
                # = × + - / ( with at most one whitespace in between.
                # The whitespace sits outside the lookbehind so that the
                # lookbehind stays fixed-width.
                pattern = (
                    rf'(?<=[=×+\-/\(])\s?({escaped})'
                    rf'(?=[\s=+\-*/()\[\]])'
                )
            else:
                # Either "=" plus optional whitespace, or plain whitespace,
                # before the symbol. Two alternatives are needed because a
                # lookbehind cannot be variable-width.
                pattern = (
                    rf'(?<=[=])\s?({escaped})(?=[\s=+\-*/()\[\],;<>])'
                    rf'|(?<=\s)({escaped})(?=[\s=+\-*/()\[\],;<>])'
                )
            self.context_patterns.append((pattern, symbol))

        print(f"[VariableWrapper] Built {len(self.patterns)} normal patterns + {len(self.context_patterns)} context-sensitive patterns")

    def _apply_pattern(self, html_content: str, pattern: str, symbol: str,
                       tooltip_text: str) -> Tuple[str, int]:
        """Wrap every occurrence of one symbol outside protected markup.

        Args:
            html_content: HTML content to process
            pattern: Regex for the symbol, as built by ``_build_patterns``
            symbol: The literal symbol the pattern matches
            tooltip_text: Unescaped tooltip text for the symbol

        Returns:
            Tuple of (new HTML content, number of occurrences wrapped)
        """
        # Escape for HTML attribute (newlines become &#10;)
        tooltip_escaped = html.escape(tooltip_text, quote=True).replace('\n', '&#10;')
        opening_tag = (
            f'<span class="var-tooltip" '
            f'data-symbol="{html.escape(symbol, quote=True)}" '
            f'title="{tooltip_escaped}">'
        )

        # Protected markup is listed first, so the scanner consumes a whole
        # tag / wrapped span before the symbol pattern is tried inside it.
        combined = re.compile(
            f'(?P<protected>{self._PROTECTED_MARKUP})|(?P<variable>{pattern})'
        )
        hits = 0

        def substitute(match):
            nonlocal hits
            if match.group('protected') is not None:
                return match.group(0)
            hits += 1
            matched = match.group('variable')
            # Context patterns may consume one whitespace before the
            # symbol; keep it outside the span.
            split_at = len(matched) - len(symbol)
            return f'{matched[:split_at]}{opening_tag}{matched[split_at:]}</span>'

        # A callable replacement is used on purpose: a replacement string
        # would be parsed as a template and choke on backslashes in tooltips.
        return combined.sub(substitute, html_content), hits

    def wrap_variables(self, html_content: str) -> str:
        """
        Wrap known variables in HTML content with tooltip spans

        Tags, attribute values, comments and spans that are already
        wrapped are left untouched, so calling this twice does not nest
        or corrupt spans.

        Args:
            html_content: HTML content to process
        Returns:
            HTML content with variables wrapped in tooltip spans
        """
        # Track which variables were actually wrapped (for debugging)
        wrapped_vars = set()

        for pattern, symbol in self.patterns + self.context_patterns:
            tooltip_text = self.symbols.get_tooltip(symbol)
            if not tooltip_text:
                continue
            html_content, hits = self._apply_pattern(
                html_content, pattern, symbol, tooltip_text
            )
            if hits:
                wrapped_vars.add(symbol)

        if wrapped_vars:
            print(f"[VariableWrapper] Wrapped {len(wrapped_vars)} unique variables: {', '.join(sorted(wrapped_vars)[:10])}...")
        return html_content

    def wrap_in_context(self, html_content: str) -> str:
        """
        Wrap variables everywhere except inside <code> and <pre> elements

        Args:
            html_content: HTML content to process
        Returns:
            HTML content with variables wrapped (context-aware)
        """
        # TODO: Implement HTML parsing to be more selective
        # (e.g., skip <h1>-<h6> tags)
        code_blocks: List[str] = []

        def preserve_code(match):
            """Preserve code blocks and replace with placeholder"""
            code_blocks.append(match.group(0))
            return f"___CODE_BLOCK_{len(code_blocks) - 1}___"

        def restore_code(match):
            """Swap a placeholder back for the code block it stands for"""
            index = int(match.group(1))
            if index < len(code_blocks):
                return code_blocks[index]
            # Not one of ours (literal text in the input): leave as is.
            return match.group(0)

        # Temporarily remove code blocks
        html_content = self._CODE_BLOCK_RE.sub(preserve_code, html_content)
        # Wrap variables
        html_content = self.wrap_variables(html_content)
        # Restore code blocks in one pass, so restored code is not rescanned
        return self._PLACEHOLDER_RE.sub(restore_code, html_content)