#include #include #include #include #include "tree_sitter/parser.h" /* #define DEBUG 1 */ #if DEBUG #define LOG(...) fprintf(stderr, "[SCANNER] " __VA_ARGS__) #else #define LOG(...) #endif enum TokenType { TSLX_CONTENT, TSL_STATEMENT_START_TAG, TSL_STATEMENT_END_TAG, TSL_EXPRESSION_START_TAG, TSL_EXPRESSION_END_TAG, TSLX_END_TAG }; enum State { STATE_ROOT, STATE_TSLX, STATE_TSL_STATEMENT, STATE_TSL_EXPRESSION }; typedef struct { enum State* states; size_t capacity; size_t size; } StateStack; static void stack_init(StateStack* stack) { stack->capacity = 8; stack->size = 0; stack->states = malloc(stack->capacity * sizeof(enum State)); if (stack->states) { stack->states[0] = STATE_ROOT; stack->size = 1; } } static void stack_free(StateStack* stack) { if (stack->states) { free(stack->states); stack->states = NULL; } stack->size = 0; stack->capacity = 0; } static void stack_push(StateStack* stack, enum State state) { if (stack->size >= stack->capacity) { stack->capacity *= 2; stack->states = realloc(stack->states, stack->capacity * sizeof(enum State)); } if (stack->states) { stack->states[stack->size++] = state; } } static enum State stack_pop(StateStack* stack) { if (stack->size > 1) { return stack->states[--stack->size]; } return STATE_ROOT; } static enum State stack_top(const StateStack* stack) { return stack->size > 0 ? stack->states[stack->size - 1] : STATE_ROOT; } void* tree_sitter_tsf_external_scanner_create(void) { StateStack* stack = malloc(sizeof(StateStack)); if (stack) { stack_init(stack); } LOG("Scanner created\n"); return stack; } void tree_sitter_tsf_external_scanner_destroy(void* payload) { StateStack* stack = (StateStack*)payload; if (stack) { stack_free(stack); free(stack); } LOG("Scanner destroyed\n"); } unsigned tree_sitter_tsf_external_scanner_serialize(void* payload, char* buffer) { StateStack* stack = (StateStack*)payload; if (!stack || stack->size == 0) return 0; size_t bytes_to_copy = stack->size * sizeof(enum State); if (bytes_to_copy > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { bytes_to_copy = TREE_SITTER_SERIALIZATION_BUFFER_SIZE; } memcpy(buffer, stack->states, bytes_to_copy); return bytes_to_copy; } void tree_sitter_tsf_external_scanner_deserialize(void* payload, const char* buffer, unsigned length) { StateStack* stack = (StateStack*)payload; if (!stack) return; stack_free(stack); stack_init(stack); if (length > 0) { size_t count = length / sizeof(enum State); for (size_t i = 0; i < count && i < stack->capacity; i++) { enum State state; memcpy(&state, buffer + i * sizeof(enum State), sizeof(enum State)); if (i == 0) { stack->states[0] = state; } else { stack_push(stack, state); } } } } static void skip_whitespace(TSLexer* lexer) { while (iswspace(lexer->lookahead)) { lexer->advance(lexer, true); } } static bool lookahead_for_close_tag(TSLexer* lexer) { LOG(" [lookahead_for_close_tag] Starting lookahead\n"); while (lexer->lookahead != '\0') { if (lexer->lookahead == '?') { lexer->advance(lexer, false); if (lexer->lookahead == '>') { LOG(" [lookahead_for_close_tag] Found ?>, returning true\n"); return true; } } else if (lexer->lookahead == '<') { lexer->advance(lexer, false); if (lexer->lookahead == '?') { LOG(" [lookahead_for_close_tag] Found next advance(lexer, false); } } LOG(" [lookahead_for_close_tag] Reached EOF, returning false\n"); return false; } static int peek_special_tag(TSLexer* lexer) { if (lexer->lookahead != '<') return 0; lexer->advance(lexer, false); if (lexer->lookahead != '?') return 0; lexer->advance(lexer, false); if (lexer->lookahead == '=') { LOG(" [peek_special_tag] Found lookahead) == 't') { lexer->advance(lexer, false); if (towlower(lexer->lookahead) == 's') { lexer->advance(lexer, false); if (towlower(lexer->lookahead) == 'l') { lexer->advance(lexer, false); if (!iswalnum(lexer->lookahead) && lexer->lookahead != '_') { LOG(" [peek_special_tag] Found lookahead != '\0') { if (lexer->lookahead == '<') { lexer->mark_end(lexer); int tag_type = peek_special_tag(lexer); if (tag_type > 0) { LOG(" [scan_tslx_content] Found special tag, stopping. has_content=%d\n", has_content); return has_content; } has_content = true; } else { lexer->advance(lexer, false); has_content = true; } } LOG(" [scan_tslx_content] Reached EOF, has_content=%d\n", has_content); return has_content; } bool tree_sitter_tsf_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols) { StateStack* stack = (StateStack*)payload; if (!stack) { LOG("ERROR: stack is NULL\n"); return false; } enum State current_state = stack_top(stack); LOG("\n=== SCAN START ===\n"); LOG("Current state: %d (0=ROOT, 1=TSLX, 2=TSL_STATEMENT, 3=TSL_EXPRESSION)\n", current_state); LOG("Current char: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead); LOG("Valid symbols: CONTENT=%d, TSL_START=%d, TSL_END=%d, EXPR_START=%d, EXPR_END=%d, TSLX_END=%d\n", valid_symbols[TSLX_CONTENT], valid_symbols[TSL_STATEMENT_START_TAG], valid_symbols[TSL_STATEMENT_END_TAG], valid_symbols[TSL_EXPRESSION_START_TAG], valid_symbols[TSL_EXPRESSION_END_TAG], valid_symbols[TSLX_END_TAG]); // ⚠️ 关键修复:通过 valid_symbols 推断状态切换 // 如果当前在 ROOT 状态,但 parser 期望 TSLX 内部的 token,说明刚匹配完 if (current_state == STATE_ROOT && (valid_symbols[TSLX_CONTENT] || valid_symbols[TSL_STATEMENT_START_TAG] || valid_symbols[TSL_EXPRESSION_START_TAG] || valid_symbols[TSLX_END_TAG])) { LOG("State transition: ROOT -> TSLX (inferred from valid_symbols)\n"); stack_push(stack, STATE_TSLX); current_state = STATE_TSLX; } if (current_state == STATE_TSL_STATEMENT || current_state == STATE_TSL_EXPRESSION) { skip_whitespace(lexer); LOG("After skip whitespace: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead); } if (current_state == STATE_TSLX) { LOG("In STATE_TSLX\n"); if (lexer->lookahead == '\0' && valid_symbols[TSLX_END_TAG]) { LOG("Found EOF, returning TSLX_END_TAG\n"); lexer->mark_end(lexer); lexer->result_symbol = TSLX_END_TAG; stack_pop(stack); return true; } if (lexer->lookahead == '<') { LOG("Found '<', checking for tag\n"); lexer->advance(lexer, false); if (lexer->lookahead == '?') { LOG("Found 'advance(lexer, false); if (lexer->lookahead == '=' && valid_symbols[TSL_EXPRESSION_START_TAG]) { LOG("Found 'advance(lexer, false); lexer->mark_end(lexer); lexer->result_symbol = TSL_EXPRESSION_START_TAG; stack_push(stack, STATE_TSL_EXPRESSION); return true; } if (towlower(lexer->lookahead) == 't') { LOG("Found 't', checking for 'tsl'\n"); lexer->advance(lexer, false); if (towlower(lexer->lookahead) == 's') { lexer->advance(lexer, false); if (towlower(lexer->lookahead) == 'l') { lexer->advance(lexer, false); LOG("Found 'tsl', next char: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead); if (!iswalnum(lexer->lookahead) && lexer->lookahead != '_') { LOG("Valid mark_end(lexer); bool has_close_tag = lookahead_for_close_tag(lexer); LOG("Lookahead result: has_close_tag = %d\n", has_close_tag); if (has_close_tag && valid_symbols[TSL_STATEMENT_START_TAG]) { LOG("Returning TSL_STATEMENT_START_TAG\n"); lexer->result_symbol = TSL_STATEMENT_START_TAG; stack_push(stack, STATE_TSL_STATEMENT); return true; } else if (!has_close_tag && valid_symbols[TSLX_END_TAG]) { LOG("Returning TSLX_END_TAG ()\n"); lexer->result_symbol = TSLX_END_TAG; stack_pop(stack); return true; } } } } } } } if (valid_symbols[TSLX_CONTENT]) { LOG("Trying to scan TSLX_CONTENT\n"); if (scan_tslx_content(lexer)) { LOG("Successfully scanned TSLX_CONTENT\n"); lexer->result_symbol = TSLX_CONTENT; return true; } LOG("Failed to scan TSLX_CONTENT\n"); } } if (current_state == STATE_TSL_STATEMENT) { LOG("In STATE_TSL_STATEMENT\n"); if (lexer->lookahead == '?' && valid_symbols[TSL_STATEMENT_END_TAG]) { lexer->advance(lexer, false); if (lexer->lookahead == '>') { LOG("Found '?>', returning TSL_STATEMENT_END_TAG\n"); lexer->advance(lexer, false); lexer->mark_end(lexer); lexer->result_symbol = TSL_STATEMENT_END_TAG; stack_pop(stack); return true; } } } if (current_state == STATE_TSL_EXPRESSION) { LOG("In STATE_TSL_EXPRESSION\n"); if (lexer->lookahead == '?' && valid_symbols[TSL_EXPRESSION_END_TAG]) { lexer->advance(lexer, false); if (lexer->lookahead == '>') { LOG("Found '?>', returning TSL_EXPRESSION_END_TAG\n"); lexer->advance(lexer, false); lexer->mark_end(lexer); lexer->result_symbol = TSL_EXPRESSION_END_TAG; stack_pop(stack); return true; } } } LOG("Returning false (no match)\n"); return false; }