438 lines
12 KiB
C
438 lines
12 KiB
C
#include <wctype.h>
|
||
#include <string.h>
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include "tree_sitter/parser.h"
|
||
|
||
/*
 * Scanner tracing.
 *
 * Define DEBUG to a non-zero value (e.g. uncomment the line below) to have
 * LOG() print its printf-style arguments to stderr with a "[SCANNER] "
 * prefix. Otherwise LOG() expands to nothing. The first argument must be a
 * string literal because it is concatenated with the prefix.
 */
/* #define DEBUG 1 */
#if !DEBUG
#define LOG(...)
#else
#define LOG(...) fprintf(stderr, "[SCANNER] " __VA_ARGS__)
#endif
|
||
|
||
/*
 * Token kinds this scanner can emit. The enumerators are used both as
 * indices into the valid_symbols array and as values stored in
 * lexer->result_symbol.
 * NOTE(review): the numbering presumably has to match the order of the
 * grammar's external tokens -- confirm against the grammar definition.
 */
enum TokenType
{
    TSLX_CONTENT = 0,             /* raw text between tags */
    TSL_STATEMENT_START_TAG = 1,  /* "<?tsl" (case-insensitive) */
    TSL_STATEMENT_END_TAG = 2,    /* "?>" closing a statement */
    TSL_EXPRESSION_START_TAG = 3, /* "<?=" */
    TSL_EXPRESSION_END_TAG = 4,   /* "?>" closing an expression */
    TSLX_END_TAG = 5              /* end of the TSLX section */
};
|
||
|
||
/*
 * Lexical contexts tracked on the scanner's state stack. The numeric values
 * are what the serialize/deserialize functions copy to and from the buffer.
 */
enum State
{
    STATE_ROOT = 0,          /* bottom of the stack; outside any TSLX section */
    STATE_TSLX = 1,          /* inside a TSLX section (content and tags) */
    STATE_TSL_STATEMENT = 2, /* between "<?tsl" and "?>" */
    STATE_TSL_EXPRESSION = 3 /* between "<?=" and "?>" */
};
|
||
|
||
/*
 * Growable stack of scanner states.
 *
 * states   - heap buffer holding the stacked states (NULL if allocation failed
 *            or after stack_free)
 * capacity - number of elements the buffer can hold
 * size     - number of elements currently on the stack
 */
typedef struct StateStack
{
    enum State* states;
    size_t capacity;
    size_t size;
} StateStack;
|
||
|
||
/*
 * Initialise a stack with room for 8 states and STATE_ROOT as its only
 * element. If the buffer cannot be allocated, the stack is left empty
 * (size 0, states NULL) while capacity still reads 8.
 */
static void stack_init(StateStack* stack)
{
    enum State* storage;

    stack->size = 0;
    stack->capacity = 8;

    storage = malloc(stack->capacity * sizeof(enum State));
    stack->states = storage;
    if (storage == NULL)
    {
        return;
    }

    storage[0] = STATE_ROOT;
    stack->size = 1;
}
|
||
|
||
/*
 * Release the stack's buffer and reset every field to zero/NULL.
 * The StateStack object itself is not freed.
 */
static void stack_free(StateStack* stack)
{
    /* free(NULL) is a no-op, so no guard is needed. */
    free(stack->states);
    stack->states = NULL;
    stack->capacity = 0;
    stack->size = 0;
}
|
||
|
||
/*
 * Push `state` onto the stack, doubling the buffer when it is full.
 *
 * The push is silently dropped when the stack has no buffer (allocation
 * failed earlier or the stack was freed) or when growing the buffer fails.
 * In the latter case the existing contents are left intact.
 */
static void stack_push(StateStack* stack, enum State state)
{
    if (stack->states == NULL)
    {
        return;
    }

    if (stack->size >= stack->capacity)
    {
        /* Never grow from 0 to 0, and never let the byte count overflow. */
        size_t new_capacity = stack->capacity > 0 ? stack->capacity * 2 : 8;
        enum State* grown;

        if (new_capacity <= stack->size ||
            new_capacity > (size_t)-1 / sizeof(enum State))
        {
            return;
        }

        /* Use a temporary so the old buffer is not leaked on failure. */
        grown = realloc(stack->states, new_capacity * sizeof(enum State));
        if (grown == NULL)
        {
            return;
        }

        stack->states = grown;
        stack->capacity = new_capacity;
    }

    stack->states[stack->size++] = state;
}
|
||
|
||
/*
 * Remove and return the top state. The bottom element is never removed:
 * when one or zero elements remain, the stack is left untouched and
 * STATE_ROOT is returned.
 */
static enum State stack_pop(StateStack* stack)
{
    if (stack->size <= 1)
    {
        return STATE_ROOT;
    }

    stack->size -= 1;
    return stack->states[stack->size];
}
|
||
|
||
static enum State stack_top(const StateStack* stack)
|
||
{
|
||
return stack->size > 0 ? stack->states[stack->size - 1] : STATE_ROOT;
|
||
}
|
||
|
||
void* tree_sitter_tsf_external_scanner_create(void)
|
||
{
|
||
StateStack* stack = malloc(sizeof(StateStack));
|
||
if (stack)
|
||
{
|
||
stack_init(stack);
|
||
}
|
||
LOG("Scanner created\n");
|
||
return stack;
|
||
}
|
||
|
||
void tree_sitter_tsf_external_scanner_destroy(void* payload)
|
||
{
|
||
StateStack* stack = (StateStack*)payload;
|
||
if (stack)
|
||
{
|
||
stack_free(stack);
|
||
free(stack);
|
||
}
|
||
LOG("Scanner destroyed\n");
|
||
}
|
||
|
||
unsigned tree_sitter_tsf_external_scanner_serialize(void* payload, char* buffer)
|
||
{
|
||
StateStack* stack = (StateStack*)payload;
|
||
if (!stack || stack->size == 0)
|
||
return 0;
|
||
|
||
size_t bytes_to_copy = stack->size * sizeof(enum State);
|
||
if (bytes_to_copy > TREE_SITTER_SERIALIZATION_BUFFER_SIZE)
|
||
{
|
||
bytes_to_copy = TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
|
||
}
|
||
|
||
memcpy(buffer, stack->states, bytes_to_copy);
|
||
return bytes_to_copy;
|
||
}
|
||
|
||
void tree_sitter_tsf_external_scanner_deserialize(void* payload, const char* buffer, unsigned length)
|
||
{
|
||
StateStack* stack = (StateStack*)payload;
|
||
if (!stack)
|
||
return;
|
||
|
||
stack_free(stack);
|
||
stack_init(stack);
|
||
|
||
if (length > 0)
|
||
{
|
||
size_t count = length / sizeof(enum State);
|
||
for (size_t i = 0; i < count && i < stack->capacity; i++)
|
||
{
|
||
enum State state;
|
||
memcpy(&state, buffer + i * sizeof(enum State), sizeof(enum State));
|
||
if (i == 0)
|
||
{
|
||
stack->states[0] = state;
|
||
}
|
||
else
|
||
{
|
||
stack_push(stack, state);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
 * Advance past consecutive whitespace characters (as classified by
 * iswspace), passing `true` as the skip argument of advance().
 */
static void skip_whitespace(TSLexer* lexer)
{
    for (;;)
    {
        if (!iswspace(lexer->lookahead))
        {
            break;
        }
        lexer->advance(lexer, true);
    }
}
|
||
|
||
/*
 * Scan forward to decide whether a "?>" appears before the next "<?".
 *
 * Returns true when "?>" is found first (the lexer is left positioned on
 * the '>'). Returns false when "<?" is found first (lexer left on the '?')
 * or when a lookahead of '\0' is reached. The lexer is advanced in all
 * cases; this function never calls mark_end.
 */
static bool lookahead_for_close_tag(TSLexer* lexer)
{
    LOG(" [lookahead_for_close_tag] Starting lookahead\n");

    for (;;)
    {
        bool was_question;
        bool was_open_angle;

        if (lexer->lookahead == '\0')
        {
            break;
        }

        /* Remember what we are stepping over, then look at its successor. */
        was_question = lexer->lookahead == '?';
        was_open_angle = lexer->lookahead == '<';
        lexer->advance(lexer, false);

        if (was_question && lexer->lookahead == '>')
        {
            LOG(" [lookahead_for_close_tag] Found ?>, returning true\n");
            return true;
        }

        if (was_open_angle && lexer->lookahead == '?')
        {
            LOG(" [lookahead_for_close_tag] Found next <?, returning false\n");
            return false;
        }
    }

    LOG(" [lookahead_for_close_tag] Reached EOF, returning false\n");
    return false;
}
|
||
|
||
/*
 * Check whether the input at the current position starts a special tag.
 *
 * Returns 1 for "<?=", 2 for "<?tsl" (case-insensitive) followed by a
 * character that is neither alphanumeric nor '_', and 0 otherwise.
 *
 * The lexer is advanced over every character that matched, even when the
 * result is 0, so this is not a pure peek. For a result of 1 the lexer is
 * left on the '='; for 2 it is left on the character after "tsl".
 */
static int peek_special_tag(TSLexer* lexer)
{
    static const char keyword[] = { 't', 's', 'l' };
    size_t matched;

    if (lexer->lookahead != '<')
    {
        return 0;
    }
    lexer->advance(lexer, false);

    if (lexer->lookahead != '?')
    {
        return 0;
    }
    lexer->advance(lexer, false);

    if (lexer->lookahead == '=')
    {
        LOG(" [peek_special_tag] Found <?=\n");
        return 1;
    }

    for (matched = 0; matched < sizeof keyword; matched++)
    {
        if (towlower(lexer->lookahead) != keyword[matched])
        {
            return 0;
        }
        lexer->advance(lexer, false);
    }

    /* "tsl" must not be the prefix of a longer identifier. */
    if (iswalnum(lexer->lookahead) || lexer->lookahead == '_')
    {
        return 0;
    }

    LOG(" [peek_special_tag] Found <?tsl\n");
    return 2;
}
|
||
|
||
/*
 * Consume raw content up to the next special tag ("<?=" or "<?tsl") or
 * until the lookahead is '\0'.
 *
 * Before each '<' the token end is marked, so when a special tag follows,
 * the token stops just before it. Returns true if anything was consumed as
 * content before stopping, false otherwise.
 */
static bool scan_tslx_content(TSLexer* lexer)
{
    bool consumed_any = false;

    LOG(" [scan_tslx_content] Starting scan\n");

    while (lexer->lookahead != '\0')
    {
        if (lexer->lookahead != '<')
        {
            lexer->advance(lexer, false);
            consumed_any = true;
            continue;
        }

        /* Possible tag start: remember this position as the token end. */
        lexer->mark_end(lexer);

        if (peek_special_tag(lexer) > 0)
        {
            LOG(" [scan_tslx_content] Found special tag, stopping. has_content=%d\n", consumed_any);
            return consumed_any;
        }

        /* Not a special tag: whatever peek_special_tag stepped over is content. */
        consumed_any = true;
    }

    LOG(" [scan_tslx_content] Reached EOF, has_content=%d\n", consumed_any);
    return consumed_any;
}
|
||
|
||
/*
 * External scanner entry point: try to recognise one external token at the
 * current lexer position.
 *
 * payload       - the scanner's StateStack; a NULL payload makes the scan fail
 * lexer         - lexer used to inspect and consume input
 * valid_symbols - indexed by enum TokenType; true for tokens the parser accepts here
 *
 * Returns true with lexer->result_symbol set when a token was recognised,
 * false otherwise. The state stack is pushed on start tags and popped on end
 * tags so that later calls know which context they are in.
 */
bool tree_sitter_tsf_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols)
{
    StateStack* stack = (StateStack*)payload;
    if (!stack)
    {
        LOG("ERROR: stack is NULL\n");
        return false;
    }

    enum State current_state = stack_top(stack);

    LOG("\n=== SCAN START ===\n");
    LOG("Current state: %d (0=ROOT, 1=TSLX, 2=TSL_STATEMENT, 3=TSL_EXPRESSION)\n", current_state);
    LOG("Current char: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead);
    LOG("Valid symbols: CONTENT=%d, TSL_START=%d, TSL_END=%d, EXPR_START=%d, EXPR_END=%d, TSLX_END=%d\n",
        valid_symbols[TSLX_CONTENT],
        valid_symbols[TSL_STATEMENT_START_TAG],
        valid_symbols[TSL_STATEMENT_END_TAG],
        valid_symbols[TSL_EXPRESSION_START_TAG],
        valid_symbols[TSL_EXPRESSION_END_TAG],
        valid_symbols[TSLX_END_TAG]);

    // Key fix: infer the state transition from valid_symbols.
    // If we are still in the ROOT state but the parser expects a token that
    // only occurs inside TSLX, the opening <?tslx> has just been matched.
    if (current_state == STATE_ROOT &&
        (valid_symbols[TSLX_CONTENT] ||
         valid_symbols[TSL_STATEMENT_START_TAG] ||
         valid_symbols[TSL_EXPRESSION_START_TAG] ||
         valid_symbols[TSLX_END_TAG]))
    {
        LOG("State transition: ROOT -> TSLX (inferred from valid_symbols)\n");
        stack_push(stack, STATE_TSLX);
        current_state = STATE_TSLX;
    }

    // Whitespace is only skipped inside a statement or expression; inside
    // TSLX it is part of the content.
    if (current_state == STATE_TSL_STATEMENT || current_state == STATE_TSL_EXPRESSION)
    {
        skip_whitespace(lexer);
        LOG("After skip whitespace: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead);
    }

    if (current_state == STATE_TSLX)
    {
        // True while characters consumed by the tag probe below still have
        // to be accounted for as content (the probe did not find a tag).
        bool probed_prefix_is_content = false;

        LOG("In STATE_TSLX\n");

        if (lexer->lookahead == '\0' && valid_symbols[TSLX_END_TAG])
        {
            LOG("Found EOF, returning TSLX_END_TAG\n");
            lexer->mark_end(lexer);
            lexer->result_symbol = TSLX_END_TAG;
            stack_pop(stack);
            return true;
        }

        if (lexer->lookahead == '<')
        {
            LOG("Found '<', checking for tag\n");
            lexer->advance(lexer, false);
            probed_prefix_is_content = true;

            if (lexer->lookahead == '?')
            {
                LOG("Found '<?', checking tag type\n");
                lexer->advance(lexer, false);

                if (lexer->lookahead == '=' && valid_symbols[TSL_EXPRESSION_START_TAG])
                {
                    LOG("Found '<?=', returning TSL_EXPRESSION_START_TAG\n");
                    lexer->advance(lexer, false);
                    lexer->mark_end(lexer);
                    lexer->result_symbol = TSL_EXPRESSION_START_TAG;
                    stack_push(stack, STATE_TSL_EXPRESSION);
                    return true;
                }

                if (towlower(lexer->lookahead) == 't')
                {
                    LOG("Found 't', checking for 'tsl'\n");
                    lexer->advance(lexer, false);

                    if (towlower(lexer->lookahead) == 's')
                    {
                        lexer->advance(lexer, false);

                        if (towlower(lexer->lookahead) == 'l')
                        {
                            lexer->advance(lexer, false);

                            LOG("Found 'tsl', next char: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead);

                            if (!iswalnum(lexer->lookahead) && lexer->lookahead != '_')
                            {
                                LOG("Valid <?tsl delimiter, performing lookahead\n");

                                // A complete "<?tsl" was recognised. The
                                // fallback below keeps its original behaviour
                                // for this path.
                                probed_prefix_is_content = false;

                                // The token ends right after "<?tsl"; the
                                // lookahead below must not extend it.
                                lexer->mark_end(lexer);

                                bool has_close_tag = lookahead_for_close_tag(lexer);
                                LOG("Lookahead result: has_close_tag = %d\n", has_close_tag);

                                if (has_close_tag && valid_symbols[TSL_STATEMENT_START_TAG])
                                {
                                    LOG("Returning TSL_STATEMENT_START_TAG\n");
                                    lexer->result_symbol = TSL_STATEMENT_START_TAG;
                                    stack_push(stack, STATE_TSL_STATEMENT);
                                    return true;
                                }
                                else if (!has_close_tag && valid_symbols[TSLX_END_TAG])
                                {
                                    LOG("Returning TSLX_END_TAG (<?tsl without ?>)\n");
                                    lexer->result_symbol = TSLX_END_TAG;
                                    stack_pop(stack);
                                    return true;
                                }
                            }
                        }
                    }
                }
            }
        }

        if (valid_symbols[TSLX_CONTENT])
        {
            LOG("Trying to scan TSLX_CONTENT\n");

            // Always run the content scan so it can advance and mark the
            // token end. Characters already consumed by the failed tag probe
            // count as content even if the scan stops immediately (EOF, or a
            // special tag right behind them). Previously such a prefix, e.g.
            // the first '<' of "<<?=", was dropped and the scan failed.
            bool scanned_more = scan_tslx_content(lexer);
            if (scanned_more || probed_prefix_is_content)
            {
                LOG("Successfully scanned TSLX_CONTENT\n");
                lexer->result_symbol = TSLX_CONTENT;
                return true;
            }
            LOG("Failed to scan TSLX_CONTENT\n");
        }
    }

    if (current_state == STATE_TSL_STATEMENT)
    {
        LOG("In STATE_TSL_STATEMENT\n");
        if (lexer->lookahead == '?' && valid_symbols[TSL_STATEMENT_END_TAG])
        {
            lexer->advance(lexer, false);
            if (lexer->lookahead == '>')
            {
                LOG("Found '?>', returning TSL_STATEMENT_END_TAG\n");
                lexer->advance(lexer, false);
                lexer->mark_end(lexer);
                lexer->result_symbol = TSL_STATEMENT_END_TAG;
                stack_pop(stack);
                return true;
            }
        }
    }

    if (current_state == STATE_TSL_EXPRESSION)
    {
        LOG("In STATE_TSL_EXPRESSION\n");
        if (lexer->lookahead == '?' && valid_symbols[TSL_EXPRESSION_END_TAG])
        {
            lexer->advance(lexer, false);
            if (lexer->lookahead == '>')
            {
                LOG("Found '?>', returning TSL_EXPRESSION_END_TAG\n");
                lexer->advance(lexer, false);
                lexer->mark_end(lexer);
                lexer->result_symbol = TSL_EXPRESSION_END_TAG;
                stack_pop(stack);
                return true;
            }
        }
    }

    LOG("Returning false (no match)\n");
    return false;
}
|