tsl-devkit/lsp-server/test/test_tree_sitter/src/scanner.c

438 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <wctype.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "tree_sitter/parser.h"
/*
 * Debug logging switch. Uncomment the define below (or define DEBUG to a
 * non-zero value when compiling) to make LOG() write "[SCANNER] "-prefixed
 * messages to stderr. Otherwise LOG() expands to nothing.
 */
/* #define DEBUG 1 */
#if DEBUG
/* The first LOG() argument must be a string literal: it is concatenated onto the prefix. */
#define LOG(...) fprintf(stderr, "[SCANNER] " __VA_ARGS__)
#else
#define LOG(...)
#endif
/*
 * Symbols this scanner can produce. The values are written to
 * lexer->result_symbol and used as indices into valid_symbols.
 * NOTE(review): presumably the order must match the `externals` list of the
 * grammar -- confirm against the grammar definition.
 */
enum TokenType
{
/* Plain text inside a TSLX region, up to the next `<?=` / `<?tsl` or end of input. */
TSLX_CONTENT,
/* `<?tsl` (case-insensitive, not followed by an identifier character) when a `?>` is found ahead. */
TSL_STATEMENT_START_TAG,
/* `?>` while in STATE_TSL_STATEMENT. */
TSL_STATEMENT_END_TAG,
/* `<?=`. */
TSL_EXPRESSION_START_TAG,
/* `?>` while in STATE_TSL_EXPRESSION. */
TSL_EXPRESSION_END_TAG,
/* Emitted at end of input, or at a `<?tsl` for which no `?>` is found ahead. */
TSLX_END_TAG
};
/*
 * Lexing contexts tracked on the scanner's state stack.
 * The raw values are copied byte-for-byte by the serialize/deserialize
 * functions in this file.
 */
enum State
{
/* Bottom-of-stack state installed by stack_init(). */
STATE_ROOT,
/* Inside a TSLX region: content and opening tags are recognised here. */
STATE_TSLX,
/* Between a statement start tag and its `?>`. */
STATE_TSL_STATEMENT,
/* Between `<?=` and its `?>`. */
STATE_TSL_EXPRESSION
};
/*
 * Growable stack of lexing states; this is the scanner payload handed back
 * and forth between the tree-sitter callbacks in this file.
 */
typedef struct
{
/* Heap buffer holding the states; NULL after stack_free() or a failed allocation. */
enum State* states;
/* Number of elements `states` has room for. */
size_t capacity;
/* Number of elements currently in use; the top of the stack is states[size - 1]. */
size_t size;
} StateStack;
/*
 * Prepare `stack` for use: allocate room for 8 states and seed the stack
 * with STATE_ROOT. If the allocation fails, the stack is left empty
 * (size 0, states NULL) while capacity still reads 8.
 */
static void stack_init(StateStack* stack)
{
    enum State* storage;

    stack->size = 0;
    stack->capacity = 8;

    storage = malloc(stack->capacity * sizeof(enum State));
    stack->states = storage;
    if (storage == NULL)
    {
        return;
    }

    /* The bottom entry is always the root state. */
    storage[0] = STATE_ROOT;
    stack->size = 1;
}
/*
 * Release the state buffer and reset every field, leaving `stack` empty
 * (states NULL, size 0, capacity 0). The StateStack object itself is not freed.
 */
static void stack_free(StateStack* stack)
{
    /* free(NULL) is a no-op, so no guard is required. */
    free(stack->states);
    stack->states = NULL;
    stack->capacity = 0;
    stack->size = 0;
}
/*
 * Push `state` on top of `stack`, doubling the buffer when it is full.
 *
 * The push is silently dropped when the stack has no buffer (a previous
 * allocation failed or the stack was freed) or when growing fails. On a
 * failed grow the existing buffer, size and capacity are left untouched, so
 * the stack stays valid and nothing is leaked.
 */
static void stack_push(StateStack* stack, enum State state)
{
    if (stack->states == NULL)
    {
        return;
    }

    if (stack->size >= stack->capacity)
    {
        size_t new_capacity = stack->capacity ? stack->capacity * 2 : 8;
        enum State* grown;

        /* Guards against wrap-around of the doubled capacity. */
        if (new_capacity <= stack->size)
        {
            return;
        }

        /* Keep the old pointer until realloc is known to have succeeded. */
        grown = realloc(stack->states, new_capacity * sizeof *grown);
        if (grown == NULL)
        {
            return;
        }
        stack->states = grown;
        stack->capacity = new_capacity;
    }

    stack->states[stack->size++] = state;
}
/*
 * Remove and return the top state. The bottom entry is never removed: when
 * one or zero entries remain, the stack is left as is and STATE_ROOT is
 * returned.
 */
static enum State stack_pop(StateStack* stack)
{
    if (stack->size <= 1)
    {
        return STATE_ROOT;
    }

    stack->size -= 1;
    return stack->states[stack->size];
}
/*
 * Return the state on top of the stack without removing it, or STATE_ROOT
 * when the stack is empty.
 */
static enum State stack_top(const StateStack* stack)
{
    if (stack->size == 0)
    {
        return STATE_ROOT;
    }
    return stack->states[stack->size - 1];
}
/*
 * Allocate the scanner payload: a StateStack initialised by stack_init().
 * Returns NULL if the StateStack itself cannot be allocated.
 */
void* tree_sitter_tsf_external_scanner_create(void)
{
    StateStack* scanner_state = malloc(sizeof(StateStack));

    if (scanner_state != NULL)
    {
        stack_init(scanner_state);
    }

    LOG("Scanner created\n");
    return scanner_state;
}
/*
 * Release a payload obtained from the create function: first the state
 * buffer, then the StateStack itself. A NULL payload is accepted.
 */
void tree_sitter_tsf_external_scanner_destroy(void* payload)
{
    StateStack* scanner_state = payload;

    if (scanner_state != NULL)
    {
        stack_free(scanner_state);
        free(scanner_state);
    }

    LOG("Scanner destroyed\n");
}
/*
 * Copy the state stack into `buffer` as a flat array of `enum State` values,
 * bottom of the stack first.
 *
 * At most TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes are written, and only
 * whole entries are copied. Returns the number of bytes written, or 0 when
 * there is nothing to serialize (NULL payload, no state buffer, empty stack).
 */
unsigned tree_sitter_tsf_external_scanner_serialize(void* payload, char* buffer)
{
    StateStack* stack = (StateStack*)payload;
    /* states can be NULL with size > 0 if an allocation failed earlier. */
    if (!stack || !stack->states || stack->size == 0)
        return 0;

    /* Clamp the entry count first so the byte count cannot overflow. */
    size_t max_entries = TREE_SITTER_SERIALIZATION_BUFFER_SIZE / sizeof(enum State);
    size_t entries = stack->size < max_entries ? stack->size : max_entries;
    size_t bytes_to_copy = entries * sizeof(enum State);

    memcpy(buffer, stack->states, bytes_to_copy);
    return (unsigned)bytes_to_copy;
}
/*
 * Rebuild the state stack from bytes previously produced by the serialize
 * function.
 *
 * The stack is always reset to a single STATE_ROOT entry first. When
 * `length` holds at least one entry, the first entry replaces the root slot
 * and each remaining entry is pushed in order. Trailing bytes that do not
 * form a whole entry are ignored. If the state buffer cannot be allocated,
 * the stack is left empty.
 */
void tree_sitter_tsf_external_scanner_deserialize(void* payload, const char* buffer, unsigned length)
{
    StateStack* stack = (StateStack*)payload;
    if (!stack)
        return;

    stack_free(stack);
    stack_init(stack);

    /* stack_init leaves states NULL when malloc fails; do not write through it. */
    if (!stack->states || !buffer)
        return;

    size_t count = length / sizeof(enum State);
    /* No capacity bound here: stack_push grows the buffer as needed. */
    for (size_t i = 0; i < count; i++)
    {
        enum State state;
        memcpy(&state, buffer + i * sizeof(enum State), sizeof state);
        if (i == 0)
        {
            /* Slot 0 already exists (seeded with STATE_ROOT); overwrite it. */
            stack->states[0] = state;
        }
        else
        {
            stack_push(stack, state);
        }
    }
}
/*
 * Advance past every character for which iswspace() is true. Each advance is
 * made with the second argument set to true (tree-sitter's "skip" flag).
 */
static void skip_whitespace(TSLexer* lexer)
{
    for (;;)
    {
        if (!iswspace(lexer->lookahead))
        {
            break;
        }
        lexer->advance(lexer, true);
    }
}
/*
 * Scan forward from the current position to decide whether the tag that was
 * just opened is closed.
 *
 * Returns true as soon as `?>` is seen (the lexer is left on the `>`).
 * Returns false when another `<?` is seen first, or when the end of input is
 * reached. The lexer is advanced while searching; callers that need a
 * shorter token must call mark_end() beforehand.
 *
 * End of input is detected with lexer->eof() rather than by comparing the
 * lookahead with 0, because a NUL character can legitimately occur in the
 * text being parsed.
 */
static bool lookahead_for_close_tag(TSLexer* lexer)
{
    LOG(" [lookahead_for_close_tag] Starting lookahead\n");
    while (!lexer->eof(lexer))
    {
        bool was_question = lexer->lookahead == '?';
        bool was_open = lexer->lookahead == '<';

        /* Every branch consumes exactly one character before inspecting the next. */
        lexer->advance(lexer, false);

        if (was_question && lexer->lookahead == '>')
        {
            LOG(" [lookahead_for_close_tag] Found ?>, returning true\n");
            return true;
        }
        if (was_open && lexer->lookahead == '?')
        {
            LOG(" [lookahead_for_close_tag] Found next <?, returning false\n");
            return false;
        }
    }
    LOG(" [lookahead_for_close_tag] Reached EOF, returning false\n");
    return false;
}
/*
 * Check whether the text at the current position opens a special tag.
 *
 * Returns 1 for `<?=`, 2 for `<?tsl` (letters matched case-insensitively and
 * followed by a character that is neither alphanumeric nor `_`), and 0
 * otherwise.
 *
 * The lexer is advanced past every character that matched, even when the
 * result is 0. For `<?=` it stops on the `=`.
 */
static int peek_special_tag(TSLexer* lexer)
{
    static const char tsl_keyword[] = "tsl";
    const char* expected;

    if (lexer->lookahead != '<')
    {
        return 0;
    }
    lexer->advance(lexer, false);

    if (lexer->lookahead != '?')
    {
        return 0;
    }
    lexer->advance(lexer, false);

    if (lexer->lookahead == '=')
    {
        LOG(" [peek_special_tag] Found <?=\n");
        return 1;
    }

    /* Consume "tsl" one letter at a time, stopping at the first mismatch. */
    for (expected = tsl_keyword; *expected != '\0'; ++expected)
    {
        if (towlower(lexer->lookahead) != (wint_t)*expected)
        {
            return 0;
        }
        lexer->advance(lexer, false);
    }

    /* Reject longer identifiers that merely start with "tsl". */
    if (iswalnum(lexer->lookahead) || lexer->lookahead == '_')
    {
        return 0;
    }

    LOG(" [peek_special_tag] Found <?tsl\n");
    return 2;
}
/*
 * Consume plain TSLX content up to, but not including, the next `<?=` or
 * `<?tsl` tag, or up to the end of input.
 *
 * Returns true when at least one character of content was consumed. The
 * token end is marked just before the special tag, or at the end of input.
 * The caller sets result_symbol.
 *
 * End of input is detected with lexer->eof() rather than by comparing the
 * lookahead with 0, because a NUL character can legitimately occur in the
 * text being parsed.
 */
static bool scan_tslx_content(TSLexer* lexer)
{
    bool has_content = false;
    LOG(" [scan_tslx_content] Starting scan\n");
    while (!lexer->eof(lexer))
    {
        if (lexer->lookahead == '<')
        {
            /* Tentatively end the token here in case a special tag follows. */
            lexer->mark_end(lexer);
            int tag_type = peek_special_tag(lexer);
            if (tag_type > 0)
            {
                LOG(" [scan_tslx_content] Found special tag, stopping. has_content=%d\n", has_content);
                return has_content;
            }
            /* Not a tag: whatever peek_special_tag consumed is ordinary content. */
            has_content = true;
        }
        else
        {
            lexer->advance(lexer, false);
            has_content = true;
        }
    }
    /*
     * Once mark_end() has been called, later advances no longer extend the
     * token. Re-mark here so text after the last non-tag '<' is included.
     */
    lexer->mark_end(lexer);
    LOG(" [scan_tslx_content] Reached EOF, has_content=%d\n", has_content);
    return has_content;
}
/*
 * Try to recognise one external token at the current lexer position.
 *
 * payload       - the StateStack created by the create function.
 * lexer         - tree-sitter lexer; on success result_symbol is set to one
 *                 of the TokenType values.
 * valid_symbols - indexed by TokenType; true for symbols the parser accepts
 *                 at this point.
 *
 * Returns true when a token was recognised, false otherwise.
 *
 * Behaviour depends on the state on top of the stack:
 *  - STATE_ROOT: if the parser expects any token that only occurs inside a
 *    TSLX region, STATE_TSLX is pushed and scanning continues in that state.
 *  - STATE_TSLX: emits TSLX_END_TAG at end of input, TSL_EXPRESSION_START_TAG
 *    for `<?=`, TSL_STATEMENT_START_TAG for `<?tsl` with a `?>` ahead,
 *    TSLX_END_TAG for `<?tsl` without one, and TSLX_CONTENT otherwise.
 *  - STATE_TSL_STATEMENT / STATE_TSL_EXPRESSION: skips whitespace, then emits
 *    the matching end tag for `?>`.
 */
bool tree_sitter_tsf_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols)
{
    StateStack* stack = (StateStack*)payload;
    if (!stack)
    {
        LOG("ERROR: stack is NULL\n");
        return false;
    }
    enum State current_state = stack_top(stack);
    LOG("\n=== SCAN START ===\n");
    LOG("Current state: %d (0=ROOT, 1=TSLX, 2=TSL_STATEMENT, 3=TSL_EXPRESSION)\n", current_state);
    LOG("Current char: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead);
    LOG("Valid symbols: CONTENT=%d, TSL_START=%d, TSL_END=%d, EXPR_START=%d, EXPR_END=%d, TSLX_END=%d\n",
        valid_symbols[TSLX_CONTENT],
        valid_symbols[TSL_STATEMENT_START_TAG],
        valid_symbols[TSL_STATEMENT_END_TAG],
        valid_symbols[TSL_EXPRESSION_START_TAG],
        valid_symbols[TSL_EXPRESSION_END_TAG],
        valid_symbols[TSLX_END_TAG]);

    /*
     * Key fix: infer the state transition from valid_symbols.
     * If we are still in ROOT but the parser expects a token that only occurs
     * inside a TSLX region, the `<?tslx>` opener has just been matched.
     */
    if (current_state == STATE_ROOT &&
        (valid_symbols[TSLX_CONTENT] ||
         valid_symbols[TSL_STATEMENT_START_TAG] ||
         valid_symbols[TSL_EXPRESSION_START_TAG] ||
         valid_symbols[TSLX_END_TAG]))
    {
        LOG("State transition: ROOT -> TSLX (inferred from valid_symbols)\n");
        stack_push(stack, STATE_TSLX);
        current_state = STATE_TSLX;
    }

    if (current_state == STATE_TSL_STATEMENT || current_state == STATE_TSL_EXPRESSION)
    {
        skip_whitespace(lexer);
        LOG("After skip whitespace: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead);
    }

    if (current_state == STATE_TSLX)
    {
        /*
         * Set once characters have been consumed while probing for a tag.
         * If no tag token is returned, those characters are content even
         * when scan_tslx_content() itself finds nothing more to consume
         * (e.g. "<" directly followed by a real tag, or a trailing "<").
         */
        bool probe_consumed_text = false;

        LOG("In STATE_TSLX\n");
        /* Use eof(): a lookahead of 0 may also be a NUL character in the text. */
        if (lexer->eof(lexer) && valid_symbols[TSLX_END_TAG])
        {
            LOG("Found EOF, returning TSLX_END_TAG\n");
            lexer->mark_end(lexer);
            lexer->result_symbol = TSLX_END_TAG;
            stack_pop(stack);
            return true;
        }
        if (lexer->lookahead == '<')
        {
            LOG("Found '<', checking for tag\n");
            lexer->advance(lexer, false);
            probe_consumed_text = true;
            if (lexer->lookahead == '?')
            {
                LOG("Found '<?', checking tag type\n");
                lexer->advance(lexer, false);
                if (lexer->lookahead == '=' && valid_symbols[TSL_EXPRESSION_START_TAG])
                {
                    LOG("Found '<?=', returning TSL_EXPRESSION_START_TAG\n");
                    lexer->advance(lexer, false);
                    lexer->mark_end(lexer);
                    lexer->result_symbol = TSL_EXPRESSION_START_TAG;
                    stack_push(stack, STATE_TSL_EXPRESSION);
                    return true;
                }
                if (towlower(lexer->lookahead) == 't')
                {
                    LOG("Found 't', checking for 'tsl'\n");
                    lexer->advance(lexer, false);
                    if (towlower(lexer->lookahead) == 's')
                    {
                        lexer->advance(lexer, false);
                        if (towlower(lexer->lookahead) == 'l')
                        {
                            lexer->advance(lexer, false);
                            LOG("Found 'tsl', next char: '%c' (0x%02x)\n", lexer->lookahead, lexer->lookahead);
                            /* Only a bare `tsl` counts; `<?tslx`, `<?tsl_foo` etc. are content. */
                            if (!iswalnum(lexer->lookahead) && lexer->lookahead != '_')
                            {
                                LOG("Valid <?tsl delimiter, performing lookahead\n");
                                /* Fix the token end at `<?tsl` before looking ahead for `?>`. */
                                lexer->mark_end(lexer);
                                bool has_close_tag = lookahead_for_close_tag(lexer);
                                LOG("Lookahead result: has_close_tag = %d\n", has_close_tag);
                                if (has_close_tag && valid_symbols[TSL_STATEMENT_START_TAG])
                                {
                                    LOG("Returning TSL_STATEMENT_START_TAG\n");
                                    lexer->result_symbol = TSL_STATEMENT_START_TAG;
                                    stack_push(stack, STATE_TSL_STATEMENT);
                                    return true;
                                }
                                else if (!has_close_tag && valid_symbols[TSLX_END_TAG])
                                {
                                    LOG("Returning TSLX_END_TAG (<?tsl without ?>)\n");
                                    lexer->result_symbol = TSLX_END_TAG;
                                    stack_pop(stack);
                                    return true;
                                }
                            }
                        }
                    }
                }
            }
        }
        if (valid_symbols[TSLX_CONTENT])
        {
            LOG("Trying to scan TSLX_CONTENT\n");
            /* scan_tslx_content() must run first: it advances and marks the token end. */
            if (scan_tslx_content(lexer) || probe_consumed_text)
            {
                LOG("Successfully scanned TSLX_CONTENT\n");
                lexer->result_symbol = TSLX_CONTENT;
                return true;
            }
            LOG("Failed to scan TSLX_CONTENT\n");
        }
    }

    if (current_state == STATE_TSL_STATEMENT)
    {
        LOG("In STATE_TSL_STATEMENT\n");
        if (lexer->lookahead == '?' && valid_symbols[TSL_STATEMENT_END_TAG])
        {
            lexer->advance(lexer, false);
            if (lexer->lookahead == '>')
            {
                LOG("Found '?>', returning TSL_STATEMENT_END_TAG\n");
                lexer->advance(lexer, false);
                lexer->mark_end(lexer);
                lexer->result_symbol = TSL_STATEMENT_END_TAG;
                stack_pop(stack);
                return true;
            }
        }
    }

    if (current_state == STATE_TSL_EXPRESSION)
    {
        LOG("In STATE_TSL_EXPRESSION\n");
        if (lexer->lookahead == '?' && valid_symbols[TSL_EXPRESSION_END_TAG])
        {
            lexer->advance(lexer, false);
            if (lexer->lookahead == '>')
            {
                LOG("Found '?>', returning TSL_EXPRESSION_END_TAG\n");
                lexer->advance(lexer, false);
                lexer->mark_end(lexer);
                lexer->result_symbol = TSL_EXPRESSION_END_TAG;
                stack_pop(stack);
                return true;
            }
        }
    }

    LOG("Returning false (no match)\n");
    return false;
}