Lime Parser Generator 0.1.0
Runtime-extensible LALR(1) parser with SIMD tokenization and LLVM JIT
Loading...
Searching...
No Matches
tokenize.h
1/*
2** SIMD-Accelerated SQL Tokenizer
3**
4** Provides a fast tokenizer for SQL input using SIMD-accelerated character
5** classification. The tokenizer uses the character classification functions
6** from tokenize_simd.h to skip whitespace and scan identifiers/numbers in
7** bulk, then falls back to scalar logic for operators and punctuation.
8**
9** Integration points:
10** - TokenTable (token_table.h) for keyword lookup
11** - CharClassVector (tokenize_simd.h) for parallel character classification
12*/
13#ifndef TOKENIZE_H
14#define TOKENIZE_H
15
16#include <stdint.h>
17#include <stddef.h>
18#include <stdbool.h>
19
20#ifdef __cplusplus
21extern "C" {
22#endif
23
24/* Forward declarations */
25typedef struct TokenTable TokenTable;
26
27/* ------------------------------------------------------------------ */
28/* Token representation */
29/* ------------------------------------------------------------------ */
30
34typedef struct Token { /* A single token returned by the tokenizer. */
35 int type; /* Token type code (keyword code or generic) */
36 const char *start; /* Pointer into source buffer. */
37 size_t length; /* Length in bytes. */
38 uint32_t line; /* 1-based line number */
39 uint32_t column; /* 1-based column number */
40} Token;
41
42/* Generic token type codes for non-keyword tokens.
43** Keyword tokens get their type from the TokenTable.
44** Apart from TK_EOF (0), these codes are negative so they cannot
45** collide with user-defined token codes (which are positive). */
46enum {
47 TK_EOF = 0, /* End of input; out->type when tokenizer_next() returns false */
48 TK_IDENTIFIER = -1, /* Unrecognized identifier */
49 TK_INTEGER = -2, /* Integer literal */
50 TK_FLOAT = -3, /* Floating point literal */
51 TK_STRING = -4, /* Single-quoted string literal */
52 TK_BLOB = -5, /* X'...' blob literal */
53 TK_LPAREN = -6, /* ( */
54 TK_RPAREN = -7, /* ) */
55 TK_SEMICOLON = -8, /* ; */
56 TK_COMMA = -9, /* , */
57 TK_DOT = -10, /* . */
58 TK_STAR = -11, /* * */
59 TK_PLUS = -12, /* + */
60 TK_MINUS = -13, /* - */
61 TK_SLASH = -14, /* / */
62 TK_PERCENT = -15, /* % */
63 TK_EQ = -16, /* = or == */
64 TK_NE = -17, /* != or <> */
65 TK_LT = -18, /* < */
66 TK_GT = -19, /* > */
67 TK_LE = -20, /* <= */
68 TK_GE = -21, /* >= */
69 TK_BITAND = -22, /* & */
70 TK_BITOR = -23, /* | */
71 TK_BITNOT = -24, /* ~ */
72 TK_LSHIFT = -25, /* << */
73 TK_RSHIFT = -26, /* >> */
74 TK_CONCAT = -27, /* || */
75 TK_DQUOTE_ID = -28, /* "quoted identifier" */
76 TK_BACKTICK_ID = -29, /* `backtick identifier` */
77 TK_BRACKET_ID = -30, /* [bracket identifier] */
78 TK_USTRING = -31, /* U&'...' Unicode escape string literal */
79 TK_ILLEGAL = -32, /* Unrecognized character */
80};
81
82/* ------------------------------------------------------------------ */
83/* Tokenizer state */
84/* ------------------------------------------------------------------ */
85
86typedef struct Tokenizer Tokenizer;
87
88/*
89** Create a new tokenizer over the given input buffer.
90** The input buffer must remain valid for the lifetime of the tokenizer.
91**
92** table:  keyword lookup table (may be NULL for identifier-only mode).
93** input:  NUL-terminated SQL input string; must have at least 32 bytes
94**         of readable memory past the end (e.g., zero-padded) for SIMD safety.
95** length: length of input in bytes (not including NUL terminator).
96**
97** Returns the new tokenizer, or NULL on allocation failure. Release it
98** with tokenizer_destroy().
99*/
100Tokenizer *tokenizer_create(TokenTable *table, const char *input, size_t length);
101
102/*
103** Destroy a tokenizer returned by tokenizer_create() and free its memory.
104** Passing NULL is safe.
105*/
106void tokenizer_destroy(Tokenizer *tok);
107
108/*
109** Extract the next token from the input into *out, consuming it.
110** Returns true if a token was produced, false at end-of-input.
111** On false return, out->type is TK_EOF.
112*/
113bool tokenizer_next(Tokenizer *tok, Token *out);
114
115/*
116** Peek at the next token without consuming it; the token is stored in *out.
117** Returns true if a token is available, false at end-of-input.
118*/
119bool tokenizer_peek(Tokenizer *tok, Token *out);
120
121/*
122** Return the tokenizer's current position in the input, as a byte offset.
123*/
124size_t tokenizer_position(const Tokenizer *tok);
125
126/*
127** Return the tokenizer's current line number (1-based).
128*/
129uint32_t tokenizer_line(const Tokenizer *tok);
130
131/*
132** Return the tokenizer's current column number (1-based).
133*/
134uint32_t tokenizer_column(const Tokenizer *tok);
135
136#ifdef __cplusplus
137}
138#endif
139
140#endif /* TOKENIZE_H */
Thread-safe token lookup table.
Definition token_table.h:40
A single token returned by the tokenizer.
Definition tokenize.h:34
uint32_t line
1-based line number
Definition tokenize.h:38
const char * start
Pointer into source buffer.
Definition tokenize.h:36
size_t length
Length in bytes.
Definition tokenize.h:37
uint32_t column
1-based column number
Definition tokenize.h:39
int type
Token type code (keyword code or generic)
Definition tokenize.h:35