词法分析 |
|
有些人认为 Lua 的正则表达式有限制。Lua 的妙处在于它很容易添加额外的功能。我们不需要花哨的正则表达式,因为我们可以轻松地使用 Peter Bumbulis 的 re2c
[1] 添加词法分析器。
这是一个识别 Lua 5.0 语法和关键字的词法扫描器。函数 LexLua
接受要扫描的字符串并返回一个执行扫描的函数。返回的函数是一个闭包,它绑定到 C 函数 scan
和两个上值:要扫描的字符串和一个用于跟踪状态的用户数据。每次调用它时,它都会返回下一个标记,或者在到达字符串末尾时返回 nil
。
re2c 会把特殊注释中的正则表达式替换为扫描器的代码。以下是 re2c 的输入 [2] 和输出 [3]。
/*
==============================================================================
  LexLua.c

  re2c input file for a Lua 5.0 lexical scanner.  Run it through re2c to
  obtain compilable C.  The library registers one global Lua function,
  LexLua(str), which returns an iterator closure over the tokens of str.
==============================================================================
*/

#include <stddef.h>

#include "lua.h"
#include "lauxlib.h"

/*
 * Token-class tags returned as the second result of the iterator.  Keywords,
 * operators and single-character tokens carry no tag (nil is returned).
 * File-local so that such generic identifiers are not exported from the
 * shared library.
 */
static const char *const name    = "<name>";
static const char *const number  = "<number>";
static const char *const literal = "<literal>";

/*
 * re2c interface macros.
 *
 * YYCTYPE is unsigned so that bytes >= 0x80 are compared as 128..255 by the
 * generated range tests; with a plain (possibly signed) char they would be
 * negative and could be misclassified.
 *
 * YYFILL is empty: the whole input is one in-memory string whose trailing
 * NUL byte is matched by the END rule, so there is never anything to refill.
 */
#define YYCTYPE   unsigned char
#define YYCURSOR  cursor
#define YYMARKER  marker
#define YYLIMIT   limit
#define YYFILL(n)

/*
 * Scanner position, kept in a userdata between calls.  Offsets relative to
 * the start of the input are stored rather than pointers; scan() rebuilds
 * the pointers from the string upvalue on every call.
 */
typedef struct Scanner {
  ptrdiff_t cursor, marker, limit;
} Scanner;

/* Store the current scan pointers in `s` as offsets from `input`. */
static void save_state (const char *input, Scanner *s, const char *cursor,
                        const char *marker, const char *limit) {
  s->cursor = cursor - input;
  s->marker = marker - input;
  s->limit  = limit  - input;
}

/*
 * Return the value at `index` as a Scanner*, raising a Lua error if it is
 * not a userdata.
 * NOTE(review): luaL_check_type / luaL_check_string / luaL_check_lstr are the
 * auxiliary-library names of the Lua version this file was written for;
 * confirm the spelling against the lauxlib.h you build with.
 */
static Scanner *check_Scanner (lua_State *L, int index) {
  luaL_check_type(L, index, LUA_TUSERDATA);
  return (Scanner*)lua_touserdata(L, index);
}

/*
 * Iterator body: scan one token.
 *
 * Upvalues: (1) the input string, (2) the Scanner userdata.
 *
 * Always returns three values:
 *   token  - text of the token, or nil once the end of input is reached
 *   type   - "<name>", "<number>" or "<literal>", otherwise nil
 *   ws     - the text skipped (white space and comments) before the token
 *
 * Raises a Lua error for an unfinished long string or long comment, and when
 * a NUL byte is found before the recorded end of the input.
 */
static int scan (lua_State *L) {
  const char *input = luaL_check_string(L, lua_upvalueindex(1));
  Scanner *state = check_Scanner(L, lua_upvalueindex(2));
  const char *cursor = input + state->cursor;
  const char *marker = input + state->marker;
  const char *limit  = input + state->limit;
  const char *white_space, *token;
  const char *ret = 0;   /* token-class tag; stays 0 for untagged tokens */
  int nest_count = 0;    /* nesting depth inside [[ ... ]] */

/*!re2c
  D       = [0-9] ;
  E       = [Ee] [+-]? D+ ;
  L       = [a-zA-Z_] ;
  NUMBER  = ( D+ | D* "." D+ | D+ "." D* ) E? ;
  WS      = [ \t\n\v\f\r]+ ;
  LF      = [\n] ;
  END     = [\000] ;
  ANY     = [\000-\377] \ END ;
  ESC     = [\\] ;
  SQ      = ['] ;
  DQ      = ["] ;
  STRING1 = SQ ( ANY \ SQ \ ESC | ESC ANY )* SQ ;
  STRING2 = DQ ( ANY \ DQ \ ESC | ESC ANY )* DQ ;
*/

  white_space = cursor;  /* start of white space */

Space:
  token = cursor;        /* start of token */

/*!re2c
  WS              { goto Space; }
  "--[["          { nest_count=0; goto LongComment; }
  "--" | "#"      { goto Comment; }
  "and"           { goto Return; }
  "break"         { goto Return; }
  "do"            { goto Return; }
  "else"          { goto Return; }
  "elseif"        { goto Return; }
  "end"           { goto Return; }
  "false"         { goto Return; }
  "for"           { goto Return; }
  "function"      { goto Return; }
  "global"        { goto Return; }
  "if"            { goto Return; }
  "in"            { goto Return; }
  "local"         { goto Return; }
  "nil"           { goto Return; }
  "not"           { goto Return; }
  "or"            { goto Return; }
  "repeat"        { goto Return; }
  "return"        { goto Return; }
  "then"          { goto Return; }
  "true"          { goto Return; }
  "until"         { goto Return; }
  "while"         { goto Return; }
  "..."           { goto Return; }
  ".."            { goto Return; }
  "=="            { goto Return; }
  ">="            { goto Return; }
  "<="            { goto Return; }
  "~="            { goto Return; }
  "[["            { nest_count=0; goto LongString; }
  L ( L | D )*    { ret = name;    goto Return; }
  NUMBER          { ret = number;  goto Return; }
  STRING1         { ret = literal; goto Return; }
  STRING2         { ret = literal; goto Return; }
  ANY             { goto Return; }
  END             { goto TheEnd; }
*/

LongString:
/*!re2c
  "[["            { nest_count++; goto LongString; }
  "]]"            { if( nest_count == 0 ) { ret = literal; goto Return; }
                    nest_count--; goto LongString; }
  ANY             { goto LongString; }
  END             { luaL_error(L,"unfinished long string"); }
*/

Comment:
/*!re2c
  ( ANY \ LF )*   { goto Space; }
  END             { goto TheEnd; }
*/

LongComment:
/*!re2c
  "[["            { nest_count++; goto LongComment; }
  "]]"            { if( nest_count == 0 ) goto Space;
                    nest_count--; goto LongComment; }
  ANY             { goto LongComment; }
  END             { luaL_error(L,"unfinished long comment"); }
*/

  /* Every rule above ends in a goto or raises an error. */
  luaL_error(L,"impossible"); /* die */

TheEnd:
  /* Step back onto the NUL that END consumed, so that every later call
     matches END again and keeps returning nil.  If that NUL is not the
     string's terminator, the input contained an embedded NUL byte. */
  if( --cursor != limit )
    luaL_error(L,"didn't reach end of input"); /* die */
  lua_pushnil(L);
  lua_pushnil(L);
  lua_pushlstring(L, white_space, (size_t)(token - white_space));
  save_state(input, state, cursor, marker, limit);
  return 3; /* nil, nil, ws */

Return:
  lua_pushlstring(L, token, (size_t)(cursor - token));
  if( ret )
    lua_pushstring(L, ret);
  else
    lua_pushnil(L);
  lua_pushlstring(L, white_space, (size_t)(token - white_space));
  save_state(input, state, cursor, marker, limit);
  return 3; /* token, type, ws */
}

/*
 * LexLua(str): create a token iterator for `str`.
 *
 * Returns one value, a C closure over scan() whose upvalues are the string
 * and a freshly initialised Scanner userdata.
 */
static int scanner (lua_State *L) {
  Scanner *s;
  size_t len = 0;   /* luaL_check_lstr stores a size_t, not an int */

  (void)luaL_check_lstr(L, 1, &len);   /* raises unless arg 1 is a string */
  lua_settop(L, 1);  /* drop extra arguments: the closure below captures the
                        top two stack slots, which must be string + userdata */
  s = (Scanner*)lua_newuserdata(L, sizeof(Scanner));
  s->cursor = 0;
  s->marker = 0;
  s->limit  = (ptrdiff_t)len;
  lua_pushcclosure(L, scan, 2); /* string, userdata */
  return 1;
}

/* Library entry point: registers the global function LexLua. */
int openLexLua (lua_State *L) {
  lua_register(L, "LexLua", scanner);
  return 0;
}
此代码可以编译成一个 Unix 共享库,如下所示:
re2c -s LexLua.c > lex.c gcc -fPIC -g -c lex.c -o lexlua.o gcc -g -shared -Wl,-soname,liblexlua.so -o liblexlua.so.1.0.0 lexlua.o -L/usr/local/lib/ -llua -llualib su cp liblexlua.so.1.0.0 /usr/local/lib cd /usr/local/lib ln -s liblexlua.so.1.0.0 liblexlua.so ldconfig -v /usr/local/lib
$ lua Lua 5.0 (alpha) Copyright (C) 1994-2002 Tecgraf, PUC-Rio > assert(loadlib('/usr/local/lib/liblexlua.so','openLexLua'))() > for tok, tt in LexLua[[ for i = 1,10 do print(i*2) end ]] do print(tok,tt) end for nil i <name> = nil 1 <number> , nil 10 <number> do nil print <name> ( nil i <name> * nil 2 <number> ) nil end nil >
有关如何为 Lua 代码添加颜色的示例,请参见 [4] 或 LuaToHtml