词法分析
有些人认为 Lua 的正则表达式有限制。Lua 的妙处在于它很容易添加额外的功能。我们不需要花哨的正则表达式,因为我们可以轻松地使用 Peter Bumbulis 的 re2c [1] 添加词法分析器。
这是一个识别 Lua 5.0 语法和关键字的词法扫描器。函数 LexLua 接受要扫描的字符串并返回一个执行扫描的函数。返回的函数是一个闭包,它绑定到 C 函数 scan 和两个上值:要扫描的字符串和一个用于跟踪状态的用户数据。每次调用它时,它都会返回下一个标记,或者在到达字符串末尾时返回 nil。
re2c 会把特殊注释中的正则表达式替换为扫描器代码。以下是 re2c 的输入 [2] 和输出 [3]。
/*
==============================================================================
LexLua.c
==============================================================================
*/
#include "lua.h"
#include "lauxlib.h"
/*
 * Token-type tags returned to Lua as the second result of the scan closure.
 * They are private to this translation unit and never reassigned, so they are
 * declared static and fully const: generic identifiers such as `name` must not
 * be exported from a shared library where they could clash with other symbols.
 */
static const char *const name = "<name>";
static const char *const number = "<number>";
static const char *const literal = "<literal>";
/*
 * re2c configuration macros.  They map the names used by the generated
 * scanner code onto the local variables cursor, marker and limit that scan()
 * declares.  YYFILL expands to nothing: the input is a single in-memory
 * string, so there is no buffer to refill.
 * NOTE(review): YYCTYPE is plain char, whose signedness is implementation-
 * defined; bytes >= 0x80 may be classified unexpectedly by the generated
 * comparisons -- confirm against the re2c output for your platform.
 */
#define YYCTYPE char
#define YYCURSOR cursor
#define YYMARKER marker
#define YYLIMIT limit
#define YYFILL(n)
/*
 * save_state(i, s, c, m, l): store the scanner pointers c (cursor),
 * m (marker) and l (limit) into the state object *s as integer offsets
 * relative to the start of the input i.
 *
 * Wrapped in do { } while (0) so that `save_state(...);` behaves as exactly
 * one statement, including inside an unbraced if/else.  The pointer
 * differences are ptrdiff_t; the narrowing to the int fields is made explicit.
 */
#define save_state(i,s,c,m,l) do { \
    (s)->cursor = (int)((c)-(i)); \
    (s)->marker = (int)((m)-(i)); \
    (s)->limit = (int)((l)-(i)); \
} while (0)
/*
 * Scanner state kept in a Lua userdata between calls of the scan closure.
 * All three fields are offsets from the start of the input string.
 */
typedef struct Scanner {
    int cursor;  /* offset of the next character to scan */
    int marker;  /* offset saved for YYMARKER */
    int limit;   /* offset stored as YYLIMIT (the input length) */
} Scanner;
/*
 * Return the value at stack position `index` as a Scanner pointer.
 * luaL_check_type is called first to verify that the value is a userdata.
 */
static Scanner *check_Scanner(lua_State *L, int index)
{
    void *ud;

    luaL_check_type(L, index, LUA_TUSERDATA);
    ud = lua_touserdata(L, index);
    return (Scanner *)ud;
}
/*
 * scan: the C function behind each closure created by scanner().
 *
 * Upvalue 1 is the string being scanned and upvalue 2 is the Scanner
 * userdata, which holds cursor/marker/limit as integer offsets into that
 * string.  Each call rebuilds pointers from the offsets, matches one token,
 * writes the offsets back with save_state and returns three values:
 *   token, type, ws   -- type is "<name>", "<number>", "<literal>" or nil
 *   nil,   nil,  ws   -- when the END (NUL) character is matched
 * ws is the text between white_space and token, i.e. what was skipped
 * (white space and comments) before the token.
 */
static int scan (lua_State *L)
{
const char *input = luaL_check_string(L, lua_upvalueindex(1));
Scanner *state = check_Scanner(L,lua_upvalueindex(2));
/* Convert the saved offsets back into pointers into the input string. */
char *cursor = (char*)input + state->cursor;
char *marker = (char*)input + state->marker;
char *limit = (char*)input + state->limit;
char *white_space, *token;
/* Type tag for the token; stays 0 (pushed as nil) unless a rule sets it. */
const char *ret = 0;
/* Depth of nested [[ ... ]] pairs inside a long string or long comment. */
int nest_count = 0;
/* Named regular-expression definitions shared by the rule blocks below. */
/*!re2c
D = [0-9] ;
E = [Ee] [+-]? D+ ;
L = [a-zA-Z_] ;
NUMBER = ( D+ | D* "." D+ | D+ "." D* ) E? ;
WS = [ \t\n\v\f]+ ;
LF = [\n] ;
END = [\000] ;
ANY = [\000-\377] \ END ;
ESC = [\\] ;
SQ = ['] ;
DQ = ["] ;
STRING1 = SQ ( ANY \ SQ \ ESC | ESC ANY )* SQ ;
STRING2 = DQ ( ANY \ DQ \ ESC | ESC ANY )* DQ ;
*/
Begin:
white_space = cursor; /* start of white space */
/*
 * The white-space and comment rules jump back to Space, which moves token
 * forward while white_space stays put; the skipped text accumulates in
 * [white_space, token).
 */
Space:
token = cursor; /* start of token */
/*
 * Main rule block: keywords and multi-character operators return with
 * ret == 0; names, numbers and quoted strings set ret to their type tag;
 * any other single character is returned as its own token.
 */
/*!re2c
WS { goto Space; }
"--[[" { nest_count=0; goto LongComment; }
"--" | "#" { goto Comment; }
"and" { goto Return; }
"break" { goto Return; }
"do" { goto Return; }
"else" { goto Return; }
"elseif" { goto Return; }
"end" { goto Return; }
"false" { goto Return; }
"for" { goto Return; }
"function" { goto Return; }
"global" { goto Return; }
"if" { goto Return; }
"in" { goto Return; }
"local" { goto Return; }
"nil" { goto Return; }
"not" { goto Return; }
"or" { goto Return; }
"repeat" { goto Return; }
"return" { goto Return; }
"then" { goto Return; }
"true" { goto Return; }
"until" { goto Return; }
"while" { goto Return; }
"..." { goto Return; }
".." { goto Return; }
"==" { goto Return; }
">=" { goto Return; }
"<=" { goto Return; }
"~=" { goto Return; }
"[[" { nest_count=0; goto LongString; }
L ( L | D )* { ret = name; goto Return; }
NUMBER { ret = number; goto Return; }
STRING1 { ret = literal; goto Return; }
STRING2 { ret = literal; goto Return; }
ANY { goto Return; }
END { goto TheEnd; }
*/
/*
 * Body of a [[ ... ]] long string.  Nested [[ ]] pairs are counted so that
 * only the matching outermost ]] ends the literal; token still points at the
 * opening [[, so the whole literal is returned.
 */
LongString:
/*!re2c
"[[" { nest_count++; goto LongString; }
"]]" { if( nest_count == 0 ) { ret = literal; goto Return; }
nest_count--; goto LongString; }
ANY { goto LongString; }
END { luaL_error(L,"unfinished long string"); }
*/
/* Rest of a "--" or "#" comment: consume up to the line feed, then resume at Space. */
Comment:
/*!re2c
( ANY \ LF )* { goto Space; }
END { goto TheEnd; }
*/
/* Body of a --[[ ... ]] comment; nesting is tracked the same way as for long strings. */
LongComment:
/*!re2c
"[[" { nest_count++; goto LongComment; }
"]]" { if( nest_count == 0 ) goto Space;
nest_count--; goto LongComment; }
ANY { goto LongComment; }
END { luaL_error(L,"unfinished long comment"); }
*/
/* Every rule above jumps away or raises an error; falling through to here is unexpected. */
luaL_error(L,"impossible"); /* die */
TheEnd:
/*
 * Step back over the END character that was just matched and verify that it
 * sits exactly at limit (presumably this rejects a NUL embedded earlier in
 * the input -- confirm).  Stepping back also means the saved cursor points at
 * the END character again for any later call.
 */
if( --cursor != limit ) luaL_error(L,"didn't reach end of input"); /* die */
lua_pushnil(L);
lua_pushnil(L);
lua_pushlstring(L, white_space, token - white_space );
save_state(input,state,cursor,marker,limit);
return 3; /* nil, nil, ws */
Return:
/* Push the matched text, its type tag (or nil) and the skipped text before it. */
lua_pushlstring(L, token, cursor - token );
if( ret ) lua_pushstring(L, ret );
else lua_pushnil(L);
lua_pushlstring(L, white_space, token - white_space );
save_state(input,state,cursor,marker,limit);
return 3; /* token, type, ws */
}
/*
 * scanner: implementation of the Lua global LexLua(input).
 *
 * Checks that argument 1 is a string, creates a Scanner userdata positioned
 * at the start of that string, and returns a closure over the C function
 * scan with two upvalues: the string and the userdata.
 *
 * Returns 1 (the closure).  A Lua error is raised by luaL_check_lstr when
 * argument 1 is not a string.
 */
static int scanner (lua_State *L)
{
    Scanner *s;
    size_t len;  /* luaL_check_lstr reports the length through a size_t*, not an int* */

    luaL_check_lstr(L, 1, &len);
    /* Discard any extra arguments so that the two topmost stack slots are
       exactly the input string and the userdata pushed next; otherwise
       lua_pushcclosure would capture the wrong values as upvalues. */
    lua_settop(L, 1);
    s = (Scanner*)lua_newuserdata(L, sizeof(Scanner));
    s->cursor = 0;
    s->marker = 0;
    s->limit = (int)len;  /* Scanner stores offsets as int */
    lua_pushcclosure(L, scan, 2); /* string, userdata */
    return 1;
}
/*
 * Library entry point: registers the C function scanner as the Lua global
 * "LexLua".  Always returns 0, i.e. no values are left for the caller.
 */
int openLexLua (lua_State *L)
{
    static const char global_name[] = "LexLua";
    lua_CFunction constructor = scanner;

    lua_register(L, global_name, constructor);
    return 0;
}
此代码可以按如下方式编译成一个 Unix 共享库:
re2c -s LexLua.c > lex.c
gcc -fPIC -g -c lex.c -o lexlua.o
gcc -g -shared -Wl,-soname,liblexlua.so -o liblexlua.so.1.0.0 lexlua.o -L/usr/local/lib/ -llua -llualib
su
cp liblexlua.so.1.0.0 /usr/local/lib
cd /usr/local/lib
ln -s liblexlua.so.1.0.0 liblexlua.so
ldconfig -v /usr/local/lib
$ lua
Lua 5.0 (alpha) Copyright (C) 1994-2002 Tecgraf, PUC-Rio
> assert(loadlib('/usr/local/lib/liblexlua.so','openLexLua'))()
> for tok, tt in LexLua[[ for i = 1,10 do print(i*2) end ]] do print(tok,tt) end
for nil
i <name>
= nil
1 <number>
, nil
10 <number>
do nil
print <name>
( nil
i <name>
* nil
2 <number>
) nil
end nil
>
有关如何为 Lua 代码添加颜色的示例,请参见 [4] 或 LuaToHtml