词法分析

lua-users home
wiki

描述

有些人认为 Lua 的正则表达式(模式匹配)功能有限。Lua 的妙处在于很容易为它添加额外的功能:我们不需要花哨的正则表达式,因为可以轻松地借助 Peter Bumbulis 的 re2c [1] 添加一个词法分析器。

这是一个识别 Lua 5.0 语法和关键字的词法扫描器。函数 LexLua 接受要扫描的字符串,并返回一个执行扫描的函数。返回的函数是一个闭包,它把 C 函数 scan 与两个上值绑定在一起:要扫描的字符串,以及一个用于跟踪扫描状态的用户数据。每次调用时,它返回下一个标记;到达字符串末尾时返回 nil。

C 代码

re2c 会把特殊注释中的正则表达式替换为扫描器的代码。以下是 re2c 的输入 [2] 和输出 [3]。

/*
==============================================================================
  LexLua.c
==============================================================================
*/

#include "lua.h"
#include "lauxlib.h"

/* Type tags pushed as the second result of the scan closure for identifiers,
   numeric constants and string literals respectively.  They are file-local
   and immutable: with external linkage, symbols as generic as `name` and
   `number` would be exported from the shared library and could clash with
   (or be overwritten by) unrelated code. */
static const char *const name    = "<name>";
static const char *const number  = "<number>";
static const char *const literal = "<literal>";

/* Glue macros consumed by the code re2c generates for the rule blocks in
   scan(): they map re2c's generic names onto scan()'s local variables. */

/* Type of a single input symbol.
   NOTE(review): plain `char` may be signed, and the rules use the range
   [\000-\377]; bytes >= 0x80 may then compare as negative in the generated
   code.  Consider `unsigned char` -- TODO confirm against the re2c output. */
#define YYCTYPE  char
/* Pointer to the next unread input symbol. */
#define YYCURSOR cursor
/* Position re2c saves so it can back up to the last accepted match. */
#define YYMARKER marker
/* End of the input buffer. */
#define YYLIMIT  limit
/* Expands to nothing: scan() works on a string that is already completely
   in memory, so no refill step is provided. */
#define YYFILL(n)

/* Store the scanner's working pointers back into a Scanner as offsets.
     i        start of the input string
     s        pointer to the Scanner to update (evaluated three times, so
              pass an expression without side effects)
     c, m, l  current cursor, marker and limit pointers into the input
   The body is wrapped in do { } while (0) so that `save_state(...);` is a
   single statement and stays correct inside an unbraced if/else.  The
   pointer differences are narrowed to int explicitly because the Scanner
   fields are ints. */
#define save_state(i,s,c,m,l) do { \
  (s)->cursor = (int)((c)-(i)); \
  (s)->marker = (int)((m)-(i)); \
  (s)->limit  = (int)((l)-(i)); \
} while (0)

/* Scan position kept between calls of the scan closure.  Every field is an
   offset, in chars, from the start of the input string; scan() turns them
   back into pointers on entry and save_state() stores them again on exit. */
typedef struct Scanner {
  int cursor;   /* next unread character */
  int marker;   /* re2c backup position (YYMARKER) */
  int limit;    /* end of the input (YYLIMIT) */
} Scanner;

/* Fetch the Scanner stored at the given stack (or upvalue) index.
   luaL_check_type is used to validate that the value is a userdata; only
   the type is checked, so any userdata is accepted as a Scanner. */
static Scanner *check_Scanner(lua_State *L, int index)
{
  Scanner *found;

  luaL_check_type(L, index, LUA_TUSERDATA);
  found = (Scanner*)lua_touserdata(L, index);
  return found;
}

/*
 * Body of the closure returned by LexLua: scans one token per call.
 *
 * Upvalues:
 *   1  the input string being scanned
 *   2  a Scanner userdata holding cursor/marker/limit as offsets into it
 *
 * Always returns three Lua values:
 *   token  the token text, or nil once the end of the input is reached
 *   type   one of the tag strings name / number / literal, or nil for
 *          keywords, operators and any other single character
 *   ws     the text skipped before the token (white space and comments)
 *
 * Raises a Lua error for an unfinished long string or long comment, and
 * when a NUL byte is found before the recorded end of the input.
 */
static int scan (lua_State *L)
{
  const char *input = luaL_check_string(L, lua_upvalueindex(1));
  Scanner    *state = check_Scanner(L,lua_upvalueindex(2));
  /* Rebuild the working pointers from the offsets saved by the last call. */
  char *cursor = (char*)input + state->cursor;
  char *marker = (char*)input + state->marker;
  char *limit  = (char*)input + state->limit;
  char *white_space, *token;
  const char *ret = 0;   /* type tag to return; 0 means "push nil" */
  int nest_count = 0;    /* depth of nested [[ ]] pairs in long strings/comments */

/*!re2c

  D        = [0-9] ;
  E        = [Ee] [+-]? D+ ;
  L        = [a-zA-Z_] ;

  NUMBER   = ( D+ | D* "." D+ | D+ "." D* ) E? ;

  WS       = [ \t\n\v\f]+ ;
  LF       = [\n] ;
  END      = [\000] ;
  ANY      = [\000-\377] \ END ;

  ESC      = [\\] ;
  SQ       = ['] ;
  DQ       = ["] ;

  STRING1  = SQ ( ANY \ SQ \ ESC | ESC ANY )* SQ ;
  STRING2  = DQ ( ANY \ DQ \ ESC | ESC ANY )* DQ ;

*/

Begin:

  white_space = cursor; /* start of white space */

/* Re-entered after each run of white space or comment: only `token` is
   reset, so `white_space` keeps pointing at the start of everything that
   has been skipped so far. */
Space:

  token = cursor;       /* start of token */

/* Main state.  Keywords and multi-character operators return with ret
   still 0; identifiers, numbers and quoted strings set ret to their tag;
   any other single character is returned as a token of its own. */
/*!re2c

  WS               { goto Space; }
  "--[["           { nest_count=0; goto LongComment; }
  "--" | "#"       { goto Comment; }
  "and"            { goto Return; }
  "break"          { goto Return; }
  "do"             { goto Return; }
  "else"           { goto Return; }
  "elseif"         { goto Return; }
  "end"            { goto Return; }
  "false"          { goto Return; }
  "for"            { goto Return; }
  "function"       { goto Return; }
  "global"         { goto Return; }
  "if"             { goto Return; }
  "in"             { goto Return; }
  "local"          { goto Return; }
  "nil"            { goto Return; }
  "not"            { goto Return; }
  "or"             { goto Return; }
  "repeat"         { goto Return; }
  "return"         { goto Return; }
  "then"           { goto Return; }
  "true"           { goto Return; }
  "until"          { goto Return; }
  "while"          { goto Return; }
  "..."            { goto Return; }
  ".."             { goto Return; }
  "=="             { goto Return; }
  ">="             { goto Return; }
  "<="             { goto Return; }
  "~="             { goto Return; }
  "[["             { nest_count=0; goto LongString; }

  L ( L | D )*     { ret = name;    goto Return; }
  NUMBER           { ret = number;  goto Return; }
  STRING1          { ret = literal; goto Return; }
  STRING2          { ret = literal; goto Return; }

  ANY              { goto Return; }
  END              { goto TheEnd; }

*/

/* Inside [[ ... ]]: each nested "[[" raises nest_count and each "]]" lowers
   it; the "]]" seen at depth 0 closes the literal.  `token` still marks the
   opening "[[", so the whole literal is returned as one token.  Every rule
   action leaves via goto or luaL_error (expected not to return), so control
   should not fall through into the next label. */
LongString:

/*!re2c

  "[["             { nest_count++; goto LongString; }
  "]]"             { if( nest_count == 0 ) { ret = literal; goto Return; }
                     nest_count--; goto LongString; }

  ANY              { goto LongString; }
  END              { luaL_error(L,"unfinished long string"); }

*/

/* Line comment: consume up to (not including) the next line feed, then go
   back to Space so the comment is counted as skipped text.
   NOTE(review): if END is matched here, `token` still points at the comment
   marker, so the ws value returned by TheEnd appears to omit that final
   comment -- confirm against the generated scanner. */
Comment:

/*!re2c

  ( ANY \ LF )*    { goto Space; }
  END              { goto TheEnd; }

*/

/* Long comment "--[[ ... ]]": same nesting rules as LongString, but the
   closing "]]" resumes at Space so the comment becomes part of the skipped
   text instead of a token. */
LongComment:

/*!re2c

  "[["             { nest_count++; goto LongComment; }
  "]]"             { if( nest_count == 0 ) goto Space;
                     nest_count--; goto LongComment; }

  ANY              { goto LongComment; }
  END              { luaL_error(L,"unfinished long comment"); }

*/

  /* Only reached if control falls out of the rule block above, which none
     of its actions allow. */
  luaL_error(L,"impossible"); /* die */

/* The END rule consumed a NUL byte.  Step back onto it: if that position is
   not the recorded limit, the NUL was inside the input and scanning cannot
   continue.  Otherwise report end of input as nil, nil, ws. */
TheEnd:

  if( --cursor != limit ) luaL_error(L,"didn't reach end of input"); /* die */
  lua_pushnil(L);
  lua_pushnil(L);
  lua_pushlstring(L, white_space, token - white_space );
  save_state(input,state,cursor,marker,limit);
  return 3; /* nil, nil, ws */

/* Push the token text, its type tag (nil when ret is 0) and the skipped
   text before it, then save the position for the next call. */
Return:

  lua_pushlstring(L, token, cursor - token );
  if( ret ) lua_pushstring(L, ret );
  else lua_pushnil(L);
  lua_pushlstring(L, white_space, token - white_space );
  save_state(input,state,cursor,marker,limit);
  return 3; /* token, type, ws */
}

/*
 * Lua-callable constructor registered as LexLua: LexLua(input) -> function.
 *
 * Checks that argument 1 is a string, creates a Scanner userdata positioned
 * at the start of it, and returns a C closure over scan whose upvalues are
 * (1) the input string and (2) the Scanner.
 *
 * The Scanner stores the length as an int, so the string length is narrowed
 * to int here.
 */
static int scanner (lua_State *L)
{
  Scanner *s;
  size_t len;   /* luaL_check_lstr reports the length through a size_t*;
                   an int here is overrun where size_t is wider than int */

  luaL_check_lstr(L, 1, &len);
  /* Drop any extra arguments so the stack is exactly [string, userdata]
     when the closure captures its two upvalues. */
  lua_settop(L, 1);
  s = (Scanner*)lua_newuserdata(L, sizeof(Scanner));
  s->cursor = 0;
  s->marker = 0;
  s->limit  = (int)len;
  lua_pushcclosure(L, scan, 2); /* string, userdata */
  return 1;
}

/*
 * Library entry point (the symbol name passed to loadlib in the test
 * session): registers the scanner constructor under the global name
 * "LexLua".  Returns 0, i.e. no values are left for the Lua caller.
 */
int openLexLua (lua_State *L)
{
  lua_register(L, "LexLua", scanner);

  return 0;
}


编译代码

此代码可以按如下方式编译成一个 Unix 共享库:

re2c -s LexLua.c > lex.c
gcc -fPIC -g -c lex.c -o lexlua.o
gcc -g -shared -Wl,-soname,liblexlua.so -o liblexlua.so.1.0.0 lexlua.o -L/usr/local/lib/ -llua -llualib

su
cp liblexlua.so.1.0.0 /usr/local/lib
cd /usr/local/lib
ln -s liblexlua.so.1.0.0 liblexlua.so
ldconfig -v /usr/local/lib

Lua 测试代码

$ lua
Lua 5.0 (alpha)  Copyright (C) 1994-2002 Tecgraf, PUC-Rio
> assert(loadlib('/usr/local/lib/liblexlua.so','openLexLua'))()
> for tok, tt in LexLua[[ for i = 1,10 do print(i*2) end ]] do print(tok,tt) end
for     nil
i       <name>
=       nil
1       <number>
,       nil
10      <number>
do      nil
print   <name>
(       nil
i       <name>
*       nil
2       <number>
)       nil
end     nil
> 

有关如何为 Lua 代码着色的示例,请参见 LuaToHtml [4]。


最近更改 · 偏好设置
编辑 · 历史记录
最后编辑于 2008 年 8 月 6 日上午 9:41 GMT (差异)