Ex 模式

lua-users home
wiki

xpattern 是一个基于 Lua 模式并以 LuaPeg 风格的 Lua 模式匹配库。

描述

Lua 模式 [1] 是正则表达式的有限形式。与正则表达式不同,模式不支持对除单个字符集以外的任何内容进行分组和量化 [2] 操作。对于完整的正则表达式(或 Perl 风格的正则表达式)支持,您可以编译正则表达式模块,例如 lrexlib [3]。为了获得更大的灵活性,对于解析和词法分析应用程序,甚至对于更简单的情况,可以使用解析表达式语法 (PEG) 通过用 C 实现的 LuaPeg 库 [4]。

然而,有时 Lua 模式几乎足够了,因为我们可能希望在不编译任何额外的 C 代码的情况下,稍微扩展模式的功能。这就是这个模块的用武之地。这个模块在 Lua 模式之上构建扩展模式匹配。它采用以类似于 LPeg 的形式编写的模式表达式,并将其转换为包含 string.match 调用的 Lua 代码字符串,然后通过 loadstring 将其编译为 Lua 函数。这类似于您可能自己编写的代码,尽管这个模块使用代码生成对其进行了自动化。假设您预编译了重复使用的任何匹配对象,它应该具有类似的效率。

这是一个简单的例子

local M = require "xpattern"
local P = M.P
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()
local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first
assert(a == nil and b == 'ccc' and c == 'b' and d == 11)

它在内部生成此 Lua 函数

local match = ...  -- string.match
return function(s,pos)
  for pos1=(pos or 1),#s do
    local pos2
    local c1,c2,c3,c4
    local pos3 = pos1
    c1,pos3 = match(s, "^(b+)()", pos1)
    if not pos3 then
      c2,pos3 = match(s, "^(c+)()", pos1)
    end
    pos2 = pos3
    if pos2 then
      do
        local pos3=pos2
        pos2 = pos2
        while 1 do
          pos3 = match(s, "^[A-Z][a-z]()", pos3)
          if pos3 then
            pos2=pos3
          else break end
        end
      end
    end
    if pos2 then
      c3,c4,pos2 = match(s, "^(.)()()", pos2)
    end
    if pos2 then return c1,c2,c3,c4 end
  end
end

有关更多示例,请参见下面的测试套件。

状态

警告:此代码应被视为实验性的。它尚未完全开发或测试。如果您想在生产中使用它,请修复代码。

附录:xpattern.lua

-- xpattern.lua
-- Preliminary regular expression-like support in Lua
-- Uses Lua patterns as the core building block.
--
-- Implemented in pure Lua with code generation technique.
-- It translates an expression into a snippet of Lua code
-- having a series of `string.match` calls, which is then
-- compiled (via `loadstring`).
--
-- Like lpeg, does not support backtracking.
--
-- WARNING: This is experimental code.  The design and implementation
-- has not been thoroughly tested.
--
-- Version v20091021.
-- (c) 2008-2009 David Manura. Licensed under the same terms as Lua (MIT license).
-- Please post patches.

local M = {}

local string = string
local format = string.format
local match  = string.match
local assert = assert
local error  = error
local ipairs = ipairs
local loadstring   = loadstring
local setmetatable = setmetatable
local type   = type
local print  = print


-- Adds whitespace to string `s`.
-- Whitespace string `ws` (default to '' if omitted) is prepended to each line
-- of `s`.  Also ensures `s` is is terminated by a newline.
local function add_whitespace(s, ws)
  ws = ws or ''
  s = s:gsub('[^\r\n]+', ws .. '%1')
  if s:match('[^\r\n]$') then
    s = s .. '\n'
  end
  return s
end

-- Counts the number `count` of captures '()' in Lua pattern string `pat`.
local function count_captures(pat)
  local count = 0
  local pos = 1
  while pos <= #pat do
    local pos2 = pat:match('^[^%(%%%[]+()', pos)
    if pos2 then
      pos = pos2
    elseif pat:match('^%(', pos) then
      count = count + 1
      pos = pos + 1
    elseif pat:match('^%%b..', pos) then
      pos = pos + 3
    elseif pat:match('^%%.', pos) then
      pos = pos + 2
    else
      local pos2 = pat:match('^%[[^%]%%]*()', pos)
      if pos2 then
        pos = pos2
        while 1 do
          local pos2 = pat:match('^%%.[^%]%%]*()', pos)
          if pos2 then
            pos = pos2
          elseif pat:match('^%]', pos) then
            pos = pos + 1
            break
          else
            error('syntax', 2)
          end
        end
      else
        error('syntax', 2)
      end
    end
  end
  return count
end
M._count_captures = count_captures


-- Appends '()' to Lua pattern string `pat`.
local function pat_append_pos(pat)
  local prefix = pat:match'^(.*)%$$'
  pat = prefix and prefix .. '()$' or pat .. '()'
  return pat
end

-- Prepends '()' to Lua pattern string `pat`.
local function pat_prepend_pos(pat)
  local postfix = pat:match'^%^(.*)'
  pat = postfix and '^()' .. postfix or '()' .. pat
  return pat
end


-- Prepends '^' to Lua pattern string `pat`.
local function pat_prepend_carrot(pat)
  local postfix = pat:match'^%^(.*)'
  pat = postfix and pat or '^' .. pat
  return pat
end


-- Return a string listing pattern capture variables with indices `firstidx`
-- to `lastidx`.
-- Ex: code_vars(1,2) --> 'c1,c2'
local function code_vars(firstidx, lastidx)
  local code = ''
  for i=firstidx,lastidx do
    code = code .. (i == firstidx and '' or ',') .. 'c' .. i
  end
  return code
end


-- Metatable for expression objects
local epat_mt = {}
epat_mt.__index = epat_mt


-- Builds an extended pattern object `epat` from Lua string pattern `pat`.
local function pattern(pat)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    local ncaptures = count_captures(pat)
    local lvars =
      code_vars(totncaptures0+1, totncaptures0+ncaptures)
      .. (ncaptures == 0 and '' or ',') .. 'pos' .. destidx0
    local pat = pat_append_pos(pat)

    pat = pat_prepend_carrot(pat)

    local str = format('%q', pat)
    local code = lvars .. ' = match(s, ' .. str .. ', pos' .. srcidx0 .. ')\n'
    return code, ncaptures
  end
  epat.anchored = pat:sub(1,1) == '^'
  return epat
end


-- Generates code from pattern `anypat` (either Lua pattern string or extended
-- pattern object).
--  `anypat`    - either Lua pattern string or extended pattern object
--  `srcidx0`   - index of variable holding position to start matching at
--  `destidx0`  - index of variable holding position to store subsequent
--                match position at.  stores nil if no match
--  `totncaptures0` - number of captures prior to this match
--  `code`      - Lua code string (code) and number of
--  `ncaptures` - number of captures in pattern.
local function gen(anypat, srcidx0, destidx0, totncaptures0)
  if type(anypat) == 'string' then
    anypat = pat_prepend_carrot(anypat)
    anypat = pattern(anypat)
  end
  local code, ncaptures = anypat(srcidx0, destidx0, totncaptures0)
  return code, ncaptures
end


-- Creates a new extended pattern object `epat` that is the concatenation of
-- the given list (of size >= 0) of pattern objects.
-- Specify a single string argument to convert a Lua pattern to an extended
-- pattern object.
local function seq(...) -- epats...
  -- Ensure args are extended pattern objects.
  local epats = {...}
  for i=1,#epats do
    if type(epats[i]) == 'string' then
      epats[i] = pattern(epats[i])
    end
  end

  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0, ws)
    if #epats == 0 then
      return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0
    elseif #epats == 1 then
      return epats[1](srcidx0, destidx0, totncaptures0, ws)
    else
      local ncaptures = 0
      local pat_code, pat_ncaptures =
          gen(epats[1], srcidx0, destidx0, totncaptures0+ncaptures, true)
      ncaptures = ncaptures + pat_ncaptures
      local code = add_whitespace(pat_code, '')

      for i=2,#epats do
        local pat_code, pat_ncaptures =
            gen(epats[i], destidx0, destidx0, totncaptures0+ncaptures, true)
        ncaptures = ncaptures + pat_ncaptures
        code =
          code ..
          'if pos' .. destidx0 .. ' then\n' ..
            add_whitespace(pat_code, '  ') ..
          'end\n'
      end
      return code, ncaptures
    end
  end
  if epats[1] and epats[1].anchored then
    epat.anchored = true
  end
  return epat
end
M.P = seq


-- Creates new extended pattern object `epat` that is the alternation of the
-- given list of pattern objects `epats...`.
local function alt(...) -- epats...
  -- Ensure args are extended pattern objects.
  local epats = {...}
  for i=1,#epats do
    if type(epats[i]) == 'string' then
      epats[i] = pattern(epats[i])
    end
  end

  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    if #epats == 0 then
      return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0
    elseif #epats == 1 then
      return epats[1](srcidx0, destidx0, totncaptures0)
    else
      local ncaptures = 0
      local pat_code, pat_ncaptures =
          gen(epats[1], srcidx0, destidx0+1, totncaptures0+ncaptures, true)
      ncaptures = ncaptures + pat_ncaptures
      local code = 'local pos' .. destidx0+1 .. ' = pos' .. srcidx0 .. '\n' ..
                   add_whitespace(pat_code, '')

      for i=2,#epats do
        local pat_code, pat_ncaptures =
            gen(epats[i], srcidx0, destidx0+1, totncaptures0+ncaptures, true)
        ncaptures = ncaptures + pat_ncaptures
        code =
          code ..
          'if not pos' .. destidx0+1 .. ' then\n' ..
            add_whitespace(pat_code, '  ') ..
          'end\n'
      end
      code = code .. 'pos' .. destidx0 .. ' = pos' .. destidx0+1 .. '\n'
      return code, ncaptures
    end
  end
  return epat
end
M.alt = alt


-- Creates new extended pattern object `epat` that is zero or more repetitions
-- of the given pattern object `pat` (if one evaluates to false) or one or more
-- (if one evaluates to true).
local function star(pat, one)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = 'do\n' ..
                 '  local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'
    local pat_code, pat_ncaptures =
        gen(pat, destidx, destidx, totncaptures0+ncaptures, true)
    ncaptures = ncaptures + pat_ncaptures
    code = code ..
      (one and ('  pos' .. destidx0 .. ' = nil\n')
           or  ('  pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n')) ..
      '  while 1 do\n' ..
           add_whitespace(pat_code, '    ') ..
      '    if pos' .. destidx .. ' then\n' ..
      '      pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..
      '    else break end\n' ..
      '  end\n' ..
      'end\n'
    return code, ncaptures
  end
  return epat
end
M.star = star


-- Creates new extended pattern object `epat` that is zero or one of the
-- given pattern object `epat0`.
local function zero_or_one(epat0)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = 'do\n' ..
                 '  local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'
    local epat0_code, epat0_ncaptures =
        gen(epat0, destidx, destidx, totncaptures0+ncaptures, true)
    ncaptures = ncaptures + epat0_ncaptures
    code = code ..
      add_whitespace(epat0_code) ..
      '  if pos' .. destidx .. ' then\n' ..
      '    pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..
      '  else\n' ..
      '    pos' .. destidx0 .. '=pos' .. srcidx0 .. '\n' ..
      '  end\n' ..
      'end\n'
    return code, ncaptures
  end
  return epat
end
M.zero_or_one = zero_or_one


-- Gets Lua core code string `code` corresponding to pattern object `epat`
local function basic_code_of(epat)
  local pat_code, ncaptures = epat(1, 2, 0, true)
  local lvars = code_vars(1, ncaptures)

  if epat.anchored then
    local code =
      'local pos1=pos or 1\n' ..
      'local pos2\n' ..
      (lvars ~= '' and '  local ' .. lvars .. '\n' or '') ..
      add_whitespace(pat_code) .. '\n' ..
      'if pos2 then return ' .. (lvars ~= '' and lvars or 's:sub(pos1,pos2-1)') .. ' end\n'
    return code
  else
    local code =
        'for pos1=(pos or 1),#s do\n' ..
        '  local pos2\n'
    if lvars == '' then
      code =
        code ..
           add_whitespace(pat_code, '  ') ..
        '  if pos2 then return s:sub(pos1,pos2-1) end\n'
    else
      code =
        code ..
        '  local ' .. lvars .. '\n' ..
           add_whitespace(pat_code, '  ') ..
        '  if pos2 then return ' .. lvars .. ' end\n'
    end
    code =
        code ..
        'end\n'
    return code
  end
end
M.basic_code_of = basic_code_of


-- Gets Lua complete code string `code` corresponding to pattern object `epat`.
local function code_of(epat)
  local code =
    'local match = ...\n' ..
    'return function(s,pos)\n' ..
    add_whitespace(basic_code_of(epat), '  ') ..
    'end\n'
  return code
end
M.code_of = code_of


-- Compiles pattern object `epat` to Lua function `f`.
local function compile(epat)
  local code = code_of(epat)
  if M.debug then print('DEBUG:\n' .. code) end
  local f = assert(loadstring(code))(match)
  return f
end
M.compile = compile


-- operator for pattern matching
function epat_mt.__call(epat, ...)
  return epat.call(...)
end


-- operator for pattern alternation
function epat_mt.__add(a_epat, b_epat)
  return alt(a_epat, b_epat)
end


-- operator for pattern concatenation
function epat_mt.__mul(a_epat, b_epat)
  return seq(a_epat, b_epat)
end


-- operator for pattern repetition
function epat_mt.__pow(epat, n)
  if n == 0 then
    return star(epat)
  elseif n == 1 then
    return star(epat, true)
  elseif n == -1 then
    return zero_or_one(epat)
  else
    error 'FIX - unimplemented'
  end
end


-- IMPROVE design?
epat_mt.compile       = compile
epat_mt.basic_code_of = basic_code_of
epat_mt.code_of       = code_of


return M

附录:xpattern_test.lua

-- xpattern_test.lua - test suite for xpattern.lua

-- utility function: convert list of values to string.
local function str(...)
  local n = select('#', ...)
  local t = {...}
  for i=1,n do t[i] = tostring(t[i]) end
  return table.concat(t, ',')
end

local M = require "xpattern"
local P = M.P
M.debug = true

-- internal: _count_captures
assert(M._count_captures'' == 0)
assert(M._count_captures'a' == 0)
assert(not pcall(function() M._count_captures'%' end))
assert(M._count_captures'()' == 1)
assert(M._count_captures'%(%)' == 0)    -- %(
assert(M._count_captures'[()]' == 0)    -- () inside []
assert(M._count_captures'[%(%)]' == 0)  -- %( inside []
assert(M._count_captures'[%]()]' == 0)  -- %] inside []
assert(M._count_captures'[]()]' == 1)
assert(M._count_captures'%b()' == 0)    -- () on %b..
assert(M._count_captures'(()().())' == 4)   -- nested
-- more complex example
assert(M._count_captures'.(.%))[(]%(()' == 2)


-- simple matching
assert(str(P'':compile()('')) == '')
assert(str(P'':compile()('a')) == '')
assert(str(P'a':compile()('')) == '')
assert(str(P'a':compile()('a')) == 'a')
assert(str(P'a':compile()('ba')) == 'a')
assert(str(P'a+':compile()('baa')) == 'aa')

-- simple anchors
assert(str(P'^a+':compile()('aa')) == 'aa')
assert(str(P'^a+':compile()('baab')) == '') -- $ fail
assert(str(P'a+$':compile()('baa')) == 'aa')
assert(str(P'a+$':compile()('baab')) == '') -- $ fail

-- simple captures
assert(str(P'(a+)(b+)':compile()('baab')) == 'aa,b')
assert(str(P'^(a+)(b+)':compile()('baab')) == '')

-- simple sequences
local m = P():compile()
assert(str( m('') ) == '')
assert(str( m('a') ) == '')
local m = P(''):compile()
assert(str( m('') ) == '')
assert(str( m('a') ) == '')
local m = P('a', 'b', 'c'):compile()
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = (P'a' * P'b' * P'c'):compile() -- identical
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = P(P'a', 'b', P'c'):compile() -- identical
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = P(P'a+', 'b+', P'c+'):compile()
assert(str( m('.abaabcc.') ) == 'aabcc')

-- simple alternation
local m = (P'aa+' + P'bb+'):compile()
assert(str( m('abbaa') ) == 'bb')
local m = (P'aa+' + P'bb+' + P'cc+'):compile()
assert(str( m('abccaa') ) == 'cc')

-- simple combinations
local m = ((P'(a+)' + P'b(b*)') * P'(c+)()'):compile()
assert(str( m("aacccdd")) == 'aa,nil,ccc,6')
assert(str( m("bbcccdd")) == 'nil,b,ccc,6')
assert(str( m("bbdd")) == '')
print('?'..str( m("aabbcc")))
assert(str( m("aabbcc")) == 'nil,b,cc,7') -- alternative

-- simple replication (*)
local m = ( P'a'^0 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'aa')

-- replication (*)
local m = ( (P'a+' + P'b+')^0 ):compile()
assert(str(m'zabaabbc') == '')
assert(str(m'abaabb') == 'abaabb')
local m = ( (P'a+' * P'b+' + P'c+' * P'd+')^0 ):compile()
assert(str(m'aabbccddaa') == 'aabbccdd')
local m = ( P'aa'^0 * P'bb' * P'aa'^0 ):compile()
assert(str(m'aaccaaaabbaa') == 'aaaabbaa')

-- simple replication (+)
local m = ( P'a'^1 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'aa')

-- replacation (+)
local m = ( P'b' * P'a'^1 ):compile()
print(m'b')
assert(str(m'b') == '')
assert(str(m'ba') == 'ba')
assert(str(m'baab') == 'baa')

-- simple replication (?)
local m = ( P'a'^-1 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'a')

-- replication (?)
local m = ( P'c' * (P'a+' + P'b+')^-1 ):compile()
assert(str(m'caabb') == 'caa')


-- Some of these examples from Mastering Regular Expressions (MRE),
-- 2nd Ed. Jeffrey .Friedl.

-- MRE p.19
local m = ( P'^' * (P'From' + P'Subject' + P'Date') * P':%s*(.*)' ):compile()
assert(str(m('Subject: test')) == 'test')

-- MRE p.13
local m = ( (P'Geo' + P'Je') * P'ff' * (P're' + P'er') * P'y' ):compile()
assert(str(m'Jeffrey') == 'Jeffrey')
assert(str(m'Jeffery') == 'Jeffery')
assert(str(m'Geoffrey') == 'Geoffrey')
assert(str(m'Geoffery') == 'Geoffery')
assert(str(m'Jefery') == '')
assert(str(m'Geofferi') == '')
assert(str(m'GeoffrezGeoffery') == 'Geoffery') -- skips
assert(str(m'JefferzGeoffery') == 'Geoffery') -- skips
assert(str(m'GeoffJeffery') == 'Jeffery') -- skips

-- MRE p.24
local m = ( P'%$[0-9]+' * P'%.[0-9][0-9]'^-1 ):compile()
assert(str(m'$20.00') == '$20.00')
assert(str(m'$20') == '$20')
assert(str(m'$20.00.00') == '$20.00')

-- example
print 'example'
local M = require "xpattern"
local P = M.P
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()
local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first
assert(a == nil and b == 'ccc' and c == 'b' and d == 11)

-- example
local m = P('foo', P'bar'+P'baz', 'qux'):compile()
assert(str(m'afoobazfoobarquxbar', 'foobarqux'))
local m = P('^foo', P'bar'+P'baz', 'qux'):compile() -- anchored
assert(str(m'afoobazfoobarquxbar', ''))
assert(str(m'foobarquxbar', ''))

-- https://lua-users.lua.ac.cn/lists/lua-l/2009-10/msg00752.html
local m = (
  P'^' * ( ( P'ceil'+P'abs' +P'floor'+P'mod' +P'exp'+P'log'+P'pow'+
             P'sqrt'+P'acos'+P'asin' +P'atan'+P'cos'+P'sin'+P'tan'+
             P'deg' +P'rad' +P'random'
           ) * P'%('
           + P'[0-9%(%)%-%+%*%/%.%,]' + P'pi'
          )^1 * P'$'
):compile()
assert(m'cos(1+pi)' == 'cos(1+pi)')
assert(m'cos(1+p)' == nil) -- 'p'
assert(m'cos(12.3/2)+mod(2,3)' == 'cos(12.3/2)+mod(2,3)')
assert(m'cos(12.3/2)+mod(2,3) ' == nil) -- ' '
assert(m' cos(12.3/2)+mod(2,3)' == nil) -- ' '
assert(m'cos(12.3/2)+mod+2' == nil) -- no '('

print 'DONE'

更改

作者

DavidManura

另请参阅


最近更改 · 偏好设置
编辑 · 历史记录
最后编辑于 2010 年 1 月 7 日凌晨 2:27 GMT (差异)