字符串修剪

lua-users home
wiki

在 Lua 中,有很多方法可以实现“修剪”函数 [1]

-- trim implementations

--- Trim leading and trailing whitespace using a single anchored gsub.
-- @param s string to trim
-- @return the trimmed string only; gsub's substitution count is discarded
function trim1(s)
   local trimmed = s:gsub("^%s*(.-)%s*$", "%1")
   return trimmed
end
-- from PiL2 20.4

--- Trim leading and trailing whitespace by capturing the lazily matched core.
-- The pattern always matches, so the capture is never nil for a string input.
-- @param s string to trim
-- @return the trimmed string
function trim2(s)
   local core = s:match("^%s*(.-)%s*$")
   return core
end
-- variant of trim1 (match)

--- Trim leading and trailing whitespace with two separate gsub passes.
-- The first pass strips the leading run, the second strips the trailing run.
-- @param s string to trim
-- @return the trimmed string (exactly one value)
function trim3(s)
   -- The outer parentheses truncate gsub's (string, count) result list so the
   -- substitution count does not leak to the caller as a second return value.
   return (s:gsub("^%s+", ""):gsub("%s+$", ""))
end
-- two gsub's

--- Trim leading and trailing whitespace with two match calls.
-- First drop the leading whitespace, then lazily capture everything up to the
-- trailing whitespace.
-- @param s string to trim
-- @return the trimmed string
function trim4(s)
   local without_lead = s:match("^%s*(.*)")
   return without_lead:match("(.-)%s*$")
end
-- variant of trim3 (match)

--- Trim leading and trailing whitespace with a greedy match that must end on
-- a non-space character.
-- @param s string to trim
-- @return the trimmed string, or '' when s contains no non-space character
function trim5(s)
   local body = s:match('^%s*(.*%S)')
   if body then
      return body
   end
   return ''
end
-- warning: has bad performance when s:match'^%s*$' and #s is large

--- Trim leading and trailing whitespace, handling all-whitespace input first.
-- @param s string to trim
-- @return the trimmed string ('' for empty or all-whitespace input)
function trim6(s)
   -- The position capture '()' only reports whether the whole string is
   -- whitespace; no substring is captured.
   if s:match('^()%s*$') then
      return ''
   end
   return s:match('^%s*(.*%S)')
end
-- fixes performance problem in trim5.
-- note: the '()' avoids the overhead of default string capture.
-- This overhead is small, ~ 10% for successful whitespace match call
-- alone, and may not be noticeable in the overall benchmarks here,
-- but there's little harm either.  Instead replacing the first `match`
-- with a `find` has a similar effect, but that requires localizing
-- two functions in the trim7 variant below.

local match = string.match
--- Trim leading and trailing whitespace; same approach as trim6 but calling
-- string.match through the file-level local `match`.
-- @param s string to trim
-- @return the trimmed string ('' for empty or all-whitespace input)
function trim7(s)
   if match(s, '^()%s*$') then
      return ''
   end
   return match(s, '^%s*(.*%S)')
end
-- variant of trim6 (localize functions)

local find = string.find
local sub = string.sub
--- Trim leading and trailing whitespace using find to locate each run and sub
-- to cut it off, skipping the cut when the run is empty.
-- @param s string to trim
-- @return the trimmed string
function trim8(s)
   -- The anchored pattern always matches starting at index 1, so a non-empty
   -- leading run is exactly the case where the match end is positive.
   local _, lead_end = find(s, '^%s*')
   if lead_end > 0 then
      s = sub(s, lead_end + 1)
   end
   -- An empty trailing match reports an end index one below its start index.
   local tail_start, tail_end = find(s, '%s*$')
   if tail_start <= tail_end then
      s = sub(s, 1, tail_start - 1)
   end
   return s
end
-- based on penlight 0.7.2

--- Trim leading and trailing whitespace by computing both cut positions with
-- find and slicing once with sub (uses the file-level `find` and `sub` locals).
-- @param s string to trim
-- @return the trimmed string
function trim9(s)
   -- index just past the leading whitespace run
   local first = select(2, find(s, '^%s*')) + 1
   -- index just before the trailing whitespace run
   local last = find(s, '%s*$') - 1
   return sub(s, first, last)
end
-- simplification of trim8

--- Trim leading and trailing whitespace using position captures.
-- @param s string to trim
-- @return the trimmed string
function trim10(s)
   -- position of the first character after the leading whitespace
   local first = s:match('^%s*()')
   -- searching from `first`, the position where the trailing whitespace
   -- begins, minus one, is the last character to keep
   local last = s:match('()%s*$', first) - 1
   return s:sub(first, last)
end
-- variant of trim9 (match)

--- Trim leading and trailing whitespace: locate the first non-space character,
-- then greedily match from there up to the last non-space character.
-- @param s string to trim
-- @return the trimmed string, or "" when s has no non-space character
function trim11(s)
   local first = s:find("%S")
   if not first then
      return ""
   end
   return s:match(".*%S", first)
end
-- variant of trim6 (use n position)
-- https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00904.html

--- Trim leading and trailing whitespace: skip the leading run with a position
-- capture, then greedily match up to the last non-space character.
-- @param s string to trim
-- @return the trimmed string, or "" when nothing remains after the leading run
function trim12(s)
   local start = s:match("^%s*()")
   if start > #s then
      -- the leading run consumed the whole string
      return ""
   end
   return s:match(".*%S", start)
end
-- variant of trim11 (performs better for all
-- whitespace string). See Roberto's comments
-- on "^%s*$" vs. "%S" performance:
-- https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00921.html

do
   local lpeg = require("lpeg")
   local lpeg_match = lpeg.match
   -- one whitespace character, and any single character that is not one
   local ws = lpeg.S(' \t\n\v\f\r')
   local non_ws = lpeg.P(1) - ws
   -- captured part: repeated groups of optional whitespace followed by
   -- non-whitespace, so trailing whitespace is never included
   local kept = lpeg.C((ws^0 * non_ws^1)^0)
   local pattern = ws^0 * kept
   --- Trim leading and trailing whitespace using a precompiled LPeg pattern.
   -- @param s string to trim
   -- @return the captured trimmed string
   function trim13(s)
      return lpeg_match(pattern, s)
   end
end
-- lpeg.  based on https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00921.html

do
   local lpeg = require("lpeg")
   local re = require("re")
   local run = lpeg.match
   -- same grammar as the hand-built LPeg version, written in `re` syntax:
   -- skip leading whitespace, then capture groups of optional whitespace
   -- followed by non-whitespace
   local compiled = re.compile("%s* {(%s* %S+)*}")
   --- Trim leading and trailing whitespace using a pattern compiled by `re`.
   -- @param s string to trim
   -- @return the captured trimmed string
   function trim14(s)
      return run(compiled, s)
   end
end
-- variant with re module.

-- Load the optional C module. Its luaopen_trim entry point (see trim.c)
-- registers a global function named `trim`, which is aliased here as trim15
-- so it can sit alongside the pure-Lua variants.
require 'trim'
local trim15 = trim
-- C implementation (see separate trim.c file)


-- test utilities

--- Check a trim implementation against a fixed table of inputs.
-- Raises an assertion error on the first case whose result differs from the
-- expected value.
-- @param trim function taking a string and returning the trimmed string
local function trimtest(trim)
   -- each entry is {input, expected}
   local cases = {
      {'', ''},
      {' ', ''},
      {'  ', ''},
      {'a', 'a'},
      {' a', 'a'},
      {'a ', 'a'},
      {' a ', 'a'},
      {'  a  ', 'a'},
      {'  ab cd  ', 'ab cd'},
      {' \t\r\n\f\va\000b \r\t\n\f\v', 'a\000b'},
   }
   for _, case in ipairs(cases) do
      assert(trim(case[1]) == case[2])
   end
end

--- Time repeated calls of `f(s)` and print the elapsed time.
-- Calls `f(s)` ten times per iteration over 100000 iterations (one million
-- calls in total), measures the difference in os.clock() around the loop, and
-- writes it to stdout formatted as "%4.1f" followed by a space (no newline).
-- NOTE(review): the ten-fold manual unrolling presumably reduces loop overhead
-- relative to the calls being timed — confirm before restructuring.
-- @param f function under test; its return value is ignored
-- @param s argument passed to every call of f
local function perftest(f, s)
   local time = os.clock  -- os.time or os.clock
   local t1 = time()
   for i=1,100000 do
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
   end
   local dt = time() - t1
   io.stdout:write(("%4.1f"):format(dt) .. " ")
end

-- All implementations under test, in order; the index in this table is the
-- number printed at the start of each benchmark row.
local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7,
               trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15}

-- correctness tests: every implementation must pass trimtest before timing
for _,trim in ipairs(trims) do
   trimtest(trim)
end

-- performance tests
-- Runs three passes over all implementations. Each output row is the
-- implementation index followed by one perftest timing per sample input:
-- empty, short, short padded, long without spaces, alternating
-- space/non-space, one character surrounded by long whitespace runs, and
-- all whitespace.
for j = 1, 3 do
   for i,trim in ipairs(trims) do
      io.stdout:write(string.format("%2d",i) .. ": ")
      perftest(trim,  "")
      perftest(trim,  "abcdef")
      perftest(trim,  "   abcdef   ")
      perftest(trim,  "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef")
      perftest(trim,  "  a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ")
      perftest(trim,  "                               a                            ")
      perftest(trim,  "                                                            ")
      print()
   end
end

基于 LuaList:2009-12/msg00951.html 的 trim15 可选 C 模块

/* trim.c - based on https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00951.html
            from Sean Conner */
#include <stddef.h>
#include <ctype.h>
#include <lua.h>
#include <lauxlib.h>

/*
 * Lua C function: trim(s) -> string
 *
 * Takes one string argument (checked with luaL_checklstring, so embedded
 * zero bytes are preserved) and pushes a copy with leading and trailing
 * whitespace, as classified by isspace(), removed.
 *
 * Returns 1 (the number of Lua results pushed).
 */
int trim(lua_State* L) {
   const char* front;
   size_t size;

   front = luaL_checklstring(L,1,&size);

   /* Skip leading whitespace; `size` always counts the bytes remaining
      from `front`. */
   while (size && isspace((unsigned char)*front)) {
      size--;
      front++;
   }

   /* Drop trailing whitespace by shrinking `size`. Indexing from `front`
      only when size > 0 keeps every access in bounds and avoids forming a
      pointer before the start of the buffer for an empty string. */
   while (size && isspace((unsigned char)front[size - 1])) {
      size--;
   }

   lua_pushlstring(L,front,size);
   return 1;
}

/*
 * Module entry point called by require 'trim'.
 * Installs the C function as the global `trim` and returns no values to Lua.
 */
int luaopen_trim(lua_State* L) {
   lua_pushcfunction(L, trim);
   lua_setglobal(L, "trim");
   return 0;
}

测试结果(数字越小越快)

Lua 5.1.4/Cygwin1.7
 1:  0.9  1.9  2.1  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.2  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.1  1.2  1.3  2.8 77.6
 6:  0.6  1.2  1.6  1.6  1.8  4.1  1.7
 7:  0.6  1.1  1.5  1.5  1.6  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.7  3.0  2.4
 9:  1.2  2.0  2.7  8.0  9.3 21.2  3.4
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  1.9
 2:  0.7  1.6  1.8  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.4  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.2  2.8 78.2
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.5  9.8 10.8  2.8  2.5
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.4  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.3  2.8 77.3
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.6  7.4  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.6  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

LuaJIT 2.0.0-beta2/Cygwin1.7
 1:  0.6  1.5  1.5  7.7  8.3  1.3  1.2
 2:  0.4  1.2  1.2  7.1  7.8  1.1  1.0
 3:  0.6  1.0  1.2  3.1  4.9  1.7  1.3
 4:  0.8  1.6  1.8  8.4  9.0  1.9  1.3
 5:  0.4  0.6  0.8  1.2  1.2  2.3 99.2
 6:  0.4  0.9  1.1  1.5  1.5  3.2  0.9
 7:  0.3  0.8  1.1  1.4  1.5  3.0  0.8
 8:  0.6  1.2  1.6  5.3  6.8  1.7  1.3
 9:  0.7  1.2  1.8  5.6  6.7 14.4  1.7
10:  0.9  1.6  1.7  7.6  8.3  1.5  1.5
11:  0.3  0.8  1.1  1.4  1.5  2.9  2.1
12:  0.4  0.9  1.1  1.5  1.5  2.7  0.9
13:  0.6  0.7  0.7  1.0  1.4  0.8  0.7
14:  0.6  0.7  0.7  1.0  1.4  0.8  0.8
15:  0.1  0.1  0.1  0.3  0.3  0.2  0.2
...

trim5 的速度在数据集中相对有利或具有竞争力,除了在只有空格的长字符串情况下速度很慢。使用“.*”(而不是非贪婪的“.-”)会快速地遍历长字符串的末尾,然后“%S”会触发回溯以避免尾随空格,但“%s*”和“.*”的并置如果找不到“%S”匹配项,则会造成大量的回溯(O(N^2))。trim6 特别处理最坏情况,并在整个数据集中表现良好。trim7 是一个将函数本地化的变体,它会产生更大的代码大小,并且不会带来实质性的速度提升。(在进一步测试中,每个本地化在涉及空字符串数据和内联函数调用的最佳情况下可能带来 10% 的提升,但这里有两个对 match 的调用)。trim11 也是 trim6 的一个变体,具有类似的性能特征,可能略好,除了在只有空格的长字符串情况下速度大约是原来的一半(这在 trim12 中已修复)。trim13(或其变体 trim14)是一个 lpeg (LuaPeg) 实现,它通常能达到或超过最佳模式匹配实现的性能,特别是在大量空格的情况下,但在交替空格和非空格的最坏情况下速度稍慢。C 实现 (trim15) 在整个数据集中是迄今为止最快的。

在所有测试迭代中重用相同的字符串,如上所述,可能无法正确衡量由于 Lua 的字符串驻留而导致的临时字符串创建的影响。使用 local ss = {}; for i = 1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end 作为字符串缓存的快速测试似乎不会影响上述基本结论。

--DavidManura

另请参阅


最近更改 · 偏好设置
编辑 · 历史记录
最后编辑于 2022 年 4 月 15 日凌晨 3:02 GMT (差异)