字符串修剪
-- trim implementations
--
-- Every trimN(s) takes a string and returns it with leading and trailing
-- whitespace (as matched by the %s pattern class) removed. The functions are
-- deliberately left as globals so the benchmark driver can list them by name.

-- from PiL2 20.4
-- The outer parentheses discard gsub's second result (the match count).
function trim1(s)
  return (s:gsub("^%s*(.-)%s*$", "%1"))
end

-- variant of trim1 (match)
function trim2(s)
  return s:match("^%s*(.-)%s*$")
end

-- two gsub's
-- Parenthesized for the same reason as trim1: without the parentheses the
-- substitution count of the last gsub is returned as a second value.
function trim3(s)
  return (s:gsub("^%s+", ""):gsub("%s+$", ""))
end

-- variant of trim3 (match)
function trim4(s)
  return s:match("^%s*(.*)"):match("(.-)%s*$")
end

-- warning: has bad performance when s:match'^%s*$' and #s is large
function trim5(s)
  return s:match('^%s*(.*%S)') or ''
end

-- fixes performance problem in trim5.
-- note: the '()' avoids the overhead of default string capture.
-- This overhead is small, ~ 10% for successful whitespace match call
-- alone, and may not be noticeable in the overall benchmarks here,
-- but there's little harm either. Instead replacing the first `match`
-- with a `find` has a similar effect, but that requires localizing
-- two functions in the trim7 variant below.
function trim6(s)
  return s:match('^()%s*$') and '' or s:match('^%s*(.*%S)')
end

-- variant of trim6 (localize functions)
local match = string.match
function trim7(s)
  return match(s, '^()%s*$') and '' or match(s, '^%s*(.*%S)')
end

-- based on penlight 0.7.2
local find = string.find
local sub = string.sub
function trim8(s)
  -- find returns an empty match as (i, i-1), so "last >= first" means that
  -- at least one whitespace character was actually matched.
  local lead_first, lead_last = find(s, '^%s*')
  if lead_last >= lead_first then
    s = sub(s, lead_last + 1)
  end
  local trail_first, trail_last = find(s, '%s*$')
  if trail_last >= trail_first then
    s = sub(s, 1, trail_first - 1)
  end
  return s
end

-- simplification of trim8
function trim9(s)
  local _, lead_last = find(s, '^%s*')
  local trail_first = find(s, '%s*$')
  return sub(s, lead_last + 1, trail_first - 1)
end

-- variant of trim9 (match)
function trim10(s)
  local first = s:match('^%s*()')         -- position just after the leading whitespace
  local stop = s:match('()%s*$', first)   -- position where the trailing whitespace begins
  return s:sub(first, stop - 1)
end

-- variant of trim6 (use n position)
-- https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00904.html
function trim11(s)
  local n = s:find("%S")
  return n and s:match(".*%S", n) or ""
end

-- variant of trim11 (performs better for all
-- whitespace string).
function trim12(s)
  local from = s:match("^%s*()")
  return from > #s and "" or s:match(".*%S", from)
end
-- Note on trim12: see Roberto's comments on "^%s*$" vs. "%S" performance:
-- https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00921.html

-- lpeg. based on https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00921.html
do
  local lpeg = require("lpeg")
  local space = lpeg.S' \t\n\v\f\r'
  local nospace = 1 - space
  -- Skip leading whitespace, then capture zero or more groups of
  -- (optional whitespace followed by at least one non-space character).
  local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0)
  local match = lpeg.match
  function trim13(s)
    return match(ptrim, s)
  end
end

-- variant with re module.
do
  local lpeg = require("lpeg")
  local re = require("re")
  local ptrim = re.compile"%s* {(%s* %S+)*}"
  local match = lpeg.match
  function trim14(s)
    return match(ptrim, s)
  end
end

-- C implementation (see separate trim.c file).
-- The C module is optional: when it cannot be loaded, trim15 stays nil and is
-- simply left out of the run instead of aborting the whole script.
local trim15
do
  local ok, err = pcall(require, 'trim')
  if ok then
    trim15 = trim  -- global registered by luaopen_trim
  else
    io.stderr:write("trim15 skipped, C module 'trim' not loaded: " ..
                    tostring(err) .. "\n")
  end
end

-- test utilities

-- Checks one trim implementation against a fixed set of inputs; raises via
-- assert on the first mismatch.
-- NOTE(review): this copy of the script had runs of spaces collapsed to a
-- single space (two pairs of asserts were exact duplicates); the multi-space
-- cases below restore that coverage.
local function trimtest(trim)
  assert(trim'' == '')
  assert(trim' ' == '')
  assert(trim'  ' == '')
  assert(trim'a' == 'a')
  assert(trim' a' == 'a')
  assert(trim'a ' == 'a')
  assert(trim' a ' == 'a')
  assert(trim'  a  ' == 'a')
  assert(trim'  ab cd  ' == 'ab cd')
  assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b')
end

-- Times 1,000,000 calls of f(s) (100000 iterations, unrolled ten times to
-- reduce loop overhead) and writes the elapsed seconds to stdout.
local function perftest(f, s)
  local time = os.clock  -- os.time or os.clock
  local t1 = time()
  for _ = 1, 100000 do
    f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s)
  end
  local dt = time() - t1
  io.stdout:write(("%4.1f"):format(dt) .. " ")
end

-- trim15 is last on purpose: if it is nil the list just ends at trim14.
local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7, trim8,
               trim9, trim10, trim11, trim12, trim13, trim14, trim15}

-- NOTE(review): the whitespace-heavy benchmark inputs were single spaces in
-- this copy, which cannot exercise the long-whitespace worst case the
-- benchmark is meant to show. The run length of 50 per side is an assumption;
-- the original lengths are not recoverable here -- adjust if known.
local pad = (" "):rep(50)
local padded_single = pad .. "a" .. pad
local only_whitespace = pad .. pad

-- correctness tests
for _, f in ipairs(trims) do
  trimtest(f)
end

-- performance tests
for _ = 1, 3 do
  for i, f in ipairs(trims) do
    io.stdout:write(string.format("%2d",i) .. ": ")
    perftest(f, "")
    perftest(f, "abcdef")
    perftest(f, " abcdef ")
    perftest(f, "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef")
    perftest(f, " a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ")
    perftest(f, padded_single)
    perftest(f, only_whitespace)
    print()
  end
end
用于 trim15 的可选 C 模块(基于 LuaList:2009-12/msg00951.html)
/* trim.c - based on https://lua-users.lua.ac.cn/lists/lua-l/2009-12/msg00951.html from Sean Conner */ #include <stddef.h> #include <ctype.h> #include <lua.h> #include <lauxlib.h> int trim(lua_State* L) { const char* front; const char* end; size_t size; front = luaL_checklstring(L,1,&size); end = &front[size - 1]; while (size && isspace((unsigned char)*front)) { size--; front++; } while (size && isspace((unsigned char)*end)) { size--; end--; } lua_pushlstring(L,front,(size_t)(end - front) + 1); return 1; } int luaopen_trim(lua_State* L) { lua_register(L, "trim", trim); return 0; }
测试结果(数字越小越快)
Lua 5.1.4/Cygwin1.7
 1:  0.9  1.9  2.1  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.2  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.1  1.2  1.3  2.8 77.6
 6:  0.6  1.2  1.6  1.6  1.8  4.1  1.7
 7:  0.6  1.1  1.5  1.5  1.6  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.7  3.0  2.4
 9:  1.2  2.0  2.7  8.0  9.3 21.2  3.4
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  1.9
 2:  0.7  1.6  1.8  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.4  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.2  2.8 78.2
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.5  9.8 10.8  2.8  2.5
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.4  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.3  2.8 77.3
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.6  7.4  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.6  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

LuaJIT 2.0.0-beta2/Cygwin1.7
 1:  0.6  1.5  1.5  7.7  8.3  1.3  1.2
 2:  0.4  1.2  1.2  7.1  7.8  1.1  1.0
 3:  0.6  1.0  1.2  3.1  4.9  1.7  1.3
 4:  0.8  1.6  1.8  8.4  9.0  1.9  1.3
 5:  0.4  0.6  0.8  1.2  1.2  2.3 99.2
 6:  0.4  0.9  1.1  1.5  1.5  3.2  0.9
 7:  0.3  0.8  1.1  1.4  1.5  3.0  0.8
 8:  0.6  1.2  1.6  5.3  6.8  1.7  1.3
 9:  0.7  1.2  1.8  5.6  6.7 14.4  1.7
10:  0.9  1.6  1.7  7.6  8.3  1.5  1.5
11:  0.3  0.8  1.1  1.4  1.5  2.9  2.1
12:  0.4  0.9  1.1  1.5  1.5  2.7  0.9
13:  0.6  0.7  0.7  1.0  1.4  0.8  0.7
14:  0.6  0.7  0.7  1.0  1.4  0.8  0.8
15:  0.1  0.1  0.1  0.3  0.3  0.2  0.2
...
trim5 在整个数据集上的速度相对较好或具有竞争力,但在处理仅由空白字符组成的长字符串时非常慢。使用“.*”(而不是非贪婪的“.-”)可以迅速推进到长字符串的末尾,随后“%S”会触发回溯以跳过尾部空白;但“%s*”与“.*”并列时,如果找不到任何“%S”匹配,就会产生大量回溯(O(N^2))。trim6 对这一最坏情况做了特殊处理,在整个数据集上表现良好。trim7 是将函数本地化的变体,代码体积更大,但速度并没有实质性提升。(进一步测试表明,在涉及空字符串数据并内联函数调用的最佳情况下,每次本地化最多可带来约 10% 的提升,但这里有两次对 match 的调用。)trim11 也是 trim6 的变体,性能特征类似,可能略好,只是在仅含空白字符的长字符串上速度大约只有一半(该问题已在 trim12 中修复)。trim13(或其变体 trim14)是基于 lpeg (LuaPeg) 的实现,其性能通常达到或超过最佳的模式匹配实现,在空白字符很多时优势尤其明显,但在空白与非空白字符交替出现的最坏情况下略慢。C 实现(trim15)在整个数据集上都是迄今为止最快的。
由于 Lua 的字符串驻留机制,像上面那样在所有测试迭代中重复使用同一个字符串,可能无法准确衡量创建临时字符串所带来的影响。以 local ss = {}; for i = 1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end 作为字符串缓存进行的快速测试表明,这似乎并不影响上述基本结论。