验证 Unicode 字符串 |
|
string.subutf8(string, start[,end])
子字符串,支持 UTF-8
pos, char = string.nextutf8(string, orig_pos)
返回 orig_pos 处的字符(char)以及下一个字符的起始位置(pos)。
for i, char in str:nextutf8(orig_pos)
迭代字符串,从 orig_pos 开始。
pos = string.seekutf8(string, orig_pos, n)
返回从位置 orig_pos 起向前移动 n 个字符后的位置(n 为负数时向后移动)。
char = string.utf8char(code)
返回代码为 code 的字符。
code = string.utf8code(char)
返回 char(UTF-8 字符)的代码。
len = string.utf8len(string)
返回字符串中 UTF-8 字符的个数。
UTF-8 BOM 按惯例代码为 0。有效代码范围为:0-0xD7FF、0xE000-0x10FFFF。
Unicode 是一个通用的字符集,广泛用于 XML 文档。
重点是 Unicode 代码点长度为 0-21 位。使用 UTF-8,ASCII 字符存储为一个字节,其他字符使用 2 到 4 个字节。参见 [RFC 3629]。
上一段原本链接到 [RFC 2279],但该 RFC 已被 RFC 3629 取代,以便与 Unicode 标准 [1] 保持一致。在 [2] 中可以找到一个速度相当快、符合标准的纯 Lua 库。(链接已失效)
StephaneArnold 2007-11-13 - 我删除了发布的代码,该代码不符合最新的 UTF-8 标准。我已经将“纯 Lua 库”中的一些函数转换为 C 函数
lua_utf8.c
/*==================================================================*/ /* C program by [email protected] 2007, MIT license based on the work of Rici Lake [email protected] */ /*==================================================================*/ #include <memory.h> #include "lua.h" #include "lauxlib.h" #include "lualib.h" #define INVALID_UTF8 "invalid utf-8 string" #define POINTS_ASCII(p) (*((unsigned char*)p) < 128) #define RANGE(x, min, max) ((x)>=min && (x)<=max) #define RANGE_SND(x) RANGE(x,128,191) #define UTF8_BOM(p) (p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) int sarn_utf8_next(const unsigned char* str) { if (*str < 128) return 1; if (UTF8_BOM(str)) return 3; if (*str < 194) return 0; if (*str > 244) return 0; if (*str < 224 && RANGE_SND(str[1])) return 2; if (RANGE(*str, 225, 239) && *str != 237 && RANGE_SND(str[1]) && RANGE_SND(str[2])) return 3; if (*str == 224 && RANGE(str[1],160,191) && RANGE_SND(str[2])) return 3; if (*str == 237 && RANGE(str[1],128,159) && RANGE_SND(str[2])) return 3; if (RANGE(*str, 241, 243) && RANGE_SND(str[1]) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 240 && RANGE(str[1],144,191) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 244 && RANGE(str[1],128,143) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; return 0; } #define BACK(str, remain) if (--remain == 0) return 0; else str-- int sarn_utf8_prev(unsigned char* str, int remain) { BACK(str,remain); if (*str < 128) return 1; BACK(str,remain); if (RANGE(*str,195,224) && RANGE_SND(str[1])) return 2; BACK(str,remain); if (UTF8_BOM(str)) return 3; if (RANGE(*str, 225, 239) && *str != 237 && RANGE_SND(str[1]) && RANGE_SND(str[2])) return 3; if (*str == 224 && RANGE(str[1],160,191) && RANGE_SND(str[2])) return 3; if (*str == 237 && RANGE(str[1],160,191) && RANGE_SND(str[2])) return 3; BACK(str,remain); if (RANGE(*str, 241, 243) && RANGE_SND(str[1]) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 240 && RANGE(str[1],144,191) 
&& RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 244 && RANGE(str[1],128,143) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; /* fail back */ return 0; } /** Realign index on an UTF-8 char boundary in str. Returns the offset (0 to 3) to be seeked backwards, or -1 if it fails. */ int sarn_utf8_realign(unsigned char* str, size_t index) { size_t size, i; for (i = 0; i<4 && index>=i;i++) { if (sarn_utf8_next(str-i)!=0) return i; } return -1; } int sarn_utf8_next_func(lua_State* L) { const char *str; size_t pos, clen; char utf8[5]; str = luaL_checkstring(L, 1); pos = luaL_checklong(L, 2); if (strlen(str)<pos) { lua_pushnil(L); return 1; } memset(utf8, '\0', sizeof(utf8)); if (pos == 0) return luaL_error(L, "bad index value : 0"); clen = sarn_utf8_next((unsigned char *)str+pos-1); if (!clen) return luaL_error(L, INVALID_UTF8); lua_pushnumber(L, pos+clen); strncpy(utf8, str+pos-1, clen); lua_pushstring(L, utf8); return 2; } int sarn_utf8_len_func(lua_State *L) { unsigned char *str; int l; size_t len = 0; str = (unsigned char*) luaL_checkstring(L, 1); while (*str) { if (POINTS_ASCII(str)) { str++; len++; continue; } l = sarn_utf8_next(str); if (!l) return luaL_error(L, INVALID_UTF8); len++; str+=l; } lua_pushnumber(L, len); return 1; } int sarn_utf8_seek_func(lua_State *L) { unsigned char* str; int pos, shift; int clen, len; str = (unsigned char*)luaL_checkstring(L, 1); pos = luaL_checklong(L, 2); shift = luaL_checklong(L, 3); len = strlen(str); if (shift == 0) { lua_pushinteger(L, pos); return 1; } if (pos > len || pos < 1) return luaL_error(L, "invalid index (arg #2)"); /* then, pos is 0-based */ pos--; if (abs(shift) > len) { /* out of range */ lua_pushnil(L); return 1; } if (shift < 0) { while ((shift++) != 0) { clen = sarn_utf8_prev(str+pos, pos+1); if (clen == 0 || pos+1 < clen) { lua_pushnil(L); return 1; } pos -= clen; } } else { while ((shift--) != 0) { if (POINTS_ASCII(str+pos)) { pos ++; continue; } clen = sarn_utf8_next(str+pos); if (clen 
== 0 || pos+clen >= len) { lua_pushnil(L); return 1; } pos += clen; } } lua_pushinteger(L, pos+1); return 1; } int sarn_utf8_char_func(lua_State *L) { unsigned char str[2]; long int i; unsigned long int code; unsigned char result[5]; i = luaL_checklong(L, 1); memset(result, '\0', sizeof(result)); code = i; if (i >= 0xD800 && i <= 0xDFFF) return luaL_error(L, "invalid utf-8 code"); if (i >= 0 && i < 0x110000UL) { if (code == 0) { /* UTF8 BOM */ lua_pushstring(L, "\xEF\xBB\xBF"); return 1; } if (code < 128) { result[0] = code; lua_pushstring(L, (char*)result); return 1; } str[0] = 0x80 + (code & 63); code = code >> 6; if (code < 32) { result[0] = 0xC0+code; result[1] = str[0]; lua_pushstring(L, (char*)result); return 1; } str[1] = code & 0x3f; code = code >> 6; if (code < 16 && (code != 13 || str[1] < 32)) { result[0] = 0xE0 + code; result[1] = str[1] + 0x80; result[2] = str[0]; lua_pushstring(L, (char*)result); return 1; } else if (code >= 16 && code < 0x110) { result[1] = 0x80 + (code & 0x3f); result[0] = 0xF0 + (code >> 6); result[2] = str[1] + 0x80; result[3] = str[0]; lua_pushstring(L, (char*) result); return 1; } } return luaL_error(L, "invalid utf-8 code"); } int sarn_utf8_code_func(lua_State *L) { unsigned char* str; size_t len, i; unsigned long int code; unsigned long int offset[] = {0, 0x3000, 0xE0000UL, 0x3C00000UL}; str = (unsigned char*)luaL_checklstring(L, 1, &len); if (len != sarn_utf8_next(str)) return luaL_error(L, INVALID_UTF8); if (UTF8_BOM(str)) { lua_pushinteger(L, 0); return 1; } code = str[0]; for (i = 1; i < len; i++) { code = (code << 6) + (str[i] & 63); } lua_pushinteger(L, code - offset[len-1]); return 1; } int luaopen_libluautf8 (lua_State *L) { lua_getglobal(L, "string"); lua_pushcfunction(L, sarn_utf8_next_func); lua_setfield(L, -2, "nextutf8"); lua_pushcfunction(L, sarn_utf8_len_func); lua_setfield(L, -2, "utf8len"); lua_pushcfunction(L, sarn_utf8_seek_func); lua_setfield(L, -2, "seekutf8"); lua_pushcfunction(L, sarn_utf8_code_func); 
lua_setfield(L, -2, "utf8code"); lua_pushcfunction(L, sarn_utf8_char_func); lua_setfield(L, -2, "utf8char"); return 0; }
Makefile
# Builds libluautf8.so, the Lua C module implemented in lua_utf8.c.
# CC and CFLAGS come from the environment / make defaults; add the Lua
# include path to CFLAGS if lua.h is not on the default search path.
LUA_CFLAGS = -O2 -fpic
LUA_LDFLAGS = -O -shared -fpic

# These names are commands, not files.
.PHONY: all compile lua_utf8 clean

all: compile

compile: lua_utf8

# Kept as an alias so that `make lua_utf8` keeps working; the real work is
# attached to file targets so that an up-to-date build is not repeated.
lua_utf8: libluautf8.so

libluautf8.so: lua_utf8.o
	$(CC) $(CFLAGS) $(LUA_LDFLAGS) -o libluautf8.so lua_utf8.o

lua_utf8.o: lua_utf8.c
	$(CC) $(CFLAGS) $(LUA_CFLAGS) -c lua_utf8.c

clean:
	rm -f lua_utf8.o libluautf8.so
-- Unicode string wrapper for Lua 5.1 built on the libluautf8 C module.
--
-- Loading this module
--   * defines the global function u(str), which wraps a UTF-8 encoded Lua
--     string in a "unistr" object,
--   * replaces the global print with a version that unwraps unistr
--     arguments,
--   * returns u.
module(..., package.seeall)
require 'libluautf8'

local mt = {}
local unistr = {}

-- Creates a unistr holding `str` (the empty string when str is nil/false).
function unistr:new(str)
  return setmetatable({value = str or ''}, mt)
end

-- Returns the raw string of a unistr; any other value is returned as is.
local u2s = function(str)
  if type(str) == 'table' and getmetatable(str) == mt then
    return rawget(str, 'value')
  end
  return str
end

-- Redirects methods to unistr and provides the computed `length` field
-- (number of UTF-8 characters).  `value` is a real field of the object, so
-- it only reaches this function when it is missing; rawget avoids
-- re-entering __index in that case.
mt.__index = function(t, key)
  if key == 'length' then
    return string.utf8len(rawget(t, 'value'))
  end
  return unistr[key]
end

-- tostring(u"...") yields the wrapped string.
mt.__tostring = function(t)
  return tostring(rawget(t, 'value'))
end

-- Byte position of the first byte of the n-th character of str.
-- Requires 1 <= n <= number of characters in str, so the seek never has to
-- move onto or past the end of the string.
local function byte_offset(str, n)
  return str:seekutf8(1, n - 1)
end

-- Substring by character indices, with the same index conventions as
-- string.sub: negative indices count from the end, `last` defaults to -1
-- and out-of-range indices are clamped.  Returns a plain Lua string.
-- The string is scanned once to count its characters, so an error is
-- raised when the wrapped value is not valid UTF-8.
function unistr:sub(first, last)
  local str = self.value
  local nchars = str:utf8len()
  if last == nil then last = -1 end

  if first < 0 then first = nchars + first + 1 end
  if last < 0 then last = nchars + last + 1 end
  if first < 1 then first = 1 end
  if last > nchars then last = nchars end
  if first > last then return '' end

  local from = byte_offset(str, first)
  local to
  if last == nchars then
    to = #str
  else
    -- the byte just before the character that follows `last`, so that a
    -- multi-byte final character is never cut in the middle
    to = byte_offset(str, last + 1) - 1
  end
  return str:sub(from, to)
end

-- Concatenation: either operand may be a unistr or anything the `..`
-- operator accepts; the result is a unistr.
function mt.__concat(a, b)
  return unistr:new(u2s(a) .. u2s(b))
end

-- The # operator cannot be overridden for tables in Lua 5.1, so the
-- character count is exposed as a method (and as the `length` field).
function unistr:len()
  return self.value:utf8len()
end

-- Iterator over the characters, starting at byte position pos (default 1):
--   for next_pos, char in s:each() do ... end
function unistr:each(pos)
  return string.nextutf8, self.value, pos or 1
end

-- Creates a global "u" function to be used like that:
--   str = u"Hello"   (it feels Python-like but is really a Lua function)
-- Thanks to the metatable, concatenation and the methods above can then be
-- used as if the object were a plain string.
_G.u = function(str)
  return unistr:new(str)
end

-- Wraps f so that every unistr argument is replaced by its raw string
-- before f is called.  All arguments (including nils) are forwarded and all
-- results of f are returned.
function unicodize(f)
  return function(...)
    local n = select('#', ...)
    local args = {...}
    for i = 1, n do
      args[i] = u2s(args[i])
    end
    return f(unpack(args, 1, n))
  end
end

_G.print = unicodize(print)

-- return this function
return _G.u
测试代码
-- Smoke tests for the unistr wrapper: every check compares the wrapper's
-- result with the expected value and prints a line only on mismatch.
require 'utf8'

a = u'hello'
b = "hello"

-- Prints "<name>[<a>|<b>]" when a and b differ; silent otherwise.
function assertEqual(name, a, b)
  if a ~= b then
    -- tostring keeps the report itself from failing on nil/boolean values
    print(name .. "[" .. tostring(a) .. '|' .. tostring(b) .. ']')
  else
    --print(name.."...OK")
  end
end

-- unistr:sub must agree with string.sub on an ASCII-only string
for i = 0, 10 do
  assertEqual("sub1." .. i, a:sub(i), b:sub(i))
end
for i = 0, 5 do
  for j = i, 10 do
    assertEqual("sub2." .. i .. "-" .. j, a:sub(i, j), b:sub(i, j))
  end
end

-- Character counts.  "\195\169" is the UTF-8 encoding of U+00E9 (e with
-- acute accent); it is written as an escape so that the test does not
-- depend on the encoding this file happens to be saved in.
lentest = {{"h", 1}, {"", 0}, {"hel", 3}, {"hi St\195\169phane", 11}}
for idx, val in ipairs(lentest) do
  str = u(val[1])
  assertEqual("len1." .. idx, str.length, val[2])
end

-- Concatenating plain strings with unistr values and printing the result
firstName = u"St\195\169phane"
lastName = u"Arnold"
print("hello " .. firstName .. " " .. lastName)