-- Module:Lib UTF8
-- Documentation for this module may be created at Module:Lib UTF8/doc
local common = require("Module:Common")
-- Table that collects this module's functions; it is returned at the end of the file.
-- NOTE(review): on Lua 5.3+ this local shadows the built-in `utf8` library inside this file.
local utf8 = {}
--[[ Iterator over the UTF-8 encoded characters of a string.
Required by utf8.explode.

'str' is converted with tostring() first.
Each step yields three values: the byte index where the character starts,
the byte index where it ends, and the character itself as a substring.

Only the lead byte is inspected to decide how long a sequence is (the legacy
5- and 6-byte forms are accepted); continuation bytes are not validated.
Iteration stops (nil is returned) when the byte at the current position is a
continuation byte (0x80-0xbf) or one of 0xfe / 0xff.
If the last sequence is cut short by the end of the string, the end index is
clamped to the string length so it never points past the string. ]]
function utf8.iter(str)
    str = tostring(str)
    local len = #str
    local pos = 1 -- byte index of the next character to yield
    return function()
        if pos > len then
            return
        end
        local first = pos
        local lead = str:byte(first)
        local width
        if lead < 0x80 then
            -- 0xxxxxxx
            width = 1
        elseif lead < 0xc0 then
            -- 10xxxxxx
            return nil -- error, we're in the middle of a character
        elseif lead < 0xe0 then
            -- 110xxxxx 10xxxxxx
            width = 2
        elseif lead < 0xf0 then
            -- 1110xxxx 10xxxxxx 10xxxxxx
            width = 3
        elseif lead < 0xf8 then
            -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            width = 4
        elseif lead < 0xfc then
            -- 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            width = 5
        elseif lead < 0xfe then
            -- 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            width = 6
        else
            return nil -- 0xfe and 0xff are invalid UTF-8 values
        end
        local last = first + width - 1
        -- A truncated trailing sequence must not report an end index beyond the string.
        if last > len then
            last = len
        end
        pos = last + 1
        return first, last, str:sub(first, last)
    end
end
--[[ Splits a UTF-8 text (the encoding used by Lua strings here) into
single-character parts.

Returns a table (sequence) of strings:
 * an empty table when 'text' is not a string,
 * a table holding one empty string when 'text' is the empty string,
 * otherwise one entry per character, in order.

utf8.iter stops at a byte that cannot start a character. Such a byte is kept
as a one-byte part of its own and splitting resumes after it, so that
table.concat(utf8.explode(text)) always gives back the whole 'text' instead
of silently dropping everything from the first invalid byte on. ]]
function utf8.explode(text)
    local pts = {}
    if type(text) ~= 'string' then return pts end

    local len = #text
    if len == 0 then
        -- technically there is one part - the empty string
        pts[1] = ''
        return pts
    end

    local pos = 1 -- byte index of the first byte not yet stored in pts
    while pos <= len do
        for _, _, ch in utf8.iter(text:sub(pos)) do
            pts[#pts + 1] = ch
            pos = pos + #ch
        end
        if pos <= len then
            -- The iterator gave up on the byte at 'pos'; keep it verbatim and go on.
            pts[#pts + 1] = text:sub(pos, pos)
            pos = pos + 1
        end
    end
    return pts
end
--[[ Replaces single characters (including multi-byte, non-ASCII ones).
'reps' should be a table in the format ['find'] = 'replace', like
{['A'] = 'a', ['B'] = 'b', ['犬'] = '猫', ...},
where every key has to be a single character and the replacement can be any string.
The text is split with utf8.explode; a character is swapped when
common.isset() accepts the value stored for it in 'reps'.
Returns the resulting parts joined back into one string. ]]
function utf8.replace_char(text, reps)
    local chars = utf8.explode(text)
    for idx = 1, #chars do
        local substitute = reps[chars[idx]]
        if common.isset(substitute) then
            chars[idx] = substitute
        end
    end
    return table.concat(chars)
end
--[[ Replaces non-ASCII strings.
'reps' should be a table of string pairs, like:
{ {'find', 'replace'}, {'bird', 'cat'}, {'fly', 'walk'}, {'八雲 藍', '式神'}, ... }
At every position only the first matching rule is applied; the scan then
continues after the replaced characters (inserted text is not scanned again).
Rules whose 'find' value yields no characters (utf8.explode returns an empty
table for anything that is not a string) are ignored, because a zero-length
pattern could never advance the scan.
Returns the rewritten text as one string.
Note that this is a relatively slow solution, so it should be used only for
replacement of non-ASCII texts. For ASCII text it's better to use string.gsub ]]
function utf8.replace(text, reps)
    local parts = utf8.explode(text)

    -- Pre-split every rule into { find characters, replacement characters }.
    -- pairs() is kept for 'reps' so tables that are not plain sequences still work.
    local rules = {}
    for _, pair in pairs(reps) do
        local needle = utf8.explode(pair[1])
        if #needle > 0 then
            rules[#rules + 1] = { needle, utf8.explode(pair[2]) }
        end
    end

    local ret = {}
    local total = #parts
    local i = 1
    while i <= total do
        local found = false
        for r = 1, #rules do
            local needle, subst = rules[r][1], rules[r][2]
            if common.partialTableCompare(parts, needle, i, 1, #needle) then
                found = true
                -- found match, perform swap; an index loop keeps the characters in order
                for s = 1, #subst do
                    ret[#ret + 1] = subst[s]
                end
                -- skip the matched characters (the final +1 happens below)
                i = i + #needle - 1
                break
            end
        end
        if not found then
            ret[#ret + 1] = parts[i]
        end
        i = i + 1
    end
    return table.concat(ret)
end
-- Hand the table of UTF-8 helpers back to whoever loads this module.
return utf8