• Welcome to Touhou Wiki!
  • Registering is temporarily disabled. Check in our Discord server to request an account and for assistance of any kind.

Module:Lib UTF8

From Touhou Wiki
Jump to navigation Jump to search

Documentation for this module may be created at Module:Lib UTF8/doc

local common = require("Module:Common")

-- Module table: every public function below is attached to it and it is
-- returned at the end of the file.
local utf8 = {}

--[[ Builds a stateful iterator over the UTF-8 sequences of 'str'
     (the argument is passed through tostring first).
     Each call yields: start byte index, end byte index, the sequence itself.
     Iteration ends when the input is exhausted, or as soon as a byte that
     cannot start a sequence is met (a continuation byte, 0xfe or 0xff).
     Only the lead byte is inspected: continuation bytes are not validated and
     the end index of a truncated trailing sequence may exceed the input length.
     Required by utf8.explode ]]
function utf8.iter(str)
  local source = tostring(str)
  local total = #source
  local cursor = 1

  -- Length in bytes announced by a lead byte, or nil when the byte
  -- is not allowed to open a sequence.
  local function sequence_length(lead)
    if lead < 0x80 then return 1 end    -- 0xxxxxxx
    if lead < 0xc0 then return nil end  -- 10xxxxxx: middle of a character
    if lead < 0xe0 then return 2 end    -- 110xxxxx + 1 continuation byte
    if lead < 0xf0 then return 3 end    -- 1110xxxx + 2 continuation bytes
    if lead < 0xf8 then return 4 end    -- 11110xxx + 3 continuation bytes
    if lead < 0xfc then return 5 end    -- 111110xx + 4 continuation bytes
    if lead < 0xfe then return 6 end    -- 1111110x + 5 continuation bytes
    return nil                          -- 0xfe and 0xff are invalid UTF-8 values
  end

  return function()
    if cursor > total then return end

    local first = cursor
    local width = sequence_length(string.byte(source, first))
    if width == nil then
      -- the cursor is left untouched, so later calls keep returning nil
      return nil
    end

    cursor = first + width
    local last = cursor - 1
    return first, last, source:sub(first, last)
  end
end

--[[ Splits a UTF-8 text (encoding used by Lua) into a list of single characters.
     Anything that is not a string yields an empty list.
     The empty string yields a list holding one empty string.
     Characters are collected from utf8.iter, so the list ends early
     if that iterator stops on an invalid byte. ]]
function utf8.explode(text)
  if type(text) ~= 'string' then return {} end

  -- technically there is one part - the empty string
  if text == '' then return { '' } end

  local chars = {}
  local count = 0
  for _, _, piece in utf8.iter(text) do
    count = count + 1
    chars[count] = piece
  end
  return chars
end

--[[ Replaces individual characters, treating a multi-byte UTF-8 sequence as one character.
     'reps' should be a table in format ['find'] = 'replace', like
     {['A'] = 'a', ['B'] = 'b', ['犬'] = '猫', ...},
     where table keys have to be single characters and the replacement can be any string.
     A character is swapped only when common.isset accepts the value stored for it
     in 'reps'; all other characters are kept as they are. ]]
function utf8.replace_char(text, reps)
  local chars = utf8.explode(text)
  for index = 1, #chars do
    local substitute = reps[chars[index]]
    if common.isset(substitute) then
      chars[index] = substitute
    end
  end
  return table.concat(chars)
end

--[[ Replaces (possibly non-ASCII) substrings, comparing whole UTF-8 characters.
     'reps' should be a list of string pairs, like:
     { {'find', 'replace'}, {'bird', 'cat'}, {'fly', 'walk'}, {'八雲 藍', '式神'}, ... }
     At every position of the text the pairs are tried in list order and only the
     first one that matches is applied; scanning then resumes right after the matched
     text, so inserted replacements are never scanned again.
     Pairs whose 'find' part yields no characters (it is not a string, or it starts
     with an invalid UTF-8 byte) are ignored: a zero-length pattern would never move
     the scan forward.
     Returns the resulting string.
     Note that this is a relatively slow solution, so it should be used only for
     replacement of non-ASCII texts. For ASCII text it's better to use string.gsub ]]
function utf8.replace(text, reps)
  local parts = utf8.explode(text)
  local ret = {}
  local reps2 = {}

  -- Pre-splits one {find, replace} pair into character lists and queues it.
  local function add_pair(pair)
    local find = utf8.explode(pair[1])
    if #find > 0 then
      reps2[#reps2+1] = {find, utf8.explode(pair[2])}
    end
  end

  -- List entries go first, in index order: pairs() gives no ordering guarantee,
  -- but the "first matching pair wins" rule depends on a stable order.
  local listed = 0
  for idx, pair in ipairs(reps) do
    add_pair(pair)
    listed = idx
  end
  -- Entries stored under any other key are still honoured (after the list,
  -- in unspecified order) so that existing callers keep working.
  for key, pair in pairs(reps) do
    local in_list = type(key) == 'number' and key >= 1 and key <= listed and key % 1 == 0
    if not in_list then add_pair(pair) end
  end

  local i = 1
  local total = #parts
  while i <= total do
    local found = false

    for _, rep in ipairs(reps2) do
      local find, replacement = rep[1], rep[2]
      -- presumably compares #find items of 'parts' starting at i with 'find'
      -- starting at 1; the helper lives in Module:Common - verify there
      if common.partialTableCompare(parts, find, i, 1, #find) then
        found = true
        -- found match, perform swap (numeric loop keeps the characters in order)
        for r = 1, #replacement do ret[#ret+1] = replacement[r] end
        -- skip the whole matched text
        i = i + #find
        break
      end
    end

    if not found then
      -- nothing matched here: copy the character through unchanged
      ret[#ret+1] = parts[i]
      i = i + 1
    end
  end

  return table.concat(ret)
end

return utf8