-- User:VasilievVV/ustring.lua draft

-- From mediawiki.org
require( 'ustring_data' )

do
	-- Metatable shared by every ustring: isustring() recognises ustrings by
	-- comparing against it, and the metamethods defined further down in this
	-- scope are stored on it.
	local ustring_metatable = {}

	-- Report whether val is a ustring, i.e. whether the metatable it carries
	-- is the ustring metatable. The result is always a boolean.
	function isustring( val )
		local mt = getmetatable( val )
		return mt == ustring_metatable
	end

	-- Construct a ustring from a usual string.
	--
	-- The input is split into one array entry per UTF-8 encoded character and
	-- the resulting table is tagged with ustring_metatable.
	--
	-- str:     a string, a number (converted with tostring) or a ustring.
	-- returns: the ustring; a value that already is a ustring is returned as is.
	-- errors:  "Only strings may be converted to ustrings" for any other type,
	--          "Invalid UTF-8 sequence supplied" for malformed input.
	u = function( str )
		-- Quick return if the value is already a ustring
		if isustring( str ) then
			return str
		end

		-- Autoconvert numbers to match Lua standard string behavior
		if type( str ) == "number" then
			str = tostring( str )
		end

		-- Avoid possible traps caused by supplying tables to u()
		if type( str ) ~= "string" then
			error( "Only strings may be converted to ustrings" )
		end

		-- Determine the amount of trail bytes by the first one
		local countTrail = function( val )
			if val < 0x80 then
				return 0
			elseif val < 0xC0 then
				-- A trail byte cannot start a character
				error( "Invalid UTF-8 sequence supplied" )
			elseif val < 0xE0 then
				return 1
			elseif val < 0xF0 then
				return 2
			else
				return 3
			end
		end

		-- Determine whether the given byte is a trail of a character
		local isTrail = function( val )
			return val >= 0x80 and val < 0xC0
		end

		-- Throws error if the illegal byte is found: 0xC0 and 0xC1 could only
		-- start overlong two-byte forms, and nothing above 0xF4 is a valid lead
		local checkByte = function( val )
			if val == 0xC0 or val == 0xC1 or val > 0xF4 then
				error( "Invalid UTF-8 sequence supplied" )
			end
		end

		-- Permitted range of the FIRST trail byte for the lead bytes where the
		-- generic 0x80..0xBF range is too wide: it would admit overlong forms
		-- (E0, F0), encoded UTF-16 surrogates (ED) or code points above
		-- U+10FFFF (F4)
		local secondByteRange = {
			[0xE0] = { 0xA0, 0xBF },
			[0xED] = { 0x80, 0x9F },
			[0xF0] = { 0x90, 0xBF },
			[0xF4] = { 0x80, 0x8F },
		}

		local result = {}
		local pos, upos = 1, 1
		local len = #str

		-- Main conversion loop: pos walks the bytes, upos the characters
		while pos <= len do
			local byte = str:byte( pos )
			checkByte( byte )
			local trailCount = countTrail( byte )

			-- The character must not be cut off by the end of the string
			if pos + trailCount > len then
				error( "Invalid UTF-8 sequence supplied" )
			end

			for subpos = pos + 1, pos + trailCount do
				if not isTrail( str:byte( subpos ) ) then
					error( "Invalid UTF-8 sequence supplied" )
				end
			end

			-- Every lead byte listed in secondByteRange has at least two
			-- trail bytes, so pos + 1 is known to be inside the string here
			local range = secondByteRange[byte]
			if range then
				local second = str:byte( pos + 1 )
				if second < range[1] or second > range[2] then
					error( "Invalid UTF-8 sequence supplied" )
				end
			end

			result[upos] = str:sub( pos, pos + trailCount )
			pos = pos + trailCount + 1
			upos = upos + 1
		end

		setmetatable( result, ustring_metatable )
		return result
	end

	-- Build a new ustring by looking up every character of s in casetable.
	-- s is normalised with u() first; a character without an entry in
	-- casetable is carried over unchanged. The argument itself is not modified.
	local function ustring_changecase( s, casetable )
		local source = u( s )
		local mapped = {}

		for index = 1, #source do
			local original = rawget( source, index )
			local replacement = casetable[original]

			-- Compare against nil explicitly so that any non-nil entry wins
			if replacement == nil then
				mapped[index] = original
			else
				mapped[index] = replacement
			end
		end

		return setmetatable( mapped, ustring_metatable )
	end

	-- Public function table. The metatable's __index handler falls back to
	-- this table for non-numeric keys, so the functions are usable both as
	-- ustring.len( s ) and as methods on a ustring ( s:len() ).
	ustring = {
		-- Number of characters in s. The argument goes through u() first, so a
		-- plain Lua string is counted in characters rather than bytes, in line
		-- with how uc and lc treat their argument.
		len = function( s )
			return #u( s )
		end,
		-- Copy of s with each character replaced by its entry in
		-- ustring_uppercase_map, where one exists.
		uc = function( s )
			return ustring_changecase( s, ustring_uppercase_map )
		end,
		-- Copy of s with each character replaced by its entry in
		-- ustring_lowercase_map, where one exists.
		lc = function( s )
			return ustring_changecase( s, ustring_lowercase_map )
		end,
	}

	-- Handler for the .. operator. Both operands are converted with u(), and
	-- a new ustring holding the characters of a followed by those of b is
	-- returned; neither operand is modified.
	ustring_metatable.__concat = function( a, b )
		local left = u( a )
		local right = u( b )

		local joined = {}
		local offset = #left

		for i = 1, offset do
			joined[i] = left[i]
		end
		for i = 1, #right do
			joined[offset + i] = right[i]
		end

		return setmetatable( joined, ustring_metatable )
	end
	
	-- Lookup handler for keys that are not stored directly in the ustring.
	-- Non-numeric keys resolve to the entry of that name in the ustring
	-- function table. Numeric keys are character positions: negative values
	-- count back from the end (-1 is the last character), while zero and
	-- positions outside the string raise an error.
	ustring_metatable.__index = function( s, idx )
		if type( idx ) ~= "number" then
			return ustring[idx]
		end

		if idx == 0 then
			error( "Invalid ustring index supplied (zero)" )
		end

		local length = #s

		if idx > 0 then
			if idx > length then
				error( "Invalid ustring index supplied (out of bounds)" )
			end
			return rawget( s, idx )
		end

		if idx < -length then
			error( "Invalid ustring index supplied (out of bounds)" )
		end
		-- Translate the from-the-end position into a regular one
		return rawget( s, length + idx + 1 )
	end
	
	-- Handler for the == operator: two values compare equal when they have
	-- the same length and the same raw entry at every position.
	ustring_metatable.__eq = function( a, b )
		local length = #a

		if length ~= #b then
			return false
		end

		for i = 1, length do
			if rawget( a, i ) ~= rawget( b, i ) then
				return false
			end
		end

		return true
	end
	
	-- Handler for tostring(): joins the stored characters back into one
	-- ordinary Lua string. An empty ustring yields "".
	ustring_metatable["__tostring"] = function( s )
		-- table.concat builds the result in a single pass; appending with ..
		-- inside a loop would copy the growing string on every iteration
		return table.concat( s )
	end
end