Module:Unicode data

Fiji Hindi editions of Wiki, an online encyclopedia

Documentation for this module may be created at Module:Unicode data/doc

local p = {}

local floor = math.floor
-- Lua 5.1 (the version Scribunto runs) exposes unpack as a global; newer Lua
-- versions only provide table.unpack.
local unpack = unpack or table.unpack

-- Raises an error whose message is built with string.format.
-- Called as errorf(level, fmt, ...) the error is attributed "level" frames
-- above errorf's caller; called as errorf(fmt, ...) it is attributed to the
-- caller of the function that called errorf.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Binary search over "ranges", an array of { low, high, ... } arrays that must
-- be sorted by code point and must not overlap.
-- The number of ranges is taken from ranges.length when present; otherwise it
-- is computed with [[Module:TableTools]].
-- Returns the matching range and its index, or nil and the index of the last
-- range that was examined (nil if "ranges" is empty).
local function binary_range_search(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end
p.binary_range_search = binary_range_search

--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]

-- Load a module by indexing "loader" with the name of the module minus the
-- "Module:Unicode data/" part. For instance, loader.blocks returns
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be
-- returned. The result (including false) is stored in the table, so each
-- data module is requested at most once.
local loader = setmetatable({}, {
	__index = function (self, key)
		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
		if not success then
			data = false
		end
		self[key] = data
		return data
	end
})

-- Ranges of code points whose names are generated rather than looked up.
-- Each entry is { low, high, hook }, where the hook is either a format string
-- that receives the code point, or a function that is called with it.
-- The entries MUST stay sorted by code point, because the table is searched
-- with binary_range_search.
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFC, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DD, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	-- Extension G has to come before the Variation Selectors Supplement:
	-- with the two entries swapped the table is unsorted and the binary
	-- search can never reach the U+E0100 entry.
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
name_hooks.length = #name_hooks

-- The entry of name_hooks that matched most recently, if any.
local name_range_cache

-- Applies a name hook (format string or function) to a code point.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- Returns the name, or a code point label such as <control-0000>, for a code
-- point given as a number. See the Unicode Specification, section 4.8.
function p.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters.
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this code point.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Names are split over data modules of 0x1000 code points each. The
	-- quotient is floored so that %X always receives an integer.
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]

	if data and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

--[[
-- No image data modules on Wikipedia yet.
function p.lookup_image(codepoint)
	local data = loader[('images/%03X'):format(codepoint / 0x1000)]

	if data then
		return data[codepoint]
	end
end
--]]

-- Names of the planes that have one, indexed by plane number.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[ 3] = "Tertiary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- Iterator function for p.enum_blocks: yields the next index together with
-- the three fields of that block entry.
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		-- Unpack doesn't work on tables loaded with mw.loadData.
		return i, data[1], data[2], data[3]
	end
end

-- An ipairs-type iterator generator for the list of blocks.
function p.enum_blocks()
	local blocks = loader.blocks
	return block_iter, blocks, 0
end

-- Returns the name of the plane containing the code point, or "Plane N" for
-- planes that have no entry in "planes".
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Returns the third field of the block entry containing the code point, or
-- "No Block" if no block range contains it.
function p.lookup_block(codepoint)
	local blocks = loader.blocks
	local range = binary_range_search(codepoint, blocks)
	if range then
		return range[3]
	else
		return "No Block"
	end
end

-- Returns the block entry whose third field equals "name", or nil.
function p.get_block_info(name)
	for i, block in ipairs(loader.blocks) do
		if block[3] == name then
			return block
		end
	end
end

-- Returns true if "pagename" contains none of the rejected characters listed
-- below, consists only of printable characters, and has at least one
-- character that is not a space separator.
function p.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Returns the elements of "what" from index "from" (default 1) onwards as
-- multiple values. Needed because unpack doesn't work on tables loaded with
-- mw.loadData.
local function manual_unpack(what, from)
	-- Apply the default before "from" is used in arithmetic.
	from = from or 1
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- Orders ranges by their first code point.
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the key used with "loader" to obtain that module.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		local cached_range = binary_range_search(codepoint, cache)
		if cached_range then
			return match_func(codepoint, manual_unpack(cached_range, 3))
		end

		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match: "index" is the last range the search looked at, which is
		-- the nearest range on one side of the code point. Remember the gap
		-- around the code point as a range carrying the default data.
		if ranges[index] then
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < ranges[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			table.insert(cache, dots_range)
			table.sort(cache, compare_ranges)
		end

		-- Pass the default explicitly so that the first lookup in a gap gives
		-- the same result as later lookups answered from the cache.
		return match_func(codepoint, unpack(dots))
	end
end

-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See the Canonical_Combining_Class values in the Unicode Character Database
-- documentation for more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		return combining_class and combining_class ~= 0 or false
	end,
	0)

-- Returns "str" with a dotted circle (U+25CC) inserted before every character
-- for which p.is_combining is true.
function p.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function(char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Returns the value stored for the code point in
-- [[Module:Unicode data/control]], or "assigned" if there is none.
local lookup_control = memo_lookup(
	"control",
	function (codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control

-- True unless lookup_control reports "unassigned".
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Returns whether lookup_control reports "assigned" or "space-separator",
-- followed by the lookup_control result itself.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Returns whether lookup_control reports "space-separator", followed by the
-- lookup_control result itself.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Returns the value stored for the code point in
-- [[Module:Unicode data/category]], with "Cn" as the default.
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		return category
	end,
	"Cn")

-- Returns the script code stored for the code point in
-- [[Module:Unicode data/scripts]], with "Zzzz" as the default.
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		return script_code or 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script

-- Returns the single script code used in "str", ignoring Zyyy, Zinh and Zzzz.
-- Returns nil if no such script, or more than one, is found.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	if not next(scripts, next(scripts)) then
		return next(scripts)
	end -- else return majority script, or else "Zzzz"?
end

-- Returns true if "str" has at least one Latin-script character and every
-- other character belongs to an ignorable script (Zyyy, Zinh, Zzzz).
function p.is_Latin(str)
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end

	local Latn = false

	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return Latn
end

-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
function p.is_rtl(str)
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return result
end

-- Reads args[arg] as a hexadecimal code point and returns it as a number.
-- Raises an error if the parameter is missing, is not hexadecimal, or is
-- outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local codepoint_string = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local codepoint = tonumber(codepoint_string, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	if not (0 <= codepoint and codepoint <= 0x10FFFF) then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end

-- Returns the exported function named prefix .. args[arg] (with the argument
-- trimmed). Raises an error if the parameter is missing or there is no such
-- function.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	suffix = mw.text.trim(suffix)
	local func_name = prefix .. suffix
	local func = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return func
end

-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags.
		result = result:gsub("<", "&lt;")
	end
	return result
end

-- This function allows any of the "is" functions to be invoked. The first
-- parameter is the word after "is_"; the second parameter is a string or a
-- code point in hexadecimal base, depending on the function.
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings.
	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
		return (func(frame.args[2]))
	else -- The rest take code points.
		local codepoint = get_codepoint(frame.args, 2)
		return (func(codepoint)) -- Adjust to one result.
	end
end

return p

This article uses material from the Wikipedia article Module:Unicode data, which is released under the Creative Commons Attribution-ShareAlike 3.0 license ("CC BY-SA 3.0"); additional terms may apply. (view authors). Content is available under CC BY-SA 3.0 unless otherwise noted. Images, videos and audio are available under their respective licenses.
#Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc. Wiki is an independent company and has no affiliation with the Wikimedia Foundation.