Below are scripts I use for maintaining the Unicode database at Module:Unicode data.
Ingredients: GNU Make, GNU awk, GNU wget.
Run `make clean`, then `make`; the `.lua` files will be generated.
Save them in the appropriate locations as specified in the table below.

File name | Wiktionary page |
---|---|
aliases.lua | Module:Unicode data/aliases |
blocks.lua | the blocks table in Module:Unicode data |
combining.lua | Module:Unicode data/combining |
control.lua | Module:Unicode data/control (includes Cx and Zx) |
names_XXX.lua | Module:Unicode data/names/XXX |
Makefile
# Downloads the Unicode Character Database text files and runs the matching
# gawk script over each of them to produce the Lua data tables.
#
#   make          build every .lua file
#   make clean    remove generated .lua files and downloaded .txt files
#   make update   build everything, then run ./upload

.PHONY: all names clean update

# Remove a target whose recipe failed, so an interrupted wget or gawk run
# does not leave behind a truncated file that looks up to date.
.DELETE_ON_ERROR:

all: blocks.lua control.lua aliases.lua combining.lua names

clean:
	rm -f *.lua *.txt

# names.awk writes the names_XXX.lua files itself, so nothing is redirected.
names: names.awk UnicodeData.txt
	gawk -F';' -f $^

update: all
	./upload

# $^ expands to the awk script followed by the data file listed in the
# per-target prerequisite lines further down.
%.lua: %.awk
	gawk -F';' -f $^ >'$@'

# Files named Derived*.txt are fetched from the extracted/ subdirectory.
Derived%.txt:
	wget 'http://unicode.org/Public/UNIDATA/extracted/$@' -O '$@'

%.txt:
	wget 'http://unicode.org/Public/UNIDATA/$@' -O '$@'

# Data file consumed by each generated table.
blocks.lua: Blocks.txt
control.lua: DerivedGeneralCategory.txt
aliases.lua: NameAliases.txt
combining.lua: DerivedCombiningClass.txt

# NOTE(review): this pattern rule has no recipe; the names_XXX.lua files are
# by-products of the `names` target above. Confirm whether it is still needed.
names_%.lua: names
aliases.awk
# aliases.awk -- convert NameAliases.txt into a Lua table of name aliases.
#
# Run with -F';'.  Data lines are expected to look like
#     CODEPOINT;ALIAS;TYPE
# Lines that do not start with a hex digit (comments, blank lines) are ignored.
#
# Output: a Lua chunk of the form
#     return { [0xNNNNNN] = { { type, "ALIAS" }; ... }; ... }
# where `type` is a bare identifier referring to one of the locals declared
# at the top of the chunk.

/^[0-9A-F]/ {
	cp = strtonum("0x" $1)
	# An explicit per-code-point counter keeps the aliases in file order;
	# `for (i in array)` would visit them in an unspecified order.
	n = count[cp]++
	alias_name[cp, n] = $2
	alias_type[cp, n] = $3
}

END {
	print "local correction, control, alternate, figment, abbreviation = "
	print "\t\"correction\", \"control\", \"alternate\", \"figment\", \"abbreviation\""
	print ""
	print "return {"
	# Walk the whole code space (U+10FFFF included) so that entries are
	# emitted in ascending code point order.
	for (cp = 0; cp <= 0x10FFFF; ++cp) {
		if (!(cp in count))
			continue
		printf "\t[0x%06X] = {\n", cp
		for (i = 0; i < count[cp]; ++i)
			printf "\t\t{ %12s, \"%s\" };\n", alias_type[cp, i], alias_name[cp, i]
		printf "\t};\n"
	}
	print "}"
}
names.awk
# names.awk -- split UnicodeData.txt into per-page Lua name tables.
#
# Run with -F';'.  Field 1 is the code point in hex, field 2 the name.
# Entries whose name starts with "<" (e.g. "<control>") are skipped.
#
# Each block of 0x1000 code points goes to its own file names_XXX.lua, where
# XXX is the code point divided by 0x1000, as three upper-case hex digits.
# Every file contains
#     return { [0xNNNNNN] = "NAME", ... }
# The input is assumed to be sorted by code point, so each page is contiguous.

BEGIN {
	current = ""     # page currently being written; "" means none yet
}

/^[0-9A-F]/ && $2 !~ /^</ {
	cp = strtonum("0x" $1)
	page = sprintf("%03X", int(cp / 0x1000))
	if (page != current) {
		# Finish the previous page before starting a new file.
		if (current != "") {
			print "}" > outfile
			close(outfile)
		}
		current = page
		outfile = "names_" page ".lua"
		print "return {" > outfile
	}
	printf("\t[0x%06X] = \"%s\",\n", cp, $2) > outfile
}

END {
	# Only close a table if at least one entry was written; with empty
	# input there is no file to terminate.
	if (current != "") {
		print "}" > outfile
		close(outfile)
	}
}
blocks.awk
# blocks.awk -- convert Blocks.txt into the Lua `blocks` table.
#
# Data lines are expected to look like
#     START..END; Block Name
# The field separator splits on ".." and ";" (with surrounding spaces), so
# $1 is the start, $2 the end and $3 the block name.  Lines that do not start
# with a hex digit are ignored.
#
# Output:
#     local blocks = {
#         { 0xSSSSSS, 0xEEEEEE, "Block Name"   },
#         ...
#     }
# with the quoted names padded to a common width.

BEGIN {
	FS = " *(\\.\\.|;) *"
	max_name_len = 0
	count = 0
}

/^[0-9A-F]/ {
	starts[count] = strtonum("0x" $1)
	ends[count]   = strtonum("0x" $2)
	names[count]  = $3
	++count

	# Track the longest name so the output column can be aligned.
	name_length = length($3)
	if (name_length > max_name_len)
		max_name_len = name_length
}

END {
	print ("local blocks = {")
	for (i = 0; i < count; ++i)
		# +2 accounts for the surrounding double quotes.
		printf("\t{ 0x%06X, 0x%06X, %-*s },\n",
			starts[i], ends[i], max_name_len + 2, "\"" names[i] "\"")
	print ("}")
}
combining.awk
# combining.awk -- convert DerivedCombiningClass.txt into a Lua table.
#
# Data lines are expected to look like one of
#     START..END ; CLASS # comment
#     CODEPOINT  ; CLASS # comment
# The field separator splits on "..", ";" and "#", so for a range line
# $1/$2/$3 are start/end/class, while for a single code point $2 is the class
# and $3 is the (non-numeric) comment.  Class 0 entries are omitted.
#
# Output:
#     return {
#         single = { [0xNNNNNN] = CLASS, ... };
#         ranges = { { 0xSSSSSS, 0xEEEEEE, CLASS }, ... };
#     }

BEGIN {
	FS = "\\.\\.| *[;#] *"
}

# Skip comments, blank lines and anything else that is not a data line.
$1 !~ /^[0-9A-F]/ {
	next
}

# Range line: the third field is the numeric combining class.
$3 ~ /^[0-9]+$/ {
	if ($3 != "0") {
		first = strtonum("0x" $1)
		ranges[first] = strtonum("0x" $2)
		kinds[first] = $3
	}
	next
}

# Single code point: the second field is the combining class.
{
	if ($2 != "0")
		singles[strtonum("0x" $1)] = $2
}

END {
	print "return {"
	print "\tsingle = {"
	# Walk the whole code space (U+10FFFF included) for ascending order.
	for (i = 0; i <= 0x10FFFF; ++i) {
		if (i in singles)
			printf("\t\t[0x%06X] = %4s\n", i, singles[i] ",")
	}
	print "\t};"
	print "\tranges = {"
	for (i = 0; i <= 0x10FFFF; ++i) {
		if (i in ranges)
			printf("\t\t{ 0x%06X, 0x%06X, %3s },\n", i, ranges[i], kinds[i])
	}
	print "\t};"
	print "}"
}
control.awk
# control.awk -- extract control and separator characters from
# DerivedGeneralCategory.txt into a Lua table.
#
# Data lines are expected to look like one of
#     START..END ; Gc # comment
#     CODEPOINT  ; Gc # comment
# The field separator splits on "..", ";" and "#", so the category is $3 on a
# range line and $2 on a single-code-point line.
#
# Only the categories that the generated Lua declares as locals are kept
# (Cc, Cf, Cs, Co, Cn, Zs, Zl, Zp).  The match is exact, so that a character
# name beginning with "C" or "Z" in the comment field is never taken for a
# category.
#
# Output:
#     return {
#         single = { [0xNNNNNN] = Gc, ... };
#         ranges = { { 0xSSSSSS, 0xEEEEEE, Gc }, ... };
#     }

BEGIN {
	FS = "\\.\\.| *[;#] *"
}

# Skip comments, blank lines and anything else that is not a data line.
$1 !~ /^[0-9A-F]/ {
	next
}

# Single code point.
$2 ~ /^(C[cfson]|Z[slp])$/ {
	singles[strtonum("0x" $1)] = $2
	next
}

# Range of code points.
$3 ~ /^(C[cfson]|Z[slp])$/ {
	first = strtonum("0x" $1)
	ranges[first] = strtonum("0x" $2)
	kinds[first] = $3
}

END {
	print "local Cc, Cf, Cs, Co, Cn ="
	print "\t\"control\", \"format\", \"surrogate\", \"private-use\", \"unassigned\""
	print "local Zs, Zl, Zp ="
	print "\t\"space-separator\", \"line-separator\", \"paragraph-separator\""
	print ""
	print "return {"
	print "\tsingle = {"
	# Walk the whole code space (U+10FFFF included) for ascending order.
	for (i = 0; i <= 0x10FFFF; ++i) {
		if (i in singles)
			printf("\t\t[0x%06X] = %s,\n", i, singles[i])
	}
	print "\t};"
	print "\tranges = {"
	for (i = 0; i <= 0x10FFFF; ++i) {
		if (i in ranges)
			printf("\t\t{ 0x%06X, 0x%06X, %s },\n", i, ranges[i], kinds[i])
	}
	print "\t};"
	print "}"
}