In DICTIOUS you will not only get to know all the dictionary meanings for this entry,
but we will also tell you about its etymology and its characteristics, and you will know how to say
it in singular and plural. Everything you need to know about this entry
you have here. Its definition
will help you to be more precise and correct when speaking or writing your texts. Knowing the definition of
this entry, as well as those of other words, enriches your vocabulary and provides you with more and better linguistic resources.
// To compile: javac -encoding UTF-8 BaxterSagartWikitableBuilder.java
// To run: java BaxterSagartWikitableBuilder
// Output: Baxter-Sagart wikitable.txt
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.zip.*;
public class BaxterSagartWikitableBuilder {
public static final String baxterSagartURL =
"http://crlao.ehess.fr/docannexe.php?id=1221";
public static final String unihanURL =
"http://www.unicode.org/Public/UNIDATA/Unihan.zip";
public static final String outFile = "Baxter-Sagart wikitable.txt";
public static void main(String args) {
try { main(); }
catch (Exception e) {
e.printStackTrace();
System.exit(1);
} System.exit(0);
}
public static long fetchDate;
public static File baxterSagartFile;
public static File unihanFile;
public static TreeMap<String,String> scMap;
public static PrintWriter writer;
public static void main() throws Exception {
// Remembering the exact time we fetched the online data.
fetchDate = System.currentTimeMillis();
// Retrieving Baxter-Sagart data.
baxterSagartFile = download(baxterSagartURL);
// Retrieving Unihan data.
unihanFile = download(unihanURL);
// Processing Unihan data.
// We need this for mappings of
// Traditional Chinese characters to
// Simplified Chinese characters.
processUnihan();
// Beginning to write out wikitable file.
writer = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new BufferedOutputStream(
new FileOutputStream(outFile)
), "UTF-8"
)
)
);
writer.print(
"== Data ==\n" +
//":''This section is software-generated. The program's Java source code is ].''\n" +
"This table incorporates data from:\n" +
"* The .\n" +
"* ] and " +
"] (n.d.) " +
"Baxter-Sagart Old Chinese reconstruction (Version 1.00). " +
"Online at http://crlao.ehess.fr/document.php?id=1217 . Accessed "
);
// Printing the fetch date.
// We don't care about deprecated API. This works well enough.
writer.print(new Date(fetchDate).toGMTString());
writer.print(
".\n" +
"Legend of table headers:\n" +
"* '''TC''': {{w|Traditional Chinese}} character.\n" +
"* '''SC''': {{w|Simplified Chinese}} character.\n" +
"* '''PY''': ] {{w|Pinyin}} romanization.\n" +
"* '''MC''': {{w|Middle Chinese}} reconstruction.\n" +
"* '''MCI''': Middle Chinese initial.\n" +
"* '''MCF''': Middle Chinese final.\n" +
"* '''MCT''': Middle Chinese tone.\n" +
"** A = even tone (平聲).\n" +
"** B = rising tone (上聲).\n" +
"** C = departing tone (去聲).\n" +
"** D = entering tone (入聲).\n" +
"* '''OC''': {{w|Old Chinese}} reconstruction.\n" +
"* '''Gloss''': Word's meaning.\n" +
"{| class=\"wikitable sortable\"\n" +
"|-\n" +
"! TC\n" +
"! SC\n" +
"! PY\n" +
"! MC\n" +
"! MCI\n" +
"! MCF\n" +
"! MCT\n" +
"! OC\n" +
"! Gloss\n"
);
// Processing Baxter-Sagart data, and writing to file.
processBaxterSagart();
// Close the wikitable.
writer.print("|}\n");
// And we're done.
writer.flush();
writer.close();
}
public static File download(String url) throws Exception {
File file;
InputStream in;
OutputStream out;
byte bytes;
int read;
// Create the temp file.
// We don't care where it's stored or what its name is.
file = File.createTempFile("" + url.hashCode(), null);
// The file will be deleted when execution finishes.
file.deleteOnExit();
System.out.println("Downloading: " + url);
System.out.println("This may take a while...");
// Opening an HTTP connection and securing an input stream.
in = new URL(url).openStream();
// Buffering the input stream, if not already buffered.
if (!in.markSupported())
in = new BufferedInputStream(in);
// Opening an output stream to the temp file.
out = new FileOutputStream(file);
// Buffering the output stream.
out = new BufferedOutputStream(out);
// 4K read/write buffer.
bytes = new byte;
// Read/write loop.
for (;;) {
read = in.read(bytes);
if (read < 0) // EOF
break;
if (read > 0) // Have some data.
out.write(bytes, 0, read);
}
// Closing input stream.
in.close();
// Flushing and closing output stream.
out.flush();
out.close();
return file;
}
public static void processUnihan() throws Exception {
ZipInputStream zin;
ZipEntry entry;
String filename;
InputStream in;
BufferedReader reader;
String line;
String tokens;
String fieldType;
String traditional;
String simplified;
// Create traditional-to-simplified map data structure.
scMap = new TreeMap<String,String>();
// Opening zip file.
zin = new ZipInputStream(
new BufferedInputStream(
new FileInputStream(unihanFile)
)
);
// Searching for the right zip entry.
for (;;) {
entry = zin.getNextEntry();
if (entry == null) {
zin.close();
throw new RuntimeException(
"Can't find Unihan_Variants.txt.");
}
filename = entry.getName();
if (filename.endsWith("Unihan_Variants.txt")) {
// We found what we're looking for.
break;
}
// This isn't the zip entry we're looking for.
entry = null;
zin.closeEntry();
}
// We don't need this anymore.
entry = null;
in = zin;
// Buffering the entry's input stream, if not already buffered.
if (!in.markSupported())
in = new BufferedInputStream(in);
// Creating a UTF-8 input stream reader.
reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
// Looping through the data, gleaning only what we need.
for (;;) {
// Reading a line of text.
line = reader.readLine();
if (line == null) // EOF
break;
// Stripping comments from the line.
line = line.replaceFirst("#.*$", "");
// Stripping trailing whitespace from the line.
line = line.trim();
// Skipping empty lines.
if (line.length() == 0)
continue;
// Split line by tab characters.
tokens = line.split("\t");
// There should be at least three tokens.
if (tokens.length < 3)
continue; // Skip the line.
// Determine if this line has data we're looking for.
fieldType = tokens.trim();
if (!fieldType.equalsIgnoreCase("kSimplifiedVariant"))
continue; // Skip the line.
// Traditional and simplified Chinese characters.
traditional = fromUnicodeNotation(tokens);
simplified = fromUnicodeNotation(tokens);
// If by chance they are the same, skip them.
if (traditional.equals(simplified))
continue; // Skip the line.
// We found something we're looking for.
scMap.put(traditional, simplified);
}
// We're done with the zip file.
zin.close();
in = null;
zin = null;
// Rebranch the finished map for improved access speed.
scMap = new TreeMap<String,String>(scMap);
}
public static void processBaxterSagart() throws Exception {
BufferedReader reader;
boolean firstLine;
String line;
String tokens;
// Reading Baxter-Sagart database from temp file.
reader = new BufferedReader(
new InputStreamReader(
new BufferedInputStream(
new FileInputStream(baxterSagartFile)
), "UTF-8"
)
);
// We will skip the first non-empty line when we reach it.
firstLine = true;
// Looping through each line.
for (;;) {
// Reading a line of text.
line = reader.readLine();
if (line == null) // EOF
break;
// Trimming trailing whitespace.
line = line.trim();
// Skipping empty lines.
if (line.length() == 0)
continue;
// Skipping the first line, which is a table header.
if (firstLine) {
firstLine = false;
continue;
}
// Split line by tab characters.
tokens = line.split("*\t\\s*");
// There should be at least eleven tokens.
if (tokens.length < 11)
continue;
// Process tokens in another function.
processBaxterSagart(tokens, tokens, tokens,
tokens, tokens, tokens, tokens,
tokens, tokens, tokens, tokens);
}
// Closing the reader.
reader.close();
}
public static void processBaxterSagart(
String tc, String py, String py2,
String mc, String mci, String mcf, String mct,
String oc, String gloss, String gst, String utf16
) throws Exception {
String s;
// Begin the new table row.
writer.print("|-\n");
// Traditional Chinese character.
writer.print('|');
if (tc.length() > 0) {
writer.print("lang=zh-Hant|[[");
printEscaped(tc);
writer.print("]]");
} writer.print('\n');
// Simplified Chinese character.
writer.print('|');
if (tc.length() > 0) {
s = scMap.get(tc);
if (s == null || s.length() == 0)
s = tc;
writer.print("lang=zh-Hans|[[");
printEscaped(s);
writer.print("]]");
} writer.print('\n');
// Pinyin, sortable.
writer.print('|');
if (py.length() > 0) {
py2 = pinyinToSortable(py);
if (!py.equals(py2)) {
writer.print("<span style=display:none>");
printEscaped(py2);
writer.print(" </span>");
} writer.print("[[");
printEscaped(py);
writer.print("#Mandarin|");
printEscaped(py);
writer.print("]]");
} writer.print('\n');
// Middle Chinese, sortable.
writer.print('|');
if (mc.length() > 0) {
mc = middleChineseToUnicode(mc);
s = middleChineseToSortable(mc);
if (!mc.equals(s)) {
writer.print("<span style=display:none>");
printEscaped(s);
writer.print(" </span>");
} printEscaped(mc);
} writer.print('\n');
if (mcf.startsWith("-r")) { // misplaced
mcf = "-" + mcf.substring(2);
mci = mci.substring(0, mci.length() - 1) + "r-";
}
// Middle Chinese initial, sortable.
writer.print('|');
if (mci.length() > 0) {
mci = middleChineseToUnicode(mci);
s = middleChineseToSortable(mci);
if (!mci.equals(s)) {
writer.print("<span style=display:none>");
printEscaped(s);
writer.print(" </span>");
} printEscaped(mci);
} writer.print('\n');
// Middle Chinese final, sortable.
writer.print('|');
if (mcf.length() > 0) {
mcf = middleChineseToUnicode(mcf);
s = middleChineseToSortable(mcf);
if (!mcf.equals(s)) {
writer.print("<span style=display:none>");
printEscaped(s);
writer.print(" </span>");
} printEscaped(mcf);
} writer.print('\n');
// Middle Chinese tone, sortable.
writer.print('|');
if (mct.length() > 0) {
switch (mct.charAt(0)) {
case 'A':
writer.print("<span style=display:none>A</span>even"); break;
case 'B':
writer.print("<span style=display:none>B</span>rising"); break;
case 'C':
writer.print("<span style=display:none>C</span>departing"); break;
case 'D':
writer.print("<span style=display:none>D</span>entering"); break;
default: printEscaped(mct);
}
} writer.print('\n');
// Old Chinese, semi-sortable.
writer.print('|');
if (oc.length() > 0) {
writer.print("class=IPA|");
oc = oldChineseToUnicode(oc);
s = oldChineseToSortable(oc);
if (!oc.equals(s)) {
writer.print("<span style=display:none>");
printEscaped(s);
writer.print(" </span>");
} printEscaped(oc);
} writer.print('\n');
// Gloss, semi-sortable
writer.print('|');
if (gloss.length() > 0)
printEscaped(gloss);
writer.print('\n');
}
public static String pinyinToSortable(String string) {
string = string.replaceFirst("(.*)$", "$1"+"1");
string = string.replaceFirst("(.*)$", "$1"+"2");
string = string.replaceFirst("(.*)$", "$1"+"3");
string = string.replaceFirst("(.*)$", "$1"+"4");
string = string.replaceFirst("", "a");
string = string.replaceFirst("", "e");
string = string.replaceFirst("", "i");
string = string.replaceFirst("", "o");
string = string.replaceFirst("", "u");
string = string.replaceFirst("", "v");
return string;
}
public static String middleChineseToUnicode(String string) {
// Converting ASCII-friendly version to Unicode.
string = string.replace('\'', 'ʔ');
string = string.replace("ae", "æ");
string = string.replace("ea", "ɛ");
string = string.replace('+', 'ɨ');
return string;
}
public static String middleChineseToSortable(String string) {
// Dashes are not needed in sorting.
string = string.replace("-", "");
// 'ʔ' < letters
string = string.replace('ʔ', '\'');
// 'a' < 'æ' < 'b'
string = string.replace("æ", "a~");
// 'd' < 'ɛ' < 'e'
string = string.replace("e", "e~");
string = string.replace('ɛ', 'e');
// 'h' < 'ɨ' < 'i'
string = string.replace("i", "i~");
string = string.replace('ɨ', 'i');
// rising tone is second tone
string = string.replace('X', '2');
// departing tone is third tone
string = string.replace('H', '3');
return string;
}
public static String oldChineseToUnicode(String string) {
// Streamlining devoicing diacritics.
string = string.replaceAll("+", "̥");
string = string.replace("ŋ̥", "ŋ̊");
// Streamlining pharyngealization diacritics.
string = string.replace('ˤ', 'ˁ');
string = string.replaceAll("(\\*(\\ə?)?\\[?)g", "$1ɡ");
return string;
}
public static String oldChineseToSortable(String string) {
string = string.replace('ɡ', 'g');
// Temporarily converting "ts" and "dz".
string = string.replace("ts", "ʦ");
string = string.replace("dz", "ʣ");
// Making loosely-bound prefix schwas sort-neutral.
string = string.replaceAll("ə()", "$1");
// Stripping lots of sort-neutral stuff.
string = string.replaceAll(
"\\{\\}]", "");
// space < 'C' < 'N' < 'ʔ' < letters
string = string.replace('C', '$');
string = string.replace('N', '%');
string = string.replace('ʔ', '\'');
// 'd' < 'dz' < 'ə' < 'e'
string = string.replace("ʣ", "d~");
string = string.replace("e", "e~");
string = string.replace('ə', 'e');
// 'g' < 'ɢ' < 'h'
string = string.replace("ɢ", "g~");
// 'l' < 'l̥' < 'm' < 'm̥' < 'n' < 'n̥' < 'ŋ' < 'ŋ̊' < 'o'
// 'r' < 'r̥' < 's'
string = string.replaceAll("+", "~");
string = string.replace("ŋ", "n~~");
// 't' < 'ts' < 'u'
string = string.replace("ʦ", "t~");
// letters < 'ˁ' < 'ʰ' < 'ʷ'
string = string.replace("ˁ", "z");
string = string.replace("ʰ", "z~");
string = string.replace("ʷ", "z~~");
return string;
}
public static String fromUnicodeNotation(String string)
throws Exception {
int code;
StringBuilder builder;
// Stripping everything after a certain point.
string = string.replaceFirst(".*$", "");
// Stripping all non-hexadecimal characters.
string = string.replaceAll("", "");
// Parsing hexadecimal number.
code = Integer.parseInt(string, 16);
// Converting the code point to a string and returning it.
builder = new StringBuilder(4);
try {
builder.appendCodePoint(code);
} catch (IllegalArgumentException e) { System.out.println(string + ", " + code); throw e; }
return builder.toString().intern();
}
public static void printEscaped(String string) {
int length, index;
String substit;
char ch;
length = string.length();
for (index = 0; index < length; index++) {
substit = null;
ch = string.charAt(index);
switch (ch) {
case '&': substit = "&"; break;
case '<': substit = "‹"; break;
case '>': substit = "›"; break;
case '\"': substit = """; break;
case '\'': substit = "&#" + (int)'\'' + ";"; break;
case '[': substit = "&#" + (int)'[' + ";"; break;
case ']': substit = "&#" + (int)']' + ";"; break;
case '{': substit = "&#" + (int)'{' + ";"; break;
case '}': substit = "&#" + (int)'}' + ";"; break;
case '|': substit = "&#" + (int)'|' + ";"; break;
}
if (substit != null)
writer.print(substit);
else writer.print(ch);
}
}
}