#!/usr/bin/python
# -*- coding: utf-8 -*-
# ja-kana2API.py
# Author : Fabrice TIERCELIN
# Date : 2008-02-16
# Version : 1.2
# Licence : GPL
"""Convert Japanese words written in kana into a phonetic transcription.

kana2API returns the API pronunciation of the given word in kana (either
hiragana or katakana).  ("API" is presumably the French abbreviation for the
International Phonetic Alphabet -- confirm with the author.)

How it works:
The mechanism is based on the big characters (for instance:
いらっしゃる -> いらしる) because the pronunciations of those characters are
all separated by dots.  Each iteration creates one syllable pronunciation.

1. The pronunciation of each big character is divided into three parts:
   - the first consonant (if any)
   - the second consonant (if any)
   - the vowel
2. For the first consonant there are several pronunciations, depending on
   the context: a ん before, or a っ before.  For the vowel there are several
   pronunciations too: a ゃ or similar after, or no small character around.
3. Last, we check whether there is any ん or small character around and we
   choose the matching pronunciation.

NOTE(review): the copy of this script that this revision was made from had
lost every bracketed expression (all kana lists and all index/slice
operations).  The control flow and the emitted symbols follow that copy; the
kana tables below were rebuilt from the standard kana chart and MUST be
checked against the author's original lists.
"""

import io
import re
import sys
import time


def _index(groups):
    """Return a {kana: value} dict built from (value, kana-string) pairs."""
    table = {}
    for value, kana_chars in groups:
        for kana_char in kana_chars:
            table[kana_char] = value
    return table


# Characters that modify the following big character.
_SMALL_TSU = frozenset(u'っッ')
_MORAIC_N = frozenset(u'んン')
_LONG_MARK = u'ー'

# Pronunciation of ん depending on the big character that follows it; any
# character not listed gets u'n'.
# NOTE(review): reconstructed grouping.  In particular the symbol ɱ comes from
# the original script, but which kana triggered it is a guess (m/b/p rows).
_N_BEFORE = _index((
    (u'ɴ', u'あいうえおやゆよわをアイウエオヤユヨワヲ'),
    (u'ŋ̩', u'かきくけこがぎぐげごカキクケコガギグゲゴ'),
    (u'ɱ', u'まみむめもばびぶべぼぱぴぷぺぽマミムメモバビブベボパピプペポ'),
))

# First consonant of each big character (no entry: no consonant).
# NOTE(review): reconstructed; じ/ジ is filed under z and を/ヲ has no
# consonant -- both are assumptions.
_FIRST_CONSONANT = _index((
    (u'k', u'かきくけこカキクケコ'),
    (u'g', u'がぎぐげごガギグゲゴ'),
    (u's', u'さすせそサスセソ'),
    (u'ʃ', u'しシ'),
    (u'z', u'ざじずぜぞザジズゼゾ'),
    (u'd', u'だぢづでどダヂヅデド'),
    (u't', u'たちつてとタチツテト'),
    (u'n', u'なにぬねのナニヌネノ'),
    (u'h', u'はひへほハヒヘホ'),
    (u'ɸ', u'ふフ'),
    (u'b', u'ばびぶべぼバビブベボ'),
    (u'p', u'ぱぴぷぺぽパピプペポ'),
    (u'm', u'まみむめもマミムメモ'),
    (u'j', u'やゆよヤユヨ'),
    (u'r', u'らりるれろラリルレロ'),
    (u'w', u'わワ'),
))

# Second consonant, for the affricates (ち -> tʃ, つ -> ts, ぢ/づ -> dz).
# NOTE(review): reconstructed.
_SECOND_CONSONANT = _index((
    (u'ʃ', u'ちチ'),
    (u's', u'つツ'),
    (u'z', u'ぢづヂヅ'),
))

# Vowel of each big character.  The small vowels are listed too so that a
# stray small vowel (one that could not be attached) is still pronounced.
_VOWEL = _index((
    (u'a', u'あかがさざただなはばぱまやらわぁアカガサザタダナハバパマヤラワァ'),
    (u'i', u'いきぎしじちぢにひびぴみりぃイキギシジチヂニヒビピミリィ'),
    (u'ɯ', u'うくぐすずつづぬふぶぷむゆるぅウクグスズツヅヌフブプムユルゥ'),
    (u'e', u'えけげせぜてでねへべぺめれぇエケゲセゼテデネヘベペメレェ'),
    (u'o', u'おこごそぞとどのほぼぽもよろをぉオコゴソゾトドノホボポモヨロヲォ'),
))

# Bare vowels that are nasalized when they directly follow ん.
# NOTE(review): reconstructed (assumed to be the bare あ and お only).
_NASALIZED = _index((
    (u'ã', u'あア'),
    (u'õ', u'おオ'),
))

# Small characters and the vowel they contribute.
_SMALL_SOUND = _index((
    (u'a', u'ゃぁャァ'),
    (u'i', u'ぃィ'),
    (u'ɯ', u'ゅぅュゥ'),
    (u'e', u'ぇェ'),
    (u'o', u'ょぉョォ'),
))
# ゃ/ゅ/ょ attach to whatever big character precedes them ...
_SMALL_Y = frozenset(u'ゃゅょャュョ')
# ... whereas small vowels attach only after these carriers (シェ, ティ, ファ…).
# NOTE(review): reconstructed list of carriers.
_SMALL_VOWELS = frozenset(u'ぁぃぅぇぉァィゥェォ')
_ATTACHES_SMALL_VOWEL = frozenset(u'しじちつふてでとどシジチツフテデトド')
# Big characters whose own vowel simply disappears before a small character
# (their consonant already carries the sound: しゃ -> ʃa, ファ -> ɸa).
_ELIDES_VOWEL = frozenset(u'しちつふてでとどシチツフテデトド')

# A following い either lengthens the vowel (after i/e) or closes a diphthong.
_I_KANA = frozenset(u'いイ')


def pop(kana):
    """Split *kana* into its first character and the rest.

    Returns a ``(char, remainingKana)`` tuple; both are empty strings when
    *kana* is empty.
    """
    return kana[:1], kana[1:]


def kana2API(kana):
    """Return the phonetic transcription of the kana word *kana*.

    Syllables are separated by dots.  Characters that are not in the kana
    tables contribute no sound of their own (only a syllable separator).
    An empty word yields an empty string.
    """
    remaining = kana
    parts = []
    # There is no dot before the first syllable.
    dot = u''

    while remaining:
        geminated = False
        after_n = False
        char, remaining = pop(remaining)

        # First, check whether a っ or a ん precedes the big character; if so
        # remember it and move on to the big character itself.
        if char in _SMALL_TSU:
            if not remaining:
                break  # A trailing っ has nothing to double.
            geminated = True
            char, remaining = pop(remaining)
        elif char in _MORAIC_N:
            if not remaining:
                parts.append(u'ɴ')  # Word-final ん.
                break
            after_n = True
            char, remaining = pop(remaining)

        # 1. The big character
        first = _FIRST_CONSONANT.get(char, u'')
        second = _SECOND_CONSONANT.get(char, u'')
        vowel = _VOWEL.get(char, u'')
        plain_vowel = vowel
        if after_n:
            vowel = _NASALIZED.get(char, vowel)

        # Vowel to use when a small character follows.
        # NOTE(review): the damaged source had the u'j' glide commented out
        # here (leaving the vowel empty for some lost list of kana).  The
        # glide is kept for the non-sibilant i-column kana so that きゃ (kja)
        # stays distinct from か (ka) -- confirm this is the wanted output.
        if char in _ELIDES_VOWEL:
            before_small = u''
        elif plain_vowel == u'i':
            before_small = u'j'
        else:
            before_small = vowel

        # 2. The following small character
        has_small = False
        long_vowel = False
        long_small = False
        small = u''
        second_vowel = u''
        if remaining:
            following = remaining[0]
            if following in _SMALL_Y or (
                    char in _ATTACHES_SMALL_VOWEL
                    and following in _SMALL_VOWELS):
                has_small = True
                small = _SMALL_SOUND.get(following, u'')
            elif following == _LONG_MARK:
                long_vowel = True
            if has_small or long_vowel:
                remaining = remaining[1:]
        if has_small and remaining and remaining[0] == _LONG_MARK:
            long_small = True
            remaining = remaining[1:]
        if remaining and remaining[0] in _I_KANA:
            if (not long_vowel and not has_small
                    and plain_vowel in (u'i', u'e')):
                second_vowel = u'ː'
            else:
                second_vowel = u'j'
            remaining = remaining[1:]

        # 3. Pronunciation writing
        # 3.1. consonants
        if after_n:
            # ん closes the previous syllable, so the dot comes after it.
            parts.append(_N_BEFORE.get(char, u'n') + u'.' + first + second)
        elif geminated:
            # っ doubles the first consonant across the syllable boundary.
            parts.append(first + u'.' + first + second)
        else:
            parts.append(dot + first + second)

        # 3.2. vowels
        if has_small:
            length = u'ː' if long_small else u''
            parts.append(before_small + small + length + second_vowel)
        elif long_vowel:
            parts.append(vowel + u'ː' + second_vowel)
        else:
            parts.append(vowel + second_vowel)

        # From now on a dot is always needed to separate syllables.
        dot = u'.'

    return u''.join(parts)


def main():
    """Transcribe every line of ./japaneseWords.txt into a timestamped file.

    Each output line is ``<word>TAB/<pronunciation>/`` terminated by CRLF.
    Both files are read and written as UTF-8.
    """
    with io.open('./japaneseWords.txt', 'r', encoding='utf-8') as words_file:
        words = [line.strip(u'\r\n ') for line in words_file]

    now = time.localtime()
    filename = ('./pronunciations-' + str(now.tm_hour) + '-'
                + str(now.tm_min) + '-' + str(now.tm_sec) + '.txt')

    # newline='' keeps the explicit CRLF line endings untouched on every OS.
    with io.open(filename, 'w', encoding='utf-8', newline='') as output_file:
        for word in words:
            output_file.write(word + u'\t/' + kana2API(word) + u'/')
            output_file.write(u'\r\n')


if __name__ == "__main__":
    main()