Utilisateur:Ftiercel/ja-kana2API.py

ja-kana2API.py
#!/usr/bin/python
# -*- coding: utf-8  -*-

# Author : Fabrice TIERCELIN
# Date : 2008-02-16
# Version : 1.2
# Licence : GPL

import sys
import re
import time

# kana2API returns the IPA pronunciation ("API" is the French abbreviation of
# IPA, the International Phonetic Alphabet) of the given word written in kana
# (either hiragana or katakana).
#
# How it works:
# The mechanism is driven by the full-size ("big") characters
# (for instance: いらっしゃる -> いらしる),
# because the pronunciations of those characters are all separated by dots.
# Each iteration produces the pronunciation of one syllable.
# The pronunciation of each big character is divided into three parts (1):
#  - the first consonant (if any)
#  - the second consonant (if any)
#  - the vowel
# The first consonant has several pronunciations, depending on the context (2):
#  - there is a ん before
#  - there is a っ before
# The vowel has several pronunciations too (2):
#  - there is a ゃ or a similar small character after
#  - there is no small character around
# Last, we check whether there is any ん or small character around and we choose the right pronunciation (3).
    
def pop(kana):
    """Split *kana* into its first character and everything after it.

    Args:
        kana: the (unicode) string to split.

    Returns:
        A ``(char, remainingKana)`` tuple: ``char`` is the first character
        of *kana* and ``remainingKana`` is the rest. Both are empty strings
        when *kana* is empty.
    """
    # Slicing (rather than indexing) keeps the empty string from raising.
    return kana[:1], kana[1:]
    
def _kanaTable(rows):
    """Flatten (symbol, kana characters) pairs into a {kana: symbol} lookup."""
    table = {}
    for symbol, kanaChars in rows:
        for kanaChar in kanaChars:
            table[kanaChar] = symbol
    return table


# NOTE(review): the kana lists of the original membership tests are missing
# from this copy of the script (every test read "char in :"). The tables below
# are a reconstruction: each output symbol is mapped to the gojuon row or
# column it conventionally transcribes. Confirm them against the upstream
# script before relying on the exact transcription.

_SOKUON = u'っッ'
_MORAIC_N = u'んン'
_LONG_VOWEL_MARK = u'ー'
_GLIDE_I = u'いイ'

# Realisation of ん depending on the kana that follows it; anything not
# listed here gets a plain 'n'.
_N_BEFORE = _kanaTable([
    (u'ɴ', u'あいうえおやゆよわをアイウエオヤユヨワヲ'),
    (u'ŋ̩', u'かきくけこがぎぐげごカキクケコガギグゲゴ'),
    # NOTE(review): assumed to be the labial rows (m, b, p) -- TODO confirm.
    (u'ɱ', u'まみむめもばびぶべぼぱぴぷぺぽマミムメモバビブベボパピプペポ'),
])

# First consonant of each big character (bare vowels and を have none).
_FIRST_CONSONANT = _kanaTable([
    (u'k', u'かきくけこカキクケコ'),
    (u'g', u'がぎぐげごガギグゲゴ'),
    (u's', u'さすせそサスセソ'),
    (u'ʃ', u'しシ'),
    (u'z', u'ざじずぜぞザジズゼゾ'),
    (u'd', u'だぢづでどダヂヅデド'),
    (u't', u'たちつてとタチツテト'),
    (u'n', u'なにぬねのナニヌネノ'),
    (u'h', u'はひへほハヒヘホ'),
    (u'ɸ', u'ふフ'),
    (u'b', u'ばびぶべぼバビブベボ'),
    (u'p', u'ぱぴぷぺぽパピプペポ'),
    (u'm', u'まみむめもマミムメモ'),
    (u'j', u'やゆよヤユヨ'),
    (u'r', u'らりるれろラリルレロ'),
    (u'w', u'わワ'),
])

# Second consonant, for the affricates (t + ʃ, t + s, d + z).
_SECOND_CONSONANT = _kanaTable([
    (u'ʃ', u'ちチ'),
    (u's', u'つツ'),
    (u'z', u'ぢづヂヅ'),
])

# Vowel of each big character, by gojuon column.
_VOWEL = _kanaTable([
    (u'a', u'あかがさざただなはばぱまやらわアカガサザタダナハバパマヤラワ'),
    (u'i', u'いきぎしじちぢにひびぴみりイキギシジチヂニヒビピミリ'),
    (u'ɯ', u'うくぐすずつづぬふぶぷむゆるウクグスズツヅヌフブプムユル'),
    (u'e', u'えけげせぜてでねへべぺめれエケゲセゼテデネヘベペメレ'),
    (u'o', u'おこごそぞとどのほぼぽもよろをオコゴソゾトドノホボポモヨロヲ'),
])

# Bare vowels that are nasalised when they directly follow ん.
_NASALISED_VOWEL = _kanaTable([
    (u'ã', u'あア'),
    (u'õ', u'おオ'),
])

# Big characters whose own vowel is dropped before a small character.
# NOTE(review): assumed to be the kana whose consonant already carries the
# sound (し, ち) plus the usual carriers of small vowels (ふ, て, で) -- TODO
# confirm.
_VOWEL_DROPPED_BEFORE_SMALL_CHAR = u'しちふてでシチフテデ'

# Vowel contributed by a small character that follows a big one.
_SMALL_CHAR_VOWEL = _kanaTable([
    (u'a', u'ゃぁャァ'),
    (u'i', u'ぃィ'),
    (u'ɯ', u'ゅぅュゥ'),
    (u'e', u'ぇェ'),
    (u'o', u'ょぉョォ'),
])


def kana2API(kana):
    """Return the IPA ("API") transcription of a word written in kana.

    The word is consumed one syllable at a time. A syllable is made of an
    optional leading っ or ん, one big character, an optional small character
    or long-vowel mark (ー), and an optional following い. Syllables are
    separated by dots in the result.

    Args:
        kana: the word, as a unicode string of hiragana and/or katakana.
            Characters that are in none of the tables contribute no sound
            (only the syllable separator).

    Returns:
        The transcription as a unicode string; empty for an empty word.
    """
    parts = []
    # There is no dot before the first syllable.
    dot = u''
    index = 0
    length = len(kana)

    while index < length:
        char = kana[index]
        index += 1
        hasSmallTsuBefore = False
        hasNBefore = False

        # First, we check if there is any っ or ん before a big character.
        # We note it and then move on to the big character itself.
        if char in _SOKUON:
            if index >= length:
                # A trailing っ has no consonant to double: nothing to write.
                break
            hasSmallTsuBefore = True
            char = kana[index]
            index += 1
        elif char in _MORAIC_N:
            if index >= length:
                # A word-final ん closes the previous syllable.
                parts.append(u'ɴ')
                break
            hasNBefore = True
            char = kana[index]
            index += 1

        # 1. The big character
        firstConsonant = _FIRST_CONSONANT.get(char, u'')
        secondConsonant = _SECOND_CONSONANT.get(char, u'')
        vowelNormal = _VOWEL.get(char, u'')
        if hasNBefore and char in _NASALISED_VOWEL:
            vowelNormal = _NASALISED_VOWEL[char]

        if char in _VOWEL_DROPPED_BEFORE_SMALL_CHAR:
            vowelBeforeSmallChar = u''
        else:
            vowelBeforeSmallChar = vowelNormal

        # 2. The following small character
        # NOTE(review): the original also had a second, base-dependent test
        # here whose lists are lost; every small kana is treated as attaching
        # to the preceding big character instead.
        hasLongVowel = False
        hasSmallCharAfter = False
        hasLongSmallCharAfter = False
        smallChar = u''
        secondVowel = u''

        if index < length:
            nextChar = kana[index]
            if nextChar in _SMALL_CHAR_VOWEL:
                hasSmallCharAfter = True
                smallChar = _SMALL_CHAR_VOWEL[nextChar]
                index += 1
                # A long-vowel mark may follow the small character.
                if index < length and kana[index] == _LONG_VOWEL_MARK:
                    hasLongSmallCharAfter = True
                    index += 1
            elif nextChar == _LONG_VOWEL_MARK:
                hasLongVowel = True
                index += 1

        # A following い stays in the same syllable: it lengthens a plain
        # i-vowel and is written as the glide 'j' everywhere else.
        if index < length and kana[index] in _GLIDE_I:
            if (not hasLongVowel and not hasSmallCharAfter
                    and vowelNormal == u'i'):
                secondVowel = u'ː'
            else:
                secondVowel = u'j'
            index += 1

        # 3. Pronunciation writing
        # 3.1. consonants
        if hasNBefore:
            # ん closes the previous syllable, so it is written before the dot.
            parts.append(_N_BEFORE.get(char, u'n') + u'.'
                         + firstConsonant + secondConsonant)
        elif hasSmallTsuBefore:
            # っ doubles the first consonant across the syllable boundary.
            parts.append(firstConsonant + u'.'
                         + firstConsonant + secondConsonant)
        else:
            parts.append(dot + firstConsonant + secondConsonant)

        # 3.2. vowels
        if hasLongSmallCharAfter:
            parts.append(vowelBeforeSmallChar + smallChar + u'ː' + secondVowel)
        elif hasSmallCharAfter:
            parts.append(vowelBeforeSmallChar + smallChar + secondVowel)
        elif hasLongVowel:
            parts.append(vowelNormal + u'ː' + secondVowel)
        else:
            parts.append(vowelNormal + secondVowel)

        # Now, we always need a dot to separate syllables.
        dot = u'.'

    return u''.join(parts)

if __name__ == "__main__":
    # Batch driver: reads one kana word per line from ./japaneseWords.txt and
    # writes "<word>\t/<pronunciation>/" lines to a time-stamped output file.
    import io

    # Text is decoded from UTF-8 once, while reading, so the rest of the
    # script only handles unicode strings.
    with io.open('./japaneseWords.txt', 'r', encoding='utf-8') as ftin:
        wordList = ftin.readlines()

    now = time.localtime()
    filename = './pronunciations-' + str(now.tm_hour) + '-' + str(now.tm_min) + '-' + str(now.tm_sec) + '.txt'

    # newline='' keeps the explicit "\r\n" line endings from being translated.
    with io.open(filename, 'w', encoding='utf-8', newline='') as outputFile:
        for rawWord in wordList:
            # Drop the line terminator and surrounding spaces.
            decodedWord = rawWord.strip(u'\r\n ')
            pron = kana2API(decodedWord)
            outputFile.write(decodedWord + u'\t/' + pron + u'/')
            outputFile.write(u"\r\n")
Utilisateur:Ftiercel/ja-kana2API.py

Wikious

Sapientia

Scientia

Boobota

Sagapedia

Wikithot