#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Tbot/code/script

"""
Given a word and language code, return a script template for the en.wikt

The script is guessed from the first character of the word only: its code
point is looked up in the ``Scs`` range table, and the result may then be
replaced by a language-specific template from ``Lsp``.
"""

# table of scripts, each is lowest character code point, highest code + 1, ISO script
Scs = [
    (0x0370, 0x0400, 'Grek'),
    (0x0400, 0x0530, 'Cyrl'),
    (0x0530, 0x0590, 'Armn'),
    (0x0590, 0x0600, 'Hebr'),
    (0x0600, 0x0700, 'Arab'),
    (0x0700, 0x0750, 'Syrc'),
    (0x0750, 0x0780, 'Arab'),
    (0x0900, 0x0980, 'Deva'),
    (0x0980, 0x0A00, 'Beng'),
    (0x0A00, 0x0A80, 'Guru'),
    (0x0A80, 0x0B00, 'Gujr'),
    (0x0B00, 0x0B80, 'Orya'),
    (0x0B80, 0x0C00, 'Taml'),
    (0x0C00, 0x0C80, 'Telu'),
    (0x0C80, 0x0D00, 'Knda'),
    (0x0D00, 0x0D80, 'Mlym'),
    (0x0D80, 0x0E00, 'Sinh'),
    (0x0E00, 0x0E80, 'Thai'),
    (0x0E80, 0x0F00, 'Laoo'),
    (0x0F00, 0x1000, 'Tibt'),
    (0x1000, 0x10A0, 'Mymr'),
    (0x10A0, 0x1100, 'Geor'),
    (0x1100, 0x1200, 'Hang'),  # jamo
    (0x1200, 0x13A0, 'Ethi'),
    (0x13A0, 0x1400, 'Cher'),
    (0x1400, 0x1680, 'Cans'),
    (0x3040, 0x3100, 'Jpan'),
    (0x3400, 0xA000, 'Hani'),  # Han Ext A and Unified
    (0xAC00, 0xD800, 'Hang'),
    (0x20000, 0x2A6D7, 'Hant'),  # Han Ext B, mostly archaic so assume traditional
]

# table of combinations for specific languages that have particular templates
# (key is "<language code>-<script code>", value is the template to use instead)
Lsp = {
    'fa-Arab': 'fa-Arab',
    'ur-Arab': 'ur-Arab',
    'pa-Arab': 'pa-Arab',
    'ku-Arab': 'ku-Arab',
    'grc-Grek': 'polytonic',
    'ja-Hani': 'Jpan',
    'ja-Hant': 'Jpan',
}
# need some more ...

# all recognized script templates, including redirects, which we do not canonicalize
# NOTE(review): 'Sryc' looks like a typo for 'Syrc' (which is added from Scs
# below anyway); kept as-is so nothing previously accepted is rejected - confirm.
Scripts = set([
    'ARchar', 'KUchar', 'FAchar', 'THchar', 'URchar',
    'Arab', 'fa-Arab', 'ur-Arab', 'pa-Arab', 'ku-Arab',
    'polytonic', 'Hebr', 'Beng', 'Hant', 'Hani', 'Jpan',
    'Grek', 'Cyrl', 'Deva', 'Sryc', 'Hang',
    'RUchar', 'JAchar', 'Hayeren',
])

# make sure we have all of those (every script code used in the range table)
Scripts.update([entry[2] for entry in Scs])


def _first_code_point(word):
    """Return the code point of the first character of word, or None.

    When the string starts with a UTF-16 high surrogate, it is combined with
    the following low surrogate into a single code point.  None is returned
    when the high surrogate is not followed by a low surrogate.
    """
    a = ord(word[0])
    if a < 0xd800 or a >= 0xdc00:
        return a  # not a high surrogate: already a complete code point

    # "UTF-16" crap: the character is split over two code units
    if len(word) < 2:
        return None
    b = ord(word[1])
    if b < 0xdc00 or b >= 0xe000:
        return None  # unpaired high surrogate, nothing sensible to decode
    return (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000


def script(word, lc, report = False):
    """Return the script template name for word in language lc.

    word   -- the text to classify; only its first character is examined
    lc     -- language code, used to pick language-specific templates in Lsp
    report -- if true, print a note when a character above U+0370 matches
              no entry of the range table

    Returns the template name, or '' when word is empty, starts with a
    broken surrogate pair, or its first character is in no known range.
    """
    if not word:
        return ''

    a = _first_code_point(word)
    if a is None:
        return ''

    sc = ''
    for low, high, scode in Scs:
        if low <= a < high:
            sc = scode
            break

    if sc:
        # some language + script combinations have their own template
        sc = Lsp.get(lc + '-' + sc, sc)

    if report and not sc and a > 0x0370:
        # single parenthesized argument: prints the same on Python 2 and 3
        print("no match for script for char code %x" % a)

    return sc


def scriptp(sc):
    """Return True if sc is a recognized script template name, else False."""
    return sc in Scripts