Fix a bug found in Makefile.py.
[lhc/web/wiklou.git] / includes / zhtable / Makefile.py
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3 # @author Philip
4 import tarfile, zipfile
5 import os, re, shutil, sys, platform
6
# Interpreter version string, e.g. '2.7.18' or '3.1.2' — drives the compat shims below.
pyversion = platform.python_version()
# True when running on Linux (used to pick wget over urlretrieve).
# NOTE(review): the trailing "or False" is redundant — the comparison is already boolean.
islinux = platform.system().lower() == 'linux' or False
9
# Version-compatibility shims: pick a urllib module, a unicode-aware open()
# and a wide-codepoint chr() that work on both Python 2.5-2.7 and Python 3.
if pyversion[:3] in ['2.5', '2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    # codecs.open accepts an encoding= argument on Python 2, like py3 open().
    uniopen = codecs.open
    def unichr2(i):
        # Return the character for codepoint *i*.
        # On wide builds (or for BMP codepoints) unichr() works directly;
        # on narrow Python 2 builds, supplementary-plane codepoints must be
        # emitted as a UTF-16 surrogate pair (0xD7C0 = 0xD800 - 0x10000>>10).
        if sys.maxunicode >= 0x10000 or i < 0x10000:
            return unichr(i)
        else:
            return unichr(0xD7C0+(i>>10)) + unichr(0xDC00+(i&0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    # Python 3 open() and chr() already handle encodings and full Unicode.
    uniopen = open
    unichr2 = chr
23
# DEFINE
SF_MIRROR = 'easynews'        # SourceForge mirror host used for all downloads
SCIM_TABLES_VER = '0.5.9'     # scim-tables release (EZ-Big, Wubi, Ziranma tables)
SCIM_PINYIN_VER = '0.5.91'    # scim-pinyin release (phrase_lib.txt)
LIBTABE_VER = '0.2.3'         # libtabe release (tsi.src, Big5 word list)
# END OF DEFINE
30
def GetFileFromURL( url, dest ):
    """Download *url* into the local file *dest*, skipping the download
    when *dest* already exists.

    Uses wget on Linux (it shows download progress); urllib elsewhere.
    """
    if os.path.isfile(dest):
        print( 'File %s up to date.' % dest )
        return
    global islinux
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget will display details like download progress.
        # BUG FIX: pass -O so the file is actually saved as *dest*;
        # without it wget writes to the URL basename and the
        # os.path.isfile(dest) check above would never match when
        # the two names differ.
        os.system('wget %s -O %s' % (url, dest))
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
    return
45
def GetFileFromUnihan( path ):
    """Extract Unihan_Variants.txt from the Unihan zip archive *path*
    into the current directory.

    BUG FIX: ZipFile.read() returns bytes; the original wrote them to a
    text-mode file, which raises TypeError on Python 3.  The member is
    now written back in binary mode unchanged (no decode/encode needed),
    and the archive handle is closed instead of leaked.
    """
    print( 'Extracting files from %s ...' % path )
    archive = zipfile.ZipFile(path)
    data = archive.read('Unihan_Variants.txt')
    archive.close()
    uhfile = open('Unihan_Variants.txt', 'wb')
    uhfile.write(data)
    uhfile.close()
    return
53
def GetFileFromTar( path, member, rename ):
    """Extract *member* from the gzipped tarball *path*, rename it to
    *rename* in the current directory, and delete the extracted
    top-level directory tree.

    BUG FIX: the TarFile object was never closed (resource leak); it is
    now closed in a finally block.
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    tar = tarfile.open(path, 'r:gz')
    try:
        tar.extract(member)
    finally:
        tar.close()
    shutil.move(member, rename)
    # remove the now-empty directory tree the member was extracted into
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
    return
61
def ReadBIG5File( dest ):
    """Decode the Big5-HKSCS file *dest*, rewrite it on disk as UTF-8,
    and return the decoded text."""
    print( 'Reading and decoding %s ...' % dest )
    reader = uniopen( dest, 'r', encoding='big5hkscs', errors='replace' )
    content = reader.read()
    reader.close()
    # bytes that failed to decode became U+FFFD; turn each into a line break
    content = content.replace( '\ufffd', '\n' )
    writer = uniopen( dest, 'w', encoding='utf8' )
    writer.write(content)
    writer.close()
    return content
72
def ReadFile( dest ):
    """Read the UTF-8 encoded file *dest* and return its full contents."""
    print( 'Reading and decoding %s ...' % dest )
    handle = uniopen( dest, 'r', encoding='utf8' )
    content = handle.read()
    handle.close()
    return content
79
def ReadUnihanFile( dest ):
    """Parse Unihan_Variants.txt and return ( t2s_code, s2t_code ).

    Each list holds (source_code, target_codes) string pairs, e.g.
    ('U+4E00', 'U+XXXX U+YYYY'), split on the variant-kind field name.
    Comment lines starting with '#' are skipped.
    """
    print( 'Reading and decoding %s ...' % dest )
    handle = uniopen( dest, 'r', encoding='utf8' )
    t2s_code = []
    s2t_code = []
    for line in handle:
        if line.startswith('#'):
            continue
        if 'kSimplifiedVariant' in line:
            parts = line.split('kSimplifiedVariant')
            t2s_code.append( ( parts[0].strip(), parts[1].strip() ) )
        elif 'kTraditionalVariant' in line:
            parts = line.split('kTraditionalVariant')
            s2t_code.append( ( parts[0].strip(), parts[1].strip() ) )
    handle.close()
    return ( t2s_code, s2t_code )
100
def RemoveRows( text, num ):
    """Delete the first *num* rows (each row plus its trailing
    whitespace) from *text* and return the remainder."""
    return re.sub( '.*\s*', '', text, num )
104
def RemoveOneCharConv( text ):
    """Drop every line of *text* that consists of a single character;
    single-character conversions are handled by the char tables instead."""
    single_char_line = re.compile('^.\s*$', re.MULTILINE)
    return single_char_line.sub( '', text )
109
def ConvertToChar( code ):
    """Turn a Unihan codepoint reference like 'U+4E00' (optionally with a
    trailing '<...' source annotation) into the actual character."""
    # discard any '<source>' annotation after the codepoint
    codepoint = code.split('<')[0]
    # skip the 'U+' prefix and parse the hexadecimal digits
    return unichr2( int( codepoint[2:], 16 ) )
113
def GetDefaultTable( code_table ):
    """Build {from_char: [to_char, ...]} from (from_code, to_codes)
    pairs as produced by ReadUnihanFile; pairs with an empty side are
    skipped."""
    char_table = {}
    for source, targets in code_table:
        if source and targets:
            key = ConvertToChar( source )
            char_table[key] = [ConvertToChar( code ) for code in targets.split()]
    return char_table
122
def GetManualTable( dest ):
    """Parse a manual conversion file into {from_char: [to_char, ...]}.

    Entries are whitespace-separated, '|'-delimited codepoint lists like
    'U+XXXX|U+YYYY|U+ZZZZ'; the first codepoint is the source character.
    """
    char_table = {}
    for entry in ReadFile( dest ).split():
        entry = entry.strip('|')
        if not entry:
            continue
        parts = entry.split( '|', 1 )
        # codepoints are written as 'U+' plus up to five hex digits
        from_char = unichr2( int( parts[0][2:7], 16 ) )
        to_chars = [unichr2( int( code[2:7], 16 ) ) for code in parts[1].split('|')]
        char_table[from_char] = to_chars
    return char_table
135
def GetValidTable( src_table ):
    """Collapse a one-to-many table to one-to-one by keeping only the
    first (preferred) target of every rule."""
    return dict( (source, targets[0]) for source, targets in src_table.items() )
141
def GetToManyRules( src_table ):
    """Collect every secondary (non-first) target character of a
    one-to-many table, as a {char: True} membership dict."""
    tomany_table = {}
    for targets in src_table.values():
        for extra in targets[1:]:
            tomany_table[extra] = True
    return tomany_table
148
def RemoveRules( dest, table ):
    # Remove from *table* every rule listed in the manual file *dest* and
    # return the (mutated) table.  Entries are whitespace-separated with
    # quotes stripped; an entry may be 'f=>t' (remove that pair), '=>t'
    # (remove every rule whose target is t), 'f=>' (remove source f), or
    # a bare token (treated as both source and target).
    text = ReadFile( dest )
    temp1 = text.split()
    for elem in temp1:
        f = ''
        t = ''
        elem = elem.strip().replace( '"', '' ).replace( '\'', '' )
        if '=>' in elem:
            if elem.startswith( '=>' ):
                # target-only entry
                t = elem.replace( '=>', '' ).strip()
            elif elem.endswith( '=>' ):
                # source-only entry
                f = elem.replace( '=>', '' ).strip()
            else:
                temp2 = elem.split( '=>' )
                f = temp2[0].strip()
                t = temp2[1].strip()
                try:
                    # NOTE(review): dict.pop(f, t) removes key f and uses t
                    # only as the default return value — it does NOT remove
                    # rules targeting t.  Presumably intentional ("remove the
                    # source side only"), but confirm against the manual files.
                    table.pop(f, t)
                    continue
                except:
                    continue
        else:
            f = t = elem
        if f:
            try:
                table.pop(f)
            except:
                # key absent — nothing to remove
                x = 1
        if t:
            # drop every remaining rule whose target equals t
            # (iterate over a copy since the table is mutated)
            for temp_f, temp_t in table.copy().items():
                if temp_t == t:
                    table.pop(temp_f)
    return table
182
def DictToSortedList1( src_table ):
    """Return the table as a list of (from, to) pairs sorted by the
    source string."""
    return sorted( src_table.items(), key = lambda pair: pair[0] )
185
def DictToSortedList2( src_table ):
    """Return the table as a list of (from, to) pairs sorted by the
    target string."""
    return sorted( src_table.items(), key = lambda pair: pair[1] )
188
def Converter( string, conv_table ):
    """Convert *string* through *conv_table* using greedy
    longest-match-first substitution, scanning left to right."""
    pos = 0
    while pos < len(string):
        # try the longest substring starting at pos first
        for length in range(len(string) - pos, 0, -1):
            fragment = string[pos:pos + length]
            replacement = conv_table.get( fragment )
            if replacement:
                string = string[:pos] + replacement + string[pos + length:]
                # land on the last char of the replacement; the trailing
                # pos += 1 below then moves past it
                pos += len(replacement) - 1
                break
        pos += 1
    return string
201
def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    # Derive a word-level reverse conversion table {converted_word: word}
    # from a word list and the char-level tables.  A word earns a rule only
    # when converting it is not round-trip safe with char rules alone:
    # either reverse-converting it changes it, or it contains a character
    # that is a secondary target of some one-to-many rule (src_tomany).
    # Words are processed shortest-first so short-word rules can influence
    # longer words; the tables are rebuilt each time the word length grows.
    wordlist = list( set( src_wordlist ) )
    # sort longest-first; pop() then yields the shortest remaining word
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    while wordlist:
        # merge char rules over the word rules accumulated so far
        # (char rules take precedence on key collisions)
        conv_table = {}
        reconv_table = {}
        conv_table.update( word_conv_table )
        conv_table.update( char_conv_table )
        reconv_table.update( word_reconv_table )
        reconv_table.update( char_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        # keep using the same merged tables while words stay the same length
        while new_word_len == word_len:
            # rvt_test: does the word contain any secondary one-to-many target?
            rvt_test = False
            for char in word:
                rvt_test = rvt_test or src_tomany.get(char)
            test_word = Converter( word, reconv_table )
            new_word = Converter( word, conv_table )
            # skip if the converted form already maps back to something
            if not reconv_table.get( new_word ):
                if not test_word == word:
                    # reverse conversion alters the word: record the pair
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
                elif rvt_test:
                    # ambiguous chars present: record only if the round trip
                    # conv->reconv does not restore the original word
                    rvt_word = Converter( new_word, reconv_table )
                    if not rvt_word == word:
                        word_conv_table[word] = new_word
                        word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table
237
def GetManualWordsTable( src_wordlist, conv_table ):
    """Build a reverse table {converted_word: word} for a manually
    maintained word list, converting each word via *conv_table*.
    Trailing '#' comments on each line are stripped."""
    cleaned = [entry.split('#')[0].strip() for entry in src_wordlist]
    words = list( set( cleaned ) )
    words.sort( key = len, reverse = True )
    reconv_table = {}
    # pop() takes the shortest remaining word first
    while words:
        word = words.pop()
        reconv_table[Converter( word, conv_table )] = word
    return reconv_table
248
def CustomRules( dest ):
    """Read a manual rules file of whitespace-separated from/to token
    pairs and return them as a {from: to} dict."""
    tokens = ReadFile( dest ).split()
    rules = {}
    # tokens come in (source, target) pairs
    for i in range( 0, len( tokens ), 2 ):
        rules[tokens[i]] = tokens[i + 1]
    return rules
256
def GetPHPArray( table ):
    """Render (from, to) pairs as the newline-joined body of a PHP array
    literal; pairs with an empty side are omitted."""
    lines = []
    for f, t in table:
        if f and t:
            lines.append( '\'%s\' => \'%s\',' % (f, t) )
    return '\n'.join(lines)
261
def RemoveSameChar( src_table ):
    """Return a copy of the table with identity mappings (f == t)
    removed."""
    return dict( (f, t) for f, t in src_table.items() if f != t )
268
def main():
    """Build ZhConversion.php: download the source data, derive the
    character- and word-level Simplified/Traditional conversion tables,
    apply the manual override files, and write out the PHP arrays.
    Must be run from includes/zhtable/ so the *.manual files are found."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    GetFileFromURL( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    GetFileFromURL( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    GetFileFromURL( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    GetFileFromURL( url, lbt_dest )

    # Extract the needed files from the compressed archives

    # Unihan.txt Simp. & Trad
    GetFileFromUnihan( han_dest )

    # Make word lists
    t_wordlist = []
    s_wordlist = []

    # EZ.txt.in Trad
    src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
    dst = 'EZ.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    # drop the input-code column, keep only the phrase column
    text = re.sub( '.*\t', '', text )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # Wubi.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
    dst = 'Wubi.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    # keep the middle (phrase) column: code \t phrase \t frequency
    text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # Ziranma.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
    dst = 'Ziranma.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # phrase_lib.txt Simp
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    dst = 'phrase_lib.txt'
    GetFileFromTar( pyn_dest, src, dst )
    text = ReadFile( 'phrase_lib.txt' )
    # strip the frequency column, then the 5 header rows
    text = re.sub( '(.*)\t\d\d*.*', '\g<1>', text)
    text = RemoveRows( text, 5 )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # tsi.src Trad
    src = 'libtabe/tsi-src/tsi.src'
    dst = 'tsi.src'
    GetFileFromTar( lbt_dest, src, dst )
    text = ReadBIG5File( 'tsi.src' )
    # drop '# ' markers and everything from the frequency number onward
    text = re.sub( ' \d.*', '', text.replace('# ', ''))
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # remove duplicate elements
    t_wordlist = list( set( t_wordlist ) )
    s_wordlist = list( set( s_wordlist ) )

    # simpphrases_exclude.manual Simp
    # drop any simplified word containing an excluded fragment
    text = ReadFile( 'simpphrases_exclude.manual' )
    temp = text.split()
    s_string = '\n'.join( s_wordlist )
    for elem in temp:
        s_string = re.sub( '.*%s.*\n' % elem, '', s_string )
    s_wordlist = s_string.split('\n')

    # tradphrases_exclude.manual Trad
    # drop any traditional word containing an excluded fragment
    text = ReadFile( 'tradphrases_exclude.manual' )
    temp = text.split()
    t_string = '\n'.join( t_wordlist )
    for elem in temp:
        t_string = re.sub( '.*%s.*\n' % elem, '', t_string )
    t_wordlist = t_string.split('\n')

    # Make char to char convertion table
    # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
    ( t2s_code, s2t_code ) = ReadUnihanFile( 'Unihan_Variants.txt' )
    # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
    t2s_1tomany = {}
    t2s_1tomany.update( GetDefaultTable( t2s_code ) )
    t2s_1tomany.update( GetManualTable( 'trad2simp.manual' ) )
    # dict s2t_1tomany
    s2t_1tomany = {}
    s2t_1tomany.update( GetDefaultTable( s2t_code ) )
    s2t_1tomany.update( GetManualTable( 'simp2trad.manual' ) )
    # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
    t2s_1to1 = GetValidTable( t2s_1tomany )
    s_tomany = GetToManyRules( t2s_1tomany )
    # dict s2t_1to1; s2t_trans
    s2t_1to1 = GetValidTable( s2t_1tomany )
    t_tomany = GetToManyRules( s2t_1tomany )
    # remove noconvert rules
    t2s_1to1 = RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # Make word to word convertion table
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    # trad2simp_supp_set.manual
    t2s_1to1_supp.update( CustomRules( 'trad2simp_supp_set.manual' ) )
    # simp2trad_supp_set.manual
    s2t_1to1_supp.update( CustomRules( 'simp2trad_supp_set.manual' ) )
    # simpphrases.manual
    text = ReadFile( 'simpphrases.manual' )
    s_wordlist_manual = text.split('\n')
    t2s_word2word_manual = GetManualWordsTable(s_wordlist_manual, s2t_1to1_supp)
    t2s_word2word_manual.update( CustomRules( 'toSimp.manual' ) )
    # tradphrases.manual
    text = ReadFile( 'tradphrases.manual' )
    t_wordlist_manual = text.split('\n')
    s2t_word2word_manual = GetManualWordsTable(t_wordlist_manual, t2s_1to1_supp)
    s2t_word2word_manual.update( CustomRules( 'toTrad.manual' ) )
    # t2s_word2word
    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )
    t2s_word2word = GetDefaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    ## toSimp.manual
    t2s_word2word.update( t2s_word2word_manual )
    # s2t_word2word
    s2t_word2word = GetDefaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    ## toTrad.manual
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = RemoveSameChar( t2s_1to1 )
    s2t_1to1 = RemoveSameChar( s2t_1to1 )
    toHans = DictToSortedList1( t2s_1to1 ) + DictToSortedList2( t2s_word2word )
    # sorted list toHant
    toHant = DictToSortedList1( s2t_1to1 ) + DictToSortedList2( s2t_word2word )
    # sorted list toCN
    toCN = DictToSortedList2( CustomRules( 'toCN.manual' ) )
    # sorted list toHK
    toHK = DictToSortedList2( CustomRules( 'toHK.manual' ) )
    # sorted list toSG
    toSG = DictToSortedList2( CustomRules( 'toSG.manual' ) )
    # sorted list toTW
    toTW = DictToSortedList2( CustomRules( 'toTW.manual' ) )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 */

$zh2Hant = array(\n'''
    php += GetPHPArray( toHant )
    php += '\n);\n\n$zh2Hans = array(\n'
    php += GetPHPArray( toHans )
    php += '\n);\n\n$zh2TW = array(\n'
    php += GetPHPArray( toTW )
    php += '\n);\n\n$zh2HK = array(\n'
    php += GetPHPArray( toHK )
    php += '\n);\n\n$zh2CN = array(\n'
    php += GetPHPArray( toCN )
    php += '\n);'

    f = uniopen( 'ZhConversion.php', 'w', encoding = 'utf8' )
    print ('Writing ZhConversion.php ... ')
    f.write( php )
    f.close()

    # Remove temp files
    print ('Deleting temp files ... ')
    os.remove('EZ.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')
474
475
# Script entry point: build the tables only when run directly, not on import.
if __name__ == '__main__':
    main()