Merge "Allow 'uselang', 'useskin', 'debug' as query parameters in RedirectSpecialPages"
[lhc/web/wiklou.git] / maintenance / language / zhtable / Makefile.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr

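# The two helpers below turn 'U+XXXX' code point references into characters.
# unichr2() handles Unihan-style fields, which may carry '<source' annotations
# that it strips at the '<'; unichr3() reads up to five hex digits after 'U+',
# as used in the *.manual tables. An illustrative example (inputs chosen here,
# not taken from the data files):
#   unichr2( 'U+4E0A', 'U+4E0B<kLau' )  ->  ['上', '下']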
def unichr2( *args ):
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]

# DEFINE
UNIHAN_VER = '6.3.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    global islinux
    if islinux:
        # we use wget instead of urlretrieve under Linux,
        # because wget can display details such as download progress
        os.system( 'wget %s -O %s' % ( url, dest ) )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )
    return

def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    if pyversion[:1] in ['2']:
        fc = open( name, 'rb', encoding, 'ignore' )
    else:
        fc = open( name, 'r', encoding = encoding, errors = 'ignore' )
    return fc

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
        uncompress( tf.open( path, 'r:gz' ), member, encoding )

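# Illustrative use of the archive helpers above, with the versions defined in
# the DEFINE block (the same member paths are built in the parsers below):
#   fp = untargz( 'scim-tables-0.5.13.tar.gz',
#                 'scim-tables-0.5.13/tables/zh/Wubi.txt.in' )
# extracts the member into the working directory and returns it opened as text.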
def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1 and \
                 len( elems[pos] ) > 1: # words only
                mlist.add( elems[pos] )
    return mlist

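# parserCore() collects column `pos` of every multi-character data line. The
# scim tables keep 'input_code<TAB>phrase[<TAB>frequency]' lines between
# BEGIN_TABLE/END_TABLE markers, so tablesParser() below reads column 1, while
# phrase_lib.txt and tsi.src put the phrase first, so their parsers read
# column 0. (Format sketch inferred from how the columns are consumed here,
# not from the upstream specifications.)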
def tablesParser( path, name ):
    """ Read a file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )

def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )

def tsiParser( path ):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )

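# Data lines in Unihan_Variants.txt look roughly like
#   U+346F<TAB>kSimplifiedVariant<TAB>U+3454
# i.e. a source code point, the variant type, then one or more target code
# points (illustrative line; see the Unihan database documentation for the
# exact format).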
def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        else:
            elems = line.split()
            if len( elems ) < 3:
                continue
            type = elems.pop( 1 )
            elems = unichr2( *elems )
            if type == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif type == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    fp.close()
    return ( t2s, s2t )

def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist. """
    if pyversion[:1] in ['2']:
        excludes = open( path, 'rb', 'U8' ).read().split()
    else:
        excludes = open( path, 'r', encoding = 'U8' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    excludes = '|'.join( excludes )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    return mlist

def charManualTable( path ):
    fp = open( path, 'r', encoding = 'U8' )
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            yield elems[0], elems[1:]

def toManyRules( src_table ):
    tomany = set()
    if pyversion[:1] in ['2']:
        for ( f, t ) in src_table.iteritems():
            for i in range( 1, len( t ) ):
                tomany.add( t[i] )
    else:
        for ( f, t ) in src_table.items():
            for i in range( 1, len( t ) ):
                tomany.add( t[i] )
    return tomany

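# removeRules() applies a *_noconvert.manual file to a conversion table. Each
# line is either a bare pattern (treated as both source and target) or a
# `"from" => "to"` pair, quotes optional: the source is popped from the table,
# and every remaining entry whose target exactly matches a listed target is
# removed as well.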
def removeRules( path, table ):
    fp = open( path, 'r', encoding = 'U8' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop( f )
            except KeyError:
                pass
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    if pyversion[:1] in ['2']:
        for (tmp_f, tmp_t) in table.copy().iteritems():
            if texcptn.match( tmp_t ):
                table.pop( tmp_f )
    else:
        for (tmp_f, tmp_t) in table.copy().items():
            if texcptn.match( tmp_t ):
                table.pop( tmp_f )
    return table

def customRules( path ):
    fp = open( path, 'r', encoding = 'U8' )
    ret = dict()
    for line in fp:
        line = line.rstrip( '\r\n' )
        if '#' in line:
            line = line.split( '#' )[0].rstrip()
        elems = line.split( '\t' )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret

def dictToSortedList( src_table, pos ):
    return sorted( src_table.items(), key = lambda m: ( m[pos], m[1 - pos] ) )

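# translate() does greedy longest-match substitution: at each position it tries
# the longest remaining substring first and skips past whatever it substituted.
# An illustrative example with a made-up table:
#   translate( 'abc', { 'ab': 'X', 'a': 'Y' } )  ->  'Xc'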
def translate( text, conv_table ):
    i = 0
    while i < len( text ):
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text

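# manualWordsTable() builds a reverse phrase table from a *.manual word list.
# Note that the incoming reconv_table argument is discarded and rebuilt from
# scratch, so only conv_table influences the result: each manual word maps back
# to itself, and its converted form also maps back to the original word.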
def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'r', encoding = 'U8' )
    reconv_table = {}
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = lambda w: ( len(w), w ), reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table

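# defaultWordsTable() derives word-level rules from the input-method word
# lists. Roughly: a word gets its own rule only when its converted form is not
# already claimed in reconv_table, and either the reverse table already changes
# the word (so a plain char-by-char round trip would be wrong) or the word
# contains a one-to-many character that fails to round-trip. Words are popped
# shortest-first, so longer words can build on rules added for shorter ones.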
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = lambda w: ( len(w), w ), reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
               and ( test_word != word \
               or ( tomanyptn.search( word ) \
               and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table

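# PHPArray() renders (from, to) pairs as the body of a PHP array literal, e.g.
#   PHPArray( [ ( '万', '萬' ) ] )  ->  "'万' => '萬',"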
def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)

def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )

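    # Pipeline overview: parse the Unihan variant data into one-to-many char
    # tables, flatten them to one-to-one tables, apply the manual no-convert
    # and supplement rules, derive word-to-word tables from the input-method
    # word lists, then sort everything and emit includes/ZhConversion.php.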
    # Parse Unihan_Variants.txt (from Unihan.zip) into one-to-many char tables
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

    t2s_1tomany.update( charManualTable( 'symme_supp.manual' ) )
    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( ( t[0], [f] ) for ( f, t ) in charManualTable( 'symme_supp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )

    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
        s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )
    else:
        t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
        s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )

    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # the superset for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    # word-to-word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word-to-word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # exclude
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # build the default word-to-word tables from the word lists
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
    else:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
    else:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray( toHant ) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray( toHans ) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray( toTW ) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray( toHK ) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray( toCN ) \
        + '\n);\n'

    if pyversion[:1] in ['2']:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' )
    else:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering = 4096, encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()

    # Remove temporary files
    print( 'Deleting temporary files ... ' )
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()