Merge "Allow 'uselang', 'useskin', 'debug' as query parameters in RedirectSpecialPages"
[lhc/web/wiklou.git] / maintenance / language / zhtable / Makefile.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr

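# The two helpers below turn 'U+XXXX' code point references into characters.
# unichr2() handles Unihan-style fields, which may carry '<source' annotations
# that it strips at the '<'; unichr3() reads up to five hex digits after 'U+',
# as used in the *.manual tables. An illustrative example (inputs chosen here,
# not taken from the data files):
#   unichr2( 'U+4E0A', 'U+4E0B<kLau' )  ->  ['上', '下']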
def unichr2( *args ):
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]

# DEFINE
UNIHAN_VER = '6.3.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    global islinux
    if islinux:
        # we use wget instead of urlretrieve under Linux,
        # because wget can display details such as download progress
        os.system( 'wget %s -O %s' % ( url, dest ) )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )
    return

def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    if pyversion[:1] in ['2']:
        fc = open( name, 'rb', encoding, 'ignore' )
    else:
        fc = open( name, 'r', encoding = encoding, errors = 'ignore' )
    return fc

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
        uncompress( tf.open( path, 'r:gz' ), member, encoding )

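# Illustrative use of the archive helpers above, with the versions defined in
# the DEFINE block (the same member paths are built in the parsers below):
#   fp = untargz( 'scim-tables-0.5.13.tar.gz',
#                 'scim-tables-0.5.13/tables/zh/Wubi.txt.in' )
# extracts the member into the working directory and returns it opened as text.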
def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1 and \
                 len( elems[pos] ) > 1: # words only
                mlist.add( elems[pos] )
    return mlist

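# parserCore() collects column `pos` of every multi-character data line. The
# scim tables keep 'input_code<TAB>phrase[<TAB>frequency]' lines between
# BEGIN_TABLE/END_TABLE markers, so tablesParser() below reads column 1, while
# phrase_lib.txt and tsi.src put the phrase first, so their parsers read
# column 0. (Format sketch inferred from how the columns are consumed here,
# not from the upstream specifications.)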
def tablesParser( path, name ):
    """ Read a file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )

def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )

def tsiParser( path ):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )

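# Data lines in Unihan_Variants.txt look roughly like
#   U+346F<TAB>kSimplifiedVariant<TAB>U+3454
# i.e. a source code point, the variant type, then one or more target code
# points (illustrative line; see the Unihan database documentation for the
# exact format).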
def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        else:
            elems = line.split()
            if len( elems ) < 3:
                continue
            type = elems.pop( 1 )
            elems = unichr2( *elems )
            if type == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif type == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    fp.close()
    return ( t2s, s2t )

def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist. """
    if pyversion[:1] in ['2']:
        excludes = open( path, 'rb', 'U8' ).read().split()
    else:
        excludes = open( path, 'r', encoding = 'U8' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    excludes = '|'.join( excludes )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    return mlist

def charManualTable( path ):
    fp = open( path, 'r', encoding = 'U8' )
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            yield elems[0], elems[1:]

def toManyRules( src_table ):
    tomany = set()
    if pyversion[:1] in ['2']:
        for ( f, t ) in src_table.iteritems():
            for i in range( 1, len( t ) ):
                tomany.add( t[i] )
    else:
        for ( f, t ) in src_table.items():
            for i in range( 1, len( t ) ):
                tomany.add( t[i] )
    return tomany

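# removeRules() applies a *_noconvert.manual file to a conversion table. Each
# line is either a bare pattern (treated as both source and target) or a
# `"from" => "to"` pair, quotes optional: the source is popped from the table,
# and every remaining entry whose target exactly matches a listed target is
# removed as well.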
def removeRules( path, table ):
    fp = open( path, 'r', encoding = 'U8' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop( f )
            except KeyError:
                pass
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    if pyversion[:1] in ['2']:
        for (tmp_f, tmp_t) in table.copy().iteritems():
            if texcptn.match( tmp_t ):
                table.pop( tmp_f )
    else:
        for (tmp_f, tmp_t) in table.copy().items():
            if texcptn.match( tmp_t ):
                table.pop( tmp_f )
    return table

def customRules( path ):
    fp = open( path, 'r', encoding = 'U8' )
    ret = dict()
    for line in fp:
        line = line.rstrip( '\r\n' )
        if '#' in line:
            line = line.split( '#' )[0].rstrip()
        elems = line.split( '\t' )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret

def dictToSortedList( src_table, pos ):
    return sorted( src_table.items(), key = lambda m: ( m[pos], m[1 - pos] ) )

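# translate() does greedy longest-match substitution: at each position it tries
# the longest remaining substring first and skips past whatever it substituted.
# An illustrative example with a made-up table:
#   translate( 'abc', { 'ab': 'X', 'a': 'Y' } )  ->  'Xc'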
def translate( text, conv_table ):
    i = 0
    while i < len( text ):
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text

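# manualWordsTable() builds a reverse phrase table from a *.manual word list.
# Note that the incoming reconv_table argument is discarded and rebuilt from
# scratch, so only conv_table influences the result: each manual word maps back
# to itself, and its converted form also maps back to the original word.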
def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'r', encoding = 'U8' )
    reconv_table = {}
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = lambda w: ( len(w), w ), reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table

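# defaultWordsTable() derives word-level rules from the input-method word
# lists. Roughly: a word gets its own rule only when its converted form is not
# already claimed in reconv_table, and either the reverse table already changes
# the word (so a plain char-by-char round trip would be wrong) or the word
# contains a one-to-many character that fails to round-trip. Words are popped
# shortest-first, so longer words can build on rules added for shorter ones.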
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = lambda w: ( len(w), w ), reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
               and ( test_word != word \
               or ( tomanyptn.search( word ) \
               and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table

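# PHPArray() renders (from, to) pairs as the body of a PHP array literal, e.g.
#   PHPArray( [ ( '万', '萬' ) ] )  ->  "'万' => '萬',"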
def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)

def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )

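    # Pipeline overview: parse the Unihan variant data into one-to-many char
    # tables, flatten them to one-to-one tables, apply the manual no-convert
    # and supplement rules, derive word-to-word tables from the input-method
    # word lists, then sort everything and emit includes/ZhConversion.php.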
    # Parse Unihan_Variants.txt (from Unihan.zip) into one-to-many char tables
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

    t2s_1tomany.update( charManualTable( 'symme_supp.manual' ) )
    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( ( t[0], [f] ) for ( f, t ) in charManualTable( 'symme_supp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )

    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
        s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )
    else:
        t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
        s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )

    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # the superset for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    # word-to-word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word-to-word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # exclude
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # build the default word-to-word tables from the word lists
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
    else:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
    else:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray( toHant ) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray( toHans ) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray( toTW ) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray( toHK ) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray( toCN ) \
        + '\n);\n'

    if pyversion[:1] in ['2']:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' )
    else:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering = 4096, encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()

    # Remove temporary files
    print( 'Deleting temporary files ... ' )
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()