Fix a bug found in Makefile.py.
[lhc/web/wiklou.git] / includes / zhtable / Makefile.py
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3 # @author Philip
4 import tarfile, zipfile
5 import os, re, shutil, sys, platform
6
# Interpreter version string, e.g. '2.7.18' or '3.1.2' — drives the compat shims below.
pyversion = platform.python_version()
# True when running on Linux (used to pick wget over urlretrieve).
# NOTE(review): the trailing "or False" is redundant — the comparison is already boolean.
islinux = platform.system().lower() == 'linux' or False
9
# Version-compatibility shims: pick a urllib module, a unicode-aware open()
# and a wide-codepoint chr() that work on both Python 2.5-2.7 and Python 3.
if pyversion[:3] in ['2.5', '2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    # codecs.open accepts an encoding= argument on Python 2, like py3 open().
    uniopen = codecs.open
    def unichr2(i):
        # Return the character for codepoint *i*.
        # On wide builds (or for BMP codepoints) unichr() works directly;
        # on narrow Python 2 builds, supplementary-plane codepoints must be
        # emitted as a UTF-16 surrogate pair (0xD7C0 = 0xD800 - 0x10000>>10).
        if sys.maxunicode >= 0x10000 or i < 0x10000:
            return unichr(i)
        else:
            return unichr(0xD7C0+(i>>10)) + unichr(0xDC00+(i&0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    # Python 3 open() and chr() already handle encodings and full Unicode.
    uniopen = open
    unichr2 = chr
23
# DEFINE
SF_MIRROR = 'easynews'        # SourceForge mirror host used for all downloads
SCIM_TABLES_VER = '0.5.9'     # scim-tables release (EZ-Big, Wubi, Ziranma tables)
SCIM_PINYIN_VER = '0.5.91'    # scim-pinyin release (phrase_lib.txt)
LIBTABE_VER = '0.2.3'         # libtabe release (tsi.src, Big5 word list)
# END OF DEFINE
30
def GetFileFromURL( url, dest ):
    """Download *url* into the local file *dest*, skipping the download
    when *dest* already exists.

    Uses wget on Linux (it shows download progress); urllib elsewhere.
    """
    if os.path.isfile(dest):
        print( 'File %s up to date.' % dest )
        return
    global islinux
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget will display details like download progress.
        # BUG FIX: pass -O so the file is actually saved as *dest*;
        # without it wget writes to the URL basename and the
        # os.path.isfile(dest) check above would never match when
        # the two names differ.
        os.system('wget %s -O %s' % (url, dest))
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
    return
45
def GetFileFromUnihan( path ):
    """Extract Unihan_Variants.txt from the Unihan zip archive *path*
    into the current directory.

    BUG FIX: ZipFile.read() returns bytes; the original wrote them to a
    text-mode file, which raises TypeError on Python 3.  The member is
    now written back in binary mode unchanged (no decode/encode needed),
    and the archive handle is closed instead of leaked.
    """
    print( 'Extracting files from %s ...' % path )
    archive = zipfile.ZipFile(path)
    data = archive.read('Unihan_Variants.txt')
    archive.close()
    uhfile = open('Unihan_Variants.txt', 'wb')
    uhfile.write(data)
    uhfile.close()
    return
53
def GetFileFromTar( path, member, rename ):
    """Extract *member* from the gzipped tarball *path*, rename it to
    *rename* in the current directory, and delete the extracted
    top-level directory tree.

    BUG FIX: the TarFile object was never closed (resource leak); it is
    now closed in a finally block.
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    tar = tarfile.open(path, 'r:gz')
    try:
        tar.extract(member)
    finally:
        tar.close()
    shutil.move(member, rename)
    # remove the now-empty directory tree the member was extracted into
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
    return
61
def ReadBIG5File( dest ):
    """Decode the Big5-HKSCS file *dest*, rewrite it on disk as UTF-8,
    and return the decoded text."""
    print( 'Reading and decoding %s ...' % dest )
    reader = uniopen( dest, 'r', encoding='big5hkscs', errors='replace' )
    content = reader.read()
    reader.close()
    # bytes that failed to decode became U+FFFD; turn each into a line break
    content = content.replace( '\ufffd', '\n' )
    writer = uniopen( dest, 'w', encoding='utf8' )
    writer.write(content)
    writer.close()
    return content
72
def ReadFile( dest ):
    """Read the UTF-8 encoded file *dest* and return its full contents."""
    print( 'Reading and decoding %s ...' % dest )
    handle = uniopen( dest, 'r', encoding='utf8' )
    content = handle.read()
    handle.close()
    return content
79
def ReadUnihanFile( dest ):
    """Parse Unihan_Variants.txt and return ( t2s_code, s2t_code ).

    Each list holds (source_code, target_codes) string pairs, e.g.
    ('U+4E00', 'U+XXXX U+YYYY'), split on the variant-kind field name.
    Comment lines starting with '#' are skipped.
    """
    print( 'Reading and decoding %s ...' % dest )
    handle = uniopen( dest, 'r', encoding='utf8' )
    t2s_code = []
    s2t_code = []
    for line in handle:
        if line.startswith('#'):
            continue
        if 'kSimplifiedVariant' in line:
            parts = line.split('kSimplifiedVariant')
            t2s_code.append( ( parts[0].strip(), parts[1].strip() ) )
        elif 'kTraditionalVariant' in line:
            parts = line.split('kTraditionalVariant')
            s2t_code.append( ( parts[0].strip(), parts[1].strip() ) )
    handle.close()
    return ( t2s_code, s2t_code )
100
def RemoveRows( text, num ):
    """Delete the first *num* rows (each row plus its trailing
    whitespace) from *text* and return the remainder."""
    return re.sub( '.*\s*', '', text, num )
104
def RemoveOneCharConv( text ):
    """Drop every line of *text* that consists of a single character;
    single-character conversions are handled by the char tables instead."""
    single_char_line = re.compile('^.\s*$', re.MULTILINE)
    return single_char_line.sub( '', text )
109
def ConvertToChar( code ):
    """Turn a Unihan codepoint reference like 'U+4E00' (optionally with a
    trailing '<...' source annotation) into the actual character."""
    # discard any '<source>' annotation after the codepoint
    codepoint = code.split('<')[0]
    # skip the 'U+' prefix and parse the hexadecimal digits
    return unichr2( int( codepoint[2:], 16 ) )
113
def GetDefaultTable( code_table ):
    """Build {from_char: [to_char, ...]} from (from_code, to_codes)
    pairs as produced by ReadUnihanFile; pairs with an empty side are
    skipped."""
    char_table = {}
    for source, targets in code_table:
        if source and targets:
            key = ConvertToChar( source )
            char_table[key] = [ConvertToChar( code ) for code in targets.split()]
    return char_table
122
def GetManualTable( dest ):
    """Parse a manual conversion file into {from_char: [to_char, ...]}.

    Entries are whitespace-separated, '|'-delimited codepoint lists like
    'U+XXXX|U+YYYY|U+ZZZZ'; the first codepoint is the source character.
    """
    char_table = {}
    for entry in ReadFile( dest ).split():
        entry = entry.strip('|')
        if not entry:
            continue
        parts = entry.split( '|', 1 )
        # codepoints are written as 'U+' plus up to five hex digits
        from_char = unichr2( int( parts[0][2:7], 16 ) )
        to_chars = [unichr2( int( code[2:7], 16 ) ) for code in parts[1].split('|')]
        char_table[from_char] = to_chars
    return char_table
135
def GetValidTable( src_table ):
    """Collapse a one-to-many table to one-to-one by keeping only the
    first (preferred) target of every rule."""
    return dict( (source, targets[0]) for source, targets in src_table.items() )
141
def GetToManyRules( src_table ):
    """Collect every secondary (non-first) target character of a
    one-to-many table, as a {char: True} membership dict."""
    tomany_table = {}
    for targets in src_table.values():
        for extra in targets[1:]:
            tomany_table[extra] = True
    return tomany_table
148
def RemoveRules( dest, table ):
    # Remove from *table* every rule listed in the manual file *dest* and
    # return the (mutated) table.  Entries are whitespace-separated with
    # quotes stripped; an entry may be 'f=>t' (remove that pair), '=>t'
    # (remove every rule whose target is t), 'f=>' (remove source f), or
    # a bare token (treated as both source and target).
    text = ReadFile( dest )
    temp1 = text.split()
    for elem in temp1:
        f = ''
        t = ''
        elem = elem.strip().replace( '"', '' ).replace( '\'', '' )
        if '=>' in elem:
            if elem.startswith( '=>' ):
                # target-only entry
                t = elem.replace( '=>', '' ).strip()
            elif elem.endswith( '=>' ):
                # source-only entry
                f = elem.replace( '=>', '' ).strip()
            else:
                temp2 = elem.split( '=>' )
                f = temp2[0].strip()
                t = temp2[1].strip()
                try:
                    # NOTE(review): dict.pop(f, t) removes key f and uses t
                    # only as the default return value — it does NOT remove
                    # rules targeting t.  Presumably intentional ("remove the
                    # source side only"), but confirm against the manual files.
                    table.pop(f, t)
                    continue
                except:
                    continue
        else:
            f = t = elem
        if f:
            try:
                table.pop(f)
            except:
                # key absent — nothing to remove
                x = 1
        if t:
            # drop every remaining rule whose target equals t
            # (iterate over a copy since the table is mutated)
            for temp_f, temp_t in table.copy().items():
                if temp_t == t:
                    table.pop(temp_f)
    return table
182
def DictToSortedList1( src_table ):
    """Return the table as a list of (from, to) pairs sorted by the
    source string."""
    return sorted( src_table.items(), key = lambda pair: pair[0] )
185
def DictToSortedList2( src_table ):
    """Return the table as a list of (from, to) pairs sorted by the
    target string."""
    return sorted( src_table.items(), key = lambda pair: pair[1] )
188
def Converter( string, conv_table ):
    """Convert *string* through *conv_table* using greedy
    longest-match-first substitution, scanning left to right."""
    pos = 0
    while pos < len(string):
        # try the longest substring starting at pos first
        for length in range(len(string) - pos, 0, -1):
            fragment = string[pos:pos + length]
            replacement = conv_table.get( fragment )
            if replacement:
                string = string[:pos] + replacement + string[pos + length:]
                # land on the last char of the replacement; the trailing
                # pos += 1 below then moves past it
                pos += len(replacement) - 1
                break
        pos += 1
    return string
201
def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    # Derive a word-level reverse conversion table {converted_word: word}
    # from a word list and the char-level tables.  A word earns a rule only
    # when converting it is not round-trip safe with char rules alone:
    # either reverse-converting it changes it, or it contains a character
    # that is a secondary target of some one-to-many rule (src_tomany).
    # Words are processed shortest-first so short-word rules can influence
    # longer words; the tables are rebuilt each time the word length grows.
    wordlist = list( set( src_wordlist ) )
    # sort longest-first; pop() then yields the shortest remaining word
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    while wordlist:
        # merge char rules over the word rules accumulated so far
        # (char rules take precedence on key collisions)
        conv_table = {}
        reconv_table = {}
        conv_table.update( word_conv_table )
        conv_table.update( char_conv_table )
        reconv_table.update( word_reconv_table )
        reconv_table.update( char_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        # keep using the same merged tables while words stay the same length
        while new_word_len == word_len:
            # rvt_test: does the word contain any secondary one-to-many target?
            rvt_test = False
            for char in word:
                rvt_test = rvt_test or src_tomany.get(char)
            test_word = Converter( word, reconv_table )
            new_word = Converter( word, conv_table )
            # skip if the converted form already maps back to something
            if not reconv_table.get( new_word ):
                if not test_word == word:
                    # reverse conversion alters the word: record the pair
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
                elif rvt_test:
                    # ambiguous chars present: record only if the round trip
                    # conv->reconv does not restore the original word
                    rvt_word = Converter( new_word, reconv_table )
                    if not rvt_word == word:
                        word_conv_table[word] = new_word
                        word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table
237
def GetManualWordsTable( src_wordlist, conv_table ):
    """Build a reverse table {converted_word: word} for a manually
    maintained word list, converting each word via *conv_table*.
    Trailing '#' comments on each line are stripped."""
    cleaned = [entry.split('#')[0].strip() for entry in src_wordlist]
    words = list( set( cleaned ) )
    words.sort( key = len, reverse = True )
    reconv_table = {}
    # pop() takes the shortest remaining word first
    while words:
        word = words.pop()
        reconv_table[Converter( word, conv_table )] = word
    return reconv_table
248
def CustomRules( dest ):
    """Read a manual rules file of whitespace-separated from/to token
    pairs and return them as a {from: to} dict."""
    tokens = ReadFile( dest ).split()
    rules = {}
    # tokens come in (source, target) pairs
    for i in range( 0, len( tokens ), 2 ):
        rules[tokens[i]] = tokens[i + 1]
    return rules
256
def GetPHPArray( table ):
    """Render (from, to) pairs as the newline-joined body of a PHP array
    literal; pairs with an empty side are omitted."""
    lines = []
    for f, t in table:
        if f and t:
            lines.append( '\'%s\' => \'%s\',' % (f, t) )
    return '\n'.join(lines)
261
def RemoveSameChar( src_table ):
    """Return a copy of the table with identity mappings (f == t)
    removed."""
    return dict( (f, t) for f, t in src_table.items() if f != t )
268
def main():
    """Build ZhConversion.php: download the source data, derive the
    character- and word-level Simplified/Traditional conversion tables,
    apply the manual override files, and write out the PHP arrays.
    Must be run from includes/zhtable/ so the *.manual files are found."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    GetFileFromURL( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    GetFileFromURL( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    GetFileFromURL( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    GetFileFromURL( url, lbt_dest )

    # Extract the needed files from the compressed archives

    # Unihan.txt Simp. & Trad
    GetFileFromUnihan( han_dest )

    # Make word lists
    t_wordlist = []
    s_wordlist = []

    # EZ.txt.in Trad
    src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
    dst = 'EZ.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    # drop the input-code column, keep only the phrase column
    text = re.sub( '.*\t', '', text )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # Wubi.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
    dst = 'Wubi.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    # keep the middle (phrase) column: code \t phrase \t frequency
    text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # Ziranma.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
    dst = 'Ziranma.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # phrase_lib.txt Simp
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    dst = 'phrase_lib.txt'
    GetFileFromTar( pyn_dest, src, dst )
    text = ReadFile( 'phrase_lib.txt' )
    # strip the frequency column, then the 5 header rows
    text = re.sub( '(.*)\t\d\d*.*', '\g<1>', text)
    text = RemoveRows( text, 5 )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # tsi.src Trad
    src = 'libtabe/tsi-src/tsi.src'
    dst = 'tsi.src'
    GetFileFromTar( lbt_dest, src, dst )
    text = ReadBIG5File( 'tsi.src' )
    # drop '# ' markers and everything from the frequency number onward
    text = re.sub( ' \d.*', '', text.replace('# ', ''))
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # remove duplicate elements
    t_wordlist = list( set( t_wordlist ) )
    s_wordlist = list( set( s_wordlist ) )

    # simpphrases_exclude.manual Simp
    # drop any simplified word containing an excluded fragment
    text = ReadFile( 'simpphrases_exclude.manual' )
    temp = text.split()
    s_string = '\n'.join( s_wordlist )
    for elem in temp:
        s_string = re.sub( '.*%s.*\n' % elem, '', s_string )
    s_wordlist = s_string.split('\n')

    # tradphrases_exclude.manual Trad
    # drop any traditional word containing an excluded fragment
    text = ReadFile( 'tradphrases_exclude.manual' )
    temp = text.split()
    t_string = '\n'.join( t_wordlist )
    for elem in temp:
        t_string = re.sub( '.*%s.*\n' % elem, '', t_string )
    t_wordlist = t_string.split('\n')

    # Make char to char convertion table
    # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
    ( t2s_code, s2t_code ) = ReadUnihanFile( 'Unihan_Variants.txt' )
    # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
    t2s_1tomany = {}
    t2s_1tomany.update( GetDefaultTable( t2s_code ) )
    t2s_1tomany.update( GetManualTable( 'trad2simp.manual' ) )
    # dict s2t_1tomany
    s2t_1tomany = {}
    s2t_1tomany.update( GetDefaultTable( s2t_code ) )
    s2t_1tomany.update( GetManualTable( 'simp2trad.manual' ) )
    # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
    t2s_1to1 = GetValidTable( t2s_1tomany )
    s_tomany = GetToManyRules( t2s_1tomany )
    # dict s2t_1to1; s2t_trans
    s2t_1to1 = GetValidTable( s2t_1tomany )
    t_tomany = GetToManyRules( s2t_1tomany )
    # remove noconvert rules
    t2s_1to1 = RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # Make word to word convertion table
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    # trad2simp_supp_set.manual
    t2s_1to1_supp.update( CustomRules( 'trad2simp_supp_set.manual' ) )
    # simp2trad_supp_set.manual
    s2t_1to1_supp.update( CustomRules( 'simp2trad_supp_set.manual' ) )
    # simpphrases.manual
    text = ReadFile( 'simpphrases.manual' )
    s_wordlist_manual = text.split('\n')
    t2s_word2word_manual = GetManualWordsTable(s_wordlist_manual, s2t_1to1_supp)
    t2s_word2word_manual.update( CustomRules( 'toSimp.manual' ) )
    # tradphrases.manual
    text = ReadFile( 'tradphrases.manual' )
    t_wordlist_manual = text.split('\n')
    s2t_word2word_manual = GetManualWordsTable(t_wordlist_manual, t2s_1to1_supp)
    s2t_word2word_manual.update( CustomRules( 'toTrad.manual' ) )
    # t2s_word2word
    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )
    t2s_word2word = GetDefaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    ## toSimp.manual
    t2s_word2word.update( t2s_word2word_manual )
    # s2t_word2word
    s2t_word2word = GetDefaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    ## toTrad.manual
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = RemoveSameChar( t2s_1to1 )
    s2t_1to1 = RemoveSameChar( s2t_1to1 )
    toHans = DictToSortedList1( t2s_1to1 ) + DictToSortedList2( t2s_word2word )
    # sorted list toHant
    toHant = DictToSortedList1( s2t_1to1 ) + DictToSortedList2( s2t_word2word )
    # sorted list toCN
    toCN = DictToSortedList2( CustomRules( 'toCN.manual' ) )
    # sorted list toHK
    toHK = DictToSortedList2( CustomRules( 'toHK.manual' ) )
    # sorted list toSG
    toSG = DictToSortedList2( CustomRules( 'toSG.manual' ) )
    # sorted list toTW
    toTW = DictToSortedList2( CustomRules( 'toTW.manual' ) )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 */

$zh2Hant = array(\n'''
    php += GetPHPArray( toHant )
    php += '\n);\n\n$zh2Hans = array(\n'
    php += GetPHPArray( toHans )
    php += '\n);\n\n$zh2TW = array(\n'
    php += GetPHPArray( toTW )
    php += '\n);\n\n$zh2HK = array(\n'
    php += GetPHPArray( toHK )
    php += '\n);\n\n$zh2CN = array(\n'
    php += GetPHPArray( toCN )
    php += '\n);'

    f = uniopen( 'ZhConversion.php', 'w', encoding = 'utf8' )
    print ('Writing ZhConversion.php ... ')
    f.write( php )
    f.close()

    # Remove temp files
    print ('Deleting temp files ... ')
    os.remove('EZ.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')
474
475
# Script entry point: build the tables only when run directly, not on import.
if __name__ == '__main__':
    main()