Update the Chinese conversion tables.
[lhc/web/wiklou.git] / includes / zhtable / Makefile.py
# @author Philip
# This script must be run under Python 3.
3 import tarfile, zipfile
4 import os, re, shutil, urllib.request
5
# DEFINE
SF_MIRROR = 'easynews'        # SourceForge download mirror host prefix
SCIM_TABLES_VER = '0.5.9'     # scim-tables release to fetch (Trad/Simp word lists)
SCIM_PINYIN_VER = '0.5.91'    # scim-pinyin release to fetch (phrase_lib.txt)
LIBTABE_VER = '0.2.3'         # libtabe release to fetch (tsi.src, Big5 word list)
# END OF DEFINE
12
def GetFileFromURL( url, dest ):
    """Download *url* into local file *dest*, skipping the download when
    *dest* already exists."""
    if os.path.isfile( dest ):
        print( 'File %s up to date.' % dest )
        return
    print( 'Downloading from [%s] ...' % url )
    urllib.request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
21
def GetFileFromZip( path ):
    """Extract every member of the zip archive at *path* into the current
    working directory.

    Fix: the original left the ZipFile handle open (resource leak); a
    context manager now guarantees it is closed.
    """
    print( 'Extracting files from %s ...' % path )
    with zipfile.ZipFile( path ) as archive:
        archive.extractall()
26
def GetFileFromTar( path, member, rename ):
    """Extract *member* from the gzipped tarball at *path*, rename it to
    *rename* in the current directory, and delete the extracted top-level
    directory.

    Fix: the original left the TarFile handle open (resource leak); a
    context manager now guarantees it is closed.
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    with tarfile.open( path, 'r:gz' ) as archive:
        archive.extract( member )
    shutil.move( member, rename )
    # member is a relative path like 'pkg/sub/file'; remove its root dir
    tree_rmv = member.split('/')[0]
    shutil.rmtree( tree_rmv )
34
def ReadBIG5File( dest ):
    """Decode the Big5-HKSCS file at *dest*, rewrite it in UTF-8 in place,
    and return the decoded text.

    Fix: file handles are now closed via ``with`` (the originals leaked on
    any exception between open and close).
    """
    print( 'Reading and decoding %s ...' % dest )
    with open( dest, 'r', encoding='big5hkscs', errors='replace' ) as f1:
        text = f1.read()
    # Undecodable bytes were replaced with U+FFFD; convert them to line
    # breaks so neighbouring entries stay on separate rows.
    text = text.replace( '\ufffd', '\n' )
    with open( dest, 'w', encoding='utf8' ) as f2:
        f2.write( text )
    return text
45
def ReadFile( dest ):
    """Return the entire contents of the UTF-8 text file at *dest*.

    Fix: the handle is now closed via ``with`` (the original leaked it on
    a read error).
    """
    print( 'Reading and decoding %s ...' % dest )
    with open( dest, 'r', encoding='utf8' ) as f:
        return f.read()
52
def ReadUnihanFile( dest ):
    """Parse Unihan.txt at *dest* into two code lists.

    Returns ``( t2s_code, s2t_code )`` where each element is a list of
    ``('U+XXXX', 'U+YYYY ...')`` pairs taken from kSimplifiedVariant /
    kTraditionalVariant lines.  Comment lines starting with '#' and lines
    carrying other Unihan properties are ignored.

    Fixes: replaced the manual ``while True``/``readline`` loop with
    idiomatic file iteration, replaced ``not line.find(x) == -1`` with
    ``x in line``, and closed the file with ``with`` (the original handle
    leaked on error).
    """
    print( 'Reading and decoding %s ...' % dest )
    t2s_code = []
    s2t_code = []
    with open( dest, 'r', encoding='utf8' ) as f:
        for line in f:
            if line.startswith('#'):
                continue
            if 'kSimplifiedVariant' in line:
                temp = line.split('kSimplifiedVariant')
                t2s_code.append( ( temp[0].strip(), temp[1].strip() ) )
            elif 'kTraditionalVariant' in line:
                temp = line.split('kTraditionalVariant')
                s2t_code.append( ( temp[0].strip(), temp[1].strip() ) )
    return ( t2s_code, s2t_code )
73
def RemoveRows( text, num ):
    """Delete the first *num* rows (each row's content plus its trailing
    whitespace/newline) from *text* and return the remainder.

    Fix: the pattern is now a raw string — ``'.*\\s*'`` as a plain literal
    relies on an invalid escape sequence that newer Pythons warn about.
    """
    return re.sub( r'.*\s*', '', text, count=num )
77
def RemoveOneCharConv( text ):
    """Blank out rows that consist of a single character — one-char
    conversions are handled by the char tables, not the word lists.

    Fix: raw-string regex (plain ``'^.\\s*$'`` uses an invalid escape that
    newer Pythons warn about).
    """
    preg = re.compile( r'^.\s*$', re.MULTILINE )
    return preg.sub( '', text )
82
def ConvertToChar( code ):
    """Turn a Unihan code-point reference like ``'U+4E00'`` (optionally
    followed by a ``<source`` suffix) into the actual character."""
    point, _, _ = code.partition( '<' )
    return chr( int( point[2:], 16 ) )
86
def GetDefaultTable( code_table ):
    """Build ``{ from_char: [to_char, ...] }`` from a list of
    ``(from_code, to_codes)`` pairs; pairs with an empty side are skipped."""
    char_table = {}
    for source, targets in code_table:
        if not source or not targets:
            continue
        key = ConvertToChar( source )
        char_table[key] = [ConvertToChar( code ) for code in targets.split()]
    return char_table
95
def GetManualTable( dest ):
    """Read a manual conversion file of ``|U+XXXXX|U+YYYYY|...`` tokens
    and return ``{ from_char: [to_chars...] }``."""
    char_table = {}
    for token in ReadFile( dest ).split():
        token = token.strip( '|' )
        if not token:
            continue
        parts = token.split( '|', 1 )
        # code points are written as U+XXXXX: take exactly five hex digits
        key = chr( int( parts[0][2:7], 16 ) )
        char_table[key] = [chr( int( code[2:7], 16 ) ) for code in parts[1].split( '|' )]
    return char_table
108
def GetValidTable( src_table ):
    """Collapse a one-to-many table to one-to-one by keeping only the
    first (preferred) target of every rule."""
    return {source: targets[0] for source, targets in src_table.items()}
114
def GetToManyRules( src_table ):
    """Collect every non-first target of the one-to-many rules as a set-like
    ``{ char: True }`` dict (used to flag ambiguous characters)."""
    tomany_table = {}
    for targets in src_table.values():
        for extra in targets[1:]:
            tomany_table[extra] = True
    return tomany_table
121
def RemoveRules( dest, table ):
    """Remove conversion rules listed in the manual file *dest* from *table*.

    The file holds whitespace-separated entries (quotes are stripped):
      - 'f => t'  : drop the rule mapping f
      - 'f =>'    : drop any rule whose source is f
      - '=> t'    : drop any rule whose target is t
      - 'x'       : treated as both source and target; drop both kinds
    *table* is mutated in place and also returned.
    """
    text = ReadFile( dest )
    temp1 = text.split()
    for elem in temp1:
        f = ''
        t = ''
        # strip surrounding quotes of either kind
        elem = elem.strip().replace( '"', '' ).replace( '\'', '' )
        if '=>' in elem:
            if elem.startswith( '=>' ):
                t = elem.replace( '=>', '' ).strip()
            elif elem.endswith( '=>' ):
                f = elem.replace( '=>', '' ).strip()
            else:
                temp2 = elem.split( '=>' )
                f = temp2[0].strip()
                t = temp2[1].strip()
                try:
                    # NOTE(review): pop(f, t) passes t as the *default*, so this
                    # never raises KeyError for a missing key — the except arm
                    # below appears unreachable in practice; kept as-is.
                    table.pop(f, t)
                    continue
                except:
                    continue
        else:
            # bare token: remove it both as a source and as a target
            f = t = elem
        if f:
            try:
                table.pop(f)
            except:
                x = 1  # key absent — deliberately ignored
        if t:
            # iterate over a copy since we mutate table while scanning
            for temp_f, temp_t in table.copy().items():
                if temp_t == t:
                    table.pop(temp_f)
    return table
155
def DictToSortedList1( src_table ):
    """Return the table as a list of (from, to) pairs sorted by source."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda item: item[0] )
    return pairs
158
def DictToSortedList2( src_table ):
    """Return the table as a list of (from, to) pairs sorted by target."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda item: item[1] )
    return pairs
161
def Converter( string, conv_table ):
    """Convert *string* using *conv_table* with greedy longest-match
    substitution: at each position the longest key found in the table is
    replaced, then scanning resumes just after the replacement."""
    pos = 0
    while pos < len( string ):
        # try the longest candidate first, down to a single character
        for width in range( len( string ) - pos, 0, -1 ):
            candidate = string[pos:pos + width]
            replacement = conv_table.get( candidate )
            if replacement:
                string = string[:pos] + replacement + string[pos + width:]
                # land on the last replaced char; the +1 below steps past it
                pos += len( replacement ) - 1
                break
        pos += 1
    return string
174
def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    """Build a word-level reconversion table from a raw word list.

    For each word (processed shortest-first, in same-length batches) the word
    is converted with the accumulated conversion table; a word->converted
    rule is recorded only when the conversion is not a no-op, or when the
    word contains an ambiguous character (per *src_tomany*) and would not
    round-trip back to itself.  Returns { converted_word: original_word }.
    """
    wordlist = list( set( src_wordlist ) )
    # sort longest-first; .pop() then yields the shortest words first
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    while wordlist:
        # refresh the working tables: word rules found so far, overlaid by
        # the char-level rules
        conv_table = {}
        reconv_table = {}
        conv_table.update( word_conv_table )
        conv_table.update( char_conv_table )
        reconv_table.update( word_reconv_table )
        reconv_table.update( char_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        # process the whole batch of words with this same length against a
        # frozen snapshot of the tables
        while new_word_len == word_len:
            rvt_test = False
            for char in word:
                # true if any char has a one-to-many reverse mapping
                rvt_test = rvt_test or src_tomany.get(char)
            test_word = Converter( word, reconv_table )
            new_word = Converter( word, conv_table )
            if not reconv_table.get( new_word ):
                # record only non-trivial conversions
                if not test_word == word:
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
            elif rvt_test:
                # converted form already reconverts to something; keep the
                # rule only if the round trip does not restore the word
                rvt_word = Converter( new_word, reconv_table )
                if not rvt_word == word:
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table
210
def GetManualWordsTable( src_wordlist, conv_table ):
    """Convert every word of a manually curated list (entries may carry
    trailing '#' comments) and return { converted_word: original_word }."""
    cleaned = [entry.split('#')[0].strip() for entry in src_wordlist]
    wordlist = list( set( cleaned ) )
    # longest first so .pop() processes shortest words first
    wordlist.sort( key = len, reverse = True )
    reconv_table = {}
    while wordlist:
        word = wordlist.pop()
        reconv_table[Converter( word, conv_table )] = word
    return reconv_table
221
def CustomRules( dest ):
    """Read whitespace-separated from/to token pairs from *dest* into a
    dict (token 0 maps to token 1, token 2 to token 3, ...)."""
    tokens = ReadFile( dest ).split()
    rules = {}
    for idx in range( 0, len( tokens ), 2 ):
        rules[tokens[idx]] = tokens[idx + 1]
    return rules
227
def GetPHPArray( table ):
    """Render a list of (from, to) pairs as the body lines of a PHP array
    literal, one `'from' => 'to',` entry per line."""
    return '\n'.join( '\'%s\' => \'%s\',' % pair for pair in table )
232
def RemoveSameChar( src_table ):
    """Return a copy of the table with identity mappings (from == to)
    removed."""
    return {source: target for source, target in src_table.items() if source != target}
239
def main():
    """Download the upstream data files, merge them with the *.manual
    tables in the current directory, and write ZhConversion.php.

    Requires network access (unicode.org FTP, SourceForge mirrors) and the
    manual table files alongside this script.
    """
    # Get Unihan.zip:
    url = 'ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    GetFileFromURL( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    GetFileFromURL( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    GetFileFromURL( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    GetFileFromURL( url, lbt_dest )

    # Extract the files from the compressed archives

    # Unihan.txt Simp. & Trad
    GetFileFromZip( han_dest )

    # Make word lists
    t_wordlist = []
    s_wordlist = []

    # EZ.txt.in Trad
    src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
    dst = 'EZ.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    # keep only the table body, then strip the input-code column
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( '.*\t', '', text )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # Wubi.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
    dst = 'Wubi.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    # keep the middle (phrase) column of 'code<TAB>phrase<TAB>freq' rows
    text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # Ziranma.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
    dst = 'Ziranma.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # phrase_lib.txt Simp
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    dst = 'phrase_lib.txt'
    GetFileFromTar( pyn_dest, src, dst )
    text = ReadFile( 'phrase_lib.txt' )
    # drop the frequency column, then the 5 header rows
    text = re.sub( '(.*)\t\d\d*.*', '\g<1>', text)
    text = RemoveRows( text, 5 )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # tsi.src Trad
    src = 'libtabe/tsi-src/tsi.src'
    dst = 'tsi.src'
    GetFileFromTar( lbt_dest, src, dst )
    text = ReadBIG5File( 'tsi.src' )
    # strip '# ' markers and everything from the frequency number onwards
    text = re.sub( ' \d.*', '', text.replace('# ', ''))
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # remove duplicate elements
    t_wordlist = list( set( t_wordlist ) )
    s_wordlist = list( set( s_wordlist ) )

    # simpphrases_exclude.manual Simp
    text = ReadFile( 'simpphrases_exclude.manual' )
    temp = text.split()
    s_string = '\n'.join( s_wordlist )
    # drop any word containing an excluded fragment
    for elem in temp:
        s_string = re.sub( '.*%s.*\n' % elem, '', s_string )
    s_wordlist = s_string.split('\n')

    # tradphrases_exclude.manual Trad
    text = ReadFile( 'tradphrases_exclude.manual' )
    temp = text.split()
    t_string = '\n'.join( t_wordlist )
    for elem in temp:
        t_string = re.sub( '.*%s.*\n' % elem, '', t_string )
    t_wordlist = t_string.split('\n')

    # Make char to char conversion table
    # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
    ( t2s_code, s2t_code ) = ReadUnihanFile( 'Unihan.txt' )
    # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
    t2s_1tomany = {}
    t2s_1tomany.update( GetDefaultTable( t2s_code ) )
    t2s_1tomany.update( GetManualTable( 'trad2simp.manual' ) )
    # dict s2t_1tomany
    s2t_1tomany = {}
    s2t_1tomany.update( GetDefaultTable( s2t_code ) )
    s2t_1tomany.update( GetManualTable( 'simp2trad.manual' ) )
    # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
    t2s_1to1 = GetValidTable( t2s_1tomany )
    s_tomany = GetToManyRules( t2s_1tomany )
    # dict s2t_1to1; s2t_trans
    s2t_1to1 = GetValidTable( s2t_1tomany )
    t_tomany = GetToManyRules( s2t_1tomany )
    # remove noconvert rules
    t2s_1to1 = RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # Make word to word conversion table
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    # trad2simp_supp_set.manual
    t2s_1to1_supp.update( CustomRules( 'trad2simp_supp_set.manual' ) )
    # simp2trad_supp_set.manual
    s2t_1to1_supp.update( CustomRules( 'simp2trad_supp_set.manual' ) )
    # simpphrases.manual
    text = ReadFile( 'simpphrases.manual' )
    s_wordlist_manual = text.split('\n')
    t2s_word2word_manual = GetManualWordsTable(s_wordlist_manual, s2t_1to1_supp)
    t2s_word2word_manual.update( CustomRules( 'toSimp.manual' ) )
    # tradphrases.manual
    text = ReadFile( 'tradphrases.manual' )
    t_wordlist_manual = text.split('\n')
    s2t_word2word_manual = GetManualWordsTable(t_wordlist_manual, t2s_1to1_supp)
    s2t_word2word_manual.update( CustomRules( 'toTrad.manual' ) )
    # t2s_word2word
    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )
    t2s_word2word = GetDefaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    ## toSimp.manual
    t2s_word2word.update( t2s_word2word_manual )
    # s2t_word2word
    s2t_word2word = GetDefaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    ## toTrad.manual
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = RemoveSameChar( t2s_1to1 )
    s2t_1to1 = RemoveSameChar( s2t_1to1 )
    toHans = DictToSortedList1( t2s_1to1 ) + DictToSortedList2( t2s_word2word )
    # sorted list toHant
    toHant = DictToSortedList1( s2t_1to1 ) + DictToSortedList2( s2t_word2word )
    # sorted list toCN
    toCN = DictToSortedList2( CustomRules( 'toCN.manual' ) )
    # sorted list toHK
    toHK = DictToSortedList2( CustomRules( 'toHK.manual' ) )
    # sorted list toSG
    toSG = DictToSortedList2( CustomRules( 'toSG.manual' ) )
    # sorted list toTW
    toTW = DictToSortedList2( CustomRules( 'toTW.manual' ) )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 */

$zh2Hant = array(\n'''
    php += GetPHPArray( toHant )
    php += '\n);\n\n$zh2Hans = array(\n'
    php += GetPHPArray( toHans )
    php += '\n);\n\n$zh2TW = array(\n'
    php += GetPHPArray( toTW )
    php += '\n);\n\n$zh2HK = array(\n'
    php += GetPHPArray( toHK )
    php += '\n);\n\n$zh2CN = array(\n'
    php += GetPHPArray( toCN )
    php += '\n);\n\n$zh2SG = array(\n'
    php += GetPHPArray( toSG )
    php += '\n);'

    f = open( 'ZhConversion.php', 'w', encoding = 'utf8' )
    print ('Writing ZhConversion.php ... ')
    f.write( php )
    f.close()
436
# Entry point: rebuild the conversion tables when run as a script.
if __name__ == '__main__':
    main()