# Gather phrases from the upstream input-method data files.  Sources marked
# "Trad" are appended to t_wordlist, sources marked "Simp" to s_wordlist.
# NOTE(review): GetFileFromTar presumably extracts `src` from the given
# archive into the local file `dst`, and RemoveOneCharConv presumably drops
# single-character entries -- both are defined outside this section; confirm.

def _table_body( raw ):
    """Return the stripped text between the BEGIN_TABLE and END_TABLE markers."""
    return raw.split( 'BEGIN_TABLE' )[1].split( 'END_TABLE' )[0].strip()

# EZ.txt.in Trad
src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
dst = 'EZ.txt.in'
GetFileFromTar( tbe_dest, src, dst )
text = _table_body( ReadFile( dst ) )
# Keep only what follows the last tab on each line.
text = re.sub( r'.*\t', '', text )
text = RemoveOneCharConv( text )
t_wordlist.extend( text.split() )

# Wubi.txt.in Simp
src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
dst = 'Wubi.txt.in'
GetFileFromTar( tbe_dest, src, dst )
text = _table_body( ReadFile( dst ) )
# Reduce "...<TAB>phrase<TAB>digits" to just "phrase".
text = re.sub( r'.*\t(.*?)\t\d*', r'\g<1>', text )
text = RemoveOneCharConv( text )
s_wordlist.extend( text.split() )

# Ziranma.txt.in Simp
src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
dst = 'Ziranma.txt.in'
GetFileFromTar( tbe_dest, src, dst )
text = _table_body( ReadFile( dst ) )
# Same "...<TAB>phrase<TAB>digits" reduction as for Wubi.
text = re.sub( r'.*\t(.*?)\t\d*', r'\g<1>', text )
text = RemoveOneCharConv( text )
s_wordlist.extend( text.split() )

# phrase_lib.txt Simp
src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
dst = 'phrase_lib.txt'
GetFileFromTar( pyn_dest, src, dst )
text = ReadFile( dst )
# Cut each line at the last tab that is followed by a digit, keeping the
# text in front of it.
text = re.sub( r'(.*)\t\d\d*.*', r'\g<1>', text )
# NOTE(review): presumably skips 5 header rows -- confirm against RemoveRows.
text = RemoveRows( text, 5 )
text = RemoveOneCharConv( text )
s_wordlist.extend( text.split() )

# tsi.src Trad
src = 'libtabe/tsi-src/tsi.src'
dst = 'tsi.src'
GetFileFromTar( lbt_dest, src, dst )
text = ReadBIG5File( dst )
# Delete every "# " marker, then cut each line at the first " <digit>".
text = re.sub( r' \d.*', '', text.replace( '# ', '' ) )
text = RemoveOneCharConv( text )
t_wordlist.extend( text.split() )

# remove duplicate elements
t_wordlist = list( set( t_wordlist ) )
s_wordlist = list( set( s_wordlist ) )

# simpphrases_exclude.manual Simp
# Drop every simplified phrase that matches one of the whitespace-separated
# entries of the exclusion file.  Entries are still interpreted as regular
# expressions, as before.  Each word is tested on its own, so the final word
# is filtered too (the old joined-string pattern required a trailing newline
# and never removed it), and an empty list stays [] instead of becoming [''].
text = ReadFile( 'simpphrases_exclude.manual' )
temp = text.split()
exclude_res = [ re.compile( elem ) for elem in temp ]
s_wordlist = [ word for word in s_wordlist
               if not any( rx.search( word ) for rx in exclude_res ) ]