Merge "Generate Utf8Case.ser directly from UnicodeData.txt"

author jenkins-bot <jenkins-bot@gerrit.wikimedia.org>

Sun, 12 Jan 2014 03:10:50 +0000 (03:10 +0000)

committer Gerrit Code Review <gerrit@wikimedia.org>

Sun, 12 Jan 2014 03:10:50 +0000 (03:10 +0000)
author jenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Sun, 12 Jan 2014 03:10:50 +0000 (03:10 +0000)
committer Gerrit Code Review <gerrit@wikimedia.org>
Sun, 12 Jan 2014 03:10:50 +0000 (03:10 +0000)
diff --git a/includes/normal/.gitignore b/includes/normal/.gitignore

new file mode 100644 (file)

index 0000000..0dea457
--- /dev/null
+++ b/includes/normal/.gitignore
@@ -0,0 +1,7 @@
+/CompositionExclusions.txt
+/DerivedNormalizationProps.txt
+/NormalizationCorrections.txt
+/NormalizationTest.txt
+/UTF-8-test.txt
+/UnicodeData.txt
+/testdata
diff --git a/includes/normal/Makefile b/includes/normal/Makefile

index 66348ee..76cb68b 100644 (file)
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -16,14 +16,11 @@ PHP=php
  FETCH=wget
  #FETCH=fetch
  
-all : UtfNormalData.inc Utf8Case.php
+all : UtfNormalData.inc
  
  UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
         $(PHP) UtfNormalGenerate.php
  
-Utf8Case.php : Utf8CaseGenerate.php UtfNormalUtil.php UnicodeData.txt
-       $(PHP) Utf8CaseGenerate.php
-
  test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
         $(PHP) UtfNormalTest.php
  
diff --git a/includes/normal/Utf8Case.php b/includes/normal/Utf8Case.php

deleted file mode 100644 (file)

index abc56e4..0000000
--- a/includes/normal/Utf8Case.php
+++ /dev/null
@@ -1,2109 +0,0 @@
-<?php
-/**
- * Simple 1:1 upper/lowercase switching arrays for utf-8 text.
- * Won't get context-sensitive things yet.
- *
- * Hack for bugs in ucfirst() and company
- *
- * These are pulled from memcached if possible, as this is faster than filling
- * up a big array manually.
- *
- * @file
- * @ingroup Language
- */
-
-/**
- * Translation array to get upper case character
- */
-$wikiUpperChars = array(
-       'a' => 'A',
-       'b' => 'B',
-       'c' => 'C',
-       'd' => 'D',
-       'e' => 'E',
-       'f' => 'F',
-       'g' => 'G',
-       'h' => 'H',
-       'i' => 'I',
-       'j' => 'J',
-       'k' => 'K',
-       'l' => 'L',
-       'm' => 'M',
-       'n' => 'N',
-       'o' => 'O',
-       'p' => 'P',
-       'q' => 'Q',
-       'r' => 'R',
-       's' => 'S',
-       't' => 'T',
-       'u' => 'U',
-       'v' => 'V',
-       'w' => 'W',
-       'x' => 'X',
-       'y' => 'Y',
-       'z' => 'Z',
-       'µ' => 'Μ',
-       'à' => 'À',
-       'á' => 'Á',
-       'â' => 'Â',
-       'ã' => 'Ã',
-       'ä' => 'Ä',
-       'å' => 'Å',
-       'æ' => 'Æ',
-       'ç' => 'Ç',
-       'è' => 'È',
-       'é' => 'É',
-       'ê' => 'Ê',
-       'ë' => 'Ë',
-       'ì' => 'Ì',
-       'í' => 'Í',
-       'î' => 'Î',
-       'ï' => 'Ï',
-       'ð' => 'Ð',
-       'ñ' => 'Ñ',
-       'ò' => 'Ò',
-       'ó' => 'Ó',
-       'ô' => 'Ô',
-       'õ' => 'Õ',
-       'ö' => 'Ö',
-       'ø' => 'Ø',
-       'ù' => 'Ù',
-       'ú' => 'Ú',
-       'û' => 'Û',
-       'ü' => 'Ü',
-       'ý' => 'Ý',
-       'þ' => 'Þ',
-       'ÿ' => 'Ÿ',
-       'ā' => 'Ā',
-       'ă' => 'Ă',
-       'ą' => 'Ą',
-       'ć' => 'Ć',
-       'ĉ' => 'Ĉ',
-       'ċ' => 'Ċ',
-       'č' => 'Č',
-       'ď' => 'Ď',
-       'đ' => 'Đ',
-       'ē' => 'Ē',
-       'ĕ' => 'Ĕ',
-       'ė' => 'Ė',
-       'ę' => 'Ę',
-       'ě' => 'Ě',
-       'ĝ' => 'Ĝ',
-       'ğ' => 'Ğ',
-       'ġ' => 'Ġ',
-       'ģ' => 'Ģ',
-       'ĥ' => 'Ĥ',
-       'ħ' => 'Ħ',
-       'ĩ' => 'Ĩ',
-       'ī' => 'Ī',
-       'ĭ' => 'Ĭ',
-       'į' => 'Į',
-       'ı' => 'I',
-       'ĳ' => 'Ĳ',
-       'ĵ' => 'Ĵ',
-       'ķ' => 'Ķ',
-       'ĺ' => 'Ĺ',
-       'ļ' => 'Ļ',
-       'ľ' => 'Ľ',
-       'ŀ' => 'Ŀ',
-       'ł' => 'Ł',
-       'ń' => 'Ń',
-       'ņ' => 'Ņ',
-       'ň' => 'Ň',
-       'ŋ' => 'Ŋ',
-       'ō' => 'Ō',
-       'ŏ' => 'Ŏ',
-       'ő' => 'Ő',
-       'œ' => 'Œ',
-       'ŕ' => 'Ŕ',
-       'ŗ' => 'Ŗ',
-       'ř' => 'Ř',
-       'ś' => 'Ś',
-       'ŝ' => 'Ŝ',
-       'ş' => 'Ş',
-       'š' => 'Š',
-       'ţ' => 'Ţ',
-       'ť' => 'Ť',
-       'ŧ' => 'Ŧ',
-       'ũ' => 'Ũ',
-       'ū' => 'Ū',
-       'ŭ' => 'Ŭ',
-       'ů' => 'Ů',
-       'ű' => 'Ű',
-       'ų' => 'Ų',
-       'ŵ' => 'Ŵ',
-       'ŷ' => 'Ŷ',
-       'ź' => 'Ź',
-       'ż' => 'Ż',
-       'ž' => 'Ž',
-       'ſ' => 'S',
-       'ƀ' => 'Ƀ',
-       'ƃ' => 'Ƃ',
-       'ƅ' => 'Ƅ',
-       'ƈ' => 'Ƈ',
-       'ƌ' => 'Ƌ',
-       'ƒ' => 'Ƒ',
-       'ƕ' => 'Ƕ',
-       'ƙ' => 'Ƙ',
-       'ƚ' => 'Ƚ',
-       'ƞ' => 'Ƞ',
-       'ơ' => 'Ơ',
-       'ƣ' => 'Ƣ',
-       'ƥ' => 'Ƥ',
-       'ƨ' => 'Ƨ',
-       'ƭ' => 'Ƭ',
-       'ư' => 'Ư',
-       'ƴ' => 'Ƴ',
-       'ƶ' => 'Ƶ',
-       'ƹ' => 'Ƹ',
-       'ƽ' => 'Ƽ',
-       'ƿ' => 'Ƿ',
-       'ǅ' => 'Ǆ',
-       'ǆ' => 'Ǆ',
-       'ǈ' => 'Ǉ',
-       'ǉ' => 'Ǉ',
-       'ǋ' => 'Ǌ',
-       'ǌ' => 'Ǌ',
-       'ǎ' => 'Ǎ',
-       'ǐ' => 'Ǐ',
-       'ǒ' => 'Ǒ',
-       'ǔ' => 'Ǔ',
-       'ǖ' => 'Ǖ',
-       'ǘ' => 'Ǘ',
-       'ǚ' => 'Ǚ',
-       'ǜ' => 'Ǜ',
-       'ǝ' => 'Ǝ',
-       'ǟ' => 'Ǟ',
-       'ǡ' => 'Ǡ',
-       'ǣ' => 'Ǣ',
-       'ǥ' => 'Ǥ',
-       'ǧ' => 'Ǧ',
-       'ǩ' => 'Ǩ',
-       'ǫ' => 'Ǫ',
-       'ǭ' => 'Ǭ',
-       'ǯ' => 'Ǯ',
-       'ǲ' => 'Ǳ',
-       'ǳ' => 'Ǳ',
-       'ǵ' => 'Ǵ',
-       'ǹ' => 'Ǹ',
-       'ǻ' => 'Ǻ',
-       'ǽ' => 'Ǽ',
-       'ǿ' => 'Ǿ',
-       'ȁ' => 'Ȁ',
-       'ȃ' => 'Ȃ',
-       'ȅ' => 'Ȅ',
-       'ȇ' => 'Ȇ',
-       'ȉ' => 'Ȉ',
-       'ȋ' => 'Ȋ',
-       'ȍ' => 'Ȍ',
-       'ȏ' => 'Ȏ',
-       'ȑ' => 'Ȑ',
-       'ȓ' => 'Ȓ',
-       'ȕ' => 'Ȕ',
-       'ȗ' => 'Ȗ',
-       'ș' => 'Ș',
-       'ț' => 'Ț',
-       'ȝ' => 'Ȝ',
-       'ȟ' => 'Ȟ',
-       'ȣ' => 'Ȣ',
-       'ȥ' => 'Ȥ',
-       'ȧ' => 'Ȧ',
-       'ȩ' => 'Ȩ',
-       'ȫ' => 'Ȫ',
-       'ȭ' => 'Ȭ',
-       'ȯ' => 'Ȯ',
-       'ȱ' => 'Ȱ',
-       'ȳ' => 'Ȳ',
-       'ȼ' => 'Ȼ',
-       'ȿ' => 'Ȿ',
-       'ɀ' => 'Ɀ',
-       'ɂ' => 'Ɂ',
-       'ɇ' => 'Ɇ',
-       'ɉ' => 'Ɉ',
-       'ɋ' => 'Ɋ',
-       'ɍ' => 'Ɍ',
-       'ɏ' => 'Ɏ',
-       'ɐ' => 'Ɐ',
-       'ɑ' => 'Ɑ',
-       'ɒ' => 'Ɒ',
-       'ɓ' => 'Ɓ',
-       'ɔ' => 'Ɔ',
-       'ɖ' => 'Ɖ',
-       'ɗ' => 'Ɗ',
-       'ə' => 'Ə',
-       'ɛ' => 'Ɛ',
-       'ɠ' => 'Ɠ',
-       'ɣ' => 'Ɣ',
-       'ɥ' => 'Ɥ',
-       'ɨ' => 'Ɨ',
-       'ɩ' => 'Ɩ',
-       'ɫ' => 'Ɫ',
-       'ɯ' => 'Ɯ',
-       'ɱ' => 'Ɱ',
-       'ɲ' => 'Ɲ',
-       'ɵ' => 'Ɵ',
-       'ɽ' => 'Ɽ',
-       'ʀ' => 'Ʀ',
-       'ʃ' => 'Ʃ',
-       'ʈ' => 'Ʈ',
-       'ʉ' => 'Ʉ',
-       'ʊ' => 'Ʊ',
-       'ʋ' => 'Ʋ',
-       'ʌ' => 'Ʌ',
-       'ʒ' => 'Ʒ',
-       'ͅ' => 'Ι',
-       'ͱ' => 'Ͱ',
-       'ͳ' => 'Ͳ',
-       'ͷ' => 'Ͷ',
-       'ͻ' => 'Ͻ',
-       'ͼ' => 'Ͼ',
-       'ͽ' => 'Ͽ',
-       'ά' => 'Ά',
-       'έ' => 'Έ',
-       'ή' => 'Ή',
-       'ί' => 'Ί',
-       'α' => 'Α',
-       'β' => 'Β',
-       'γ' => 'Γ',
-       'δ' => 'Δ',
-       'ε' => 'Ε',
-       'ζ' => 'Ζ',
-       'η' => 'Η',
-       'θ' => 'Θ',
-       'ι' => 'Ι',
-       'κ' => 'Κ',
-       'λ' => 'Λ',
-       'μ' => 'Μ',
-       'ν' => 'Ν',
-       'ξ' => 'Ξ',
-       'ο' => 'Ο',
-       'π' => 'Π',
-       'ρ' => 'Ρ',
-       'ς' => 'Σ',
-       'σ' => 'Σ',
-       'τ' => 'Τ',
-       'υ' => 'Υ',
-       'φ' => 'Φ',
-       'χ' => 'Χ',
-       'ψ' => 'Ψ',
-       'ω' => 'Ω',
-       'ϊ' => 'Ϊ',
-       'ϋ' => 'Ϋ',
-       'ό' => 'Ό',
-       'ύ' => 'Ύ',
-       'ώ' => 'Ώ',
-       'ϐ' => 'Β',
-       'ϑ' => 'Θ',
-       'ϕ' => 'Φ',
-       'ϖ' => 'Π',
-       'ϗ' => 'Ϗ',
-       'ϙ' => 'Ϙ',
-       'ϛ' => 'Ϛ',
-       'ϝ' => 'Ϝ',
-       'ϟ' => 'Ϟ',
-       'ϡ' => 'Ϡ',
-       'ϣ' => 'Ϣ',
-       'ϥ' => 'Ϥ',
-       'ϧ' => 'Ϧ',
-       'ϩ' => 'Ϩ',
-       'ϫ' => 'Ϫ',
-       'ϭ' => 'Ϭ',
-       'ϯ' => 'Ϯ',
-       'ϰ' => 'Κ',
-       'ϱ' => 'Ρ',
-       'ϲ' => 'Ϲ',
-       'ϵ' => 'Ε',
-       'ϸ' => 'Ϸ',
-       'ϻ' => 'Ϻ',
-       'а' => 'А',
-       'б' => 'Б',
-       'в' => 'В',
-       'г' => 'Г',
-       'д' => 'Д',
-       'е' => 'Е',
-       'ж' => 'Ж',
-       'з' => 'З',
-       'и' => 'И',
-       'й' => 'Й',
-       'к' => 'К',
-       'л' => 'Л',
-       'м' => 'М',
-       'н' => 'Н',
-       'о' => 'О',
-       'п' => 'П',
-       'р' => 'Р',
-       'с' => 'С',
-       'т' => 'Т',
-       'у' => 'У',
-       'ф' => 'Ф',
-       'х' => 'Х',
-       'ц' => 'Ц',
-       'ч' => 'Ч',
-       'ш' => 'Ш',
-       'щ' => 'Щ',
-       'ъ' => 'Ъ',
-       'ы' => 'Ы',
-       'ь' => 'Ь',
-       'э' => 'Э',
-       'ю' => 'Ю',
-       'я' => 'Я',
-       'ѐ' => 'Ѐ',
-       'ё' => 'Ё',
-       'ђ' => 'Ђ',
-       'ѓ' => 'Ѓ',
-       'є' => 'Є',
-       'ѕ' => 'Ѕ',
-       'і' => 'І',
-       'ї' => 'Ї',
-       'ј' => 'Ј',
-       'љ' => 'Љ',
-       'њ' => 'Њ',
-       'ћ' => 'Ћ',
-       'ќ' => 'Ќ',
-       'ѝ' => 'Ѝ',
-       'ў' => 'Ў',
-       'џ' => 'Џ',
-       'ѡ' => 'Ѡ',
-       'ѣ' => 'Ѣ',
-       'ѥ' => 'Ѥ',
-       'ѧ' => 'Ѧ',
-       'ѩ' => 'Ѩ',
-       'ѫ' => 'Ѫ',
-       'ѭ' => 'Ѭ',
-       'ѯ' => 'Ѯ',
-       'ѱ' => 'Ѱ',
-       'ѳ' => 'Ѳ',
-       'ѵ' => 'Ѵ',
-       'ѷ' => 'Ѷ',
-       'ѹ' => 'Ѹ',
-       'ѻ' => 'Ѻ',
-       'ѽ' => 'Ѽ',
-       'ѿ' => 'Ѿ',
-       'ҁ' => 'Ҁ',
-       'ҋ' => 'Ҋ',
-       'ҍ' => 'Ҍ',
-       'ҏ' => 'Ҏ',
-       'ґ' => 'Ґ',
-       'ғ' => 'Ғ',
-       'ҕ' => 'Ҕ',
-       'җ' => 'Җ',
-       'ҙ' => 'Ҙ',
-       'қ' => 'Қ',
-       'ҝ' => 'Ҝ',
-       'ҟ' => 'Ҟ',
-       'ҡ' => 'Ҡ',
-       'ң' => 'Ң',
-       'ҥ' => 'Ҥ',
-       'ҧ' => 'Ҧ',
-       'ҩ' => 'Ҩ',
-       'ҫ' => 'Ҫ',
-       'ҭ' => 'Ҭ',
-       'ү' => 'Ү',
-       'ұ' => 'Ұ',
-       'ҳ' => 'Ҳ',
-       'ҵ' => 'Ҵ',
-       'ҷ' => 'Ҷ',
-       'ҹ' => 'Ҹ',
-       'һ' => 'Һ',
-       'ҽ' => 'Ҽ',
-       'ҿ' => 'Ҿ',
-       'ӂ' => 'Ӂ',
-       'ӄ' => 'Ӄ',
-       'ӆ' => 'Ӆ',
-       'ӈ' => 'Ӈ',
-       'ӊ' => 'Ӊ',
-       'ӌ' => 'Ӌ',
-       'ӎ' => 'Ӎ',
-       'ӏ' => 'Ӏ',
-       'ӑ' => 'Ӑ',
-       'ӓ' => 'Ӓ',
-       'ӕ' => 'Ӕ',
-       'ӗ' => 'Ӗ',
-       'ә' => 'Ә',
-       'ӛ' => 'Ӛ',
-       'ӝ' => 'Ӝ',
-       'ӟ' => 'Ӟ',
-       'ӡ' => 'Ӡ',
-       'ӣ' => 'Ӣ',
-       'ӥ' => 'Ӥ',
-       'ӧ' => 'Ӧ',
-       'ө' => 'Ө',
-       'ӫ' => 'Ӫ',
-       'ӭ' => 'Ӭ',
-       'ӯ' => 'Ӯ',
-       'ӱ' => 'Ӱ',
-       'ӳ' => 'Ӳ',
-       'ӵ' => 'Ӵ',
-       'ӷ' => 'Ӷ',
-       'ӹ' => 'Ӹ',
-       'ӻ' => 'Ӻ',
-       'ӽ' => 'Ӽ',
-       'ӿ' => 'Ӿ',
-       'ԁ' => 'Ԁ',
-       'ԃ' => 'Ԃ',
-       'ԅ' => 'Ԅ',
-       'ԇ' => 'Ԇ',
-       'ԉ' => 'Ԉ',
-       'ԋ' => 'Ԋ',
-       'ԍ' => 'Ԍ',
-       'ԏ' => 'Ԏ',
-       'ԑ' => 'Ԑ',
-       'ԓ' => 'Ԓ',
-       'ԕ' => 'Ԕ',
-       'ԗ' => 'Ԗ',
-       'ԙ' => 'Ԙ',
-       'ԛ' => 'Ԛ',
-       'ԝ' => 'Ԝ',
-       'ԟ' => 'Ԟ',
-       'ԡ' => 'Ԡ',
-       'ԣ' => 'Ԣ',
-       'ԥ' => 'Ԥ',
-       'ԧ' => 'Ԧ',
-       'ա' => 'Ա',
-       'բ' => 'Բ',
-       'գ' => 'Գ',
-       'դ' => 'Դ',
-       'ե' => 'Ե',
-       'զ' => 'Զ',
-       'է' => 'Է',
-       'ը' => 'Ը',
-       'թ' => 'Թ',
-       'ժ' => 'Ժ',
-       'ի' => 'Ի',
-       'լ' => 'Լ',
-       'խ' => 'Խ',
-       'ծ' => 'Ծ',
-       'կ' => 'Կ',
-       'հ' => 'Հ',
-       'ձ' => 'Ձ',
-       'ղ' => 'Ղ',
-       'ճ' => 'Ճ',
-       'մ' => 'Մ',
-       'յ' => 'Յ',
-       'ն' => 'Ն',
-       'շ' => 'Շ',
-       'ո' => 'Ո',
-       'չ' => 'Չ',
-       'պ' => 'Պ',
-       'ջ' => 'Ջ',
-       'ռ' => 'Ռ',
-       'ս' => 'Ս',
-       'վ' => 'Վ',
-       'տ' => 'Տ',
-       'ր' => 'Ր',
-       'ց' => 'Ց',
-       'ւ' => 'Ւ',
-       'փ' => 'Փ',
-       'ք' => 'Ք',
-       'օ' => 'Օ',
-       'ֆ' => 'Ֆ',
-       'ᵹ' => 'Ᵹ',
-       'ᵽ' => 'Ᵽ',
-       'ḁ' => 'Ḁ',
-       'ḃ' => 'Ḃ',
-       'ḅ' => 'Ḅ',
-       'ḇ' => 'Ḇ',
-       'ḉ' => 'Ḉ',
-       'ḋ' => 'Ḋ',
-       'ḍ' => 'Ḍ',
-       'ḏ' => 'Ḏ',
-       'ḑ' => 'Ḑ',
-       'ḓ' => 'Ḓ',
-       'ḕ' => 'Ḕ',
-       'ḗ' => 'Ḗ',
-       'ḙ' => 'Ḙ',
-       'ḛ' => 'Ḛ',
-       'ḝ' => 'Ḝ',
-       'ḟ' => 'Ḟ',
-       'ḡ' => 'Ḡ',
-       'ḣ' => 'Ḣ',
-       'ḥ' => 'Ḥ',
-       'ḧ' => 'Ḧ',
-       'ḩ' => 'Ḩ',
-       'ḫ' => 'Ḫ',
-       'ḭ' => 'Ḭ',
-       'ḯ' => 'Ḯ',
-       'ḱ' => 'Ḱ',
-       'ḳ' => 'Ḳ',
-       'ḵ' => 'Ḵ',
-       'ḷ' => 'Ḷ',
-       'ḹ' => 'Ḹ',
-       'ḻ' => 'Ḻ',
-       'ḽ' => 'Ḽ',
-       'ḿ' => 'Ḿ',
-       'ṁ' => 'Ṁ',
-       'ṃ' => 'Ṃ',
-       'ṅ' => 'Ṅ',
-       'ṇ' => 'Ṇ',
-       'ṉ' => 'Ṉ',
-       'ṋ' => 'Ṋ',
-       'ṍ' => 'Ṍ',
-       'ṏ' => 'Ṏ',
-       'ṑ' => 'Ṑ',
-       'ṓ' => 'Ṓ',
-       'ṕ' => 'Ṕ',
-       'ṗ' => 'Ṗ',
-       'ṙ' => 'Ṙ',
-       'ṛ' => 'Ṛ',
-       'ṝ' => 'Ṝ',
-       'ṟ' => 'Ṟ',
-       'ṡ' => 'Ṡ',
-       'ṣ' => 'Ṣ',
-       'ṥ' => 'Ṥ',
-       'ṧ' => 'Ṧ',
-       'ṩ' => 'Ṩ',
-       'ṫ' => 'Ṫ',
-       'ṭ' => 'Ṭ',
-       'ṯ' => 'Ṯ',
-       'ṱ' => 'Ṱ',
-       'ṳ' => 'Ṳ',
-       'ṵ' => 'Ṵ',
-       'ṷ' => 'Ṷ',
-       'ṹ' => 'Ṹ',
-       'ṻ' => 'Ṻ',
-       'ṽ' => 'Ṽ',
-       'ṿ' => 'Ṿ',
-       'ẁ' => 'Ẁ',
-       'ẃ' => 'Ẃ',
-       'ẅ' => 'Ẅ',
-       'ẇ' => 'Ẇ',
-       'ẉ' => 'Ẉ',
-       'ẋ' => 'Ẋ',
-       'ẍ' => 'Ẍ',
-       'ẏ' => 'Ẏ',
-       'ẑ' => 'Ẑ',
-       'ẓ' => 'Ẓ',
-       'ẕ' => 'Ẕ',
-       'ẛ' => 'Ṡ',
-       'ạ' => 'Ạ',
-       'ả' => 'Ả',
-       'ấ' => 'Ấ',
-       'ầ' => 'Ầ',
-       'ẩ' => 'Ẩ',
-       'ẫ' => 'Ẫ',
-       'ậ' => 'Ậ',
-       'ắ' => 'Ắ',
-       'ằ' => 'Ằ',
-       'ẳ' => 'Ẳ',
-       'ẵ' => 'Ẵ',
-       'ặ' => 'Ặ',
-       'ẹ' => 'Ẹ',
-       'ẻ' => 'Ẻ',
-       'ẽ' => 'Ẽ',
-       'ế' => 'Ế',
-       'ề' => 'Ề',
-       'ể' => 'Ể',
-       'ễ' => 'Ễ',
-       'ệ' => 'Ệ',
-       'ỉ' => 'Ỉ',
-       'ị' => 'Ị',
-       'ọ' => 'Ọ',
-       'ỏ' => 'Ỏ',
-       'ố' => 'Ố',
-       'ồ' => 'Ồ',
-       'ổ' => 'Ổ',
-       'ỗ' => 'Ỗ',
-       'ộ' => 'Ộ',
-       'ớ' => 'Ớ',
-       'ờ' => 'Ờ',
-       'ở' => 'Ở',
-       'ỡ' => 'Ỡ',
-       'ợ' => 'Ợ',
-       'ụ' => 'Ụ',
-       'ủ' => 'Ủ',
-       'ứ' => 'Ứ',
-       'ừ' => 'Ừ',
-       'ử' => 'Ử',
-       'ữ' => 'Ữ',
-       'ự' => 'Ự',
-       'ỳ' => 'Ỳ',
-       'ỵ' => 'Ỵ',
-       'ỷ' => 'Ỷ',
-       'ỹ' => 'Ỹ',
-       'ỻ' => 'Ỻ',
-       'ỽ' => 'Ỽ',
-       'ỿ' => 'Ỿ',
-       'ἀ' => 'Ἀ',
-       'ἁ' => 'Ἁ',
-       'ἂ' => 'Ἂ',
-       'ἃ' => 'Ἃ',
-       'ἄ' => 'Ἄ',
-       'ἅ' => 'Ἅ',
-       'ἆ' => 'Ἆ',
-       'ἇ' => 'Ἇ',
-       'ἐ' => 'Ἐ',
-       'ἑ' => 'Ἑ',
-       'ἒ' => 'Ἒ',
-       'ἓ' => 'Ἓ',
-       'ἔ' => 'Ἔ',
-       'ἕ' => 'Ἕ',
-       'ἠ' => 'Ἠ',
-       'ἡ' => 'Ἡ',
-       'ἢ' => 'Ἢ',
-       'ἣ' => 'Ἣ',
-       'ἤ' => 'Ἤ',
-       'ἥ' => 'Ἥ',
-       'ἦ' => 'Ἦ',
-       'ἧ' => 'Ἧ',
-       'ἰ' => 'Ἰ',
-       'ἱ' => 'Ἱ',
-       'ἲ' => 'Ἲ',
-       'ἳ' => 'Ἳ',
-       'ἴ' => 'Ἴ',
-       'ἵ' => 'Ἵ',
-       'ἶ' => 'Ἶ',
-       'ἷ' => 'Ἷ',
-       'ὀ' => 'Ὀ',
-       'ὁ' => 'Ὁ',
-       'ὂ' => 'Ὂ',
-       'ὃ' => 'Ὃ',
-       'ὄ' => 'Ὄ',
-       'ὅ' => 'Ὅ',
-       'ὑ' => 'Ὑ',
-       'ὓ' => 'Ὓ',
-       'ὕ' => 'Ὕ',
-       'ὗ' => 'Ὗ',
-       'ὠ' => 'Ὠ',
-       'ὡ' => 'Ὡ',
-       'ὢ' => 'Ὢ',
-       'ὣ' => 'Ὣ',
-       'ὤ' => 'Ὤ',
-       'ὥ' => 'Ὥ',
-       'ὦ' => 'Ὦ',
-       'ὧ' => 'Ὧ',
-       'ὰ' => 'Ὰ',
-       'ά' => 'Ά',
-       'ὲ' => 'Ὲ',
-       'έ' => 'Έ',
-       'ὴ' => 'Ὴ',
-       'ή' => 'Ή',
-       'ὶ' => 'Ὶ',
-       'ί' => 'Ί',
-       'ὸ' => 'Ὸ',
-       'ό' => 'Ό',
-       'ὺ' => 'Ὺ',
-       'ύ' => 'Ύ',
-       'ὼ' => 'Ὼ',
-       'ώ' => 'Ώ',
-       'ᾀ' => 'ᾈ',
-       'ᾁ' => 'ᾉ',
-       'ᾂ' => 'ᾊ',
-       'ᾃ' => 'ᾋ',
-       'ᾄ' => 'ᾌ',
-       'ᾅ' => 'ᾍ',
-       'ᾆ' => 'ᾎ',
-       'ᾇ' => 'ᾏ',
-       'ᾐ' => 'ᾘ',
-       'ᾑ' => 'ᾙ',
-       'ᾒ' => 'ᾚ',
-       'ᾓ' => 'ᾛ',
-       'ᾔ' => 'ᾜ',
-       'ᾕ' => 'ᾝ',
-       'ᾖ' => 'ᾞ',
-       'ᾗ' => 'ᾟ',
-       'ᾠ' => 'ᾨ',
-       'ᾡ' => 'ᾩ',
-       'ᾢ' => 'ᾪ',
-       'ᾣ' => 'ᾫ',
-       'ᾤ' => 'ᾬ',
-       'ᾥ' => 'ᾭ',
-       'ᾦ' => 'ᾮ',
-       'ᾧ' => 'ᾯ',
-       'ᾰ' => 'Ᾰ',
-       'ᾱ' => 'Ᾱ',
-       'ᾳ' => 'ᾼ',
-       'ι' => 'Ι',
-       'ῃ' => 'ῌ',
-       'ῐ' => 'Ῐ',
-       'ῑ' => 'Ῑ',
-       'ῠ' => 'Ῠ',
-       'ῡ' => 'Ῡ',
-       'ῥ' => 'Ῥ',
-       'ῳ' => 'ῼ',
-       'ⅎ' => 'Ⅎ',
-       'ⅰ' => 'Ⅰ',
-       'ⅱ' => 'Ⅱ',
-       'ⅲ' => 'Ⅲ',
-       'ⅳ' => 'Ⅳ',
-       'ⅴ' => 'Ⅴ',
-       'ⅵ' => 'Ⅵ',
-       'ⅶ' => 'Ⅶ',
-       'ⅷ' => 'Ⅷ',
-       'ⅸ' => 'Ⅸ',
-       'ⅹ' => 'Ⅹ',
-       'ⅺ' => 'Ⅺ',
-       'ⅻ' => 'Ⅻ',
-       'ⅼ' => 'Ⅼ',
-       'ⅽ' => 'Ⅽ',
-       'ⅾ' => 'Ⅾ',
-       'ⅿ' => 'Ⅿ',
-       'ↄ' => 'Ↄ',
-       'ⓐ' => 'Ⓐ',
-       'ⓑ' => 'Ⓑ',
-       'ⓒ' => 'Ⓒ',
-       'ⓓ' => 'Ⓓ',
-       'ⓔ' => 'Ⓔ',
-       'ⓕ' => 'Ⓕ',
-       'ⓖ' => 'Ⓖ',
-       'ⓗ' => 'Ⓗ',
-       'ⓘ' => 'Ⓘ',
-       'ⓙ' => 'Ⓙ',
-       'ⓚ' => 'Ⓚ',
-       'ⓛ' => 'Ⓛ',
-       'ⓜ' => 'Ⓜ',
-       'ⓝ' => 'Ⓝ',
-       'ⓞ' => 'Ⓞ',
-       'ⓟ' => 'Ⓟ',
-       'ⓠ' => 'Ⓠ',
-       'ⓡ' => 'Ⓡ',
-       'ⓢ' => 'Ⓢ',
-       'ⓣ' => 'Ⓣ',
-       'ⓤ' => 'Ⓤ',
-       'ⓥ' => 'Ⓥ',
-       'ⓦ' => 'Ⓦ',
-       'ⓧ' => 'Ⓧ',
-       'ⓨ' => 'Ⓨ',
-       'ⓩ' => 'Ⓩ',
-       'ⰰ' => 'Ⰰ',
-       'ⰱ' => 'Ⰱ',
-       'ⰲ' => 'Ⰲ',
-       'ⰳ' => 'Ⰳ',
-       'ⰴ' => 'Ⰴ',
-       'ⰵ' => 'Ⰵ',
-       'ⰶ' => 'Ⰶ',
-       'ⰷ' => 'Ⰷ',
-       'ⰸ' => 'Ⰸ',
-       'ⰹ' => 'Ⰹ',
-       'ⰺ' => 'Ⰺ',
-       'ⰻ' => 'Ⰻ',
-       'ⰼ' => 'Ⰼ',
-       'ⰽ' => 'Ⰽ',
-       'ⰾ' => 'Ⰾ',
-       'ⰿ' => 'Ⰿ',
-       'ⱀ' => 'Ⱀ',
-       'ⱁ' => 'Ⱁ',
-       'ⱂ' => 'Ⱂ',
-       'ⱃ' => 'Ⱃ',
-       'ⱄ' => 'Ⱄ',
-       'ⱅ' => 'Ⱅ',
-       'ⱆ' => 'Ⱆ',
-       'ⱇ' => 'Ⱇ',
-       'ⱈ' => 'Ⱈ',
-       'ⱉ' => 'Ⱉ',
-       'ⱊ' => 'Ⱊ',
-       'ⱋ' => 'Ⱋ',
-       'ⱌ' => 'Ⱌ',
-       'ⱍ' => 'Ⱍ',
-       'ⱎ' => 'Ⱎ',
-       'ⱏ' => 'Ⱏ',
-       'ⱐ' => 'Ⱐ',
-       'ⱑ' => 'Ⱑ',
-       'ⱒ' => 'Ⱒ',
-       'ⱓ' => 'Ⱓ',
-       'ⱔ' => 'Ⱔ',
-       'ⱕ' => 'Ⱕ',
-       'ⱖ' => 'Ⱖ',
-       'ⱗ' => 'Ⱗ',
-       'ⱘ' => 'Ⱘ',
-       'ⱙ' => 'Ⱙ',
-       'ⱚ' => 'Ⱚ',
-       'ⱛ' => 'Ⱛ',
-       'ⱜ' => 'Ⱜ',
-       'ⱝ' => 'Ⱝ',
-       'ⱞ' => 'Ⱞ',
-       'ⱡ' => 'Ⱡ',
-       'ⱥ' => 'Ⱥ',
-       'ⱦ' => 'Ⱦ',
-       'ⱨ' => 'Ⱨ',
-       'ⱪ' => 'Ⱪ',
-       'ⱬ' => 'Ⱬ',
-       'ⱳ' => 'Ⱳ',
-       'ⱶ' => 'Ⱶ',
-       'ⲁ' => 'Ⲁ',
-       'ⲃ' => 'Ⲃ',
-       'ⲅ' => 'Ⲅ',
-       'ⲇ' => 'Ⲇ',
-       'ⲉ' => 'Ⲉ',
-       'ⲋ' => 'Ⲋ',
-       'ⲍ' => 'Ⲍ',
-       'ⲏ' => 'Ⲏ',
-       'ⲑ' => 'Ⲑ',
-       'ⲓ' => 'Ⲓ',
-       'ⲕ' => 'Ⲕ',
-       'ⲗ' => 'Ⲗ',
-       'ⲙ' => 'Ⲙ',
-       'ⲛ' => 'Ⲛ',
-       'ⲝ' => 'Ⲝ',
-       'ⲟ' => 'Ⲟ',
-       'ⲡ' => 'Ⲡ',
-       'ⲣ' => 'Ⲣ',
-       'ⲥ' => 'Ⲥ',
-       'ⲧ' => 'Ⲧ',
-       'ⲩ' => 'Ⲩ',
-       'ⲫ' => 'Ⲫ',
-       'ⲭ' => 'Ⲭ',
-       'ⲯ' => 'Ⲯ',
-       'ⲱ' => 'Ⲱ',
-       'ⲳ' => 'Ⲳ',
-       'ⲵ' => 'Ⲵ',
-       'ⲷ' => 'Ⲷ',
-       'ⲹ' => 'Ⲹ',
-       'ⲻ' => 'Ⲻ',
-       'ⲽ' => 'Ⲽ',
-       'ⲿ' => 'Ⲿ',
-       'ⳁ' => 'Ⳁ',
-       'ⳃ' => 'Ⳃ',
-       'ⳅ' => 'Ⳅ',
-       'ⳇ' => 'Ⳇ',
-       'ⳉ' => 'Ⳉ',
-       'ⳋ' => 'Ⳋ',
-       'ⳍ' => 'Ⳍ',
-       'ⳏ' => 'Ⳏ',
-       'ⳑ' => 'Ⳑ',
-       'ⳓ' => 'Ⳓ',
-       'ⳕ' => 'Ⳕ',
-       'ⳗ' => 'Ⳗ',
-       'ⳙ' => 'Ⳙ',
-       'ⳛ' => 'Ⳛ',
-       'ⳝ' => 'Ⳝ',
-       'ⳟ' => 'Ⳟ',
-       'ⳡ' => 'Ⳡ',
-       'ⳣ' => 'Ⳣ',
-       'ⳬ' => 'Ⳬ',
-       'ⳮ' => 'Ⳮ',
-       'ⴀ' => 'Ⴀ',
-       'ⴁ' => 'Ⴁ',
-       'ⴂ' => 'Ⴂ',
-       'ⴃ' => 'Ⴃ',
-       'ⴄ' => 'Ⴄ',
-       'ⴅ' => 'Ⴅ',
-       'ⴆ' => 'Ⴆ',
-       'ⴇ' => 'Ⴇ',
-       'ⴈ' => 'Ⴈ',
-       'ⴉ' => 'Ⴉ',
-       'ⴊ' => 'Ⴊ',
-       'ⴋ' => 'Ⴋ',
-       'ⴌ' => 'Ⴌ',
-       'ⴍ' => 'Ⴍ',
-       'ⴎ' => 'Ⴎ',
-       'ⴏ' => 'Ⴏ',
-       'ⴐ' => 'Ⴐ',
-       'ⴑ' => 'Ⴑ',
-       'ⴒ' => 'Ⴒ',
-       'ⴓ' => 'Ⴓ',
-       'ⴔ' => 'Ⴔ',
-       'ⴕ' => 'Ⴕ',
-       'ⴖ' => 'Ⴖ',
-       'ⴗ' => 'Ⴗ',
-       'ⴘ' => 'Ⴘ',
-       'ⴙ' => 'Ⴙ',
-       'ⴚ' => 'Ⴚ',
-       'ⴛ' => 'Ⴛ',
-       'ⴜ' => 'Ⴜ',
-       'ⴝ' => 'Ⴝ',
-       'ⴞ' => 'Ⴞ',
-       'ⴟ' => 'Ⴟ',
-       'ⴠ' => 'Ⴠ',
-       'ⴡ' => 'Ⴡ',
-       'ⴢ' => 'Ⴢ',
-       'ⴣ' => 'Ⴣ',
-       'ⴤ' => 'Ⴤ',
-       'ⴥ' => 'Ⴥ',
-       'ꙁ' => 'Ꙁ',
-       'ꙃ' => 'Ꙃ',
-       'ꙅ' => 'Ꙅ',
-       'ꙇ' => 'Ꙇ',
-       'ꙉ' => 'Ꙉ',
-       'ꙋ' => 'Ꙋ',
-       'ꙍ' => 'Ꙍ',
-       'ꙏ' => 'Ꙏ',
-       'ꙑ' => 'Ꙑ',
-       'ꙓ' => 'Ꙓ',
-       'ꙕ' => 'Ꙕ',
-       'ꙗ' => 'Ꙗ',
-       'ꙙ' => 'Ꙙ',
-       'ꙛ' => 'Ꙛ',
-       'ꙝ' => 'Ꙝ',
-       'ꙟ' => 'Ꙟ',
-       'ꙡ' => 'Ꙡ',
-       'ꙣ' => 'Ꙣ',
-       'ꙥ' => 'Ꙥ',
-       'ꙧ' => 'Ꙧ',
-       'ꙩ' => 'Ꙩ',
-       'ꙫ' => 'Ꙫ',
-       'ꙭ' => 'Ꙭ',
-       'ꚁ' => 'Ꚁ',
-       'ꚃ' => 'Ꚃ',
-       'ꚅ' => 'Ꚅ',
-       'ꚇ' => 'Ꚇ',
-       'ꚉ' => 'Ꚉ',
-       'ꚋ' => 'Ꚋ',
-       'ꚍ' => 'Ꚍ',
-       'ꚏ' => 'Ꚏ',
-       'ꚑ' => 'Ꚑ',
-       'ꚓ' => 'Ꚓ',
-       'ꚕ' => 'Ꚕ',
-       'ꚗ' => 'Ꚗ',
-       'ꜣ' => 'Ꜣ',
-       'ꜥ' => 'Ꜥ',
-       'ꜧ' => 'Ꜧ',
-       'ꜩ' => 'Ꜩ',
-       'ꜫ' => 'Ꜫ',
-       'ꜭ' => 'Ꜭ',
-       'ꜯ' => 'Ꜯ',
-       'ꜳ' => 'Ꜳ',
-       'ꜵ' => 'Ꜵ',
-       'ꜷ' => 'Ꜷ',
-       'ꜹ' => 'Ꜹ',
-       'ꜻ' => 'Ꜻ',
-       'ꜽ' => 'Ꜽ',
-       'ꜿ' => 'Ꜿ',
-       'ꝁ' => 'Ꝁ',
-       'ꝃ' => 'Ꝃ',
-       'ꝅ' => 'Ꝅ',
-       'ꝇ' => 'Ꝇ',
-       'ꝉ' => 'Ꝉ',
-       'ꝋ' => 'Ꝋ',
-       'ꝍ' => 'Ꝍ',
-       'ꝏ' => 'Ꝏ',
-       'ꝑ' => 'Ꝑ',
-       'ꝓ' => 'Ꝓ',
-       'ꝕ' => 'Ꝕ',
-       'ꝗ' => 'Ꝗ',
-       'ꝙ' => 'Ꝙ',
-       'ꝛ' => 'Ꝛ',
-       'ꝝ' => 'Ꝝ',
-       'ꝟ' => 'Ꝟ',
-       'ꝡ' => 'Ꝡ',
-       'ꝣ' => 'Ꝣ',
-       'ꝥ' => 'Ꝥ',
-       'ꝧ' => 'Ꝧ',
-       'ꝩ' => 'Ꝩ',
-       'ꝫ' => 'Ꝫ',
-       'ꝭ' => 'Ꝭ',
-       'ꝯ' => 'Ꝯ',
-       'ꝺ' => 'Ꝺ',
-       'ꝼ' => 'Ꝼ',
-       'ꝿ' => 'Ꝿ',
-       'ꞁ' => 'Ꞁ',
-       'ꞃ' => 'Ꞃ',
-       'ꞅ' => 'Ꞅ',
-       'ꞇ' => 'Ꞇ',
-       'ꞌ' => 'Ꞌ',
-       'ꞑ' => 'Ꞑ',
-       'ꞡ' => 'Ꞡ',
-       'ꞣ' => 'Ꞣ',
-       'ꞥ' => 'Ꞥ',
-       'ꞧ' => 'Ꞧ',
-       'ꞩ' => 'Ꞩ',
-       'ａ' => 'Ａ',
-       'ｂ' => 'Ｂ',
-       'ｃ' => 'Ｃ',
-       'ｄ' => 'Ｄ',
-       'ｅ' => 'Ｅ',
-       'ｆ' => 'Ｆ',
-       'ｇ' => 'Ｇ',
-       'ｈ' => 'Ｈ',
-       'ｉ' => 'Ｉ',
-       'ｊ' => 'Ｊ',
-       'ｋ' => 'Ｋ',
-       'ｌ' => 'Ｌ',
-       'ｍ' => 'Ｍ',
-       'ｎ' => 'Ｎ',
-       'ｏ' => 'Ｏ',
-       'ｐ' => 'Ｐ',
-       'ｑ' => 'Ｑ',
-       'ｒ' => 'Ｒ',
-       'ｓ' => 'Ｓ',
-       'ｔ' => 'Ｔ',
-       'ｕ' => 'Ｕ',
-       'ｖ' => 'Ｖ',
-       'ｗ' => 'Ｗ',
-       'ｘ' => 'Ｘ',
-       'ｙ' => 'Ｙ',
-       'ｚ' => 'Ｚ',
-       '𐐨' => '𐐀',
-       '𐐩' => '𐐁',
-       '𐐪' => '𐐂',
-       '𐐫' => '𐐃',
-       '𐐬' => '𐐄',
-       '𐐭' => '𐐅',
-       '𐐮' => '𐐆',
-       '𐐯' => '𐐇',
-       '𐐰' => '𐐈',
-       '𐐱' => '𐐉',
-       '𐐲' => '𐐊',
-       '𐐳' => '𐐋',
-       '𐐴' => '𐐌',
-       '𐐵' => '𐐍',
-       '𐐶' => '𐐎',
-       '𐐷' => '𐐏',
-       '𐐸' => '𐐐',
-       '𐐹' => '𐐑',
-       '𐐺' => '𐐒',
-       '𐐻' => '𐐓',
-       '𐐼' => '𐐔',
-       '𐐽' => '𐐕',
-       '𐐾' => '𐐖',
-       '𐐿' => '𐐗',
-       '𐑀' => '𐐘',
-       '𐑁' => '𐐙',
-       '𐑂' => '𐐚',
-       '𐑃' => '𐐛',
-       '𐑄' => '𐐜',
-       '𐑅' => '𐐝',
-       '𐑆' => '𐐞',
-       '𐑇' => '𐐟',
-       '𐑈' => '𐐠',
-       '𐑉' => '𐐡',
-       '𐑊' => '𐐢',
-       '𐑋' => '𐐣',
-       '𐑌' => '𐐤',
-       '𐑍' => '𐐥',
-       '𐑎' => '𐐦',
-       '𐑏' => '𐐧'
-);
-
-/**
- * Translation array to get lower case character
- */
-$wikiLowerChars = array(
-       'A' => 'a',
-       'B' => 'b',
-       'C' => 'c',
-       'D' => 'd',
-       'E' => 'e',
-       'F' => 'f',
-       'G' => 'g',
-       'H' => 'h',
-       'I' => 'i',
-       'J' => 'j',
-       'K' => 'k',
-       'L' => 'l',
-       'M' => 'm',
-       'N' => 'n',
-       'O' => 'o',
-       'P' => 'p',
-       'Q' => 'q',
-       'R' => 'r',
-       'S' => 's',
-       'T' => 't',
-       'U' => 'u',
-       'V' => 'v',
-       'W' => 'w',
-       'X' => 'x',
-       'Y' => 'y',
-       'Z' => 'z',
-       'À' => 'à',
-       'Á' => 'á',
-       'Â' => 'â',
-       'Ã' => 'ã',
-       'Ä' => 'ä',
-       'Å' => 'å',
-       'Æ' => 'æ',
-       'Ç' => 'ç',
-       'È' => 'è',
-       'É' => 'é',
-       'Ê' => 'ê',
-       'Ë' => 'ë',
-       'Ì' => 'ì',
-       'Í' => 'í',
-       'Î' => 'î',
-       'Ï' => 'ï',
-       'Ð' => 'ð',
-       'Ñ' => 'ñ',
-       'Ò' => 'ò',
-       'Ó' => 'ó',
-       'Ô' => 'ô',
-       'Õ' => 'õ',
-       'Ö' => 'ö',
-       'Ø' => 'ø',
-       'Ù' => 'ù',
-       'Ú' => 'ú',
-       'Û' => 'û',
-       'Ü' => 'ü',
-       'Ý' => 'ý',
-       'Þ' => 'þ',
-       'Ā' => 'ā',
-       'Ă' => 'ă',
-       'Ą' => 'ą',
-       'Ć' => 'ć',
-       'Ĉ' => 'ĉ',
-       'Ċ' => 'ċ',
-       'Č' => 'č',
-       'Ď' => 'ď',
-       'Đ' => 'đ',
-       'Ē' => 'ē',
-       'Ĕ' => 'ĕ',
-       'Ė' => 'ė',
-       'Ę' => 'ę',
-       'Ě' => 'ě',
-       'Ĝ' => 'ĝ',
-       'Ğ' => 'ğ',
-       'Ġ' => 'ġ',
-       'Ģ' => 'ģ',
-       'Ĥ' => 'ĥ',
-       'Ħ' => 'ħ',
-       'Ĩ' => 'ĩ',
-       'Ī' => 'ī',
-       'Ĭ' => 'ĭ',
-       'Į' => 'į',
-       'İ' => 'i',
-       'Ĳ' => 'ĳ',
-       'Ĵ' => 'ĵ',
-       'Ķ' => 'ķ',
-       'Ĺ' => 'ĺ',
-       'Ļ' => 'ļ',
-       'Ľ' => 'ľ',
-       'Ŀ' => 'ŀ',
-       'Ł' => 'ł',
-       'Ń' => 'ń',
-       'Ņ' => 'ņ',
-       'Ň' => 'ň',
-       'Ŋ' => 'ŋ',
-       'Ō' => 'ō',
-       'Ŏ' => 'ŏ',
-       'Ő' => 'ő',
-       'Œ' => 'œ',
-       'Ŕ' => 'ŕ',
-       'Ŗ' => 'ŗ',
-       'Ř' => 'ř',
-       'Ś' => 'ś',
-       'Ŝ' => 'ŝ',
-       'Ş' => 'ş',
-       'Š' => 'š',
-       'Ţ' => 'ţ',
-       'Ť' => 'ť',
-       'Ŧ' => 'ŧ',
-       'Ũ' => 'ũ',
-       'Ū' => 'ū',
-       'Ŭ' => 'ŭ',
-       'Ů' => 'ů',
-       'Ű' => 'ű',
-       'Ų' => 'ų',
-       'Ŵ' => 'ŵ',
-       'Ŷ' => 'ŷ',
-       'Ÿ' => 'ÿ',
-       'Ź' => 'ź',
-       'Ż' => 'ż',
-       'Ž' => 'ž',
-       'Ɓ' => 'ɓ',
-       'Ƃ' => 'ƃ',
-       'Ƅ' => 'ƅ',
-       'Ɔ' => 'ɔ',
-       'Ƈ' => 'ƈ',
-       'Ɖ' => 'ɖ',
-       'Ɗ' => 'ɗ',
-       'Ƌ' => 'ƌ',
-       'Ǝ' => 'ǝ',
-       'Ə' => 'ə',
-       'Ɛ' => 'ɛ',
-       'Ƒ' => 'ƒ',
-       'Ɠ' => 'ɠ',
-       'Ɣ' => 'ɣ',
-       'Ɩ' => 'ɩ',
-       'Ɨ' => 'ɨ',
-       'Ƙ' => 'ƙ',
-       'Ɯ' => 'ɯ',
-       'Ɲ' => 'ɲ',
-       'Ɵ' => 'ɵ',
-       'Ơ' => 'ơ',
-       'Ƣ' => 'ƣ',
-       'Ƥ' => 'ƥ',
-       'Ʀ' => 'ʀ',
-       'Ƨ' => 'ƨ',
-       'Ʃ' => 'ʃ',
-       'Ƭ' => 'ƭ',
-       'Ʈ' => 'ʈ',
-       'Ư' => 'ư',
-       'Ʊ' => 'ʊ',
-       'Ʋ' => 'ʋ',
-       'Ƴ' => 'ƴ',
-       'Ƶ' => 'ƶ',
-       'Ʒ' => 'ʒ',
-       'Ƹ' => 'ƹ',
-       'Ƽ' => 'ƽ',
-       'Ǆ' => 'ǆ',
-       'ǅ' => 'ǆ',
-       'Ǉ' => 'ǉ',
-       'ǈ' => 'ǉ',
-       'Ǌ' => 'ǌ',
-       'ǋ' => 'ǌ',
-       'Ǎ' => 'ǎ',
-       'Ǐ' => 'ǐ',
-       'Ǒ' => 'ǒ',
-       'Ǔ' => 'ǔ',
-       'Ǖ' => 'ǖ',
-       'Ǘ' => 'ǘ',
-       'Ǚ' => 'ǚ',
-       'Ǜ' => 'ǜ',
-       'Ǟ' => 'ǟ',
-       'Ǡ' => 'ǡ',
-       'Ǣ' => 'ǣ',
-       'Ǥ' => 'ǥ',
-       'Ǧ' => 'ǧ',
-       'Ǩ' => 'ǩ',
-       'Ǫ' => 'ǫ',
-       'Ǭ' => 'ǭ',
-       'Ǯ' => 'ǯ',
-       'Ǳ' => 'ǳ',
-       'ǲ' => 'ǳ',
-       'Ǵ' => 'ǵ',
-       'Ƕ' => 'ƕ',
-       'Ƿ' => 'ƿ',
-       'Ǹ' => 'ǹ',
-       'Ǻ' => 'ǻ',
-       'Ǽ' => 'ǽ',
-       'Ǿ' => 'ǿ',
-       'Ȁ' => 'ȁ',
-       'Ȃ' => 'ȃ',
-       'Ȅ' => 'ȅ',
-       'Ȇ' => 'ȇ',
-       'Ȉ' => 'ȉ',
-       'Ȋ' => 'ȋ',
-       'Ȍ' => 'ȍ',
-       'Ȏ' => 'ȏ',
-       'Ȑ' => 'ȑ',
-       'Ȓ' => 'ȓ',
-       'Ȕ' => 'ȕ',
-       'Ȗ' => 'ȗ',
-       'Ș' => 'ș',
-       'Ț' => 'ț',
-       'Ȝ' => 'ȝ',
-       'Ȟ' => 'ȟ',
-       'Ƞ' => 'ƞ',
-       'Ȣ' => 'ȣ',
-       'Ȥ' => 'ȥ',
-       'Ȧ' => 'ȧ',
-       'Ȩ' => 'ȩ',
-       'Ȫ' => 'ȫ',
-       'Ȭ' => 'ȭ',
-       'Ȯ' => 'ȯ',
-       'Ȱ' => 'ȱ',
-       'Ȳ' => 'ȳ',
-       'Ⱥ' => 'ⱥ',
-       'Ȼ' => 'ȼ',
-       'Ƚ' => 'ƚ',
-       'Ⱦ' => 'ⱦ',
-       'Ɂ' => 'ɂ',
-       'Ƀ' => 'ƀ',
-       'Ʉ' => 'ʉ',
-       'Ʌ' => 'ʌ',
-       'Ɇ' => 'ɇ',
-       'Ɉ' => 'ɉ',
-       'Ɋ' => 'ɋ',
-       'Ɍ' => 'ɍ',
-       'Ɏ' => 'ɏ',
-       'Ͱ' => 'ͱ',
-       'Ͳ' => 'ͳ',
-       'Ͷ' => 'ͷ',
-       'Ά' => 'ά',
-       'Έ' => 'έ',
-       'Ή' => 'ή',
-       'Ί' => 'ί',
-       'Ό' => 'ό',
-       'Ύ' => 'ύ',
-       'Ώ' => 'ώ',
-       'Α' => 'α',
-       'Β' => 'β',
-       'Γ' => 'γ',
-       'Δ' => 'δ',
-       'Ε' => 'ε',
-       'Ζ' => 'ζ',
-       'Η' => 'η',
-       'Θ' => 'θ',
-       'Ι' => 'ι',
-       'Κ' => 'κ',
-       'Λ' => 'λ',
-       'Μ' => 'μ',
-       'Ν' => 'ν',
-       'Ξ' => 'ξ',
-       'Ο' => 'ο',
-       'Π' => 'π',
-       'Ρ' => 'ρ',
-       'Σ' => 'σ',
-       'Τ' => 'τ',
-       'Υ' => 'υ',
-       'Φ' => 'φ',
-       'Χ' => 'χ',
-       'Ψ' => 'ψ',
-       'Ω' => 'ω',
-       'Ϊ' => 'ϊ',
-       'Ϋ' => 'ϋ',
-       'Ϗ' => 'ϗ',
-       'Ϙ' => 'ϙ',
-       'Ϛ' => 'ϛ',
-       'Ϝ' => 'ϝ',
-       'Ϟ' => 'ϟ',
-       'Ϡ' => 'ϡ',
-       'Ϣ' => 'ϣ',
-       'Ϥ' => 'ϥ',
-       'Ϧ' => 'ϧ',
-       'Ϩ' => 'ϩ',
-       'Ϫ' => 'ϫ',
-       'Ϭ' => 'ϭ',
-       'Ϯ' => 'ϯ',
-       'ϴ' => 'θ',
-       'Ϸ' => 'ϸ',
-       'Ϲ' => 'ϲ',
-       'Ϻ' => 'ϻ',
-       'Ͻ' => 'ͻ',
-       'Ͼ' => 'ͼ',
-       'Ͽ' => 'ͽ',
-       'Ѐ' => 'ѐ',
-       'Ё' => 'ё',
-       'Ђ' => 'ђ',
-       'Ѓ' => 'ѓ',
-       'Є' => 'є',
-       'Ѕ' => 'ѕ',
-       'І' => 'і',
-       'Ї' => 'ї',
-       'Ј' => 'ј',
-       'Љ' => 'љ',
-       'Њ' => 'њ',
-       'Ћ' => 'ћ',
-       'Ќ' => 'ќ',
-       'Ѝ' => 'ѝ',
-       'Ў' => 'ў',
-       'Џ' => 'џ',
-       'А' => 'а',
-       'Б' => 'б',
-       'В' => 'в',
-       'Г' => 'г',
-       'Д' => 'д',
-       'Е' => 'е',
-       'Ж' => 'ж',
-       'З' => 'з',
-       'И' => 'и',
-       'Й' => 'й',
-       'К' => 'к',
-       'Л' => 'л',
-       'М' => 'м',
-       'Н' => 'н',
-       'О' => 'о',
-       'П' => 'п',
-       'Р' => 'р',
-       'С' => 'с',
-       'Т' => 'т',
-       'У' => 'у',
-       'Ф' => 'ф',
-       'Х' => 'х',
-       'Ц' => 'ц',
-       'Ч' => 'ч',
-       'Ш' => 'ш',
-       'Щ' => 'щ',
-       'Ъ' => 'ъ',
-       'Ы' => 'ы',
-       'Ь' => 'ь',
-       'Э' => 'э',
-       'Ю' => 'ю',
-       'Я' => 'я',
-       'Ѡ' => 'ѡ',
-       'Ѣ' => 'ѣ',
-       'Ѥ' => 'ѥ',
-       'Ѧ' => 'ѧ',
-       'Ѩ' => 'ѩ',
-       'Ѫ' => 'ѫ',
-       'Ѭ' => 'ѭ',
-       'Ѯ' => 'ѯ',
-       'Ѱ' => 'ѱ',
-       'Ѳ' => 'ѳ',
-       'Ѵ' => 'ѵ',
-       'Ѷ' => 'ѷ',
-       'Ѹ' => 'ѹ',
-       'Ѻ' => 'ѻ',
-       'Ѽ' => 'ѽ',
-       'Ѿ' => 'ѿ',
-       'Ҁ' => 'ҁ',
-       'Ҋ' => 'ҋ',
-       'Ҍ' => 'ҍ',
-       'Ҏ' => 'ҏ',
-       'Ґ' => 'ґ',
-       'Ғ' => 'ғ',
-       'Ҕ' => 'ҕ',
-       'Җ' => 'җ',
-       'Ҙ' => 'ҙ',
-       'Қ' => 'қ',
-       'Ҝ' => 'ҝ',
-       'Ҟ' => 'ҟ',
-       'Ҡ' => 'ҡ',
-       'Ң' => 'ң',
-       'Ҥ' => 'ҥ',
-       'Ҧ' => 'ҧ',
-       'Ҩ' => 'ҩ',
-       'Ҫ' => 'ҫ',
-       'Ҭ' => 'ҭ',
-       'Ү' => 'ү',
-       'Ұ' => 'ұ',
-       'Ҳ' => 'ҳ',
-       'Ҵ' => 'ҵ',
-       'Ҷ' => 'ҷ',
-       'Ҹ' => 'ҹ',
-       'Һ' => 'һ',
-       'Ҽ' => 'ҽ',
-       'Ҿ' => 'ҿ',
-       'Ӏ' => 'ӏ',
-       'Ӂ' => 'ӂ',
-       'Ӄ' => 'ӄ',
-       'Ӆ' => 'ӆ',
-       'Ӈ' => 'ӈ',
-       'Ӊ' => 'ӊ',
-       'Ӌ' => 'ӌ',
-       'Ӎ' => 'ӎ',
-       'Ӑ' => 'ӑ',
-       'Ӓ' => 'ӓ',
-       'Ӕ' => 'ӕ',
-       'Ӗ' => 'ӗ',
-       'Ә' => 'ә',
-       'Ӛ' => 'ӛ',
-       'Ӝ' => 'ӝ',
-       'Ӟ' => 'ӟ',
-       'Ӡ' => 'ӡ',
-       'Ӣ' => 'ӣ',
-       'Ӥ' => 'ӥ',
-       'Ӧ' => 'ӧ',
-       'Ө' => 'ө',
-       'Ӫ' => 'ӫ',
-       'Ӭ' => 'ӭ',
-       'Ӯ' => 'ӯ',
-       'Ӱ' => 'ӱ',
-       'Ӳ' => 'ӳ',
-       'Ӵ' => 'ӵ',
-       'Ӷ' => 'ӷ',
-       'Ӹ' => 'ӹ',
-       'Ӻ' => 'ӻ',
-       'Ӽ' => 'ӽ',
-       'Ӿ' => 'ӿ',
-       'Ԁ' => 'ԁ',
-       'Ԃ' => 'ԃ',
-       'Ԅ' => 'ԅ',
-       'Ԇ' => 'ԇ',
-       'Ԉ' => 'ԉ',
-       'Ԋ' => 'ԋ',
-       'Ԍ' => 'ԍ',
-       'Ԏ' => 'ԏ',
-       'Ԑ' => 'ԑ',
-       'Ԓ' => 'ԓ',
-       'Ԕ' => 'ԕ',
-       'Ԗ' => 'ԗ',
-       'Ԙ' => 'ԙ',
-       'Ԛ' => 'ԛ',
-       'Ԝ' => 'ԝ',
-       'Ԟ' => 'ԟ',
-       'Ԡ' => 'ԡ',
-       'Ԣ' => 'ԣ',
-       'Ԥ' => 'ԥ',
-       'Ԧ' => 'ԧ',
-       'Ա' => 'ա',
-       'Բ' => 'բ',
-       'Գ' => 'գ',
-       'Դ' => 'դ',
-       'Ե' => 'ե',
-       'Զ' => 'զ',
-       'Է' => 'է',
-       'Ը' => 'ը',
-       'Թ' => 'թ',
-       'Ժ' => 'ժ',
-       'Ի' => 'ի',
-       'Լ' => 'լ',
-       'Խ' => 'խ',
-       'Ծ' => 'ծ',
-       'Կ' => 'կ',
-       'Հ' => 'հ',
-       'Ձ' => 'ձ',
-       'Ղ' => 'ղ',
-       'Ճ' => 'ճ',
-       'Մ' => 'մ',
-       'Յ' => 'յ',
-       'Ն' => 'ն',
-       'Շ' => 'շ',
-       'Ո' => 'ո',
-       'Չ' => 'չ',
-       'Պ' => 'պ',
-       'Ջ' => 'ջ',
-       'Ռ' => 'ռ',
-       'Ս' => 'ս',
-       'Վ' => 'վ',
-       'Տ' => 'տ',
-       'Ր' => 'ր',
-       'Ց' => 'ց',
-       'Ւ' => 'ւ',
-       'Փ' => 'փ',
-       'Ք' => 'ք',
-       'Օ' => 'օ',
-       'Ֆ' => 'ֆ',
-       'Ⴀ' => 'ⴀ',
-       'Ⴁ' => 'ⴁ',
-       'Ⴂ' => 'ⴂ',
-       'Ⴃ' => 'ⴃ',
-       'Ⴄ' => 'ⴄ',
-       'Ⴅ' => 'ⴅ',
-       'Ⴆ' => 'ⴆ',
-       'Ⴇ' => 'ⴇ',
-       'Ⴈ' => 'ⴈ',
-       'Ⴉ' => 'ⴉ',
-       'Ⴊ' => 'ⴊ',
-       'Ⴋ' => 'ⴋ',
-       'Ⴌ' => 'ⴌ',
-       'Ⴍ' => 'ⴍ',
-       'Ⴎ' => 'ⴎ',
-       'Ⴏ' => 'ⴏ',
-       'Ⴐ' => 'ⴐ',
-       'Ⴑ' => 'ⴑ',
-       'Ⴒ' => 'ⴒ',
-       'Ⴓ' => 'ⴓ',
-       'Ⴔ' => 'ⴔ',
-       'Ⴕ' => 'ⴕ',
-       'Ⴖ' => 'ⴖ',
-       'Ⴗ' => 'ⴗ',
-       'Ⴘ' => 'ⴘ',
-       'Ⴙ' => 'ⴙ',
-       'Ⴚ' => 'ⴚ',
-       'Ⴛ' => 'ⴛ',
-       'Ⴜ' => 'ⴜ',
-       'Ⴝ' => 'ⴝ',
-       'Ⴞ' => 'ⴞ',
-       'Ⴟ' => 'ⴟ',
-       'Ⴠ' => 'ⴠ',
-       'Ⴡ' => 'ⴡ',
-       'Ⴢ' => 'ⴢ',
-       'Ⴣ' => 'ⴣ',
-       'Ⴤ' => 'ⴤ',
-       'Ⴥ' => 'ⴥ',
-       'Ḁ' => 'ḁ',
-       'Ḃ' => 'ḃ',
-       'Ḅ' => 'ḅ',
-       'Ḇ' => 'ḇ',
-       'Ḉ' => 'ḉ',
-       'Ḋ' => 'ḋ',
-       'Ḍ' => 'ḍ',
-       'Ḏ' => 'ḏ',
-       'Ḑ' => 'ḑ',
-       'Ḓ' => 'ḓ',
-       'Ḕ' => 'ḕ',
-       'Ḗ' => 'ḗ',
-       'Ḙ' => 'ḙ',
-       'Ḛ' => 'ḛ',
-       'Ḝ' => 'ḝ',
-       'Ḟ' => 'ḟ',
-       'Ḡ' => 'ḡ',
-       'Ḣ' => 'ḣ',
-       'Ḥ' => 'ḥ',
-       'Ḧ' => 'ḧ',
-       'Ḩ' => 'ḩ',
-       'Ḫ' => 'ḫ',
-       'Ḭ' => 'ḭ',
-       'Ḯ' => 'ḯ',
-       'Ḱ' => 'ḱ',
-       'Ḳ' => 'ḳ',
-       'Ḵ' => 'ḵ',
-       'Ḷ' => 'ḷ',
-       'Ḹ' => 'ḹ',
-       'Ḻ' => 'ḻ',
-       'Ḽ' => 'ḽ',
-       'Ḿ' => 'ḿ',
-       'Ṁ' => 'ṁ',
-       'Ṃ' => 'ṃ',
-       'Ṅ' => 'ṅ',
-       'Ṇ' => 'ṇ',
-       'Ṉ' => 'ṉ',
-       'Ṋ' => 'ṋ',
-       'Ṍ' => 'ṍ',
-       'Ṏ' => 'ṏ',
-       'Ṑ' => 'ṑ',
-       'Ṓ' => 'ṓ',
-       'Ṕ' => 'ṕ',
-       'Ṗ' => 'ṗ',
-       'Ṙ' => 'ṙ',
-       'Ṛ' => 'ṛ',
-       'Ṝ' => 'ṝ',
-       'Ṟ' => 'ṟ',
-       'Ṡ' => 'ṡ',
-       'Ṣ' => 'ṣ',
-       'Ṥ' => 'ṥ',
-       'Ṧ' => 'ṧ',
-       'Ṩ' => 'ṩ',
-       'Ṫ' => 'ṫ',
-       'Ṭ' => 'ṭ',
-       'Ṯ' => 'ṯ',
-       'Ṱ' => 'ṱ',
-       'Ṳ' => 'ṳ',
-       'Ṵ' => 'ṵ',
-       'Ṷ' => 'ṷ',
-       'Ṹ' => 'ṹ',
-       'Ṻ' => 'ṻ',
-       'Ṽ' => 'ṽ',
-       'Ṿ' => 'ṿ',
-       'Ẁ' => 'ẁ',
-       'Ẃ' => 'ẃ',
-       'Ẅ' => 'ẅ',
-       'Ẇ' => 'ẇ',
-       'Ẉ' => 'ẉ',
-       'Ẋ' => 'ẋ',
-       'Ẍ' => 'ẍ',
-       'Ẏ' => 'ẏ',
-       'Ẑ' => 'ẑ',
-       'Ẓ' => 'ẓ',
-       'Ẕ' => 'ẕ',
-       'ẞ' => 'ß',
-       'Ạ' => 'ạ',
-       'Ả' => 'ả',
-       'Ấ' => 'ấ',
-       'Ầ' => 'ầ',
-       'Ẩ' => 'ẩ',
-       'Ẫ' => 'ẫ',
-       'Ậ' => 'ậ',
-       'Ắ' => 'ắ',
-       'Ằ' => 'ằ',
-       'Ẳ' => 'ẳ',
-       'Ẵ' => 'ẵ',
-       'Ặ' => 'ặ',
-       'Ẹ' => 'ẹ',
-       'Ẻ' => 'ẻ',
-       'Ẽ' => 'ẽ',
-       'Ế' => 'ế',
-       'Ề' => 'ề',
-       'Ể' => 'ể',
-       'Ễ' => 'ễ',
-       'Ệ' => 'ệ',
-       'Ỉ' => 'ỉ',
-       'Ị' => 'ị',
-       'Ọ' => 'ọ',
-       'Ỏ' => 'ỏ',
-       'Ố' => 'ố',
-       'Ồ' => 'ồ',
-       'Ổ' => 'ổ',
-       'Ỗ' => 'ỗ',
-       'Ộ' => 'ộ',
-       'Ớ' => 'ớ',
-       'Ờ' => 'ờ',
-       'Ở' => 'ở',
-       'Ỡ' => 'ỡ',
-       'Ợ' => 'ợ',
-       'Ụ' => 'ụ',
-       'Ủ' => 'ủ',
-       'Ứ' => 'ứ',
-       'Ừ' => 'ừ',
-       'Ử' => 'ử',
-       'Ữ' => 'ữ',
-       'Ự' => 'ự',
-       'Ỳ' => 'ỳ',
-       'Ỵ' => 'ỵ',
-       'Ỷ' => 'ỷ',
-       'Ỹ' => 'ỹ',
-       'Ỻ' => 'ỻ',
-       'Ỽ' => 'ỽ',
-       'Ỿ' => 'ỿ',
-       'Ἀ' => 'ἀ',
-       'Ἁ' => 'ἁ',
-       'Ἂ' => 'ἂ',
-       'Ἃ' => 'ἃ',
-       'Ἄ' => 'ἄ',
-       'Ἅ' => 'ἅ',
-       'Ἆ' => 'ἆ',
-       'Ἇ' => 'ἇ',
-       'Ἐ' => 'ἐ',
-       'Ἑ' => 'ἑ',
-       'Ἒ' => 'ἒ',
-       'Ἓ' => 'ἓ',
-       'Ἔ' => 'ἔ',
-       'Ἕ' => 'ἕ',
-       'Ἠ' => 'ἠ',
-       'Ἡ' => 'ἡ',
-       'Ἢ' => 'ἢ',
-       'Ἣ' => 'ἣ',
-       'Ἤ' => 'ἤ',
-       'Ἥ' => 'ἥ',
-       'Ἦ' => 'ἦ',
-       'Ἧ' => 'ἧ',
-       'Ἰ' => 'ἰ',
-       'Ἱ' => 'ἱ',
-       'Ἲ' => 'ἲ',
-       'Ἳ' => 'ἳ',
-       'Ἴ' => 'ἴ',
-       'Ἵ' => 'ἵ',
-       'Ἶ' => 'ἶ',
-       'Ἷ' => 'ἷ',
-       'Ὀ' => 'ὀ',
-       'Ὁ' => 'ὁ',
-       'Ὂ' => 'ὂ',
-       'Ὃ' => 'ὃ',
-       'Ὄ' => 'ὄ',
-       'Ὅ' => 'ὅ',
-       'Ὑ' => 'ὑ',
-       'Ὓ' => 'ὓ',
-       'Ὕ' => 'ὕ',
-       'Ὗ' => 'ὗ',
-       'Ὠ' => 'ὠ',
-       'Ὡ' => 'ὡ',
-       'Ὢ' => 'ὢ',
-       'Ὣ' => 'ὣ',
-       'Ὤ' => 'ὤ',
-       'Ὥ' => 'ὥ',
-       'Ὦ' => 'ὦ',
-       'Ὧ' => 'ὧ',
-       'ᾈ' => 'ᾀ',
-       'ᾉ' => 'ᾁ',
-       'ᾊ' => 'ᾂ',
-       'ᾋ' => 'ᾃ',
-       'ᾌ' => 'ᾄ',
-       'ᾍ' => 'ᾅ',
-       'ᾎ' => 'ᾆ',
-       'ᾏ' => 'ᾇ',
-       'ᾘ' => 'ᾐ',
-       'ᾙ' => 'ᾑ',
-       'ᾚ' => 'ᾒ',
-       'ᾛ' => 'ᾓ',
-       'ᾜ' => 'ᾔ',
-       'ᾝ' => 'ᾕ',
-       'ᾞ' => 'ᾖ',
-       'ᾟ' => 'ᾗ',
-       'ᾨ' => 'ᾠ',
-       'ᾩ' => 'ᾡ',
-       'ᾪ' => 'ᾢ',
-       'ᾫ' => 'ᾣ',
-       'ᾬ' => 'ᾤ',
-       'ᾭ' => 'ᾥ',
-       'ᾮ' => 'ᾦ',
-       'ᾯ' => 'ᾧ',
-       'Ᾰ' => 'ᾰ',
-       'Ᾱ' => 'ᾱ',
-       'Ὰ' => 'ὰ',
-       'Ά' => 'ά',
-       'ᾼ' => 'ᾳ',
-       'Ὲ' => 'ὲ',
-       'Έ' => 'έ',
-       'Ὴ' => 'ὴ',
-       'Ή' => 'ή',
-       'ῌ' => 'ῃ',
-       'Ῐ' => 'ῐ',
-       'Ῑ' => 'ῑ',
-       'Ὶ' => 'ὶ',
-       'Ί' => 'ί',
-       'Ῠ' => 'ῠ',
-       'Ῡ' => 'ῡ',
-       'Ὺ' => 'ὺ',
-       'Ύ' => 'ύ',
-       'Ῥ' => 'ῥ',
-       'Ὸ' => 'ὸ',
-       'Ό' => 'ό',
-       'Ὼ' => 'ὼ',
-       'Ώ' => 'ώ',
-       'ῼ' => 'ῳ',
-       'Ω' => 'ω',
-       'K' => 'k',
-       'Å' => 'å',
-       'Ⅎ' => 'ⅎ',
-       'Ⅰ' => 'ⅰ',
-       'Ⅱ' => 'ⅱ',
-       'Ⅲ' => 'ⅲ',
-       'Ⅳ' => 'ⅳ',
-       'Ⅴ' => 'ⅴ',
-       'Ⅵ' => 'ⅵ',
-       'Ⅶ' => 'ⅶ',
-       'Ⅷ' => 'ⅷ',
-       'Ⅸ' => 'ⅸ',
-       'Ⅹ' => 'ⅹ',
-       'Ⅺ' => 'ⅺ',
-       'Ⅻ' => 'ⅻ',
-       'Ⅼ' => 'ⅼ',
-       'Ⅽ' => 'ⅽ',
-       'Ⅾ' => 'ⅾ',
-       'Ⅿ' => 'ⅿ',
-       'Ↄ' => 'ↄ',
-       'Ⓐ' => 'ⓐ',
-       'Ⓑ' => 'ⓑ',
-       'Ⓒ' => 'ⓒ',
-       'Ⓓ' => 'ⓓ',
-       'Ⓔ' => 'ⓔ',
-       'Ⓕ' => 'ⓕ',
-       'Ⓖ' => 'ⓖ',
-       'Ⓗ' => 'ⓗ',
-       'Ⓘ' => 'ⓘ',
-       'Ⓙ' => 'ⓙ',
-       'Ⓚ' => 'ⓚ',
-       'Ⓛ' => 'ⓛ',
-       'Ⓜ' => 'ⓜ',
-       'Ⓝ' => 'ⓝ',
-       'Ⓞ' => 'ⓞ',
-       'Ⓟ' => 'ⓟ',
-       'Ⓠ' => 'ⓠ',
-       'Ⓡ' => 'ⓡ',
-       'Ⓢ' => 'ⓢ',
-       'Ⓣ' => 'ⓣ',
-       'Ⓤ' => 'ⓤ',
-       'Ⓥ' => 'ⓥ',
-       'Ⓦ' => 'ⓦ',
-       'Ⓧ' => 'ⓧ',
-       'Ⓨ' => 'ⓨ',
-       'Ⓩ' => 'ⓩ',
-       'Ⰰ' => 'ⰰ',
-       'Ⰱ' => 'ⰱ',
-       'Ⰲ' => 'ⰲ',
-       'Ⰳ' => 'ⰳ',
-       'Ⰴ' => 'ⰴ',
-       'Ⰵ' => 'ⰵ',
-       'Ⰶ' => 'ⰶ',
-       'Ⰷ' => 'ⰷ',
-       'Ⰸ' => 'ⰸ',
-       'Ⰹ' => 'ⰹ',
-       'Ⰺ' => 'ⰺ',
-       'Ⰻ' => 'ⰻ',
-       'Ⰼ' => 'ⰼ',
-       'Ⰽ' => 'ⰽ',
-       'Ⰾ' => 'ⰾ',
-       'Ⰿ' => 'ⰿ',
-       'Ⱀ' => 'ⱀ',
-       'Ⱁ' => 'ⱁ',
-       'Ⱂ' => 'ⱂ',
-       'Ⱃ' => 'ⱃ',
-       'Ⱄ' => 'ⱄ',
-       'Ⱅ' => 'ⱅ',
-       'Ⱆ' => 'ⱆ',
-       'Ⱇ' => 'ⱇ',
-       'Ⱈ' => 'ⱈ',
-       'Ⱉ' => 'ⱉ',
-       'Ⱊ' => 'ⱊ',
-       'Ⱋ' => 'ⱋ',
-       'Ⱌ' => 'ⱌ',
-       'Ⱍ' => 'ⱍ',
-       'Ⱎ' => 'ⱎ',
-       'Ⱏ' => 'ⱏ',
-       'Ⱐ' => 'ⱐ',
-       'Ⱑ' => 'ⱑ',
-       'Ⱒ' => 'ⱒ',
-       'Ⱓ' => 'ⱓ',
-       'Ⱔ' => 'ⱔ',
-       'Ⱕ' => 'ⱕ',
-       'Ⱖ' => 'ⱖ',
-       'Ⱗ' => 'ⱗ',
-       'Ⱘ' => 'ⱘ',
-       'Ⱙ' => 'ⱙ',
-       'Ⱚ' => 'ⱚ',
-       'Ⱛ' => 'ⱛ',
-       'Ⱜ' => 'ⱜ',
-       'Ⱝ' => 'ⱝ',
-       'Ⱞ' => 'ⱞ',
-       'Ⱡ' => 'ⱡ',
-       'Ɫ' => 'ɫ',
-       'Ᵽ' => 'ᵽ',
-       'Ɽ' => 'ɽ',
-       'Ⱨ' => 'ⱨ',
-       'Ⱪ' => 'ⱪ',
-       'Ⱬ' => 'ⱬ',
-       'Ɑ' => 'ɑ',
-       'Ɱ' => 'ɱ',
-       'Ɐ' => 'ɐ',
-       'Ɒ' => 'ɒ',
-       'Ⱳ' => 'ⱳ',
-       'Ⱶ' => 'ⱶ',
-       'Ȿ' => 'ȿ',
-       'Ɀ' => 'ɀ',
-       'Ⲁ' => 'ⲁ',
-       'Ⲃ' => 'ⲃ',
-       'Ⲅ' => 'ⲅ',
-       'Ⲇ' => 'ⲇ',
-       'Ⲉ' => 'ⲉ',
-       'Ⲋ' => 'ⲋ',
-       'Ⲍ' => 'ⲍ',
-       'Ⲏ' => 'ⲏ',
-       'Ⲑ' => 'ⲑ',
-       'Ⲓ' => 'ⲓ',
-       'Ⲕ' => 'ⲕ',
-       'Ⲗ' => 'ⲗ',
-       'Ⲙ' => 'ⲙ',
-       'Ⲛ' => 'ⲛ',
-       'Ⲝ' => 'ⲝ',
-       'Ⲟ' => 'ⲟ',
-       'Ⲡ' => 'ⲡ',
-       'Ⲣ' => 'ⲣ',
-       'Ⲥ' => 'ⲥ',
-       'Ⲧ' => 'ⲧ',
-       'Ⲩ' => 'ⲩ',
-       'Ⲫ' => 'ⲫ',
-       'Ⲭ' => 'ⲭ',
-       'Ⲯ' => 'ⲯ',
-       'Ⲱ' => 'ⲱ',
-       'Ⲳ' => 'ⲳ',
-       'Ⲵ' => 'ⲵ',
-       'Ⲷ' => 'ⲷ',
-       'Ⲹ' => 'ⲹ',
-       'Ⲻ' => 'ⲻ',
-       'Ⲽ' => 'ⲽ',
-       'Ⲿ' => 'ⲿ',
-       'Ⳁ' => 'ⳁ',
-       'Ⳃ' => 'ⳃ',
-       'Ⳅ' => 'ⳅ',
-       'Ⳇ' => 'ⳇ',
-       'Ⳉ' => 'ⳉ',
-       'Ⳋ' => 'ⳋ',
-       'Ⳍ' => 'ⳍ',
-       'Ⳏ' => 'ⳏ',
-       'Ⳑ' => 'ⳑ',
-       'Ⳓ' => 'ⳓ',
-       'Ⳕ' => 'ⳕ',
-       'Ⳗ' => 'ⳗ',
-       'Ⳙ' => 'ⳙ',
-       'Ⳛ' => 'ⳛ',
-       'Ⳝ' => 'ⳝ',
-       'Ⳟ' => 'ⳟ',
-       'Ⳡ' => 'ⳡ',
-       'Ⳣ' => 'ⳣ',
-       'Ⳬ' => 'ⳬ',
-       'Ⳮ' => 'ⳮ',
-       'Ꙁ' => 'ꙁ',
-       'Ꙃ' => 'ꙃ',
-       'Ꙅ' => 'ꙅ',
-       'Ꙇ' => 'ꙇ',
-       'Ꙉ' => 'ꙉ',
-       'Ꙋ' => 'ꙋ',
-       'Ꙍ' => 'ꙍ',
-       'Ꙏ' => 'ꙏ',
-       'Ꙑ' => 'ꙑ',
-       'Ꙓ' => 'ꙓ',
-       'Ꙕ' => 'ꙕ',
-       'Ꙗ' => 'ꙗ',
-       'Ꙙ' => 'ꙙ',
-       'Ꙛ' => 'ꙛ',
-       'Ꙝ' => 'ꙝ',
-       'Ꙟ' => 'ꙟ',
-       'Ꙡ' => 'ꙡ',
-       'Ꙣ' => 'ꙣ',
-       'Ꙥ' => 'ꙥ',
-       'Ꙧ' => 'ꙧ',
-       'Ꙩ' => 'ꙩ',
-       'Ꙫ' => 'ꙫ',
-       'Ꙭ' => 'ꙭ',
-       'Ꚁ' => 'ꚁ',
-       'Ꚃ' => 'ꚃ',
-       'Ꚅ' => 'ꚅ',
-       'Ꚇ' => 'ꚇ',
-       'Ꚉ' => 'ꚉ',
-       'Ꚋ' => 'ꚋ',
-       'Ꚍ' => 'ꚍ',
-       'Ꚏ' => 'ꚏ',
-       'Ꚑ' => 'ꚑ',
-       'Ꚓ' => 'ꚓ',
-       'Ꚕ' => 'ꚕ',
-       'Ꚗ' => 'ꚗ',
-       'Ꜣ' => 'ꜣ',
-       'Ꜥ' => 'ꜥ',
-       'Ꜧ' => 'ꜧ',
-       'Ꜩ' => 'ꜩ',
-       'Ꜫ' => 'ꜫ',
-       'Ꜭ' => 'ꜭ',
-       'Ꜯ' => 'ꜯ',
-       'Ꜳ' => 'ꜳ',
-       'Ꜵ' => 'ꜵ',
-       'Ꜷ' => 'ꜷ',
-       'Ꜹ' => 'ꜹ',
-       'Ꜻ' => 'ꜻ',
-       'Ꜽ' => 'ꜽ',
-       'Ꜿ' => 'ꜿ',
-       'Ꝁ' => 'ꝁ',
-       'Ꝃ' => 'ꝃ',
-       'Ꝅ' => 'ꝅ',
-       'Ꝇ' => 'ꝇ',
-       'Ꝉ' => 'ꝉ',
-       'Ꝋ' => 'ꝋ',
-       'Ꝍ' => 'ꝍ',
-       'Ꝏ' => 'ꝏ',
-       'Ꝑ' => 'ꝑ',
-       'Ꝓ' => 'ꝓ',
-       'Ꝕ' => 'ꝕ',
-       'Ꝗ' => 'ꝗ',
-       'Ꝙ' => 'ꝙ',
-       'Ꝛ' => 'ꝛ',
-       'Ꝝ' => 'ꝝ',
-       'Ꝟ' => 'ꝟ',
-       'Ꝡ' => 'ꝡ',
-       'Ꝣ' => 'ꝣ',
-       'Ꝥ' => 'ꝥ',
-       'Ꝧ' => 'ꝧ',
-       'Ꝩ' => 'ꝩ',
-       'Ꝫ' => 'ꝫ',
-       'Ꝭ' => 'ꝭ',
-       'Ꝯ' => 'ꝯ',
-       'Ꝺ' => 'ꝺ',
-       'Ꝼ' => 'ꝼ',
-       'Ᵹ' => 'ᵹ',
-       'Ꝿ' => 'ꝿ',
-       'Ꞁ' => 'ꞁ',
-       'Ꞃ' => 'ꞃ',
-       'Ꞅ' => 'ꞅ',
-       'Ꞇ' => 'ꞇ',
-       'Ꞌ' => 'ꞌ',
-       'Ɥ' => 'ɥ',
-       'Ꞑ' => 'ꞑ',
-       'Ꞡ' => 'ꞡ',
-       'Ꞣ' => 'ꞣ',
-       'Ꞥ' => 'ꞥ',
-       'Ꞧ' => 'ꞧ',
-       'Ꞩ' => 'ꞩ',
-       'Ａ' => 'ａ',
-       'Ｂ' => 'ｂ',
-       'Ｃ' => 'ｃ',
-       'Ｄ' => 'ｄ',
-       'Ｅ' => 'ｅ',
-       'Ｆ' => 'ｆ',
-       'Ｇ' => 'ｇ',
-       'Ｈ' => 'ｈ',
-       'Ｉ' => 'ｉ',
-       'Ｊ' => 'ｊ',
-       'Ｋ' => 'ｋ',
-       'Ｌ' => 'ｌ',
-       'Ｍ' => 'ｍ',
-       'Ｎ' => 'ｎ',
-       'Ｏ' => 'ｏ',
-       'Ｐ' => 'ｐ',
-       'Ｑ' => 'ｑ',
-       'Ｒ' => 'ｒ',
-       'Ｓ' => 'ｓ',
-       'Ｔ' => 'ｔ',
-       'Ｕ' => 'ｕ',
-       'Ｖ' => 'ｖ',
-       'Ｗ' => 'ｗ',
-       'Ｘ' => 'ｘ',
-       'Ｙ' => 'ｙ',
-       'Ｚ' => 'ｚ',
-       '𐐀' => '𐐨',
-       '𐐁' => '𐐩',
-       '𐐂' => '𐐪',
-       '𐐃' => '𐐫',
-       '𐐄' => '𐐬',
-       '𐐅' => '𐐭',
-       '𐐆' => '𐐮',
-       '𐐇' => '𐐯',
-       '𐐈' => '𐐰',
-       '𐐉' => '𐐱',
-       '𐐊' => '𐐲',
-       '𐐋' => '𐐳',
-       '𐐌' => '𐐴',
-       '𐐍' => '𐐵',
-       '𐐎' => '𐐶',
-       '𐐏' => '𐐷',
-       '𐐐' => '𐐸',
-       '𐐑' => '𐐹',
-       '𐐒' => '𐐺',
-       '𐐓' => '𐐻',
-       '𐐔' => '𐐼',
-       '𐐕' => '𐐽',
-       '𐐖' => '𐐾',
-       '𐐗' => '𐐿',
-       '𐐘' => '𐑀',
-       '𐐙' => '𐑁',
-       '𐐚' => '𐑂',
-       '𐐛' => '𐑃',
-       '𐐜' => '𐑄',
-       '𐐝' => '𐑅',
-       '𐐞' => '𐑆',
-       '𐐟' => '𐑇',
-       '𐐠' => '𐑈',
-       '𐐡' => '𐑉',
-       '𐐢' => '𐑊',
-       '𐐣' => '𐑋',
-       '𐐤' => '𐑌',
-       '𐐥' => '𐑍',
-       '𐐦' => '𐑎',
-       '𐐧' => '𐑏'
-);
diff --git a/includes/normal/Utf8CaseGenerate.php b/includes/normal/Utf8CaseGenerate.php

deleted file mode 100644 (file)

index adc3ef2..0000000
--- a/includes/normal/Utf8CaseGenerate.php
+++ /dev/null
@@ -1,112 +0,0 @@
-<?php
-/**
- * This script generates Utf8Case.php from the Unicode Character Database
- * and supplementary files.
- *
- * Copyright © 2004,2008 Brion Vibber <brion@pobox.com>
- * http://www.mediawiki.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @ingroup UtfNormal
- */
-
-if( PHP_SAPI != 'cli' ) {
-       die( "Run me from the command line please.\n" );
-}
-
-require_once 'UtfNormalDefines.php';
-require_once 'UtfNormalUtil.php';
-
-$in = fopen("UnicodeData.txt", "rt" );
-if( !$in ) {
-       print "Can't open UnicodeData.txt for reading.\n";
-       print "If necessary, fetch this file from the internet:\n";
-       print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
-       exit(-1);
-}
-$wikiUpperChars = array();
-$wikiLowerChars = array();
-
-print "Reading character definitions...\n";
-while( false !== ($line = fgets( $in ) ) ) {
-       $columns = explode(';', $line);
-       $codepoint = $columns[0];
-       $name = $columns[1];
-       $simpleUpper = $columns[12];
-       $simpleLower = $columns[13];
-
-       $source = codepointToUtf8( hexdec( $codepoint ) );
-       if( $simpleUpper ) {
-               $wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
-       }
-       if( $simpleLower ) {
-               $wikiLowerChars[$source] = codepointToUtf8( hexdec( $simpleLower ) );
-       }
-}
-fclose( $in );
-
-$out = fopen( "Utf8Case.php", "wt" );
-if( $out ) {
-       $outUpperChars = escapeArray( $wikiUpperChars );
-       $outLowerChars = escapeArray( $wikiLowerChars );
-       $outdata = "<" . "?php
-/**
- * Simple 1:1 upper/lowercase switching arrays for utf-8 text.
- * Won't get context-sensitive things yet.
- *
- * Hack for bugs in ucfirst() and company
- *
- * These are pulled from memcached if possible, as this is faster than filling
- * up a big array manually.
- *
- * @file
- * @ingroup Language
- */
-
-/**
- * Translation array to get upper case character
- */
-\$wikiUpperChars = $outUpperChars;
-
-/**
- * Translation array to get lower case character
- */
-\$wikiLowerChars = $outLowerChars;\n";
-       fputs( $out, $outdata );
-       fclose( $out );
-       print "Wrote out Utf8Case.php\n";
-} else {
-       print "Can't create file Utf8Case.php\n";
-       exit(-1);
-}
-
-
-function escapeArray( $arr ) {
-       return "array(\n" .
-               implode( ",\n",
-                       array_map( "escapeLine",
-                               array_keys( $arr ),
-                               array_values( $arr ) ) ) .
-               "\n)";
-}
-
-function escapeLine( $key, $val ) {
-       $encKey = escapeSingleString( $key );
-       $encVal = escapeSingleString( $val );
-       return "\t'$encKey' => '$encVal'";
-}
diff --git a/maintenance/language/generateNormalizerData.php b/maintenance/language/generateNormalizerData.php

deleted file mode 100644 (file)

index b638b63..0000000
--- a/maintenance/language/generateNormalizerData.php
+++ /dev/null
@@ -1,160 +0,0 @@
-<?php
-/**
- * Generates normalizer data files for Arabic and Malayalam.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @ingroup MaintenanceLanguage
- */
-
-require_once __DIR__ . '/../../includes/normal/UtfNormalUtil.php';
-
-require_once __DIR__ . '/../Maintenance.php';
-
-/**
- * Generates normalizer data files for Arabic and Malayalam.
- * For NFC see includes/normal.
- *
- * @ingroup MaintenanceLanguage
- */
-class GenerateNormalizerData extends Maintenance {
-       public $dataFile;
-
-       public function __construct() {
-               parent::__construct();
-               $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
-                       'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
-       }
-
-       public function execute() {
-               if ( !$this->hasOption( 'unicode-data-file' ) ) {
-                       $this->dataFile = 'UnicodeData.txt';
-                       if ( !file_exists( $this->dataFile ) ) {
-                               $this->error( "Unable to find UnicodeData.txt. Please specify " .
-                                       "its location with --unicode-data-file=<FILE>" );
-                               exit( 1 );
-                       }
-               } else {
-                       $this->dataFile = $this->getOption( 'unicode-data-file' );
-                       if ( !file_exists( $this->dataFile ) ) {
-                               $this->error( 'Unable to find the specified data file.' );
-                               exit( 1 );
-                       }
-               }
-
-               $this->generateArabic();
-               $this->generateMalayalam();
-       }
-
-       function generateArabic() {
-               $file = fopen( $this->dataFile, 'r' );
-               if ( !$file ) {
-                       $this->error( 'Unable to open the data file.' );
-                       exit( 1 );
-               }
-
-               // For the file format, see http://www.unicode.org/reports/tr44/
-               $fieldNames = array(
-                       'Code',
-                       'Name',
-                       'General_Category',
-                       'Canonical_Combining_Class',
-                       'Bidi_Class',
-                       'Decomposition_Type_Mapping',
-                       'Numeric_Type_Value',
-                       'Bidi_Mirrored',
-                       'Unicode_1_Name',
-                       'ISO_Comment',
-                       'Simple_Uppercase_Mapping',
-                       'Simple_Lowercase_Mapping',
-                       'Simple_Titlecase_Mapping'
-               );
-
-               $pairs = array();
-
-               $lineNum = 0;
-               while ( false !== ( $line = fgets( $file ) ) ) {
-                       ++$lineNum;
-
-                       # Strip comments
-                       $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
-                       if ( $line === '' ) {
-                               continue;
-                       }
-
-                       # Split fields
-                       $numberedData = explode( ';', $line );
-                       $data = array();
-                       foreach ( $fieldNames as $number => $name ) {
-                               $data[$name] = $numberedData[$number];
-                       }
-
-                       $code = base_convert( $data['Code'], 16, 10 );
-                       if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
-                               || ( $code >= 0xFE70 && $code <= 0xFEFF ) # Arabic presentation forms B
-                       ) {
-                               if ( $data['Decomposition_Type_Mapping'] === '' ) {
-                                       // No decomposition
-                                       continue;
-                               }
-                               if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
-                                       $data['Decomposition_Type_Mapping'], $m )
-                               ) {
-                                       $this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
-                                       $this->error( $line );
-                                       continue;
-                               }
-
-                               $source = hexSequenceToUtf8( $data['Code'] );
-                               $dest = hexSequenceToUtf8( $m[2] );
-                               $pairs[$source] = $dest;
-                       }
-               }
-
-               global $IP;
-               file_put_contents( "$IP/serialized/normalize-ar.ser", serialize( $pairs ) );
-               echo "ar: " . count( $pairs ) . " pairs written.\n";
-       }
-
-       function generateMalayalam() {
-               $hexPairs = array(
-                       # From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
-                       '0D23 0D4D 200D' => '0D7A',
-                       '0D28 0D4D 200D' => '0D7B',
-                       '0D30 0D4D 200D' => '0D7C',
-                       '0D32 0D4D 200D' => '0D7D',
-                       '0D33 0D4D 200D' => '0D7E',
-
-                       # From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
-                       '0D15 0D4D 200D' => '0D7F',
-               );
-
-               $pairs = array();
-               foreach ( $hexPairs as $hexSource => $hexDest ) {
-                       $source = hexSequenceToUtf8( $hexSource );
-                       $dest = hexSequenceToUtf8( $hexDest );
-                       $pairs[$source] = $dest;
-               }
-
-               global $IP;
-               file_put_contents( "$IP/serialized/normalize-ml.ser", serialize( $pairs ) );
-               echo "ml: " . count( $pairs ) . " pairs written.\n";
-       }
-}
-
-$maintClass = 'GenerateNormalizerData';
-require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/maintenance/language/generateNormalizerDataAr.php b/maintenance/language/generateNormalizerDataAr.php

new file mode 100644 (file)

index 0000000..ece0450
--- /dev/null
+++ b/maintenance/language/generateNormalizerDataAr.php
@@ -0,0 +1,133 @@
+<?php
+/**
+ * Generates the normalizer data file for Arabic.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup MaintenanceLanguage
+ */
+
+require_once __DIR__ . '/../Maintenance.php';
+
+/**
+ * Maintenance script that builds serialized/normalize-ar.ser from UnicodeData.txt:
+ * a map from each Arabic presentation form to its decomposition mapping.
+ * For NFC see includes/normal.
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateNormalizerDataAr extends Maintenance {
+       public function __construct() {
+               parent::__construct();
+               $this->mDescription = 'Generate the normalizer data file for Arabic';
+               $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
+                       'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
+       }
+
+       public function getDbType() {
+               return Maintenance::DB_NONE;
+       }
+
+       public function execute() {
+               if ( !$this->hasOption( 'unicode-data-file' ) ) {
+                       $dataFile = 'UnicodeData.txt'; // default: looked up in the current directory
+                       if ( !file_exists( $dataFile ) ) {
+                               $this->error( "Unable to find UnicodeData.txt. Please specify " .
+                                       "its location with --unicode-data-file=<FILE>" );
+                               exit( 1 );
+                       }
+               } else {
+                       $dataFile = $this->getOption( 'unicode-data-file' );
+                       if ( !file_exists( $dataFile ) ) {
+                               $this->error( 'Unable to find the specified data file.' );
+                               exit( 1 );
+                       }
+               }
+
+               $file = fopen( $dataFile, 'r' );
+               if ( !$file ) {
+                       $this->error( 'Unable to open the data file.' );
+                       exit( 1 );
+               }
+
+               // For the file format, see http://www.unicode.org/reports/tr44/
+               $fieldNames = array(
+                       'Code',
+                       'Name',
+                       'General_Category',
+                       'Canonical_Combining_Class',
+                       'Bidi_Class',
+                       'Decomposition_Type_Mapping',
+                       'Numeric_Type_Value_6',
+                       'Numeric_Type_Value_7',
+                       'Numeric_Type_Value_8',
+                       'Bidi_Mirrored',
+                       'Unicode_1_Name',
+                       'ISO_Comment',
+                       'Simple_Uppercase_Mapping',
+                       'Simple_Lowercase_Mapping',
+                       'Simple_Titlecase_Mapping'
+               );
+
+               $pairs = array(); // presentation form (UTF-8) => decomposition mapping (UTF-8)
+               $lineNum = 0; // 1-based line counter, only used in error messages
+               while ( false !== ( $line = fgets( $file ) ) ) {
+                       ++$lineNum;
+
+                       # Strip comments
+                       $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
+                       if ( $line === '' ) {
+                               continue;
+                       }
+
+                       # Split fields
+                       $numberedData = explode( ';', $line );
+                       $data = array();
+                       foreach ( $fieldNames as $number => $name ) {
+                               $data[$name] = $numberedData[$number];
+                       }
+
+                       $code = hexdec( $data['Code'] ); // numeric code point for the range checks below
+                       if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
+                               || ( $code >= 0xFE70 && $code <= 0xFEFF ) # Arabic presentation forms B
+                       ) {
+                               if ( $data['Decomposition_Type_Mapping'] === '' ) {
+                                       // No decomposition
+                                       continue;
+                               }
+                               if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
+                                       $data['Decomposition_Type_Mapping'], $m )
+                               ) {
+                                       $this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
+                                       $this->error( $line );
+                                       continue;
+                               }
+
+                               $source = hexSequenceToUtf8( $data['Code'] );
+                               $dest = hexSequenceToUtf8( $m[2] ); // $m[2]: the hex codes after the <tag>
+                               $pairs[$source] = $dest;
+                       }
+               }
+               fclose( $file ); // every line has been read; release the handle before writing
+
+               global $IP;
+               file_put_contents( "$IP/serialized/normalize-ar.ser", serialize( $pairs ) );
+               echo "ar: " . count( $pairs ) . " pairs written.\n";
+       }
+}
+
+$maintClass = 'GenerateNormalizerDataAr';
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/maintenance/language/generateNormalizerDataMl.php b/maintenance/language/generateNormalizerDataMl.php

new file mode 100644 (file)

index 0000000..c7237cf
--- /dev/null
+++ b/maintenance/language/generateNormalizerDataMl.php
@@ -0,0 +1,69 @@
+<?php
+/**
+ * Generates the normalizer data file for Malayalam.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup MaintenanceLanguage
+ */
+
+require_once __DIR__ . '/../Maintenance.php';
+
+/**
+ * Maintenance script that writes the Malayalam normalizer data file.
+ * For NFC, see includes/normal.
+ *
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateNormalizerDataMl extends Maintenance {
+       public function __construct() {
+               parent::__construct();
+               $this->mDescription = 'Generate the normalizer data file for Malayalam';
+       }
+
+       public function getDbType() {
+               return Maintenance::DB_NONE;
+       }
+
+       public function execute() {
+               global $IP;
+               # Space-separated hex sequences, source => destination; converted to UTF-8 below.
+               $hexTable = array(
+                       # From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
+                       '0D23 0D4D 200D' => '0D7A',
+                       '0D28 0D4D 200D' => '0D7B',
+                       '0D30 0D4D 200D' => '0D7C',
+                       '0D32 0D4D 200D' => '0D7D',
+                       '0D33 0D4D 200D' => '0D7E',
+
+                       # From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
+                       '0D15 0D4D 200D' => '0D7F',
+               );
+
+               # Run both sides of every entry through hexSequenceToUtf8().
+               $utf8Table = array();
+               foreach ( $hexTable as $fromHex => $toHex ) {
+                       $utf8Table[hexSequenceToUtf8( $fromHex )] = hexSequenceToUtf8( $toHex );
+               }
+
+               file_put_contents( "$IP/serialized/normalize-ml.ser", serialize( $utf8Table ) );
+               echo "ml: ", count( $utf8Table ), " pairs written.\n";
+       }
+}
+
+$maintClass = 'GenerateNormalizerDataMl';
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/maintenance/language/generateUtf8Case.php b/maintenance/language/generateUtf8Case.php

new file mode 100644 (file)

index 0000000..dbb70c1
--- /dev/null
+++ b/maintenance/language/generateUtf8Case.php
@@ -0,0 +1,129 @@
+<?php
+/**
+ * Generates Utf8Case.ser from the Unicode Character Database and
+ * supplementary files.
+ *
+ * Copyright © 2004, 2008 Brion Vibber <brion@pobox.com>
+ * http://www.mediawiki.org/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup MaintenanceLanguage
+ */
+
+require_once __DIR__ . '/../Maintenance.php';
+
+/**
+ * Generates Utf8Case.ser from the Unicode Character Database and
+ * supplementary files.
+ *
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateUtf8Case extends Maintenance {
+
+       /**
+        * Set the script description and register the --unicode-data-file option.
+        */
+       public function __construct() {
+               parent::__construct();
+               $this->mDescription = 'Generate Utf8Case.ser from the Unicode Character Database ' .
+                       'and supplementary files';
+               $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
+                       'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
+       }
+
+       /**
+        * Declare the database requirement of this script.
+        *
+        * @return int Maintenance::DB_NONE
+        */
+       public function getDbType() {
+               return Maintenance::DB_NONE;
+       }
+
+       /**
+        * Read the Unicode data file, collect the Simple_Uppercase_Mapping and
+        * Simple_Lowercase_Mapping of every listed code point and write both maps,
+        * serialized, to $IP/serialized/Utf8Case.ser.
+        *
+        * The data file is taken from --unicode-data-file, or from UnicodeData.txt
+        * in the current directory when the option is not given. Exits with
+        * status 1 if the data file cannot be found or opened, or if the output
+        * file cannot be written.
+        */
+       public function execute() {
+               if ( !$this->hasOption( 'unicode-data-file' ) ) {
+                       $dataFile = 'UnicodeData.txt';
+                       if ( !file_exists( $dataFile ) ) {
+                               $this->error( "Unable to find UnicodeData.txt. Please specify " .
+                                       "its location with --unicode-data-file=<FILE>" );
+                               exit( 1 );
+                       }
+               } else {
+                       $dataFile = $this->getOption( 'unicode-data-file' );
+                       if ( !file_exists( $dataFile ) ) {
+                               $this->error( 'Unable to find the specified data file.' );
+                               exit( 1 );
+                       }
+               }
+
+               $file = fopen( $dataFile, 'r' );
+               if ( !$file ) {
+                       $this->error( 'Unable to open the data file.' );
+                       exit( 1 );
+               }
+
+               // For the file format, see http://www.unicode.org/reports/tr44/
+               $fieldNames = array(
+                       'Code',
+                       'Name',
+                       'General_Category',
+                       'Canonical_Combining_Class',
+                       'Bidi_Class',
+                       'Decomposition_Type_Mapping',
+                       'Numeric_Type_Value_6',
+                       'Numeric_Type_Value_7',
+                       'Numeric_Type_Value_8',
+                       'Bidi_Mirrored',
+                       'Unicode_1_Name',
+                       'ISO_Comment',
+                       'Simple_Uppercase_Mapping',
+                       'Simple_Lowercase_Mapping',
+                       'Simple_Titlecase_Mapping'
+               );
+               $fieldCount = count( $fieldNames );
+
+               $upper = array();
+               $lower = array();
+
+               $lineNum = 0;
+               while ( false !== ( $line = fgets( $file ) ) ) {
+                       ++$lineNum;
+
+                       # Strip comments
+                       $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
+                       if ( $line === '' ) {
+                               continue;
+                       }
+
+                       # Split fields. A short line would raise undefined-offset notices in
+                       # the loop below, so report it and treat the missing fields as empty.
+                       $numberedData = explode( ';', $line );
+                       if ( count( $numberedData ) < $fieldCount ) {
+                               $this->error( "Line $lineNum of $dataFile has fewer than " .
+                                       "$fieldCount fields; missing fields are treated as empty." );
+                               $numberedData = array_pad( $numberedData, $fieldCount, '' );
+                       }
+                       $data = array();
+                       foreach ( $fieldNames as $number => $name ) {
+                               $data[$name] = $numberedData[$number];
+                       }
+
+                       # An empty mapping field means the code point has no such mapping
+                       $source = hexSequenceToUtf8( $data['Code'] );
+                       if ( $data['Simple_Uppercase_Mapping'] !== '' ) {
+                               $upper[$source] = hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
+                       }
+                       if ( $data['Simple_Lowercase_Mapping'] !== '' ) {
+                               $lower[$source] = hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
+                       }
+               }
+               fclose( $file );
+
+               global $IP;
+               $outFile = "$IP/serialized/Utf8Case.ser";
+               $serialized = serialize( array(
+                       'wikiUpperChars' => $upper,
+                       'wikiLowerChars' => $lower,
+               ) );
+               # file_put_contents() returns false on failure; fail loudly instead of
+               # leaving a missing or stale Utf8Case.ser behind without any message
+               if ( file_put_contents( $outFile, $serialized ) === false ) {
+                       $this->error( "Unable to write $outFile." );
+                       exit( 1 );
+               }
+       }
+}
+
+$maintClass = 'GenerateUtf8Case';
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/serialized/.gitignore b/serialized/.gitignore

new file mode 100644 (file)

index 0000000..d9d58dd
--- /dev/null
+++ b/serialized/.gitignore
@@ -0,0 +1,4 @@
+/UnicodeData.txt
+/allkeys.txt
+/ucd.all.grouped.xml
+/ucd.all.grouped.zip
diff --git a/serialized/Makefile b/serialized/Makefile

index 062155b..c7e7506 100644 (file)
--- a/serialized/Makefile
+++ b/serialized/Makefile
@@ -1,7 +1,7 @@
-
-SPECIAL_TARGETS=Utf8Case.ser
+SPECIAL_TARGETS=Utf8Case.ser normalize-ar.ser normalize-ml.ser first-letters-root.ser
  ALL_TARGETS=$(SPECIAL_TARGETS)
  DIST_TARGETS=$(SPECIAL_TARGETS)
+UNICODE_VERSION=6.0.0
  
  .PHONY: all dist clean
  
@@ -13,6 +13,26 @@ dist: $(DIST_TARGETS)
  clean:
         rm -f $(ALL_TARGETS)
  
-Utf8Case.ser : ../includes/normal/Utf8Case.php
-       php serialize.php -o $@ $<
+Utf8Case.ser: UnicodeData.txt
+       php ../maintenance/language/generateUtf8Case.php
+
+normalize-ar.ser: UnicodeData.txt
+       php ../maintenance/language/generateNormalizerDataAr.php
+
+normalize-ml.ser:
+       php ../maintenance/language/generateNormalizerDataMl.php
+
+first-letters-root.ser: allkeys.txt ucd.all.grouped.xml
+       php ../maintenance/language/generateCollationData.php
+
+UnicodeData.txt:
+       wget http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
+
+allkeys.txt:
+       wget http://www.unicode.org/Public/UCA/$(UNICODE_VERSION)/allkeys.txt
+
+# unzip restores the modification time stored in the archive, which can be
+# older than the downloaded zip. Without the touch, make would consider the
+# target out of date on every run; -o avoids unzip's interactive overwrite
+# prompt when the target is re-extracted.
+ucd.all.grouped.xml: ucd.all.grouped.zip
+       unzip -o ucd.all.grouped.zip ucd.all.grouped.xml
+       touch ucd.all.grouped.xml
  
+ucd.all.grouped.zip:
+       wget http://www.unicode.org/Public/$(UNICODE_VERSION)/ucdxml/ucd.all.grouped.zip
diff --git a/serialized/serialize.php b/serialized/serialize.php

deleted file mode 100644 (file)

index 766c1a5..0000000
--- a/serialized/serialize.php
+++ /dev/null
@@ -1,95 +0,0 @@
-<?php
-/**
- * Serialize variables found in input file and store the result in the
- * specified file.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- */
-
-if ( !defined( 'MEDIAWIKI' ) ) {
-       $wgNoDBParam = true;
-       $optionsWithArgs = array( 'o' );
-       require_once __DIR__ .'/../maintenance/commandLine.inc';
-
-       $stderr = fopen( 'php://stderr', 'w' );
-       if ( !isset( $args[0] ) ) {
-               fwrite( $stderr, "No input file specified\n" );
-               exit( 1 );
-       }
-       if ( wfIsWindows() ) {
-               $files = array();
-               foreach ( $args as $arg ) {
-                       $files = array_merge( $files, glob( $arg ) );
-               }
-               if ( !$files ) {
-                       fwrite( $stderr, "No files found\n" );
-               }
-       } else {
-               $files = $args;
-       }
-
-       if ( isset( $options['o'] ) ) {
-               $out = fopen( $options['o'], 'wb' );
-               if ( !$out ) {
-                       fwrite( $stderr, "Unable to open file \"{$options['o']}\" for output\n" );
-                       exit( 1 );
-               }
-       } else {
-               $out = fopen( 'php://stdout', 'wb' );
-       }
-
-       $vars = array();
-       foreach ( $files as $inputFile ) {
-               $vars = array_merge( $vars, getVars( $inputFile ) );
-       }
-       fwrite( $out, serialize( $vars ) );
-       fclose( $out );
-       exit( 0 );
-}
-
-//----------------------------------------------------------------------------
-
-function getVars( $_gv_filename ) {
-       require $_gv_filename;
-       $vars = get_defined_vars();
-       unset( $vars['_gv_filename'] );
-
-       # Clean up line endings
-       if ( wfIsWindows() ) {
-               $vars = unixLineEndings( $vars );
-       }
-       return $vars;
-}
-
-function unixLineEndings( $var ) {
-       static $recursionLevel = 0;
-       if ( $recursionLevel > 50 ) {
-               global $stderr;
-               fwrite( $stderr, "Error: Recursion limit exceeded. Possible circular reference in array variable.\n" );
-               exit( 2 );
-       }
-
-       if ( is_array( $var ) ) {
-               ++$recursionLevel;
-               $var = array_map( 'unixLineEndings', $var );
-               --$recursionLevel;
-       } elseif ( is_string( $var ) ) {
-               $var = str_replace( "\r\n", "\n", $var );
-       }
-       return $var;
-}
author	jenkins-bot <jenkins-bot@gerrit.wikimedia.org>
	Sun, 12 Jan 2014 03:10:50 +0000 (03:10 +0000)
committer	Gerrit Code Review <gerrit@wikimedia.org>
	Sun, 12 Jan 2014 03:10:50 +0000 (03:10 +0000)
includes/normal/.gitignore	[new file with mode: 0644]	patch \| blob
includes/normal/Makefile		patch \| blob \| history
includes/normal/Utf8Case.php	[deleted file]	patch \| blob \| history
includes/normal/Utf8CaseGenerate.php	[deleted file]	patch \| blob \| history
maintenance/language/generateNormalizerData.php	[deleted file]	patch \| blob \| history
maintenance/language/generateNormalizerDataAr.php	[new file with mode: 0644]	patch \| blob
maintenance/language/generateNormalizerDataMl.php	[new file with mode: 0644]	patch \| blob
maintenance/language/generateUtf8Case.php	[new file with mode: 0644]	patch \| blob
serialized/.gitignore	[new file with mode: 0644]	patch \| blob
serialized/Makefile		patch \| blob \| history
serialized/serialize.php	[deleted file]	patch \| blob \| history