class CrhExceptions {
+ const WB = '\b'; # default word boundary; may be updated in the future
+
/**
 * Constructor. Its only action is to call loadRegs().
 *
 * NOTE(review): loadRegs() is defined outside this excerpt; presumably it
 * populates the mapping and regex tables used for conversion — confirm
 * against its definition.
 */
function __construct() {
$this->loadRegs();
}
# load C2L and L2C bidirectional affix mappings
$this->addMappings( $this->prefixMapping,
- $this->Cyrl2LatnPatterns, $this->Latn2CyrlPatterns, false, '/\b', '/u' );
+ $this->Cyrl2LatnPatterns, $this->Latn2CyrlPatterns, false, '/' . self::WB, '/u' );
$this->addMappings( $this->suffixMapping,
- $this->Cyrl2LatnPatterns, $this->Latn2CyrlPatterns, false, '/', '\b/u' );
+ $this->Cyrl2LatnPatterns, $this->Latn2CyrlPatterns, false, '/', self::WB . '/u' );
# tack on one-way mappings to the ends of the prefix and suffix patterns
$this->Cyrl2LatnPatterns += $this->Cyrl2LatnRegexes;
'оригинал' => 'original', 'оригиналь' => 'original',
'пускю' => 'püskü', 'пуськю' => 'püskü',
'къарагоз' => 'qaragöz', 'къарагозь' => 'qaragöz',
- 'етсин' => 'yetsin', 'етсин' => 'etsin',
#### Latin to Cyrillic (deduped from above)
'доцент' => 'dotsent', 'фармацевт' => 'farmatsevt', 'глицер' => 'glitser',
'люцерна' => 'lütserna', 'лицей' => 'litsey', 'меццо' => 'metstso', 'наци' => 'natsi',
'проце' => 'protse', 'рецеп' => 'retsep', 'реценз' => 'retsenz', 'теплица' => 'teplitsa',
- 'вице' => 'vitse', 'швейцар' => 'şveytsar',
+ 'вице' => 'vitse', 'швейцар' => 'şveytsar', 'богородиц' => 'bogorodits',
+ 'бруцел' => 'brutsel', 'дацюк' => 'datsük', 'доницетти' => 'donitsetti',
+ 'драцена' => 'dratsena', 'контрацеп' => 'kontratsep', 'коцюб' => 'kotsüb',
+ 'меценат' => 'metsenat', 'мицел' => 'mitsel', 'моцарт' => 'motsart', 'плац' => 'plats',
+ 'плацен' => 'platsen', 'прецедент' => 'pretsedent', 'прецес' => 'pretses',
+ 'прицеп' => 'pritsep', 'спец' => 'spets', 'троиц' => 'troits', 'шприц' => 'şprits',
+ 'эпицентр' => 'epitsentr', 'яценюк' => 'yatsenük',
# слова с тс
# words with тс
'козь' => 'köz', '-юнджи' => '-ünci', '-юнджиде' => '-üncide', '-юнджиден' => '-ünciden',
# originally L2C, here swapped
- 'еÑ\82Ñ\81ин' => 'etsin', 'лÑ\8cнаÑ\8f' => 'lnaya', 'лÑ\8cное' => 'lnoye', 'лÑ\8cнÑ\8bй' => 'lnıy', 'лÑ\8cний' => 'lniy',
+ 'льная' => 'lnaya', 'льное' => 'lnoye', 'льный' => 'lnıy', 'льний' => 'lniy',
'льская' => 'lskaya', 'льский' => 'lskiy', 'льское' => 'lskoye', 'ополь' => 'opol',
'щее' => 'şçeye', 'щий' => 'şçiy', 'щая' => 'şçaya', 'цепс' => 'tseps',
'му([иэИЭ])' => 'mü$1',
# originally L2C, here swapped
- 'роль$1' => 'rol([^ü])',
- 'усть$1' => 'üst([^ü])',
+ 'роль$1' => 'rol([^ü]|' . self::WB . ')',
+ 'усть$1' => 'üst([^ü]|' . self::WB . ')',
# more prefixes
'ком-кок' => 'köm-kök',
// TODO: refactor upper/lower/first capital whole words without
// regexes into simpler list
- '/\bКъЮШ\b/u' => 'QYŞ',
- '/\bЮШ\b/u' => 'YŞ',
-
- '/\bкок\b/u' => 'kök',
- '/\bКок\b/u' => 'Kök',
- '/\bКОК\b/u' => 'KÖK',
- '/\bком-кок\b/u' => 'köm-kök',
- '/\bКом-кок\b/u' => 'Köm-kök',
- '/\bКОМ-КОК\b/u' => 'KÖM-KÖK',
-
- '/\bкоп\b/u' => 'köp',
- '/\bКоп\b/u' => 'Köp',
- '/\bКОП\b/u' => 'KÖP',
-
- '/\bкурк\b/u' => 'kürk',
- '/\bКурк\b/u' => 'Kürk',
- '/\bКУРК\b/u' => 'KÜRK',
-
- '/\bог\b/u' => 'ög',
- '/\bОг\b/u' => 'Ög',
- '/\bОГ\b/u' => 'ÖG',
-
- '/\bюрип\b/u' => 'yürip',
- '/\bЮрип\b/u' => 'Yürip',
- '/\bЮРИП\b/u' => 'YÜRİP',
-
- '/\bюз\b/u' => 'yüz',
- '/\bЮз\b/u' => 'Yüz',
- '/\bЮЗ\b/u' => 'YÜZ',
-
- '/\bюк\b/u' => 'yük',
- '/\bЮк\b/u' => 'Yük',
- '/\bЮК\b/u' => 'YÜK',
-
- '/\bбуюп\b/u' => 'büyüp',
- '/\bБуюп\b/u' => 'Büyüp',
- '/\bБУЮП\b/u' => 'BÜYÜP',
-
- '/\bбуюк\b/u' => 'büyük',
- '/\bБуюк\b/u' => 'Büyük',
- '/\bБУЮК\b/u' => 'BÜYÜK',
-
- '/\bджонк\b/u' => 'cönk',
- '/\bДжонк\b/u' => 'Cönk',
- '/\bДЖОНК\b/u' => 'CÖNK',
- '/\bджонкю\b/u' => 'cönkü',
- '/\bДжонкю\b/u' => 'Cönkü',
- '/\bДЖОНКЮ\b/u' => 'CÖNKÜ',
-
- '/\bустке\b/u' => 'üstke',
- '/\bУстке\b/u' => 'Üstke',
- '/\bУСТКЕ\b/u' => 'ÜSTKE',
- '/\bустте\b/u' => 'üstte',
- '/\bУстте\b/u' => 'Üstte',
- '/\bУСТТЕ\b/u' => 'ÜSTTE',
- '/\bусттен\b/u' => 'üstten',
- '/\bУсттен\b/u' => 'Üstten',
- '/\bУСТТЕН\b/u' => 'ÜSTTEN',
+ '/' . self::WB . 'КъЮШ' . self::WB . '/u' => 'QYŞ',
+ '/' . self::WB . 'ЮШ' . self::WB . '/u' => 'YŞ',
+
+ '/' . self::WB . 'кок' . self::WB . '/u' => 'kök',
+ '/' . self::WB . 'Кок' . self::WB . '/u' => 'Kök',
+ '/' . self::WB . 'КОК' . self::WB . '/u' => 'KÖK',
+ '/' . self::WB . 'ком-кок' . self::WB . '/u' => 'köm-kök',
+ '/' . self::WB . 'Ком-кок' . self::WB . '/u' => 'Köm-kök',
+ '/' . self::WB . 'КОМ-КОК' . self::WB . '/u' => 'KÖM-KÖK',
+
+ '/' . self::WB . 'коп' . self::WB . '/u' => 'köp',
+ '/' . self::WB . 'Коп' . self::WB . '/u' => 'Köp',
+ '/' . self::WB . 'КОП' . self::WB . '/u' => 'KÖP',
+
+ '/' . self::WB . 'курк' . self::WB . '/u' => 'kürk',
+ '/' . self::WB . 'Курк' . self::WB . '/u' => 'Kürk',
+ '/' . self::WB . 'КУРК' . self::WB . '/u' => 'KÜRK',
+
+ '/' . self::WB . 'ог' . self::WB . '/u' => 'ög',
+ '/' . self::WB . 'Ог' . self::WB . '/u' => 'Ög',
+ '/' . self::WB . 'ОГ' . self::WB . '/u' => 'ÖG',
+
+ '/' . self::WB . 'юрип' . self::WB . '/u' => 'yürip',
+ '/' . self::WB . 'Юрип' . self::WB . '/u' => 'Yürip',
+ '/' . self::WB . 'ЮРИП' . self::WB . '/u' => 'YÜRİP',
+
+ '/' . self::WB . 'юз' . self::WB . '/u' => 'yüz',
+ '/' . self::WB . 'Юз' . self::WB . '/u' => 'Yüz',
+ '/' . self::WB . 'ЮЗ' . self::WB . '/u' => 'YÜZ',
+
+ '/' . self::WB . 'юк' . self::WB . '/u' => 'yük',
+ '/' . self::WB . 'Юк' . self::WB . '/u' => 'Yük',
+ '/' . self::WB . 'ЮК' . self::WB . '/u' => 'YÜK',
+
+ '/' . self::WB . 'буюп' . self::WB . '/u' => 'büyüp',
+ '/' . self::WB . 'Буюп' . self::WB . '/u' => 'Büyüp',
+ '/' . self::WB . 'БУЮП' . self::WB . '/u' => 'BÜYÜP',
+
+ '/' . self::WB . 'буюк' . self::WB . '/u' => 'büyük',
+ '/' . self::WB . 'Буюк' . self::WB . '/u' => 'Büyük',
+ '/' . self::WB . 'БУЮК' . self::WB . '/u' => 'BÜYÜK',
+
+ '/' . self::WB . 'джонк' . self::WB . '/u' => 'cönk',
+ '/' . self::WB . 'Джонк' . self::WB . '/u' => 'Cönk',
+ '/' . self::WB . 'ДЖОНК' . self::WB . '/u' => 'CÖNK',
+ '/' . self::WB . 'джонкю' . self::WB . '/u' => 'cönkü',
+ '/' . self::WB . 'Джонкю' . self::WB . '/u' => 'Cönkü',
+ '/' . self::WB . 'ДЖОНКЮ' . self::WB . '/u' => 'CÖNKÜ',
+
+ # prefix-only match: leading word boundary, no trailing boundary.
+ # Upper-case Cyrillic И corresponds to dotted İ (as in ЮРИП => YÜRİP),
+ # not dotless I, so the all-caps form must end in İ.
+ '/' . self::WB . 'куркчи/u' => 'kürkçi',
+ '/' . self::WB . 'Куркчи/u' => 'Kürkçi',
+ '/' . self::WB . 'КУРКЧИ/u' => 'KÜRKÇİ',
+
+ '/' . self::WB . 'устке' . self::WB . '/u' => 'üstke',
+ '/' . self::WB . 'Устке' . self::WB . '/u' => 'Üstke',
+ '/' . self::WB . 'УСТКЕ' . self::WB . '/u' => 'ÜSTKE',
+ '/' . self::WB . 'устте' . self::WB . '/u' => 'üstte',
+ '/' . self::WB . 'Устте' . self::WB . '/u' => 'Üstte',
+ '/' . self::WB . 'УСТТЕ' . self::WB . '/u' => 'ÜSTTE',
+ '/' . self::WB . 'усттен' . self::WB . '/u' => 'üstten',
+ '/' . self::WB . 'Усттен' . self::WB . '/u' => 'Üstten',
+ '/' . self::WB . 'УСТТЕН' . self::WB . '/u' => 'ÜSTTEN',
# отдельно стоящие Ё и Я
# stand-alone Ё and Я
- '/\bЯ\b/u' => 'Ya',
- '/\bЁ\b/u' => 'Yo',
+ '/' . self::WB . 'Я' . self::WB . '/u' => 'Ya',
+ '/' . self::WB . 'Ё' . self::WB . '/u' => 'Yo',
############################
# относятся к началу слова #
# word prefixes #
############################
- '/\bКъЮШн/u' => 'QYŞn',
- '/\bЮШн/u' => 'YŞn',
+ '/' . self::WB . 'КъЮШн/u' => 'QYŞn',
+ '/' . self::WB . 'ЮШн/u' => 'YŞn',
# need to convert digraphs (гъ, къ, нъ, дж) now to match patterns
'/гъ/u' => 'ğ',
'/Д[жЖ]/u' => 'C',
# о => ö
- '/\b(['.Crh::C_M_CONS.'])о(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => '$1ö$2$3$4',
- '/\bо(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => 'ö$1$2$3',
- '/\b(['.Crh::C_M_CONS.'])О(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' =>
- '$1Ö$2$3$4',
- '/\bО(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' => 'Ö$1$2$3',
-
- '/\b(['.Crh::C_M_CONS.'])о(['.Crh::C_CONS.'])([еиэюьü])/u' => '$1ö$2$3',
- '/\bо(['.Crh::C_CONS.'])([еиэюьü])/u' => 'ö$1$2',
- '/\b(['.Crh::C_M_CONS.'])О(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' => '$1Ö$2$3',
- '/\bО(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' => 'Ö$1$2',
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])о([' . Crh::C_CONS . '])([' . Crh::C_CONS .
+ '])([еиэюьü])/u' => '$1ö$2$3$4',
+ '/' . self::WB . 'о([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьü])/u' => 'ö$1$2$3',
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])О([' . Crh::C_CONS . '])([' . Crh::C_CONS .
+ '])([еиэюьüЕИЭЮЬÜ])/u' => '$1Ö$2$3$4',
+ '/' . self::WB . 'О([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьüЕИЭЮЬÜ])/u'
+ => 'Ö$1$2$3',
+
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])о([' . Crh::C_CONS . '])([еиэюьü])/u' => '$1ö$2$3',
+ '/' . self::WB . 'о([' . Crh::C_CONS . '])([еиэюьü])/u' => 'ö$1$2',
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])О([' . Crh::C_CONS . '])([еиэюьüЕИЭЮЬÜ])/u'
+ => '$1Ö$2$3',
+ '/' . self::WB . 'О([' . Crh::C_CONS . '])([еиэюьüЕИЭЮЬÜ])/u' => 'Ö$1$2',
# ё => yö
- '/\bё(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([ьеюü])/u' => 'yö$1$2$3',
- '/\bЁ(['.Crh::C_CONS_LC.'])(['.Crh::C_CONS_LC.'])([ьеюü])/u' => 'Yö$1$2$3',
- '/\bЁ(['.Crh::C_CONS_UC.'])(['.Crh::C_CONS_UC.'])([ЬЕЮÜ])/u' => 'YÖ$1$2$3',
- '/\bё(['.Crh::C_CONS.'])([ьеюü])/u' => 'yö$1$2',
- '/\bЁ(['.Crh::C_CONS_LC.'])([ьеюü])/u' => 'Yö$1$2',
- '/\bЁ(['.Crh::C_CONS_UC.'])([ЬЕЮÜ])/u' => 'YÖ$1$2',
+ '/' . self::WB . 'ё([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([ьеюü])/u' => 'yö$1$2$3',
+ '/' . self::WB . 'Ё([' . Crh::C_CONS_LC . '])([' . Crh::C_CONS_LC . '])([ьеюü])/u' => 'Yö$1$2$3',
+ '/' . self::WB . 'Ё([' . Crh::C_CONS_UC . '])([' . Crh::C_CONS_UC . '])([ЬЕЮÜ])/u' => 'YÖ$1$2$3',
+ '/' . self::WB . 'ё([' . Crh::C_CONS . '])([ьеюü])/u' => 'yö$1$2',
+ '/' . self::WB . 'Ё([' . Crh::C_CONS_LC . '])([ьеюü])/u' => 'Yö$1$2',
+ '/' . self::WB . 'Ё([' . Crh::C_CONS_UC . '])([ЬЕЮÜ])/u' => 'YÖ$1$2',
# у => ü, ую => üyü
- '/\b(['.Crh::C_M_CONS.'])у(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => '$1ü$2$3$4',
- '/\bу(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => 'ü$1$2$3',
- '/\bую(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => 'üyü$1$2$3',
- '/\b(['.Crh::C_M_CONS.'])У(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' =>
- '$1Ü$2$3$4',
- '/\bУ(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' => 'Ü$1$2$3',
- '/\bУю(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => 'Üyü$1$2$2',
- '/\bУЮ(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([еиэюьü])/u' => 'ÜYÜ$1$2$3',
-
- '/\b(['.Crh::C_M_CONS.'])у(['.Crh::C_CONS.'])([еиэюьü])/u' => '$1ü$2$3',
- '/\bу(['.Crh::C_CONS.'])([еиэюьü])/u' => 'ü$1$2',
- '/\bую(['.Crh::C_CONS.'])([еиэюьü])/u' => 'üyü$1$2',
- '/\b(['.Crh::C_M_CONS.'])У(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' => '$1Ü$2$3',
- '/\bУ(['.Crh::C_CONS.'])([еиэюьüЕИЭЮЬÜ])/u' => 'Ü$1$2',
- '/\bУю(['.Crh::C_CONS.'])([еиэюьü])/u' => 'Üyü$1$2',
- '/\bУЮ(['.Crh::C_CONS.'])([еиэюьü])/u' => 'ÜYÜ$1$2',
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])у([' . Crh::C_CONS . '])([' . Crh::C_CONS .
+ '])([еиэюьü])/u' => '$1ü$2$3$4',
+ '/' . self::WB . 'у([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьü])/u' => 'ü$1$2$3',
+ '/' . self::WB . 'ую([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьü])/u' => 'üyü$1$2$3',
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])У([' . Crh::C_CONS . '])([' . Crh::C_CONS .
+ '])([еиэюьüЕИЭЮЬÜ])/u' => '$1Ü$2$3$4',
+ '/' . self::WB . 'У([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьüЕИЭЮЬÜ])/u'
+ => 'Ü$1$2$3',
+ '/' . self::WB . 'Ую([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьü])/u' => 'Üyü$1$2$3',
+ '/' . self::WB . 'УЮ([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([еиэюьü])/u' => 'ÜYÜ$1$2$3',
+
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])у([' . Crh::C_CONS . '])([еиэюьü])/u' => '$1ü$2$3',
+ '/' . self::WB . 'у([' . Crh::C_CONS . '])([еиэюьü])/u' => 'ü$1$2',
+ '/' . self::WB . 'ую([' . Crh::C_CONS . '])([еиэюьü])/u' => 'üyü$1$2',
+ '/' . self::WB . '([' . Crh::C_M_CONS . '])У([' . Crh::C_CONS . '])([еиэюьüЕИЭЮЬÜ])/u'
+ => '$1Ü$2$3',
+ '/' . self::WB . 'У([' . Crh::C_CONS . '])([еиэюьüЕИЭЮЬÜ])/u' => 'Ü$1$2',
+ '/' . self::WB . 'Ую([' . Crh::C_CONS . '])([еиэюьü])/u' => 'Üyü$1$2',
+ '/' . self::WB . 'УЮ([' . Crh::C_CONS . '])([еиэюьü])/u' => 'ÜYÜ$1$2',
# ю => yü
- '/\b([аыоуеиёюАЫОУЕИЁЮ]?)ю(['.Crh::C_CONS.'])(['.Crh::C_CONS.'])([ьеюü])/u' => '$1yü$2$3$4',
- '/\b([АЫОУЕИЁЮ]?)Ю(['.Crh::C_CONS_LC.'])(['.Crh::C_CONS_LC.'])([ьеюü])/u' => '$1Yü$2$3$4',
- '/\b([АЫОУЕИЁЮ]?)Ю(['.Crh::C_CONS_UC.'])(['.Crh::C_CONS_UC.'])([ЬЕЮÜ])/u' => '$1YÜ$2$3$4',
- '/\b([аыоуеиёюАЫОУЕИЁЮ]?)ю(['.Crh::C_CONS.'])([ьеюü])/u' => '$1yü$2$3',
- '/\b([АЫОУЕИЁЮ]?)Ю(['.Crh::C_CONS_LC.'])([ьеюü])/u' => '$1Yü$2$3',
- '/\b([АЫОУЕИЁЮ]?)Ю(['.Crh::C_CONS_UC.'])([ЬЕЮÜ])/u' => '$1YÜ$2$3',
+ '/' . self::WB . '([аыоуеиёюАЫОУЕИЁЮ]?)ю([' . Crh::C_CONS . '])([' . Crh::C_CONS . '])([ьеюü])/u'
+ => '$1yü$2$3$4',
+ '/' . self::WB . '([АЫОУЕИЁЮ]?)Ю([' . Crh::C_CONS_LC . '])([' . Crh::C_CONS_LC . '])([ьеюü])/u'
+ => '$1Yü$2$3$4',
+ '/' . self::WB . '([АЫОУЕИЁЮ]?)Ю([' . Crh::C_CONS_UC . '])([' . Crh::C_CONS_UC . '])([ЬЕЮÜ])/u'
+ => '$1YÜ$2$3$4',
+ '/' . self::WB . '([аыоуеиёюАЫОУЕИЁЮ]?)ю([' . Crh::C_CONS . '])([ьеюü])/u' => '$1yü$2$3',
+ '/' . self::WB . '([АЫОУЕИЁЮ]?)Ю([' . Crh::C_CONS_LC . '])([ьеюü])/u' => '$1Yü$2$3',
+ '/' . self::WB . '([АЫОУЕИЁЮ]?)Ю([' . Crh::C_CONS_UC . '])([ЬЕЮÜ])/u' => '$1YÜ$2$3',
# e => ye, я => ya
- '/\bе/u' => 'ye',
- '/\bЕ(['.Crh::C_LC.'cğñqöü])/u' => 'Ye$1',
- '/\bЕ(['.Crh::C_UC.'CĞÑQÖÜ])/u' => 'YE$1',
- '/\bя/u' => 'ya',
- '/\bЯ(['.Crh::C_LC.'cğñqöü])/u' => 'Ya$1',
- '/\bЯ(['.Crh::C_UC.'CĞÑQÖÜ])/u' => 'YA$1',
+ '/' . self::WB . 'е/u' => 'ye',
+ '/' . self::WB . 'Е([' . Crh::C_LC . 'cğñqöü])/u' => 'Ye$1',
+ '/' . self::WB . 'Е([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => 'YE$1',
+ '/' . self::WB . 'я/u' => 'ya',
+ '/' . self::WB . 'Я([' . Crh::C_LC . 'cğñqöü])/u' => 'Ya$1',
+ '/' . self::WB . 'Я([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => 'YA$1',
'/([аеёиоуыэюяйьъaeöüАЕЁИОУЫЭЮЯЙЬЪAEÖÜ])е/u' => '$1ye',
- '/([аеёиоуыэюяйьъaeöüАЕЁИОУЫЭЮЯЙЬЪAEÖÜ])Е(['.Crh::C_LC.'cğñqöü])/u' => '$1Ye$2',
- '/([аеёиоуыэюяйьъaeöüАЕЁИОУЫЭЮЯЙЬЪAEÖÜ])Е(['.Crh::C_UC.'CĞÑQÖÜ])/u' => '$1YE$2',
+ '/([аеёиоуыэюяйьъaeöüАЕЁИОУЫЭЮЯЙЬЪAEÖÜ])Е([' . Crh::C_LC . 'cğñqöü])/u' => '$1Ye$2',
+ '/([аеёиоуыэюяйьъaeöüАЕЁИОУЫЭЮЯЙЬЪAEÖÜ])Е([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => '$1YE$2',
'/([аеёиоуыэюяйьъaeöüğqАЕЁИОУЫЭЮЯЙЬЪAEÖÜĞQ])я/u' => '$1ya',
- '/([аеёиоуыэюяйьъaeöüğqАЕЁИОУЫЭЮЯЙЬЪAEÖÜĞQ])Я(['.Crh::C_LC.'cğñqöü])/u' => '$1Ya$2',
- '/([аеёиоуыэюяйьъaeöüğqАЕЁИОУЫЭЮЯЙЬЪAEÖÜĞQ])Я(['.Crh::C_UC.'CĞÑQÖÜ])/u' => '$1YA$2',
+ '/([аеёиоуыэюяйьъaeöüğqАЕЁИОУЫЭЮЯЙЬЪAEÖÜĞQ])Я([' . Crh::C_LC . 'cğñqöü])/u' => '$1Ya$2',
+ '/([аеёиоуыэюяйьъaeöüğqАЕЁИОУЫЭЮЯЙЬЪAEÖÜĞQ])Я([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => '$1YA$2',
###############################
# не зависят от места в слове #
# Ö, Ü 1-й заход: ё, ю после согласных > ö, ü
# Ö, Ü 1st instance: ё, ю after consonants > ö, ü
- '/(['.Crh::C_CONS.'])ю/u' => '$1ü',
- '/(['.Crh::C_CONS.'])Ю/u' => '$1Ü',
- '/(['.Crh::C_CONS.'])ё/u' => '$1ö',
- '/(['.Crh::C_CONS.'])Ё/u' => '$1Ö',
+ '/([' . Crh::C_CONS . '])ю/u' => '$1ü',
+ '/([' . Crh::C_CONS . '])Ю/u' => '$1Ü',
+ '/([' . Crh::C_CONS . '])ё/u' => '$1ö',
+ '/([' . Crh::C_CONS . '])Ё/u' => '$1Ö',
# остальные вхождения о, у, ё, ю
# other occurrences of о, у, ё, ю
- '/Ё(['.Crh::C_UC.'CĞÑQÖÜ])/u' => 'YO$2',
- '/Ю(['.Crh::C_UC.'CĞÑQÖÜ])/u' => 'YU$2',
+ '/Ё([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => 'YO$1',
+ '/Ю([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => 'YU$1',
# Ц & Щ
- '/Ц(['.Crh::C_UC.'CĞÑQÖÜ])/u' => 'TS$2',
- '/Щ(['.Crh::C_UC.'CĞÑQÖÜ])/u' => 'ŞÇ$2',
+ '/Ц([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => 'TS$1',
+ '/Щ([' . Crh::C_UC . 'CĞÑQÖÜ])/u' => 'ŞÇ$1',
];
$this->Latn2CyrlRegexes = [
// TODO: refactor upper/lower/first capital whole words without
// regexes into simpler list
- '/\ban\b/u' => 'ань',
- '/\bAn\b/u' => 'Ань',
- '/\bAN\b/u' => 'АНЬ',
- '/\bange\b/u' => 'аньге',
- '/\bAnge\b/u' => 'Аньге',
- '/\bANGE\b/u' => 'АНЬГЕ',
- '/\bande\b/u' => 'аньде',
- '/\bAnde\b/u' => 'Аньде',
- '/\bANDE\b/u' => 'АНЬДЕ',
- '/\banki\b/u' => 'аньки',
- '/\bAnki\b/u' => 'Аньки',
- '/\bANKİ\b/u' => 'АНЬКИ',
- '/\bderal\b/u' => 'деръал',
- '/\bDeral\b/u' => 'Деръал',
- '/\bDERAL\b/u' => 'ДЕРЪАЛ',
- '/\bkör\b/u' => 'кёр',
- '/\bKör\b/u' => 'Кёр',
- '/\bKÖR\b/u' => 'КЁР',
- '/\bmer\b/u' => 'мэр',
- '/\bMer\b/u' => 'Мэр',
- '/\bMER\b/u' => 'МЭР',
-
- '/\bджонк/u' => 'cönk',
- '/\bДжонк/u' => 'Cönk',
- '/\bДЖОНК/u' => 'CÖNK',
-
- '/\bкуркчи/u' => 'kürkçi',
- '/\bКуркчи/u' => 'Kürkçi',
- '/\bКУРКЧИ/u' => 'KÜRKÇI',
+ '/' . self::WB . 'an' . self::WB . '/u' => 'ань',
+ '/' . self::WB . 'An' . self::WB . '/u' => 'Ань',
+ '/' . self::WB . 'AN' . self::WB . '/u' => 'АНЬ',
+ '/' . self::WB . 'ange' . self::WB . '/u' => 'аньге',
+ '/' . self::WB . 'Ange' . self::WB . '/u' => 'Аньге',
+ '/' . self::WB . 'ANGE' . self::WB . '/u' => 'АНЬГЕ',
+ '/' . self::WB . 'ande' . self::WB . '/u' => 'аньде',
+ '/' . self::WB . 'Ande' . self::WB . '/u' => 'Аньде',
+ '/' . self::WB . 'ANDE' . self::WB . '/u' => 'АНЬДЕ',
+ '/' . self::WB . 'anki' . self::WB . '/u' => 'аньки',
+ '/' . self::WB . 'Anki' . self::WB . '/u' => 'Аньки',
+ '/' . self::WB . 'ANKİ' . self::WB . '/u' => 'АНЬКИ',
+ '/' . self::WB . 'deral' . self::WB . '/u' => 'деръал',
+ '/' . self::WB . 'Deral' . self::WB . '/u' => 'Деръал',
+ '/' . self::WB . 'DERAL' . self::WB . '/u' => 'ДЕРЪАЛ',
+ '/' . self::WB . 'kör' . self::WB . '/u' => 'кёр',
+ '/' . self::WB . 'Kör' . self::WB . '/u' => 'Кёр',
+ '/' . self::WB . 'KÖR' . self::WB . '/u' => 'КЁР',
+ '/' . self::WB . 'mer' . self::WB . '/u' => 'мэр',
+ '/' . self::WB . 'Mer' . self::WB . '/u' => 'Мэр',
+ '/' . self::WB . 'MER' . self::WB . '/u' => 'МЭР',
+
+ '/' . self::WB . 'cönk/u' => 'джонк',
+ '/' . self::WB . 'Cönk/u' => 'Джонк',
+ '/' . self::WB . 'CÖNK/u' => 'ДЖОНК',
+
+ # (y)etsin -> етсин/этсин
+ # note that target starts with CYRILLIC е/Е!
+ '/yetsin/u' => 'етсин',
+ '/Yetsin/u' => 'Етсин',
+ '/YETSİN/u' => 'ЕТСИН',
+
+ # note that target starts with LATIN e/E!
+ # (other transformations will determine CYRILLIC е/э as needed)
+ '/etsin/u' => 'eтсин',
+ '/Etsin/u' => 'Eтсин',
+ '/ETSİN/u' => 'EТСИН',
# буква Ё - первый заход
# расставляем Ь после согласных
- '/\b([yY])ö(['.Crh::L_N_CONS.'])([aAuU'.Crh::L_CONS.']|\b)/u' => '$1ö$2ь$3',
- '/\b([yY])Ö(['.Crh::L_N_CONS.'])([aAuU'.Crh::L_CONS.']|\b)/u' => '$1Ö$2Ь$3',
- '/\bAQŞ([^AEI]|\b)/u' => 'АКъШ$1',
+ '/' . self::WB . '([yY])ö([' . Crh::L_N_CONS . '])([aAuU' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => '$1ö$2ь$3',
+ '/' . self::WB . '([yY])Ö([' . Crh::L_N_CONS . '])([aAuU' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => '$1Ö$2Ь$3',
+ '/' . self::WB . 'AQŞ([^AEI]|' . self::WB . ')/u' => 'АКъШ$1',
# буква Ю - первый заход
# расставляем Ь после согласных
- '/\b([yY])ü(['.Crh::L_N_CONS.'])([aAuU'.Crh::L_CONS.']|\b)/u' => '$1ü$2ь$3',
- '/\b([yY])Ü(['.Crh::L_N_CONS.'])([aAuU'.Crh::L_CONS.']|\b)/u' => '$1Ü$2Ь$3',
-
- '/\b([bcgkpşBCGKPŞ])ö(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => '$1ö$2ь$3',
- '/\b([bcgkpşBCGKPŞ])Ö(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => '$1Ö$2Ь$3',
- '/\b([bcgkpşBCGKPŞ])Ö(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => '$1Ö$2Ь$3',
- '/\b([bcgkpşBCGKPŞ])ü(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => '$1ü$2ь$3',
- '/\b([bcgkpşBCGKPŞ])Ü(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => '$1Ü$2Ь$3',
- '/\b([bcgkpşBCGKPŞ])Ü(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => '$1Ü$2Ь$3',
+ '/' . self::WB . '([yY])ü([' . Crh::L_N_CONS . '])([aAuU' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => '$1ü$2ь$3',
+ '/' . self::WB . '([yY])Ü([' . Crh::L_N_CONS . '])([aAuU' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => '$1Ü$2Ь$3',
+
+ # Word-initial [bcgkpş] + ö/ü + consonant: insert a soft sign after the
+ # consonant when it is followed by another consonant or a word boundary.
+ # NOTE(review): the Ö and Ü rules were each listed twice with identical
+ # pattern and replacement; identical array keys collapse to one entry, so
+ # the repeats were dead and have been dropped. If a distinct variant was
+ # intended (e.g. a lower-case ь form for mixed case), it still needs adding.
+ '/' . self::WB . '([bcgkpşBCGKPŞ])ö([' . Crh::L_N_CONS . '])([' . Crh::L_CONS . ']|' .
+ self::WB . ')/u' => '$1ö$2ь$3',
+ '/' . self::WB . '([bcgkpşBCGKPŞ])Ö([' . Crh::L_N_CONS . '])([' . Crh::L_CONS . ']|' .
+ self::WB . ')/u' => '$1Ö$2Ь$3',
+ '/' . self::WB . '([bcgkpşBCGKPŞ])ü([' . Crh::L_N_CONS . '])([' . Crh::L_CONS . ']|' .
+ self::WB . ')/u' => '$1ü$2ь$3',
+ '/' . self::WB . '([bcgkpşBCGKPŞ])Ü([' . Crh::L_N_CONS . '])([' . Crh::L_CONS . ']|' .
+ self::WB . ')/u' => '$1Ü$2Ь$3',
# ö и ü в начале слова
# случаи, когда нужен Ь
- '/\bö(['.Crh::L_N_CONS.'pP])(['.Crh::L_CONS.']|\b)/u' => 'ö$1ь$2',
- '/\bÖ(['.Crh::L_N_CONS_LC.'p])(['.Crh::L_CONS.']|\b)/u' => 'Ö$1ь$2',
- '/\bÖ(['.Crh::L_N_CONS_UC.'P])(['.Crh::L_CONS.']|\b)/u' => 'Ö$1Ь$2',
- '/\bü(['.Crh::L_N_CONS.'])(['.Crh::L_CONS.']|\b)/u' => 'ü$1ь$2',
- '/\bÜ(['.Crh::L_N_CONS_LC.'])(['.Crh::L_CONS.']|\b)/u' => 'Ü$1ь$2',
- '/\bÜ(['.Crh::L_N_CONS_UC.'])(['.Crh::L_CONS.']|\b)/u' => 'Ü$1Ь$2',
-
- '/ts\b/u' => 'ц',
- '/şç\b/u' => 'щ',
- '/Ş[çÇ]\b/u' => 'Щ',
- '/T[sS]\b/u' => 'Ц',
+ '/' . self::WB . 'ö([' . Crh::L_N_CONS . 'pP])([' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => 'ö$1ь$2',
+ '/' . self::WB . 'Ö([' . Crh::L_N_CONS_LC . 'p])([' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => 'Ö$1ь$2',
+ '/' . self::WB . 'Ö([' . Crh::L_N_CONS_UC . 'P])([' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => 'Ö$1Ь$2',
+ '/' . self::WB . 'ü([' . Crh::L_N_CONS . '])([' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => 'ü$1ь$2',
+ '/' . self::WB . 'Ü([' . Crh::L_N_CONS_LC . '])([' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => 'Ü$1ь$2',
+ '/' . self::WB . 'Ü([' . Crh::L_N_CONS_UC . '])([' . Crh::L_CONS . ']|' . self::WB . ')/u'
+ => 'Ü$1Ь$2',
+
+ '/ts' . self::WB . '/u' => 'ц',
+ '/şç' . self::WB . '/u' => 'щ',
+ '/Ş[çÇ]' . self::WB . '/u' => 'Щ',
+ '/T[sS]' . self::WB . '/u' => 'Ц',
# Ь после Л
# add Ь after Л
- '/(['.Crh::L_F.'])l(['.Crh::L_CONS_LC.']|\b)/u' => '$1ль$2',
- '/(['.Crh::L_F_UC.'])L(['.Crh::L_CONS.']|\b)/u' => '$1ЛЬ$2',
-
- '/etsin\b/u' => 'етсин',
- '/Etsin\b/u' => 'Етсин',
- '/ETSİN\b/u' => 'ЕТСИН',
+ '/([' . Crh::L_F . '])l([' . Crh::L_CONS_LC . ']|' . self::WB . ')/u' => '$1ль$2',
+ '/([' . Crh::L_F_UC . '])L([' . Crh::L_CONS . ']|' . self::WB . ')/u' => '$1ЛЬ$2',
# относятся к началу слова
- '/\bts/u' => 'ц',
- '/\bT[sS]/u' => 'Ц',
+ '/' . self::WB . 'ts/u' => 'ц',
+ '/' . self::WB . 'T[sS]/u' => 'Ц',
- '/\bşç/u' => 'щ',
- '/\bŞ[çÇ]/u' => 'Щ',
+ '/' . self::WB . 'şç/u' => 'щ',
+ '/' . self::WB . 'Ş[çÇ]/u' => 'Щ',
# Э
- '/(\b|['.Crh::L_VOW.'аеэяАЕЭЯ])e/u' => '$1э',
- '/(\b|['.Crh::L_VOW_UC.'АЕЭЯ])E/u' => '$1Э',
+ '/(' . self::WB . '|[' . Crh::L_VOW . 'аеэяАЕЭЯ])e/u' => '$1э',
+ '/(' . self::WB . '|[' . Crh::L_VOW_UC . 'АЕЭЯ])E/u' => '$1Э',
- '/\b(['.Crh::L_M_CONS.'])ö/u' => '$1о',
- '/\b(['.Crh::L_M_CONS.'])Ö/u' => '$1О',
- '/\b(['.Crh::L_M_CONS.'])ü/u' => '$1у',
- '/\b(['.Crh::L_M_CONS.'])Ü/u' => '$1У',
+ '/' . self::WB . '([' . Crh::L_M_CONS . '])ö/u' => '$1о',
+ '/' . self::WB . '([' . Crh::L_M_CONS . '])Ö/u' => '$1О',
+ '/' . self::WB . '([' . Crh::L_M_CONS . '])ü/u' => '$1у',
+ '/' . self::WB . '([' . Crh::L_M_CONS . '])Ü/u' => '$1У',
- '/\bö/u' => 'о',
- '/\bÖ/u' => 'О',
- '/\bü/u' => 'у',
- '/\bÜ/u' => 'У',
+ '/' . self::WB . 'ö/u' => 'о',
+ '/' . self::WB . 'Ö/u' => 'О',
+ '/' . self::WB . 'ü/u' => 'у',
+ '/' . self::WB . 'Ü/u' => 'У',
# некоторые исключения
# some exceptions
'/KÖZ([^EÜ])/u' => 'КОЗЬ$1',
# Punctuation
- '/#|No\./' => '№',
+ '/#|No\./u' => '№',
# некоторые случаи употребления Ц
'/tsi([^zñ])/u' => 'ци$1',
'/T[sS][iİ]([^zZñÑ])/u' => 'ЦИ$1',
'/ts([ou])/u' => 'ц$1',
'/T[sS]([oOuU])/u' => 'Ц$1',
- '/ts(['.Crh::L_CONS.'])/u' => 'ц$1',
- '/T[sS](['.Crh::L_CONS.'])/u' => 'Ц$1',
- '/(['.Crh::L_CONS.'])ts/u' => '$1ц',
- '/(['.Crh::L_CONS.'])T[sS]/u' => '$1Ц',
+ '/ts([' . Crh::L_CONS . '])/u' => 'ц$1',
+ '/T[sS]([' . Crh::L_CONS . '])/u' => 'Ц$1',
+ '/([' . Crh::L_CONS . '])ts/u' => '$1ц',
+ '/([' . Crh::L_CONS . '])T[sS]/u' => '$1Ц',
'/tsиал/u' => 'циал',
'/TSИАЛ/u' => 'ЦИАЛ',
'/[ьЬ]([iİ])/u' => '$1',
# ya & ye
- '/(['.Crh::L_CONS.'])ya/u' => '$1ья',
- '/(['.Crh::L_CONS.'])Y[aA]/u' => '$1ЬЯ',
- '/(['.Crh::L_CONS.'])ye/u' => '$1ье',
- '/(['.Crh::L_CONS.'])Y[eE]/u' => '$1ЬЕ',
+ '/([' . Crh::L_CONS . '])ya/u' => '$1ья',
+ '/([' . Crh::L_CONS . '])Y[aA]/u' => '$1ЬЯ',
+ '/([' . Crh::L_CONS . '])ye/u' => '$1ье',
+ '/([' . Crh::L_CONS . '])Y[eE]/u' => '$1ЬЕ',
# расставляем Ь перед Ё
# place Ь in front of Ё
- '/(['.Crh::L_CONS.'])y[oö]/u' => '$1ьё',
- '/(['.Crh::L_CONS.'])Y[oOöÖ]/u' => '$1ЬЁ',
+ '/([' . Crh::L_CONS . '])y[oö]/u' => '$1ьё',
+ '/([' . Crh::L_CONS . '])Y[oOöÖ]/u' => '$1ЬЁ',
# оставшиеся вхождения yo и yö
# remaining occurrences of yo and yö
'/y[oö]/u' => 'ё',
# расставляем Ь перед Ю
# place Ь in front of Ю
- '/(['.Crh::L_CONS.'])y[uü]/u' => '$1ью',
- '/(['.Crh::L_CONS.'])Y[uUüÜ]/u' => '$1ЬЮ',
+ '/([' . Crh::L_CONS . '])y[uü]/u' => '$1ью',
+ '/([' . Crh::L_CONS . '])Y[uUüÜ]/u' => '$1ЬЮ',
# оставшиеся вхождения yu и yü
# remaining occurrences of yu and yü
'/y[uü]/u' => 'ю',
'/[ьЬ]([aA])/u' => '$1',
# дж
- '/C(['.Crh::L_UC.Crh::C_UC.'АЕЁЙОУЭЮЯ])/u' => 'ДЖ$1',
- '/(['.Crh::L_UC.Crh::C_UC.'АЕЁЙОУЭЮЯ])C/u' => '$1ДЖ',
+ '/C([' . Crh::L_UC . Crh::C_UC . 'АЕЁЙОУЭЮЯ])/u' => 'ДЖ$1',
+ '/([' . Crh::L_UC . Crh::C_UC . 'АЕЁЙОУЭЮЯ])C/u' => '$1ДЖ',
# гъ, къ, нъ
- '/Ğ(['.Crh::L_UC.Crh::C_UC.'])/u' => 'ГЪ$1',
- '/(['.Crh::L_UC.Crh::C_UC.'Ъ])Ğ/u' => '$1ГЪ',
+ '/Ğ([' . Crh::L_UC . Crh::C_UC . '])/u' => 'ГЪ$1',
+ '/([' . Crh::L_UC . Crh::C_UC . 'Ъ])Ğ/u' => '$1ГЪ',
- '/Q(['.Crh::L_UC.Crh::C_UC.'])/u' => 'КЪ$1',
- '/(['.Crh::L_UC.Crh::C_UC.'Ъ])Q/u' => '$1КЪ',
+ '/Q([' . Crh::L_UC . Crh::C_UC . '])/u' => 'КЪ$1',
+ '/([' . Crh::L_UC . Crh::C_UC . 'Ъ])Q/u' => '$1КЪ',
- '/Ñ(['.Crh::L_UC.Crh::C_UC.'])/u' => 'НЪ$1',
- '/(['.Crh::L_UC.Crh::C_UC.'Ъ])Ñ/u' => '$1НЪ',
+ '/Ñ([' . Crh::L_UC . Crh::C_UC . '])/u' => 'НЪ$1',
+ '/([' . Crh::L_UC . Crh::C_UC . 'Ъ])Ñ/u' => '$1НЪ',
];
}