<?php
-#$Id$
+/**
+ * @package MediaWiki
+ * @subpackage Language
+ */
+
if( defined( "MEDIAWIKI" ) ) {
+# This file and LanguageLatin1.php may be included from within functions, so
+# we need to have global statements
+
+global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
+global $wgDBname, $wgMemc;
+
$wgInputEncoding = "UTF-8";
$wgOutputEncoding = "UTF-8";
-if (function_exists('mb_internal_encoding')) {
+if( function_exists( 'mb_strtoupper' ) ) {
mb_internal_encoding('UTF-8');
} else {
# Hack our own case conversion routines
-
+
# Loading serialized arrays is faster than parsing code :P
$wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
$wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
-
+
if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
require_once( "includes/Utf8Case.php" );
$wgMemc->set( $key1, $wikiUpperChars );
}
}
-# Base stuff useful to all UTF-8 based language files
+/**
+ * Base stuff useful to all UTF-8 based language files
+ * @package MediaWiki
+ */
class LanguageUtf8 extends Language {
- # These two functions use mbstring library, if it is loaded
- # or compiled and character mapping arrays otherwise.
+ # These functions use the mbstring library if it is loaded or compiled in,
+ # and fall back to the character mapping arrays otherwise.
# Language-specific character mismatches should be dealt with
# in the Language classes.
- function ucfirst( $string ) {
- if (function_exists('mb_strtoupper')) {
- return mb_strtoupper(mb_substr($string,0,1)).mb_substr($string,1);
- } else {
- global $wikiUpperChars;
- return preg_replace (
- "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "strtr ( \"\$1\" , \$wikiUpperChars )",
- $string );
- }
+ /**
+  * Uppercase the first character of a string.
+  * Thin wrapper: delegates to LanguageUtf8::uc() with $first set to true.
+  *
+  * @param $str text to convert
+  * @return the value returned by LanguageUtf8::uc()
+  */
+ function ucfirst( $str ) {
+ return LanguageUtf8::uc( $str, true );
}
-
- function lcfirst( $string ) {
- if (function_exists('mb_strtolower')) {
- return mb_strtolower(mb_substr($string,0,1)).mb_substr($string,1);
- } else {
- global $wikiLowerChars;
- return preg_replace (
- "/^([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "strtr ( \"\$1\" , \$wikiLowerChars )",
- $string );
- }
+
+	/**
+	 * Uppercase a string, or only its first character.
+	 *
+	 * Strings that LanguageUtf8::isMultibyte() reports as plain single-byte
+	 * go through PHP's byte-wise ucfirst()/strtoupper(). Everything else is
+	 * handled by mbstring when mb_strtoupper() exists, and otherwise by a
+	 * regex that maps each matched character through $wikiUpperChars.
+	 *
+	 * @param $str   text to convert
+	 * @param $first when true only the leading character is converted
+	 * @return the converted text
+	 */
+	function uc( $str, $first = false ) {
+		# The single-byte case is the same with or without mbstring.
+		if ( !LanguageUtf8::isMultibyte( $str ) ) {
+			if ( $first ) {
+				return ucfirst( $str );
+			}
+			return strtoupper( $str );
+		}
+
+		if ( function_exists( 'mb_strtoupper' ) ) {
+			if ( !$first ) {
+				return mb_strtoupper( $str );
+			}
+			$head = mb_substr( $str, 0, 1 );
+			$tail = mb_substr( $str, 1 );
+			return mb_strtoupper( $head ) . $tail;
+		}
+
+		# No mbstring: the /e replacement code is evaluated in this scope,
+		# so the mapping table has to be visible here.
+		global $wikiUpperChars;
+		$anchor = $first ? '^' : '';
+		$pattern = "/{$anchor}([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e";
+		return preg_replace( $pattern, "strtr( \"\$1\" , \$wikiUpperChars )", $str );
+	}
+
+ /**
+  * Lowercase the first character of a string.
+  * Thin wrapper: delegates to LanguageUtf8::lc() with $first set to true.
+  *
+  * @param $str text to convert
+  * @return the value returned by LanguageUtf8::lc()
+  */
+ function lcfirst( $str ) {
+ return LanguageUtf8::lc( $str, true );
+ }
+
+	/**
+	 * Lowercase a string, or only its first character.
+	 *
+	 * Strings that LanguageUtf8::isMultibyte() reports as plain single-byte
+	 * go through PHP's byte-wise strtolower(). Everything else is handled by
+	 * mbstring when mb_strtolower() exists, and otherwise by a regex that
+	 * maps each matched character through $wikiLowerChars.
+	 *
+	 * @param $str   text to convert
+	 * @param $first when true only the leading character is converted
+	 * @return the converted text
+	 */
+	function lc( $str, $first = false ) {
+		# The single-byte case is the same with or without mbstring.
+		if ( !LanguageUtf8::isMultibyte( $str ) ) {
+			if ( $first ) {
+				return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
+			}
+			return strtolower( $str );
+		}
+
+		if ( function_exists( 'mb_strtolower' ) ) {
+			if ( !$first ) {
+				return mb_strtolower( $str );
+			}
+			$head = mb_substr( $str, 0, 1 );
+			$tail = mb_substr( $str, 1 );
+			return mb_strtolower( $head ) . $tail;
+		}
+
+		# No mbstring: the /e replacement code is evaluated in this scope,
+		# so the mapping table has to be visible here.
+		global $wikiLowerChars;
+		$anchor = $first ? '^' : '';
+		$pattern = "/{$anchor}([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e";
+		return preg_replace( $pattern, "strtr( \"\$1\" , \$wikiLowerChars )", $str );
+	}
+
+	/**
+	 * Tell whether a string contains at least one byte outside the ASCII
+	 * range (0x80-0xff), i.e. whether it needs multibyte-aware handling.
+	 *
+	 * The pattern is deliberately not anchored: uc() and lc() use this
+	 * result to choose between byte-wise and multibyte case conversion of
+	 * the whole string, so a high byte anywhere in the string matters, not
+	 * just one in the first position.
+	 *
+	 * @param $str text to inspect
+	 * @return bool true if any byte of $str is in 0x80-0xff
+	 */
+	function isMultibyte( $str ) {
+		return (bool)preg_match( '/[\x80-\xff]/', $str );
}
+ /**
+  * Rewrite a string so that every non-ASCII sequence is replaced by the
+  * literal 'U8' followed by the hex dump of its (lowercased) bytes.
+  *
+  * With mbstring the whole string is lowercased by mb_strtolower() before
+  * the replacement. Without it only the matched high-byte sequences are
+  * passed through $wikiLowerChars; bytes outside those sequences are left
+  * as they are in this branch.
+  *
+  * The call is bracketed by wfProfileIn()/wfProfileOut().
+  *
+  * @param $string text to transform
+  * @return the transformed text
+  */
function stripForSearch( $string ) {
# all strtolower on stripped output or argument
# should be removed and all stripForSearch
# methods adjusted to that.
+
+ wfProfileIn( "LanguageUtf8::stripForSearch" );
if( function_exists( 'mb_strtolower' ) ) {
- return preg_replace(
+ $out = preg_replace(
"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
"'U8' . bin2hex( \"$1\" )",
mb_strtolower( $string ) );
} else {
global $wikiLowerChars;
- return preg_replace(
+ $out = preg_replace(
"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
"'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
$string );
}
+ wfProfileOut( "LanguageUtf8::stripForSearch" );
+ return $out;
}
function fallback8bitEncoding() {
function checkTitleEncoding( $s ) {
global $wgInputEncoding;
+ if( is_array( $s ) ) {
+ wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
+ }
# Check for non-UTF-8 URLs
$ishigh = preg_match( '/[\x80-\xff]/', $s);
if(!$ishigh) return $s;
-
+
$isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
if( $isutf8 ) return $s;
+ /**
+  * Return the first character of a string.
+  *
+  * The pattern accepts, at the start of $s, either one ASCII byte or a
+  * lead byte followed by the matching number of continuation bytes
+  * (two-, three- or four-byte sequences).
+  *
+  * @param $s text to inspect
+  * @return the matched leading character, or "" when nothing matches
+  *         (e.g. an empty string or a malformed leading sequence)
+  */
function firstChar( $s ) {
preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
-
+
return isset( $matches[1] ) ? $matches[1] : "";
}