Normalize Unicode input to normalization form C. Most of the time input
author: Brion Vibber <brion@users.mediawiki.org>
Thu, 2 Sep 2004 07:50:04 +0000 (07:50 +0000)
committer: Brion Vibber <brion@users.mediawiki.org>
Thu, 2 Sep 2004 07:50:04 +0000 (07:50 +0000)
is already in this form, and it shouldn't take very long to verify it.
There is still optimization to be done, though.

Partial fix for http://bugzilla.wikipedia.org/show_bug.cgi?id=240

Will also need to verify that input consists of well-formed UTF-8 sequences
and strip characters that are illegal in XML.

Some input may not be going through this verification yet (e.g. uploaded filenames).

includes/WebRequest.php

index a37c257..04742bb 100644 (file)
 # http://www.gnu.org/copyleft/gpl.html
 
 # Hypothetically, we could use a WebRequest object to fake a
-# self-contained request.
-
-## Enable this to debug total elimination of register_globals
+# self-contained request (FauxRequest).
 
 class WebRequest {
        function WebRequest() {
                $this->checkMagicQuotes();
+               global $wgUseLatin1;
+               if( !$wgUseLatin1 ) {
+                       $this->normalizeUnicode();
+               }
        }
 
        function &fix_magic_quotes( &$arr ) {
@@ -51,6 +53,17 @@ class WebRequest {
                }
        }
        
+       function normalizeUnicode() {
+               wfProfileIn( 'WebRequest:normalizeUnicode-include' );
+               require_once( 'normal/UtfNormal.php' );
+               wfProfileOut( 'WebRequest:normalizeUnicode-include' );
+               wfProfileIn( 'WebRequest:normalizeUnicode-fix' );
+               foreach( $_REQUEST as $key => $val ) {
+                       $_REQUEST[$key] = UtfNormal::toNFC( $val );
+               }
+               wfProfileOut( 'WebRequest:normalizeUnicode-fix' );
+       }
+       
        function getGPCVal( &$arr, $name, $default ) {
                if( isset( $arr[$name] ) ) {
                        return $arr[$name];