includes/normal/UtfNormalUtil.php

   1 <?php
   2 /**
   3  * Some of these functions are adapted from places in MediaWiki.
   4  * Should probably merge them for consistency.
   5  *
   6  * Copyright © 2004 Brion Vibber <brion@pobox.com>
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @file
  25  * @ingroup UtfNormal
  26  */
  27
  28 /**
  29  * Return UTF-8 sequence for a given Unicode code point.
  30  * May die if fed out of range data.
  31  *
  32  * @param $codepoint Integer:
  33  * @return String
  34  * @public
  35  */
  36 function codepointToUtf8( $codepoint ) {
  37         if($codepoint <         0x80) return chr($codepoint);
  38         if($codepoint <    0x800) return chr($codepoint >>      6 & 0x3f | 0xc0) .
  39                                                                          chr($codepoint           & 0x3f | 0x80);
  40         if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
  41                                                                          chr($codepoint >>      6 & 0x3f | 0x80) .
  42                                                                          chr($codepoint           & 0x3f | 0x80);
  43         if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
  44                                                                          chr($codepoint >> 12 & 0x3f | 0x80) .
  45                                                                          chr($codepoint >>      6 & 0x3f | 0x80) .
  46                                                                          chr($codepoint           & 0x3f | 0x80);
  47
  48         echo "Asked for code outside of range ($codepoint)\n";
  49         die( -1 );
  50 }
  51
  52 /**
  53  * Take a series of space-separated hexadecimal numbers representing
  54  * Unicode code points and return a UTF-8 string composed of those
  55  * characters. Used by UTF-8 data generation and testing routines.
  56  *
  57  * @param $sequence String
  58  * @return String
  59  * @private
  60  */
  61 function hexSequenceToUtf8( $sequence ) {
  62         $utf = '';
  63         foreach( explode( ' ', $sequence ) as $hex ) {
  64                 $n = hexdec( $hex );
  65                 $utf .= codepointToUtf8( $n );
  66         }
  67         return $utf;
  68 }
  69
  70 /**
  71  * Take a UTF-8 string and return a space-separated series of hex
  72  * numbers representing Unicode code points. For debugging.
  73  *
  74  * @param $str String: UTF-8 string.
  75  * @return string
  76  * @private
  77  */
  78 function utf8ToHexSequence( $str ) {
  79         return rtrim( preg_replace( '/(.)/uSe',
  80                                     'sprintf("%04x ", utf8ToCodepoint("$1"))',
  81                                     $str ) );
  82 }
  83
  84 /**
  85  * Determine the Unicode codepoint of a single-character UTF-8 sequence.
  86  * Does not check for invalid input data.
  87  *
  88  * @param $char String
  89  * @return Integer
  90  * @public
  91  */
  92 function utf8ToCodepoint( $char ) {
  93         # Find the length
  94         $z = ord( $char[0] );
  95         if ( $z & 0x80 ) {
  96                 $length = 0;
  97                 while ( $z & 0x80 ) {
  98                         $length++;
  99                         $z <<= 1;
 100                 }
 101         } else {
 102                 $length = 1;
 103         }
 104
 105         if ( $length != strlen( $char ) ) {
 106                 return false;
 107         }
 108         if ( $length == 1 ) {
 109                 return ord( $char );
 110         }
 111
 112         # Mask off the length-determining bits and shift back to the original location
 113         $z &= 0xff;
 114         $z >>= $length;
 115
 116         # Add in the free bits from subsequent bytes
 117         for ( $i=1; $i<$length; $i++ ) {
 118                 $z <<= 6;
 119                 $z |= ord( $char[$i] ) & 0x3f;
 120         }
 121
 122         return $z;
 123 }
 124
 125 /**
 126  * Escape a string for inclusion in a PHP single-quoted string literal.
 127  *
 128  * @param $string String: string to be escaped.
 129  * @return String: escaped string.
 130  * @public
 131  */
 132 function escapeSingleString( $string ) {
 133         return strtr( $string,
 134                 array(
 135                         '\\' => '\\\\',
 136                         '\'' => '\\\''
 137                 ));
 138 }