includes/site/MediaWikiPageNameNormalizer.php

   1 <?php
   2
   3 namespace MediaWiki\Site;
   4
   5 use FormatJson;
   6 use Http;
   7 use UtfNormal\Validator;
   8
   9 /**
  10  * Service for normalizing a page name using a MediaWiki api.
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * This program is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20  * GNU General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU General Public License along
  23  * with this program; if not, write to the Free Software Foundation, Inc.,
  24  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  25  * http://www.gnu.org/copyleft/gpl.html
  26  *
  27  * @since 1.27
  28  *
  29  * @license GNU GPL v2+
  30  * @author John Erling Blad < jeblad@gmail.com >
  31  * @author Daniel Kinzler
  32  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  33  * @author Marius Hoch
  34  */
  35 class MediaWikiPageNameNormalizer {
  36
  37         /**
  38          * @var Http
  39          */
  40         private $http;
  41
  42         /**
  43          * @param Http|null $http
  44          */
  45         public function __construct( Http $http = null ) {
  46                 if ( !$http ) {
  47                         $http = new Http();
  48                 }
  49
  50                 $this->http = $http;
  51         }
  52
  53         /**
  54          * Returns the normalized form of the given page title, using the
  55          * normalization rules of the given site. If the given title is a redirect,
  56          * the redirect weill be resolved and the redirect target is returned.
  57          *
  58          * @note This actually makes an API request to the remote site, so beware
  59          *   that this function is slow and depends on an external service.
  60          *
  61          * @see Site::normalizePageName
  62          *
  63          * @since 1.27
  64          *
  65          * @param string $pageName
  66          * @param string $apiUrl
  67          *
  68          * @return string
  69          * @throws \MWException
  70          */
  71         public function normalizePageName( $pageName, $apiUrl ) {
  72
  73                 // Check if we have strings as arguments.
  74                 if ( !is_string( $pageName ) ) {
  75                         throw new \MWException( '$pageName must be a string' );
  76                 }
  77
  78                 // Go on call the external site
  79
  80                 // Make sure the string is normalized into NFC (due to T42017)
  81                 // but do nothing to the whitespaces, that should work appropriately.
  82                 // @see https://phabricator.wikimedia.org/T42017
  83                 $pageName = Validator::cleanUp( $pageName );
  84
  85                 // Build the args for the specific call
  86                 $args = [
  87                         'action' => 'query',
  88                         'prop' => 'info',
  89                         'redirects' => true,
  90                         'converttitles' => true,
  91                         'format' => 'json',
  92                         'titles' => $pageName,
  93                         // @todo options for maxlag and maxage
  94                         // Note that maxlag will lead to a long delay before a reply is made,
  95                         // but that maxage can avoid the extreme delay. On the other hand
  96                         // maxage could be nice to use anyhow as it stops unnecessary requests.
  97                         // Also consider smaxage if maxage is used.
  98                 ];
  99
 100                 $url = wfAppendQuery( $apiUrl, $args );
 101
 102                 // Go on call the external site
 103                 // @todo we need a good way to specify a timeout here.
 104                 $ret = $this->http->get( $url, [], __METHOD__ );
 105
 106                 if ( $ret === false ) {
 107                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
 108                         return false;
 109                 }
 110
 111                 $data = FormatJson::decode( $ret, true );
 112
 113                 if ( !is_array( $data ) ) {
 114                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
 115                         return false;
 116                 }
 117
 118                 $page = static::extractPageRecord( $data, $pageName );
 119
 120                 if ( isset( $page['missing'] ) ) {
 121                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
 122                                 . $ret );
 123                         return false;
 124                 }
 125
 126                 if ( isset( $page['invalid'] ) ) {
 127                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
 128                                 . $ret );
 129                         return false;
 130                 }
 131
 132                 if ( !isset( $page['title'] ) ) {
 133                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 134                         return false;
 135                 }
 136
 137                 return $page['title'];
 138         }
 139
 140         /**
 141          * Get normalization record for a given page title from an API response.
 142          *
 143          * @param array $externalData A reply from the API on a external server.
 144          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 145          *
 146          * @return array|bool A 'page' structure representing the page identified by $pageTitle.
 147          */
 148         private static function extractPageRecord( $externalData, $pageTitle ) {
 149                 // If there is a special case with only one returned page
 150                 // we can cheat, and only return
 151                 // the single page in the "pages" substructure.
 152                 if ( isset( $externalData['query']['pages'] ) ) {
 153                         $pages = array_values( $externalData['query']['pages'] );
 154                         if ( count( $pages ) === 1 ) {
 155                                 return $pages[0];
 156                         }
 157                 }
 158                 // This is only used during internal testing, as it is assumed
 159                 // a more optimal (and lossfree) storage.
 160                 // Make initial checks and return if prerequisites are not meet.
 161                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 162                         return false;
 163                 }
 164                 // Loop over the tree different named structures, that otherwise are similar
 165                 $structs = [
 166                         'normalized' => 'from',
 167                         'converted' => 'from',
 168                         'redirects' => 'from',
 169                         'pages' => 'title'
 170                 ];
 171                 foreach ( $structs as $listId => $fieldId ) {
 172                         // Check if the substructure exist at all.
 173                         if ( !isset( $externalData['query'][$listId] ) ) {
 174                                 continue;
 175                         }
 176                         // Filter the substructure down to what we actually are using.
 177                         $collectedHits = array_filter(
 178                                 array_values( $externalData['query'][$listId] ),
 179                                 function ( $a ) use ( $fieldId, $pageTitle ) {
 180                                         return $a[$fieldId] === $pageTitle;
 181                                 }
 182                         );
 183                         // If still looping over normalization, conversion or redirects,
 184                         // then we need to keep the new page title for later rounds.
 185                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 186                                 switch ( count( $collectedHits ) ) {
 187                                         case 0:
 188                                                 break;
 189                                         case 1:
 190                                                 $pageTitle = $collectedHits[0]['to'];
 191                                                 break;
 192                                         default:
 193                                                 return false;
 194                                 }
 195                         } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 196                                 // If on the pages structure we should prepare for returning.
 197
 198                                 switch ( count( $collectedHits ) ) {
 199                                         case 0:
 200                                                 return false;
 201                                         case 1:
 202                                                 return array_shift( $collectedHits );
 203                                         default:
 204                                                 return false;
 205                                 }
 206                         }
 207                 }
 208                 // should never be here
 209                 return false;
 210         }
 211
 212 }