includes/site/MediaWikiSite.php

   1 <?php
   2
   3 /**
   4  * Class representing a MediaWiki site.
   5  *
   6  * @since 1.21
   7  *
   8  * @file
   9  * @ingroup Site
  10  *
  11  * @licence GNU GPL v2+
  12  * @author John Erling Blad < jeblad@gmail.com >
  13  * @author Daniel Kinzler
  14  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  15  */
  16 class MediaWikiSite extends SiteObject {
  17
  18         const PATH_FILE = 'file_path';
  19         const PATH_PAGE = 'page_path';
  20
  21         /**
  22          * @since 1.21
  23          *
  24          * @param integer $globalId
  25          *
  26          * @return MediaWikiSite
  27          */
  28         public static function newFromGlobalId( $globalId ) {
  29                 return SitesTable::singleton()->newRow( array(
  30                         'type' => Site::TYPE_MEDIAWIKI,
  31                         'global_key' => $globalId,
  32                 ), true );
  33         }
  34
  35         /**
  36          * Returns the database form of the given title.
  37          *
  38          * @since 1.21
  39          *
  40          * @param String $title the target page's title, in normalized form.
  41          *
  42          * @return String
  43          */
  44         public function toDBKey( $title ) {
  45                 return str_replace( ' ', '_', $title );
  46         }
  47
  48         /**
  49          * Returns the normalized form of the given page title, using the normalization rules of the given site.
  50          * If the given title is a redirect, the redirect weill be resolved and the redirect target is returned.
  51          *
  52          * @note  : This actually makes an API request to the remote site, so beware that this function is slow and depends
  53          *          on an external service.
  54          *
  55          * @note  : If MW_PHPUNIT_TEST is defined or $egWBRemoteTitleNormalization is set to false, the call to the
  56          *          external site is skipped, and the title is normalized using the local normalization rules as
  57          *          implemented by the Title class.
  58          *
  59          * @see Site::normalizePageName
  60          *
  61          * @since 1.21
  62          *
  63          * @param string $pageName
  64          *
  65          * @return string
  66          * @throws MWException
  67          */
  68         public function normalizePageName( $pageName ) {
  69                 global $egWBRemoteTitleNormalization;
  70
  71                 // Check if we have strings as arguments.
  72                 if ( !is_string( $pageName ) ) {
  73                         throw new MWException( '$pageName must be a string' );
  74                 }
  75
  76                 // Go on call the external site
  77                 if ( defined( 'MW_PHPUNIT_TEST' ) ) {
  78                         // If the code is under test, don't call out to other sites, just normalize locally.
  79                         // Note: this may cause results to be inconsistent with the actual normalization used by the respective remote site!
  80
  81                         $t = Title::newFromText( $pageName );
  82                         return $t->getPrefixedText();
  83                 } else {
  84
  85                         // Make sure the string is normalized into NFC (due to the bug 40017)
  86                         // but do nothing to the whitespaces, that should work appropriately.
  87                         // @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
  88                         $pageName = UtfNormal::cleanUp( $pageName );
  89
  90                         // Build the args for the specific call
  91                         $args = array(
  92                                 'action' => 'query',
  93                                 'prop' => 'info',
  94                                 'redirects' => true,
  95                                 'converttitles' => true,
  96                                 'format' => 'json',
  97                                 'titles' => $pageName,
  98                                 //@todo: options for maxlag and maxage
  99                                 // Note that maxlag will lead to a long delay before a reply is made,
 100                                 // but that maxage can avoid the extreme delay. On the other hand
 101                                 // maxage could be nice to use anyhow as it stops unnecessary requests.
 102                                 // Also consider smaxage if maxage is used.
 103                         );
 104
 105                         $url = $this->getFileUrl( 'api.php' ) . '?' . wfArrayToCgi( $args );
 106
 107                         // Go on call the external site
 108                         //@todo: we need a good way to specify a timeout here.
 109                         $ret = Http::get( $url );
 110                 }
 111
 112                 if ( $ret === false ) {
 113                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
 114                         return false;
 115                 }
 116
 117                 $data = FormatJson::decode( $ret, true );
 118
 119                 if ( !is_array( $data ) ) {
 120                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
 121                         return false;
 122                 }
 123
 124                 $page = static::extractPageRecord( $data, $pageName );
 125
 126                 if ( isset( $page['missing'] ) ) {
 127                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a missing page title! " . $ret );
 128                         return false;
 129                 }
 130
 131                 if ( !isset( $page['title'] ) ) {
 132                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 133                         return false;
 134                 }
 135
 136                 return $page['title'];
 137         }
 138
 139
 140         /**
 141          * Get normalization record for a given page title from an API response.
 142          *
 143          * @since 1.21
 144          *
 145          * @param array $externalData A reply from the API on a external server.
 146          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 147          *
 148          * @return array|false a 'page' structure representing the page identified by $pageTitle.
 149          */
 150         private static function extractPageRecord( $externalData, $pageTitle ) {
 151                 // If there is a special case with only one returned page
 152                 // we can cheat, and only return
 153                 // the single page in the "pages" substructure.
 154                 if ( isset( $externalData['query']['pages'] ) ) {
 155                         $pages = array_values( $externalData['query']['pages'] );
 156                         if ( count( $pages) === 1 ) {
 157                                 return $pages[0];
 158                         }
 159                 }
 160                 // This is only used during internal testing, as it is assumed
 161                 // a more optimal (and lossfree) storage.
 162                 // Make initial checks and return if prerequisites are not meet.
 163                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 164                         return false;
 165                 }
 166                 // Loop over the tree different named structures, that otherwise are similar
 167                 $structs = array(
 168                         'normalized' => 'from',
 169                         'converted' => 'from',
 170                         'redirects' => 'from',
 171                         'pages' => 'title'
 172                 );
 173                 foreach ( $structs as $listId => $fieldId ) {
 174                         // Check if the substructure exist at all.
 175                         if ( !isset( $externalData['query'][$listId] ) ) {
 176                                 continue;
 177                         }
 178                         // Filter the substructure down to what we actually are using.
 179                         $collectedHits = array_filter(
 180                                 array_values( $externalData['query'][$listId] ),
 181                                 function( $a ) use ( $fieldId, $pageTitle ) {
 182                                         return $a[$fieldId] === $pageTitle;
 183                                 }
 184                         );
 185                         // If still looping over normalization, conversion or redirects,
 186                         // then we need to keep the new page title for later rounds.
 187                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 188                                 switch ( count( $collectedHits ) ) {
 189                                         case 0:
 190                                                 break;
 191                                         case 1:
 192                                                 $pageTitle = $collectedHits[0]['to'];
 193                                                 break;
 194                                         default:
 195                                                 return false;
 196                                 }
 197                         }
 198                         // If on the pages structure we should prepare for returning.
 199                         elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 200                                 switch ( count( $collectedHits ) ) {
 201                                         case 0:
 202                                                 return false;
 203                                         case 1:
 204                                                 return array_shift( $collectedHits );
 205                                         default:
 206                                                 return false;
 207                                 }
 208                         }
 209                 }
 210                 // should never be here
 211                 return false;
 212         }
 213
 214         /**
 215          * @see Site::getLinkPathType
 216          * Returns Site::PATH_PAGE
 217          *
 218          * @since 1.21
 219          *
 220          * @return string
 221          */
 222         public function getLinkPathType() {
 223                 return self::PATH_PAGE;
 224         }
 225
 226         /**
 227          * Returns the relative page path.
 228          *
 229          * @since 1.21
 230          *
 231          * @return string
 232          */
 233         public function getRelativePagePath() {
 234                 return parse_url( $this->getPath( self::PATH_PAGE ), PHP_URL_PATH );
 235         }
 236
 237         /**
 238          * Returns the relative file path.
 239          *
 240          * @since 1.21
 241          *
 242          * @return string
 243          */
 244         public function getRelativeFilePath() {
 245                 return parse_url( $this->getPath( self::PATH_FILE ), PHP_URL_PATH );
 246         }
 247
 248         /**
 249          * Sets the relative page path.
 250          *
 251          * @since 1.21
 252          *
 253          * @param string $path
 254          */
 255         public function setPagePath( $path ) {
 256                 $this->setPath( self::PATH_PAGE, $path );
 257         }
 258
 259         /**
 260          * Sets the relative file path.
 261          *
 262          * @since 1.21
 263          *
 264          * @param string $path
 265          */
 266         public function setFilePath( $path ) {
 267                 $this->setPath( self::PATH_FILE, $path );
 268         }
 269
 270         /**
 271          * @see Site::getPagePath
 272          *
 273          * This implementation returns a URL constructed using the path returned by getLinkPath().
 274          * In addition to the default behaviour implemented by SiteObject::getPageUrl(), this
 275          * method converts the $pageName to DBKey-format by replacing spaces with underscores
 276          * before using it in the URL.
 277          *
 278          * @since 1.21
 279          *
 280          * @param string|false
 281          *
 282          * @return string
 283          */
 284         public function getPageUrl( $pageName = false ) {
 285                 $url = $this->getLinkPath();
 286
 287                 if ( $url === false ) {
 288                         return false;
 289                 }
 290
 291                 if ( $pageName !== false ) {
 292                         $pageName = $this->toDBKey( trim( $pageName ) );
 293                         $url = str_replace( '$1', wfUrlencode( $pageName ), $url ) ;
 294                 }
 295
 296                 return $url;
 297         }
 298
 299         /**
 300          * Returns the full file path (ie site url + relative file path).
 301          * The path should go at the $1 marker. If the $path
 302          * argument is provided, the marker will be replaced by it's value.
 303          *
 304          * @since 1.21
 305          *
 306          * @param string|false $path
 307          *
 308          * @return string
 309          */
 310         public function getFileUrl( $path = false ) {
 311                 $filePath = $this->getPath( self::PATH_FILE );
 312
 313                 if ( $filePath !== false ) {
 314                         $filePath = str_replace( '$1', $path, $filePath );
 315                 }
 316
 317                 return $filePath;
 318         }
 319
 320 }