Merge "Cleanup execution flow through SpecialSearch::execute()"
[lhc/web/wiklou.git] / includes / site / MediaWikiPageNameNormalizer.php
1 <?php
2
3 namespace MediaWiki\Site;
4
5 use FormatJson;
6 use Http;
7 use UtfNormal\Validator;
8
9 /**
10 * Service for normalizing a page name using a MediaWiki api.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 * http://www.gnu.org/copyleft/gpl.html
26 *
27 * @since 1.27
28 *
29 * @license GNU GPL v2+
30 * @author John Erling Blad < jeblad@gmail.com >
31 * @author Daniel Kinzler
32 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
33 * @author Marius Hoch
34 */
class MediaWikiPageNameNormalizer {

	/**
	 * HTTP client used to issue the API request to the remote site.
	 *
	 * @var Http
	 */
	private $http;

	/**
	 * @param Http|null $http Client used for the API request; a new Http
	 *  instance is created when none is given.
	 */
	public function __construct( Http $http = null ) {
		if ( !$http ) {
			$http = new Http();
		}

		$this->http = $http;
	}

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If the given title is a redirect,
	 * the redirect will be resolved and the redirect target is returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 *
	 * @param string $pageName Title to normalize.
	 * @param string $apiUrl URL of the remote api.php; the query arguments are
	 *  appended to it.
	 *
	 * @return string|bool The normalized title, or false when the request
	 *  failed, the reply was not valid JSON, the page is missing or invalid,
	 *  or no title could be found in the reply.
	 * @throws \MWException If $pageName is not a string.
	 */
	public function normalizePageName( $pageName, $apiUrl ) {
		// Only $pageName is validated here; $apiUrl is passed on as given.
		if ( !is_string( $pageName ) ) {
			throw new \MWException( '$pageName must be a string' );
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => true,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on and call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = $this->http->get( $url, [], __METHOD__ );

		if ( $ret === false ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		// self:: rather than static:: — the helper is private, so late static
		// binding could only ever resolve to an inaccessible method.
		$page = self::extractPageRecord( $data, $pageName );

		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		// Also covers the case where no page record could be extracted at all
		// ($page === false).
		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * The title is followed through the "normalized", "converted" and
	 * "redirects" lists of the reply (in that order) before the matching
	 * entry is looked up in "pages".
	 *
	 * @param array $externalData A reply from the API on an external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|bool A 'page' structure representing the page identified by $pageTitle,
	 *  or false if the reply has no "query" section or the title cannot be
	 *  resolved to exactly one page.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// Make initial checks and return if prerequisites are not met.
		if ( !is_array( $externalData )
			|| !isset( $externalData['query'] )
			|| !is_array( $externalData['query'] )
		) {
			return false;
		}

		$query = $externalData['query'];

		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $query['pages'] ) && is_array( $query['pages'] ) ) {
			$pages = array_values( $query['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}

		// The general lookup below is only used during internal testing, as it
		// is assumed a more optimal (and lossfree) storage.
		// Loop over the four differently named structures, that otherwise are similar.
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		];

		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exists at all.
			if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
				continue;
			}

			// Filter the substructure down to the entries for the current title.
			// array_filter() preserves the original keys, so the result is
			// re-indexed; otherwise a single hit that was not the first entry
			// of the list could not be read back at offset 0.
			$hits = array_values( array_filter(
				$query[$listId],
				function ( $entry ) use ( $fieldId, $pageTitle ) {
					return is_array( $entry )
						&& isset( $entry[$fieldId] )
						&& $entry[$fieldId] === $pageTitle;
				}
			) );
			$hitCount = count( $hits );

			if ( $fieldId === 'from' ) {
				// Still looping over normalization, conversion or redirects:
				// keep the new page title for the later rounds.
				if ( $hitCount === 0 ) {
					continue;
				}
				// Ambiguous or malformed mapping, give up.
				if ( $hitCount > 1 || !isset( $hits[0]['to'] ) ) {
					return false;
				}
				$pageTitle = $hits[0]['to'];
				continue;
			}

			// On the pages structure: exactly one match is the only valid outcome.
			return $hitCount === 1 ? $hits[0] : false;
		}

		// Reached only when the reply carries no "pages" structure.
		return false;
	}

}