Merge "maintenance: Script to rename titles for Unicode uppercasing changes"
[lhc/web/wiklou.git] / includes / MagicWordArray.php
1 <?php
2
3 /**
4 * See docs/magicword.txt.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 */
24
25 use MediaWiki\Logger\LoggerFactory;
26 use MediaWiki\MediaWikiServices;
27
/**
 * Class for handling an array of magic words.
 *
 * Holds a list of magic word names and lazily derives, from the synonyms the
 * factory reports for each name, a lookup hash and several regular expressions.
 * Every derived structure is a two-element array: index 0 covers the
 * case-insensitive magic words, index 1 the case-sensitive ones.
 *
 * @ingroup Parser
 */
class MagicWordArray {
	/** @var string[] Names of the magic words held by this array */
	public $names = [];

	/** @var MagicWordFactory Provides the magic word objects and the content language */
	private $factory;

	/** @var array[]|null Cached result of getHash(); null until built */
	private $hash;

	/** @var string[]|null Cached result of getBaseRegex(); null until built */
	private $baseRegex;

	/** @var string[]|null Cached result of getRegex(); null until built */
	private $regex;

	/**
	 * @param string[] $names
	 * @param MagicWordFactory|null $factory Falls back to the factory from
	 *  MediaWikiServices when omitted
	 */
	public function __construct( $names = [], MagicWordFactory $factory = null ) {
		$this->names = $names;
		$this->factory = $factory ?: MediaWikiServices::getInstance()->getMagicWordFactory();
	}

	/**
	 * Add a magic word by name
	 *
	 * @param string $name
	 */
	public function add( $name ) {
		$this->names[] = $name;
		$this->resetCaches();
	}

	/**
	 * Add a number of magic words by name
	 *
	 * @param string[] $names Only the values are used; keys are discarded
	 */
	public function addArray( $names ) {
		$this->names = array_merge( $this->names, array_values( $names ) );
		$this->resetCaches();
	}

	/**
	 * Drop every structure derived from $this->names so it is rebuilt on next use.
	 */
	private function resetCaches() {
		$this->hash = null;
		$this->baseRegex = null;
		$this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * The first dimension is the case-sensitivity flag (0 or 1), the second maps
	 * a synonym to the magic word name. Synonyms of case-insensitive words are
	 * stored lower-cased with the content language.
	 *
	 * @return array
	 */
	public function getHash() {
		if ( $this->hash === null ) {
			// Build locally so that a failure part-way through cannot leave a
			// half-filled table behind in the cache.
			$hash = [ 0 => [], 1 => [] ];
			foreach ( $this->names as $name ) {
				$magic = $this->factory->get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $this->factory->getContentLanguage()->lc( $syn );
					}
					$hash[$case][$syn] = $name;
				}
			}
			$this->hash = $hash;
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * Each synonym becomes one named group "<index>_<name>", where the digits of
	 * the synonym index are mapped to the letters a-j. The groups of each
	 * case-sensitivity class are joined with "|"; a class without synonyms
	 * yields an empty string.
	 *
	 * @throws MWException If two synonyms would produce the same group name
	 * @return string[]
	 */
	public function getBaseRegex() {
		if ( $this->baseRegex === null ) {
			// Collected locally and stored only on success: if the duplicate check
			// below throws, the next call must fail again rather than return a
			// truncated regex from the cache.
			$groups = [ 0 => [], 1 => [] ];
			$seenGroupNames = [];
			foreach ( $this->names as $name ) {
				$magic = $this->factory->get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $i => $syn ) {
					// Group name must start with a non-digit in PCRE 8.34+
					$it = strtr( (string)$i, '0123456789', 'abcdefghij' );
					$groupName = $it . '_' . $name;
					// look for same group names to avoid same named subpatterns in the regex
					if ( isset( $seenGroupNames[$groupName] ) ) {
						throw new MWException(
							__METHOD__ . ': duplicate internal name in magic word array: ' . $name
						);
					}
					$seenGroupNames[$groupName] = true;
					$groups[$case][] = '(?P<' . $groupName . '>' . preg_quote( $syn, '/' ) . ')';
				}
			}
			$this->baseRegex = [
				0 => implode( '|', $groups[0] ),
				1 => implode( '|', $groups[1] ),
			];
		}
		return $this->baseRegex;
	}

	/**
	 * Wrap both base regexes in delimiters and modifiers.
	 *
	 * The case-insensitive regex (index 0) gets the modifiers "iuS", the
	 * case-sensitive one (index 1) gets "S". An empty base regex stays an
	 * empty string.
	 *
	 * @param string $open Pattern text inserted before the base regex
	 * @param string $close Pattern text inserted after the base regex
	 * @throws MWException
	 * @return string[]
	 */
	private function wrapBaseRegex( $open, $close ) {
		$base = $this->getBaseRegex();
		$wrapped = [ '', '' ];
		foreach ( [ 0 => 'iuS', 1 => 'S' ] as $case => $modifiers ) {
			if ( $base[$case] !== '' ) {
				$wrapped[$case] = '/' . $open . $base[$case] . $close . '/' . $modifiers;
			}
		}
		return $wrapped;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @throws MWException
	 * @return string[]
	 */
	public function getRegex() {
		if ( $this->regex === null ) {
			$this->regex = $this->wrapBaseRegex( '', '' );
		}
		return $this->regex;
	}

	/**
	 * Get a regex for matching variables with parameters
	 *
	 * Same as getRegex(), except that every quoted "$1" placeholder is turned
	 * into a non-greedy capture group.
	 *
	 * @throws MWException
	 * @return string[]
	 */
	public function getVariableRegex() {
		return str_replace( "\\$1", "(.*?)", $this->getRegex() );
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @throws MWException
	 * @return string[]
	 */
	public function getRegexStart() {
		return $this->wrapBaseRegex( '^(?:', ')' );
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @throws MWException
	 * @return string[]
	 */
	public function getVariableStartToEndRegex() {
		return str_replace( "\\$1", "(.*?)", $this->wrapBaseRegex( '^(?:', ')$' ) );
	}

	/**
	 * @since 1.20
	 * @return string[]
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * @param array $m
	 *
	 * @throws MWException
	 * @return array
	 */
	public function parseMatch( $m ) {
		$keys = array_keys( $m );
		$values = array_values( $m );
		foreach ( $keys as $pos => $key ) {
			// Skip the whole-pattern match and every group that did not take part
			if ( $key === 0 || $values[$pos] === '' ) {
				continue;
			}
			$parts = explode( '_', $key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			$magicName = $parts[1];
			// A named group is followed by its numeric duplicate; the parameter
			// capture, if the pattern has one, is the entry after that.
			$paramPos = $pos + 2;
			$paramValue = array_key_exists( $paramPos, $values ) ? $values[$paramPos] : false;
			return [ $magicName, $paramValue ];
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param string $text
	 *
	 * @throws MWException
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		foreach ( $this->getVariableStartToEndRegex() as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				return $this->parseMatch( $m );
			}
		}
		return [ false, false ];
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * The case-sensitive synonyms are tried first with the text as given, then
	 * the case-insensitive ones with the text lower-cased by the content language.
	 *
	 * @param string $text
	 *
	 * @return string|bool False on failure
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		$lc = $this->factory->getContentLanguage()->lc( $text );
		return $hash[0][$lc] ?? false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * If PCRE reports a failure, a warning is logged to the 'parser' channel and
	 * the input string is left as it was for that regex.
	 *
	 * @param string &$text
	 *
	 * @throws MWException
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = [];
		foreach ( $this->getRegex() as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$matches = [];
			$res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			if ( $res === false ) {
				$this->logPcreFailure( 'preg_match_all returned false', preg_last_error(), $regex, $text );
			} elseif ( $res ) {
				foreach ( $matches as $m ) {
					list( $name, $param ) = $this->parseMatch( $m );
					$found[$name] = $param;
				}
			}
			$res = preg_replace( $regex, '', $text );
			if ( $res === null ) {
				$this->logPcreFailure( 'preg_replace returned null', preg_last_error(), $regex, $text );
				// Do not hand null back through the reference: keep the text we had.
				continue;
			}
			$text = $res;
		}
		return $found;
	}

	/**
	 * Log a failed PCRE call to the 'parser' channel.
	 *
	 * @param string $message
	 * @param int $code Value of preg_last_error() taken right after the failed call
	 * @param string $regex
	 * @param string $text
	 */
	private function logPcreFailure( $message, $code, $regex, $text ) {
		LoggerFactory::getInstance( 'parser' )->warning( $message, [
			'code' => $code,
			'regex' => $regex,
			'text' => $text,
		] );
	}

	/**
	 * Return the name of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param string &$text
	 *
	 * @throws MWException
	 * @return string|bool False on failure
	 */
	public function matchStartAndRemove( &$text ) {
		foreach ( $this->getRegexStart() as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				list( $id, ) = $this->parseMatch( $m );
				$prefixLength = strlen( $m[0] );
				// substr() does not return '' on every PHP version when the offset
				// equals the string length, so that case is handled explicitly.
				if ( $prefixLength >= strlen( $text ) ) {
					$text = '';
				} else {
					$text = substr( $text, $prefixLength );
				}
				return $id;
			}
		}
		return false;
	}
}