<?php
/**
- * A codec for %MediaWiki page titles.
+ * A codec for MediaWiki page titles.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* http://www.gnu.org/copyleft/gpl.html
*
* @file
- * @license GPL 2+
* @author Daniel Kinzler
*/
use MediaWiki\Interwiki\InterwikiLookup;
use MediaWiki\Linker\LinkTarget;
/**
- * A codec for %MediaWiki page titles.
+ * A codec for MediaWiki page titles.
*
* @note Normalization and validation are applied while parsing, not when formatting.
* It's possible to construct a TitleValue with an invalid title, and use MediaWikiTitleCodec
'user_case_dbkey' => $dbkey,
];
- # Strip Unicode bidi override characters.
+ # Strip soft hyphens (U+00AD) and Unicode directional formatting characters (U+061C, U+200E,
+ # U+200F, U+202A, U+202B, U+202C, U+202D, U+202E, U+2066, U+2067, U+2068, U+2069).
# Sometimes they slip into cut-n-pasted page titles, where the
- # override chars get included in list displays.
- $dbkey = preg_replace( '/\xE2\x80[\x8E\x8F\xAA-\xAE]/S', '', $dbkey );
+ # soft hyphens or override chars get included in list displays.
+ $dbkey = preg_replace(
+ '/\xC2\xAD|\xD8\x9C|\xE2\x80[\x8E\x8F\xAA-\xAE]|\xE2\x81[\xA6-\xA9]/S',
+ '',
+ $dbkey
+ );
# Clean up whitespace
# Note: use of the /u option on preg_replace here will cause