maintenance: Script to rename titles for Unicode uppercasing changes
author: Brad Jorsch <bjorsch@wikimedia.org>
Tue, 30 Apr 2019 15:33:06 +0000 (11:33 -0400)
committer: Brad Jorsch <bjorsch@wikimedia.org>
Tue, 9 Jul 2019 17:53:43 +0000 (13:53 -0400)
This uses MovePage where possible to try to better keep data in sync.
Archives, log entries, and so on can't do that though.

The script skips User and User_talk pages for registered users, as
renaming the users is more complicated than makes sense to try to
implement here. Use something like Extension:Renameuser to clean those
up; this script can provide a list of user names affected.

Bug: T219279
Change-Id: I157577cb5bedfd347b808c254fb19ae8088818ab

autoload.php
maintenance/uppercaseTitlesForUnicodeTransition.php [new file with mode: 0644]

index 13037ff..856ce3a 100644 (file)
@@ -1549,6 +1549,7 @@ $wgAutoloadLocalClasses = [
        'UploadStashWrongOwnerException' => __DIR__ . '/includes/upload/exception/UploadStashWrongOwnerException.php',
        'UploadStashZeroLengthFileException' => __DIR__ . '/includes/upload/exception/UploadStashZeroLengthFileException.php',
        'UppercaseCollation' => __DIR__ . '/includes/collation/UppercaseCollation.php',
+       'UppercaseTitlesForUnicodeTransition' => __DIR__ . '/maintenance/uppercaseTitlesForUnicodeTransition.php',
        'User' => __DIR__ . '/includes/user/User.php',
        'UserArray' => __DIR__ . '/includes/user/UserArray.php',
        'UserArrayFromResult' => __DIR__ . '/includes/user/UserArrayFromResult.php',
diff --git a/maintenance/uppercaseTitlesForUnicodeTransition.php b/maintenance/uppercaseTitlesForUnicodeTransition.php
new file mode 100644 (file)
index 0000000..f5bafde
--- /dev/null
@@ -0,0 +1,595 @@
+<?php
+/**
+ * Obligatory redundant license notice. Exception to the GPL's "keep intact all
+ * the notices" clause with respect to this notice is hereby granted.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+use MediaWiki\MediaWikiServices;
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script to rename titles affected by changes to Unicode (or
+ * otherwise to Language::ucfirst).
+ *
+ * @ingroup Maintenance
+ */
+class UppercaseTitlesForUnicodeTransition extends Maintenance {
+
+       /**
+        * Whether changes are really made; set from --run in execute(). When
+        * false, doMove() and doUpdate() only report what they would do.
+        * @var bool
+        */
+       private $run = false;
+
+       /**
+        * Map of single characters to their replacements, loaded from the --charmap
+        * file in execute(). Entries mapping a character to itself are dropped.
+        * @var array
+        */
+       private $charmap = [];
+
+       /**
+        * System user passed to MovePage::move(). Only assigned when --run is given.
+        * @var User
+        */
+       private $user;
+
+       /**
+        * Reason passed to MovePage::move(); replaced by --reason when that is given.
+        * @var string
+        */
+       private $reason = 'Uppercasing title for Unicode upgrade';
+
+       /**
+        * Change tags passed to MovePage::move(), taken from --tag.
+        * @var string[]
+        */
+       private $tags = [];
+
+       /**
+        * Cache used by isUserPage(): base user name => whether a `user` row exists.
+        * @var array
+        */
+       private $seenUsers = [];
+
+       /**
+        * Namespaces to process, computed on first use by getNamespaces().
+        * @var array|null
+        */
+       private $namespaces = null;
+
+       /**
+        * Text added before/after a title by mungeTitle() when the plain target
+        * cannot be used; taken from --prefix and --suffix, null when not given.
+        * @var string|null
+        */
+       private $prefix = null, $suffix = null;
+
+       /**
+        * Namespace of the title parsed from --prefix; null when --prefix is not given.
+        * @var int|null
+        */
+       private $prefixNs = null;
+
+       /**
+        * Tables named by --tables; null means processTable() skips nothing.
+        * @var string[]|null
+        */
+       private $tables = null;
+
+       /**
+        * Register the script description, the command-line options and the
+        * default batch size.
+        */
+       public function __construct() {
+               parent::__construct();
+
+               $this->addDescription( implode( "\n", [
+                       'Rename titles when changing behavior of Language::ucfirst().',
+                       '',
+                       'This script skips User and User_talk pages for registered users, as renaming of users '
+                               . 'is too complex to try to implement here. Use something like Extension:Renameuser to '
+                               . 'clean those up; this script can provide a list of user names affected.',
+               ] ) );
+
+               // Each entry: option name, help text, whether it is required, whether it takes a value.
+               // Options are registered in exactly this order.
+               $optionSpecs = [
+                       [
+                               'charmap',
+                               'Character map generated by maintenance/language/generateUcfirstOverrides.php',
+                               true,
+                               true,
+                       ],
+                       [
+                               'user',
+                               'System user to use to do the renames. Default is "Maintenance script".',
+                               false,
+                               true,
+                       ],
+                       [
+                               'steal',
+                               'If the username specified by --user exists, specify this to force conversion to a system user.',
+                               false,
+                               false,
+                       ],
+                       [
+                               'run',
+                               'If not specified, the script will not actually perform any moves (i.e. it will dry-run).',
+                               false,
+                               false,
+                       ],
+                       [ 'prefix', 'When the new title already exists, add this prefix.', false, true ],
+                       [ 'suffix', 'When the new title already exists, add this suffix.', false, true ],
+                       [ 'reason', 'Reason to use when moving pages.', false, true ],
+                       [ 'tag', 'Change tag to apply when moving pages.', false, true ],
+                       [ 'tables', 'Comma-separated list of database tables to process.', false, true ],
+                       [ 'userlist', 'Filename to which to output usernames needing rename.', false, true ],
+               ];
+               foreach ( $optionSpecs as list( $optName, $optHelp, $optRequired, $optWithArg ) ) {
+                       $this->addOption( $optName, $optHelp, $optRequired, $optWithArg );
+               }
+
+               $this->setBatchSize( 1000 );
+       }
+
+       /**
+        * Validate the command-line options, load the character map, then process
+        * each supported table followed by the user list.
+        *
+        * Without --run no system user is created, the replica connection is used,
+        * and the per-row handlers only report what they would change.
+        */
+       public function execute() {
+               $this->run = $this->getOption( 'run', false );
+
+               // The acting user is only needed when moves are really performed.
+               if ( $this->run ) {
+                       $username = $this->getOption( 'user', 'Maintenance script' );
+                       $steal = $this->getOption( 'steal', false );
+                       $this->user = User::newSystemUser( $username, [ 'steal' => $steal ] );
+                       if ( !$this->user ) {
+                               // Give a more helpful message when the name belongs to an existing account.
+                               $user = User::newFromName( $username );
+                               if ( !$steal && $user && $user->isLoggedIn() ) {
+                                       $this->fatalError( "User $username already exists.\n"
+                                               . "Use --steal if you really want to steal it from the human who currently owns it."
+                                       );
+                               }
+                               $this->fatalError( "Could not obtain system user $username." );
+                       }
+               }
+
+               $tables = $this->getOption( 'tables' );
+               if ( $tables !== null ) {
+                       // Tolerate whitespace around the commas ("page, image") and stray empty
+                       // entries; processTable() compares names strictly, so an untrimmed
+                       // entry would silently cause its table to be skipped.
+                       $this->tables = array_values( array_filter(
+                               array_map( 'trim', explode( ',', $tables ) ),
+                               'strlen'
+                       ) );
+               }
+
+               $prefix = $this->getOption( 'prefix' );
+               if ( $prefix !== null ) {
+                       // Parse the prefix with a dummy 'X' appended; if the 'X' does not survive
+                       // as the last character of the DB key, the prefix is not usable.
+                       $title = Title::newFromText( $prefix . 'X' );
+                       if ( !$title || substr( $title->getDBkey(), -1 ) !== 'X' ) {
+                               $this->fatalError( 'Invalid --prefix.' );
+                       }
+                       if ( $title->getNamespace() <= NS_MAIN || $title->isExternal() ) {
+                               $this->fatalError( 'Invalid --prefix. It must not be in namespace 0 and must not be external' );
+                       }
+                       $this->prefixNs = $title->getNamespace();
+                       // Strip the dummy 'X' again.
+                       $this->prefix = substr( $title->getText(), 0, -1 );
+               }
+               $this->suffix = $this->getOption( 'suffix' );
+
+               $this->reason = $this->getOption( 'reason' ) ?: $this->reason;
+               $this->tags = (array)$this->getOption( 'tag', null );
+
+               $charmapFile = $this->getOption( 'charmap' );
+               if ( !file_exists( $charmapFile ) ) {
+                       $this->fatalError( "Charmap file $charmapFile does not exist." );
+               }
+               if ( !is_file( $charmapFile ) || !is_readable( $charmapFile ) ) {
+                       $this->fatalError( "Charmap file $charmapFile is not readable." );
+               }
+               // The charmap is a PHP file that returns an array.
+               $this->charmap = require $charmapFile;
+               if ( !is_array( $this->charmap ) ) {
+                       $this->fatalError( "Charmap file $charmapFile did not return a PHP array." );
+               }
+               // Keep only single-character keys whose mapping actually changes the character.
+               $this->charmap = array_filter(
+                       $this->charmap,
+                       function ( $v, $k ) {
+                               if ( mb_strlen( $k ) !== 1 ) {
+                                       $this->error( "Ignoring mapping from multi-character key '$k' to '$v'" );
+                                       return false;
+                               }
+                               return $k !== $v;
+                       },
+                       ARRAY_FILTER_USE_BOTH
+               );
+               if ( !$this->charmap ) {
+                       $this->fatalError( "Charmap file $charmapFile did not contain any usable character mappings." );
+               }
+
+               // Tables processed with $doMove = true go through MovePage; the others
+               // are updated directly.
+               $db = $this->getDB( $this->run ? DB_MASTER : DB_REPLICA );
+               $this->processTable( $db, true, 'page', 'page_namespace', 'page_title', [ 'page_id' ] );
+               $this->processTable( $db, true, 'image', NS_FILE, 'img_name', [] );
+               $this->processTable(
+                       $db, false, 'archive', 'ar_namespace', 'ar_title', [ 'ar_timestamp', 'ar_id' ]
+               );
+               $this->processTable( $db, false, 'filearchive', NS_FILE, 'fa_name', [ 'fa_timestamp', 'fa_id' ] );
+               $this->processTable( $db, false, 'logging', 'log_namespace', 'log_title', [ 'log_id' ] );
+               $this->processTable( $db, false, 'redirect', 'rd_namespace', 'rd_title', [ 'rd_from' ] );
+               $this->processTable( $db, false, 'protected_titles', 'pt_namespace', 'pt_title', [] );
+               $this->processUsers( $db );
+       }
+
+       /**
+        * Build OR-ed groups of "field LIKE 'c%'" conditions, one condition per
+        * character that is a key of the charmap
+        * @param IDatabase $db Database handle
+        * @param string $field Field name
+        * @param int $batchSize Number of LIKE conditions per group
+        * @return array One SQL fragment per group
+        */
+       private function getLikeBatches( IDatabase $db, $field, $batchSize = 100 ) {
+               $batches = [];
+               $pending = [];
+               foreach ( array_keys( $this->charmap ) as $char ) {
+                       $pending[] = $field . $db->buildLike( $char, $db->anyString() );
+                       if ( count( $pending ) < $batchSize ) {
+                               continue;
+                       }
+                       // The group is full: emit it and start a new one.
+                       $batches[] = $db->makeList( $pending, $db::LIST_OR );
+                       $pending = [];
+               }
+
+               // Emit whatever is left over as a final, shorter group.
+               if ( $pending ) {
+                       $batches[] = $db->makeList( $pending, $db::LIST_OR );
+               }
+
+               return $batches;
+       }
+
+       /**
+        * Get the list of namespaces to operate on
+        *
+        * Only namespaces that are both movable and capitalized are returned. The
+        * list is sorted by subject namespace number, with each subject namespace
+        * ahead of its non-subject counterpart, and is computed once and cached.
+        *
+        * @return int[]
+        */
+       private function getNamespaces() {
+               if ( $this->namespaces !== null ) {
+                       return $this->namespaces;
+               }
+
+               $nsinfo = MediaWikiServices::getInstance()->getNamespaceInfo();
+
+               $eligible = [];
+               foreach ( array_keys( $nsinfo->getCanonicalNamespaces() ) as $candidate ) {
+                       if ( $nsinfo->isMovable( $candidate ) && $nsinfo->isCapitalized( $candidate ) ) {
+                               $eligible[] = $candidate;
+                       }
+               }
+
+               usort( $eligible, function ( $a, $b ) use ( $nsinfo ) {
+                       if ( $a === $b ) {
+                               return 0;
+                       }
+
+                       $subjectA = $nsinfo->getSubject( $a );
+                       $subjectB = $nsinfo->getSubject( $b );
+
+                       // Primary key: the subject namespace number.
+                       if ( $subjectA !== $subjectB ) {
+                               return $subjectA < $subjectB ? -1 : 1;
+                       }
+
+                       // Same subject: the subject namespace itself goes first.
+                       if ( $subjectA === $a ) {
+                               return -1;
+                       }
+
+                       // Two non-subject namespaces sharing a subject have no defined
+                       // relative order.
+                       return $subjectB === $b ? 1 : 0;
+               } );
+
+               $this->namespaces = $eligible;
+               return $this->namespaces;
+       }
+
+       /**
+        * Tell whether a ns+title lies in the userspace of a registered user
+        *
+        * Lookups are cached per base user name in $this->seenUsers.
+        *
+        * @param IDatabase $db Database handle
+        * @param int $ns
+        * @param string $title
+        * @return bool
+        */
+       private function isUserPage( IDatabase $db, $ns, $title ) {
+               if ( !in_array( $ns, [ NS_USER, NS_USER_TALK ], true ) ) {
+                       return false;
+               }
+
+               // Only the part before the first slash is looked up as a user name.
+               $userPart = explode( '/', $title, 2 )[0];
+               if ( isset( $this->seenUsers[$userPart] ) ) {
+                       return $this->seenUsers[$userPart];
+               }
+
+               // Query the table directly: going through User might uppercase the name.
+               $userId = $db->selectField(
+                       'user',
+                       'user_id',
+                       [ 'user_name' => str_replace( '_', ' ', $userPart ) ],
+                       __METHOD__
+               );
+               $this->seenUsers[$userPart] = (bool)$userId;
+
+               return $this->seenUsers[$userPart];
+       }
+
+       /**
+        * Munge a target title, if necessary
+        *
+        * Munging is needed when the target lies in the userspace of a registered
+        * user, or when MovePage reports 'articleexists' for the move. In that case
+        * the target is rebuilt from --prefix (applied to the old prefixed title,
+        * in the prefix's namespace, plus --suffix if given) or else from --suffix
+        * (appended to the new title's text).
+        *
+        * @param IDatabase $db Database handle
+        * @param Title $oldTitle Title being renamed
+        * @param Title &$newTitle Desired target; replaced by the munged title when
+        *  munging was attempted
+        * @return bool If $newTitle is (now) ok
+        */
+       private function mungeTitle( IDatabase $db, Title $oldTitle, Title &$newTitle ) {
+               // Remember the originally requested target for error messages.
+               $nt = $newTitle->getPrefixedText();
+
+               // False when the target is usable as-is, otherwise the reason it is not.
+               $munge = false;
+               if ( $this->isUserPage( $db, $newTitle->getNamespace(), $newTitle->getText() ) ) {
+                       $munge = 'Target title\'s user exists';
+               } else {
+                       $mp = new MovePage( $oldTitle, $newTitle );
+                       $status = $mp->isValidMove();
+                       if ( !$status->isOK() && $status->hasMessage( 'articleexists' ) ) {
+                               $munge = 'Target title exists';
+                       }
+               }
+               if ( !$munge ) {
+                       return true;
+               }
+
+               if ( $this->prefix !== null ) {
+                       $newTitle = Title::makeTitle(
+                               $this->prefixNs,
+                               $this->prefix . $oldTitle->getPrefixedText() . ( $this->suffix ?? '' )
+                       );
+               } elseif ( $this->suffix !== null ) {
+                       $newTitle = Title::makeTitle( $newTitle->getNamespace(), $newTitle->getText() . $this->suffix );
+               } else {
+                       $this->error(
+                               "Cannot move {$oldTitle->getPrefixedText()} → $nt: "
+                               . "$munge and no --prefix or --suffix was given"
+                       );
+                       return false;
+               }
+
+               // The munged title must itself be valid and unused.
+               if ( !$newTitle->isValid() ) {
+                       $this->error(
+                               "Cannot move {$oldTitle->getPrefixedText()} → $nt: "
+                               . "$munge and munged title '{$newTitle->getPrefixedText()}' is not valid"
+                       );
+                       return false;
+               }
+               if ( $newTitle->exists() ) {
+                       $this->error(
+                               "Cannot move {$oldTitle->getPrefixedText()} → $nt: "
+                               . "$munge and munged title '{$newTitle->getPrefixedText()}' also exists"
+                       );
+                       return false;
+               }
+
+               return true;
+       }
+
+       /**
+        * Rename one title through MovePage
+        *
+        * The target is the old title with its first character replaced according
+        * to the charmap, possibly adjusted by mungeTitle(). Without --run the move
+        * is validated and reported but not performed.
+        *
+        * @param IDatabase $db Database handle
+        * @param int $ns Namespace of the existing title
+        * @param string $title Existing title, as read from the database row
+        * @return bool|null True on success, false on error, null if skipped
+        */
+       private function doMove( IDatabase $db, $ns, $title ) {
+               $firstChar = mb_substr( $title, 0, 1 );
+               if ( !array_key_exists( $firstChar, $this->charmap ) ) {
+                       $this->error(
+                               "Query returned NS$ns $title, which does not begin with a character in the charmap."
+                       );
+                       return false;
+               }
+
+               // Pages of registered users are left alone.
+               if ( $this->isUserPage( $db, $ns, $title ) ) {
+                       $this->output( "... Skipping user page NS$ns $title\n" );
+                       return null;
+               }
+
+               $source = Title::makeTitle( $ns, $title );
+               $target = Title::makeTitle( $ns, $this->charmap[$firstChar] . mb_substr( $title, 1 ) );
+               if ( !$this->mungeTitle( $db, $source, $target ) ) {
+                       return false;
+               }
+
+               // "old → new", shared by every message below.
+               $label = "{$source->getPrefixedText()} → {$target->getPrefixedText()}";
+
+               $mover = new MovePage( $source, $target );
+               $validity = $mover->isValidMove();
+               if ( !$validity->isOK() ) {
+                       $this->error(
+                               "Invalid move $label: "
+                               . $validity->getMessage( false, false, 'en' )->useDatabase( false )->plain()
+                       );
+                       return false;
+               }
+
+               if ( !$this->run ) {
+                       $this->output( "Would rename $label\n" );
+                       return true;
+               }
+
+               $result = $mover->move( $this->user, $this->reason, false, $this->tags );
+               if ( $result->isOK() ) {
+                       return true;
+               }
+
+               $this->error(
+                       "Move $label failed: "
+                       . $result->getMessage( false, false, 'en' )->useDatabase( false )->plain()
+               );
+               return false;
+       }
+
+       /**
+        * Rename the title stored in a single database row by updating it in place
+        *
+        * Without --run the change is only reported.
+        *
+        * @param IDatabase $db Database handle
+        * @param string $table
+        * @param string|int $nsField Name of the namespace field, or a namespace
+        *  number when the table has no such field
+        * @param string $titleField
+        * @param object $row Row holding the selected fields; all of its fields are
+        *  used as the WHERE conditions of the update
+        * @return bool|null True on success, false on error, null if skipped
+        */
+       private function doUpdate( IDatabase $db, $table, $nsField, $titleField, $row ) {
+               $hasNsField = !is_int( $nsField );
+               $ns = $hasNsField ? (int)$row->$nsField : $nsField;
+               $title = $row->$titleField;
+
+               $firstChar = mb_substr( $title, 0, 1 );
+               if ( !array_key_exists( $firstChar, $this->charmap ) ) {
+                       $rowDump = json_encode( $row, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
+                       $this->error(
+                               "Query returned $rowDump, but title does not begin with a character in the charmap."
+                       );
+                       return false;
+               }
+
+               // Entries for pages of registered users are left alone.
+               if ( $this->isUserPage( $db, $ns, $title ) ) {
+                       $this->output( "... Skipping user page NS$ns $title\n" );
+                       return null;
+               }
+
+               $source = Title::makeTitle( $ns, $title );
+               $target = Title::makeTitle( $ns, $this->charmap[$firstChar] . mb_substr( $title, 1 ) );
+               if ( !$this->mungeTitle( $db, $source, $target ) ) {
+                       return false;
+               }
+
+               if ( !$this->run ) {
+                       $rowDump = json_encode( $row, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
+                       $this->output( "Would set $rowDump to {$target->getPrefixedText()}\n" );
+                       return true;
+               }
+
+               // Munging may have moved the title to another namespace, so the
+               // namespace field is written too when the table has one.
+               $newValues = [];
+               if ( $hasNsField ) {
+                       $newValues[$nsField] = $target->getNamespace();
+               }
+               $newValues[$titleField] = $target->getDBkey();
+
+               $db->update( $table, $newValues, (array)$row, __METHOD__ );
+
+               return true;
+       }
+
+       /**
+        * Rename the affected entries of one table
+        *
+        * For every namespace from getNamespaces() and every batch of LIKE
+        * conditions, matching rows are read in pages of the configured batch size
+        * and handed to doMove() or doUpdate(). With --run, progress is printed and
+        * replication is waited for after each page.
+        *
+        * @param IDatabase $db Database handle
+        * @param bool $doMove Whether to use MovePage or direct table manipulation
+        * @param string $table
+        * @param string|int $nsField Name of the namespace field, or a namespace
+        *  number when the table has no such field
+        * @param string $titleField
+        * @param string[] $pkFields Additional fields to match a unique index
+        *  starting with $nsField and $titleField.
+        */
+       private function processTable( IDatabase $db, $doMove, $table, $nsField, $titleField, $pkFields ) {
+               if ( $this->tables !== null && !in_array( $table, $this->tables, true ) ) {
+                       $this->output( "Skipping table `$table`, not in --tables.\n" );
+                       return;
+               }
+
+               $batchSize = $this->getBatchSize();
+               $namespaces = $this->getNamespaces();
+               $likes = $this->getLikeBatches( $db, $titleField );
+               $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+
+               // A table tied to one fixed namespace is only processed for that namespace.
+               if ( is_int( $nsField ) ) {
+                       $namespaces = array_intersect( $namespaces, [ $nsField ] );
+               }
+
+               if ( !$namespaces ) {
+                       $this->output( "Skipping table `$table`, no valid namespaces.\n" );
+                       return;
+               }
+
+               $this->output( "Processing table `$table`...\n" );
+
+               $selectFields = array_merge(
+                       is_int( $nsField ) ? [] : [ $nsField ],
+                       [ $titleField ],
+                       $pkFields
+               );
+               $orderBy = array_merge( [ $titleField ], $pkFields );
+               // Least significant field first, so the continuation condition can be
+               // built from the inside out.
+               $contFields = array_reverse( $orderBy );
+
+               $lastReplicationWait = 0.0;
+               $count = 0;
+               $errors = 0;
+               foreach ( $namespaces as $ns ) {
+                       foreach ( $likes as $like ) {
+                               $cont = [];
+                               do {
+                                       // For a fixed-namespace table "$nsField = $ns" compares the
+                                       // number with itself, which is always true.
+                                       $res = $db->select(
+                                               $table,
+                                               $selectFields,
+                                               array_merge( [ "$nsField = $ns", $like ], $cont ),
+                                               __METHOD__,
+                                               [ 'ORDER BY' => $orderBy, 'LIMIT' => $batchSize ]
+                                       );
+
+                                       $lastRow = null;
+                                       foreach ( $res as $row ) {
+                                               $lastRow = $row;
+
+                                               if ( $doMove ) {
+                                                       // Kept separate from $ns so the outer loop variable is not clobbered.
+                                                       $rowNs = is_int( $nsField ) ? $nsField : (int)$row->$nsField;
+                                                       $ret = $this->doMove( $db, $rowNs, $row->$titleField );
+                                               } else {
+                                                       $ret = $this->doUpdate( $db, $table, $nsField, $titleField, $row );
+                                               }
+                                               // A null result means "skipped" and is counted as neither.
+                                               if ( $ret === true ) {
+                                                       $count++;
+                                               } elseif ( $ret === false ) {
+                                                       $errors++;
+                                               }
+                                       }
+
+                                       // Continue after the last row seen: a row-wise "greater than"
+                                       // over the ORDER BY fields. An empty page ends this loop.
+                                       $cont = [];
+                                       if ( $lastRow !== null ) {
+                                               $cond = '';
+                                               foreach ( $contFields as $field ) {
+                                                       $v = $db->addQuotes( $lastRow->$field );
+                                                       if ( $cond === '' ) {
+                                                               $cond = "$field > $v";
+                                                       } else {
+                                                               $cond = "$field > $v OR $field = $v AND ($cond)";
+                                                       }
+                                               }
+                                               $cont = [ $cond ];
+                                       }
+
+                                       if ( $this->run ) {
+                                               $r = $lastRow !== null
+                                                       ? json_encode( $lastRow, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE )
+                                                       : '<end>';
+                                               $this->output( "... $table: $count renames, $errors errors at $r\n" );
+                                               $lbFactory->waitForReplication(
+                                                       [ 'timeout' => 30, 'ifWritesSince' => $lastReplicationWait ]
+                                               );
+                                               $lastReplicationWait = microtime( true );
+                                       }
+                               } while ( $cont );
+                       }
+               }
+
+               $this->output( "Done processing table `$table`.\n" );
+       }
+
+       /**
+        * Write the list of user names needing a rename to the --userlist file
+        *
+        * Each output line holds the current name and the new name, separated by
+        * a tab. Nothing is written when --userlist was not given.
+        *
+        * @param IDatabase $db Database handle
+        */
+       private function processUsers( IDatabase $db ) {
+               $listPath = $this->getOption( 'userlist' );
+               if ( $listPath === null ) {
+                       $this->output( "Not generating user list, --userlist was not specified.\n" );
+                       return;
+               }
+
+               $out = fopen( $listPath, 'wb' );
+               if ( !$out ) {
+                       $this->error( "Could not open user list file $listPath" );
+                       return;
+               }
+
+               $this->output( "Generating user list...\n" );
+               $limit = $this->getBatchSize();
+               $total = 0;
+               foreach ( $this->getLikeBatches( $db, 'user_name' ) as $like ) {
+                       // Last name of the previous page; null while on the first page.
+                       $after = null;
+                       for ( ; ; ) {
+                               $conds = [ $like ];
+                               if ( $after !== null ) {
+                                       $conds[] = 'user_name > ' . $db->addQuotes( $after );
+                               }
+                               $names = $db->selectFieldValues(
+                                       'user',
+                                       'user_name',
+                                       $conds,
+                                       __METHOD__,
+                                       [ 'ORDER BY' => 'user_name', 'LIMIT' => $limit ]
+                               );
+                               if ( !$names ) {
+                                       break;
+                               }
+
+                               $after = end( $names );
+                               foreach ( $names as $oldName ) {
+                                       $firstChar = mb_substr( $oldName, 0, 1 );
+                                       if ( array_key_exists( $firstChar, $this->charmap ) ) {
+                                               $renamed = $this->charmap[$firstChar] . mb_substr( $oldName, 1 );
+                                               fwrite( $out, "$oldName\t$renamed\n" );
+                                               $total++;
+                                       } else {
+                                               $this->error(
+                                                       "Query returned $oldName, but user name does not begin with a character in the charmap."
+                                               );
+                                       }
+                               }
+                               $this->output( "... at $after, $total names so far\n" );
+                       }
+               }
+
+               if ( !fclose( $out ) ) {
+                       $this->error( "fclose on $listPath failed" );
+               }
+               $this->output( "User list output to $listPath, $total users need renaming.\n" );
+       }
+}
+
+// Name the class implementing this script, then include the file named by
+// RUN_MAINTENANCE_IF_MAIN. NOTE(review): presumably that file runs $maintClass
+// when this script is the entry point; it is not visible here, so confirm.
+$maintClass = UppercaseTitlesForUnicodeTransition::class;
+require_once RUN_MAINTENANCE_IF_MAIN;