From 9ef8c9608d7aec920121671fcbe11e3f459f1946 Mon Sep 17 00:00:00 2001
From: Brad Jorsch
Date: Tue, 30 Apr 2019 11:33:06 -0400
Subject: [PATCH 1/1] maintenance: Script to rename titles for Unicode uppercasing changes

This uses MovePage where possible to try to better keep data in sync.
Archives, log entries, and so on can't do that though.

The script skips User and User_talk pages for registered users, as
renaming the users is more complicated than makes sense to try to
implement here. Use something like Extension:Renameuser to clean those
up; this script can provide a list of user names affected.

Bug: T219279
Change-Id: I157577cb5bedfd347b808c254fb19ae8088818ab
---
 autoload.php                                  |   1 +
 .../uppercaseTitlesForUnicodeTransition.php   | 595 ++++++++++++++++++
 2 files changed, 596 insertions(+)
 create mode 100644 maintenance/uppercaseTitlesForUnicodeTransition.php

diff --git a/autoload.php b/autoload.php
index 13037ff98e..856ce3adfb 100644
--- a/autoload.php
+++ b/autoload.php
@@ -1549,6 +1549,7 @@ $wgAutoloadLocalClasses = [
 	'UploadStashWrongOwnerException' => __DIR__ . '/includes/upload/exception/UploadStashWrongOwnerException.php',
 	'UploadStashZeroLengthFileException' => __DIR__ . '/includes/upload/exception/UploadStashZeroLengthFileException.php',
 	'UppercaseCollation' => __DIR__ . '/includes/collation/UppercaseCollation.php',
+	'UppercaseTitlesForUnicodeTransition' => __DIR__ . '/maintenance/uppercaseTitlesForUnicodeTransition.php',
 	'User' => __DIR__ . '/includes/user/User.php',
 	'UserArray' => __DIR__ . '/includes/user/UserArray.php',
 	'UserArrayFromResult' => __DIR__ . '/includes/user/UserArrayFromResult.php',
diff --git a/maintenance/uppercaseTitlesForUnicodeTransition.php b/maintenance/uppercaseTitlesForUnicodeTransition.php
new file mode 100644
index 0000000000..f5bafdeb83
--- /dev/null
+++ b/maintenance/uppercaseTitlesForUnicodeTransition.php
@@ -0,0 +1,595 @@
+addDescription(
+			"Rename titles when changing behavior of Language::ucfirst().\n"
+			. "\n"
+			. "This script skips User and User_talk pages for registered users, as renaming of users "
+			. "is too complex to try to implement here. Use something like Extension:Renameuser to "
+			. "clean those up; this script can provide a list of user names affected."
+		);
+		$this->addOption(
+			'charmap', 'Character map generated by maintenance/language/generateUcfirstOverrides.php',
+			true, true
+		);
+		$this->addOption(
+			'user', 'System user to use to do the renames. Default is "Maintenance script".', false, true
+		);
+		$this->addOption(
+			'steal',
+			'If the username specified by --user exists, specify this to force conversion to a system user.'
+		);
+		$this->addOption(
+			'run', 'If not specified, the script will not actually perform any moves (i.e. it will dry-run).'
+		);
+		$this->addOption(
+			'prefix', 'When the new title already exists, add this prefix.', false, true
+		);
+		$this->addOption(
+			'suffix', 'When the new title already exists, add this suffix.', false, true
+		);
+		$this->addOption( 'reason', 'Reason to use when moving pages.', false, true );
+		$this->addOption( 'tag', 'Change tag to apply when moving pages.', false, true );
+		$this->addOption( 'tables', 'Comma-separated list of database tables to process.', false, true );
+		$this->addOption(
+			'userlist', 'Filename to which to output usernames needing rename.', false, true
+		);
+		$this->setBatchSize( 1000 );
+	}
+
+	/**
+	 * Validate the command line options, load the character map, then
+	 * process each title-bearing table in turn and finally the user list.
+	 *
+	 * Without --run nothing is written to the database: the script only
+	 * reports what it would do.
+	 */
+	public function execute() {
+		$this->run = $this->getOption( 'run', false );
+
+		// A system user is only needed when moves are really performed
+		if ( $this->run ) {
+			$username = $this->getOption( 'user', 'Maintenance script' );
+			$steal = $this->getOption( 'steal', false );
+			$this->user = User::newSystemUser( $username, [ 'steal' => $steal ] );
+			if ( !$this->user ) {
+				$user = User::newFromName( $username );
+				if ( !$steal && $user && $user->isLoggedIn() ) {
+					$this->fatalError( "User $username already exists.\n"
+						. "Use --steal if you really want to steal it from the human who currently owns it."
+					);
+				}
+				$this->fatalError( "Could not obtain system user $username." );
+			}
+		}
+
+		$tables = $this->getOption( 'tables' );
+		if ( $tables !== null ) {
+			// Tolerate whitespace around the commas ("page, image") and stray
+			// empty entries; processTable() compares the names strictly.
+			$this->tables = array_values(
+				array_filter( array_map( 'trim', explode( ',', $tables ) ), 'strlen' )
+			);
+		}
+
+		$prefix = $this->getOption( 'prefix' );
+		if ( $prefix !== null ) {
+			// Parse the prefix with a dummy 'X' appended so that a bare
+			// "Namespace:" prefix still yields a title to inspect.
+			$title = Title::newFromText( $prefix . 'X' );
+			if ( !$title || substr( $title->getDBkey(), -1 ) !== 'X' ) {
+				$this->fatalError( 'Invalid --prefix.' );
+			}
+			if ( $title->getNamespace() <= NS_MAIN || $title->isExternal() ) {
+				$this->fatalError( 'Invalid --prefix. It must not be in namespace 0 and must not be external' );
+			}
+			$this->prefixNs = $title->getNamespace();
+			$this->prefix = substr( $title->getText(), 0, -1 );
+		}
+		$this->suffix = $this->getOption( 'suffix' );
+
+		$this->reason = $this->getOption( 'reason' ) ?: $this->reason;
+		$this->tags = (array)$this->getOption( 'tag', null );
+
+		$charmapFile = $this->getOption( 'charmap' );
+		if ( !file_exists( $charmapFile ) ) {
+			$this->fatalError( "Charmap file $charmapFile does not exist." );
+		}
+		if ( !is_file( $charmapFile ) || !is_readable( $charmapFile ) ) {
+			$this->fatalError( "Charmap file $charmapFile is not readable." );
+		}
+		// The charmap is a PHP file that returns an array; it is executed here.
+		$this->charmap = require $charmapFile;
+		if ( !is_array( $this->charmap ) ) {
+			$this->fatalError( "Charmap file $charmapFile did not return a PHP array." );
+		}
+		// Keep only single-character keys that map to something different
+		$this->charmap = array_filter(
+			$this->charmap,
+			function ( $v, $k ) {
+				if ( mb_strlen( $k ) !== 1 ) {
+					$this->error( "Ignoring mapping from multi-character key '$k' to '$v'" );
+					return false;
+				}
+				return $k !== $v;
+			},
+			ARRAY_FILTER_USE_BOTH
+		);
+		if ( !$this->charmap ) {
+			$this->fatalError( "Charmap file $charmapFile did not contain any usable character mappings." );
+		}
+
+		$db = $this->getDB( $this->run ? DB_MASTER : DB_REPLICA );
+		$this->processTable( $db, true, 'page', 'page_namespace', 'page_title', [ 'page_id' ] );
+		$this->processTable( $db, true, 'image', NS_FILE, 'img_name', [] );
+		$this->processTable(
+			$db, false, 'archive', 'ar_namespace', 'ar_title', [ 'ar_timestamp', 'ar_id' ]
+		);
+		$this->processTable( $db, false, 'filearchive', NS_FILE, 'fa_name', [ 'fa_timestamp', 'fa_id' ] );
+		$this->processTable( $db, false, 'logging', 'log_namespace', 'log_title', [ 'log_id' ] );
+		$this->processTable( $db, false, 'redirect', 'rd_namespace', 'rd_title', [ 'rd_from' ] );
+		$this->processTable( $db, false, 'protected_titles', 'pt_namespace', 'pt_title', [] );
+		$this->processUsers( $db );
+	}
+
+	/**
+	 * Get batched LIKE conditions from the charmap
+	 *
+	 * Each returned element is one SQL condition that ORs together up to
+	 * $batchSize "field LIKE '<char>%'" tests, one per charmap key.
+	 *
+	 * @param IDatabase $db Database handle
+	 * @param string $field Field name
+	 * @param int $batchSize Size of the batches
+	 * @return array
+	 */
+	private function getLikeBatches( IDatabase $db, $field, $batchSize = 100 ) {
+		$ret = [];
+		$likes = [];
+		foreach ( array_keys( $this->charmap ) as $from ) {
+			$likes[] = $field . $db->buildLike( $from, $db->anyString() );
+			if ( count( $likes ) >= $batchSize ) {
+				$ret[] = $db->makeList( $likes, $db::LIST_OR );
+				$likes = [];
+			}
+		}
+		if ( $likes ) {
+			$ret[] = $db->makeList( $likes, $db::LIST_OR );
+		}
+		return $ret;
+	}
+
+	/**
+	 * Get the list of namespaces to operate on
+	 *
+	 * We only care about namespaces where we can move pages and titles are
+	 * capitalized. The list is computed once and cached in $this->namespaces.
+	 *
+	 * @return int[]
+	 */
+	private function getNamespaces() {
+		if ( $this->namespaces === null ) {
+			$nsinfo = MediaWikiServices::getInstance()->getNamespaceInfo();
+			$this->namespaces = array_filter(
+				array_keys( $nsinfo->getCanonicalNamespaces() ),
+				function ( $ns ) use ( $nsinfo ) {
+					return $nsinfo->isMovable( $ns ) && $nsinfo->isCapitalized( $ns );
+				}
+			);
+			usort( $this->namespaces, function ( $ns1, $ns2 ) use ( $nsinfo ) {
+				if ( $ns1 === $ns2 ) {
+					return 0;
+				}
+
+				$s1 = $nsinfo->getSubject( $ns1 );
+				$s2 = $nsinfo->getSubject( $ns2 );
+
+				// Order by subject namespace number first
+				if ( $s1 !== $s2 ) {
+					return $s1 < $s2 ? -1 : 1;
+				}
+
+				// Second, put subject namespaces before non-subject namespaces
+				if ( $s1 === $ns1 ) {
+					return -1;
+				}
+				if ( $s2 === $ns2 ) {
+					return 1;
+				}
+
+				// Don't care about the relative order if there are somehow
+				// multiple non-subject namespaces for a namespace.
+				return 0;
+			} );
+		}
+
+		return $this->namespaces;
+	}
+
+	/**
+	 * Check if a ns+title is a registered user's page
+	 *
+	 * Only the part of the title before the first '/' is looked up, and the
+	 * answer is cached per base name in $this->seenUsers.
+	 *
+	 * @param IDatabase $db Database handle
+	 * @param int $ns
+	 * @param string $title
+	 * @return bool
+	 */
+	private function isUserPage( IDatabase $db, $ns, $title ) {
+		if ( $ns !== NS_USER && $ns !== NS_USER_TALK ) {
+			return false;
+		}
+
+		list( $base ) = explode( '/', $title, 2 );
+		if ( !isset( $this->seenUsers[$base] ) ) {
+			// Can't use User directly because it might uppercase the name
+			$this->seenUsers[$base] = (bool)$db->selectField(
+				'user',
+				'user_id',
+				[ 'user_name' => strtr( $base, '_', ' ' ) ],
+				__METHOD__
+			);
+		}
+		return $this->seenUsers[$base];
+	}
+
+	/**
+	 * Munge a target title, if necessary
+	 *
+	 * The target needs munging when it belongs to a registered user or when
+	 * MovePage reports 'articleexists'. In that case $newTitle is replaced by
+	 * a title built from --prefix and/or --suffix; the replacement must be
+	 * valid and must not exist itself.
+	 *
+	 * @param IDatabase $db Database handle
+	 * @param Title $oldTitle
+	 * @param Title &$newTitle
+	 * @return bool If $newTitle is (now) ok
+	 */
+	private function mungeTitle( IDatabase $db, Title $oldTitle, Title &$newTitle ) {
+		// Remember the unmunged target for error messages
+		$nt = $newTitle->getPrefixedText();
+
+		$munge = false;
+		if ( $this->isUserPage( $db, $newTitle->getNamespace(), $newTitle->getText() ) ) {
+			$munge = 'Target title\'s user exists';
+		} else {
+			$mp = new MovePage( $oldTitle, $newTitle );
+			$status = $mp->isValidMove();
+			if ( !$status->isOK() && $status->hasMessage( 'articleexists' ) ) {
+				$munge = 'Target title exists';
+			}
+		}
+		if ( !$munge ) {
+			return true;
+		}
+
+		if ( $this->prefix !== null ) {
+			// With a prefix the munged title is derived from the *old* title
+			$newTitle = Title::makeTitle(
+				$this->prefixNs,
+				$this->prefix . $oldTitle->getPrefixedText() . ( $this->suffix ?? '' )
+			);
+		} elseif ( $this->suffix !== null ) {
+			$newTitle = Title::makeTitle( $newTitle->getNamespace(), $newTitle->getText() . $this->suffix );
+		} else {
+			$this->error(
+				"Cannot move {$oldTitle->getPrefixedText()} → $nt: "
+				. "$munge and no --prefix or --suffix was given"
+			);
+			return false;
+		}
+
+		if ( !$newTitle->isValid() ) {
+			$this->error(
+				"Cannot move {$oldTitle->getPrefixedText()} → $nt: "
+				. "$munge and munged title '{$newTitle->getPrefixedText()}' is not valid"
+			);
+			return false;
+		}
+		if ( $newTitle->exists() ) {
+			$this->error(
+				"Cannot move {$oldTitle->getPrefixedText()} → $nt: "
+				. "$munge and munged title '{$newTitle->getPrefixedText()}' also exists"
+			);
+			return false;
+		}
+
+		return true;
+	}
+
+	/**
+	 * Use MovePage to move a title
+	 *
+	 * In dry-run mode the move is validated and reported but not performed.
+	 *
+	 * @param IDatabase $db Database handle
+	 * @param int $ns
+	 * @param string $title
+	 * @return bool|null True on success, false on error, null if skipped
+	 */
+	private function doMove( IDatabase $db, $ns, $title ) {
+		$char = mb_substr( $title, 0, 1 );
+		if ( !array_key_exists( $char, $this->charmap ) ) {
+			$this->error(
+				"Query returned NS$ns $title, which does not begin with a character in the charmap."
+			);
+			return false;
+		}
+
+		if ( $this->isUserPage( $db, $ns, $title ) ) {
+			$this->output( "... Skipping user page NS$ns $title\n" );
+			return null;
+		}
+
+		$oldTitle = Title::makeTitle( $ns, $title );
+		$newTitle = Title::makeTitle( $ns, $this->charmap[$char] . mb_substr( $title, 1 ) );
+		if ( !$this->mungeTitle( $db, $oldTitle, $newTitle ) ) {
+			return false;
+		}
+
+		// mungeTitle() may have replaced $newTitle, so validate the final pair
+		$mp = new MovePage( $oldTitle, $newTitle );
+		$status = $mp->isValidMove();
+		if ( !$status->isOK() ) {
+			$this->error(
+				"Invalid move {$oldTitle->getPrefixedText()} → {$newTitle->getPrefixedText()}: "
+				. $status->getMessage( false, false, 'en' )->useDatabase( false )->plain()
+			);
+			return false;
+		}
+
+		if ( !$this->run ) {
+			$this->output(
+				"Would rename {$oldTitle->getPrefixedText()} → {$newTitle->getPrefixedText()}\n"
+			);
+			return true;
+		}
+
+		$status = $mp->move( $this->user, $this->reason, false, $this->tags );
+		if ( !$status->isOK() ) {
+			$this->error(
+				"Move {$oldTitle->getPrefixedText()} → {$newTitle->getPrefixedText()} failed: "
+				. $status->getMessage( false, false, 'en' )->useDatabase( false )->plain()
+			);
+		}
+		return $status->isOK();
+	}
+
+	/**
+	 * Directly update a database row
+	 *
+	 * The selected row itself is used as the WHERE condition of the update.
+	 *
+	 * @param IDatabase $db Database handle
+	 * @param string $table
+	 * @param string|int $nsField Namespace field name, or a constant namespace
+	 * @param string $titleField
+	 * @param object $row Row as selected by processTable()
+	 * @return bool|null True on success, false on error, null if skipped
+	 */
+	private function doUpdate( IDatabase $db, $table, $nsField, $titleField, $row ) {
+		$ns = is_int( $nsField ) ? $nsField : (int)$row->$nsField;
+		$title = $row->$titleField;
+
+		$char = mb_substr( $title, 0, 1 );
+		if ( !array_key_exists( $char, $this->charmap ) ) {
+			$r = json_encode( $row, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
+			$this->error(
+				"Query returned $r, but title does not begin with a character in the charmap."
+			);
+			return false;
+		}
+
+		if ( $this->isUserPage( $db, $ns, $title ) ) {
+			$this->output( "... Skipping user page NS$ns $title\n" );
+			return null;
+		}
+
+		$oldTitle = Title::makeTitle( $ns, $title );
+		$newTitle = Title::makeTitle( $ns, $this->charmap[$char] . mb_substr( $title, 1 ) );
+		if ( !$this->mungeTitle( $db, $oldTitle, $newTitle ) ) {
+			return false;
+		}
+
+		if ( $this->run ) {
+			$db->update(
+				$table,
+				array_merge(
+					// Munging with --prefix can change the namespace too
+					is_int( $nsField ) ? [] : [ $nsField => $newTitle->getNamespace() ],
+					[ $titleField => $newTitle->getDBkey() ]
+				),
+				(array)$row,
+				__METHOD__
+			);
+		} else {
+			$r = json_encode( $row, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
+			$this->output( "Would set $r to {$newTitle->getPrefixedText()}\n" );
+		}
+
+		return true;
+	}
+
+	/**
+	 * Rename entries in other tables
+	 *
+	 * Rows are fetched per namespace and per LIKE batch, in pages of the
+	 * configured batch size, ordered by $titleField then $pkFields.
+	 *
+	 * @param IDatabase $db Database handle
+	 * @param bool $doMove Whether to use MovePage or direct table manipulation
+	 * @param string $table
+	 * @param string|int $nsField Namespace field name, or a constant namespace
+	 * @param string $titleField
+	 * @param string[] $pkFields Additional fields to match a unique index
+	 *  starting with $nsField and $titleField.
+	 */
+	private function processTable( IDatabase $db, $doMove, $table, $nsField, $titleField, $pkFields ) {
+		if ( $this->tables !== null && !in_array( $table, $this->tables, true ) ) {
+			$this->output( "Skipping table `$table`, not in --tables.\n" );
+			return;
+		}
+
+		$batchSize = $this->getBatchSize();
+		$namespaces = $this->getNamespaces();
+		$likes = $this->getLikeBatches( $db, $titleField );
+		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+
+		if ( is_int( $nsField ) ) {
+			$namespaces = array_intersect( $namespaces, [ $nsField ] );
+		}
+
+		if ( !$namespaces ) {
+			$this->output( "Skipping table `$table`, no valid namespaces.\n" );
+			return;
+		}
+
+		$this->output( "Processing table `$table`...\n" );
+
+		$selectFields = array_merge(
+			is_int( $nsField ) ? [] : [ $nsField ],
+			[ $titleField ],
+			$pkFields
+		);
+		$orderFields = array_merge( [ $titleField ], $pkFields );
+		// The continuation condition is built from the innermost sort field out
+		$contFields = array_reverse( $orderFields );
+
+		$lastReplicationWait = 0.0;
+		$count = 0;
+		$errors = 0;
+		foreach ( $namespaces as $ns ) {
+			// A table with a constant namespace has no field to filter on;
+			// $namespaces was already narrowed to that namespace above.
+			$nsCond = is_int( $nsField ) ? [] : [ $nsField => $ns ];
+
+			foreach ( $likes as $like ) {
+				$cont = [];
+				do {
+					$res = $db->select(
+						$table,
+						$selectFields,
+						array_merge( $nsCond, [ $like ], $cont ),
+						__METHOD__,
+						[ 'ORDER BY' => $orderFields, 'LIMIT' => $batchSize ]
+					);
+
+					$lastRow = null;
+					foreach ( $res as $row ) {
+						$lastRow = $row;
+
+						if ( $doMove ) {
+							// Every row of this query is in namespace $ns
+							$ret = $this->doMove( $db, $ns, $row->$titleField );
+						} else {
+							$ret = $this->doUpdate( $db, $table, $nsField, $titleField, $row );
+						}
+						if ( $ret === true ) {
+							$count++;
+						} elseif ( $ret === false ) {
+							$errors++;
+						}
+					}
+
+					// Continue after the last row seen; an empty batch ends the loop
+					$cont = [];
+					if ( $lastRow !== null ) {
+						$expr = '';
+						foreach ( $contFields as $field ) {
+							$v = $db->addQuotes( $lastRow->$field );
+							$expr = $expr === ''
+								? "$field > $v"
+								: "$field > $v OR $field = $v AND ($expr)";
+						}
+						$cont = [ $expr ];
+					}
+
+					if ( $this->run ) {
+						$r = $lastRow !== null
+							? json_encode( $lastRow, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE )
+							: '';
+						$this->output( "... $table: $count renames, $errors errors at $r\n" );
+						$lbFactory->waitForReplication(
+							[ 'timeout' => 30, 'ifWritesSince' => $lastReplicationWait ]
+						);
+						$lastReplicationWait = microtime( true );
+					}
+				} while ( $cont );
+			}
+		}
+
+		$this->output( "Done processing table `$table`.\n" );
+	}
+
+	/**
+	 * List users needing renaming
+	 *
+	 * Writes one "old name<TAB>new name" line per affected user to the file
+	 * given by --userlist. Does nothing when --userlist was not specified.
+	 *
+	 * @param IDatabase $db Database handle
+	 */
+	private function processUsers( IDatabase $db ) {
+		$userlistFile = $this->getOption( 'userlist' );
+		if ( $userlistFile === null ) {
+			$this->output( "Not generating user list, --userlist was not specified.\n" );
+			return;
+		}
+
+		$fh = fopen( $userlistFile, 'wb' );
+		if ( !$fh ) {
+			$this->error( "Could not open user list file $userlistFile" );
+			return;
+		}
+
+		$this->output( "Generating user list...\n" );
+		$count = 0;
+		$batchSize = $this->getBatchSize();
+		foreach ( $this->getLikeBatches( $db, 'user_name' ) as $like ) {
+			$cont = [];
+			while ( true ) {
+				$names = $db->selectFieldValues(
+					'user',
+					'user_name',
+					array_merge( [ $like ], $cont ),
+					__METHOD__,
+					[ 'ORDER BY' => 'user_name', 'LIMIT' => $batchSize ]
+				);
+				if ( !$names ) {
+					break;
+				}
+
+				$last = end( $names );
+				$cont = [ 'user_name > ' . $db->addQuotes( $last ) ];
+				foreach ( $names as $name ) {
+					$char = mb_substr( $name, 0, 1 );
+					if ( !array_key_exists( $char, $this->charmap ) ) {
+						$this->error(
+							"Query returned $name, but user name does not begin with a character in the charmap."
+						);
+						continue;
+					}
+					$newName = $this->charmap[$char] . mb_substr( $name, 1 );
+					$line = "$name\t$newName\n";
+					// Stop on a failed or short write rather than report an
+					// incomplete list as if it were complete.
+					if ( fwrite( $fh, $line ) !== strlen( $line ) ) {
+						$this->error( "Failed writing to user list file $userlistFile" );
+						fclose( $fh );
+						return;
+					}
+					$count++;
+				}
+				$this->output( "... at $last, $count names so far\n" );
+			}
+		}
+
+		if ( !fclose( $fh ) ) {
+			$this->error( "fclose on $userlistFile failed" );
+		}
+		$this->output( "User list output to $userlistFile, $count users need renaming.\n" );
+	}
+}
+
+$maintClass = UppercaseTitlesForUnicodeTransition::class;
+require_once RUN_MAINTENANCE_IF_MAIN;
-- 
2.20.1