From: jenkins-bot Date: Wed, 10 Jul 2019 06:42:16 +0000 (+0000) Subject: Merge "maintenance: Script to rename titles for Unicode uppercasing changes" X-Git-Tag: 1.34.0-rc.0~1103 X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=commitdiff_plain;h=04d1aa3033f40a38d721f7f0e88b5bac440d2869;hp=8763879da660e66038686da0fa63125988589db9 Merge "maintenance: Script to rename titles for Unicode uppercasing changes" --- diff --git a/autoload.php b/autoload.php index 218c244a53..9f9f1a6b52 100644 --- a/autoload.php +++ b/autoload.php @@ -1560,6 +1560,7 @@ $wgAutoloadLocalClasses = [ 'UploadStashWrongOwnerException' => __DIR__ . '/includes/upload/exception/UploadStashWrongOwnerException.php', 'UploadStashZeroLengthFileException' => __DIR__ . '/includes/upload/exception/UploadStashZeroLengthFileException.php', 'UppercaseCollation' => __DIR__ . '/includes/collation/UppercaseCollation.php', + 'UppercaseTitlesForUnicodeTransition' => __DIR__ . '/maintenance/uppercaseTitlesForUnicodeTransition.php', 'User' => __DIR__ . '/includes/user/User.php', 'UserArray' => __DIR__ . '/includes/user/UserArray.php', 'UserArrayFromResult' => __DIR__ . '/includes/user/UserArrayFromResult.php', diff --git a/maintenance/uppercaseTitlesForUnicodeTransition.php b/maintenance/uppercaseTitlesForUnicodeTransition.php new file mode 100644 index 0000000000..f5bafdeb83 --- /dev/null +++ b/maintenance/uppercaseTitlesForUnicodeTransition.php @@ -0,0 +1,595 @@ +addDescription( + "Rename titles when changing behavior of Language::ucfirst().\n" + . "\n" + . "This script skips User and User_talk pages for registered users, as renaming of users " + . "is too complex to try to implement here. Use something like Extension:Renameuser to " + . "clean those up; this script can provide a list of user names affected." 
// Closes the addDescription() call opened on the preceding line (the enclosing method's header is not visible in this chunk).
		);
		$this->addOption(
			'charmap', 'Character map generated by maintenance/language/generateUcfirstOverrides.php',
			true, true
		);
		$this->addOption(
			'user', 'System user to use to do the renames. Default is "Maintenance script".', false, true
		);
		$this->addOption(
			'steal',
			'If the username specified by --user exists, specify this to force conversion to a system user.'
		);
		$this->addOption(
			'run', 'If not specified, the script will not actually perform any moves (i.e. it will dry-run).'
		);
		$this->addOption(
			'prefix', 'When the new title already exists, add this prefix.', false, true
		);
		$this->addOption(
			'suffix', 'When the new title already exists, add this suffix.', false, true
		);
		$this->addOption( 'reason', 'Reason to use when moving pages.', false, true );
		$this->addOption( 'tag', 'Change tag to apply when moving pages.', false, true );
		$this->addOption( 'tables', 'Comma-separated list of database tables to process.', false, true );
		$this->addOption(
			'userlist', 'Filename to which to output usernames needing rename.', false, true
		);
		$this->setBatchSize( 1000 );
	}

	/**
	 * Script entry point.
	 *
	 * Validates the options, loads and filters the character map, then walks
	 * every title-bearing table and finally generates the user list.
	 *
	 * Without --run no system user is created, the replica connection is used
	 * and doMove()/doUpdate() only report what they would do.
	 */
	public function execute() {
		$this->run = $this->getOption( 'run', false );

		if ( $this->run ) {
			$username = $this->getOption( 'user', 'Maintenance script' );
			$steal = $this->getOption( 'steal', false );
			$this->user = User::newSystemUser( $username, [ 'steal' => $steal ] );
			if ( !$this->user ) {
				// Give a more helpful message when the failure is caused by an
				// existing, non-system account of that name.
				$user = User::newFromName( $username );
				if ( !$steal && $user && $user->isLoggedIn() ) {
					$this->fatalError( "User $username already exists.\n"
						. "Use --steal if you really want to steal it from the human who currently owns it."
					);
				}
				$this->fatalError( "Could not obtain system user $username." );
			}
		}

		$tables = $this->getOption( 'tables' );
		if ( $tables !== null ) {
			// Trim each entry so that "page, image" selects both tables; the
			// later in_array() check is strict and would not match " image".
			$this->tables = array_map( 'trim', explode( ',', $tables ) );
		}

		$prefix = $this->getOption( 'prefix' );
		if ( $prefix !== null ) {
			// Append a sentinel character and parse the result as a title: the
			// prefix is usable only if the sentinel survives at the end of the key.
			$title = Title::newFromText( $prefix . 'X' );
			if ( !$title || substr( $title->getDBkey(), -1 ) !== 'X' ) {
				$this->fatalError( 'Invalid --prefix.' );
			}
			if ( $title->getNamespace() <= NS_MAIN || $title->isExternal() ) {
				$this->fatalError( 'Invalid --prefix. It must not be in namespace 0 and must not be external' );
			}
			$this->prefixNs = $title->getNamespace();
			// Strip the sentinel again.
			$this->prefix = substr( $title->getText(), 0, -1 );
		}
		$this->suffix = $this->getOption( 'suffix' );

		$this->reason = $this->getOption( 'reason' ) ?: $this->reason;
		$this->tags = (array)$this->getOption( 'tag', null );

		$charmapFile = $this->getOption( 'charmap' );
		if ( !file_exists( $charmapFile ) ) {
			$this->fatalError( "Charmap file $charmapFile does not exist." );
		}
		if ( !is_file( $charmapFile ) || !is_readable( $charmapFile ) ) {
			$this->fatalError( "Charmap file $charmapFile is not readable." );
		}
		// The charmap is a PHP file that returns an array of from => to characters.
		$this->charmap = require $charmapFile;
		if ( !is_array( $this->charmap ) ) {
			$this->fatalError( "Charmap file $charmapFile did not return a PHP array." );
		}
		// Keep only single-character keys whose mapping actually changes the character.
		$this->charmap = array_filter(
			$this->charmap,
			function ( $v, $k ) {
				if ( mb_strlen( $k ) !== 1 ) {
					$this->error( "Ignoring mapping from multi-character key '$k' to '$v'" );
					return false;
				}
				return $k !== $v;
			},
			ARRAY_FILTER_USE_BOTH
		);
		if ( !$this->charmap ) {
			$this->fatalError( "Charmap file $charmapFile did not contain any usable character mappings." );
		}

		$db = $this->getDB( $this->run ? DB_MASTER : DB_REPLICA );
		// 'page' and 'image' go through MovePage; the remaining tables are updated directly.
		$this->processTable( $db, true, 'page', 'page_namespace', 'page_title', [ 'page_id' ] );
		$this->processTable( $db, true, 'image', NS_FILE, 'img_name', [] );
		$this->processTable(
			$db, false, 'archive', 'ar_namespace', 'ar_title', [ 'ar_timestamp', 'ar_id' ]
		);
		$this->processTable( $db, false, 'filearchive', NS_FILE, 'fa_name', [ 'fa_timestamp', 'fa_id' ] );
		$this->processTable( $db, false, 'logging', 'log_namespace', 'log_title', [ 'log_id' ] );
		$this->processTable( $db, false, 'redirect', 'rd_namespace', 'rd_title', [ 'rd_from' ] );
		$this->processTable( $db, false, 'protected_titles', 'pt_namespace', 'pt_title', [] );
		$this->processUsers( $db );
	}

	/**
	 * Get batched LIKE conditions from the charmap
	 *
	 * Each returned element is an OR-list of at most $batchSize conditions of
	 * the form "$field LIKE '<char>%'", one per source character in the charmap.
	 *
	 * @param IDatabase $db Database handle
	 * @param string $field Field name
	 * @param int $batchSize Size of the batches
	 * @return array
	 */
	private function getLikeBatches( IDatabase $db, $field, $batchSize = 100 ) {
		$ret = [];
		$likes = [];
		foreach ( $this->charmap as $from => $to ) {
			$likes[] = $field . $db->buildLike( $from, $db->anyString() );
			if ( count( $likes ) >= $batchSize ) {
				$ret[] = $db->makeList( $likes, $db::LIST_OR );
				$likes = [];
			}
		}
		// Flush the final, partially filled batch.
		if ( $likes ) {
			$ret[] = $db->makeList( $likes, $db::LIST_OR );
		}
		return $ret;
	}

	/**
	 * Get the list of namespaces to operate on
	 *
	 * We only care about namespaces where we can move pages and titles are
	 * capitalized. The list is computed once and cached in $this->namespaces,
	 * ordered by subject namespace with each subject namespace ahead of its
	 * non-subject counterpart.
	 *
	 * @return int[]
	 */
	private function getNamespaces() {
		if ( $this->namespaces === null ) {
			$nsinfo = MediaWikiServices::getInstance()->getNamespaceInfo();
			$this->namespaces = array_filter(
				array_keys( $nsinfo->getCanonicalNamespaces() ),
				function ( $ns ) use ( $nsinfo ) {
					return $nsinfo->isMovable( $ns ) && $nsinfo->isCapitalized( $ns );
				}
			);
			usort( $this->namespaces, function ( $ns1, $ns2 ) use ( $nsinfo ) {
				if ( $ns1 === $ns2 ) {
					return 0;
				}

				$s1 = $nsinfo->getSubject( $ns1 );
				$s2 = $nsinfo->getSubject( $ns2 );

				// Order by subject namespace number first
				if ( $s1 !== $s2 ) {
					return $s1 < $s2 ? -1 : 1;
				}

				// Second, put subject namespaces before non-subject namespaces
				if ( $s1 === $ns1 ) {
					return -1;
				}
				if ( $s2 === $ns2 ) {
					return 1;
				}

				// Don't care about the relative order if there are somehow
				// multiple non-subject namespaces for a namespace.
				return 0;
			} );
		}

		return $this->namespaces;
	}

	/**
	 * Check if a ns+title is a registered user's page
	 *
	 * Only NS_USER and NS_USER_TALK can match. The part of the title before
	 * the first '/' is looked up in the user table, and the answer is cached
	 * per base name in $this->seenUsers.
	 *
	 * @param IDatabase $db Database handle
	 * @param int $ns
	 * @param string $title
	 * @return bool
	 */
	private function isUserPage( IDatabase $db, $ns, $title ) {
		if ( $ns !== NS_USER && $ns !== NS_USER_TALK ) {
			return false;
		}

		list( $base ) = explode( '/', $title, 2 );
		if ( !isset( $this->seenUsers[$base] ) ) {
			// Can't use User directly because it might uppercase the name
			$this->seenUsers[$base] = (bool)$db->selectField(
				'user',
				'user_id',
				[ 'user_name' => strtr( $base, '_', ' ' ) ],
				__METHOD__
			);
		}
		return $this->seenUsers[$base];
	}

	/**
	 * Munge a target title, if necessary
	 *
	 * The target needs munging when it belongs to a registered user or when
	 * MovePage reports 'articleexists'. In that case $newTitle is replaced by
	 * a title built from --prefix (plus --suffix, if given) and the old
	 * prefixed title, or else by the target with --suffix appended. If neither
	 * option was given, or the replacement is invalid or already exists, an
	 * error is reported.
	 *
	 * @param IDatabase $db Database handle
	 * @param Title $oldTitle
	 * @param Title &$newTitle
	 * @return bool If $newTitle is (now) ok
	 */
	private function mungeTitle( IDatabase $db, Title $oldTitle, Title &$newTitle ) {
		// Remember the originally intended target for the error messages below.
		$nt = $newTitle->getPrefixedText();

		$munge = false;
		if ( $this->isUserPage( $db, $newTitle->getNamespace(), $newTitle->getText() ) ) {
			$munge = 'Target title\'s user exists';
		} else {
			$mp = new MovePage( $oldTitle, $newTitle );
			$status = $mp->isValidMove();
			if ( !$status->isOK() && $status->hasMessage( 'articleexists' ) ) {
				$munge = 'Target title exists';
			}
		}
		if ( !$munge ) {
			return true;
		}

		if ( $this->prefix !== null ) {
			$newTitle = Title::makeTitle(
				$this->prefixNs,
				$this->prefix . $oldTitle->getPrefixedText() . ( $this->suffix ?? '' )
			);
		} elseif ( $this->suffix !== null ) {
			$newTitle = Title::makeTitle( $newTitle->getNamespace(), $newTitle->getText() . $this->suffix );
		} else {
			$this->error(
				"Cannot move {$oldTitle->getPrefixedText()} → $nt: "
				. "$munge and no --prefix or --suffix was given"
			);
			return false;
		}

		if ( !$newTitle->isValid() ) {
			$this->error(
				"Cannot move {$oldTitle->getPrefixedText()} → $nt: "
				. "$munge and munged title '{$newTitle->getPrefixedText()}' is not valid"
			);
			return false;
		}
		if ( $newTitle->exists() ) {
			$this->error(
				"Cannot move {$oldTitle->getPrefixedText()} → $nt: "
				. "$munge and munged title '{$newTitle->getPrefixedText()}' also exists"
			);
			return false;
		}

		return true;
	}

	/**
	 * Use MovePage to move a title
	 *
	 * The target is the title with its first character replaced per the
	 * charmap, munged by mungeTitle() if needed. In dry-run mode the move is
	 * validated and reported but not performed.
	 *
	 * @param IDatabase $db Database handle
	 * @param int $ns
	 * @param string $title
	 * @return bool|null True on success, false on error, null if skipped
	 */
	private function doMove( IDatabase $db, $ns, $title ) {
		$char = mb_substr( $title, 0, 1 );
		if ( !array_key_exists( $char, $this->charmap ) ) {
			$this->error(
				"Query returned NS$ns $title, which does not begin with a character in the charmap."
			);
			return false;
		}

		if ( $this->isUserPage( $db, $ns, $title ) ) {
			$this->output( "... Skipping user page NS$ns $title\n" );
			return null;
		}

		$oldTitle = Title::makeTitle( $ns, $title );
		$newTitle = Title::makeTitle( $ns, $this->charmap[$char] . mb_substr( $title, 1 ) );
		if ( !$this->mungeTitle( $db, $oldTitle, $newTitle ) ) {
			return false;
		}

		// mungeTitle() may have replaced $newTitle, so validate the final pair.
		$mp = new MovePage( $oldTitle, $newTitle );
		$status = $mp->isValidMove();
		if ( !$status->isOK() ) {
			$this->error(
				"Invalid move {$oldTitle->getPrefixedText()} → {$newTitle->getPrefixedText()}: "
				. $status->getMessage( false, false, 'en' )->useDatabase( false )->plain()
			);
			return false;
		}

		if ( !$this->run ) {
			$this->output(
				"Would rename {$oldTitle->getPrefixedText()} → {$newTitle->getPrefixedText()}\n"
			);
			return true;
		}

		$status = $mp->move( $this->user, $this->reason, false, $this->tags );
		if ( !$status->isOK() ) {
			$this->error(
				"Move {$oldTitle->getPrefixedText()} → {$newTitle->getPrefixedText()} failed: "
				. $status->getMessage( false, false, 'en' )->useDatabase( false )->plain()
			);
		}
		return $status->isOK();
	}

	/**
	 * Directly update a database row
	 *
	 * The selected row itself is used as the WHERE condition of the update.
	 * In dry-run mode the change is only reported.
	 *
	 * @param IDatabase $db Database handle
	 * @param string $table
	 * @param string|int $nsField Namespace column name, or a fixed namespace number
	 * @param string $titleField
	 * @param object $row Result row holding the selected namespace, title and key fields
	 * @return bool|null True on success, false on error, null if skipped
	 */
	private function doUpdate( IDatabase $db, $table, $nsField, $titleField, $row ) {
		$ns = is_int( $nsField ) ? $nsField : (int)$row->$nsField;
		$title = $row->$titleField;

		$char = mb_substr( $title, 0, 1 );
		if ( !array_key_exists( $char, $this->charmap ) ) {
			$r = json_encode( $row, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
			$this->error(
				"Query returned $r, but title does not begin with a character in the charmap."
			);
			return false;
		}

		if ( $this->isUserPage( $db, $ns, $title ) ) {
			$this->output( "... Skipping user page NS$ns $title\n" );
			return null;
		}

		$oldTitle = Title::makeTitle( $ns, $title );
		$newTitle = Title::makeTitle( $ns, $this->charmap[$char] . mb_substr( $title, 1 ) );
		if ( !$this->mungeTitle( $db, $oldTitle, $newTitle ) ) {
			return false;
		}

		if ( $this->run ) {
			// NOTE(review): when $nsField is a fixed namespace and --prefix munging
			// put $newTitle into another namespace, only the DB key is written
			// here — confirm that this is intended.
			$db->update(
				$table,
				array_merge(
					is_int( $nsField ) ? [] : [ $nsField => $newTitle->getNamespace() ],
					[ $titleField => $newTitle->getDBkey() ]
				),
				(array)$row,
				__METHOD__
			);
		} else {
			$r = json_encode( $row, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
			$this->output( "Would set $r to {$newTitle->getPrefixedText()}\n" );
		}

		return true;
	}

	/**
	 * Rename entries in other tables
	 *
	 * For every namespace from getNamespaces() and every LIKE batch from
	 * getLikeBatches(), the matching rows are read in batches ordered by
	 * $titleField and $pkFields and handed to doMove() or doUpdate(). In run
	 * mode a progress line is printed and replication is waited for after
	 * each batch.
	 *
	 * @param IDatabase $db Database handle
	 * @param bool $doMove Whether to use MovePage or direct table manipulation
	 * @param string $table
	 * @param string|int $nsField Namespace column name, or a fixed namespace number
	 * @param string $titleField
	 * @param string[] $pkFields Additional fields to match a unique index
	 *  starting with $nsField and $titleField.
	 */
	private function processTable( IDatabase $db, $doMove, $table, $nsField, $titleField, $pkFields ) {
		if ( $this->tables !== null && !in_array( $table, $this->tables, true ) ) {
			$this->output( "Skipping table `$table`, not in --tables.\n" );
			return;
		}

		$batchSize = $this->getBatchSize();
		$namespaces = $this->getNamespaces();
		$likes = $this->getLikeBatches( $db, $titleField );
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

		if ( is_int( $nsField ) ) {
			// The table only holds titles of one namespace; process it only if
			// that namespace is among the ones we operate on.
			$namespaces = array_intersect( $namespaces, [ $nsField ] );
		}

		if ( !$namespaces ) {
			$this->output( "Skipping table `$table`, no valid namespaces.\n" );
			return;
		}

		$this->output( "Processing table `$table`...\n" );

		$selectFields = array_merge(
			is_int( $nsField ) ? [] : [ $nsField ],
			[ $titleField ],
			$pkFields
		);
		$orderBy = array_merge( [ $titleField ], $pkFields );
		// The continuation condition is built from the innermost sort field outwards.
		$contFields = array_reverse( $orderBy );

		$lastReplicationWait = 0.0;
		$count = 0;
		$errors = 0;
		foreach ( $namespaces as $ns ) {
			// A fixed-namespace table has no namespace column to filter on.
			$nsCond = is_int( $nsField ) ? [] : [ "$nsField = $ns" ];
			foreach ( $likes as $like ) {
				$cont = [];
				do {
					$res = $db->select(
						$table,
						$selectFields,
						array_merge( $nsCond, [ $like ], $cont ),
						__METHOD__,
						[ 'ORDER BY' => $orderBy, 'LIMIT' => $batchSize ]
					);

					$lastRow = null;
					foreach ( $res as $row ) {
						$lastRow = $row;

						if ( $doMove ) {
							$rowNs = is_int( $nsField ) ? $nsField : (int)$row->$nsField;
							$ret = $this->doMove( $db, $rowNs, $row->$titleField );
						} else {
							$ret = $this->doUpdate( $db, $table, $nsField, $titleField, $row );
						}
						if ( $ret === true ) {
							$count++;
						} elseif ( $ret === false ) {
							$errors++;
						}
					}

					// Continue after the last row of this batch, comparing the sort
					// fields lexicographically: "a > x OR a = x AND (b > y OR ...)".
					// An empty batch leaves $cont empty and ends the loop.
					$cont = [];
					if ( $lastRow !== null ) {
						$contCond = '';
						foreach ( $contFields as $field ) {
							$v = $db->addQuotes( $lastRow->$field );
							if ( $contCond === '' ) {
								$contCond = "$field > $v";
							} else {
								$contCond = "$field > $v OR $field = $v AND ($contCond)";
							}
						}
						$cont = [ $contCond ];
					}

					if ( $this->run ) {
						$r = $lastRow !== null
							? json_encode( $lastRow, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE )
							: '';
						$this->output( "... $table: $count renames, $errors errors at $r\n" );
						$lbFactory->waitForReplication(
							[ 'timeout' => 30, 'ifWritesSince' => $lastReplicationWait ]
						);
						$lastReplicationWait = microtime( true );
					}
				} while ( $cont );
			}
		}

		$this->output( "Done processing table `$table`.\n" );
	}

	/**
	 * List users needing renaming
	 *
	 * Writes one "old name<TAB>new name" line per affected user to the file
	 * given by --userlist. Nothing is done when the option is absent. The
	 * list is generated whether or not --run was given.
	 *
	 * @param IDatabase $db Database handle
	 */
	private function processUsers( IDatabase $db ) {
		$userlistFile = $this->getOption( 'userlist' );
		if ( $userlistFile === null ) {
			$this->output( "Not generating user list, --userlist was not specified.\n" );
			return;
		}

		$fh = fopen( $userlistFile, 'wb' );
		if ( !$fh ) {
			$this->error( "Could not open user list file $userlistFile" );
			return;
		}

		$this->output( "Generating user list...\n" );
		$count = 0;
		$batchSize = $this->getBatchSize();
		foreach ( $this->getLikeBatches( $db, 'user_name' ) as $like ) {
			$cont = [];
			while ( true ) {
				$names = $db->selectFieldValues(
					'user',
					'user_name',
					array_merge( [ $like ], $cont ),
					__METHOD__,
					[ 'ORDER BY' => 'user_name', 'LIMIT' => $batchSize ]
				);
				if ( !$names ) {
					break;
				}

				// Resume the next batch after the last name of this one.
				$last = end( $names );
				$cont = [ 'user_name > ' . $db->addQuotes( $last ) ];
				foreach ( $names as $name ) {
					$char = mb_substr( $name, 0, 1 );
					if ( !array_key_exists( $char, $this->charmap ) ) {
						$this->error(
							"Query returned $name, but user name does not begin with a character in the charmap."
						);
						continue;
					}
					$newName = $this->charmap[$char] . mb_substr( $name, 1 );
					fprintf( $fh, "%s\t%s\n", $name, $newName );
					$count++;
				}
				$this->output( "... at $last, $count names so far\n" );
			}
		}

		if ( !fclose( $fh ) ) {
			$this->error( "fclose on $userlistFile failed" );
		}
		$this->output( "User list output to $userlistFile, $count users need renaming.\n" );
	}
}

$maintClass = UppercaseTitlesForUnicodeTransition::class;
require_once RUN_MAINTENANCE_IF_MAIN;