From: Reedy Date: Mon, 12 Feb 2018 11:30:32 +0000 (+0000) Subject: Make cleanupSpam.php query for http and https X-Git-Tag: 1.34.0-rc.0~5205^2 X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=commitdiff_plain;h=0a1144fb2a2804c4b01e4cbae336475f92186e5b Make cleanupSpam.php query for http and https Bug: T186795 Change-Id: I9e350306f85d3d3ca4daba626e49a0792605acc7 --- diff --git a/includes/LinkFilter.php b/includes/LinkFilter.php index 790e2be448..17b4d56635 100644 --- a/includes/LinkFilter.php +++ b/includes/LinkFilter.php @@ -38,9 +38,10 @@ class LinkFilter { * * @param Content $content Content to check * @param string $filterEntry Domainparts, see makeRegex() for more details + * @param string $protocol 'http://' or 'https://' * @return int 0 if no match or 1 if there's at least one match */ - static function matchEntry( Content $content, $filterEntry ) { + public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) { if ( !( $content instanceof TextContent ) ) { // TODO: handle other types of content too. // Maybe create ContentHandler::matchFilter( LinkFilter ). @@ -50,7 +51,7 @@ class LinkFilter { $text = $content->getNativeData(); - $regex = self::makeRegex( $filterEntry ); + $regex = self::makeRegex( $filterEntry, $protocol ); return preg_match( $regex, $text ); } @@ -59,10 +60,12 @@ class LinkFilter { * * @param string $filterEntry URL, if it begins with "*.", it'll be * replaced to match any subdomain + * @param string $protocol 'http://' or 'https://' + * * @return string Regex pattern, for preg_match() */ - private static function makeRegex( $filterEntry ) { - $regex = '!http://'; + private static function makeRegex( $filterEntry, $protocol ) { + $regex = '!' . preg_quote( $protocol ); if ( substr( $filterEntry, 0, 2 ) == '*.' ) { $regex .= '(?:[A-Za-z0-9.-]+\.|)'; $filterEntry = substr( $filterEntry, 2 ); diff --git a/maintenance/cleanupSpam.php b/maintenance/cleanupSpam.php index 038b28ce76..24ca86d60f 100644 --- a/maintenance/cleanupSpam.php +++ b/maintenance/cleanupSpam.php @@ -53,9 +53,14 @@ class CleanupSpam extends Maintenance { $wgUser->addGroup( 'bot' ); $spec = $this->getArg(); - $like = LinkFilter::makeLikeArray( $spec ); - if ( !$like ) { - $this->fatalError( "Not a valid hostname specification: $spec" ); + + $likes = []; + foreach ( [ 'http://', 'https://' ] as $prot ) { + $like = LinkFilter::makeLikeArray( $spec, $prot ); + if ( !$like ) { + $this->fatalError( "Not a valid hostname specification: $spec" ); + } + $likes[$prot] = $like; } if ( $this->hasOption( 'all' ) ) { @@ -63,15 +68,24 @@ class CleanupSpam extends Maintenance { $this->output( "Finding spam on " . count( $wgLocalDatabases ) . " wikis\n" ); $found = false; foreach ( $wgLocalDatabases as $wikiID ) { + /** @var $dbr Database */ $dbr = $this->getDB( DB_REPLICA, [], $wikiID ); - $count = $dbr->selectField( 'externallinks', 'COUNT(*)', - [ 'el_index' . $dbr->buildLike( $like ) ], __METHOD__ ); - if ( $count ) { - $found = true; - $cmd = wfShellWikiCmd( "$IP/maintenance/cleanupSpam.php", - [ '--wiki', $wikiID, $spec ] ); - passthru( "$cmd | sed 's/^/$wikiID: /'" ); + foreach ( $likes as $like ) { + $count = $dbr->selectField( + 'externallinks', + 'COUNT(*)', + [ 'el_index' . $dbr->buildLike( $like ) ], + __METHOD__ + ); + if ( $count ) { + $found = true; + $cmd = wfShellWikiCmd( + "$IP/maintenance/cleanupSpam.php", + [ '--wiki', $wikiID, $spec ] + ); + passthru( "$cmd | sed 's/^/$wikiID: /'" ); + } } } if ( $found ) { @@ -82,13 +96,21 @@ class CleanupSpam extends Maintenance { } else { // Clean up spam on this wiki + $count = 0; + /** @var $dbr Database */ $dbr = $this->getDB( DB_REPLICA ); - $res = $dbr->select( 'externallinks', [ 'DISTINCT el_from' ], - [ 'el_index' . $dbr->buildLike( $like ) ], __METHOD__ ); - $count = $dbr->numRows( $res ); - $this->output( "Found $count articles containing $spec\n" ); - foreach ( $res as $row ) { - $this->cleanupArticle( $row->el_from, $spec ); + foreach ( $likes as $prot => $like ) { + $res = $dbr->select( + 'externallinks', + [ 'DISTINCT el_from' ], + [ 'el_index' . $dbr->buildLike( $like ) ], + __METHOD__ + ); + $count = $dbr->numRows( $res ); + $this->output( "Found $count articles containing $spec\n" ); + foreach ( $res as $row ) { + $this->cleanupArticle( $row->el_from, $spec, $prot ); + } } if ( $count ) { $this->output( "Done\n" ); @@ -96,7 +118,13 @@ class CleanupSpam extends Maintenance { } } - private function cleanupArticle( $id, $domain ) { + /** + * @param int $id + * @param string $domain + * @param string $protocol + * @throws MWException + */ + private function cleanupArticle( $id, $domain, $protocol ) { $title = Title::newFromID( $id ); if ( !$title ) { $this->error( "Internal error: no page for ID $id" ); @@ -109,7 +137,7 @@ class CleanupSpam extends Maintenance { $currentRevId = $rev->getId(); while ( $rev && ( $rev->isDeleted( Revision::DELETED_TEXT ) - || LinkFilter::matchEntry( $rev->getContent( Revision::RAW ), $domain ) ) + || LinkFilter::matchEntry( $rev->getContent( Revision::RAW ), $domain, $protocol ) ) ) { $rev = $rev->getPrevious(); }