Make cleanupSpam.php query for http and https
author Reedy <reedy@wikimedia.org>
Mon, 12 Feb 2018 11:30:32 +0000 (11:30 +0000)
committer Reedy <reedy@wikimedia.org>
Sun, 3 Jun 2018 22:00:11 +0000 (22:00 +0000)
Bug: T186795
Change-Id: I9e350306f85d3d3ca4daba626e49a0792605acc7

includes/LinkFilter.php
maintenance/cleanupSpam.php

index 790e2be..17b4d56 100644 (file)
@@ -38,9 +38,10 @@ class LinkFilter {
         *
         * @param Content $content Content to check
         * @param string $filterEntry Domainparts, see makeRegex() for more details
+        * @param string $protocol 'http://' or 'https://'
         * @return int 0 if no match or 1 if there's at least one match
         */
-       static function matchEntry( Content $content, $filterEntry ) {
+       public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
                if ( !( $content instanceof TextContent ) ) {
                        // TODO: handle other types of content too.
                        //      Maybe create ContentHandler::matchFilter( LinkFilter ).
@@ -50,7 +51,7 @@ class LinkFilter {
 
                $text = $content->getNativeData();
 
-               $regex = self::makeRegex( $filterEntry );
+               $regex = self::makeRegex( $filterEntry, $protocol );
                return preg_match( $regex, $text );
        }
 
@@ -59,10 +60,12 @@ class LinkFilter {
         *
         * @param string $filterEntry URL, if it begins with "*.", it'll be
         *        replaced to match any subdomain
+        * @param string $protocol 'http://' or 'https://'
+        *
         * @return string Regex pattern, for preg_match()
         */
-       private static function makeRegex( $filterEntry ) {
-               $regex = '!http://';
+       private static function makeRegex( $filterEntry, $protocol ) {
+               $regex = '!' . preg_quote( $protocol );
                if ( substr( $filterEntry, 0, 2 ) == '*.' ) {
                        $regex .= '(?:[A-Za-z0-9.-]+\.|)';
                        $filterEntry = substr( $filterEntry, 2 );
index 038b28c..24ca86d 100644 (file)
@@ -53,9 +53,14 @@ class CleanupSpam extends Maintenance {
                $wgUser->addGroup( 'bot' );
 
                $spec = $this->getArg();
-               $like = LinkFilter::makeLikeArray( $spec );
-               if ( !$like ) {
-                       $this->fatalError( "Not a valid hostname specification: $spec" );
+
+               $likes = [];
+               foreach ( [ 'http://', 'https://' ] as $prot ) {
+                       $like = LinkFilter::makeLikeArray( $spec, $prot );
+                       if ( !$like ) {
+                               $this->fatalError( "Not a valid hostname specification: $spec" );
+                       }
+                       $likes[$prot] = $like;
                }
 
                if ( $this->hasOption( 'all' ) ) {
@@ -63,15 +68,24 @@ class CleanupSpam extends Maintenance {
                        $this->output( "Finding spam on " . count( $wgLocalDatabases ) . " wikis\n" );
                        $found = false;
                        foreach ( $wgLocalDatabases as $wikiID ) {
+                               /** @var $dbr Database */
                                $dbr = $this->getDB( DB_REPLICA, [], $wikiID );
 
-                               $count = $dbr->selectField( 'externallinks', 'COUNT(*)',
-                                       [ 'el_index' . $dbr->buildLike( $like ) ], __METHOD__ );
-                               if ( $count ) {
-                                       $found = true;
-                                       $cmd = wfShellWikiCmd( "$IP/maintenance/cleanupSpam.php",
-                                               [ '--wiki', $wikiID, $spec ] );
-                                       passthru( "$cmd | sed 's/^/$wikiID:  /'" );
+                               foreach ( $likes as $like ) {
+                                       $count = $dbr->selectField(
+                                               'externallinks',
+                                               'COUNT(*)',
+                                               [ 'el_index' . $dbr->buildLike( $like ) ],
+                                               __METHOD__
+                                       );
+                                       if ( $count ) {
+                                               $found = true;
+                                               $cmd = wfShellWikiCmd(
+                                                       "$IP/maintenance/cleanupSpam.php",
+                                                       [ '--wiki', $wikiID, $spec ]
+                                               );
+                                               passthru( "$cmd | sed 's/^/$wikiID:  /'" );
+                                       }
                                }
                        }
                        if ( $found ) {
@@ -82,13 +96,21 @@ class CleanupSpam extends Maintenance {
                } else {
                        // Clean up spam on this wiki
 
+                       $count = 0;
+                       /** @var $dbr Database */
                        $dbr = $this->getDB( DB_REPLICA );
-                       $res = $dbr->select( 'externallinks', [ 'DISTINCT el_from' ],
-                               [ 'el_index' . $dbr->buildLike( $like ) ], __METHOD__ );
-                       $count = $dbr->numRows( $res );
-                       $this->output( "Found $count articles containing $spec\n" );
-                       foreach ( $res as $row ) {
-                               $this->cleanupArticle( $row->el_from, $spec );
+                       foreach ( $likes as $prot => $like ) {
+                               $res = $dbr->select(
+                                       'externallinks',
+                                       [ 'DISTINCT el_from' ],
+                                       [ 'el_index' . $dbr->buildLike( $like ) ],
+                                       __METHOD__
+                               );
+                               $count = $dbr->numRows( $res );
+                               $this->output( "Found $count articles containing $spec\n" );
+                               foreach ( $res as $row ) {
+                                       $this->cleanupArticle( $row->el_from, $spec, $prot );
+                               }
                        }
                        if ( $count ) {
                                $this->output( "Done\n" );
@@ -96,7 +118,13 @@ class CleanupSpam extends Maintenance {
                }
        }
 
-       private function cleanupArticle( $id, $domain ) {
+       /**
+        * @param int $id
+        * @param string $domain
+        * @param string $protocol
+        * @throws MWException
+        */
+       private function cleanupArticle( $id, $domain, $protocol ) {
                $title = Title::newFromID( $id );
                if ( !$title ) {
                        $this->error( "Internal error: no page for ID $id" );
@@ -109,7 +137,7 @@ class CleanupSpam extends Maintenance {
                $currentRevId = $rev->getId();
 
                while ( $rev && ( $rev->isDeleted( Revision::DELETED_TEXT )
-                       || LinkFilter::matchEntry( $rev->getContent( Revision::RAW ), $domain ) )
+                       || LinkFilter::matchEntry( $rev->getContent( Revision::RAW ), $domain, $protocol ) )
                ) {
                        $rev = $rev->getPrevious();
                }