(bug 27567) Add file repo support to prop=duplicatefiles
author    umherirrender <umherirrender_de.wp@web.de>
Mon, 23 Jul 2012 16:45:38 +0000 (18:45 +0200)
committer umherirrender <umherirrender_de.wp@web.de>
Mon, 23 Jul 2012 16:45:38 +0000 (18:45 +0200)
This adds a new method, findBySha1s(), to the FileRepo classes to support
searching for multiple hashes across all repos (with only one query per
DB repo).
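
A rough sketch of the new lookup path used by the API module (the file names
here are placeholders, not part of this change):

    // Resolve the requested titles to File objects in one batch.
    $files = RepoGroup::singleton()->findFiles( array( 'Example_one.jpg', 'Example_two.png' ) );

    // Collect the SHA-1 content hash of each file that exists.
    $sha1s = array();
    foreach ( $files as $file ) {
        $sha1s[$file->getName()] = $file->getSha1();
    }

    // One call searches every repo; each DB repo issues a single query.
    // Result shape: array( sha1 => array( File, File, ... ), ... )
    $filesBySha1s = RepoGroup::singleton()->findBySha1s( array_unique( array_values( $sha1s ) ) );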

Change-Id: I745cae7a1db3a32c20aa0067b744402fcf1a3122

RELEASE-NOTES-1.20
includes/api/ApiQueryDuplicateFiles.php
includes/filerepo/FileRepo.php
includes/filerepo/LocalRepo.php
includes/filerepo/RepoGroup.php

diff --git a/RELEASE-NOTES-1.20 b/RELEASE-NOTES-1.20
index 2f463a7..24d6b19 100644
@@ -198,6 +198,7 @@ upgrade PHP if you have not done so prior to upgrading MediaWiki.
 * (bug 36987) API avoids mangling fields in continuation parameters
 * (bug 30836) siteinfo prop=specialpagealiases will no longer return nonexistent special pages
 * (bug 38190) Add "required" flag to some token params for hint in api docs.
+* (bug 27567) Add file repo support to prop=duplicatefiles.
 
 === Languages updated in 1.20 ===
 
diff --git a/includes/api/ApiQueryDuplicateFiles.php b/includes/api/ApiQueryDuplicateFiles.php
index e05ffb6..719c84a 100644
@@ -59,73 +59,86 @@ class ApiQueryDuplicateFiles extends ApiQueryGeneratorBase {
                }
                $images = $namespaces[NS_FILE];
 
-               $this->addTables( 'image', 'i1' );
-               $this->addTables( 'image', 'i2' );
-               $this->addFields( array(
-                       'i1.img_name AS orig_name',
-                       'i2.img_name AS dup_name',
-                       'i2.img_user_text AS dup_user_text',
-                       'i2.img_timestamp AS dup_timestamp'
-               ) );
-
-               $this->addWhere( array(
-                       'i1.img_name' => array_keys( $images ),
-                       'i1.img_sha1 = i2.img_sha1',
-                       'i1.img_name != i2.img_name',
-               ) );
+               if ( $params['dir'] == 'descending' ) {
+                       $images = array_reverse( $images );
+               }
 
+               $skipUntilThisDup = false;
                if ( isset( $params['continue'] ) ) {
                        $cont = explode( '|', $params['continue'] );
                        if ( count( $cont ) != 2 ) {
                                $this->dieUsage( 'Invalid continue param. You should pass the ' .
                                        'original value returned by the previous query', '_badcontinue' );
                        }
-                       $op = $params['dir'] == 'descending' ? '<' : '>';
-                       $db = $this->getDB();
-                       $orig = $db->addQuotes( $cont[0] );
-                       $dup = $db->addQuotes( $cont[1] );
-                       $this->addWhere(
-                               "i1.img_name $op $orig OR " .
-                               "(i1.img_name = $orig AND " .
-                               "i2.img_name $op= $dup)"
-                       );
+                       $fromImage = $cont[0];
+                       $skipUntilThisDup = $cont[1];
+                       // Filter out any images before $fromImage
+                       foreach ( $images as $image => $pageId ) {
+                               if ( $image < $fromImage ) {
+                                       unset( $images[$image] );
+                               } else {
+                                       break;
+                               }
+                       }
                }
 
-               $sort = ( $params['dir'] == 'descending' ? ' DESC' : '' );
-               // Don't order by i1.img_name if it's constant in the WHERE clause
-               if ( count( $this->getPageSet()->getGoodTitles() ) == 1 ) {
-                       $this->addOption( 'ORDER BY', 'i2.img_name' . $sort );
-               } else {
-                       $this->addOption( 'ORDER BY', array(
-                                       'i1.img_name' . $sort,
-                                       'i2.img_name' . $sort
-                       ));
-               }
-               $this->addOption( 'LIMIT', $params['limit'] + 1 );
+               $files = RepoGroup::singleton()->findFiles( array_keys( $images ) );
 
-               $res = $this->select( __METHOD__ );
+               $fit = true;
                $count = 0;
                $titles = array();
-               foreach ( $res as $row ) {
-                       if ( ++$count > $params['limit'] ) {
-                               // We've reached the one extra which shows that
-                               // there are additional pages to be had. Stop here...
-                               $this->setContinueEnumParameter( 'continue', $row->orig_name . '|' . $row->dup_name );
-                               break;
+
+               $sha1s = array();
+               foreach ( $files as $file ) {
+                       $sha1s[$file->getName()] = $file->getSha1();
+               }
+
+               // find all files with the hashes, result format is: array( hash => array( dup1, dup2 ), hash1 => ... )
+               $filesBySha1s = RepoGroup::singleton()->findBySha1s( array_unique( array_values( $sha1s ) ) );
+
+               // Iterate over $images to handle the continue param correctly
+               foreach ( $images as $image => $pageId ) {
+                       if ( !isset( $sha1s[$image] ) ) {
+                               continue; // file does not exist
                        }
-                       if ( !is_null( $resultPageSet ) ) {
-                               $titles[] = Title::makeTitle( NS_FILE, $row->dup_name );
-                       } else {
-                               $r = array(
-                                       'name' => $row->dup_name,
-                                       'user' => $row->dup_user_text,
-                                       'timestamp' => wfTimestamp( TS_ISO_8601, $row->dup_timestamp )
-                               );
-                               $fit = $this->addPageSubItem( $images[$row->orig_name], $r );
-                               if ( !$fit ) {
-                                       $this->setContinueEnumParameter( 'continue', $row->orig_name . '|' . $row->dup_name );
+                       $sha1 = $sha1s[$image];
+                       $dupFiles = $filesBySha1s[$sha1];
+                       if ( $params['dir'] == 'descending' ) {
+                               $dupFiles = array_reverse( $dupFiles );
+                       }
+                       foreach ( $dupFiles as $dupFile ) {
+                               $dupName = $dupFile->getName();
+                               if ( $image == $dupName ) {
+                                       continue; // ignore the file itself
+                               }
+                               if ( $skipUntilThisDup !== false && $dupName < $skipUntilThisDup ) {
+                                       continue; // skip until we reach the duplicate from the continue param
+                               }
+                               $skipUntilThisDup = false;
+                               if ( ++$count > $params['limit'] ) {
+                                       $fit = false; //break outer loop
+                                       // We're one over limit which shows that
+                                       // there are additional images to be had. Stop here...
+                                       $this->setContinueEnumParameter( 'continue', $image . '|' . $dupName );
                                        break;
                                }
+                               if ( !is_null( $resultPageSet ) ) {
+                                       $titles[] = $dupFile->getTitle();
+                               } else {
+                                       $r = array(
+                                               'name' => $dupName,
+                                               'user' => $dupFile->getUser( 'text' ),
+                                               'timestamp' => wfTimestamp( TS_ISO_8601, $dupFile->getTimestamp() )
+                                       );
+                                       $fit = $this->addPageSubItem( $pageId, $r );
+                                       if ( !$fit ) {
+                                               $this->setContinueEnumParameter( 'continue', $image . '|' . $dupName );
+                                               break;
+                                       }
+                               }
+                       }
+                       if ( !$fit ) {
+                               break;
                        }
                }
                if ( !is_null( $resultPageSet ) ) {
@@ -155,7 +168,7 @@ class ApiQueryDuplicateFiles extends ApiQueryGeneratorBase {
 
        public function getParamDescription() {
                return array(
-                       'limit' => 'How many files to return',
+                       'limit' => 'How many duplicate files to return',
                        'continue' => 'When more results are available, use this to continue',
                        'dir' => 'The direction in which to list',
                );
@@ -172,7 +185,7 @@ class ApiQueryDuplicateFiles extends ApiQueryGeneratorBase {
        }
 
        public function getDescription() {
-               return 'List all files that are duplicates of the given file(s)';
+               return 'List all files that are duplicates of the given file(s) based on hash values';
        }
 
        public function getPossibleErrors() {
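
For reference, a request to this module might look like this (File:Example.jpg
is just a placeholder title; 'df' is the module's parameter prefix):

    api.php?action=query&prop=duplicatefiles&titles=File:Example.jpg&dflimit=10

When more duplicates exist than dflimit allows, the response carries a continue
value of the form "<original name>|<duplicate name>", which is now built from
the file names compared in PHP rather than from image-table rows.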
diff --git a/includes/filerepo/FileRepo.php b/includes/filerepo/FileRepo.php
index 3d2ccf4..5a55096 100644
@@ -439,6 +439,24 @@ class FileRepo {
                return array();
        }
 
+       /**
+        * Get an array of arrays or iterators of file objects for files that
+        * have the given SHA-1 content hashes.
+        *
+        * @param $hashes array An array of SHA-1 hashes
+        * @return array An array of arrays or iterators of File objects, keyed by hash
+        */
+       public function findBySha1s( array $hashes ) {
+               $result = array();
+               foreach ( $hashes as $hash ) {
+                       $files = $this->findBySha1( $hash );
+                       if ( count( $files ) ) {
+                               $result[$hash] = $files;
+                       }
+               }
+               return $result;
+       }
+
        /**
         * Get the public root URL of the repository
         *
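
The generic implementation above is a plain fallback: it calls findBySha1() once per
hash and leaves out hashes with no matches, so callers test with isset() rather than
expecting empty arrays. A minimal sketch of consuming it (assuming $repo is some
FileRepo instance; the hash string is a placeholder):

    $filesByHash = $repo->findBySha1s( array( 'exampleBase36Hash' ) );
    if ( isset( $filesByHash['exampleBase36Hash'] ) ) {
        foreach ( $filesByHash['exampleBase36Hash'] as $file ) {
            // Each entry is a File object whose content has that SHA-1 hash.
        }
    }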
diff --git a/includes/filerepo/LocalRepo.php b/includes/filerepo/LocalRepo.php
index dd0c947..0954422 100644
@@ -248,6 +248,39 @@ class LocalRepo extends FileRepo {
                return $result;
        }
 
+       /**
+        * Get an array of arrays or iterators of file objects for files that
+        * have the given SHA-1 content hashes.
+        *
+        * Overrides the generic implementation in FileRepo for performance reasons
+        *
+        * @param $hashes array An array of SHA-1 hashes
+        * @return array An array of arrays or iterators of File objects, keyed by hash
+        */
+       function findBySha1s( array $hashes ) {
+               if ( !count( $hashes ) ) {
+                       return array(); // empty parameter
+               }
+
+               $dbr = $this->getSlaveDB();
+               $res = $dbr->select(
+                       'image',
+                       LocalFile::selectFields(),
+                       array( 'img_sha1' => $hashes ),
+                       __METHOD__,
+                       array( 'ORDER BY' => 'img_name' )
+               );
+
+               $result = array();
+               foreach ( $res as $row ) {
+                       $file = $this->newFileFromRow( $row );
+                       $result[$file->getSha1()][] = $file;
+               }
+               $res->free();
+
+               return $result;
+       }
+
        /**
         * Get a connection to the slave DB
         * @return DatabaseBase
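
The LocalRepo override above replaces the per-hash lookups with a single SELECT on the
image table (roughly WHERE img_sha1 IN ( ... ) ORDER BY img_name) and then groups the
rows by hash in PHP. The ORDER BY leaves each per-hash sublist sorted by name;
RepoGroup::findBySha1s() below still re-sorts the merged lists with File::compare()
after combining local and foreign results.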
diff --git a/includes/filerepo/RepoGroup.php b/includes/filerepo/RepoGroup.php
index 6b31b7e..f9e5759 100644
@@ -263,6 +263,28 @@ class RepoGroup {
                return $result;
        }
 
+       /**
+        * Find all instances of files with the given SHA-1 hashes
+        *
+        * @param $hashes array Base-36 SHA-1 hashes
+        * @return array An array of arrays of File objects, keyed by hash
+        */
+       function findBySha1s( array $hashes ) {
+               if ( !$this->reposInitialised ) {
+                       $this->initialiseRepos();
+               }
+
+               $result = $this->localRepo->findBySha1s( $hashes );
+               foreach ( $this->foreignRepos as $repo ) {
+                       $result = array_merge_recursive( $result, $repo->findBySha1s( $hashes ) );
+               }
+               // Sort the merged (and presorted) sublists for each hash
+               foreach ( $result as $hash => $files ) {
+                       usort( $result[$hash], 'File::compare' );
+               }
+               return $result;
+       }
+
        /**
         * Get the repo instance with a given key.
         * @param $index string|int