Merge "Expose sort orders from search engine in ApiQuerySearch"
[lhc/web/wiklou.git] / maintenance / deduplicateArchiveRevId.php
1 <?php
2
3 use Wikimedia\Rdbms\IDatabase;
4
5 require_once __DIR__ . '/Maintenance.php';
6
/**
 * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
 * both within archive and between archive and revision.
 *
 * @ingroup Maintenance
 * @since 1.32
 */
class DeduplicateArchiveRevId extends LoggedUpdateMaintenance {

	/**
	 * Join information for the archive actor fields, as returned by
	 * ActorMigration::getJoin( 'ar_user' ). Set by doDBUpdates() before
	 * processArRevIds() is called.
	 * @var array|null
	 */
	private $arActorQuery = null;

	/** @var int Number of archive rows deleted so far (sum of affectedRows()) */
	private $deleted = 0;

	/** @var int Running sum of the values returned by PopulateArchiveRevId::reassignArRevIds() */
	private $reassigned = 0;

	/**
	 * Set the script description and a default batch size of 10000.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
		);
		$this->setBatchSize( 10000 );
	}

	/**
	 * @return string The name of this class
	 */
	protected function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Walk the range of ar_rev_id values in batches and deduplicate each
	 * batch inside its own transaction.
	 *
	 * @return bool Always true
	 */
	protected function doDBUpdates() {
		$this->output( "Deduplicating ar_rev_id...\n" );

		$dbw = $this->getDB( DB_MASTER );

		$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
		$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );

		// MIN()/MAX() give NULL when there is no non-NULL ar_rev_id to aggregate
		// (e.g. an empty archive table). Without this guard "null <= null" is
		// true in PHP and the loop below would run one bogus batch over 0-0.
		if ( !is_numeric( $minId ) || !is_numeric( $maxId ) ) {
			$this->output( "No archive rows with an ar_rev_id found, nothing to deduplicate.\n" );
			return true;
		}
		$minId = (int)$minId;
		$maxId = (int)$maxId;

		// A batch size below 1 would never advance $id and loop forever.
		$batchSize = max( 1, (int)$this->getBatchSize() );

		$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
		$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

		for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
			$endId = min( $maxId, $id + $batchSize - 1 );

			// Range conditions for this batch; both bounds are integers.
			$arRange = [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ];
			$revRange = [ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ];

			$this->beginTransaction( $dbw, __METHOD__ );

			// Lock the archive and revision table rows for the IDs we're checking
			// to try to prevent deletions or undeletions from confusing things.
			$dbw->selectRowCount(
				'archive',
				1,
				$arRange,
				__METHOD__,
				[ 'FOR UPDATE' ]
			);
			$dbw->selectRowCount(
				'revision',
				1,
				$revRange,
				__METHOD__,
				[ 'LOCK IN SHARE MODE' ]
			);

			// Figure out the ar_rev_ids we actually need to look at.
			// First: IDs in this range that also exist in the revision table.
			$res = $dbw->select(
				[ 'archive', 'revision' ] + $revActorQuery['tables'],
				[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
				$arRange,
				__METHOD__,
				[ 'DISTINCT' ],
				[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
			);
			$revRows = [];
			foreach ( $res as $row ) {
				$revRows[$row->rev_id] = $row;
			}

			// Second: IDs in this range used by more than one archive row.
			$arRevIds = $dbw->selectFieldValues(
				[ 'archive' ],
				'ar_rev_id',
				$arRange,
				__METHOD__,
				[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
			);
			$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

			if ( $arRevIds ) {
				$this->processArRevIds( $dbw, $arRevIds, $revRows );
			}

			$this->output( "... $id-$endId\n" );
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		$this->output(
			"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
			. "$this->reassigned assigned new IDs.\n"
		);
		return true;
	}

	/**
	 * Process a set of ar_rev_ids
	 *
	 * Each archive row with one of the given IDs is either kept, queued for
	 * deletion (it duplicates the revision row or an earlier archive row), or
	 * queued for reassignment (it shares the ID with a different change).
	 *
	 * @param IDatabase $dbw
	 * @param int[] $arRevIds IDs to process
	 * @param object[] $revRows Existing revision-table row data, keyed by rev_id
	 */
	private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
		// Select all the data we need for deduplication
		$res = $dbw->select(
			[ 'archive' ] + $this->arActorQuery['tables'],
			[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
				+ $this->arActorQuery['fields'],
			[ 'ar_rev_id' => $arRevIds ],
			__METHOD__,
			[],
			$this->arActorQuery['joins']
		);

		// Determine which rows we need to delete or reassign.
		// $seen maps rev_id => [ 'first' => description of the first holder,
		// <seen key> => ar_id of the first archive row with that key, ... ].
		$seen = [];
		$toDelete = [];
		$toReassign = [];
		foreach ( $res as $row ) {
			// Revision-table row exists?
			if ( isset( $revRows[$row->ar_rev_id] ) ) {
				$revRow = $revRows[$row->ar_rev_id];

				// Record the rev_id as seen, so the code below will always delete or reassign.
				if ( !isset( $seen[$revRow->rev_id] ) ) {
					$seen[$revRow->rev_id] = [
						'first' => "revision row",
					];
				}

				// Delete the archive row if it seems to be the same regardless
				// of page, because moves can change IDs and titles.
				if ( $row->ar_timestamp === $revRow->rev_timestamp &&
					$row->ar_sha1 === $revRow->rev_sha1 &&
					$row->ar_user === $revRow->rev_user &&
					$row->ar_user_text === $revRow->rev_user_text
				) {
					$this->output(
						"Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
					);
					$toDelete[] = $row->ar_id;
					continue;
				}
			}

			$key = $this->getSeenKey( $row );
			if ( !isset( $seen[$row->ar_rev_id] ) ) {
				// This rev_id hasn't even been seen yet, nothing to do besides record it.
				$seen[$row->ar_rev_id] = [
					'first' => "archive row $row->ar_id",
					$key => $row->ar_id,
				];
			} elseif ( !isset( $seen[$row->ar_rev_id][$key] ) ) {
				// The rev_id was seen, but not this particular change. Reassign it.
				$seen[$row->ar_rev_id][$key] = $row->ar_id;
				$this->output(
					"Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
					. "for rev_id $row->ar_rev_id, reassigning\n"
				);
				$toReassign[] = $row->ar_id;
			} else {
				// The rev_id was seen with a row that matches this change. Delete it.
				$this->output(
					"Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
					. "for rev_id $row->ar_rev_id, deleting\n"
				);
				$toDelete[] = $row->ar_id;
			}
		}

		// Perform the updates
		if ( $toDelete ) {
			$dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__ );
			$this->deleted += $dbw->affectedRows();
		}
		if ( $toReassign ) {
			// NOTE(review): the return value is presumably the number of rows
			// given new IDs; confirm against PopulateArchiveRevId.
			$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $toReassign );
		}
	}

	/**
	 * Make a key identifying a "unique" change from a row
	 *
	 * The key is the newline-joined namespace, title, timestamp, sha1, user ID
	 * and user text of the archive row.
	 *
	 * @param object $row
	 * @return string
	 */
	private function getSeenKey( $row ) {
		return implode( "\n", [
			$row->ar_namespace,
			$row->ar_title,
			$row->ar_timestamp,
			$row->ar_sha1,
			$row->ar_user,
			$row->ar_user_text,
		] );
	}

}
207
// Name the maintenance class defined in this file, then hand over to the
// file referenced by RUN_MAINTENANCE_IF_MAIN (presumably the maintenance
// runner; defined outside this file).
$maintClass = DeduplicateArchiveRevId::class;
require_once RUN_MAINTENANCE_IF_MAIN;