Merge "Type hint against LinkTarget in WatchedItemStore"
[lhc/web/wiklou.git] / maintenance / deduplicateArchiveRevId.php
1 <?php
2
3 use Wikimedia\Rdbms\IDatabase;
4
5 require_once __DIR__ . '/Maintenance.php';
6
/**
 * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
 * both within archive and between archive and revision.
 *
 * The ar_rev_id range of the archive table is walked in batches. Within each
 * batch, an archive row is deleted when it matches the revision-table row with
 * the same ID (timestamp, sha1, user and user text) or matches an earlier
 * archive row with the same ID (additionally namespace and title). An archive
 * row that shares its ar_rev_id with a different change is handed to
 * PopulateArchiveRevId::reassignArRevIds().
 *
 * @ingroup Maintenance
 * @since 1.32
 */
class DeduplicateArchiveRevId extends LoggedUpdateMaintenance {

	/**
	 * Result of ActorMigration::getJoin( 'ar_user' ); its 'tables', 'fields' and
	 * 'joins' entries are used when selecting archive rows. Set in doDBUpdates().
	 * @var array|null
	 */
	private $arActorQuery = null;

	/** @var int Running total of affected rows reported for archive deletions */
	private $deleted = 0;

	/** @var int Running total returned by PopulateArchiveRevId::reassignArRevIds() */
	private $reassigned = 0;

	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
		);
		$this->setBatchSize( 10000 );
	}

	/**
	 * @return string Key identifying this update; the class name is used.
	 */
	protected function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Walk the ar_rev_id range in batches and deduplicate each batch inside its
	 * own transaction.
	 *
	 * @return bool Always true
	 */
	protected function doDBUpdates() {
		$this->output( "Deduplicating ar_rev_id...\n" );
		$dbw = $this->getDB( DB_MASTER );
		// Sanity check. If this is a new install, we don't need to do anything here.
		if ( PopulateArchiveRevId::isNewInstall( $dbw ) ) {
			$this->output( "New install, nothing to do here.\n" );
			return true;
		}

		PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw );

		$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
		$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );

		// MIN()/MAX() give NULL when the archive table is empty or holds no non-NULL
		// ar_rev_id (false is presumably what selectField() reports when no row comes
		// back at all). Without this guard, "null <= null" would let the loop below
		// run one pointless batch over the range 0-0.
		if ( $minId === null || $minId === false || $maxId === null || $maxId === false ) {
			$this->output( "No ar_rev_id values in archive, nothing to do here.\n" );
			return true;
		}

		// The database layer hands back strings; normalize so the loop arithmetic
		// and comparisons are plain integer operations.
		$minId = (int)$minId;
		$maxId = (int)$maxId;

		// A batch size below 1 would keep $id from ever advancing.
		$batchSize = max( 1, (int)$this->getBatchSize() );

		$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
		$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

		for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
			$endId = min( $maxId, $id + $batchSize - 1 );

			$this->beginTransaction( $dbw, __METHOD__ );

			// Lock the archive and revision table rows for the IDs we're checking
			// to try to prevent deletions or undeletions from confusing things.
			$dbw->selectRowCount(
				'archive',
				1,
				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'FOR UPDATE' ]
			);
			$dbw->selectRowCount(
				'revision',
				1,
				[ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'LOCK IN SHARE MODE' ]
			);

			// Figure out the ar_rev_ids we actually need to look at.
			// First: IDs in this batch that exist in both archive and revision.
			$res = $dbw->select(
				[ 'archive', 'revision' ] + $revActorQuery['tables'],
				[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'DISTINCT' ],
				[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
			);
			$revRows = [];
			foreach ( $res as $row ) {
				$revRows[$row->rev_id] = $row;
			}

			// Second: IDs in this batch used by more than one archive row.
			$arRevIds = $dbw->selectFieldValues(
				[ 'archive' ],
				'ar_rev_id',
				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
			);
			$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

			if ( $arRevIds ) {
				$this->processArRevIds( $dbw, $arRevIds, $revRows );
			}

			$this->output( "... $id-$endId\n" );
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		$this->output(
			"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
			. "$this->reassigned assigned new IDs.\n"
		);
		return true;
	}

	/**
	 * Process a set of ar_rev_ids
	 *
	 * Every archive row carrying one of the given IDs is classified as kept,
	 * deleted or reassigned; the deletions and reassignments are then applied
	 * and added to the running totals.
	 *
	 * @param IDatabase $dbw
	 * @param int[] $arRevIds IDs to process
	 * @param object[] $revRows Existing revision-table row data, keyed by rev_id
	 */
	private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
		// Select all the data we need for deduplication
		$res = $dbw->select(
			[ 'archive' ] + $this->arActorQuery['tables'],
			[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
				+ $this->arActorQuery['fields'],
			[ 'ar_rev_id' => $arRevIds ],
			__METHOD__,
			[],
			$this->arActorQuery['joins']
		);

		// Determine which rows we need to delete or reassign.
		// $seen maps rev_id => [ 'first' => description of the first holder of the ID,
		// <seen key> => ar_id of the first archive row with that key ].
		$seen = [];
		$toDelete = [];
		$toReassign = [];
		foreach ( $res as $row ) {
			// Revision-table row exists?
			if ( isset( $revRows[$row->ar_rev_id] ) ) {
				$revRow = $revRows[$row->ar_rev_id];

				// Record the rev_id as seen, so the code below will always delete or reassign.
				if ( !isset( $seen[$revRow->rev_id] ) ) {
					$seen[$revRow->rev_id] = [
						'first' => "revision row",
					];
				}

				// Delete the archive row if it seems to be the same regardless
				// of page, because moves can change IDs and titles.
				if ( $row->ar_timestamp === $revRow->rev_timestamp &&
					$row->ar_sha1 === $revRow->rev_sha1 &&
					$row->ar_user === $revRow->rev_user &&
					$row->ar_user_text === $revRow->rev_user_text
				) {
					$this->output(
						"Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
					);
					$toDelete[] = $row->ar_id;
					continue;
				}
			}

			$key = $this->getSeenKey( $row );
			if ( !isset( $seen[$row->ar_rev_id] ) ) {
				// This rev_id hasn't even been seen yet, nothing to do besides record it.
				$seen[$row->ar_rev_id] = [
					'first' => "archive row $row->ar_id",
					$key => $row->ar_id,
				];
			} elseif ( !isset( $seen[$row->ar_rev_id][$key] ) ) {
				// The rev_id was seen, but not this particular change. Reassign it.
				$seen[$row->ar_rev_id][$key] = $row->ar_id;
				$this->output(
					"Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
					. "for rev_id $row->ar_rev_id, reassigning\n"
				);
				$toReassign[] = $row->ar_id;
			} else {
				// The rev_id was seen with a row that matches this change. Delete it.
				$this->output(
					"Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
					. "for rev_id $row->ar_rev_id, deleting\n"
				);
				$toDelete[] = $row->ar_id;
			}
		}

		// Perform the updates
		if ( $toDelete ) {
			$dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__ );
			$this->deleted += $dbw->affectedRows();
		}
		if ( $toReassign ) {
			$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $toReassign );
		}
	}

	/**
	 * Make a key identifying a "unique" change from a row
	 *
	 * The key joins namespace, title, timestamp, sha1, user and user text with
	 * newlines, so two archive rows get the same key only when all six agree.
	 *
	 * @param object $row Archive row as selected in processArRevIds()
	 * @return string
	 */
	private function getSeenKey( $row ) {
		return implode( "\n", [
			$row->ar_namespace,
			$row->ar_title,
			$row->ar_timestamp,
			$row->ar_sha1,
			$row->ar_user,
			$row->ar_user_text,
		] );
	}

}
213
// Name the class this script provides, then include the file named by
// RUN_MAINTENANCE_IF_MAIN.
$maintClass = 'DeduplicateArchiveRevId';
require_once RUN_MAINTENANCE_IF_MAIN;