Merge "Add MessagesBi.php"
[lhc/web/wiklou.git] / maintenance / deduplicateArchiveRevId.php
1 <?php
2
3 use Wikimedia\Rdbms\IDatabase;
4
5 require_once __DIR__ . '/Maintenance.php';
6
7 /**
8 * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
9 * both within archive and between archive and revision.
10 *
11 * @ingroup Maintenance
12 * @since 1.32
13 */
class DeduplicateArchiveRevId extends LoggedUpdateMaintenance {

	/**
	 * Join information for the archive table's user fields, as returned by
	 * ActorMigration::getJoin( 'ar_user' ). Populated by doDBUpdates() before
	 * any call to processArRevIds().
	 * @var array|null
	 */
	private $arActorQuery = null;

	/** @var int Running total of archive rows deleted */
	private $deleted = 0;

	/** @var int Running total of the counts returned by PopulateArchiveRevId::reassignArRevIds() */
	private $reassigned = 0;

	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
		);
		$this->setBatchSize( 10000 );
	}

	/**
	 * @return string This class's name, used as the update key
	 */
	protected function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Walk the archive table in batches of ar_rev_id values and deduplicate
	 * each batch inside its own transaction.
	 *
	 * @return bool Always true
	 */
	protected function doDBUpdates() {
		$this->output( "Deduplicating ar_rev_id...\n" );

		$dbw = $this->getDB( DB_MASTER );
		PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw );

		$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
		$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
		$batchSize = $this->getBatchSize();

		$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
		$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

		// MIN()/MAX() have nothing to report when no archive row carries an
		// ar_rev_id (e.g. the table is empty). Without this guard the loop would
		// evaluate null <= null, which is true, and run one meaningless batch
		// over the ID range 0-0.
		if ( is_numeric( $minId ) && is_numeric( $maxId ) ) {
			$minId = (int)$minId;
			$maxId = (int)$maxId;

			for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
				$endId = min( $maxId, $id + $batchSize - 1 );

				// Both bounds are integers here, so they can be concatenated
				// into the SQL conditions directly.
				$arRange = [ 'ar_rev_id >= ' . $id, 'ar_rev_id <= ' . $endId ];
				$revRange = [ 'rev_id >= ' . $id, 'rev_id <= ' . $endId ];

				$this->beginTransaction( $dbw, __METHOD__ );

				// Lock the archive and revision table rows for the IDs we're checking
				// to try to prevent deletions or undeletions from confusing things.
				$dbw->selectRowCount( 'archive', 1, $arRange, __METHOD__, [ 'FOR UPDATE' ] );
				$dbw->selectRowCount( 'revision', 1, $revRange, __METHOD__, [ 'LOCK IN SHARE MODE' ] );

				// Find the ar_rev_ids in this batch that also exist as rev_ids,
				// along with the revision data needed to compare against them.
				$res = $dbw->select(
					[ 'archive', 'revision' ] + $revActorQuery['tables'],
					[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
					$arRange,
					__METHOD__,
					[ 'DISTINCT' ],
					[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
				);
				$revRows = [];
				foreach ( $res as $row ) {
					$revRows[$row->rev_id] = $row;
				}

				// Find the ar_rev_ids in this batch used by more than one archive row.
				$arRevIds = $dbw->selectFieldValues(
					[ 'archive' ],
					'ar_rev_id',
					$arRange,
					__METHOD__,
					[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
				);

				// The IDs to process are the union of both sets.
				$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

				if ( $arRevIds ) {
					$this->processArRevIds( $dbw, $arRevIds, $revRows );
				}

				$this->output( "... $id-$endId\n" );
				$this->commitTransaction( $dbw, __METHOD__ );
			}
		}

		$this->output(
			"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
			. "$this->reassigned assigned new IDs.\n"
		);
		return true;
	}

	/**
	 * Process a set of ar_rev_ids
	 *
	 * Each archive row using one of the given IDs is either kept, queued for
	 * deletion (it duplicates the revision row or an earlier archive row), or
	 * queued for reassignment (it shares the ID with a different change). The
	 * queued deletions and reassignments are applied at the end.
	 *
	 * @param IDatabase $dbw
	 * @param int[] $arRevIds IDs to process
	 * @param object[] $revRows Existing revision-table row data, keyed by rev_id
	 */
	private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
		// Select all the data we need for deduplication
		$res = $dbw->select(
			[ 'archive' ] + $this->arActorQuery['tables'],
			[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
				+ $this->arActorQuery['fields'],
			[ 'ar_rev_id' => $arRevIds ],
			__METHOD__,
			[],
			$this->arActorQuery['joins']
		);

		// Determine which rows we need to delete or reassign.
		// $seen maps each rev_id to an array holding a 'first' entry (a
		// description of whichever row claimed the ID first) plus one entry per
		// distinct change seen so far, keyed by getSeenKey() => ar_id.
		$seen = [];
		$toDelete = [];
		$toReassign = [];
		foreach ( $res as $row ) {
			// Revision-table row exists?
			if ( isset( $revRows[$row->ar_rev_id] ) ) {
				$revRow = $revRows[$row->ar_rev_id];

				// Record the rev_id as seen, so the code below will always delete or reassign.
				if ( !isset( $seen[$revRow->rev_id] ) ) {
					$seen[$revRow->rev_id] = [
						'first' => "revision row",
					];
				}

				// Delete the archive row if it seems to be the same regardless
				// of page, because moves can change IDs and titles.
				if ( $row->ar_timestamp === $revRow->rev_timestamp &&
					$row->ar_sha1 === $revRow->rev_sha1 &&
					$row->ar_user === $revRow->rev_user &&
					$row->ar_user_text === $revRow->rev_user_text
				) {
					$this->output(
						"Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
					);
					$toDelete[] = $row->ar_id;
					continue;
				}
			}

			$key = $this->getSeenKey( $row );
			if ( !isset( $seen[$row->ar_rev_id] ) ) {
				// This rev_id hasn't even been seen yet, nothing to do besides record it.
				$seen[$row->ar_rev_id] = [
					'first' => "archive row $row->ar_id",
					$key => $row->ar_id,
				];
			} elseif ( !isset( $seen[$row->ar_rev_id][$key] ) ) {
				// The rev_id was seen, but not this particular change. Reassign it.
				$seen[$row->ar_rev_id][$key] = $row->ar_id;
				$this->output(
					"Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
					. "for rev_id $row->ar_rev_id, reassigning\n"
				);
				$toReassign[] = $row->ar_id;
			} else {
				// The rev_id was seen with a row that matches this change. Delete it.
				$this->output(
					"Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
					. "for rev_id $row->ar_rev_id, deleting\n"
				);
				$toDelete[] = $row->ar_id;
			}
		}

		// Perform the updates
		if ( $toDelete ) {
			$dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__ );
			$this->deleted += $dbw->affectedRows();
		}
		if ( $toReassign ) {
			$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $toReassign );
		}
	}

	/**
	 * Make a key identifying a "unique" change from a row
	 *
	 * The key joins the row's namespace, title, timestamp, SHA-1 and user
	 * fields with newlines, so two archive rows get the same key only when all
	 * of those fields match.
	 *
	 * @param object $row Archive row as selected by processArRevIds()
	 * @return string
	 */
	private function getSeenKey( $row ) {
		return implode( "\n", [
			$row->ar_namespace,
			$row->ar_title,
			$row->ar_timestamp,
			$row->ar_sha1,
			$row->ar_user,
			$row->ar_user_text,
		] );
	}

}
208
// Name the maintenance class defined in this file, then include the file named
// by RUN_MAINTENANCE_IF_MAIN (presumably this runs the script when the file
// is the entry point; confirm against Maintenance.php).
$maintClass = DeduplicateArchiveRevId::class;
require_once RUN_MAINTENANCE_IF_MAIN;