Merge "Make sure Parsoid doesn't get snobbish and treat non-html5 tags badly."
[lhc/web/wiklou.git] / maintenance / syncFileBackend.php
1 <?php
2 /**
3 * Sync one file backend to another based on the journal of the latter.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance
22 */
23
24 require_once( __DIR__ . '/Maintenance.php' );
25
26 /**
27 * Maintenance script that syncs one file backend to another based on
28 * the journal of the latter.
29 *
30 * @ingroup Maintenance
31 */
32 class SyncFileBackend extends Maintenance {
/**
 * Set the script description, register the command line options
 * and pick the default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->mDescription = "Sync one file backend with another using the journal";

	// One row per addOption() call; rows are registered in the order listed
	$optionSpecs = array(
		array( 'src', 'Name of backend to sync from', true, true ),
		array( 'dst', 'Name of destination backend to sync', false, true ),
		array( 'start', 'Starting journal ID', false, true ),
		array( 'end', 'Ending journal ID', false, true ),
		array( 'posdir', 'Directory to read/record journal positions', false, true ),
		array( 'posdump', 'Just dump current journal position into the position dir.' ),
		array( 'postime', 'For position dumps, get the ID at this time', false, true ),
		array( 'verbose', 'Verbose mode', false, false, 'v' ),
	);
	foreach ( $optionSpecs as $optionArgs ) {
		call_user_func_array( array( $this, 'addOption' ), $optionArgs );
	}

	$this->setBatchSize( 50 );
}
46
/**
 * Entry point of the script.
 *
 * With --posdump, only the journal position of the 'src' backend (the current
 * one, or the one at --postime) is written to the position file and the
 * method returns. Otherwise the 'dst' backend is synced to 'src' from the
 * starting journal ID (--start, or the ID after the one stored in the position
 * file) up to --end, and the position file is advanced on success.
 */
public function execute() {
	$src = FileBackendGroup::singleton()->get( $this->getOption( 'src' ) );

	$posDir = $this->getOption( 'posdir' );
	// An empty --posdir value is treated like a missing one, so that we never
	// end up calling file_put_contents() with a bogus (false) file name
	$posFile = ( is_string( $posDir ) && $posDir !== '' )
		? $posDir . '/' . wfWikiID()
		: false;

	if ( $this->hasOption( 'posdump' ) ) {
		// Just dump the current position into the specified position dir
		if ( $posFile === false ) {
			$this->error( "Param posdir required!", 1 );
		}
		if ( $this->hasOption( 'postime' ) ) {
			$id = (int)$src->getJournal()->getPositionAtTime( $this->getOption( 'postime' ) );
			$this->output( "Requested journal position is $id.\n" );
		} else {
			$id = (int)$src->getJournal()->getCurrentPosition();
			$this->output( "Current journal position is $id.\n" );
		}
		if ( file_put_contents( $posFile, $id, LOCK_EX ) !== false ) {
			$this->output( "Saved journal position file.\n" );
		} else {
			// Reported through error() so that the failure is still visible in quiet mode
			$this->error( "Could not save journal position file." );
		}
		if ( $this->isQuiet() ) {
			print $id; // give a single machine-readable number
		}
		return;
	}

	if ( !$this->hasOption( 'dst' ) ) {
		$this->error( "Param dst required!", 1 );
	}
	$dst = FileBackendGroup::singleton()->get( $this->getOption( 'dst' ) );

	// Command line values arrive as strings; normalize them to integers so the
	// comparisons and arithmetic below never operate on raw user input
	$start = (int)$this->getOption( 'start', 0 );
	if ( !$start && $posFile !== false && is_dir( $posDir ) ) {
		$start = is_file( $posFile )
			? (int)trim( file_get_contents( $posFile ) )
			: 0;
		++$start; // we already did this ID, start with the next one
		$startFromPosFile = true;
	} else {
		$startFromPosFile = false;
	}
	$end = $this->hasOption( 'end' ) ? (int)$this->getOption( 'end' ) : INF;

	$this->output( "Synchronizing backend '{$dst->getName()}' to '{$src->getName()}'...\n" );
	$this->output( "Starting journal position is $start.\n" );
	if ( is_finite( $end ) ) {
		$this->output( "Ending journal position is $end.\n" );
	}

	// Actually sync the dest backend with the reference backend
	$lastOKPos = $this->syncBackends( $src, $dst, $start, $end );

	// Update the sync position file, but only if we successfully advanced
	if ( $startFromPosFile && $lastOKPos !== false && $lastOKPos >= $start ) {
		if ( file_put_contents( $posFile, $lastOKPos, LOCK_EX ) !== false ) {
			$this->output( "Updated journal position file.\n" );
		} else {
			// Reported through error() so that the failure is still visible in quiet mode
			$this->error( "Could not update journal position file." );
		}
	}

	if ( $lastOKPos === false ) {
		if ( !$start ) {
			$this->output( "No journal entries found.\n" );
		} else {
			$this->output( "No new journal entries found.\n" );
		}
	} else {
		$this->output( "Stopped synchronization at journal position $lastOKPos.\n" );
	}

	if ( $this->isQuiet() ) {
		print $lastOKPos; // give a single machine-readable number
	}
}
125
/**
 * Replay the $src journal onto $dst, batch by batch, beginning at $start.
 * Stops at the end of the journal, once $end has been passed, or at the
 * first batch that fails to sync.
 *
 * @param $src FileBackend
 * @param $dst FileBackend
 * @param $start integer Starting journal position
 * @param $end integer Ending journal position
 * @return integer|false Highest journal entry ID among the successfully synced
 *   batches (0 if the first batch failed), or false if the first batch had no entries
 */
protected function syncBackends( FileBackend $src, FileBackend $dst, $start, $end ) {
	if ( $start > $end ) { // sanity
		$this->error( "Error: given starting ID greater than ending ID.", 1 );
	}

	$lastGoodId = 0; // nothing synced successfully so far
	$isFirstBatch = true;
	$next = null; // filled in by getChangeEntries()

	while ( true ) {
		// Never request entries beyond the ending ID
		$limit = min( $this->mBatchSize, $end - $start + 1 );
		$batchEndId = $start + $limit - 1;
		$this->output( "Doing id $start to {$batchEndId}...\n" );

		$changes = $src->getJournal()->getChangeEntries( $start, $limit, $next );
		$start = $next; // the following batch resumes from here
		if ( $isFirstBatch && count( $changes ) === 0 ) {
			return false; // nothing to do
		}
		$isFirstBatch = false;

		// Collect the distinct changed paths and remember the last ID of the batch
		$lastIdInBatch = 0;
		$changedPaths = array();
		foreach ( $changes as $change ) {
			$lastIdInBatch = $change['id'];
			if ( $change['op'] === 'null' ) {
				continue; // null ops are just for reference
			}
			$changedPaths[$change['path']] = 1; // array keys de-duplicate the paths
		}

		$status = $this->syncFileBatch( array_keys( $changedPaths ), $src, $dst );
		if ( !$status->isOK() ) {
			$this->error( print_r( $status->getErrorsArray(), true ) );
			break; // no gaps; everything up to the returned position must be OK
		}
		$lastGoodId = max( $lastGoodId, $lastIdInBatch );

		if ( !$start ) {
			$this->output( "End of journal entries.\n" );
			break;
		}
		if ( !( $start <= $end ) ) {
			break; // went past the ending ID
		}
	}

	return $lastGoodId;
}
179
/**
 * Bring a batch of $dst files in line with their $src counterparts.
 *
 * A file that exists in $src is stored into $dst (overwriting) unless
 * filesAreSame() reports both copies as identical; a file that does not
 * exist in $src is deleted from $dst. The resulting operations are run
 * in one go and retried once after a pause if they fail.
 *
 * @param $paths Array Storage paths to synchronize
 * @param $src FileBackend
 * @param $dst FileBackend
 * @return Status
 */
protected function syncFileBatch( array $paths, FileBackend $src, FileBackend $dst ) {
	$status = Status::newGood();
	if ( count( $paths ) === 0 ) {
		return $status; // nothing to do
	}

	// Source: convert internal backend names (FileBackendMultiWrite) to the public one
	$srcPaths = $this->replaceNamePaths( $paths, $src );
	// Destination: the same paths under the destination backend name
	$dstPaths = $this->replaceNamePaths( $paths, $dst );

	// Lock the live backend paths from modification; the lock objects are
	// kept in locals so that they stay alive until this method returns
	$srcLock = $src->getScopedFileLocks( $srcPaths, LockManager::LOCK_UW, $status );
	$dstLock = $dst->getScopedFileLocks( $dstPaths, LockManager::LOCK_EX, $status );
	if ( !$status->isOK() ) {
		return $status;
	}

	$operations = array();
	$localCopies = array(); // keeps TempFSFile objects alive as needed
	foreach ( $srcPaths as $idx => $srcPath ) {
		$dstPath = $dstPaths[$idx];
		$existsInSrc = $src->fileExists( array( 'src' => $srcPath, 'latest' => 1 ) );

		if ( $existsInSrc === false ) {
			// Gone from the source: remove it from the destination as well
			$operations[] = array( 'op' => 'delete', 'src' => $dstPath, 'ignoreMissingSource' => 1 );
			continue;
		}
		if ( $existsInSrc !== true ) {
			// Neither true nor false: the existence check itself failed
			$this->error( "Unable to sync '$dstPath': could not stat file." );
			$status->fatal( 'backend-fail-internal', $src->getName() );
			return $status;
		}

		if ( $this->filesAreSame( $src, $dst, $srcPath, $dstPath ) ) {
			continue; // avoid local copies for non-FS backends
		}

		// Note: getLocalReference() is fast for FS backends
		$localCopy = $src->getLocalReference( array( 'src' => $srcPath, 'latest' => 1 ) );
		if ( !$localCopy ) {
			$this->error( "Unable to sync '$dstPath': could not get local copy." );
			$status->fatal( 'backend-fail-internal', $src->getName() );
			return $status;
		}
		$localCopies[] = $localCopy;

		// Note: prepare() is usually fast for key/value backends
		$prepareStatus = $dst->prepare( array(
			'dir' => dirname( $dstPath ), 'bypassReadOnly' => 1 ) );
		$status->merge( $prepareStatus );
		if ( !$status->isOK() ) {
			return $status;
		}

		$operations[] = array( 'op' => 'store',
			'src' => $localCopy->getPath(), 'dst' => $dstPath, 'overwrite' => 1 );
	}

	$quickOpOptions = array( 'bypassReadOnly' => 1 );
	$startTime = microtime( true );
	$status = $dst->doQuickOperations( $operations, $quickOpOptions );
	if ( !$status->isOK() ) {
		sleep( 10 ); // wait and retry copy again
		$status = $dst->doQuickOperations( $operations, $quickOpOptions );
	}
	$elapsedMs = floor( ( microtime( true ) - $startTime ) * 1000 );

	if ( $status->isOK() && $this->getOption( 'verbose' ) ) {
		$fileList = implode( "\n", $dstPaths );
		$this->output( "Synchronized these file(s) [{$elapsedMs}ms]:\n" . $fileList . "\n" );
	}

	return $status;
}
254
/**
 * Rewrite the backend name at the start of "mwstore://<name>/..." storage
 * paths so that they point at the given backend.
 *
 * @param $paths Array|string List of paths or single string path
 * @param $backend FileBackend Backend whose name is substituted in
 * @return Array|string Same shape as $paths, with the backend name replaced
 */
protected function replaceNamePaths( $paths, FileBackend $backend ) {
	// Matches the scheme plus the backend name, up to (not including) the next slash
	$prefixPattern = '!^mwstore://([^/]+)!';
	$newPrefix = StringUtils::escapeRegexReplacement( "mwstore://" . $backend->getName() );

	// preg_replace() accepts either a single string or an array of subjects
	return preg_replace( $prefixPattern, $newPrefix, $paths );
}
268
/**
 * Check whether the $src and $dst copies of a file are identical, judging
 * by file size first and by SHA-1 only when the sizes agree.
 *
 * A failed lookup on the source side is never treated as a match: comparing
 * two failure values (false === false) would otherwise report the files as
 * identical and the caller would skip syncing the file.
 *
 * @param $src FileBackend
 * @param $dst FileBackend
 * @param $sPath string Storage path of the file in $src
 * @param $dPath string Storage path of the file in $dst
 * @return bool True only if both size and SHA-1 were obtained and match
 */
protected function filesAreSame( FileBackend $src, FileBackend $dst, $sPath, $dPath ) {
	// Cheap check first; this short-circuits the checksum lookups on mismatch
	$sSize = $src->getFileSize( array( 'src' => $sPath ) );
	if ( $sSize === false || $sSize !== $dst->getFileSize( array( 'src' => $dPath ) ) ) {
		return false;
	}

	$sSha1 = $src->getFileSha1Base36( array( 'src' => $sPath ) );
	if ( $sSha1 === false ) {
		return false; // could not hash the source; assume the files differ
	}

	return ( $sSha1 === $dst->getFileSha1Base36( array( 'src' => $dPath ) ) );
}
278 }
279
// Name the maintenance class defined above, then load the runner include
// (presumably it instantiates and runs $maintClass when this file is the entry script)
$maintClass = 'SyncFileBackend';
require_once RUN_MAINTENANCE_IF_MAIN;