Merge "maintenance: Script to rename titles for Unicode uppercasing changes"
[lhc/web/wiklou.git] / includes / Storage / SqlBlobStore.php
1 <?php
2 /**
3 * Service for storing and loading data blobs representing revision content.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * Attribution notice: when this file was created, much of its content was taken
21 * from the Revision.php file as present in release 1.30. Refer to the history
22 * of that file for original authorship.
23 *
24 * @file
25 */
26
27 namespace MediaWiki\Storage;
28
29 use DBAccessObjectUtils;
30 use IDBAccessObject;
31 use IExpiringStore;
32 use InvalidArgumentException;
33 use Language;
34 use MWException;
35 use WANObjectCache;
36 use ExternalStoreAccess;
37 use Wikimedia\Assert\Assert;
38 use Wikimedia\Rdbms\IDatabase;
39 use Wikimedia\Rdbms\ILoadBalancer;
40
41 /**
42 * Service for storing and loading data blobs representing revision content.
43 *
44 * @since 1.31
45 *
46 * @note This was written to act as a drop-in replacement for the corresponding
47 * static methods in Revision.
48 */
49 class SqlBlobStore implements IDBAccessObject, BlobStore {
50
51 // Note: the name has been taken unchanged from the Revision class.
52 const TEXT_CACHE_GROUP = 'revisiontext:10';
53
54 /**
55 * @var ILoadBalancer
56 */
57 private $dbLoadBalancer;
58
59 /**
60 * @var ExternalStoreAccess
61 */
62 private $extStoreAccess;
63
64 /**
65 * @var WANObjectCache
66 */
67 private $cache;
68
69 /**
70 * @var string|bool DB domain ID of a wiki or false for the local one
71 */
72 private $dbDomain;
73
74 /**
75 * @var int
76 */
77 private $cacheExpiry = 604800; // 7 days
78
79 /**
80 * @var bool
81 */
82 private $compressBlobs = false;
83
84 /**
85 * @var bool|string
86 */
87 private $legacyEncoding = false;
88
89 /**
90 * @var Language|null
91 */
92 private $legacyEncodingConversionLang = null;
93
94 /**
95 * @var bool
96 */
97 private $useExternalStore = false;
98
/**
 * Wires the store to its database, external storage and cache back ends.
 *
 * @param ILoadBalancer $dbLoadBalancer A load balancer for acquiring database connections
 * @param ExternalStoreAccess $extStoreAccess Access layer for external storage
 * @param WANObjectCache $cache A cache manager for caching blobs. This can be the local
 *   wiki's default instance even if $dbDomain refers to a different wiki, since
 *   makeGlobalKey() is used to construct a key that allows cached blobs from the
 *   same database to be re-used between wikis. For example, enwiki and frwiki will
 *   use the same cache keys for blobs from the wikidatawiki database, regardless of
 *   the cache's default key space.
 * @param bool|string $dbDomain The ID of the target wiki database. Use false for the local wiki.
 */
public function __construct(
	ILoadBalancer $dbLoadBalancer,
	ExternalStoreAccess $extStoreAccess,
	WANObjectCache $cache,
	$dbDomain = false
) {
	// Plain dependency capture; nothing is connected or fetched at construction time.
	$this->dbDomain = $dbDomain;
	$this->cache = $cache;
	$this->extStoreAccess = $extStoreAccess;
	$this->dbLoadBalancer = $dbLoadBalancer;
}
121
/**
 * Reports the configured blob cache lifetime.
 *
 * @return int Number of seconds for which blobs can be cached
 */
public function getCacheExpiry() {
	return $this->cacheExpiry;
}
128
/**
 * Configures the blob cache lifetime.
 *
 * The value is validated with Assert::parameterType() before it is stored.
 *
 * @param int $cacheExpiry Number of seconds for which blobs can be cached
 */
public function setCacheExpiry( $cacheExpiry ) {
	// Reject non-integer values up front rather than storing them.
	Assert::parameterType( 'integer', $cacheExpiry, '$cacheExpiry' );
	$this->cacheExpiry = $cacheExpiry;
}
137
/**
 * Reports whether compression is enabled for newly stored blobs.
 *
 * @return bool Whether blobs should be compressed for storage
 */
public function getCompressBlobs() {
	return $this->compressBlobs;
}
144
/**
 * Enables or disables compression of newly stored blobs.
 *
 * The value is stored as given; unlike the other setters, no type assertion is applied.
 *
 * @param bool $compressBlobs Whether blobs should be compressed for storage
 */
public function setCompressBlobs( $compressBlobs ) {
	$this->compressBlobs = $compressBlobs;
}
151
/**
 * Reports the legacy encoding assumed for blobs that are not marked as utf8.
 *
 * @return false|string The legacy encoding name. False means handling of legacy
 *   encoding is disabled, and utf8 is assumed.
 */
public function getLegacyEncoding() {
	return $this->legacyEncoding;
}
159
/**
 * Reports the language object used to convert blobs out of the legacy encoding.
 *
 * @return Language|null The locale to use when decoding from a legacy encoding, or null
 *   if handling of legacy encoding is disabled.
 */
public function getLegacyEncodingConversionLang() {
	return $this->legacyEncodingConversionLang;
}
167
/**
 * Enables legacy-encoding handling for blobs that are not marked as utf8.
 *
 * Both values are used together by decompressData(), which converts such blobs
 * to UTF-8 via the given language object.
 *
 * @param string $legacyEncoding The legacy encoding to assume for blobs that are
 *   not marked as utf8.
 * @param Language $language The locale to use when decoding from a legacy encoding.
 */
public function setLegacyEncoding( $legacyEncoding, Language $language ) {
	// Only a string encoding name is accepted.
	Assert::parameterType( 'string', $legacyEncoding, '$legacyEncoding' );

	$this->legacyEncodingConversionLang = $language;
	$this->legacyEncoding = $legacyEncoding;
}
179
/**
 * Reports whether new blobs are written through the ExternalStore mechanism.
 *
 * @return bool Whether to use the ExternalStore mechanism for storing blobs.
 */
public function getUseExternalStore() {
	return $this->useExternalStore;
}
186
/**
 * Switches use of the ExternalStore mechanism for new blobs on or off.
 *
 * The value is validated with Assert::parameterType() before it is stored.
 *
 * @param bool $useExternalStore Whether to use the ExternalStore mechanism for storing blobs.
 */
public function setUseExternalStore( $useExternalStore ) {
	// Strictly boolean: truthy non-boolean values are rejected by the assertion.
	Assert::parameterType( 'boolean', $useExternalStore, '$useExternalStore' );
	$this->useExternalStore = $useExternalStore;
}
195
/**
 * Internal accessor for the load balancer handed to the constructor.
 *
 * @return ILoadBalancer
 */
private function getDBLoadBalancer() {
	return $this->dbLoadBalancer;
}
202
/**
 * Obtains a connection to the database identified by $this->dbDomain.
 *
 * @param int $index A database index, like DB_MASTER or DB_REPLICA
 *
 * @return IDatabase
 */
private function getDBConnection( $index ) {
	// No query groups are requested; the domain selects the target wiki database.
	return $this->getDBLoadBalancer()->getConnection( $index, [], $this->dbDomain );
}
212
/**
 * Stores an arbitrary blob of data and returns an address that can be used with
 * getBlob() to retrieve the same blob of data.
 *
 * The data is first passed through compressData(). If use of the external store is
 * enabled, the (possibly compressed) data is written there and only the value returned
 * by the external store is kept in the text table, flagged as 'external'.
 *
 * @param string $data
 * @param array $hints An array of hints. Not consulted by this implementation.
 *
 * @throws BlobAccessException If the external store insert fails, or if an MWException
 *   is raised while storing.
 * @return string an address that can be used with getBlob() to retrieve the data.
 */
public function storeBlob( $data, $hints = [] ) {
	try {
		// compressData() may replace $data in place; it returns the flag list.
		$flags = $this->compressData( $data );

		if ( $this->useExternalStore ) {
			// Write to external storage; from here on $data holds what insert() returned.
			$data = $this->extStoreAccess->insert( $data, [ 'domain' => $this->dbDomain ] );
			if ( !$data ) {
				throw new BlobAccessException( "Failed to store text to external storage" );
			}

			// Append 'external' to the comma-separated flag list.
			$flags .= ( $flags ? ',' : '' ) . 'external';

			// TODO: we could also return an address for the external store directly here.
			// That would mean bypassing the text table entirely when the external store is
			// used. We'll need to assess expected fallout before doing that.
		}

		$dbw = $this->getDBConnection( DB_MASTER );

		$newRow = [
			'old_id' => $dbw->nextSequenceValue( 'text_old_id_seq' ),
			'old_text' => $data,
			'old_flags' => $flags,
		];
		$dbw->insert( 'text', $newRow, __METHOD__ );

		return self::makeAddressFromTextId( $dbw->insertId() );
	} catch ( MWException $e ) {
		// Re-wrap so that callers only need to handle BlobAccessException.
		throw new BlobAccessException( $e->getMessage(), 0, $e );
	}
}
264
/**
 * Retrieve a blob, given an address.
 * Currently hardcoded to the 'text' table storage engine.
 *
 * The lookup goes through the WAN cache; on a miss the blob is loaded by fetchBlob().
 *
 * MCR migration note: this replaces Revision::loadText
 *
 * @param string $blobAddress
 * @param int $queryFlags
 *
 * @throws BlobAccessException If the blob could not be loaded
 * @return string
 */
public function getBlob( $blobAddress, $queryFlags = 0 ) {
	Assert::parameterType( 'string', $blobAddress, '$blobAddress' );

	$cacheOptions = [
		'pcGroup' => self::TEXT_CACHE_GROUP,
		'pcTTL' => IExpiringStore::TTL_PROC_LONG,
	];

	// Ignore $setOpts; blobs are immutable and negatives are not cached
	$loadFromStorage = function ( $unused, &$ttl, &$setOpts ) use ( $blobAddress, $queryFlags ) {
		return $this->fetchBlob( $blobAddress, $queryFlags );
	};

	// No negative caching; negative hits on text rows may be due to corrupted replica DBs
	$blob = $this->cache->getWithSetCallback(
		$this->getCacheKey( $blobAddress ),
		$this->getCacheTTL(),
		$loadFromStorage,
		$cacheOptions
	);

	if ( $blob !== false ) {
		return $blob;
	}

	throw new BlobAccessException( 'Failed to load blob from address ' . $blobAddress );
}
297
/**
 * Loads a blob from the text table, bypassing the blob cache used by getBlob().
 *
 * Only addresses using the 'tt' schema are supported. The row is expanded with
 * expandBlob() before being returned.
 *
 * MCR migration note: this corresponds to Revision::fetchText
 *
 * @param string $blobAddress
 * @param int $queryFlags
 *
 * @throws BlobAccessException If the address uses an unknown schema or has a malformed ID
 * @throws InvalidArgumentException If splitBlobAddress() cannot parse the address
 * @return string|false The blob, or false if the row is missing or its data is bad
 */
private function fetchBlob( $blobAddress, $queryFlags ) {
	list( $schema, $id ) = self::splitBlobAddress( $blobAddress );

	//TODO: MCR: also support 'ex' schema with ExternalStore URLs, plus flags encoded in the URL!
	if ( $schema !== 'tt' ) {
		// XXX: change to better exceptions! That makes migration more difficult, though.
		throw new BlobAccessException( "Unknown blob address schema: $schema" );
	}

	// The ID must be a non-zero integer in canonical decimal form.
	$textId = intval( $id );
	if ( !$textId || $id !== (string)$textId ) {
		// XXX: change to better exceptions! That makes migration more difficult, though.
		throw new BlobAccessException( "Bad blob address: $blobAddress" );
	}

	// Callers doing updates will pass in READ_LATEST as usual. Since the text/blob tables
	// do not normally get rows changed around, set READ_LATEST_IMMUTABLE in those cases.
	if ( DBAccessObjectUtils::hasFlags( $queryFlags, self::READ_LATEST ) ) {
		$queryFlags |= self::READ_LATEST_IMMUTABLE;
	}

	list( $index, $options, $fallbackIndex, $fallbackOptions ) =
		DBAccessObjectUtils::getDBOptions( $queryFlags );

	// Both the primary attempt and the fallback run the same query.
	$fields = [ 'old_text', 'old_flags' ];
	$conds = [ 'old_id' => $textId ];

	// Text data is immutable; check replica DBs first.
	$row = $this->getDBConnection( $index )
		->selectRow( 'text', $fields, $conds, __METHOD__, $options );

	// Fallback to DB_MASTER in some cases if the row was not found, using the appropriate
	// options, such as FOR UPDATE to avoid missing rows due to REPEATABLE-READ.
	if ( !$row && $fallbackIndex !== null ) {
		$row = $this->getDBConnection( $fallbackIndex )
			->selectRow( 'text', $fields, $conds, __METHOD__, $fallbackOptions );
	}

	if ( !$row ) {
		wfWarn( __METHOD__ . ": No text row with ID $textId." );
		return false;
	}

	$blob = $this->expandBlob( $row->old_text, $row->old_flags, $blobAddress );
	if ( $blob !== false ) {
		return $blob;
	}

	wfLogWarning( __METHOD__ . ": Bad data in text row $textId." );
	return false;
}
367
/**
 * Get a cache key for a given Blob address.
 *
 * The key is a global key built from the resolved database domain and the address, so
 * cached blobs from the same database can be re-used between wikis. For example, enwiki
 * and frwiki will use the same cache keys for blobs from the wikidatawiki database.
 *
 * @param string $blobAddress
 * @return string
 */
private function getCacheKey( $blobAddress ) {
	// Resolve false ("local wiki") to a concrete domain ID before keying on it.
	$domainId = $this->dbLoadBalancer->resolveDomainID( $this->dbDomain );

	return $this->cache->makeGlobalKey( 'BlobStore', 'address', $domainId, $blobAddress );
}
386
/**
 * Expand a raw data blob according to the flags given.
 *
 * Blobs flagged 'external' are treated as external store URLs and fetched from there
 * (through the cache when $cacheKey is given); everything else is handed straight to
 * decompressData().
 *
 * MCR migration note: this replaces Revision::getRevisionText
 *
 * @note direct use is deprecated, use getBlob() or SlotRecord::getContent() instead.
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param string $raw The raw blob data, to be processed according to $flags.
 *   May be the blob itself, or the blob compressed, or just the address
 *   of the actual blob, depending on $flags.
 * @param string|string[] $flags Blob flags, such as 'external' or 'gzip'.
 *   Note that not including 'utf-8' in $flags will cause the data to be decoded
 *   according to the legacy encoding specified via setLegacyEncoding.
 * @param string|null $cacheKey A blob address for use in the cache key. If not given,
 *   caching is disabled.
 *
 * @return false|string The expanded blob or false on failure
 */
public function expandBlob( $raw, $flags, $cacheKey = null ) {
	if ( is_string( $flags ) ) {
		$flags = explode( ',', $flags );
	}

	if ( !in_array( 'external', $flags ) ) {
		// The text row holds the data itself.
		return $this->decompressData( $raw, $flags );
	}

	// Use external methods for external objects, text in table is URL-only then
	$url = $raw;
	$urlParts = explode( '://', $url, 2 );
	if ( count( $urlParts ) == 1 || $urlParts[1] == '' ) {
		return false;
	}

	// Fetches the external blob and decompresses it; shared by both paths below.
	$loadExternal = function () use ( $url, $flags ) {
		$blob = $this->extStoreAccess
			->fetchFromURL( $url, [ 'domain' => $this->dbDomain ] );

		return $blob === false ? false : $this->decompressData( $blob, $flags );
	};

	if ( !$cacheKey ) {
		return $loadExternal();
	}

	// The cached value should be decompressed, so handle that and return here.
	// Ignore $setOpts; blobs are immutable and negatives are not cached
	return $this->cache->getWithSetCallback(
		$this->getCacheKey( $cacheKey ),
		$this->getCacheTTL(),
		$loadExternal,
		[ 'pcGroup' => self::TEXT_CACHE_GROUP, 'pcTTL' => WANObjectCache::TTL_PROC_LONG ]
	);
}
441
/**
 * Compresses a blob when blob compression is enabled (see setCompressBlobs()).
 * The input string is modified in place.
 *
 * The return value is the flags field: it always contains 'utf-8', and additionally
 * 'gzip' if the data was compressed.
 *
 * MCR migration note: this replaces Revision::compressRevisionText
 *
 * @note direct use is deprecated!
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param mixed &$blob Reference to a text
 *
 * @return string Comma-separated list of flags
 */
public function compressData( &$blob ) {
	// Revisions not marked as UTF-8 will have legacy decoding applied by decompressData().
	// XXX: if $this->legacyEncoding is not set, we could skip this. That would however be
	// risky, since $this->legacyEncoding being set in the future would lead to data corruption.
	$blobFlags = [ 'utf-8' ];

	if ( !$this->compressBlobs ) {
		return implode( ',', $blobFlags );
	}

	if ( !function_exists( 'gzdeflate' ) ) {
		wfDebug( __METHOD__ . " -- no zlib support, not compressing\n" );
		return implode( ',', $blobFlags );
	}

	$deflated = gzdeflate( $blob );
	if ( $deflated !== false ) {
		$blob = $deflated;
		$blobFlags[] = 'gzip';
	} else {
		// Leave $blob untouched so that the uncompressed data gets stored.
		wfLogWarning( __METHOD__ . ': gzdeflate() failed' );
	}

	return implode( ',', $blobFlags );
}
482
/**
 * Re-converts revision text according to its flags.
 *
 * Processing order: 'error' short-circuits to false, 'gzip' data is inflated, 'object'
 * data is unserialized and asked for its text, and finally blobs lacking a
 * 'utf-8'/'utf8' flag are converted from the legacy encoding, if one is configured.
 *
 * MCR migration note: this replaces Revision::decompressRevisionText
 *
 * @note direct use is deprecated, use getBlob() or SlotRecord::getContent() instead.
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param string $blob Blob in compressed/encoded form.
 * @param array $blobFlags Compression flags, such as 'gzip'.
 *   Note that not including 'utf-8' in $blobFlags will cause the data to be decoded
 *   according to the legacy encoding specified via setLegacyEncoding.
 *
 * @return string|bool Decompressed text, or false on failure
 */
public function decompressData( $blob, array $blobFlags ) {
	// Revision::decompressRevisionText accepted false here, so defend against that
	Assert::parameterType( 'string', $blob, '$blob' );

	// Flags are matched strictly so that loosely-equal values (such as true, or 0 on
	// older PHP versions) cannot be mistaken for a flag name.
	if ( in_array( 'error', $blobFlags, true ) ) {
		// Error row, return false
		return false;
	}

	if ( in_array( 'gzip', $blobFlags, true ) ) {
		# Deal with optional compression of archived pages.
		# This can be done periodically via maintenance/compressOld.php, and
		# as pages are saved if $wgCompressRevisions is set.
		$blob = gzinflate( $blob );

		if ( $blob === false ) {
			wfWarn( __METHOD__ . ': gzinflate() failed' );
			return false;
		}
	}

	if ( in_array( 'object', $blobFlags, true ) ) {
		# Generic compressed storage
		// NOTE(review): unserialize() runs on stored data; this is only safe as long as
		// the storage back end is trusted.
		$obj = unserialize( $blob );

		// Besides non-objects, also reject objects that cannot provide text (for example
		// __PHP_Incomplete_Class when the serialized class is not loadable). Calling
		// getText() on those would be a fatal error instead of a reported failure.
		if ( !is_object( $obj ) || !is_callable( [ $obj, 'getText' ] ) ) {
			// Invalid object
			return false;
		}
		$blob = $obj->getText();
	}

	// Needed to support old revisions left over from the 1.4 / 1.5 migration.
	if ( $blob !== false && $this->legacyEncoding && $this->legacyEncodingConversionLang
		&& !in_array( 'utf-8', $blobFlags, true ) && !in_array( 'utf8', $blobFlags, true )
	) {
		# Old revisions kept around in a legacy encoding?
		# Upconvert on demand.
		# ("utf8" checked for compatibility with some broken
		# conversion scripts 2008-12-30)
		$blob = $this->legacyEncodingConversionLang->iconv( $this->legacyEncoding, 'UTF-8', $blob );
	}

	return $blob;
}
542
/**
 * Get the text cache TTL
 *
 * Returns the configured cache expiry, or TTL_UNCACHEABLE when the expiry is 0 or the
 * cache's emulation QoS is at or below QOS_EMULATION_SQL.
 *
 * MCR migration note: this replaces Revision::getCacheTTL
 *
 * @return int
 */
private function getCacheTTL() {
	$emulationQoS = $this->cache->getQoS( WANObjectCache::ATTR_EMULATION );

	if ( $emulationQoS <= WANObjectCache::QOS_EMULATION_SQL ) {
		// Do not cache RDBMs blobs in...the RDBMs store
		return WANObjectCache::TTL_UNCACHEABLE;
	}

	if ( !$this->cacheExpiry ) {
		// An expiry of 0 disables caching.
		return WANObjectCache::TTL_UNCACHEABLE;
	}

	return $this->cacheExpiry;
}
562
/**
 * Returns an ID corresponding to the old_id field in the text table, corresponding
 * to the given $address.
 *
 * Currently, $address must start with 'tt:' followed by a decimal integer representing
 * the old_id; if $address does not start with 'tt:', null is returned. However,
 * the implementation may change to insert rows into the text table on the fly.
 * This implies that this method cannot be static.
 *
 * @note This method exists for use with the text table based storage schema.
 * It should not be assumed that it will function with all future kinds of content addresses.
 *
 * @deprecated since 1.31, so don't assume that all blob addresses refer to a row in the text
 * table. This method should become private once the relevant refactoring in WikiPage is
 * complete.
 *
 * @param string $address
 *
 * @throws InvalidArgumentException If the address cannot be parsed, or if the ID part of
 *   a 'tt' address is not a non-zero integer in canonical decimal form.
 * @return int|null
 */
public function getTextIdFromAddress( $address ) {
	list( $schema, $id ) = self::splitBlobAddress( $address );

	if ( $schema === 'tt' ) {
		$textId = intval( $id );

		// Round-tripping through int must reproduce the ID exactly, and 0 is not valid.
		if ( $textId && $id === (string)$textId ) {
			return $textId;
		}

		throw new InvalidArgumentException( "Malformed text_id: $id" );
	}

	// Not a text table address.
	return null;
}
598
/**
 * Returns an address referring to content stored in the text table row with the given ID.
 * The address schema for blobs stored in the text table is "tt:" followed by an integer
 * that corresponds to a value of the old_id field.
 *
 * @deprecated since 1.31. This method should become private once the relevant refactoring
 * in WikiPage is complete.
 *
 * @param int $id
 *
 * @return string
 */
public static function makeAddressFromTextId( $id ) {
	return "tt:$id";
}
614
/**
 * Splits a blob address into three parts: the schema, the ID, and parameters/flags.
 *
 * An address has the form "schema:id" optionally followed by "?" and a query string.
 * The schema is returned in lower case; the ID is returned as given. The query string,
 * if present, is parsed with wfCgiToArray().
 *
 * @since 1.33
 *
 * @param string $address
 *
 * @throws InvalidArgumentException If $address does not have the expected form
 * @return array [ $schema, $id, $parameters ], with $parameters being an assoc array.
 */
public static function splitBlobAddress( $address ) {
	// The D modifier anchors "$" at the very end of the string. Without it, "$" would
	// also match before a trailing newline, so "tt:123\n" would pass as a valid address.
	if ( !preg_match( '/^(\w+):(\w+)(\?(.*))?$/D', $address, $m ) ) {
		throw new InvalidArgumentException( "Bad blob address: $address" );
	}

	$schema = strtolower( $m[1] );
	$id = $m[2];
	// Group 4 is only set when the address contains a "?" part.
	$parameters = isset( $m[4] ) ? wfCgiToArray( $m[4] ) : [];

	return [ $schema, $id, $parameters ];
}
636
/**
 * Reports whether this store is currently unable to accept writes.
 *
 * That is the case if the external store is in use and reports itself read-only, or
 * if the load balancer reports a read-only reason.
 *
 * @return bool
 */
public function isReadOnly() {
	$externalStoreReadOnly = $this->useExternalStore && $this->extStoreAccess->isReadOnly();

	// The load balancer is only consulted when the external store does not already
	// block writes.
	return $externalStoreReadOnly
		|| $this->getDBLoadBalancer()->getReadOnlyReason() !== false;
}
644 }