Merge "Add semantic tags to license info text"
[lhc/web/wiklou.git] / includes / Storage / SqlBlobStore.php
1 <?php
2 /**
3 * Service for storing and loading data blobs representing revision content.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * Attribution notice: when this file was created, much of its content was taken
21 * from the Revision.php file as present in release 1.30. Refer to the history
22 * of that file for original authorship.
23 *
24 * @file
25 */
26
27 namespace MediaWiki\Storage;
28
29 use DBAccessObjectUtils;
30 use ExternalStore;
31 use IDBAccessObject;
32 use IExpiringStore;
33 use InvalidArgumentException;
34 use Language;
35 use MWException;
36 use WANObjectCache;
37 use Wikimedia\Assert\Assert;
38 use Wikimedia\Rdbms\IDatabase;
39 use Wikimedia\Rdbms\LoadBalancer;
40
41 /**
42 * Service for storing and loading data blobs, using the text table and optionally ExternalStore.
43 *
44 * @since 1.31
45 *
46 * @note This was written to act as a drop-in replacement for the corresponding
47 * static methods in Revision.
48 */
49 class SqlBlobStore implements IDBAccessObject, BlobStore {
50
51 // Note: the name has been taken unchanged from the Revision class.
52 const TEXT_CACHE_GROUP = 'revisiontext:10';
53
54 /**
55 * @var LoadBalancer
56 */
57 private $dbLoadBalancer;
58
59 /**
60 * @var WANObjectCache
61 */
62 private $cache;
63
64 /**
65 * @var bool|string Wiki ID
66 */
67 private $wikiId;
68
69 /**
70 * @var int
71 */
72 private $cacheExpiry = 604800; // 7 days
73
74 /**
75 * @var bool
76 */
77 private $compressBlobs = false;
78
79 /**
80 * @var bool|string
81 */
82 private $legacyEncoding = false;
83
84 /**
85 * @var Language|null
86 */
87 private $legacyEncodingConversionLang = null;
88
89 /**
90 * @var boolean
91 */
92 private $useExternalStore = false;
93
/**
 * Set up the blob store with the services it depends on.
 *
 * @param LoadBalancer $dbLoadBalancer Load balancer used to obtain database connections
 * @param WANObjectCache $cache Cache used for caching blobs
 * @param bool|string $wikiId ID of the target wiki database, or false for the local wiki
 */
public function __construct(
	LoadBalancer $dbLoadBalancer,
	WANObjectCache $cache,
	$wikiId = false
) {
	$this->wikiId = $wikiId;
	$this->cache = $cache;
	$this->dbLoadBalancer = $dbLoadBalancer;
}
108
/**
 * Get the configured cache lifetime for blobs.
 *
 * @return int Number of seconds for which blobs may be cached
 */
public function getCacheExpiry() {
	return $this->cacheExpiry;
}
115
/**
 * Set the cache lifetime for blobs.
 *
 * The value is checked with Assert::parameterType() and must be an integer.
 *
 * @param int $cacheExpiry Number of seconds for which blobs may be cached
 */
public function setCacheExpiry( $cacheExpiry ) {
	Assert::parameterType( 'integer', $cacheExpiry, '$cacheExpiry' );
	$this->cacheExpiry = $cacheExpiry;
}
124
/**
 * Tell whether compression of blobs is enabled.
 *
 * @return bool True if blobs should be compressed before being stored
 */
public function getCompressBlobs() {
	return $this->compressBlobs;
}
131
/**
 * Enable or disable compression of blobs.
 *
 * The value is stored as given; unlike setUseExternalStore(), no type check is applied.
 *
 * @param bool $compressBlobs True if blobs should be compressed before being stored
 */
public function setCompressBlobs( $compressBlobs ) {
	$this->compressBlobs = $compressBlobs;
}
138
/**
 * Get the legacy encoding assumed for blobs that are not marked as utf8.
 *
 * @return false|string The encoding name, or false if legacy encoding handling is
 *  disabled and utf8 is assumed.
 */
public function getLegacyEncoding() {
	return $this->legacyEncoding;
}
146
/**
 * Get the language object used for converting blobs from the legacy encoding.
 *
 * @return Language|null The locale used when decoding from a legacy encoding, or null
 *  if legacy encoding handling is disabled.
 */
public function getLegacyEncodingConversionLang() {
	return $this->legacyEncodingConversionLang;
}
154
/**
 * Enable handling of a legacy encoding.
 *
 * Once both values are set, decompressData() converts blobs that carry neither the
 * 'utf-8' nor the 'utf8' flag from $legacyEncoding to UTF-8 using $language.
 *
 * @param string $legacyEncoding The encoding to assume for blobs not marked as utf8
 * @param Language $language The locale to use when decoding from the legacy encoding
 */
public function setLegacyEncoding( $legacyEncoding, Language $language ) {
	Assert::parameterType( 'string', $legacyEncoding, '$legacyEncoding' );

	$this->legacyEncodingConversionLang = $language;
	$this->legacyEncoding = $legacyEncoding;
}
166
/**
 * Tell whether new blobs are written via the ExternalStore mechanism.
 *
 * @return bool True if ExternalStore is used for storing blobs
 */
public function getUseExternalStore() {
	return $this->useExternalStore;
}
173
/**
 * Enable or disable use of the ExternalStore mechanism for storing blobs.
 *
 * The value is checked with Assert::parameterType() and must be a boolean.
 *
 * @param bool $useExternalStore True to write new blobs via ExternalStore
 */
public function setUseExternalStore( $useExternalStore ) {
	Assert::parameterType( 'boolean', $useExternalStore, '$useExternalStore' );
	$this->useExternalStore = $useExternalStore;
}
182
/**
 * Get the load balancer that was passed to the constructor.
 *
 * @return LoadBalancer
 */
private function getDBLoadBalancer() {
	return $this->dbLoadBalancer;
}
189
/**
 * Get a database connection for this store's target wiki.
 *
 * The connection is requested from the load balancer with an empty group list
 * and the wiki ID this store was constructed with.
 *
 * @param int $index A database index, like DB_MASTER or DB_REPLICA
 *
 * @return IDatabase
 */
private function getDBConnection( $index ) {
	return $this->getDBLoadBalancer()->getConnection( $index, [], $this->wikiId );
}
199
/**
 * Store an arbitrary blob of data and return an address that can be passed to
 * getBlob() to retrieve the same data.
 *
 * The data is first passed through compressData(). If use of the external store is
 * enabled, the resulting data is written there, and the value returned by the external
 * store is what ends up in the text table, with the 'external' flag added. In either
 * case a row is inserted into the text table and the returned address refers to it.
 *
 * @param string $data The blob to store
 * @param array $hints An array of hints. Not evaluated by this implementation.
 *
 * @throws BlobAccessException If the external store reports failure, or if an
 *  MWException is caught while storing.
 * @return string An address of the form "tt:<id>" that can be used with getBlob()
 *  to retrieve the data.
 */
public function storeBlob( $data, $hints = [] ) {
	try {
		// May replace $data with its compressed form.
		$flags = $this->compressData( $data );

		if ( $this->useExternalStore ) {
			// From here on, $data holds whatever the external store handed back
			// (the URL of the stored blob) instead of the blob itself.
			$data = ExternalStore::insertToDefault( $data );
			if ( !$data ) {
				throw new BlobAccessException( "Failed to store text to external storage" );
			}

			$flags = ( $flags ? $flags . ',' : $flags ) . 'external';

			// TODO: we could also return an address for the external store directly here.
			// That would mean bypassing the text table entirely when the external store is
			// used. We'll need to assess expected fallout before doing that.
		}

		$dbw = $this->getDBConnection( DB_MASTER );

		$textRow = [
			'old_id' => $dbw->nextSequenceValue( 'text_old_id_seq' ),
			'old_text' => $data,
			'old_flags' => $flags,
		];
		$dbw->insert( 'text', $textRow, __METHOD__ );

		return 'tt:' . $dbw->insertId();
	} catch ( MWException $e ) {
		throw new BlobAccessException( $e->getMessage(), 0, $e );
	}
}
251
/**
 * Retrieve a blob, given an address.
 * Currently hardcoded to the 'text' table storage engine.
 *
 * The lookup goes through the cache; on a miss the blob is loaded via fetchBlob().
 *
 * MCR migration note: this replaces Revision::loadText
 *
 * @param string $blobAddress An address as returned by storeBlob()
 * @param int $queryFlags Query flags, passed on to fetchBlob()
 *
 * @throws BlobAccessException If no blob could be loaded for the address
 * @return string
 */
public function getBlob( $blobAddress, $queryFlags = 0 ) {
	Assert::parameterType( 'string', $blobAddress, '$blobAddress' );

	// TODO: change key, since this is not necessarily revision text!
	$cacheKey = $this->cache->makeKey( 'revisiontext', 'textid', $blobAddress );
	$ttl = $this->getCacheTTL();

	$loadBlob = function () use ( $blobAddress, $queryFlags ) {
		return $this->fetchBlob( $blobAddress, $queryFlags );
	};

	$cacheOptions = [
		'pcGroup' => self::TEXT_CACHE_GROUP,
		'pcTTL' => IExpiringStore::TTL_PROC_LONG,
	];

	// No negative caching; negative hits on text rows may be due to corrupted replica DBs
	$blob = $this->cache->getWithSetCallback( $cacheKey, $ttl, $loadBlob, $cacheOptions );

	if ( $blob !== false ) {
		return $blob;
	}

	throw new BlobAccessException( 'Failed to load blob from address ' . $blobAddress );
}
284
/**
 * Load a blob from the text table, bypassing the cache lookup done by getBlob().
 *
 * MCR migration note: this corresponds to Revision::fetchText
 *
 * @param string $blobAddress An address using the 'tt' schema
 * @param int $queryFlags Query flags, used to pick the database and query options
 *
 * @throws BlobAccessException If the address uses an unknown schema or does not
 *  contain a valid text ID
 * @throws InvalidArgumentException If the address cannot be split at all
 * @return string|false The blob, or false if the row is missing or its data is bad
 */
private function fetchBlob( $blobAddress, $queryFlags ) {
	list( $schema, $id, ) = self::splitBlobAddress( $blobAddress );

	//TODO: MCR: also support 'ex' schema with ExternalStore URLs, plus flags encoded in the URL!
	//TODO: MCR: also support 'ar' schema for content blobs in old style archive rows!
	if ( $schema !== 'tt' ) {
		// XXX: change to better exceptions! That makes migration more difficult, though.
		throw new BlobAccessException( "Unknown blob address schema: $schema" );
	}

	// Only a canonical, non-zero decimal integer is accepted as ID.
	$textId = intval( $id );
	if ( !$textId || $id !== (string)$textId ) {
		// XXX: change to better exceptions! That makes migration more difficult, though.
		throw new BlobAccessException( "Bad blob address: $blobAddress" );
	}

	// Callers doing updates will pass in READ_LATEST as usual. Since the text/blob tables
	// do not normally get rows changed around, set READ_LATEST_IMMUTABLE in those cases.
	if ( DBAccessObjectUtils::hasFlags( $queryFlags, self::READ_LATEST ) ) {
		$queryFlags |= self::READ_LATEST_IMMUTABLE;
	}

	list( $index, $options, $fallbackIndex, $fallbackOptions ) =
		DBAccessObjectUtils::getDBOptions( $queryFlags );

	// __METHOD__ would name the closure if evaluated inside it, so capture it here.
	$fname = __METHOD__;
	$selectTextRow = function ( $dbIndex, $selectOptions ) use ( $textId, $fname ) {
		return $this->getDBConnection( $dbIndex )->selectRow(
			'text',
			[ 'old_text', 'old_flags' ],
			[ 'old_id' => $textId ],
			$fname,
			$selectOptions
		);
	};

	// Text data is immutable; check replica DBs first.
	$row = $selectTextRow( $index, $options );

	// Fallback to DB_MASTER in some cases if the row was not found, using the appropriate
	// options, such as FOR UPDATE to avoid missing rows due to REPEATABLE-READ.
	if ( !$row && $fallbackIndex !== null ) {
		$row = $selectTextRow( $fallbackIndex, $fallbackOptions );
	}

	if ( !$row ) {
		wfWarn( __METHOD__ . ": No text row with ID $textId." );
		return false;
	}

	$blob = $this->expandBlob( $row->old_text, $row->old_flags, $blobAddress );
	if ( $blob !== false ) {
		return $blob;
	}

	wfWarn( __METHOD__ . ": Bad data in text row $textId." );
	return false;
}
355
/**
 * Expand a raw data blob according to the flags given.
 *
 * Without the 'external' flag, $raw is handed to decompressData() directly. With it,
 * $raw is treated as an external store URL: the blob is fetched from there and then
 * passed through decompressData(). The fetch goes through the cache if a cache key
 * was given and this store operates on the local wiki.
 *
 * MCR migration note: this replaces Revision::getRevisionText
 *
 * @note direct use is deprecated, use getBlob() or SlotRecord::getContent() instead.
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param string $raw The raw blob data, to be processed according to $flags.
 *   May be the blob itself, or the blob compressed, or just the address
 *   of the actual blob, depending on $flags.
 * @param string|string[] $flags Blob flags, such as 'external' or 'gzip'. A string is
 *   split on commas.
 * @param string|null $cacheKey May be used for caching if given
 *
 * @return false|string The expanded blob or false on failure
 */
public function expandBlob( $raw, $flags, $cacheKey = null ) {
	if ( is_string( $flags ) ) {
		$flags = explode( ',', $flags );
	}

	if ( !in_array( 'external', $flags ) ) {
		return $this->decompressData( $raw, $flags );
	}

	// Use external methods for external objects, text in table is URL-only then
	$url = $raw;
	$urlParts = explode( '://', $url, 2 );
	if ( count( $urlParts ) == 1 || $urlParts[1] == '' ) {
		return false;
	}

	$fetchAndDecompress = function () use ( $url, $flags ) {
		// No negative caching per BlobStore::getBlob()
		$blob = ExternalStore::fetchFromURL( $url, [ 'wiki' => $this->wikiId ] );

		return $this->decompressData( $blob, $flags );
	};

	if ( !$cacheKey || $this->wikiId !== false ) {
		return $fetchAndDecompress();
	}

	// Make use of the wiki-local revision text cache.
	// The cached value should be decompressed, so handle that and return here.
	// NOTE: we rely on $this->cache being the right cache for $this->wikiId!
	return $this->cache->getWithSetCallback(
		// TODO: change key, since this is not necessarily revision text!
		$this->cache->makeKey( 'revisiontext', 'textid', $cacheKey ),
		$this->getCacheTTL(),
		$fetchAndDecompress,
		[ 'pcGroup' => self::TEXT_CACHE_GROUP, 'pcTTL' => WANObjectCache::TTL_PROC_LONG ]
	);
}
409
/**
 * Compress the given data if compression is enabled for this store.
 * The input string is modified in place.
 *
 * The return value is the flags field for the blob: it always contains 'utf-8', and
 * additionally 'gzip' if the data was compressed. If compression is enabled but zlib
 * is unavailable or gzdeflate() fails, the data is left as it is and only 'utf-8'
 * is returned.
 *
 * MCR migration note: this replaces Revision::compressRevisionText
 *
 * @note direct use is deprecated!
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param mixed &$blob Reference to a text
 *
 * @return string Comma separated list of flags
 */
public function compressData( &$blob ) {
	// Revisions not marked as UTF-8 will have legacy decoding applied by decompressData().
	// XXX: if $this->legacyEncoding is not set, we could skip this. May be risky, though.
	$flags = 'utf-8';

	if ( !$this->compressBlobs ) {
		return $flags;
	}

	if ( !function_exists( 'gzdeflate' ) ) {
		wfDebug( __METHOD__ . " -- no zlib support, not compressing\n" );
		return $flags;
	}

	$compressed = gzdeflate( $blob );
	if ( $compressed === false ) {
		wfLogWarning( __METHOD__ . ': gzdeflate() failed' );
		return $flags;
	}

	$blob = $compressed;
	return $flags . ',gzip';
}
449
/**
 * Re-converts revision text according to its flags.
 *
 * The steps are applied in this order: data flagged 'gzip' is inflated; data flagged
 * 'object' is unserialized and replaced by the result of the object's getText()
 * method; and, if a legacy encoding is configured, data flagged neither 'utf-8' nor
 * 'utf8' is converted from that encoding to UTF-8.
 *
 * MCR migration note: this replaces Revision::decompressRevisionText
 *
 * @note direct use is deprecated, use getBlob() or SlotRecord::getContent() instead.
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param string|false $blob The stored data (passed by value), or false if it could
 *   not be fetched
 * @param array $blobFlags Blob flags, such as 'gzip', 'object' or 'utf-8'
 *
 * @return string|bool Decompressed text, or false on failure
 */
public function decompressData( $blob, $blobFlags ) {
	if ( $blob === false ) {
		// Text failed to be fetched; nothing to do
		return false;
	}

	// Flags are compared strictly, so that a stray non-string entry in $blobFlags
	// (e.g. true) cannot be mistaken for a flag name.
	if ( in_array( 'gzip', $blobFlags, true ) ) {
		# Deal with optional compression of archived pages.
		# This can be done periodically via maintenance/compressOld.php, and
		# as pages are saved if $wgCompressRevisions is set.
		$blob = gzinflate( $blob );

		if ( $blob === false ) {
			wfLogWarning( __METHOD__ . ': gzinflate() failed' );
			return false;
		}
	}

	if ( in_array( 'object', $blobFlags, true ) ) {
		# Generic compressed storage
		// NOTE(review): unserialize() can instantiate arbitrary classes; this relies on
		// the stored data being trusted.
		$obj = unserialize( $blob );
		if ( !is_object( $obj ) || !is_callable( [ $obj, 'getText' ] ) ) {
			// Invalid object, or one that cannot provide any text. Report failure
			// instead of triggering a fatal error on the getText() call below.
			return false;
		}
		$blob = $obj->getText();
	}

	// Needed to support old revisions left over from the 1.4 / 1.5 migration.
	if ( $blob !== false && $this->legacyEncoding && $this->legacyEncodingConversionLang
		&& !in_array( 'utf-8', $blobFlags, true ) && !in_array( 'utf8', $blobFlags, true )
	) {
		# Old revisions kept around in a legacy encoding?
		# Upconvert on demand.
		# ("utf8" checked for compatibility with some broken
		# conversion scripts 2008-12-30)
		$blob = $this->legacyEncodingConversionLang->iconv( $this->legacyEncoding, 'UTF-8', $blob );
	}

	return $blob;
}
504
/**
 * Get the text cache TTL
 *
 * Returns TTL_UNCACHEABLE if the cache's emulation QoS is at or below
 * QOS_EMULATION_SQL, or if the configured cache expiry is zero or otherwise falsy.
 * Otherwise the configured cache expiry is returned.
 *
 * MCR migration note: this replaces Revision::getCacheTTL
 *
 * @return int
 */
private function getCacheTTL() {
	$emulationQoS = $this->cache->getQoS( WANObjectCache::ATTR_EMULATION );

	if ( $emulationQoS <= WANObjectCache::QOS_EMULATION_SQL ) {
		// Do not cache RDBMs blobs in...the RDBMs store
		return WANObjectCache::TTL_UNCACHEABLE;
	}

	return $this->cacheExpiry ?: WANObjectCache::TTL_UNCACHEABLE;
}
524
/**
 * Returns an ID corresponding to the old_id field in the text table, corresponding
 * to the given $address.
 *
 * Currently, $address must start with 'tt:' followed by a decimal integer representing
 * the old_id; if $address does not start with 'tt:', null is returned. However,
 * the implementation may change to insert rows into the text table on the fly.
 *
 * @note This method exists for use with the text table based storage schema.
 * It should not be assumed that it will function with all future kinds of content addresses.
 *
 * @deprecated since 1.31, do not assume that all blob addresses refer to a row in the text
 * table. This method should become private once the relevant refactoring in WikiPage is
 * complete.
 *
 * @param string $address
 *
 * @throws InvalidArgumentException If $address cannot be split, or if the ID of a 'tt'
 *  address is not a non-zero decimal integer in canonical form
 * @return int|null
 */
public function getTextIdFromAddress( $address ) {
	list( $schema, $id, ) = self::splitBlobAddress( $address );

	if ( $schema !== 'tt' ) {
		return null;
	}

	$textId = intval( $id );
	if ( $textId && $id === (string)$textId ) {
		return $textId;
	}

	throw new InvalidArgumentException( "Malformed text_id: $id" );
}
559
/**
 * Splits a blob address into three parts: the schema, the ID, and parameters/flags.
 *
 * An address has the form "schema:id", optionally followed by "?" and a query string.
 * The schema is converted to lower case; the ID is returned as given.
 *
 * @param string $address
 *
 * @throws InvalidArgumentException If $address does not have the expected form
 * @return array [ $schema, $id, $parameters ], with $parameters being an assoc array.
 */
private static function splitBlobAddress( $address ) {
	// The D modifier (PCRE_DOLLAR_ENDONLY) makes "$" match only at the very end of the
	// string. Without it, an address with a trailing newline would be accepted.
	if ( !preg_match( '/^(\w+):(\w+)(\?(.*))?$/D', $address, $m ) ) {
		throw new InvalidArgumentException( "Bad blob address: $address" );
	}

	$schema = strtolower( $m[1] );
	$id = $m[2];
	// $m[4] is only present if the address contains a "?".
	$parameters = isset( $m[4] ) ? wfCgiToArray( $m[4] ) : [];

	return [ $schema, $id, $parameters ];
}
579
580 }