Merge "Make DBAccessBase use DBConnRef, rename $wiki, and hide getLoadBalancer()"
[lhc/web/wiklou.git] / includes / historyblob / DiffHistoryBlob.php
1 <?php
2 /**
3 * Efficient concatenated text storage.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 */
22
23 /**
24 * Diff-based history compression
25 * Requires xdiff 1.5+ and zlib
26 */
27 class DiffHistoryBlob implements HistoryBlob {
28 /** @var array Uncompressed item cache */
29 public $mItems = [];
30
31 /** @var int Total uncompressed size */
32 public $mSize = 0;
33
34 /**
35 * @var array Array of diffs. If a diff D from A to B is notated D = B - A,
36 * and Z is an empty string:
37 *
38 * { item[map[i]] - item[map[i-1]] where i > 0
39 * diff[i] = {
40 * { item[map[i]] - Z where i = 0
41 */
42 public $mDiffs;
43
44 /** @var array The diff map, see above */
45 public $mDiffMap;
46
47 /** @var int The key for getText()
48 */
49 public $mDefaultKey;
50
51 /** @var string Compressed storage */
52 public $mCompressed;
53
54 /** @var bool True if the object is locked against further writes */
55 public $mFrozen = false;
56
57 /**
58 * @var int The maximum uncompressed size before the object becomes sad
59 * Should be less than max_allowed_packet
60 */
61 public $mMaxSize = 10000000;
62
63 /** @var int The maximum number of text items before the object becomes sad */
64 public $mMaxCount = 100;
65
66 /** Constants from xdiff.h */
67 const XDL_BDOP_INS = 1;
68 const XDL_BDOP_CPY = 2;
69 const XDL_BDOP_INSB = 3;
70
71 function __construct() {
72 if ( !function_exists( 'gzdeflate' ) ) {
73 throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
74 }
75 }
76
77 /**
78 * @throws MWException
79 * @param string $text
80 * @return int
81 */
82 function addItem( $text ) {
83 if ( $this->mFrozen ) {
84 throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" );
85 }
86
87 $this->mItems[] = $text;
88 $this->mSize += strlen( $text );
89 $this->mDiffs = null; // later
90 return count( $this->mItems ) - 1;
91 }
92
93 /**
94 * @param string $key
95 * @return string
96 */
97 function getItem( $key ) {
98 return $this->mItems[$key];
99 }
100
101 /**
102 * @param string $text
103 */
104 function setText( $text ) {
105 $this->mDefaultKey = $this->addItem( $text );
106 }
107
108 /**
109 * @return string
110 */
111 function getText() {
112 return $this->getItem( $this->mDefaultKey );
113 }
114
115 /**
116 * @throws MWException
117 */
118 function compress() {
119 if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
120 throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
121 }
122 if ( isset( $this->mDiffs ) ) {
123 // Already compressed
124 return;
125 }
126 if ( $this->mItems === [] ) {
127 return;
128 }
129
130 // Create two diff sequences: one for main text and one for small text
131 $sequences = [
132 'small' => [
133 'tail' => '',
134 'diffs' => [],
135 'map' => [],
136 ],
137 'main' => [
138 'tail' => '',
139 'diffs' => [],
140 'map' => [],
141 ],
142 ];
143 $smallFactor = 0.5;
144
145 $mItemsCount = count( $this->mItems );
146 for ( $i = 0; $i < $mItemsCount; $i++ ) {
147 $text = $this->mItems[$i];
148 if ( $i == 0 ) {
149 $seqName = 'main';
150 } else {
151 $mainTail = $sequences['main']['tail'];
152 if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
153 $seqName = 'small';
154 } else {
155 $seqName = 'main';
156 }
157 }
158
159 $tail = $sequences[$seqName]['tail'];
160 $diff = $this->diff( $tail, $text );
161 $sequences[$seqName]['diffs'][] = $diff;
162 $sequences[$seqName]['map'][] = $i;
163 $sequences[$seqName]['tail'] = $text;
164 }
165
166 // Knit the sequences together
167 $tail = '';
168 $this->mDiffs = [];
169 $this->mDiffMap = [];
170 foreach ( $sequences as $seq ) {
171 if ( $seq['diffs'] === [] ) {
172 continue;
173 }
174 if ( $tail === '' ) {
175 $this->mDiffs[] = $seq['diffs'][0];
176 } else {
177 $head = $this->patch( '', $seq['diffs'][0] );
178 $this->mDiffs[] = $this->diff( $tail, $head );
179 }
180 $this->mDiffMap[] = $seq['map'][0];
181 $diffsCount = count( $seq['diffs'] );
182 for ( $i = 1; $i < $diffsCount; $i++ ) {
183 $this->mDiffs[] = $seq['diffs'][$i];
184 $this->mDiffMap[] = $seq['map'][$i];
185 }
186 $tail = $seq['tail'];
187 }
188 }
189
190 /**
191 * @param string $t1
192 * @param string $t2
193 * @return string
194 */
195 function diff( $t1, $t2 ) {
196 # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
197 # "String is not zero-terminated"
198 Wikimedia\suppressWarnings();
199 $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
200 Wikimedia\restoreWarnings();
201 return $diff;
202 }
203
204 /**
205 * @param string $base
206 * @param string $diff
207 * @return bool|string
208 */
209 function patch( $base, $diff ) {
210 if ( function_exists( 'xdiff_string_bpatch' ) ) {
211 Wikimedia\suppressWarnings();
212 $text = xdiff_string_bpatch( $base, $diff ) . '';
213 Wikimedia\restoreWarnings();
214 return $text;
215 }
216
217 # Pure PHP implementation
218
219 $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
220
221 # Check the checksum if hash extension is available
222 $ofp = $this->xdiffAdler32( $base );
223 if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) {
224 wfDebug( __METHOD__ . ": incorrect base checksum\n" );
225 return false;
226 }
227 if ( $header['csize'] != strlen( $base ) ) {
228 wfDebug( __METHOD__ . ": incorrect base length\n" );
229 return false;
230 }
231
232 $p = 8;
233 $out = '';
234 while ( $p < strlen( $diff ) ) {
235 $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
236 $op = $x['op'];
237 ++$p;
238 switch ( $op ) {
239 case self::XDL_BDOP_INS:
240 $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
241 $p++;
242 $out .= substr( $diff, $p, $x['size'] );
243 $p += $x['size'];
244 break;
245 case self::XDL_BDOP_INSB:
246 $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
247 $p += 4;
248 $out .= substr( $diff, $p, $x['csize'] );
249 $p += $x['csize'];
250 break;
251 case self::XDL_BDOP_CPY:
252 $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
253 $p += 8;
254 $out .= substr( $base, $x['off'], $x['csize'] );
255 break;
256 default:
257 wfDebug( __METHOD__ . ": invalid op\n" );
258 return false;
259 }
260 }
261 return $out;
262 }
263
264 /**
265 * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with
266 * the bytes backwards and initialised with 0 instead of 1. See T36428.
267 *
268 * @param string $s
269 * @return string|bool False if the hash extension is not available
270 */
271 function xdiffAdler32( $s ) {
272 if ( !function_exists( 'hash' ) ) {
273 return false;
274 }
275
276 static $init;
277 if ( $init === null ) {
278 $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
279 }
280
281 // The real Adler-32 checksum of $init is zero, so it initialises the
282 // state to zero, as it is at the start of LibXDiff's checksum
283 // algorithm. Appending the subject string then simulates LibXDiff.
284 return strrev( hash( 'adler32', $init . $s, true ) );
285 }
286
287 function uncompress() {
288 if ( !$this->mDiffs ) {
289 return;
290 }
291 $tail = '';
292 $mDiffsCount = count( $this->mDiffs );
293 for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) {
294 $textKey = $this->mDiffMap[$diffKey];
295 $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
296 $this->mItems[$textKey] = $text;
297 $tail = $text;
298 }
299 }
300
301 /**
302 * @return array
303 */
304 function __sleep() {
305 $this->compress();
306 if ( $this->mItems === [] ) {
307 $info = false;
308 } else {
309 // Take forward differences to improve the compression ratio for sequences
310 $map = '';
311 $prev = 0;
312 foreach ( $this->mDiffMap as $i ) {
313 if ( $map !== '' ) {
314 $map .= ',';
315 }
316 $map .= $i - $prev;
317 $prev = $i;
318 }
319 $info = [
320 'diffs' => $this->mDiffs,
321 'map' => $map
322 ];
323 }
324 if ( isset( $this->mDefaultKey ) ) {
325 $info['default'] = $this->mDefaultKey;
326 }
327 $this->mCompressed = gzdeflate( serialize( $info ) );
328 return [ 'mCompressed' ];
329 }
330
331 function __wakeup() {
332 // addItem() doesn't work if mItems is partially filled from mDiffs
333 $this->mFrozen = true;
334 $info = unserialize( gzinflate( $this->mCompressed ) );
335 $this->mCompressed = null;
336
337 if ( !$info ) {
338 // Empty object
339 return;
340 }
341
342 if ( isset( $info['default'] ) ) {
343 $this->mDefaultKey = $info['default'];
344 }
345 $this->mDiffs = $info['diffs'];
346 if ( isset( $info['base'] ) ) {
347 // Old format
348 $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
349 array_unshift( $this->mDiffs,
350 pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
351 $info['base'] );
352 } else {
353 // New format
354 $map = explode( ',', $info['map'] );
355 $cur = 0;
356 $this->mDiffMap = [];
357 foreach ( $map as $i ) {
358 $cur += $i;
359 $this->mDiffMap[] = $cur;
360 }
361 }
362 $this->uncompress();
363 }
364
365 /**
366 * Helper function for compression jobs
367 * Returns true until the object is "full" and ready to be committed
368 *
369 * @return bool
370 */
371 function isHappy() {
372 return $this->mSize < $this->mMaxSize
373 && count( $this->mItems ) < $this->mMaxCount;
374 }
375
376 }