* Adds an item of text, returns a stub object which points to the item.
* You must call setLocation() on the stub object before storing it to the
* database
- * Returns the key for getItem()
+ *
+ * @return String: the key for getItem()
*/
- public function addItem( $text );
+ function addItem( $text );
/**
* Get item by key, or false if the key is not present
+ *
+ * @return String or false
*/
- public function getItem( $key );
+ function getItem( $key );
/**
* Set the "default text"
*
* Default text is not required for two-part external storage URLs.
*/
- public function setText( $text );
+ function setText( $text );
/**
* Get default text. This is called from Revision::getRevisionText()
+ *
+ * @return String
*/
function getText();
}
class ConcatenatedGzipHistoryBlob implements HistoryBlob
{
public $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
- public $mFast = 0, $mSize = 0;
+ public $mSize = 0;
+ public $mMaxSize = 10000000;
+ public $mMaxCount = 100;
/** Constructor */
- public function ConcatenatedGzipHistoryBlob() {
+ public function __construct() {
if ( !function_exists( 'gzdeflate' ) ) {
throw new MWException( "Need zlib support to read or write this kind of history object (ConcatenatedGzipHistoryBlob)\n" );
}
public function addItem( $text ) {
$this->uncompress();
$hash = md5( $text );
- $this->mItems[$hash] = $text;
- $this->mSize += strlen( $text );
-
+ if ( !isset( $this->mItems[$hash] ) ) {
+ $this->mItems[$hash] = $text;
+ $this->mSize += strlen( $text );
+ }
return $hash;
}
public function setText( $text ) {
$this->uncompress();
- $stub = $this->addItem( $text );
- $this->mDefaultHash = $stub->mHash;
+ $this->mDefaultHash = $this->addItem( $text );
}
public function getText() {
* Helper function for compression jobs
* Returns true until the object is "full" and ready to be committed
*/
- public function isHappy( $maxFactor, $factorThreshold ) {
- if ( count( $this->mItems ) == 0 ) {
- return true;
- }
- if ( !$this->mFast ) {
- $this->uncompress();
- $record = serialize( $this->mItems );
- $size = strlen( $record );
- $avgUncompressed = $size / count( $this->mItems );
- $compressed = strlen( gzdeflate( $record ) );
-
- if ( $compressed < $factorThreshold * 1024 ) {
- return true;
- } else {
- return $avgUncompressed * $maxFactor < $compressed;
- }
- } else {
- return count( $this->mItems ) <= 10;
- }
+ public function isHappy() {
+ return $this->mSize < $this->mMaxSize
+ && count( $this->mItems ) < $this->mMaxCount;
}
}
-/**
- * One-step cache variable to hold base blobs; operations that
- * pull multiple revisions may often pull multiple times from
- * the same blob. By keeping the last-used one open, we avoid
- * redundant unserialization and decompression overhead.
- */
-global $wgBlobCache;
-$wgBlobCache = array();
/**
* Pointer object for an item within a CGZ blob stored in the text table.
*/
class HistoryBlobStub {
+ /**
+ * One-step cache variable to hold base blobs; operations that
+ * pull multiple revisions may often pull multiple times from
+ * the same blob. By keeping the last-used one open, we avoid
+ * redundant unserialization and decompression overhead.
+ */
+ protected static $blobCache = array();
+
var $mOldId, $mHash, $mRef;
/**
- * @param string $hash The content hash of the text
- * @param integer $oldid The old_id for the CGZ object
+ * @param $hash String: the content hash of the text
+ * @param $oldid Integer: the old_id for the CGZ object
*/
- function HistoryBlobStub( $hash = '', $oldid = 0 ) {
+ function __construct( $hash = '', $oldid = 0 ) {
$this->mHash = $hash;
}
function getText() {
$fname = 'HistoryBlobStub::getText';
- global $wgBlobCache;
- if( isset( $wgBlobCache[$this->mOldId] ) ) {
- $obj = $wgBlobCache[$this->mOldId];
+
+ if( isset( self::$blobCache[$this->mOldId] ) ) {
+ $obj = self::$blobCache[$this->mOldId];
} else {
$dbr = wfGetDB( DB_SLAVE );
$row = $dbr->selectRow( 'text', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) );
// Save this item for reference; if pulling many
// items in a row we'll likely use it again.
$obj->uncompress();
- $wgBlobCache = array( $this->mOldId => $obj );
+ self::$blobCache = array( $this->mOldId => $obj );
}
return $obj->getItem( $this->mHash );
}
var $mCurId;
/**
- * @param integer $curid The cur_id pointed to
+ * @param $curid Integer: the cur_id pointed to
*/
- function HistoryBlobCurStub( $curid = 0 ) {
+ function __construct( $curid = 0 ) {
$this->mCurId = $curid;
}
/** Uncompressed item cache */
var $mItems = array();
+ /** Total uncompressed size */
+ var $mSize = 0;
+
/**
- * Array of diffs, where $this->mDiffs[0] is the diff between
- * $this->mDiffs[0] and $this->mDiffs[1]
+ * Array of diffs. If a diff D from A to B is notated D = B - A, and Z is
+ * an empty string:
+ *
+ * { item[map[i]] - item[map[i-1]] where i > 0
+ * diff[i] = {
+ * { item[map[i]] - Z where i = 0
*/
- var $mDiffs = array();
+ var $mDiffs;
+
+ /** The diff map, see above */
+ var $mDiffMap;
/**
* The key for getText()
*/
var $mFrozen = false;
+ /**
+ * The maximum uncompressed size before the object becomes sad
+ * Should be less than max_allowed_packet
+ */
+ var $mMaxSize = 10000000;
+
+ /**
+ * The maximum number of text items before the object becomes sad
+ */
+ var $mMaxCount = 100;
+
+ /** Constants from xdiff.h */
+ const XDL_BDOP_INS = 1;
+ const XDL_BDOP_CPY = 2;
+ const XDL_BDOP_INSB = 3;
function __construct() {
- if ( !function_exists( 'xdiff_string_bdiff' ) ){
- throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" );
- }
if ( !function_exists( 'gzdeflate' ) ) {
throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
}
}
$this->mItems[] = $text;
- $i = count( $this->mItems ) - 1;
- if ( $i > 0 ) {
- # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
- # "String is not zero-terminated"
- wfSuppressWarnings();
- $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . '';
- wfRestoreWarnings();
- }
- return $i;
+ $this->mSize += strlen( $text );
+ $this->mDiffs = null; // later
+ return count( $this->mItems ) - 1;
}
function getItem( $key ) {
- if ( $key > count( $this->mDiffs ) + 1 ) {
- return false;
- }
- $key = intval( $key );
- if ( $key == 0 ) {
- return $this->mItems[0];
- }
-
- $last = count( $this->mItems ) - 1;
- for ( $i = $last + 1; $i <= $key; $i++ ) {
- # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
- # "String is not zero-terminated"
- wfSuppressWarnings();
- $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . '';
- wfRestoreWarnings();
- }
return $this->mItems[$key];
}
return $this->getItem( $this->mDefaultKey );
}
+ /**
+  * Build the diff representation ($mDiffs and $mDiffMap) from $mItems.
+  *
+  * Items are split into two chains: "main", and "small" for any item whose
+  * length is less than half the length of the current main tail. Each chain
+  * is delta-encoded against its own previous member, starting from the empty
+  * string. The chains are then joined (small first, then main) into a single
+  * diff list, with $mDiffMap recording which item each diff produces.
+  *
+  * Does nothing if $mDiffs is already set or if there are no items.
+  *
+  * @throws MWException if xdiff_string_rabdiff() is not available
+  */
+ function compress() {
+ 	if ( !function_exists( 'xdiff_string_rabdiff' ) ){
+ 		throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
+ 	}
+ 	if ( isset( $this->mDiffs ) ) {
+ 		// Already compressed
+ 		return;
+ 	}
+ 	$itemCount = count( $this->mItems );
+ 	if ( !$itemCount ) {
+ 		// Empty
+ 		return;
+ 	}
+
+ 	// Create two diff sequences: one for main text and one for small text
+ 	$sequences = array(
+ 		'small' => array(
+ 			'tail' => '',
+ 			'diffs' => array(),
+ 			'map' => array(),
+ 		),
+ 		'main' => array(
+ 			'tail' => '',
+ 			'diffs' => array(),
+ 			'map' => array(),
+ 		),
+ 	);
+ 	$smallFactor = 0.5;
+
+ 	for ( $i = 0; $i < $itemCount; $i++ ) {
+ 		$text = $this->mItems[$i];
+ 		// The first item always starts the main chain; later items go to
+ 		// the small chain when they are much shorter than the main tail.
+ 		$seqName = 'main';
+ 		if ( $i != 0
+ 			&& strlen( $text ) < strlen( $sequences['main']['tail'] ) * $smallFactor )
+ 		{
+ 			$seqName = 'small';
+ 		}
+ 		// Index the array directly instead of holding a reference alias
+ 		$sequences[$seqName]['diffs'][] = $this->diff( $sequences[$seqName]['tail'], $text );
+ 		$sequences[$seqName]['map'][] = $i;
+ 		$sequences[$seqName]['tail'] = $text;
+ 	}
+
+ 	// Knit the sequences together
+ 	$tail = '';
+ 	$this->mDiffs = array();
+ 	$this->mDiffMap = array();
+ 	foreach ( $sequences as $seq ) {
+ 		$diffCount = count( $seq['diffs'] );
+ 		if ( !$diffCount ) {
+ 			continue;
+ 		}
+ 		if ( $tail === '' ) {
+ 			// The chain's first diff is already relative to the empty string
+ 			$this->mDiffs[] = $seq['diffs'][0];
+ 		} else {
+ 			// Re-base the chain's first item onto the previous chain's tail.
+ 			// The item text is still in $mItems, so read it from there
+ 			// rather than decoding it back out of its own diff.
+ 			$head = $this->mItems[$seq['map'][0]];
+ 			$this->mDiffs[] = $this->diff( $tail, $head );
+ 		}
+ 		$this->mDiffMap[] = $seq['map'][0];
+ 		for ( $i = 1; $i < $diffCount; $i++ ) {
+ 			$this->mDiffs[] = $seq['diffs'][$i];
+ 			$this->mDiffMap[] = $seq['map'][$i];
+ 		}
+ 		$tail = $seq['tail'];
+ 	}
+ }
+
+ /**
+  * Compute a binary diff from $t1 to $t2 with xdiff_string_rabdiff().
+  *
+  * @param $t1 String: the base text
+  * @param $t2 String: the target text
+  * @return String: the diff, always cast to a string
+  */
+ function diff( $t1, $t2 ) {
+ 	// The current version of xdiff has bugs ("String is not zero-terminated"),
+ 	// so the call is made with warnings off and the result is passed through
+ 	// a null concatenation.
+ 	wfSuppressWarnings();
+ 	$delta = xdiff_string_rabdiff( $t1, $t2 ) . '';
+ 	wfRestoreWarnings();
+
+ 	return $delta;
+ }
+
+ /**
+  * Apply a binary diff to a base text.
+  *
+  * Uses xdiff_string_bpatch() when it exists. Otherwise the diff is decoded
+  * in PHP: an 8-byte header (little-endian 32-bit checksum of the base, then
+  * little-endian 32-bit base length) followed by a stream of insert and copy
+  * operations.
+  *
+  * @param $base String: the text the diff was computed against
+  * @param $diff String: the binary diff
+  * @return String: the patched text. The pure-PHP decoder returns false on a
+  *   truncated header, a checksum or length mismatch, or an unknown op.
+  */
+ function patch( $base, $diff ) {
+ 	if ( function_exists( 'xdiff_string_bpatch' ) ) {
+ 		wfSuppressWarnings();
+ 		$text = xdiff_string_bpatch( $base, $diff ) . '';
+ 		wfRestoreWarnings();
+ 		return $text;
+ 	}
+
+ 	# Pure PHP implementation
+
+ 	$diffLength = strlen( $diff );
+ 	if ( $diffLength < 8 ) {
+ 		# Not even a complete header; unpack() would fail below
+ 		wfDebug( __METHOD__. ": truncated header\n" );
+ 		return false;
+ 	}
+ 	$header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
+
+ 	# Check the checksum if the hash extension is available.
+ 	# LibXDiff's "Adler-32" starts from a zero state instead of the standard
+ 	# initial value of 1, and stores the result little-endian. The standard
+ 	# Adler-32 of $init is zero, so prepending it puts the state at zero
+ 	# before $base is processed; strrev() converts hash()'s big-endian raw
+ 	# output to the stored byte order.
+ 	if ( function_exists( 'hash' ) ) {
+ 		$init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
+ 		$ofp = strrev( hash( 'adler32', $init . $base, true ) );
+ 		if ( $ofp !== substr( $diff, 0, 4 ) ) {
+ 			wfDebug( __METHOD__. ": incorrect base checksum\n" );
+ 			return false;
+ 		}
+ 	}
+ 	if ( $header['csize'] != strlen( $base ) ) {
+ 		wfDebug( __METHOD__. ": incorrect base length\n" );
+ 		return false;
+ 	}
+
+ 	$p = 8;
+ 	$out = '';
+ 	while ( $p < $diffLength ) {
+ 		$x = unpack( 'Cop', substr( $diff, $p, 1 ) );
+ 		$op = $x['op'];
+ 		++$p;
+ 		switch ( $op ) {
+ 		case self::XDL_BDOP_INS:
+ 			# Short insert: 1-byte length, then the literal bytes
+ 			$x = unpack( 'Csize', substr( $diff, $p, 1 ) );
+ 			$p++;
+ 			$out .= substr( $diff, $p, $x['size'] );
+ 			$p += $x['size'];
+ 			break;
+ 		case self::XDL_BDOP_INSB:
+ 			# Long insert: 4-byte little-endian length, then the literal bytes
+ 			$x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
+ 			$p += 4;
+ 			$out .= substr( $diff, $p, $x['csize'] );
+ 			$p += $x['csize'];
+ 			break;
+ 		case self::XDL_BDOP_CPY:
+ 			# Copy: 4-byte offset and 4-byte length into the base text
+ 			$x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
+ 			$p += 8;
+ 			$out .= substr( $base, $x['off'], $x['csize'] );
+ 			break;
+ 		default:
+ 			wfDebug( __METHOD__.": invalid op\n" );
+ 			return false;
+ 		}
+ 	}
+ 	return $out;
+ }
+
+ /**
+  * Rebuild $mItems from $mDiffs.
+  *
+  * Each diff is applied to the text produced by the previous diff, starting
+  * from the empty string, and the result is stored under the item key that
+  * $mDiffMap gives for that diff. Does nothing when $mDiffs is empty or unset.
+  */
+ function uncompress() {
+ 	if ( !$this->mDiffs ) {
+ 		return;
+ 	}
+
+ 	$previous = '';
+ 	$total = count( $this->mDiffs );
+ 	$n = 0;
+ 	while ( $n < $total ) {
+ 		$itemKey = $this->mDiffMap[$n];
+ 		$previous = $this->patch( $previous, $this->mDiffs[$n] );
+ 		$this->mItems[$itemKey] = $previous;
+ 		$n++;
+ 	}
+ }
+
function __sleep() {
- if ( !isset( $this->mItems[0] ) ) {
+ $this->compress();
+ if ( !count( $this->mItems ) ) {
// Empty object
$info = false;
} else {
+ // Take forward differences to improve the compression ratio for sequences
+ $map = '';
+ $prev = 0;
+ foreach ( $this->mDiffMap as $i ) {
+ if ( $map !== '' ) {
+ $map .= ',';
+ }
+ $map .= $i - $prev;
+ $prev = $i;
+ }
$info = array(
- 'base' => $this->mItems[0],
- 'diffs' => $this->mDiffs
+ 'diffs' => $this->mDiffs,
+ 'map' => $map
);
}
if ( isset( $this->mDefaultKey ) ) {
if ( isset( $info['default'] ) ) {
$this->mDefaultKey = $info['default'];
}
- $this->mItems[0] = $info['base'];
$this->mDiffs = $info['diffs'];
+ if ( isset( $info['base'] ) ) {
+ // Old format
+ $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
+ array_unshift( $this->mDiffs,
+ pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
+ $info['base'] );
+ } else {
+ // New format
+ $map = explode( ',', $info['map'] );
+ $cur = 0;
+ $this->mDiffMap = array();
+ foreach ( $map as $i ) {
+ $cur += $i;
+ $this->mDiffMap[] = $cur;
+ }
+ }
+ $this->uncompress();
}
+
+ /**
+  * Helper function for compression jobs.
+  *
+  * @return Boolean: true while the total uncompressed size is below $mMaxSize
+  *   and the number of items is below $mMaxCount; false once the object is
+  *   "full" and ready to be committed
+  */
+ function isHappy() {
+ 	if ( $this->mSize >= $this->mMaxSize ) {
+ 		return false;
+ 	}
+ 	return count( $this->mItems ) < $this->mMaxCount;
+ }
+
}