* Concept for diff-based compression using the new xdiff beta. Achieves massively...
[lhc/web/wiklou.git] / maintenance / storage / testCompression.php
<?php
/**
 * Measure how well a blob class compresses the stored revisions of one page.
 *
 * The script loads revisions of the given page, adds each revision text to a
 * single blob object, serializes the blob and prints the compression ratio and
 * elapsed time. It then unserializes the blob, reads every item back and
 * compares its MD5 hash with the hash of the original text.
 *
 * Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>
 *
 *   --type   Name of the blob class to instantiate (default: ConcatenatedGzipHistoryBlob).
 *            The object must provide addItem() and getItem().
 *   --start  Only revisions with a timestamp strictly after this date are used;
 *            the value is parsed with strtotime().
 *   --limit  Maximum number of revisions to load (default: 10).
 */

$optionsWithArgs = array( 'start', 'limit', 'type' );
require( dirname(__FILE__).'/../commandLine.inc' );

if ( !isset( $args[0] ) ) {
	echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
	exit( 1 );
}

// Title::newFromText() returns null for text that is not a valid title, so
// fail with a message here instead of a fatal error on the method calls below.
$title = Title::newFromText( $args[0] );
if ( !$title ) {
	echo "Invalid page title: {$args[0]}\n";
	exit( 1 );
}

if ( isset( $options['start'] ) ) {
	// strtotime() returns false when it cannot parse the date; reject that
	// rather than silently converting a bogus value into a timestamp.
	$startUnix = strtotime( $options['start'] );
	if ( $startUnix === false ) {
		echo "Invalid start date: {$options['start']}\n";
		exit( 1 );
	}
	$start = wfTimestamp( TS_MW, $startUnix );
	echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
} else {
	$start = '19700101000000';
}
$limit = isset( $options['limit'] ) ? $options['limit'] : 10;
$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';

// The class name comes straight from the command line; check it before
// instantiating so an unknown name produces a usable message.
if ( !class_exists( $type ) ) {
	echo "Unknown blob class: $type\n";
	exit( 1 );
}

// Join page -> revision -> text and fetch up to $limit revisions of the page
// whose timestamp is after $start.
// NOTE(review): the query has no ORDER BY, so the order in which revisions are
// added to the blob is whatever the database returns - confirm this is intended.
$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
	array( 'page', 'revision', 'text' ),
	'*',
	array(
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
		'page_id=rev_page',
		'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
		'rev_text_id=old_id'
	), __FILE__, array( 'LIMIT' => $limit )
);

$blob = new $type;
$hashes = array();            // rev_id => MD5 of the original text
$keys = array();              // rev_id => key returned by addItem()
$uncompressedSize = 0;

// Start at minus "now" and add "now" afterwards: $t ends up holding the
// elapsed seconds. The measured span covers loading the texts, hashing,
// adding them to the blob and serializing it.
$t = -microtime( true );
foreach ( $res as $row ) {
	$revision = new Revision( $row );
	$text = $revision->getText();
	$uncompressedSize += strlen( $text );
	$hashes[$row->rev_id] = md5( $text );
	$keys[$row->rev_id] = $blob->addItem( $text );
}

$serialized = serialize( $blob );
$t += microtime( true );

printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n",
	$res->numRows(),
	$uncompressedSize / strlen( $serialized ),
	$wgLang->formatSize( $uncompressedSize ),
	$wgLang->formatSize( strlen( $serialized ) )
);
printf( "Compression time: %5.2f ms\n", $t * 1000 );

// Round trip: restore the blob from its serialized form and check that every
// item still hashes to the value recorded before compression.
$t = -microtime( true );
$blob = unserialize( $serialized );
foreach ( $keys as $id => $key ) {
	$text = $blob->getItem( $key );
	// Strict comparison: with a loose "!=" two different digests that both look
	// like "0e<digits>" are compared as numbers and would wrongly count as equal.
	if ( md5( $text ) !== $hashes[$id] ) {
		echo "Content hash mismatch for rev_id $id\n";
		#var_dump( $text );
	}
}
$t += microtime( true );
printf( "Decompression time: %5.2f ms\n", $t * 1000 );
