f40bc89350c941267f748348cd6b05e913a0aa92
[lhc/web/wiklou.git] / maintenance / backupPrefetch.inc
1 <?php
2
3 /**
4 * Readahead helper for making large MediaWiki data dumps;
5 * reads in a previous XML dump to sequentially prefetch text
6 * records already normalized and decompressed.
7 *
8 * This can save load on the external database servers, hopefully.
9 *
10 * Assumes that dumps will be recorded in the canonical order:
11 * - ascending by page_id
12 * - ascending by rev_id within each page
13 * - text contents are immutable and should not change once
14 * recorded, so the previous dump is a reliable source
15 *
16 * Requires PHP 5 and the XMLReader PECL extension.
17 */
class BaseDump {
	/** @var XMLReader|null Streaming reader over the previous XML dump */
	public $reader = null;

	/** @var bool True once the stream is exhausted, closed, or could not be opened */
	public $atEnd = false;

	/** @var bool True once the end of the current <page> element has been reached */
	public $atPageEnd = false;

	/** @var int ID of the page the reader is positioned in (0 before the first page) */
	public $lastPage = 0;

	/** @var int ID of the last revision read within the current page (0 if none yet) */
	public $lastRev = 0;

	/**
	 * Opens the previous dump for sequential reading.
	 *
	 * If the file cannot be opened the object is flagged as being at the end
	 * of the stream, so every prefetch() call simply returns null instead of
	 * reading from an unopened reader.
	 *
	 * @param string $infile Path or URI of the XML dump to read from
	 */
	public function __construct( $infile ) {
		$this->reader = new XMLReader();
		if ( !$this->reader->open( $infile ) ) {
			$this->debug( "BaseDump::__construct unable to open XML input file: $infile" );
			$this->atEnd = true;
		}
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * The stream is only ever read forwards, so requests must arrive in
	 * ascending page/revision order; anything the reader has already moved
	 * past is reported as a miss.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string|null Revision text, or null when it cannot be supplied
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until we reach (or overshoot) the wanted page.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
			return null;
		}

		// Same again for revisions, but never read beyond the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev === $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		}

		$this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
		return null;
	}

	/**
	 * Sends a line to the debug log via wfDebug().
	 *
	 * @param string $str Message, without trailing newline
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Moves to the next <page> element and records its ID, resetting the
	 * per-page revision state. Sets atEnd when no further page exists.
	 *
	 * Internal helper; not intended to be called from outside the class.
	 * @access private
	 */
	public function nextPage() {
		if ( !$this->skipTo( 'page', 'mediawiki' ) ) {
			$this->atEnd = true;
			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastPage = intval( $this->nodeContents() );
			$this->lastRev = 0;
			$this->atPageEnd = false;
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and records
	 * its ID. Sets atPageEnd when the page holds no further revision.
	 *
	 * Internal helper; not intended to be called from outside the class.
	 * @access private
	 */
	public function nextRev() {
		if ( $this->skipTo( 'revision' ) && $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		} else {
			// Either search stopped at </page> (or at end of stream), so
			// nothing more can be read from this page.
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the <text> element of the current revision.
	 *
	 * Internal helper; not intended to be called from outside the class.
	 * @access private
	 * @return string|null Text contents, or null when the revision has no
	 *  <text> element or the stream ended before the text was complete
	 */
	public function nextText() {
		if ( !$this->skipTo( 'text' ) ) {
			// skipTo() stopped at </page> or at end of stream. Reading on
			// from here would pick up content belonging to the next page.
			$this->atPageEnd = true;
			return null;
		}
		$text = $this->nodeContents();
		// A null here means the stream ended mid-element; report a miss
		// rather than passing off a truncated read as empty text.
		return $text === null ? null : strval( $text );
	}

	/**
	 * Reads forward until an opening element called $name is found.
	 *
	 * Internal helper; not intended to be called from outside the class.
	 * @access private
	 * @param string $name Element name to stop at
	 * @param string $parent Element whose closing tag aborts the search
	 * @return bool|null True when positioned on <$name>; false when
	 *  </$parent> was reached first or the stream was already finished;
	 *  null when the stream ran out during the search (reader gets closed)
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			if ( $type === XMLReader::ELEMENT && $this->reader->name === $name ) {
				return true;
			}
			if ( $type === XMLReader::END_ELEMENT && $this->reader->name === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * Internal helper; not intended to be called from outside the class.
	 * @access private
	 * @return string|null Concatenated text, "" for an empty element, or
	 *  null when the stream is finished or ends before the closing tag
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					// First closing tag ends the element, since no
					// sub-elements are expected.
					return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and flags the stream as finished.
	 *
	 * Internal helper; not intended to be called from outside the class.
	 * @access private
	 * @return null Always null, so callers can return the result directly
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}
164
165 ?>