Added SQLite version of the patch since the other one has syntax errors on SQLite
[lhc/web/wiklou.git] / maintenance / backupPrefetch.inc
1 <?php
2 /**
3 * Helper class for the --prefetch option of dumpTextPass.php
4 *
5 * @file
6 * @ingroup Maintenance
7 */
8
// XMLReader's constants used to be global (XMLREADER_*), but as of
// PHP 5.1 they are exposed as class constants (XMLReader::*) instead.
// For every known constant, define the legacy global spelling as an
// alias of the class constant when only the latter is available.
$xmlReaderConstants = array(
	"NONE", "ELEMENT", "ATTRIBUTE", "TEXT", "CDATA",
	"ENTITY_REF", "ENTITY", "PI", "COMMENT",
	"DOC", "DOC_TYPE", "DOC_FRAGMENT", "NOTATION",
	"WHITESPACE", "SIGNIFICANT_WHITESPACE",
	"END_ELEMENT", "END_ENTITY", "XML_DECLARATION",
	"LOADDTD", "DEFAULTATTRS", "VALIDATE", "SUBST_ENTITIES",
);
foreach ( $xmlReaderConstants as $name ) {
	$fullName = "XMLREADER_$name";
	$newName = "XMLReader::$name";
	if ( defined( $fullName ) || !defined( $newName ) ) {
		// Either the global constant already exists, or the class
		// constant is absent too (broken or missing extension), in
		// which case there is nothing to alias.
		continue;
	}
	define( $fullName, constant( $newName ) );
}
46
/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The stream is only ever read forwards, so lookups must be requested
 * in that same order; anything already passed yields null.
 *
 * Requires the XMLReader extension.
 * @ingroup Maintenance
 */
class BaseDump {
	/** XMLReader instance positioned somewhere inside the previous dump */
	var $reader = null;
	/** True once the stream is exhausted, closed, or could not be opened */
	var $atEnd = false;
	/** True once the closing tag of the current page has been reached */
	var $atPageEnd = false;
	/** ID of the page most recently read from the stream */
	var $lastPage = 0;
	/** ID of the revision most recently read within the current page */
	var $lastRev = 0;

	/**
	 * Opens the previous dump for sequential reading.
	 *
	 * Declared as __construct() rather than as a method named after the
	 * class: PHP 8 no longer treats the latter as a constructor, which
	 * would leave $reader unset.
	 *
	 * @param $infile String: path or URI of the XML dump to read
	 */
	function __construct( $infile ) {
		$this->reader = new XMLReader();
		if ( !$this->reader->open( $infile ) ) {
			// Nothing can be prefetched from an unreadable source; mark
			// the stream as exhausted so that prefetch() returns null
			// instead of failing on the first read.
			$this->debug( "BaseDump::__construct could not open $infile" );
			$this->atEnd = true;
		}
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * @param $page Integer: ID number of page to read
	 * @param $rev Integer: ID number of revision to read
	 * @return string or null
	 */
	function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );
		// Advance page by page until the wanted page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
			return null;
		}
		// Same again for revisions, without leaving the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev == $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		} else {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
			return null;
		}
	}

	/**
	 * Sends a progress message, newline-terminated, to wfDebug().
	 *
	 * @param $str String: message to log
	 */
	function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Moves to the next <page> element and records its ID, resetting
	 * the per-page revision state. Sets $atEnd when no further page
	 * is found.
	 *
	 * @access private
	 */
	function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
		} else {
			$this->atEnd = true;
		}
	}

	/**
	 * Moves to the next <revision> element and records its ID.
	 * Sets $atPageEnd when no further revision is found.
	 *
	 * @access private
	 */
	function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Moves to the next <text> element and returns its contents.
	 *
	 * @return String
	 * @access private
	 */
	function nextText() {
		$this->skipTo( 'text' );
		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening element called $name is found.
	 * Gives up on reaching the closing tag of $parent, or the end of
	 * the stream (in which case the reader is closed as well).
	 *
	 * @param $name String: element name to look for
	 * @param $parent String: element whose closing tag ends the search
	 * @return Boolean true when positioned on the element; false or
	 *   null otherwise
	 * @access private
	 */
	function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType == XMLReader::ELEMENT &&
				$this->reader->name == $name ) {
				return true;
			}
			if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
				$this->reader->name == $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return String, or null when the stream ends first
	 * @access private
	 */
	function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch( $this->reader->nodeType ) {
			case XMLReader::TEXT:
			// CDATA sections carry character data as well; skipping
			// them would silently hand back truncated text.
			case XMLReader::CDATA:
			// Plain WHITESPACE nodes are not collected.
			case XMLReader::SIGNIFICANT_WHITESPACE:
				$buffer .= $this->reader->value;
				break;
			case XMLReader::END_ELEMENT:
				return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and marks the stream as exhausted.
	 *
	 * @return null, so callers can return the result directly
	 * @access private
	 */
	function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}