c8af6a6a7a2ef8edbbdf74e006d6162030b3fae2
[lhc/web/wiklou.git] / maintenance / backupPrefetch.inc
1 <?php
2
// XMLReader originally exposed its node types and parser properties as
// global XMLREADER_* constants; PHP 5.1 replaced those with XMLReader::*
// class constants. The code below still uses the global spelling, so for
// every known name alias the class constant to the old global name when
// only the class constant is available.
$xmlReaderConstants = array(
	"NONE", "ELEMENT", "ATTRIBUTE", "TEXT", "CDATA",
	"ENTITY_REF", "ENTITY", "PI", "COMMENT",
	"DOC", "DOC_TYPE", "DOC_FRAGMENT", "NOTATION",
	"WHITESPACE", "SIGNIFICANT_WHITESPACE",
	"END_ELEMENT", "END_ENTITY", "XML_DECLARATION",
	"LOADDTD", "DEFAULTATTRS", "VALIDATE", "SUBST_ENTITIES" );
foreach ( $xmlReaderConstants as $name ) {
	$fullName = "XMLREADER_$name";
	$newName = "XMLReader::$name";
	if ( defined( $fullName ) ) {
		// The old-style global constant already exists; leave it alone.
		continue;
	}
	if ( !defined( $newName ) ) {
		// broken or missing the extension...
		continue;
	}
	define( $fullName, constant( $newName ) );
}
40
/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The stream is only ever read forwards, so requests must arrive in
 * that same order; anything that has already been passed yields null.
 *
 * Requires the XMLReader PECL extension.
 * @ingroup Maintenance
 */
class BaseDump {
	/** @var XMLReader|null Forward-only reader over the previous dump */
	public $reader = null;
	/** @var bool True once the input is exhausted, closed, or could not be opened */
	public $atEnd = false;
	/** @var bool True once no further revisions can be read from the current page */
	public $atPageEnd = false;
	/** @var int ID of the page most recently read from the stream (0 before the first) */
	public $lastPage = 0;
	/** @var int ID of the revision most recently read in the current page (0 before the first) */
	public $lastRev = 0;

	/**
	 * Opens the previous dump for sequential reading.
	 *
	 * Declared as __construct() because a method named after the class is
	 * no longer treated as a constructor from PHP 8.0 on, which would leave
	 * $reader unset.
	 *
	 * @param string $infile Path or URI of the XML dump to read from
	 */
	public function __construct( $infile ) {
		$this->reader = new XMLReader();
		if ( defined( 'LIBXML_PARSEHUGE' ) ) {
			// Relax libxml2's hardcoded parser limits (including the size of
			// text nodes) so very large revision texts can still be read.
			$opened = $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
		} else {
			$opened = $this->reader->open( $infile );
		}
		if ( !$opened ) {
			// Without a usable source every lookup is simply a miss.
			$this->debug( "BaseDump::__construct could not open $infile" );
			$this->atEnd = true;
		}
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string|null Revision text, or null when the revision is not
	 *  available (already passed, missing, or the stream has ended)
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );
		// Advance page by page until the requested page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
			return null;
		}
		// Same idea one level down: advance through this page's revisions.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev === $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		} else {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
			return null;
		}
	}

	/**
	 * Writes a line to the debug log via wfDebug().
	 *
	 * @param string $str Message, without trailing newline
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Moves to the next <page> element and records its ID, resetting the
	 * per-page revision state. Marks the stream as finished when no
	 * further page exists.
	 *
	 * @access private
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
		} else {
			$this->atEnd = true;
		}
	}

	/**
	 * Moves to the next <revision> element and records its ID. Flags the
	 * end of the page when no further revision is found.
	 *
	 * @access private
	 */
	public function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the <text> contents of the current revision.
	 *
	 * @return string|null Text contents ("" for an empty element), or null
	 *  when the revision has no <text> element or the stream ended first
	 * @access private
	 */
	public function nextText() {
		if ( !$this->skipTo( 'text' ) ) {
			// The scan stopped at </page> or at end of input. Reading on
			// from here would return content belonging to something else,
			// so report a miss and note that this page is finished.
			$this->atPageEnd = true;
			return null;
		}
		$text = $this->nodeContents();
		// null means the input ended mid-element; do not turn that into "".
		return $text === null ? null : strval( $text );
	}

	/**
	 * Reads forward until an opening tag called $name is found.
	 *
	 * @param string $name Element name to stop at
	 * @param string $parent Element whose closing tag aborts the search
	 * @return bool True when positioned on <$name>; false when </$parent>
	 *  was reached first or the input ran out (the reader is then closed)
	 * @access private
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType === XMLREADER_ELEMENT &&
				$this->reader->name === $name ) {
				return true;
			}
			if ( $this->reader->nodeType === XMLREADER_END_ELEMENT &&
				$this->reader->name === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		$this->close();
		return false;
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string|null Concatenated text up to the next closing tag, or
	 *  null when the input has ended
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			// <foo/> has no closing tag to wait for.
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLREADER_TEXT:
				case XMLREADER_SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLREADER_END_ELEMENT:
					return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and marks the stream as finished.
	 *
	 * @return null Always null, so callers can return the result directly
	 * @access private
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}