DO NOT USE ! Simple database layer for postgresql based on existing mysql one. I...
[lhc/web/wiklou.git] / maintenance / rebuildlinks.inc
<?php

# Entire file is disabled: the link tables moved to a new schema and this
# rebuild code has not been ported yet, so bail out before touching anything.
die( "rebuildLinks.inc needs to be updated for the new schema\n" );

# Functions for rebuilding the link tracking tables; must
# be included within a script that also includes the Setup.
# See rebuildlinks.php, for example.
#

# Buffer this many rows before inserting them all in one sweep. More
# than about 1000 will probably not increase speed significantly on
# most setups.
/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
# Rebuild the links, brokenlinks and imagelinks tables from scratch by
# scanning the wikitext of every row in cur.  Destructive and long-running:
# wipes all three tables, holds LOCK TABLES for the whole run, and has no
# resume support.  Side effects: DDL/DML via wfQuery(), progress on stdout.
function rebuildLinkTables()
{
	error_reporting (E_ALL);
	global $wgLang, $wgLinkCache, $rowbuf_size;

	# Give the operator a chance to bail out before we start destroying state.
	print "This script may take several hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
	print "the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep(15);

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);

	# Narrow working copy of cur, indexed for the (namespace, title)
	# lookups issued in the batch resolution query inside the main loop.
	print "Extracting often used data from cur (may take a few minutes)\n";
	$sql = "CREATE TEMPORARY TABLE cur_fast SELECT cur_namespace, cur_title, cur_id FROM cur";
	wfQuery( $sql, DB_WRITE );
	$sql = "ALTER TABLE cur_fast ADD INDEX(cur_namespace, cur_title)";
	wfQuery( $sql, DB_WRITE );

	print "Locking tables\n";
	$sql = "LOCK TABLES cur READ, cur_fast READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE";
	wfQuery( $sql, DB_WRITE );


	# Full rebuild: clear all three link tables before repopulating.
	print "Deleting old data in links table.\n";
	$sql = "DELETE FROM links";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in brokenlinks table.\n";
	$sql = "DELETE FROM brokenlinks";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in imagelinks table.\n";
	$sql = "DELETE FROM imagelinks";
	wfQuery( $sql, DB_WRITE );

	print "Finding number of articles to process... ";
	$sql = "SELECT COUNT(*) as count FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );
	$total = $obj->count;
	print "$total\n";

	print "Finding highest article id\n";
	$sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );

	# Stream cur rows in id windows of 100 to keep memory use bounded.
	$cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		$obj->min, $obj->max, 100);

	# One batched multi-row INSERT buffer per destination table.
	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);

	$imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
		"(il_from,il_to) VALUES ", $rowbuf_size);

	print "Starting processing\n";

	# Localized "Image" namespace prefix, used below to spot image links
	# by their prefixed title; +1 accounts for the ":" separator.
	$ins = $wgLang->getNsText( Namespace::getImage() );
	$inslen = strlen($ins)+1;

	# Character class of all characters legal in titles, for the link regex.
	$tc = Title::legalChars();

	# link-text -> Title object cache, shared across all scanned articles.
	$titleCache = new MRUCache( 10000 );
	$titlecount = 0;
	$start_time = time();

	while ( $row = $cur_pulser->next() ) {

		$from_id = intval($row->cur_id);
		# Build the source page's full prefixed title ("Ns:Title").
		$ns = $wgLang->getNsText( $row->cur_namespace );
		$from_full_title = $row->cur_title;
		if ( "" != $ns ) {
			$from_full_title = "$ns:{$from_full_title}";
		}
		$from_full_title_with_slashes = addslashes( $from_full_title );
		$text = $row->cur_text;

		# Collect every [[target]] / [[target|...]] occurrence in the text.
		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
			$m, PREG_PATTERN_ORDER );

		$seen_dbtitles = array(); // seen links (normalized and with ns, see below)
		$titles_ready_for_insertion = array();
		$titles_needing_curdata = array();
		$titles_needing_curdata_pos = array();
		$links_corresponding_to_titles = array();

		for ( $i = 0 ; $i < $numlinks; ++$i ) {
			$link = $m[1][$i];
			if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = $titleCache->get( $link );
			if( $nt != false ){
				// Only process each unique link once per page
				$nt_key = $nt->getDBkey() . $nt->getNamespace();
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				# Cache hit: article id already resolved, no cur lookup needed.
				$titles_ready_for_insertion[] = $nt;
			} else {
				$nt = Title::newFromText( $link );
				if (! $nt) {
					// Invalid link, probably something like "[[ ]]"
					continue;
				}

				// Only process each unique link once per page
				$nt_key = $nt->getDBkey() . $nt->getNamespace();
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				if( $nt->getInterwiki() != "" ) {
					# Interwiki links are not stored in the link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getSpecial() ) {
					# Special links not stored in link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getMedia() ) {
					# treat media: links as image: links
					$nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
				}
				$nt->mArticleID = 0; // assume broken link until proven otherwise

				# Queue for the batched cur_fast lookup below; remember the
				# array position so the real id can be patched in, and the raw
				# link text so the resolved Title can go into the MRU cache.
				$pos = array_push($titles_needing_curdata, $nt) - 1;
				$titles_needing_curdata_pos[$nt->getDBkey() . $nt->getNamespace()] = $pos;
				$links_corresponding_to_titles[] = $link;
				unset( $link ); // useless outside this loop, but tempting
			}
		}


		# Resolve all cache misses with one OR-chained SELECT against cur_fast.
		if ( count( $titles_needing_curdata ) > 0 ){
			$parts = array();
			foreach ($titles_needing_curdata as $nt ) {
				$parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
					"cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
			}
			$sql = "SELECT cur_namespace, cur_title, cur_id FROM cur_fast WHERE " .
				implode(" OR ", $parts);
			$res = wfQuery( $sql, DB_WRITE );
			while($row = wfFetchObject( $res ) ){
				# Target exists: replace the assumed-broken id 0 with the real id.
				$pos = $titles_needing_curdata_pos[$row->cur_title . $row->cur_namespace];
				$titles_needing_curdata[$pos]->mArticleID = intval($row->cur_id);
			}
			for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
				$tmplink = $links_corresponding_to_titles[$k];
				$titleCache->set( $tmplink, $titles_needing_curdata[$k] );
				$titles_ready_for_insertion[] = $titles_needing_curdata[$k];
			}
		}

		# Route each resolved title to the right table: image-namespace links
		# to imagelinks, targets with no article id to brokenlinks, the rest
		# to the ordinary links table.
		foreach ( $titles_ready_for_insertion as $nt ) {
			$dest_noslashes = $nt->getPrefixedDBkey();
			$dest = addslashes( $dest_noslashes );
			$dest_id = $nt->getArticleID();
			$from = $from_full_title_with_slashes;

			# print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

			if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
				$iname = addslashes( substr( $dest_noslashes, $inslen ) );
				$imagelinks_inserter->insert( "('{$from}','{$iname}')" );
			} else if ( 0 == $dest_id ) {
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				$links_inserter->insert( "('{$from}',{$dest_id})" );
			}
			$titlecount++;
		}

		# Progress feedback: a dot every 20 articles, rate stats every 1000.
		if ( ( $count % 20 ) == 0 )
			print ".";

		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = persec(1000, $dt);
			$tps = persec($titlecount, $dt);
			$titlecount = 0;
			print "\n$count of $total articles scanned ({$rps} articles ".
				"and {$tps} titles per second)\n";
			print "Title cache hits: " . $titleCache->getPerformance() . "%\n";

		}

	}

	# Push out any rows still sitting in the insert buffers.
	print "\nFlushing insertion buffers...";
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();
	print "ok\n";

	print "$count articles scanned.\n";

	$sql = "UNLOCK TABLES";
	wfQuery( $sql, DB_WRITE );
	print "Done\n";
}
232
# Format a rate of $n items over $t seconds for progress output.
# Returns the string "zero" when nothing was processed, "lots of" when the
# elapsed time rounds to zero, otherwise the integer items-per-second rate.
/* private */ function persec( $n, $t ) {
	if ( $n == 0 ) {
		return "zero";
	}
	return ( $t == 0 ) ? "lots of" : intval( $n / $t );
}
240
241 # InsertBuffer increases performance slightly by inserting many rows
242 # at once. The gain is small (<5%) when running against a local, idle
243 # database, but may be significant in other circumstances. It also
244 # limits the number of inserted rows uppwards, which should avoid
245 # problems with huge articles and certain mysql settings that limits
246 # the size of queries. It's also convenient.
247
class InsertBuffer {
	/* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

	# $sql     - INSERT statement prefix, everything up to the VALUES tuples.
	# $bufsize - buffered row count past which insert() flushes automatically.
	function InsertBuffer( $sql, $bufsize ){
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	# Queue one "(v1,v2,...)" row tuple; once the buffer grows past the
	# configured maximum it is written out in a single query.
	function insert( $value ){
		$this->mBuf[] = $value;
		if ( ++$this->mBufcount > $this->mMaxsize ) {
			$this->flush();
		}
	}

	# Issue one multi-row INSERT for everything buffered so far, then reset.
	# No-op when the buffer is empty.
	function flush(){
		if ( $this->mBufcount == 0 ) {
			return;
		}
		wfQuery( $this->mSql . implode( ",", $this->mBuf ), DB_WRITE );
		$this->mBuf = array();
		$this->mBufcount = 0;
	}

}
278
279 # Select parts from a large table by using the "BETWEEN X AND Y"
280 # operator on the id column. Avoids buffering the whole thing in
281 # RAM. It's also convenient.
282
class SelectPulser {
	/* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	# $sql     - SELECT prefix ending just before the "BETWEEN lo AND hi" clause.
	# $min/$max - inclusive id range to walk.
	# $setsize - width of each id window fetched per query.
	function SelectPulser( $sql, $min, $max, $setsize) {
		$this->mSql = $sql;
		$this->mSet = array();
		$this->mPos = $min;
		$this->mMax = $max;
		$this->mSetsize = $setsize;
	}

	# Return the next row object, fetching a fresh id window from the
	# database whenever the current batch runs dry; false when exhausted.
	function next(){
		// Serve from the batch already in memory, if any.
		if ( count( $this->mSet ) > 0 ) {
			return array_shift( $this->mSet );
		}
		// Advance through id windows until one yields rows or ids run out
		// (windows may be empty where ids were deleted).
		while ( $this->mPos <= $this->mMax ) {
			$lo = $this->mPos;
			$hi = $lo + $this->mSetsize - 1;
			$this->mPos = $hi + 1;

			$rows = array();
			$res = wfQuery( $this->mSql . " BETWEEN " . $lo . " AND " . $hi, DB_READ );
			while ( $row = wfFetchObject( $res ) ) {
				$rows[] = $row;
			}
			wfFreeResult( $res );

			if ( count( $rows ) > 0 ) {
				$this->mSet = $rows;
				return array_shift( $this->mSet );
			}
		}
		return false;
	}
}
318
319 # A simple MRU for general cacheing.
320
class MRUCache {
	/* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
	/* private */ var $hits, $misses;

	# $size      - maximum number of entries kept after a purge.
	# $purgefreq - run purge() every this-many set() stamps; defaults to
	#              1/10 of $size, with a floor of 1.
	function MRUCache( $size, $purgefreq = -1 ) {
		if ( $purgefreq == -1 ) {
			$purgefreq = intval( $size / 10 );
		}
		if ( $purgefreq <= 0 ) {
			$purgefreq = 1;
		}

		$this->mSize = $size;
		$this->mMru = array();
		$this->mCache = array();
		$this->mPurgefreq = $purgefreq;
		$this->nexti = 1;
		print "purgefreq = " . $this->mPurgefreq . "\n";
	}

	# Look up $key; returns the cached value and refreshes its recency
	# stamp, or false on a miss. Hit/miss counters feed getPerformance().
	function get( $key ){
		if ( array_key_exists( $key, $this->mCache ) ) {
			$this->hits++;
			$this->mMru[$key] = $this->nexti++;
			return $this->mCache[$key];
		}
		$this->misses++;
		return false;
	}

	# Store $value under $key with a fresh recency stamp, purging old
	# entries every mPurgefreq stamps.
	function set( $key, $value ){
		$this->mMru[$key] = $this->nexti++;
		$this->mCache[$key] = $value;

		if ( 0 == ( $this->nexti % $this->mPurgefreq ) ) {
			$this->purge();
		}
	}

	# Evict the least-recently-stamped entries until only mSize remain.
	function purge(){
		$excess = count( $this->mMru ) - $this->mSize;
		if ( $excess <= 0 ) {
			return;
		}
		asort( $this->mMru ); // oldest stamps first
		$evicted = array_splice( $this->mMru, 0, $excess );
		foreach ( array_keys( $evicted ) as $key ) {
			unset( $this->mCache[$key] );
		}
	}

	# Hit rate as an integer percentage of all lookups; 0 before any lookup.
	function getPerformance(){
		$tot = $this->hits + $this->misses;
		return ( $tot > 0 ) ? intval( 100.0 * $this->hits / $tot ) : 0;
	}
}
376
377 ?>