Some adjustments
[lhc/web/wiklou.git] / maintenance / rebuildlinks.inc
1 <?
2
3 # Functions for rebuilding the link tracking tables; must
4 # be included within a script that also includes the Setup.
5 # See rebuildlinks.php, for example.
6 #
7
# Turn this on if you've got memory to burn.
# NOTE(review): in the code visible here the flag is only declared global
# inside rebuildLinkTables() and never read — confirm whether anything
# else still depends on it.
$wgUseMemoryTables = false;

# Buffer this many rows before inserting them all in one sweep. More
# than about 1000 will probably not increase speed significantly on
# most setups.
# rebuildLinkTables() passes this same value both to SelectPulser (as the
# width of each cur_id range it selects) and to every InsertBuffer (as the
# row threshold that triggers a bulk INSERT).
/* private */ $rowbuf_size = 2000; // 2000 rows ~ 80 kB
15
# Rebuild the links, brokenlinks and imagelinks tables from scratch by
# re-scanning the wikitext of every row in cur.
#
# Steps, in order:
#  1. warn the operator and wait 15 seconds so the run can be aborted;
#  2. empty the three link tables;
#  3. add a temporary unique index to each table, so that the
#     INSERT IGNORE statements used below drop duplicate rows;
#  4. lock the tables involved;
#  5. walk cur in cur_id ranges of $rowbuf_size, pull every [[...]]
#     target out of the article text and queue one row per target;
#  6. flush the queues, drop the temporary indexes and unlock.
#
# Takes no parameters and returns nothing; progress and errors are
# printed to standard output. Must be called from a script that has
# already loaded the wiki setup (wfQuery, wfFetchObject, Title,
# Namespace and $wgLang are all used here but defined elsewhere).
function rebuildLinkTables()
{
	global $wgLang, $wgUseMemoryTables, $rowbuf_size;

	print "This script may take many hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state and you may have problems running\n";
	print "this script again. If you are going to abort, this is the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep(15);

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);

	print "Deleting old data in links table.\n";
	$sql = "DELETE FROM links";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in brokenlinks table.\n";
	$sql = "DELETE FROM brokenlinks";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in imagelinks table.\n";
	$sql = "DELETE FROM imagelinks";
	wfQuery( $sql, DB_WRITE );

	print "\nAdding temporary unique index on links, brokenlinks and imagelinks.\n";
	print "->If build aborts now, you probably aborted a previous build. If that is\n";
	print " the case, you can clean up the remains with the following SQL commands,\n";
	print " and then try again.\n";
	print " ALTER TABLE links DROP INDEX tmp_unique;\n";
	print " ALTER TABLE brokenlinks DROP INDEX tmp_unique;\n";
	print " ALTER TABLE imagelinks DROP INDEX tmp_unique;\n\n";

	# The unique indexes let INSERT IGNORE discard a link that occurs
	# more than once on the same page.
	$sql = "ALTER TABLE links ADD UNIQUE tmp_unique (l_from, l_to)";
	wfQuery( $sql, DB_WRITE );
	$sql = "ALTER TABLE brokenlinks ADD UNIQUE tmp_unique (bl_from, bl_to)";
	wfQuery( $sql, DB_WRITE );
	$sql = "ALTER TABLE imagelinks ADD UNIQUE tmp_unique (il_from, il_to(244))";
	wfQuery( $sql, DB_WRITE );
	print "Temporary unique index added ok. Forget what I said.\n\n";

	print "Locking tables\n";
	$sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE";
	wfQuery( $sql, DB_WRITE );

	print "Finding number of articles to process\n";
	$sql = "SELECT COUNT(*) as count FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );
	$total = $obj->count;

	print "Finding highest article id\n";
	$sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );

	# Reads cur in id ranges so the whole table is never held in memory.
	$cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		$obj->min, $obj->max, $rowbuf_size);

	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);

	$imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
		"(il_from,il_to) VALUES ", $rowbuf_size);

	print "Starting processing\n";

	# Loop-invariant namespace lookups, fetched once.
	$nsImage = Namespace::getImage();
	$nsMedia = Namespace::getMedia();
	$nsSpecial = Namespace::getSpecial();

	$tc = Title::legalChars();

	$start_time = time();
	while ( $row = $cur_pulser->next() ) {
		$from_id = $row->cur_id;
		$ns = $wgLang->getNsText( $row->cur_namespace );

		# Prefixed title of the page being scanned (the link source).
		$raw_title = $row->cur_title;
		if ( "" != $ns ) {
			$raw_title = "$ns:{$raw_title}";
		}
		$title = addslashes( $raw_title );
		$text = $row->cur_text;

		# $m[1] receives the target part of every [[target]] or
		# [[target|label]] occurrence in the text.
		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
			$m, PREG_PATTERN_ORDER );

		for ( $i = 0; $i < $numlinks; ++$i ) {
			if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = Title::newFromText( $m[1][$i] );

			if (! $nt)
			{
				$txt = $m[1][$i];
				print "error in '$ns:{$row->cur_title}' :\t'$txt'\n";
				continue;
			}
			if( $nt->getInterwiki() != "" ) {
				# Interwiki links are not stored in the link tables
				continue;
			}
			if( $nt->getNamespace() == $nsSpecial ) {
				# Special links not stored in link tables
				continue;
			}
			if( $nt->getNamespace() == $nsMedia ) {
				# treat media: links as image: links
				$nt = Title::makeTitle( $nsImage, $nt->getDBkey() );
			}

			# Classify by the link's DESTINATION. The previous code tested
			# the scanned page's own title for the Image: prefix, so links
			# on image description pages were stored as the page pointing
			# at itself, and image links on ordinary pages never reached
			# imagelinks at all.
			# NOTE(review): a leading-colon link such as [[:Image:X]] is
			# presumably meant as an ordinary page link; whether
			# Title::newFromText keeps that distinction is not visible
			# here — confirm.
			if ( $nt->getNamespace() == $nsImage ) {
				# il_to holds the image name without namespace prefix.
				$iname = addslashes( $nt->getDBkey() );
				$imagelinks_inserter->insert( "('{$title}','{$iname}')" );
				continue;
			}

			$dest_id = $nt->getArticleID();
			if ( 0 == $dest_id ) {
				# No article id for the target: record as a broken link,
				# keyed by source id and target title.
				$dest = addslashes( $nt->getPrefixedDBkey() );
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				# Existing target: keyed by source title and target id.
				$links_inserter->insert( "('{$title}',{$dest_id})" );
			}
		}

		# One dot per ten articles ...
		if ( ( $count % 10 ) == 0 )
			print ".";

		# ... and a throughput line per thousand.
		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = ($dt == 0 ? "lots of" : intval(1000/$dt));
			print "\n$count of $total articles scanned ({$rps} articles per second)\n";
		}

	}

	# Write out whatever is still queued.
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();

	print "$total articles scanned.\n";

	print "Removing temporary unique indexes from tables links, brokenlinks and imagelinks.\n";
	$sql = "ALTER TABLE links DROP INDEX tmp_unique";
	wfQuery( $sql, DB_WRITE );
	$sql = "ALTER TABLE brokenlinks DROP INDEX tmp_unique";
	wfQuery( $sql, DB_WRITE );
	$sql = "ALTER TABLE imagelinks DROP INDEX tmp_unique";
	wfQuery( $sql, DB_WRITE );

	$sql = "UNLOCK TABLES";
	wfQuery( $sql, DB_WRITE );
	print "Done\n";
}
182
# InsertBuffer increases performance slightly by inserting many rows
# at once. The gain is small (<5%) when running against a local, idle
# database, but may be significant in other circumstances. It also
# puts an upper limit on the number of rows sent per query, which should
# avoid problems with huge articles and certain MySQL settings that
# limit the size of queries. It's also convenient.
#
# Usage: construct with the statement prefix (everything up to and
# including "VALUES ") and the maximum number of rows per query, call
# insert() once per "(...)" value tuple, and call flush() at the end to
# send whatever is still queued.

class InsertBuffer {
	/* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

	# $sql     - statement prefix that the joined value tuples are appended to
	# $bufsize - number of queued tuples at which a query is sent
	function __construct( $sql, $bufsize ) {
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	# Class-named constructor, kept so that PHP versions which do not
	# treat __construct() as the constructor still initialise the object.
	function InsertBuffer( $sql, $bufsize ) {
		$this->__construct( $sql, $bufsize );
	}

	# Queue one already-escaped value tuple, e.g. "(1,'Foo')". Sends the
	# queue as soon as it holds $bufsize tuples, so no query ever carries
	# more than $bufsize rows (the old '>' test allowed $bufsize + 1).
	function insert( $value ) {
		$this->mBuf[] = $value;
		$this->mBufcount++;
		if ( $this->mBufcount >= $this->mMaxsize ) {
			$this->flush();
		}
	}

	# Send all queued tuples in a single query and empty the queue.
	# Does nothing when the queue is empty.
	function flush() {
		if ( $this->mBufcount <= 0 ) {
			return;
		}
		$sql = $this->mSql . implode( ",", $this->mBuf );
		wfQuery( $sql, DB_WRITE );
		$this->mBuf = array();
		$this->mBufcount = 0;
	}

}
220
# Select parts from a large table by using the "BETWEEN X AND Y"
# operator on the id column. Avoids buffering the whole thing in
# RAM. It's also convenient.
#
# Usage: construct with a query that ends in the id column name (the
# " BETWEEN x AND y" clause is appended to it), the lowest and highest
# id, and the width of each id range; then call next() until it
# returns false.
class SelectPulser {
	/* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	# $sql     - query text up to and including the id column name
	# $min     - first id to fetch; NULL means there is nothing to fetch
	# $max     - last id to fetch; NULL means there is nothing to fetch
	# $setsize - how many consecutive ids each query covers
	function __construct( $sql, $min, $max, $setsize ) {
		$this->mSql = $sql;
		$this->mSet = array();
		$this->mPos = $min;
		$this->mMax = $max;
		# A width below 1 would never advance the cursor and next()
		# would loop forever, so fall back to one id per query.
		$this->mSetsize = ( $setsize < 1 ) ? 1 : $setsize;
	}

	# Class-named constructor, kept so that PHP versions which do not
	# treat __construct() as the constructor still initialise the object.
	function SelectPulser( $sql, $min, $max, $setsize ) {
		$this->__construct( $sql, $min, $max, $setsize );
	}

	# Return the next row object, or false once every id range up to
	# the maximum has been consumed.
	function next() {
		# Serve from the rows already fetched, if any are left.
		$result = current( $this->mSet );
		next( $this->mSet );
		if ( false !== $result ) {
			return $result;
		}

		# MIN()/MAX() over an empty table yield NULL. NULL <= NULL is
		# true in PHP, so without this check a malformed
		# "BETWEEN  AND n" query would be sent.
		if ( is_null( $this->mPos ) || is_null( $this->mMax ) ) {
			return false;
		}

		# Fetch further id ranges, skipping ranges that hold no rows.
		while ( $this->mPos <= $this->mMax ) {
			$sql = $this->mSql . " BETWEEN " . $this->mPos .
				" AND " . ($this->mPos + $this->mSetsize - 1);
			$this->mPos += $this->mSetsize;

			$res = wfQuery( $sql, DB_READ );
			$this->mSet = array();
			while ( $row = wfFetchObject( $res ) ) {
				$this->mSet[] = $row;
			}

			if ( count( $this->mSet ) > 0 ) {
				# Hand out the first row and leave the array pointer on
				# the second one for the following call.
				$result = current( $this->mSet );
				next( $this->mSet );
				return $result;
			}
		}
		return false;
	}
}
259
260 ?>