Merge "Rewrite pref cleanup script"
[lhc/web/wiklou.git] / includes / libs / rdbms / loadmonitor / LoadMonitor.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 * @ingroup Database
20 */
21
22 namespace Wikimedia\Rdbms;
23
24 use Psr\Log\LoggerInterface;
25 use Psr\Log\NullLogger;
26 use Wikimedia\ScopedCallback;
27 use BagOStuff;
28 use WANObjectCache;
29
/**
 * Basic DB load monitor with no external dependencies
 *
 * Replication lag times and server weight scaling factors are cached for a
 * short time in both the server-local cache and the shared WAN cache.
 *
 * @ingroup Database
 */
class LoadMonitor implements ILoadMonitor {
	/** @var ILoadBalancer */
	protected $parent;
	/** @var BagOStuff */
	protected $srvCache;
	/** @var WANObjectCache */
	protected $wanCache;
	/** @var LoggerInterface */
	protected $replLogger;

	/** @var float Moving average ratio (e.g. 0.1 for 10% weight to new weight) */
	private $movingAveRatio;
	/** @var int Amount of replication lag in seconds before warnings are logged */
	private $lagWarnThreshold;

	/** @var int cache key version */
	const VERSION = 1;
	/** @var int Default 'max lag' in seconds when unspecified */
	const LAG_WARN_THRESHOLD = 10;

	/**
	 * @param ILoadBalancer $lb
	 * @param BagOStuff $srvCache
	 * @param WANObjectCache $wCache
	 * @param array $options
	 *   - movingAveRatio: moving average constant for server weight updates based on lag
	 *   - lagWarnThreshold: how many seconds of lag trigger warnings
	 */
	public function __construct(
		ILoadBalancer $lb, BagOStuff $srvCache, WANObjectCache $wCache, array $options = []
	) {
		$this->parent = $lb;
		$this->srvCache = $srvCache;
		$this->wanCache = $wCache;
		$this->replLogger = new NullLogger();

		$this->movingAveRatio = isset( $options['movingAveRatio'] )
			? $options['movingAveRatio']
			: 0.1;
		$this->lagWarnThreshold = isset( $options['lagWarnThreshold'] )
			? $options['lagWarnThreshold']
			: self::LAG_WARN_THRESHOLD;
	}

	/**
	 * Set the logger used for replication/lag related messages
	 *
	 * @param LoggerInterface $logger
	 */
	public function setLogger( LoggerInterface $logger ) {
		$this->replLogger = $logger;
	}

	/**
	 * Multiply each server weight by its cached weight scaling coefficient
	 *
	 * Servers with no coefficient in the server states are left unscaled
	 * and an error is logged for them.
	 *
	 * @param array &$weightByServer Map of (server index => weight), modified in place
	 * @param string|bool $domain Passed through to getServerStates()
	 */
	public function scaleLoads( array &$weightByServer, $domain ) {
		$serverIndexes = array_keys( $weightByServer );
		$states = $this->getServerStates( $serverIndexes, $domain );
		$coefficientsByServer = $states['weightScales'];
		foreach ( $weightByServer as $i => $weight ) {
			if ( isset( $coefficientsByServer[$i] ) ) {
				$weightByServer[$i] = $weight * $coefficientsByServer[$i];
			} else { // server recently added to config?
				$host = $this->parent->getServerName( $i );
				$this->replLogger->error( __METHOD__ . ": host $host not in cache" );
			}
		}
	}

	/**
	 * Get the replication lag of the given servers
	 *
	 * @param int[] $serverIndexes
	 * @param string|bool $domain Passed through to getServerStates()
	 * @return array Map of (server index => lag); the value is false for a server
	 *   that was unreachable or for which getLag() returned false
	 */
	public function getLagTimes( array $serverIndexes, $domain ) {
		$states = $this->getServerStates( $serverIndexes, $domain );

		return $states['lagTimes'];
	}

	/**
	 * Get the lag times and weight scaling factors of the given servers
	 *
	 * The result is looked up in the server-local cache, then in the shared cache,
	 * and is only regenerated (by connecting to each server) when both are missing
	 * or older than a randomized 4-5 second TTL. While another process on this server
	 * holds the regeneration lock, a stale cached value is returned if one exists.
	 *
	 * @param int[] $serverIndexes
	 * @param string|bool $domain Not used by this implementation
	 * @return array Map with 'lagTimes' and 'weightScales' sub-maps keyed by server
	 *   index, plus a 'timestamp' float when the value came from (or went to) cache
	 */
	protected function getServerStates( array $serverIndexes, $domain ) {
		$writerIndex = $this->parent->getWriterIndex();
		if ( count( $serverIndexes ) == 1 && reset( $serverIndexes ) == $writerIndex ) {
			# Single server only, just return zero without caching
			return [
				'lagTimes' => [ $writerIndex => 0 ],
				'weightScales' => [ $writerIndex => 1.0 ]
			];
		}

		$key = $this->getCacheKey( $serverIndexes );
		# Randomize TTLs to reduce stampedes (4.0 - 5.0 sec).
		# mt_rand() takes integers, so give the microsecond bounds as integer literals.
		$ttl = mt_rand( 4000000, 5000000 ) / 1e6;
		# Keep keys around longer as fallbacks
		$staleTTL = 60;

		# (a) Check the local APC cache
		$value = $this->srvCache->get( $key );
		if ( $value && $value['timestamp'] > ( microtime( true ) - $ttl ) ) {
			$this->replLogger->debug( __METHOD__ . ": got lag times ($key) from local cache" );

			return $value; // cache hit
		}
		$staleValue = $value ?: false;

		# (b) Check the shared cache and backfill APC
		$value = $this->wanCache->get( $key );
		if ( $value && $value['timestamp'] > ( microtime( true ) - $ttl ) ) {
			$this->srvCache->set( $key, $value, $staleTTL );
			$this->replLogger->debug( __METHOD__ . ": got lag times ($key) from main cache" );

			return $value; // cache hit
		}
		$staleValue = $value ?: $staleValue;

		# (c) Cache key missing or expired; regenerate and backfill
		if ( $this->srvCache->lock( $key, 0, 10 ) ) {
			# Let only this process update the cache value on this server
			$sCache = $this->srvCache;
			# The lock is released when $unlocker goes out of scope at function exit
			/** @noinspection PhpUnusedLocalVariableInspection */
			$unlocker = new ScopedCallback( function () use ( $sCache, $key ) {
				$sCache->unlock( $key );
			} );
		} elseif ( $staleValue ) {
			# Could not acquire lock but an old cache exists, so use it
			return $staleValue;
		}
		# Otherwise (no lock and nothing stale) fall through and regenerate anyway

		$lagTimes = [];
		$weightScales = [];
		$movAveRatio = $this->movingAveRatio;
		foreach ( $serverIndexes as $i ) {
			if ( $i == $writerIndex ) {
				$lagTimes[$i] = 0; // master always has no lag
				$weightScales[$i] = 1.0; // nominal weight
				continue;
			}

			$conn = $this->parent->getAnyOpenConnection( $i );
			if ( $conn ) {
				$close = false; // already open
			} else {
				$conn = $this->parent->openConnection( $i, '' );
				$close = true; // new connection
			}

			# Blend the fresh coefficient into the previous weight scale as a moving average
			$lastWeight = isset( $staleValue['weightScales'][$i] )
				? $staleValue['weightScales'][$i]
				: 1.0;
			$coefficient = $this->getWeightScale( $i, $conn ?: null );
			$newWeight = $movAveRatio * $coefficient + ( 1 - $movAveRatio ) * $lastWeight;

			// Scale from 10% to 100% of nominal weight
			$weightScales[$i] = max( $newWeight, 0.10 );

			$host = $this->parent->getServerName( $i );

			if ( !$conn ) {
				$lagTimes[$i] = false;
				$this->replLogger->error(
					__METHOD__ . ": host {db_server} is unreachable",
					[ 'db_server' => $host ]
				);
				continue;
			}

			if ( $conn->getLBInfo( 'is static' ) ) {
				$lagTimes[$i] = 0;
			} else {
				$lagTimes[$i] = $conn->getLag();
				if ( $lagTimes[$i] === false ) {
					$this->replLogger->error(
						__METHOD__ . ": host {db_server} is not replicating?",
						[ 'db_server' => $host ]
					);
				} elseif ( $lagTimes[$i] > $this->lagWarnThreshold ) {
					# Exceeding the lag *warn* threshold is a warning, not an error
					$this->replLogger->warning(
						"Server {host} has {lag} seconds of lag (>= {maxlag})",
						[
							'host' => $host,
							'lag' => $lagTimes[$i],
							'maxlag' => $this->lagWarnThreshold
						]
					);
				}
			}

			if ( $close ) {
				# Close the connection to avoid sleeper connections piling up.
				# Note that the caller will pick one of these DBs and reconnect,
				# which is slightly inefficient, but this only matters for the lag
				# time cache miss case, which is far less common than cache hits.
				$this->parent->closeConnection( $conn );
			}
		}

		# Add a timestamp key so we know when it was cached
		$value = [
			'lagTimes' => $lagTimes,
			'weightScales' => $weightScales,
			'timestamp' => microtime( true )
		];
		$this->wanCache->set( $key, $value, $staleTTL );
		$this->srvCache->set( $key, $value, $staleTTL );
		$this->replLogger->info( __METHOD__ . ": re-calculated lag times ($key)" );

		return $value;
	}

	/**
	 * Get the coefficient to fold into a server's weight scale
	 *
	 * This base implementation only distinguishes reachable servers (1.0)
	 * from unreachable ones (0.0).
	 *
	 * @param int $index Server index
	 * @param IDatabase|null $conn Connection handle or null on connection failure
	 * @return float
	 */
	protected function getWeightScale( $index, IDatabase $conn = null ) {
		return $conn ? 1.0 : 0.0;
	}

	/**
	 * Build the cache key for the states of a set of servers
	 *
	 * The indexes are sorted first so that the key does not depend on their order.
	 *
	 * @param int[] $serverIndexes
	 * @return string
	 */
	private function getCacheKey( array $serverIndexes ) {
		sort( $serverIndexes );
		// Lag is per-server, not per-DB, so key on the master DB name
		return $this->srvCache->makeGlobalKey(
			'lag-times',
			self::VERSION,
			$this->parent->getServerName( $this->parent->getWriterIndex() ),
			implode( '-', $serverIndexes )
		);
	}
}