Make getLagFromPtHeartbeat() always use the LB cluster master entry
authorAaron Schulz <aschulz@wikimedia.org>
Fri, 4 Dec 2015 00:37:56 +0000 (16:37 -0800)
committerAaron Schulz <aschulz@wikimedia.org>
Thu, 10 Dec 2015 06:00:19 +0000 (22:00 -0800)
Before, it just used the immediate master entry, which could be another
slave. In that case, it may not even exists at all.

Bug: T119648
Change-Id: Iea970b81ad2c9855aafcccf0bb0662fc0b3a8d4d

includes/db/Database.php
includes/db/DatabaseMysqlBase.php
includes/db/loadbalancer/LoadBalancer.php
tests/phpunit/includes/db/DatabaseMysqlBaseTest.php
tests/phpunit/includes/db/LBFactoryTest.php

index dea4a59..31e2653 100644 (file)
@@ -148,6 +148,9 @@ abstract class DatabaseBase implements IDatabase {
         */
        private $mTrxWriteDuration = 0.0;
 
+       /** @var IDatabase|null Lazy handle to the master DB this server replicates from */
+       private $lazyMasterHandle;
+
        /**
         * @since 1.21
         * @var resource File handle for upgrade
@@ -328,6 +331,25 @@ abstract class DatabaseBase implements IDatabase {
                }
        }
 
+       /**
+        * Set a lazy-connecting DB handle to the master DB (for replication status purposes)
+        *
+        * @param IDatabase $conn
+        * @since 1.27
+        */
+       public function setLazyMasterHandle( IDatabase $conn ) {
+               $this->lazyMasterHandle = $conn;
+       }
+
+       /**
+        * @return IDatabase|null
+        * @see setLazyMasterHandle()
+        * @since 1.27
+        */
+       public function getLazyMasterHandle() {
+               return $this->lazyMasterHandle;
+       }
+
        /**
         * @return TransactionProfiler
         */
index ca568ad..c3cdf35 100644 (file)
@@ -621,13 +621,20 @@ abstract class DatabaseMysqlBase extends Database {
        abstract protected function mysqlPing();
 
        function getLag() {
-               if ( $this->lagDetectionMethod === 'pt-heartbeat' ) {
+               if ( $this->getLagDetectionMethod() === 'pt-heartbeat' ) {
                        return $this->getLagFromPtHeartbeat();
                } else {
                        return $this->getLagFromSlaveStatus();
                }
        }
 
+       /**
+        * @return string
+        */
+       protected function getLagDetectionMethod() {
+               return $this->lagDetectionMethod;
+       }
+
        /**
         * @return bool|int
         */
@@ -645,35 +652,82 @@ abstract class DatabaseMysqlBase extends Database {
         * @return bool|float
         */
        protected function getLagFromPtHeartbeat() {
-               $key = wfMemcKey( 'mysql', 'master-server-id', $this->getServer() );
-               $masterId = intval( $this->srvCache->get( $key ) );
-               if ( !$masterId ) {
-                       $res = $this->query( 'SHOW SLAVE STATUS', __METHOD__ );
-                       $row = $res ? $res->fetchObject() : false;
-                       if ( $row && strval( $row->Master_Server_Id ) !== '' ) {
-                               $masterId = intval( $row->Master_Server_Id );
-                               $this->srvCache->set( $key, $masterId, 30 );
-                       }
+               $masterInfo = $this->getMasterServerInfo();
+               if ( !$masterInfo ) {
+                       return false; // could not get master server ID
                }
 
-               if ( !$masterId ) {
-                       return false;
+               list( $time, $nowUnix ) = $this->getHeartbeatData( $masterInfo['serverId'] );
+               if ( $time !== null ) {
+                       // @time is in ISO format like "2015-09-25T16:48:10.000510"
+                       $dateTime = new DateTime( $time, new DateTimeZone( 'UTC' ) );
+                       $timeUnix = (int)$dateTime->format( 'U' ) + $dateTime->format( 'u' ) / 1e6;
+
+                       return max( $nowUnix - $timeUnix, 0.0 );
                }
 
+               return false;
+       }
+
+       protected function getMasterServerInfo() {
+               $cache = $this->srvCache;
+               $key = $cache->makeGlobalKey(
+                       'mysql',
+                       'master-info',
+                       // Using one key for all cluster slaves is preferable
+                       $this->getLBInfo( 'clusterMasterHost' ) ?: $this->getServer()
+               );
+
+               $that = $this;
+               return $cache->getWithSetCallback(
+                       $key,
+                       $cache::TTL_INDEFINITE,
+                       function () use ( $that, $cache, $key ) {
+                               // Get and leave a lock key in place for a short period
+                               if ( !$cache->lock( $key, 0, 10 ) ) {
+                                       return false; // avoid master connection spike slams
+                               }
+
+                               $conn = $that->getLazyMasterHandle();
+                               if ( !$conn ) {
+                                       return false; // something is misconfigured
+                               }
+
+                               // Connect to and query the master; catch errors to avoid outages
+                               try {
+                                       $res = $conn->query( 'SELECT @@server_id AS id', __METHOD__ );
+                                       $row = $res ? $res->fetchObject() : false;
+                                       $id = $row ? (int)$row->id : 0;
+                               } catch ( DBError $e ) {
+                                       $id = 0;
+                               }
+
+                               // Cache the ID if it was retrieved
+                               return $id ? array( 'serverId' => $id, 'asOf' => time() ) : false;
+                       }
+               );
+       }
+
+       /**
+        * @param string $masterId Server ID
+        * @return array (heartbeat `ts` column value or null, UNIX timestamp)
+        * @see https://www.percona.com/doc/percona-toolkit/2.1/pt-heartbeat.html
+        */
+       protected function getHeartbeatData( $masterId ) {
+               // Get the status row for this master; use the oldest for sanity in case the master
+               // has entries listed under different server IDs (which should really not happen).
+               // Note: this would use "MAX(TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)))" but the
+               // percision field is not supported in MySQL <= 5.5.
                $res = $this->query(
-                       "SELECT TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)) AS Lag " .
-                       "FROM heartbeat.heartbeat WHERE server_id = $masterId"
+                       "SELECT ts FROM heartbeat.heartbeat WHERE server_id=" . intval( $masterId )
                );
                $row = $res ? $res->fetchObject() : false;
-               if ( $row ) {
-                       return max( floatval( $row->Lag ) / 1e6, 0.0 );
-               }
 
-               return false;
+               return array( $row ? $row->ts : null, microtime( true ) );
        }
 
        public function getApproximateLagStatus() {
-               if ( $this->lagDetectionMethod === 'pt-heartbeat' ) {
+               if ( $this->getLagDetectionMethod() === 'pt-heartbeat' ) {
                        // Disable caching since this is fast enough and we don't wan't
                        // to be *too* pessimistic by having both the cache TTL and the
                        // pt-heartbeat interval count as lag in getSessionLagStatus()
index 19b2d1c..03c74fc 100644 (file)
@@ -816,11 +816,14 @@ class LoadBalancer {
                        $server['dbname'] = $dbNameOverride;
                }
 
+               // Let the handle know what the cluster master is (e.g. "db1052")
+               $masterName = $this->getServerName( 0 );
+               $server['clusterMasterHost'] = $masterName;
+
                // Log when many connection are made on requests
                if ( ++$this->connsOpened >= self::CONN_HELD_WARN_THRESHOLD ) {
-                       $masterAddr = $this->getServerName( 0 );
                        wfDebugLog( 'DBPerformance', __METHOD__ . ": " .
-                               "{$this->connsOpened}+ connections made (master=$masterAddr)\n" .
+                               "{$this->connsOpened}+ connections made (master=$masterName)\n" .
                                wfBacktrace( true ) );
                }
 
@@ -834,6 +837,9 @@ class LoadBalancer {
                }
 
                $db->setLBInfo( $server );
+               $db->setLazyMasterHandle(
+                       $this->getLazyConnectionRef( DB_MASTER, array(), $db->getWikiID() )
+               );
 
                return $db;
        }
index 31e4f5b..6403905 100644 (file)
@@ -256,4 +256,59 @@ class DatabaseMysqlBaseTest extends MediaWikiTestCase {
                $this->assertTrue( $pos2->hasReached( $pos1 ) );
                $this->assertFalse( $pos1->hasReached( $pos2 ) );
        }
+
+       /**
+        * @dataProvider provideLagAmounts
+        */
+       function testPtHeartbeat( $lag ) {
+               $db = $this->getMockBuilder( 'DatabaseMysql' )
+                       ->disableOriginalConstructor()
+                       ->setMethods( array(
+                               'getLagDetectionMethod', 'getHeartbeatData', 'getMasterServerInfo' ) )
+                       ->getMock();
+
+               $db->expects( $this->any() )
+                       ->method( 'getLagDetectionMethod' )
+                       ->will( $this->returnValue( 'pt-heartbeat' ) );
+
+               $db->expects( $this->any() )
+                       ->method( 'getMasterServerInfo' )
+                       ->will( $this->returnValue( array( 'serverId' => 172, 'asOf' => time() ) ) );
+
+               // Fake the current time.
+               list( $nowSecFrac, $nowSec ) = explode( ' ', microtime() );
+               $now = (float)$nowSec + (float)$nowSecFrac;
+               // Fake the heartbeat time.
+               // Work arounds for weak DataTime microseconds support.
+               $ptTime = $now - $lag;
+               $ptSec = (int)$ptTime;
+               $ptSecFrac = ( $ptTime - $ptSec );
+               $ptDateTime = new DateTime( "@$ptSec" );
+               $ptTimeISO = $ptDateTime->format( 'Y-m-d\TH:i:s' );
+               $ptTimeISO .= ltrim( number_format( $ptSecFrac, 6 ), '0' );
+
+               $db->expects( $this->any() )
+                       ->method( 'getHeartbeatData' )
+                       ->with( 172 )
+                       ->will( $this->returnValue( array( $ptTimeISO, $now ) ) );
+
+               $db->setLBInfo( 'clusterMasterHost', 'db1052' );
+               $lagEst = $db->getLag();
+
+               $this->assertGreaterThan( $lag - .010, $lagEst, "Correct heatbeat lag" );
+               $this->assertLessThan( $lag + .010, $lagEst, "Correct heatbeat lag" );
+       }
+
+       function provideLagAmounts() {
+               return array(
+                       array( 0 ),
+                       array( 0.3 ),
+                       array( 6.5 ),
+                       array( 10.1 ),
+                       array( 200.2 ),
+                       array( 400.7 ),
+                       array( 600.22 ),
+                       array( 1000.77 ),
+               );
+       }
 }
index 519d3a0..a647445 100644 (file)
@@ -103,9 +103,13 @@ class LBFactoryTest extends MediaWikiTestCase {
 
                $dbw = $lb->getConnection( DB_MASTER );
                $this->assertTrue( $dbw->getLBInfo( 'master' ), 'master shows as master' );
+               $this->assertEquals(
+                       $wgDBserver, $dbw->getLBInfo( 'clusterMasterHost' ), 'cluster master set' );
 
                $dbr = $lb->getConnection( DB_SLAVE );
                $this->assertTrue( $dbr->getLBInfo( 'slave' ), 'slave shows as slave' );
+               $this->assertEquals(
+                       $wgDBserver, $dbr->getLBInfo( 'clusterMasterHost' ), 'cluster master set' );
 
                $factory->shutdown();
                $lb->closeAll();