Merge "rdbms: avoid lag estimates in getLagFromPtHeartbeat ruined by snapshots"
authorjenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Sat, 31 Mar 2018 08:31:10 +0000 (08:31 +0000)
committerGerrit Code Review <gerrit@wikimedia.org>
Sat, 31 Mar 2018 08:31:11 +0000 (08:31 +0000)
1  2 
includes/libs/rdbms/database/Database.php
includes/libs/rdbms/database/DatabaseMysqlBase.php

@@@ -3392,8 -3392,10 +3392,8 @@@ abstract class Database implements IDat
                $this->trxWriteAdjQueryCount = 0;
                $this->trxWriteCallers = [];
                // First SELECT after BEGIN will establish the snapshot in REPEATABLE-READ.
 -              // Get an estimate of the replica DB lag before then, treating estimate staleness
 -              // as lag itself just to be safe
 -              $status = $this->getApproximateLagStatus();
 -              $this->trxReplicaLag = $status['lag'] + ( microtime( true ) - $status['since'] );
 +              // Get an estimate of the replication lag before any such queries.
 +              $this->trxReplicaLag = $this->getApproximateLagStatus()['lag'];
                // T147697: make explicitTrxActive() return true until begin() finishes. This way, no
                // caller will think its OK to muck around with the transaction just because startAtomic()
                // has not yet completed (e.g. setting trxAtomicLevels).
         * @return array|null ('lag': seconds or false on error, 'since': UNIX timestamp of BEGIN)
         * @since 1.27
         */
-       protected function getTransactionLagStatus() {
+       final protected function getTransactionLagStatus() {
                return $this->trxLevel
                        ? [ 'lag' => $this->trxReplicaLag, 'since' => $this->trxTimestamp() ]
                        : null;
@@@ -763,6 -763,17 +763,17 @@@ abstract class DatabaseMysqlBase extend
        protected function getLagFromPtHeartbeat() {
                $options = $this->lagDetectionOptions;
  
+               if ( $this->trxLevel ) {
+                       // Avoid returning higher and higher lag value due to snapshot age
+                       // given that the isolation level will typically be REPEATABLE-READ
+                       $this->queryLogger->warning(
+                               "Using cached lag value for {db_server} due to active transaction",
+                               $this->getLogContext( [ 'method' => __METHOD__ ] )
+                       );
+                       return $this->getTransactionLagStatus()['lag'];
+               }
                if ( isset( $options['conds'] ) ) {
                        // Best method for multi-DC setups: use logical channel names
                        $data = $this->getHeartbeatData( $options['conds'] );
                        $rpos = $this->getReplicaPos();
                        $gtidsWait = $rpos ? MySQLMasterPos::getCommonDomainGTIDs( $pos, $rpos ) : [];
                        if ( !$gtidsWait ) {
 +                              $this->queryLogger->error(
 +                                      "No GTIDs with the same domain between master ($pos) and replica ($rpos)",
 +                                      $this->getLogContext( [
 +                                              'method' => __METHOD__,
 +                                      ] )
 +                              );
 +
                                return -1; // $pos is from the wrong cluster?
                        }
                        // Wait on the GTID set (MariaDB only)