From bfe131448e17c235110a3d60b80b838a2d270bdc Mon Sep 17 00:00:00 2001
From: Erik Bernhardson
Date: Fri, 2 Aug 2013 16:08:22 -0700
Subject: [PATCH] Batch retrieval of ES data with minimal round trips

Batches ES urls by scheme and host to reduce round trips required to
retrieve many pieces of data. Takes array of ES urls to retrieve and
returns a map from ES urls to their data. Errored urls are represented
with a boolean false in the result set.

Initially implemented for ExternalStoreDB, other stores fallback to
serial requests.

Change-Id: If1bef25f57bfe7de32fc6787f553a90bd76e87ea
---
 includes/externalstore/ExternalStore.php      |  33 ++++++
 includes/externalstore/ExternalStoreDB.php    | 114 ++++++++++++++++--
 .../externalstore/ExternalStoreMedium.php     |  19 +++
 3 files changed, 156 insertions(+), 10 deletions(-)

diff --git a/includes/externalstore/ExternalStore.php b/includes/externalstore/ExternalStore.php
index f5119d5ea7..b5139d644a 100644
--- a/includes/externalstore/ExternalStore.php
+++ b/includes/externalstore/ExternalStore.php
@@ -90,6 +90,39 @@ class ExternalStore {
 		return $store->fetchFromURL( $url );
 	}
 
+	/**
+	 * Fetch data from multiple URLs with a minimum of round trips
+	 *
+	 * @param array $urls The URLs of the text to get
+	 * @return array Map from url to its data. Data is either string when found
+	 *     or false on failure.
+	 */
+	public static function batchFetchFromURLs( array $urls ) {
+		$batches = array();
+		foreach ( $urls as $url ) {
+			$scheme = parse_url( $url, PHP_URL_SCHEME );
+			if ( $scheme ) {
+				$batches[$scheme][] = $url;
+			}
+		}
+		$retval = array();
+		foreach ( $batches as $proto => $batchedUrls ) {
+			$store = self::getStoreObject( $proto );
+			if ( $store === false ) {
+				continue;
+			}
+			$retval += $store->batchFetchFromURLs( $batchedUrls );
+		}
+		// invalid, not found, db dead, etc.
+		$missing = array_diff( $urls, array_keys( $retval ) );
+		if ( $missing ) {
+			foreach ( $missing as $url ) {
+				$retval[$url] = false;
+			}
+		}
+		return $retval;
+	}
+
 	/**
 	 * Store a data item to an external store, identified by a partial URL
 	 * The protocol part is used to identify the class, the rest is passed to the
diff --git a/includes/externalstore/ExternalStoreDB.php b/includes/externalstore/ExternalStoreDB.php
index be9c066ab7..8042430530 100644
--- a/includes/externalstore/ExternalStoreDB.php
+++ b/includes/externalstore/ExternalStoreDB.php
@@ -30,21 +30,13 @@
  */
 class ExternalStoreDB extends ExternalStoreMedium {
 	/**
-	 * The URL returned is of the form of the form DB://cluster/id
+	 * The provided URL is in the form of DB://cluster/id
 	 * or DB://cluster/id/itemid for concatened storage.
 	 *
 	 * @see ExternalStoreMedium::fetchFromURL()
 	 */
 	public function fetchFromURL( $url ) {
-		$path = explode( '/', $url );
-		$cluster = $path[2];
-		$id = $path[3];
-		if ( isset( $path[4] ) ) {
-			$itemID = $path[4];
-		} else {
-			$itemID = false;
-		}
-
+		list( $cluster, $id, $itemID ) = $this->parseURL( $url );
 		$ret =& $this->fetchBlob( $cluster, $id, $itemID );
 
 		if ( $itemID !== false && $ret !== false ) {
@@ -53,6 +45,41 @@ class ExternalStoreDB extends ExternalStoreMedium {
 		return $ret;
 	}
 
+	/**
+	 * Fetch data from given external store URLs.
+	 * The provided URLs are in the form of DB://cluster/id
+	 * or DB://cluster/id/itemid for concatenated storage.
+	 *
+	 * @param array $urls An array of external store URLs
+	 * @return array A map from url to stored content. Failed results
+	 *     are not represented.
+	 */
+	public function batchFetchFromURLs( array $urls ) {
+		$batched = $inverseUrlMap = array();
+		foreach ( $urls as $url ) {
+			list( $cluster, $id, $itemID ) = $this->parseURL( $url );
+			$batched[$cluster][$id][] = $itemID;
+			// false $itemID gets cast to int, but should be ok
+			// since we do === from the $itemID in $batched
+			$inverseUrlMap[$cluster][$id][$itemID] = $url;
+		}
+		$ret = array();
+		foreach ( $batched as $cluster => $batchByCluster ) {
+			$res = $this->batchFetchBlobs( $cluster, $batchByCluster );
+			foreach ( $res as $id => $blob ) {
+				foreach ( $batchByCluster[$id] as $itemID ) {
+					$url = $inverseUrlMap[$cluster][$id][$itemID];
+					if ( $itemID === false ) {
+						$ret[$url] = $blob;
+					} elseif ( is_object( $blob ) ) {
+						$ret[$url] = $blob->getItem( $itemID );
+					}
+				}
+			}
+		}
+		return $ret;
+	}
+
 	/**
 	 * @see ExternalStoreMedium::store()
 	 */
@@ -178,4 +205,71 @@ class ExternalStoreDB extends ExternalStoreMedium {
 		$externalBlobCache = array( $cacheID => &$ret );
 		return $ret;
 	}
+
+	/**
+	 * Fetch multiple blob items out of the database
+	 *
+	 * @param string $cluster A cluster name valid for use with LBFactory
+	 * @param array $ids A map from the blob_id's to look for to the requested itemIDs in the blobs
+	 * @return array A map from the blob_id's requested to their content. Unlocated ids are not represented
+	 */
+	function batchFetchBlobs( $cluster, array $ids ) {
+		$dbr = $this->getSlave( $cluster );
+		$res = $dbr->select( $this->getTable( $dbr ), array( 'blob_id', 'blob_text' ), array( 'blob_id' => array_keys( $ids ) ), __METHOD__ );
+		$ret = array();
+		if ( $res !== false ) {
+			$this->mergeBatchResult( $ret, $ids, $res );
+		}
+		if ( $ids ) {
+			wfDebugLog( __CLASS__, __METHOD__ . " master fallback on '$cluster' for: " . implode( ',', array_keys( $ids ) ) . "\n" );
+			// Try the master
+			$dbw = $this->getMaster( $cluster );
+			$res = $dbw->select( $this->getTable( $dbw ), array( 'blob_id', 'blob_text' ), array( 'blob_id' => array_keys( $ids ) ), __METHOD__ );
+			if ( $res === false ) {
+				wfDebugLog( __CLASS__, __METHOD__ . " master failed on '$cluster'\n" );
+			} else {
+				$this->mergeBatchResult( $ret, $ids, $res );
+			}
+		}
+		if ( $ids ) {
+			wfDebugLog( __CLASS__, __METHOD__ . " master on '$cluster' failed locating items: " . implode( ',', array_keys( $ids ) ) . "\n" );
+		}
+		return $ret;
+	}
+
+	/**
+	 * Helper function for self::batchFetchBlobs for merging master/slave results
+	 * @param array &$ret Current self::batchFetchBlobs return value
+	 * @param array &$ids Map from blob_id to requested itemIDs
+	 * @param mixed $res DB result from DatabaseBase::select
+	 */
+	private function mergeBatchResult( array &$ret, array &$ids, $res ) {
+		foreach ( $res as $row ) {
+			$id = $row->blob_id;
+			$itemIDs = $ids[$id];
+			unset( $ids[$id] ); // to track if everything is found
+			if ( count( $itemIDs ) === 1 && reset( $itemIDs ) === false ) {
+				// single result stored per blob
+				$ret[$id] = $row->blob_text;
+			} else {
+				// multi result stored per blob
+				$ret[$id] = unserialize( $row->blob_text );
+			}
+		}
+	}
+
+	/**
+	 * Split a DB://cluster/id or DB://cluster/id/itemid URL into its parts
+	 *
+	 * @param string $url An external store URL
+	 * @return array List of cluster, blob id and item id; item id is false when the URL has none
+	 */
+	protected function parseURL( $url ) {
+		$path = explode( '/', $url );
+		return array(
+			$path[2], // cluster
+			$path[3], // id
+			isset( $path[4] ) ? $path[4] : false // itemID
+		);
+	}
 }
diff --git a/includes/externalstore/ExternalStoreMedium.php b/includes/externalstore/ExternalStoreMedium.php
index 41af7d87ef..02bdcb51cf 100644
--- a/includes/externalstore/ExternalStoreMedium.php
+++ b/includes/externalstore/ExternalStoreMedium.php
@@ -48,6 +48,25 @@ abstract class ExternalStoreMedium {
 	 */
 	abstract public function fetchFromURL( $url );
 
+	/**
+	 * Fetch data from given external store URLs.
+	 *
+	 * @param array $urls A list of external store URLs
+	 * @return array Map from the url to the text stored. Unfound data is not represented
+	 */
+	public function batchFetchFromURLs( array $urls ) {
+		$retval = array();
+		foreach ( $urls as $url ) {
+			$data = $this->fetchFromURL( $url );
+			// Don't return when false to allow for simpler implementations.
+			// errored urls are handled in ExternalStore::batchFetchFromURLs
+			if ( $data !== false ) {
+				$retval[$url] = $data;
+			}
+		}
+		return $retval;
+	}
+
 	/**
 	 * Insert a data item into a given location
 	 *
-- 
2.20.1