From 6da3f169ac55ae87837a4ba3cf3e30f83fbf9d7d Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Sat, 18 Oct 2014 15:31:27 -0700 Subject: [PATCH] Use a fixed regex for StripState The JIT compiler in newer versions of PCRE experiences lock contention when multithreaded applications perform a high rate of concurrent compilations. We are seeing some performance impact on HHVM under normal production traffic. The random part of the strip marker is just there to protect against deliberate insertion of strip markers into the source text, which is very rare. So use a generic regex to find strip markers, and check in the callback whether the random state ID is correct. StripState::killMarkers() will be slower when it has to remove many strip markers, but most calls to it will not match any strip markers, so overall performance should be improved due to reduced JIT compilation. Bug: 72205 Change-Id: I8d37ae929a8c669c9e39adc8096b89e5732b68d0 --- includes/parser/Parser.php | 29 ++++++++++++---------- includes/parser/StripState.php | 44 +++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index ddd1f9a3b9..bcef8bc758 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -112,8 +112,20 @@ class Parser { const OT_MSG = 3; const OT_PLAIN = 4; # like extractSections() - portions of the original are returned unchanged. - # Marker Suffix needs to be accessible staticly. + /** + * Prefix for temporary replacement strings generated by the preprocessor + * ("strip markers"). Using \x7f at the front gives us a little extra + * robustness since it shouldn't match when butted up against + * identifier-like string constructs. + * + * Must not consist of all title characters, or else it will change + * the behavior of in a link. + */ + const MARKER_PREFIX = "\x7fUNIQ"; + /** Suffix for strip markers */ const MARKER_SUFFIX = "-QINU\x7f"; + /** Regex which matches the state ID part of strip markers */ + const MARKER_STATE_ID_REGEX = '[0-9a-f]{16}'; # Markers used for wrapping the table of contents const TOC_START = ''; @@ -330,18 +342,9 @@ class Parser { $this->mLangLinkLanguages = array(); $this->currentRevisionCache = null; - /** - * Prefix for temporary replacement strings for the multipass parser. - * \x07 should never appear in input as it's disallowed in XML. - * Using it at the front also gives us a little extra robustness - * since it shouldn't match when butted up against identifier-like - * string constructs. - * - * Must not consist of all title characters, or else it will change - * the behavior of in a link. - */ - $this->mUniqPrefix = "\x7fUNIQ" . self::getRandomString(); - $this->mStripState = new StripState( $this->mUniqPrefix ); + $stripId = self::getRandomString(); + $this->mUniqPrefix = self::MARKER_PREFIX . $stripId; + $this->mStripState = new StripState( $stripId ); # Clear these on every parse, bug 4549 $this->mTplRedirCache = $this->mTplDomCache = array(); diff --git a/includes/parser/StripState.php b/includes/parser/StripState.php index 5d1743e61c..a1d362b113 100644 --- a/includes/parser/StripState.php +++ b/includes/parser/StripState.php @@ -26,6 +26,7 @@ * @ingroup Parser */ class StripState { + protected $id; protected $prefix; protected $data; protected $regex; @@ -37,15 +38,17 @@ class StripState { const UNSTRIP_RECURSION_LIMIT = 20; /** - * @param string $prefix + * @param string $id */ - public function __construct( $prefix ) { - $this->prefix = $prefix; + public function __construct( $id ) { + $this->id = $id; + $this->prefix = Parser::MARKER_PREFIX . $id; $this->data = array( 'nowiki' => array(), 'general' => array() ); - $this->regex = "/{$this->prefix}([^\x7f]+)" . Parser::MARKER_SUFFIX . '/'; + $this->regex = "/" . Parser::MARKER_PREFIX . + '(' . Parser::MARKER_STATE_ID_REGEX . ")([^\x7f]+)" . Parser::MARKER_SUFFIX . '/'; $this->circularRefGuard = array(); } @@ -73,11 +76,11 @@ class StripState { * @param string $value */ protected function addItem( $type, $marker, $value ) { - if ( !preg_match( $this->regex, $marker, $m ) ) { + if ( !preg_match( $this->regex, $marker, $m ) || $m[1] !== $this->id ) { throw new MWException( "Invalid marker: $marker" ); } - $this->data[$type][$m[1]] = $value; + $this->data[$type][$m[2]] = $value; } /** @@ -131,8 +134,8 @@ class StripState { * @return array */ protected function unstripCallback( $m ) { - $marker = $m[1]; - if ( isset( $this->data[$this->tempType][$marker] ) ) { + $marker = $m[2]; + if ( $m[1] === $this->id && isset( $this->data[$this->tempType][$marker] ) ) { if ( isset( $this->circularRefGuard[$marker] ) ) { return '' . wfMessage( 'parser-unstrip-loop-warning' )->inContentLanguage()->text() @@ -164,7 +167,7 @@ class StripState { * @return StripState */ public function getSubState( $text ) { - $subState = new StripState( $this->prefix ); + $subState = new StripState( $this->id ); $pos = 0; while ( true ) { $startPos = strpos( $text, $this->prefix, $pos ); @@ -175,11 +178,11 @@ class StripState { $endPos += strlen( Parser::MARKER_SUFFIX ); $marker = substr( $text, $startPos, $endPos - $startPos ); - if ( !preg_match( $this->regex, $marker, $m ) ) { + if ( !preg_match( $this->regex, $marker, $m ) || $m[1] !== $this->id ) { continue; } - $key = $m[1]; + $key = $m[2]; if ( isset( $this->data['nowiki'][$key] ) ) { $subState->data['nowiki'][$key] = $this->data['nowiki'][$key]; } elseif ( isset( $this->data['general'][$key] ) ) { @@ -219,8 +222,12 @@ class StripState { * @return string */ protected function mergeCallback( $m ) { - $key = $m[1]; - return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX; + if ( $m[1] === $this->id ) { + $key = $m[2]; + return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX; + } else { + return $m[0]; + } } /** @@ -230,6 +237,15 @@ class StripState { * @return string */ public function killMarkers( $text ) { - return preg_replace( $this->regex, '', $text ); + $id = $this->id; // PHP 5.3 hack + return preg_replace_callback( $this->regex, + function ( $m ) use ( $id ) { + if ( $m[1] === $id ) { + return ''; + } else { + return $m[0]; + } + }, + $text ); } } -- 2.20.1