Use a fixed regex for StripState
authorTim Starling <tstarling@wikimedia.org>
Sat, 18 Oct 2014 22:31:27 +0000 (15:31 -0700)
committerOri Livneh <ori@wikimedia.org>
Sun, 19 Oct 2014 21:38:09 +0000 (14:38 -0700)
The JIT compiler in newer versions of PCRE experiences lock contention
when multithreaded applications perform a high rate of concurrent
compilations. We are seeing some performance impact on HHVM under normal
production traffic.

The random part of the strip marker is just there to protect against
deliberate insertion of strip markers into the source text, which is
very rare. So use a generic regex to find strip markers, and check in
the callback whether the random state ID is correct.

StripState::killMarkers() will be slower when it has to remove many
strip markers, but most calls to it will not match any strip markers, so
overall performance should be improved due to reduced JIT compilation.

Bug: 72205
Change-Id: I8d37ae929a8c669c9e39adc8096b89e5732b68d0

includes/parser/Parser.php
includes/parser/StripState.php

index ddd1f9a..bcef8bc 100644 (file)
@@ -112,8 +112,20 @@ class Parser {
        const OT_MSG = 3;
        const OT_PLAIN = 4; # like extractSections() - portions of the original are returned unchanged.
 
-       # Marker Suffix needs to be accessible staticly.
+       /**
+        * Prefix for temporary replacement strings generated by the preprocessor
+        * ("strip markers"). Using \x7f at the front gives us a little extra
+        * robustness since it shouldn't match when butted up against
+        * identifier-like string constructs.
+        *
+        * Must not consist of all title characters, or else it will change
+        * the behavior of <nowiki> in a link.
+        */
+       const MARKER_PREFIX = "\x7fUNIQ";
+       /** Suffix for strip markers */
        const MARKER_SUFFIX = "-QINU\x7f";
+       /** Regex which matches the state ID part of strip markers */
+       const MARKER_STATE_ID_REGEX = '[0-9a-f]{16}';
 
        # Markers used for wrapping the table of contents
        const TOC_START = '<mw:toc>';
@@ -330,18 +342,9 @@ class Parser {
                $this->mLangLinkLanguages = array();
                $this->currentRevisionCache = null;
 
-               /**
-                * Prefix for temporary replacement strings for the multipass parser.
-                * \x07 should never appear in input as it's disallowed in XML.
-                * Using it at the front also gives us a little extra robustness
-                * since it shouldn't match when butted up against identifier-like
-                * string constructs.
-                *
-                * Must not consist of all title characters, or else it will change
-                * the behavior of <nowiki> in a link.
-                */
-               $this->mUniqPrefix = "\x7fUNIQ" . self::getRandomString();
-               $this->mStripState = new StripState( $this->mUniqPrefix );
+               $stripId = self::getRandomString();
+               $this->mUniqPrefix = self::MARKER_PREFIX . $stripId;
+               $this->mStripState = new StripState( $stripId );
 
                # Clear these on every parse, bug 4549
                $this->mTplRedirCache = $this->mTplDomCache = array();
index 5d1743e..a1d362b 100644 (file)
@@ -26,6 +26,7 @@
  * @ingroup Parser
  */
 class StripState {
+       protected $id;
        protected $prefix;
        protected $data;
        protected $regex;
@@ -37,15 +38,17 @@ class StripState {
        const UNSTRIP_RECURSION_LIMIT = 20;
 
        /**
-        * @param string $prefix
+        * @param string $id
         */
-       public function __construct( $prefix ) {
-               $this->prefix = $prefix;
+       public function __construct( $id ) {
+               $this->id = $id;
+               $this->prefix = Parser::MARKER_PREFIX . $id;
                $this->data = array(
                        'nowiki' => array(),
                        'general' => array()
                );
-               $this->regex = "/{$this->prefix}([^\x7f]+)" . Parser::MARKER_SUFFIX . '/';
+               $this->regex = "/" . Parser::MARKER_PREFIX .
+                       '(' . Parser::MARKER_STATE_ID_REGEX . ")([^\x7f]+)" . Parser::MARKER_SUFFIX . '/';
                $this->circularRefGuard = array();
        }
 
@@ -73,11 +76,11 @@ class StripState {
         * @param string $value
         */
        protected function addItem( $type, $marker, $value ) {
-               if ( !preg_match( $this->regex, $marker, $m ) ) {
+               if ( !preg_match( $this->regex, $marker, $m ) || $m[1] !== $this->id ) {
                        throw new MWException( "Invalid marker: $marker" );
                }
 
-               $this->data[$type][$m[1]] = $value;
+               $this->data[$type][$m[2]] = $value;
        }
 
        /**
@@ -131,8 +134,8 @@ class StripState {
         * @return array
         */
        protected function unstripCallback( $m ) {
-               $marker = $m[1];
-               if ( isset( $this->data[$this->tempType][$marker] ) ) {
+               $marker = $m[2];
+               if ( $m[1] === $this->id && isset( $this->data[$this->tempType][$marker] ) ) {
                        if ( isset( $this->circularRefGuard[$marker] ) ) {
                                return '<span class="error">'
                                        . wfMessage( 'parser-unstrip-loop-warning' )->inContentLanguage()->text()
@@ -164,7 +167,7 @@ class StripState {
         * @return StripState
         */
        public function getSubState( $text ) {
-               $subState = new StripState( $this->prefix );
+               $subState = new StripState( $this->id );
                $pos = 0;
                while ( true ) {
                        $startPos = strpos( $text, $this->prefix, $pos );
@@ -175,11 +178,11 @@ class StripState {
 
                        $endPos += strlen( Parser::MARKER_SUFFIX );
                        $marker = substr( $text, $startPos, $endPos - $startPos );
-                       if ( !preg_match( $this->regex, $marker, $m ) ) {
+                       if ( !preg_match( $this->regex, $marker, $m ) || $m[1] !== $this->id ) {
                                continue;
                        }
 
-                       $key = $m[1];
+                       $key = $m[2];
                        if ( isset( $this->data['nowiki'][$key] ) ) {
                                $subState->data['nowiki'][$key] = $this->data['nowiki'][$key];
                        } elseif ( isset( $this->data['general'][$key] ) ) {
@@ -219,8 +222,12 @@ class StripState {
         * @return string
         */
        protected function mergeCallback( $m ) {
-               $key = $m[1];
-               return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX;
+               if ( $m[1] === $this->id ) {
+                       $key = $m[2];
+                       return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX;
+               } else {
+                       return $m[0];
+               }
        }
 
        /**
@@ -230,6 +237,15 @@ class StripState {
         * @return string
         */
        public function killMarkers( $text ) {
-               return preg_replace( $this->regex, '', $text );
+               $id = $this->id; // PHP 5.3 hack
+               return preg_replace_callback( $this->regex,
+                       function ( $m ) use ( $id ) {
+                               if ( $m[1] === $id ) {
+                                       return '';
+                               } else {
+                                       return $m[0];
+                               }
+                       },
+                       $text );
        }
 }