Step 4: Profit!!
authorPlatonides <platonides@users.mediawiki.org>
Tue, 26 Jan 2010 18:58:07 +0000 (18:58 +0000)
committerPlatonides <platonides@users.mediawiki.org>
Tue, 26 Jan 2010 18:58:07 +0000 (18:58 +0000)
Add and use PregSplitIterator instead of a direct preg_split.
Slower, but with an upper bound on memory usage.

includes/StringUtils.php
includes/parser/Parser.php
tests/preg_split_test.php [new file with mode: 0644]

index c437b3c..bab9be4 100644 (file)
@@ -179,6 +179,14 @@ class StringUtils {
                        return new ArrayIterator( explode( $separator, $subject ) );
                }
        }
+
+       /**
+        * Workalike for preg_split() with limited memory usage.
+        *
+        * Takes the same arguments as preg_split(), but hands back an
+        * Iterator (a PregSplitIterator) over the elements instead of an array.
+        */
+       static function preg_split( $pattern, $subject, $limit = -1, $flags = 0 ) {
+               $pieces = new PregSplitIterator( $pattern, $subject, $limit, $flags );
+               return $pieces;
+       }
 }
 
 /**
@@ -409,3 +417,82 @@ class ExplodeIterator implements Iterator {
        }
 }
 
+
+/**
+ * An iterator which works like:
+ *
+ * foreach ( preg_split( $pattern, $s, $limit, $flags ) as $element ) {
+ *    ...
+ * }
+ *
+ * except that the subject is split in groups of at most MAX_LIMIT pieces,
+ * so the complete result never has to be held in memory at once.
+ *
+ * Each group is produced by running preg_split() on the tail of the subject
+ * which has not been split yet. The output is therefore only guaranteed to
+ * match a plain preg_split() for patterns which do not look at the text
+ * before the match (^, \G, lookbehind) and which cannot match the empty
+ * string.
+ *
+ * The flags PREG_SPLIT_NO_EMPTY, PREG_SPLIT_DELIM_CAPTURE and
+ * PREG_SPLIT_OFFSET_CAPTURE are passed through to preg_split(); captured
+ * offsets are rebased so that they refer to the original subject.
+ *
+ * If preg_split() fails, the iteration simply ends, which is what a
+ * foreach over the false returned by preg_split() would do.
+ */
+class PregSplitIterator implements Iterator {
+       // The arguments given to the constructor.
+       var $pattern, $subject, $originalLimit, $flags;
+
+       // The last extracted group of items.
+       var $smallArray;
+
+       // The position on the iterator, used as the key of the current item.
+       var $curPos;
+
+       // Number of pieces the caller's limit still allows, or -1 for no limit.
+       var $limit;
+
+       // Index of the current item inside $smallArray.
+       private $bufPos;
+
+       // Tail of the subject which still has to be split, or null if none.
+       private $rest;
+
+       // Maximum number of pieces extracted by a single preg_split() call.
+       const MAX_LIMIT = 100;
+
+       /**
+        * Construct a PregSplitIterator
+        *
+        * @param $pattern String: the regular expression to split on
+        * @param $s String: the subject string
+        * @param $limit Integer: as for preg_split(); -1 or 0 mean no limit
+        * @param $flags Integer: bitwise combination of PREG_SPLIT_* flags
+        */
+       function __construct( $pattern, $s, $limit, $flags ) {
+               $this->pattern = $pattern;
+               $this->subject = $s;
+               $this->originalLimit = $limit;
+               $this->flags = $flags;
+
+               $this->rewind();
+       }
+
+       /**
+        * Split the pending tail of the subject into the next group of items.
+        *
+        * On return $smallArray holds the new group and $rest holds the part
+        * which is still unsplit, or null when the subject is exhausted.
+        */
+       private function fill() {
+               $input = $this->rest;
+               $this->rest = null;
+               $this->bufPos = 0;
+
+               if ( $this->limit === -1 ) {
+                       # No limit: ask for one piece more than needed, the extra
+                       # one being the unsplit tail.
+                       $chunkLimit = self::MAX_LIMIT + 1;
+                       $mayContinue = true;
+               } elseif ( $this->limit > self::MAX_LIMIT ) {
+                       $this->limit -= self::MAX_LIMIT;
+                       $chunkLimit = self::MAX_LIMIT + 1;
+                       $mayContinue = true;
+               } else {
+                       # The remaining limit fits in one call, so preg_split()
+                       # can apply it directly and this is the final group.
+                       $chunkLimit = $this->limit;
+                       $this->limit = 0;
+                       $mayContinue = false;
+               }
+
+               $parts = preg_split( $this->pattern, $input, $chunkLimit, $this->flags );
+               if ( !is_array( $parts ) ) {
+                       # preg_split() failed: nothing more to iterate over.
+                       $parts = array();
+               }
+
+               $withOffsets = ( $this->flags & PREG_SPLIT_OFFSET_CAPTURE ) != 0;
+               $count = count( $parts );
+               if ( $mayContinue && $count > 1 ) {
+                       # The last item is either the final piece or the tail left
+                       # over because $chunkLimit was reached. Its position in the
+                       # array is not fixed, as PREG_SPLIT_DELIM_CAPTURE adds items.
+                       # It is only kept for another round if it can be split
+                       # further and is shorter than the input, which guarantees
+                       # that the iteration terminates.
+                       $last = $parts[$count - 1];
+                       $tail = $withOffsets ? $last[0] : $last;
+                       if ( strlen( $tail ) < strlen( $input )
+                               && preg_match( $this->pattern, $tail ) === 1
+                       ) {
+                               array_pop( $parts );
+                               $this->rest = $tail;
+                       }
+               }
+
+               if ( $withOffsets ) {
+                       # $input is a suffix of the subject, so offsets relative to
+                       # it are shifted by the length of what was already consumed.
+                       $base = strlen( (string)$this->subject ) - strlen( $input );
+                       if ( $base > 0 ) {
+                               foreach ( $parts as $i => $part ) {
+                                       $parts[$i][1] = $part[1] + $base;
+                               }
+                       }
+               }
+
+               $this->smallArray = $parts;
+       }
+
+       function rewind() {
+               $this->curPos = 0;
+
+               # preg_split() treats both -1 and 0 (and null) as "no limit".
+               $limit = (int)$this->originalLimit;
+               $this->limit = ( $limit === 0 ) ? -1 : $limit;
+
+               $this->rest = (string)$this->subject;
+               $this->fill();
+       }
+
+       function current() {
+               return $this->smallArray[$this->bufPos];
+       }
+
+       function key() {
+               return $this->curPos;
+       }
+
+       function next() {
+               $this->curPos++;
+               $this->bufPos++;
+               if ( $this->bufPos >= count( $this->smallArray ) && $this->rest !== null ) {
+                       # Current group used up, split the next part of the tail.
+                       $this->fill();
+               }
+       }
+
+       function valid() {
+               return $this->bufPos < count( $this->smallArray );
+       }
+}
index e419a30..50e15dc 100644 (file)
@@ -1154,7 +1154,7 @@ class Parser
                        # be text, and the remaining three constitute mark-up for bold text.
                        # If there are more than 6 apostrophes in a row, assume they're all
                        # text except for the last 6.           
-                       $arr = preg_split( "/('{2,3}(?:''')?)(?!')/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
+                       $arr = Stringutils::preg_split( "/('{2,3}(?:''')?)(?!')/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
 
 
                        # Now let's actually convert our apostrophic mush to HTML!
diff --git a/tests/preg_split_test.php b/tests/preg_split_test.php
new file mode 100644 (file)
index 0000000..69c977f
--- /dev/null
@@ -0,0 +1,24 @@
+<?php
+/**
+ * Manual check of StringUtils::preg_split() against the native preg_split().
+ *
+ * Splits the same large subject with both, then prints an md5 of the joined
+ * pieces and the difference in memory_get_usage() for each. Exits with
+ * status 1 if the two hashes differ.
+ */
+
+# Resolve the path from this file's directory: a "../" include is otherwise
+# looked up relative to the current working directory, so the script would
+# only work when started from inside tests/.
+require_once dirname( __FILE__ ) . '/../includes/StringUtils.php';
+
+$pattern = "/('')+/";
+$subject = str_repeat("'' ", 1024*1024 + 7);
+
+$m = memory_get_usage();
+
+$ps1 = preg_split($pattern, $subject);
+
+$r = "";
+foreach ($ps1 as $c) {
+       $r .= $c . "|";
+}
+$originalHash = md5($r);
+echo "Original preg_split: " . $originalHash . "  " . (memory_get_usage()-$m) . "\n";
+
+# Release the full array so that it is not counted in the second measurement.
+unset($ps1);
+
+$r = "";
+$ps2 = StringUtils::preg_split($pattern, $subject);
+foreach ($ps2 as $c) {
+       $r .= $c . "|";
+}
+$iteratorHash = md5($r);
+echo "StringUtils preg_split: " . $iteratorHash . "  " . (memory_get_usage()-$m) . "\n";
+
+# Both implementations must produce the same sequence of pieces.
+if ($originalHash !== $iteratorHash) {
+       echo "MISMATCH: the two results differ\n";
+       exit(1);
+}