$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
- // Add extra spacing around break tags so text crammed together like<br>this
- // doesn't make one word.
- $text = str_replace( '<br', "\n<br", $text );
-
$formatter = new HtmlFormatter( $text );
// Strip elements from the page that we never want in the search text.
$this->text .= substr( $text, $start, $length );
}
function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
- // Do nothing.
+ // Inject whitespace for typical block-level tags to
+ // prevent merging unrelated<br>words.
+ if ( $this->isBlockLevelTag( $name ) ) {
+ $this->text .= ' ';
+ }
}
function endTag( $name, $sourceStart, $sourceLength ) {
- // Do nothing.
+ // Inject whitespace for typical block-level tags to
+ // prevent merging unrelated<br>words.
+ if ( $this->isBlockLevelTag( $name ) ) {
+ $this->text .= ' ';
+ }
}
function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
// Do nothing.
function comment( $text, $sourceStart, $sourceLength ) {
// Do nothing.
}
+
+ // Per https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+ // retrieved on sept 12, 2018. <br> is not block level but was added anyways.
+ // The following is a complete list of all HTML block level elements
+ // (although "block-level" is not technically defined for elements that are
+ // new in HTML5).
+ // Structured as tag => true to allow O(1) membership test.
+ static private $BLOCK_LEVEL_TAGS = [
+ 'address' => true,
+ 'article' => true,
+ 'aside' => true,
+ 'blockquote' => true,
+ 'br' => true,
+ 'canvas' => true,
+ 'dd' => true,
+ 'div' => true,
+ 'dl' => true,
+ 'dt' => true,
+ 'fieldset' => true,
+ 'figcaption' => true,
+ 'figure' => true,
+ 'figcaption' => true,
+ 'footer' => true,
+ 'form' => true,
+ 'h1' => true,
+ 'h2' => true,
+ 'h3' => true,
+ 'h4' => true,
+ 'h5' => true,
+ 'h6' => true,
+ 'header' => true,
+ 'hgroup' => true,
+ 'hr' => true,
+ 'li' => true,
+ 'main' => true,
+ 'nav' => true,
+ 'noscript' => true,
+ 'ol' => true,
+ 'output' => true,
+ 'p' => true,
+ 'pre' => true,
+ 'section' => true,
+ 'table' => true,
+ 'tfoot' => true,
+ 'ul' => true,
+ 'video' => true,
+ ];
+
+ /**
+ * Detect block level tags. Of course css can make anything a block
+ * level tag, but this is still better than nothing.
+ *
+ * @param string $tagName HTML tag name
+ * @return bool True when tag is an html block level element
+ */
+ private function isBlockLevelTag( $tagName ) {
+ $key = strtolower( trim( $tagName ) );
+ return isset( self::$BLOCK_LEVEL_TAGS[$key] );
+ }
}
END;
$struct = $this->getStructure( $text );
$this->assertEquals( "Opening text is opening.", $struct->getOpeningText() );
- $this->assertEquals( "Opening text is opening. Then we got more text",
+ $this->assertEquals( "Opening text is opening. Then we got more text",
$struct->getMainText() );
$this->assertEquals( [ "Header table row in table another row in table" ],
$struct->getAuxiliaryText() );
}
+
+ public function testPreservesWordSpacing() {
+ $text = "<dd><dl>foo</dl><dl>bar</dl></dd><p>baz</p>";
+ $struct = $this->getStructure( $text );
+ $this->assertEquals( "foo bar baz", $struct->getMainText() );
+ }
}
public function provideStripAllTags() {
return [
[ '<p>Foo</p>', 'Foo' ],
- [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
+ [ '<p id="one">Foo</p><p id="two">Bar</p>', 'Foo Bar' ],
[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
[ '<p>Hello <strong> world café</p>', 'Hello <strong> world cafĂ©' ],
[