/**
* Removes content we've chosen to remove. The text of the removed elements can be
* extracted with the getText method.
- * @return array of removed DOMElements
+ * @return array Array of removed DOMElements
*/
public function filterContent() {
wfProfileIn( __METHOD__ );
$removals = $this->parseItemsToRemove();
- if ( !$removals ) {
+ // Bail out early if nothing to do
+ if ( array_reduce( $removals,
+ function ( $carry, $item ) {
+ return $carry && !$item;
+ },
+ true
+ ) ) {
wfProfileOut( __METHOD__ );
return array();
}
/**
* Removes a list of elements from DOMDocument
* @param array|DOMNodeList $elements
- * @return array of removed elements
+ * @return array Array of removed elements
*/
private function removeElements( $elements ) {
$list = $elements;
private function fixLibXML( $html ) {
wfProfileIn( __METHOD__ );
static $replacements;
- if ( ! $replacements ) {
+ if ( !$replacements ) {
// We don't include rules like '&quot;' => '"' because entities had already been
// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
$replacements = new ReplacementArray( array(
* specify the $element in the method it'll change the underlying dom and you won't be able to get
* it back.
*
- * @param DOMElement|string|null $element ID of element to get HTML from or false to get it from the whole tree
+ * @param DOMElement|string|null $element Element or ID of element to get HTML from,
+ * or null to get it from the whole tree
* @return string Processed HTML
*/
public function getText( $element = null ) {
// XML code paths if possible and fix there.
$html = str_replace( ' ', '', $html );
}
- $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
wfProfileOut( __METHOD__ . '-fixes' );
} else {
$html = $this->html;
}
+ // Remove stuff added by wrapHTML()
+ $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
$html = $this->onHtmlReady( $html );
wfProfileIn( __METHOD__ . '-flatten' );
}
/**
+ * Helper function for parseItemsToRemove(). This function extracts the selector type
+ * and the raw name of a selector from a CSS-style selector string and assigns those
+ * values to parameters passed by reference. For example, if given '#toc' as the
+ * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
* @param string $selector CSS selector to parse
- * @param string $type
- * @param string $rawName
+ * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
+ * @param string $rawName The raw name of the selector
* @return bool Whether the selector was successfully recognised
*/
protected function parseSelector( $selector, &$type, &$rawName ) {
}
/**
- * Transforms CSS selectors into an internal representation suitable for processing
+ * Transforms CSS-style selectors into an internal representation suitable for
+ * processing by filterContent()
* @return array
*/
protected function parseItemsToRemove() {