}
/**
- * Removes content we've chosen to remove
+ * Removes content we've chosen to remove. The text of the removed elements can be
+ * extracted with the getText method.
+ * @return array of removed DOMElements
*/
public function filterContent() {
wfProfileIn( __METHOD__ );
$removals = $this->parseItemsToRemove();
- if ( !$removals ) {
+ // Bail out early if nothing to do
+ if ( array_reduce( $removals,
+ function( $carry, $item ) {
+ return $carry && !$item;
+ },
+ true
+ ) ) {
wfProfileOut( __METHOD__ );
- return;
+ return array();
}
$doc = $this->getDoc();
}
}
}
-
- $this->removeElements( $domElemsToRemove );
+ $removed = $this->removeElements( $domElemsToRemove );
// Elements with named IDs
$domElemsToRemove = array();
$domElemsToRemove[] = $itemToRemoveNode;
}
}
- $this->removeElements( $domElemsToRemove );
+ $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
// CSS Classes
$domElemsToRemove = array();
}
}
}
- $this->removeElements( $domElemsToRemove );
+ $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
// Tags with CSS Classes
foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
$elements = $xpath->query(
'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
);
-
- $this->removeElements( $elements );
+ $removed = array_merge( $removed, $this->removeElements( $elements ) );
}
wfProfileOut( __METHOD__ );
+ return $removed;
}
/**
* Removes a list of elements from DOMDocument
* @param array|DOMNodeList $elements
+ * @return array of removed elements
*/
private function removeElements( $elements ) {
$list = $elements;
$element->parentNode->removeChild( $element );
}
}
+ return $list;
}
/**
}
/**
- * Performs final transformations and returns resulting HTML
+ * Performs final transformations and returns resulting HTML. Note that if you want to call this
+ * both without an element and with an element you should call it without an element first. If you
+ * specify the $element in the method it'll change the underlying dom and you won't be able to get
+ * it back.
*
- * @param DOMElement|string|null $element ID of element to get HTML from or false to get it from the whole tree
+ * @param DOMElement|string|null $element ID of element to get HTML from or
+ * null (the default) to get it from the whole tree
* @return string Processed HTML
*/
public function getText( $element = null ) {
// XML code paths if possible and fix there.
$html = str_replace( ' ', '', $html );
}
- $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
wfProfileOut( __METHOD__ . '-fixes' );
} else {
$html = $this->html;
}
+ // Remove stuff added by wrapHTML()
+ $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
$html = $this->onHtmlReady( $html );
wfProfileIn( __METHOD__ . '-flatten' );