/**
* Removes content we've chosen to remove. The text of the removed elements can be
* extracted with the getText method.
- * @return array of removed DOMElements
+ * @return array Array of removed DOMElements
*/
public function filterContent() {
- wfProfileIn( __METHOD__ );
$removals = $this->parseItemsToRemove();
// Bail out early if nothing to do
},
true
) ) {
- wfProfileOut( __METHOD__ );
return array();
}
// CSS Classes
$domElemsToRemove = array();
- $xpath = new DOMXpath( $doc );
+ $xpath = new DOMXPath( $doc );
foreach ( $removals['CLASS'] as $classToRemove ) {
$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
$removed = array_merge( $removed, $this->removeElements( $elements ) );
}
- wfProfileOut( __METHOD__ );
return $removed;
}
/**
* Removes a list of elements from DOMDocument
* @param array|DOMNodeList $elements
- * @return array of removed elements
+ * @return array Array of removed elements
*/
private function removeElements( $elements ) {
$list = $elements;
* @return string
*/
private function fixLibXML( $html ) {
- wfProfileIn( __METHOD__ );
static $replacements;
- if ( ! $replacements ) {
+ if ( !$replacements ) {
// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
$replacements = new ReplacementArray( array(
}
$html = $replacements->replace( $html );
$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
- wfProfileOut( __METHOD__ );
return $html;
}
* @return string Processed HTML
*/
public function getText( $element = null ) {
- wfProfileIn( __METHOD__ );
if ( $this->doc ) {
- wfProfileIn( __METHOD__ . '-dom' );
if ( $element !== null && !( $element instanceof DOMElement ) ) {
$element = $this->doc->getElementById( $element );
}
$body->appendChild( $element );
}
$html = $this->doc->saveHTML();
- wfProfileOut( __METHOD__ . '-dom' );
- wfProfileIn( __METHOD__ . '-fixes' );
$html = $this->fixLibXml( $html );
if ( wfIsWindows() ) {
// Cleanup for CRLF misprocessing of unknown origin on Windows.
// If possible, track this down in the XML code paths and fix it there.
$html = str_replace( ' ', '', $html );
}
- wfProfileOut( __METHOD__ . '-fixes' );
} else {
$html = $this->html;
}
$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
$html = $this->onHtmlReady( $html );
- wfProfileIn( __METHOD__ . '-flatten' );
if ( $this->elementsToFlatten ) {
$elements = implode( '|', $this->elementsToFlatten );
$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
}
- wfProfileOut( __METHOD__ . '-flatten' );
- wfProfileOut( __METHOD__ );
return $html;
}
* @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
* @param string $rawName The raw name of the selector
* @return bool Whether the selector was successfully recognised
+ * @throws MWException
*/
protected function parseSelector( $selector, &$type, &$rawName ) {
if ( strpos( $selector, '.' ) === 0 ) {
* @return array
*/
protected function parseItemsToRemove() {
- wfProfileIn( __METHOD__ );
$removals = array(
'ID' => array(),
'TAG' => array(),
$removals['TAG'][] = 'video';
}
- wfProfileOut( __METHOD__ );
return $removals;
}
}