6 * @author Roman Ivanov <thingol@mail.ru>
7 * @copyright 2004-2005 Roman Ivanov
8 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
10 * @link http://pixel-apes.com/safehtml/
13 if (!defined('_ECRIRE_INC_VERSION')) return;
15 require_once(XML_HTMLSAX3
. 'HTMLSax3.php');
21 var $_counter = array();
23 var $_stack = array();
25 var $_dcCounter = array();
27 var $_dcStack = array();
31 var $_liStack = array();
33 var $_protoRegexps = array();
35 var $_cssRegexps = array();
37 var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
39 var $deleteTags = array(
40 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
41 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
42 'iframe', 'layer', 'link', 'meta', 'object', 'style',
46 var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
48 var $protocolFiltering = 'white';
50 var $blackProtocols = array(
51 'about', 'chrome', 'data', 'disk', 'hcp',
52 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
53 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
54 'res', 'resource', 'shell', 'vbscript', 'view-source',
55 'vnd.ms.radio', 'wysiwyg',
58 var $whiteProtocols = array(
59 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
60 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
64 var $protocolAttributes = array(
65 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
68 var $cssKeywords = array(
69 'absolute', 'behavior', 'behaviour', 'content', 'expression',
70 'fixed', 'include-source', 'moz-binding',
73 var $noClose = array();
75 var $closeParagraph = array(
76 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
77 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
78 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
79 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
83 var $tableTags = array(
84 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
88 var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
90 var $attributes = array('dynsrc', 'id', 'name', );
92 var $attributesNS = array('xml:lang', );
96 // Build regular expressions from the protocol and CSS keyword arrays
97 foreach ($this->blackProtocols
as $proto) {
98 $preg = "/[\s\x01-\x1F]*";
99 for ($i=0; $i<strlen($proto); $i++
) {
100 $preg .= $proto{$i} . "[\s\x01-\x1F]*";
103 $this->_protoRegexps
[] = $preg;
106 foreach ($this->cssKeywords
as $css) {
107 $this->_cssRegexps
[] = '/' . $css . '/i';
112 function _writeAttrs ($attrs,$tag=null)
114 if (is_array($attrs)) {
115 foreach ($attrs as $name => $value) {
117 $name = strtolower($name);
119 if (strpos($name, 'on') === 0) {
122 if (strpos($name, 'data') === 0) {
125 if ($tag !='a' AND in_array($name, $this->attributes
)) {
128 if (!preg_match("/^[a-z0-9-]+$/i", $name)) {
129 if (!in_array($name, $this->attributesNS
))
135 if (($value === TRUE) ||
(is_null($value))) {
139 if ($name == 'style') {
141 // removes insignificant backslashes
142 $value = str_replace("\\", '', $value);
144 // removes CSS comments
147 $_value = preg_replace("!/\*.*?\*/!s", '', $value);
148 if ($_value == $value) break;
152 // replace all & to &amp;
153 $value = str_replace('&', '&', $value);
154 $value = str_replace('&', '&', $value);
156 foreach ($this->_cssRegexps
as $css) {
157 if (preg_match($css, $value)) {
161 foreach ($this->_protoRegexps
as $proto) {
162 if (preg_match($proto, $value)) {
168 $tempval = preg_replace_callback('/&#(\d+);?/m', create_function('$m', 'return chr($m[1]);'), $value);
169 $tempval = preg_replace_callback('/&#x([0-9a-f]+);?/mi', create_function('$m', 'return chr(hexdec($m[1]));'), $tempval);
171 if ((in_array($name, $this->protocolAttributes
)) &&
172 (strpos($tempval, ':') !== false))
174 if ($this->protocolFiltering
== 'black') {
175 foreach ($this->_protoRegexps
as $proto) {
176 if (preg_match($proto, $tempval)) continue 2;
179 $_tempval = explode(':', $tempval);
180 $proto = $_tempval[0];
181 if (!in_array($proto, $this->whiteProtocols
)) {
187 $value = str_replace("\"", """, $value);
188 $this->_xhtml
.= ' ' . $name . '="' . $value . '"';
194 function _openHandler(&$parser, $name, $attrs)
196 $name = strtolower($name);
198 if (in_array($name, $this->deleteTagsContent
)) {
199 array_push($this->_dcStack
, $name);
200 $this->_dcCounter
[$name] = isset($this->_dcCounter
[$name]) ?
$this->_dcCounter
[$name]+
1 : 1;
202 if (count($this->_dcStack
) != 0) {
206 if (in_array($name, $this->deleteTags
)) {
210 if (!preg_match("/^[a-z0-9]+$/i", $name)) {
211 if (preg_match("!(?:\@|://)!i", $name)) {
212 $this->_xhtml
.= '<' . $name . '>';
217 if (in_array($name, $this->singleTags
)) {
218 $this->_xhtml
.= '<' . $name;
219 $this->_writeAttrs($attrs);
220 $this->_xhtml
.= ' />';
224 // TABLES: cannot open table elements when we are not inside table
225 if ((isset($this->_counter
['table'])) && ($this->_counter
['table'] <= 0)
226 && (in_array($name, $this->tableTags
)))
231 // PARAGRAPHS: close the open paragraph when a closeParagraph tag opens
232 if ((in_array($name, $this->closeParagraph
)) && (in_array('p', $this->_stack
))) {
233 $this->_closeHandler($parser, 'p');
236 // LISTS: close the open <li> when another <li> at the same list level opens
237 if ($name == 'li' && count($this->_liStack
) &&
238 $this->_listScope
== $this->_liStack
[count($this->_liStack
)-1])
240 $this->_closeHandler($parser, 'li');
243 // LISTS: we want to know on what nesting level of lists we are
244 if (in_array($name, $this->listTags
)) {
248 array_push($this->_liStack
, $this->_listScope
);
251 $this->_xhtml
.= '<' . $name;
252 $this->_writeAttrs($attrs,$name);
253 $this->_xhtml
.= '>';
254 array_push($this->_stack
,$name);
255 $this->_counter
[$name] = isset($this->_counter
[$name]) ?
$this->_counter
[$name]+
1 : 1;
259 function _closeHandler(&$parser, $name)
262 $name = strtolower($name);
264 if (isset($this->_dcCounter
[$name]) && ($this->_dcCounter
[$name] > 0) &&
265 (in_array($name, $this->deleteTagsContent
)))
267 while ($name != ($tag = array_pop($this->_dcStack
))) {
268 $this->_dcCounter
[$tag]--;
271 $this->_dcCounter
[$name]--;
274 if (count($this->_dcStack
) != 0) {
278 if ((isset($this->_counter
[$name])) && ($this->_counter
[$name] > 0)) {
279 while ($name != ($tag = array_pop($this->_stack
))) {
280 $this->_closeTag($tag);
283 $this->_closeTag($name);
288 function _closeTag($tag)
290 if (!in_array($tag, $this->noClose
)) {
291 $this->_xhtml
.= '</' . $tag . '>';
294 $this->_counter
[$tag]--;
296 if (in_array($tag, $this->listTags
)) {
301 array_pop($this->_liStack
);
306 function _dataHandler(&$parser, $data)
308 if (count($this->_dcStack
) == 0) {
309 $this->_xhtml
.= $data;
314 function _escapeHandler(&$parser, $data)
321 while ($tag = array_pop($this->_stack
)) {
322 $this->_closeTag($tag);
325 return $this->_xhtml
;
337 // Save all '<' symbols
338 $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
340 // Web documents shouldn't contain the \x00 symbol
341 $doc = str_replace("\x00", '', $doc);
343 // Opera6 bug workaround
344 $doc = str_replace("\xC0\xBC", '<', $doc);
346 // Decode ASCII characters hidden inside UTF-7 shifted sequences
347 $doc = $this->repackUTF7($doc);
349 // Instantiate the parser
350 $parser= new XML_HTMLSax3();
353 $parser->set_object($this);
355 $parser->set_element_handler('_openHandler','_closeHandler');
356 $parser->set_data_handler('_dataHandler');
357 $parser->set_escape_handler('_escapeHandler');
359 $parser->parse($doc);
361 return $this->getXHTML();
/**
 * Rewrites every "+<base64>-" sequence found in the given string.
 *
 * Each run of base64 characters enclosed between "+" and "-" is handed to
 * repackUTF7Callback(), and the match is replaced by whatever that callback
 * returns.
 *
 * @param string $str Text to scan
 * @return string Text with every matched sequence replaced
 */
function repackUTF7($str)
{
    $shiftedSequence = '!\+([0-9a-zA-Z/]+)\-!';
    $handler = array($this, 'repackUTF7Callback');

    return preg_replace_callback($shiftedSequence, $handler, $str);
}
/**
 * Callback for repackUTF7(): processes one "+<base64>-" match.
 *
 * The base64 payload (match group 1) is decoded. A leading run of
 * "\x00 + byte" pairs is kept as-is, while the following pairs whose first
 * byte is not \x00 are passed to repackUTF7Back() to be wrapped again.
 * Finally every "\x00 + byte" pair is collapsed to the byte alone.
 * NOTE(review): the two-byte pairs are presumably UTF-16 code units, as used
 * by UTF-7 -- confirm.
 *
 * @param array $str Match array supplied by preg_replace_callback()
 * @return string Replacement text for the matched sequence
 */
function repackUTF7Callback($str)
{
    $decoded = base64_decode($str[1]);

    // Re-wrap the part that does not start with a \x00-prefixed pair.
    $repacked = preg_replace_callback(
        '/^((?:\x00.)*)((?:[^\x00].)+)/',
        array($this, 'repackUTF7Back'),
        $decoded
    );

    // Drop the \x00 byte in front of each remaining pair.
    return preg_replace('/\x00(.)/', '$1', $repacked);
}
/**
 * Callback for repackUTF7Callback(): wraps a byte run back into "+<base64>-".
 *
 * Match group 1 is emitted unchanged; match group 2 is base64-encoded with
 * the trailing "=" padding stripped and enclosed between "+" and "-".
 *
 * @param array $str Match array supplied by preg_replace_callback()
 * @return string Group 1 followed by the re-encoded group 2
 */
function repackUTF7Back($str)
{
    $prefix = $str[1];
    $payload = rtrim(base64_encode($str[2]), '=');

    return $prefix . '+' . $payload . '-';
}