PHP: a standalone xss_clean function for cleaning up cross-site scripting (XSS), collated from the CodeIgniter Security class
The Security class has been adapted into plain functions so that xss_clean can be called directly from a single file. By Barking.
// Cross-site scripting (XSS) cleanup: xss_clean(), taken from CodeIgniter.
// The methods of the CodeIgniter Security class are adapted here into plain
// functions so that the whole filter lives in one file.

/**
 * Strip control characters that browsers ignore but filters trip over.
 *
 * Removes bytes 00-08, 0B, 0C, 0E-1F and 7F (everything below 0x20 except
 * tab, newline and carriage return) and, optionally, their url-encoded
 * forms. The replacement is repeated until nothing more is removed so that
 * sequences such as "%%0000" cannot re-assemble into a new match.
 *
 * @param string $str         Input string.
 * @param bool   $url_encoded Also remove url-encoded control characters.
 * @return string
 */
function remove_invisible_characters($str, $url_encoded = TRUE)
{
    $non_displayables = array();

    if ($url_encoded) {
        // Case-insensitive so that "%0B" is caught as well as "%0b".
        $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
        $non_displayables[] = '/%1[0-9a-f]/i';  // url encoded 16-31
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    do {
        $str = preg_replace($non_displayables, '', $str, -1, $count);
    } while ($count);

    return $str;
}

/**
 * preg callback: escape angle brackets and backslashes inside a quoted
 * attribute value so they cannot open or close tags later on.
 *
 * @param array $match Match array; $match[0] is the whole attribute.
 * @return string
 */
function _convert_attribute($match)
{
    return str_replace(array('>', '<', '\\'), array('&gt;', '&lt;', '\\\\'), $match[0]);
}

/**
 * preg callback: turn "&#xHH" (hex, no trailing semicolon required) into
 * the corresponding single byte.
 *
 * @param array $match Match array; $match[1] holds the hex digits.
 * @return string
 */
function _entity_hex_to_chr($match)
{
    return chr((int) hexdec($match[1]));
}

/**
 * preg callback: turn "&#DDD" (decimal, no trailing semicolon required)
 * into the corresponding single byte.
 *
 * @param array $match Match array; $match[1] holds the decimal digits.
 * @return string
 */
function _entity_dec_to_chr($match)
{
    return chr((int) $match[1]);
}

/**
 * preg callback: decode HTML entities found inside a tag.
 *
 * html_entity_decode() only handles well-formed entities, so numeric
 * entities that lack the closing semicolon are decoded by two extra
 * passes. These use preg_replace_callback() because the "/e" modifier
 * used previously no longer exists in PHP 7 and later.
 *
 * @param array $match Match array; $match[0] is the tag text.
 * @return string
 */
function _decode_entity($match)
{
    $str = $match[0];

    if (stristr($str, '&') === FALSE) {
        return $str;
    }

    $str = html_entity_decode($str, ENT_COMPAT, 'UTF-8');
    $str = preg_replace_callback('~&#x(0*[0-9a-f]{2,5})~i', '_entity_hex_to_chr', $str);

    return preg_replace_callback('~&#([0-9]{2,4})~', '_entity_dec_to_chr', $str);
}

/**
 * preg callback: remove the whitespace from an "exploded" keyword
 * (e.g. "j a v a s c r i p t") and re-append the character that followed.
 *
 * @param array $matches $matches[1] is the spaced-out word, $matches[2]
 *                       the non-word character after it.
 * @return string
 */
function _compact_exploded_words($matches)
{
    return preg_replace('/\s+/s', '', $matches[1]) . $matches[2];
}

/**
 * Keep only the quoted name="value" attributes of a tag body and strip
 * C-style comments from them.
 *
 * @param string $str Tag body with angle brackets already removed.
 * @return string
 */
function _filter_attributes($str)
{
    $out = '';

    if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches)) {
        foreach ($matches[0] as $match) {
            $out .= preg_replace('#/\*.*?\*/#s', '', $match);
        }
    }

    return $out;
}

/**
 * preg callback: drop a dangerous href from the attributes of an <a> tag.
 *
 * @param array $match $match[0] is the whole tag, $match[1] its attributes.
 * @return string
 */
function _js_link_removal($match)
{
    return str_replace(
        $match[1],
        preg_replace(
            '#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si',
            '',
            _filter_attributes(str_replace(array('<', '>'), '', $match[1]))
        ),
        $match[0]
    );
}

/**
 * preg callback: drop a dangerous src from the attributes of an <img> tag.
 *
 * @param array $match $match[0] is the whole tag, $match[1] its attributes.
 * @return string
 */
function _js_img_removal($match)
{
    return str_replace(
        $match[1],
        preg_replace(
            '#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si',
            '',
            _filter_attributes(str_replace(array('<', '>'), '', $match[1]))
        ),
        $match[0]
    );
}

/**
 * preg callback: convert a disallowed tag to entities.
 *
 * @param array $matches Capture groups of the "naughty tag" pattern used
 *                       in xss_clean().
 * @return string
 */
function _sanitize_naughty_html($matches)
{
    // Encode the opening brace.
    $str = '&lt;' . $matches[1] . $matches[2] . $matches[3];

    // Encode the captured opening or closing brace to prevent recursive vectors.
    $str .= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);

    return $str;
}

/**
 * Make sure entities are terminated with a semicolon so they can be
 * decoded reliably, while leaving "&name=value" query-string pairs alone.
 *
 * @param string $str
 * @return string
 */
function _validate_entities($str)
{
    // Temporary marker that protects "&" in GET variables (&foo=bar).
    $hash = md5((string) (time() + mt_rand(0, 1999999999)));

    $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $hash . "\\1=\\2", $str);

    // Add the missing semicolon: "&#60" / "&lt" -> "&#60;" / "&lt;".
    $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

    // Same for hexadecimal entities: "&#x3c" -> "&#x3c;".
    $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

    // Restore the protected query-string ampersands.
    return str_replace($hash, '&', $str);
}

/**
 * Remove or neutralise strings that are never allowed, whatever the context.
 *
 * @param string $str
 * @return string
 */
function _do_never_allowed($str)
{
    $never_allowed_str = array(
        'document.cookie' => '[removed]',
        'document.write'  => '[removed]',
        '.parentNode'     => '[removed]',
        '.innerHTML'      => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding'    => '[removed]',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;',
        '<![CDATA['       => '&lt;![CDATA[',
        '<comment>'       => '&lt;comment&gt;',
    );

    $never_allowed_regex = array(
        'javascript\s*:',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:',             // IE, surprise!
        'Redirect\s+302',
        '(["\'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?',
    );

    $str = str_replace(array_keys($never_allowed_str), $never_allowed_str, $str);

    foreach ($never_allowed_regex as $regex) {
        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
    }

    return $str;
}

/**
 * Remove evil attributes: every JavaScript event handler (on*), style,
 * xmlns and formaction.
 *
 * @param string $str
 * @param bool   $is_image TRUE when the data is an image file; xmlns is
 *                         then kept because Adobe Photoshop writes
 *                         namespaced XML metadata into JFIF images.
 * @return string
 */
function _remove_evil_attributes($str, $is_image)
{
    $evil_attributes = array('on\w*', 'style', 'xmlns', 'formaction');

    if ($is_image === TRUE) {
        unset($evil_attributes[array_search('xmlns', $evil_attributes)]);
    }

    $evil = implode('|', $evil_attributes);

    do {
        $count   = 0;
        $attribs = array();

        // Illegal attributes with quoted values (042 and 047 are the octal quotes).
        preg_match_all('/(' . $evil . ')\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is', $str, $matches, PREG_SET_ORDER);
        foreach ($matches as $attr) {
            $attribs[] = preg_quote($attr[0], '/');
        }

        // Illegal attributes without quotes.
        preg_match_all('/(' . $evil . ')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);
        foreach ($matches as $attr) {
            $attribs[] = preg_quote($attr[0], '/');
        }

        // Replace the illegal attribute strings that sit inside an HTML tag.
        if (count($attribs) > 0) {
            $str = preg_replace(
                '/(<?)(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(' . implode('|', $attribs) . ')(.*?)([\s><]?)([><]*)/i',
                '$1$2 $4$6$7$8',
                $str,
                -1,
                $count
            );
        }
    } while ($count);

    return $str;
}

/**
 * XSS clean. Call this function.
 * Source: http://www.cnblogs.com/osfipin/
 *
 * Sanitises a string (or, recursively, every element of an array) so that
 * it can be echoed into an HTML page: script vectors are replaced with
 * "[removed]", disallowed tags are converted to entities and event-handler
 * attributes are stripped.
 *
 * @param string|array $str      Data to clean.
 * @param bool         $is_image TRUE to test image data instead of cleaning it.
 * @return string|array|bool The cleaned data; when $is_image is TRUE, a
 *                           boolean that is TRUE if nothing had to be
 *                           changed (the image looks clean).
 */
function xss_clean($str, $is_image = FALSE)
{
    // Is the string an array? Clean every element.
    // (foreach replaces each(), which no longer exists in PHP 8.)
    if (is_array($str)) {
        foreach (array_keys($str) as $key) {
            $str[$key] = xss_clean($str[$key]);
        }

        return $str;
    }

    $str = remove_invisible_characters($str);

    // Validate entities in URLs.
    $str = _validate_entities($str);

    // URL-decode, just in case stuff like this is submitted:
    // <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
    $str = rawurldecode($str);

    // Convert character entities to ASCII so that the tests below work
    // reliably. Only entities inside tags are converted, since those are
    // the ones that pose security problems.
    $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", '_convert_attribute', $str);
    $str = preg_replace_callback('/<\w+.*?(?=>|<|$)/si', '_decode_entity', $str);

    // Remove invisible characters again!
    $str = remove_invisible_characters($str);

    // Convert all tabs to spaces. This prevents strings like "ja	vascript".
    // Spaces between characters are dealt with later. preg_replace was
    // found to be amazingly slow here on large blocks of data, hence
    // str_replace.
    if (strpos($str, "\t") !== FALSE) {
        $str = str_replace("\t", ' ', $str);
    }

    // Capture the converted string for later comparison.
    $converted_string = $str;

    // Remove strings that are never allowed.
    $str = _do_never_allowed($str);

    // Make PHP tags safe. XML tags such as <?xml are inadvertently
    // replaced too, but that does not seem to pose a problem.
    if ($is_image === TRUE) {
        // Images tend to contain the PHP short opening and closing tags
        // every so often, so only the long opening tag is handled.
        $str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
    } else {
        $str = str_replace(array('<?', '?' . '>'), array('&lt;?', '?&gt;'), $str);
    }

    // Compact any exploded words, e.g. "j a v a s c r i p t", back to
    // their correct state.
    $words = array(
        'javascript', 'expression', 'vbscript', 'script', 'base64',
        'applet', 'alert', 'document', 'write', 'cookie', 'window',
    );

    foreach ($words as $word) {
        $spaced = implode('\s*', str_split($word));

        // Only do this when the word is followed by a non-word character,
        // so that valid text like "dealer to" does not become "dealerto".
        $str = preg_replace_callback('#(' . $spaced . ')(\W)#is', '_compact_exploded_words', $str);
    }

    // Remove disallowed JavaScript in links or img tags. The cheap
    // preg_match() guards avoid running the callbacks when the tag is not
    // present at all.
    do {
        $original = $str;

        if (preg_match('/<a/i', $str)) {
            $str = preg_replace_callback('#<a\s+([^>]*?)(>|$)#si', '_js_link_removal', $str);
        }

        if (preg_match('/<img/i', $str)) {
            $str = preg_replace_callback('#<img\s+([^>]*?)(\s?/?>|$)#si', '_js_img_removal', $str);
        }

        if (preg_match('/script/i', $str) || preg_match('/xss/i', $str)) {
            $str = preg_replace('#<(/*)(script|xss)(.*?)\>#si', '[removed]', $str);
        }
    } while ($original !== $str);

    unset($original);

    // Remove evil attributes such as style, onclick and xmlns.
    $str = _remove_evil_attributes($str, $is_image);

    // Sanitize naughty HTML elements: a tag containing any of the words
    // below is converted to entities, so <blink> becomes &lt;blink&gt;.
    $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
    $str = preg_replace_callback('#<(/*\s*)(' . $naughty . ')([^><]*)([><]*)#is', '_sanitize_naughty_html', $str);

    // Sanitize naughty scripting elements: rather than removing the code,
    // the parentheses are converted to entities, which renders it
    // un-executable. eval('some code') becomes eval&#40;'some code'&#41;.
    $str = preg_replace(
        '#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
        "\\1\\2&#40;\\3&#41;",
        $str
    );

    // Final clean up: a bit of extra precaution in case something got
    // through the filters above.
    $str = _do_never_allowed($str);

    // Images are handled in a special way: if the string after character
    // conversion differs from the string after XSS removal, unwanted code
    // was found and the image fails. Strict comparison avoids PHP's
    // numeric-string juggling.
    if ($is_image === TRUE) {
        return $str === $converted_string;
    }

    return $str;
}

// Test

/**
 * Print a payload (escaped, so the demo page itself is not attacked)
 * followed by the result of xss_clean().
 *
 * @param string $r Raw payload.
 * @return void
 */
function show($r)
{
    echo htmlspecialchars($r, ENT_QUOTES, 'UTF-8');
    echo '<br/>';
    echo xss_clean($r);
    echo '<br/><br/>';
}

// Run the demo only when this file is executed directly, not when it is
// included as a library.
if (isset($_SERVER['SCRIPT_FILENAME']) && realpath($_SERVER['SCRIPT_FILENAME']) === realpath(__FILE__)) {
    show("%3cscript%3ealert('XSS')%3c/script%3e");
    show('%3c/a%3e%3cscript%3ealert(%22xss%22)%3c/script%3e');
    // NOTE: the original third payload was lost when the page was
    // extracted; an <img> vector stands in for it.
    show("<img src=\"javascript:alert('XSS')\">");
    show("<style type=\"text/css\">body{background:url(\"javascript:alert('XSS')\")}</style>");
}
You can run the test calls above to see the effect.
PHP: a standalone xss_clean function for cleaning up cross-site scripting (XSS), collated from the CodeIgniter Security class