Clean Word HTML using Regular Expressions

/**
 * Strips Microsoft Word clutter from an HTML fragment.
 *
 * Removes every FONT, SPAN, DEL and INS tag (opening and closing; the text
 * between them is kept), then removes the class, lang, style, size and face
 * attributes from all remaining tags while leaving other attributes intact.
 * Matching is case-insensitive.
 *
 * This is a regex-based, best-effort cleaner, not an HTML parser: attribute
 * values that contain ">" or text that looks like ` class=...` inside another
 * quoted attribute value may not be handled correctly.
 *
 * @param string $html HTML fragment to clean.
 * @return string The cleaned HTML. If the regex engine reports an error, the
 *                most recent successfully processed text is returned.
 */
function cleanHTML($html) {
    $html = (string) $html;

    // Start by completely removing all unwanted tags. The lookahead makes
    // sure the tag name ends here, so e.g. <insert> is not mistaken for <ins>.
    $stripped = preg_replace('#</?(?:font|span|del|ins)(?=[\s/>])[^>]*>#i', '', $html);
    if ($stripped === null) {
        // PCRE failure (e.g. backtrack limit): return the input untouched
        // rather than losing the caller's content.
        return $html;
    }
    $html = $stripped;

    // Then strip unwanted attributes. The attribute name must be preceded by
    // whitespace so that "data-class=" or "?size=3" inside a URL is left
    // alone, and an unquoted value stops at the first whitespace so the
    // attributes that follow it survive.
    $attrPattern = '#<([^>]*?)\s+(?:class|lang|style|size|face)\s*=\s*'
        . '(?:"[^"]*"|\'[^\']*\'|[^\s>]+)([^>]*)>#i';

    // Each pass removes at most one attribute per tag, so repeat until a
    // pass makes no replacement.
    do {
        $cleaned = preg_replace($attrPattern, '<$1$2>', $html, -1, $count);
        if ($cleaned === null) {
            return $html;
        }
        $html = $cleaned;
    } while ($count > 0);

    return $html;
}

http://snipplr.com/view/5217/clean-word-html-using-regular-expressions/

Leave a comment