/* @@@LICENSE
 *
 * Copyright (c) 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * LICENSE@@@ */

/*
 * This is a naive, non-conformant HTML parser/sanitizer.
 * Any "unusual" (even valid) HTML will probably get dropped on the floor,
 * dumped in the wrong div, or in the worst case the entire document might get
 * written out as an unreadable mess of escaped content.
 *
 * The results may not be pretty but at least it should be somewhat secure
 * against Javascript injection attacks.
 *
 * Given the limitations, this should eventually be replaced with a more
 * mature/compliant sanitizer library for maximum compatibility.
 */
var SimpleHtmlParser = (function () {
    var hasOwn = Object.prototype.hasOwnProperty;

    // Allowed tags. A value of `true` allows the tag with only the generic
    // attributes; an object additionally allows the listed attributes.
    // An attribute rule of "uri" means the value is treated as a URI and is
    // checked by isSafeUri() in defaultTagHandler.
    var tagWhitelist = {
        'a': {name: true, href: "uri"},
        'abbr': true,
        'acronym': true,
        'address': true,
        'b': true,
        'big': true,
        'blockquote': {cite: "uri"},
        'br': {clear: true},
        'caption': true,
        'center': true,
        'cite': true,
        'code': true,
        'col': {span: true},
        'colgroup': {span: true},
        'dd': true,
        'del': true,
        'dfn': true,
        'div': true,
        'dl': true,
        'dt': true,
        'em': true,
        'font': {face: true, size: true, color: true},
        'h1': true,
        'h2': true,
        'h3': true,
        'h4': true,
        'h5': true,
        'h6': true,
        'hr': {noshade: true, size: true},
        'i': true,
        'img': {src: "uri", border: true},
        'ins': true,
        'label': {'for': true},
        'li': true,
        'ol': true,
        'p': true,
        'pre': true,
        'q': true,
        's': true,
        'samp': true,
        'small': true,
        'span': true,
        'strike': true,
        'strong': true,
        'sub': true,
        'sup': true,
        'table': {summary: true, border: true, frame: true, rules: true, cellspacing: true, cellpadding: true},
        'tbody': true,
        'td': {headers: true, scope: true, abbr: true, rowspan: true, colspan: true},
        'tfoot': true,
        'th': {headers: true, scope: true, abbr: true, rowspan: true, colspan: true},
        'thead': true,
        'tr': true,
        'tt': true,
        'u': true,
        'ul': true
    };

    // Generic attributes that we'll always allow by default
    var attributeWhitelist = {
        'class': true,
        'style': true,
        'alt': true,
        'align': true,
        // Holds a URL, so it gets the same scheme check as href/src.
        'background': "uri",
        'bgcolor': true,
        'dir': true,
        'height': true,
        'lang': true,
        'title': true,
        'valign': true,
        'width': true
    };

    // These tags will be completely wiped, including any child nodes/text
    // NOTE: Technically, these should only be scanned for the respective
    // end tags and not other stuff, but we'll just live with the mess.
    var tagBlacklist = {
        'script': true,
        'style': true,
        'head': true,
        'title': true
    };

    // Look up `key` as an OWN property only. Plain `table[key]` would also
    // find inherited members such as `constructor`, which would let a tag or
    // attribute with that name slip through the whitelists.
    function lookupOwn (table, key) {
        return (table && hasOwn.call(table, key)) ? table[key] : undefined;
    }

    // Escape anything that could be interpreted as tags.
    // '&' is deliberately left alone: the input is already HTML, so existing
    // entities must not be double-escaped.
    function escapeContent (text) {
        return String(text).replace(/</g, '&lt;');
    }

    // Return attribute in double-quoted form
    function quoteAttribute (text) {
        return '"' + escapeContent(text).replace(/"/g, '&quot;') + '"';
    }

    // Schemes that can execute script when used in a link/resource attribute.
    var scriptSchemeRe = /^(?:javascript|vbscript|livescript|mocha)$/;

    // Decide whether a URI attribute value may be emitted.
    //
    // The raw attribute text is normalised the way a browser would before it
    // picks the scheme: numeric character references (and the few named ones
    // that matter here) are decoded, whitespace/control characters are
    // removed, and the result is lower-cased. Values without a scheme are
    // treated as relative references and allowed. Script schemes are
    // rejected, and `data:` is allowed only for images.
    //
    // NOTE: this is a deny-list, in keeping with the "naive" nature of this
    // sanitizer; unknown schemes (cid:, mailto:, ...) are passed through.
    function isSafeUri (value) {
        var text = String(value)
            .replace(/&#(?:x([0-9a-f]+)|([0-9]+));?/gi, function (whole, hex, dec) {
                var code = hex ? parseInt(hex, 16) : parseInt(dec, 10);
                return code <= 0xFFFF ? String.fromCharCode(code) : '';
            })
            .replace(/&colon;/gi, ':')
            .replace(/&(?:tab|newline);/gi, '')
            .replace(/[\u0000-\u0020\u007f-\u009f]+/g, '')
            .toLowerCase();

        var schemeMatch = /^([a-z][a-z0-9+.\-]*):/.exec(text);
        if (!schemeMatch) {
            return true;
        }
        if (scriptSchemeRe.test(schemeMatch[1])) {
            return false;
        }
        if (schemeMatch[1] === 'data') {
            return /^data:image\//.test(text);
        }
        return true;
    }

    // Default tag policy used by parseHtml.
    //
    // tag: {tagName, attributes, selfClosing} as produced by parseHtml
    //      (tag and attribute names are already lower-cased).
    //
    // Returns:
    //   - a new tag object holding only the whitelisted attributes, when the
    //     tag is whitelisted;
    //   - null when the tag is blacklisted or is an unknown self-closing tag
    //     (parseHtml then strips it);
    //   - {tagName: 'div'} for any other unknown tag, so that its content is
    //     kept inside a neutral container.
    function defaultTagHandler (tag) {
        var tagInfo = lookupOwn(tagWhitelist, tag.tagName);

        if (tagInfo) {
            var newTag = {
                tagName: tag.tagName,
                attributes: {},
                selfClosing: tag.selfClosing
            };

            for (var attrName in tag.attributes) {
                if (!hasOwn.call(tag.attributes, attrName)) {
                    continue;
                }

                var rule = lookupOwn(tagInfo, attrName) || lookupOwn(attributeWhitelist, attrName);
                if (!rule) {
                    continue;
                }

                var value = tag.attributes[attrName];
                if (rule === "uri" && !isSafeUri(value)) {
                    continue;
                }

                newTag.attributes[attrName] = value;
            }

            return newTag;
        }

        if (lookupOwn(tagBlacklist, tag.tagName) || tag.selfClosing) {
            return null;
        }

        return {tagName: 'div'};
    }

    // Split the attribute portion of an open tag into a {name: value} map.
    // Names are lower-cased; attributes without a value map to "".
    function parseAttributes (attributesStr) {
        // match against:
        // attribute name (match 1)
        // double-quoted value (optional, match 2) or
        // single-quoted value (optional, match 3) or
        // unquoted value (optional, match 4)
        var attributeRe = /\s+([0-9a-zA-Z\-\:]+)\s*(?:=\s*(?:"([^"]*?)"|'([^']*?)'|(\S*)))?/g;
        var attributes = {};
        var attrMatch;

        if (!attributesStr) {
            return attributes;
        }

        while ((attrMatch = attributeRe.exec(attributesStr)) !== null) {
            attributes[attrMatch[1].toLowerCase()] = attrMatch[2] || attrMatch[3] || attrMatch[4] || "";
        }

        return attributes;
    }

    // Serialize a (handler-approved) tag object as an open or self-closing tag.
    function renderOpenTag (tag) {
        var html = '<' + tag.tagName;
        var attributes = tag.attributes || {};

        for (var attrName in attributes) {
            if (attrName && hasOwn.call(attributes, attrName)) {
                html += ' ' + attrName;

                var attrValue = attributes[attrName];
                if (attrValue) {
                    html += "=" + quoteAttribute(attrValue);
                }
            }
        }

        return html + (tag.selfClosing ? '/>' : '>');
    }

    // Parse HTML text. A tag handler can be passed in to remove or modify
    // tags. If a falsy value is returned, the tag and all child elements will
    // be stripped.
    //
    // rawText:    HTML source; null/undefined is treated as empty.
    // tagHandler: optional function(tag) -> tag | falsy; defaults to
    //             defaultTagHandler.
    //
    // Returns the sanitized HTML string. Comments and doctypes are dropped,
    // and text between tags has '<' escaped.
    function parseHtml (rawText, tagHandler) {
        tagHandler = tagHandler || defaultTagHandler;
        rawText = (rawText === null || rawText === undefined) ? "" : String(rawText);

        // match against:
        // comments
        // doctypes
        // close tags (1 = tag name)
        // open or self-closing tags (2 = tag name, 3 = attributes, 4 = close tag)
        var tagRe = /<\!--[\s\S]*?-->|<\![\s\S]*?>|<\/([0-9a-zA-Z\-\:]+)\s*>|<([0-9a-zA-Z\-\:]+)(\s+[^>]*?)?\s*(\/)?\s*>/g;

        // output text
        var output = "";
        var prevIndex = 0;

        // Open tags seen so far. Each entry records the source tag name (for
        // matching close tags), the name actually written to the output (the
        // handler may have rewritten it, e.g. to div), and whether content
        // inside it is being stripped.
        var stack = [];
        var skipAll = false;
        var match;

        while ((match = tagRe.exec(rawText)) !== null) {
            // write any preceding text
            if (!skipAll) {
                output += escapeContent(rawText.slice(prevIndex, match.index));
            }

            if (match[1]) {
                var closeTagName = match[1].toLowerCase();

                // Find the innermost open tag with this name. A close tag
                // with no matching open tag is ignored rather than allowed to
                // unwind the whole stack.
                var openIndex = -1;
                for (var i = stack.length - 1; i >= 0; i--) {
                    if (stack[i].tagName === closeTagName) {
                        openIndex = i;
                        break;
                    }
                }

                if (openIndex >= 0) {
                    var closing = stack[openIndex];

                    // Drop the matched tag and anything left unclosed inside it.
                    stack.length = openIndex;

                    // write close tag (only if the open tag was written)
                    if (!closing.skip && closing.outputName) {
                        output += '</' + closing.outputName + '>';
                    }
                }

                // Update the skip state for the current tag
                skipAll = stack.length > 0 ? stack[stack.length - 1].skip : false;
            } else if (match[2]) {
                // open or self-closing tag
                var tagName = match[2].toLowerCase();
                var selfClose = !!match[4];
                var outputName;

                if (!skipAll) {
                    // Allow callback to manipulate tag.
                    // Tag can be rewritten, e.g. to div if unknown
                    var tag = tagHandler({
                        tagName: tagName,
                        attributes: parseAttributes(match[3]),
                        selfClosing: selfClose
                    });

                    if (tag && tag.tagName) {
                        output += renderOpenTag(tag);
                        outputName = tag.tagName;
                    }

                    // Skip contents entirely, but only when the tag can have
                    // contents: a stripped self-closing tag is never pushed
                    // on the stack, so nothing would ever turn skipping off.
                    if (!tag && !selfClose) {
                        skipAll = true;
                    }
                }

                // Add original tag to stack for tracking
                if (!selfClose) {
                    stack.push({tagName: tagName, outputName: outputName, skip: skipAll});
                }
            }

            // record index of the end of the tag
            prevIndex = tagRe.lastIndex;
        }

        // escape any text at the end
        if (!skipAll) {
            output += escapeContent(rawText.slice(prevIndex));
        }

        return output;
    }

    return {
        parseHtml: parseHtml,
        tagWhitelist: tagWhitelist,
        tagBlacklist: tagBlacklist,
        attributeWhitelist: attributeWhitelist,
        defaultTagHandler: defaultTagHandler
    };
})();