Removing all the JS code from an HTML file using cheerio.js - Pastebin.com

1 min read Original article ↗

hersonHN

Sep 7th, 2012

421

0

Never

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

  1. (function () {

  2. "use strict";

  3. // Load cheerio.js and fs to test against a sample file

  4. var fs = require("fs"),

  5.         cheerio = require('cheerio'),

  6.         file = 'tags.html';

  /**
   * Strip inline JavaScript from an HTML document.
   *
   * Three things are removed:
   *   1. every <script> element,
   *   2. every attribute whose name starts with "on" (onclick, onload, ...),
   *   3. URL-carrying attributes (href, src, action, ...) whose value is a
   *      "javascript:" URL.
   *
   * NOTE(review): this is a best-effort cleaner, not a complete HTML
   * sanitizer -- other script vectors (e.g. iframe srcdoc, CSS-based tricks)
   * are not handled here.
   *
   * @param {*} rawHTML - Markup handed unchanged to `cheerio.load`.
   * @returns {string} The serialized document as produced by `$.html()`.
   */
  function clearMarkup(rawHTML) {
    var $ = cheerio.load(rawHTML),
      // Attributes whose value is interpreted as a URL and can therefore
      // execute script through the "javascript:" scheme.
      urlAttributes = [
        "href", "src", "action", "formaction",
        "xlink:href", "data", "poster", "background"
      ],
      scriptScheme = /^javascript:/i,
      // Spaces and control characters; they are stripped before the scheme
      // check so values such as " java\tscript:..." are still recognized.
      ignorable = /[\u0000-\u0020]+/g;

    // First, remove all the <script> tags.
    $("script").remove();

    // Then remove the JavaScript-bearing attributes from every element.
    $("*").each(function () {
      var tag = $(this),
        attribs = tag.get(0).attribs || {};

      // Iterate over a snapshot of the names so that removing an attribute
      // never interferes with the iteration itself.
      Object.keys(attribs).forEach(function (attribute) {
        var name = attribute.toLowerCase(),
          isHandler = name.slice(0, 2) === "on",
          isScriptUrl = urlAttributes.indexOf(name) !== -1 &&
            scriptScheme.test(String(attribs[attribute]).replace(ignorable, ""));

        if (isHandler || isScriptUrl) {
          tag.removeAttr(attribute);
        }
      });
    });

    return $.html();
  }

  // The test: read the sample file, strip its JavaScript and print the result.
  fs.readFile(file, function (error, data) {
    if (error) {
      // A failed read leaves `data` undefined; report the problem and mark
      // the process as failed instead of passing that on to clearMarkup.
      console.error("Could not read " + file + ": " + error.message);
      process.exitCode = 1;
      return;
    }

    console.log(clearMarkup(data));
  });

  32. }());

  33. /* content of "tags.html" */

  34. /*

  35.     <!doctype html>

  36.     <html>

  37.     <head>

  38.         <title>test</title>

  39.     </head>

  40.     <body onload="alert('evil xss')">

  41.         <ul>

  42.             <li onmouseover="alert('evil XSS')">a</li>

  43.             <li>b</li>

  44.             <li>c</li>

  45.         </ul>

  46.         <script>alert("xss")</script>

  47.     </body>

  48.     </html>

  49. */

  50. /* console output */

  51. /*

  52.     <!doctype html>

  53.     <html>

  54.     <head>

  55.         <title>test</title>

  56.     </head>

  57.     <body>

  58.         <ul>

  59.             <li>a</li>

  60.             <li>b</li>

  61.             <li>c</li>

  62.         </ul>

  63.     </body>

  64.     </html>

  65. */