Clean HTML code when inserting text from MS Word into HTML5 WYSIWYG editor (contenteditable)

Hello!

While writing my WYSIWYG editor, I ran into a problem with text pasted from Word. Actually, there are three problems:
  • Word inserts a lot of junk HTML that needs to be cleaned up.
  • For some reason, Word represents lists with paragraphs instead of UL and LI tags.
  • It is not obvious how to detect that the inserted text was pasted from Word in the first place.

To solve these problems I wrote a jQuery plugin; its full source code is available at the end of the article. Usage example:

// Enable the Word-paste filter on the element with id "editor".
$('#editor').msword_html_filter();

The plugin attaches to the keyup event and checks whether the HTML inside the editor was pasted from Word; if so, the cleanup function runs. It strips everything it can from the resulting HTML: non-breaking spaces, style and align attributes, span tags, all Mso* classes, and empty paragraphs.

Implementation details under the cut.

UPD Demo on CodePen
Most of the regular expressions used were borrowed from TinyMCE.

How to determine whether a string contains HTML pasted from Word:


// Case-insensitive test for Word markup in `content`. It matches any of:
//   - a class attribute whose value starts with "Mso" (quoted or unquoted),
//   - a double-quoted style attribute containing a property prefixed "mso-",
//   - a single-quoted style attribute containing a property prefixed "mso-",
//   - the "w:WordDocument" marker.
if (/class="?Mso|style="[^"]*\bmso-|style='[^'']*\bmso-|w:WordDocument/i.test( content )) {
    ...
}


The cleanup function (it receives the editor's jQuery object):


/**
 * Cleans up HTML pasted from MS Word inside a contenteditable editor.
 *
 * Works in three passes:
 *   1. string-level removal of comments, scripts, Office-namespaced tags and
 *      a few other tags (img, meta, link, style), plus replacement of
 *      non-breaking spaces with plain spaces;
 *   2. conversion of Word "list paragraphs" (<p> elements whose style
 *      contains mso-list) into nested <ul>/<ol> lists;
 *   3. removal of style/align attributes, <span> wrappers, class attributes
 *      that start with "Mso", and empty paragraphs.
 *
 * @param {jQuery} editor jQuery object wrapping the editor element; its
 *                        inner HTML is rewritten in place.
 */
function word_filter(editor){
    var content = editor.html();

    // Word comments like conditional comments etc
    content = content.replace(/<!--[\s\S]+?-->/gi, '');

    // Remove comments, scripts (e.g., msoShowComment), XML tag, VML content,
    // MS Office namespaced tags, and a few other tags.
    // [\s\S]*? (instead of .*?) lets a <script> body span several lines.
    content = content.replace(/<(!|script[^>]*>[\s\S]*?<\/script(?=[>\s])|\/?(\?xml(:\w+)?|img|meta|link|style|\w:\w+)(?=[\s\/>]))[^>]*>/gi, '');

    // Convert <s> into <strike> for line-though
    content = content.replace(/<(\/?)s>/gi, "<$1strike>");

    // Replace non-breaking spaces with plain spaces. Both the serialized
    // entity and the raw U+00A0 character are handled.
    content = content.replace(/&nbsp;|\u00a0/gi, ' ');

    // Convert <span style="mso-spacerun:yes">___</span> to string of alternating
    // breaking/non-breaking spaces of same length
    content = content.replace(/<span\s+style\s*=\s*"\s*mso-spacerun\s*:\s*yes\s*;?\s*"\s*>([\s\u00a0]*)<\/span>/gi, function(str, spaces) {
        return (spaces.length > 0) ? spaces.replace(/./, " ").slice(Math.floor(spaces.length/2)).split("").join("\u00a0") : '';
    });

    editor.html(content);

    // Parse out list indent level for lists. This must happen before the
    // style attributes are stripped at the end of the function.
    $('p', editor).each(function(){
        var str = $(this).attr('style') || '';
        // e.g. "mso-list:l0 level2 lfo1" -> 2. The lazy \w+? keeps every
        // trailing digit in the capture (so "level10" -> 10, not 0).
        var matches = /mso-list:\s*\w+\s+\w+?([0-9]+)/i.exec(str);
        if (matches) {
            $(this).data('_listLevel',  parseInt(matches[1], 10));
        }
    });

    // Parse Lists.
    // open_lists is a stack of {level, list} entries describing the lists
    // currently open, outermost first. Using a stack (rather than walking up
    // with parent()) keeps the target list correct even when the indent
    // level jumps by more than one step.
    var open_lists = [];
    $('p', editor).each(function(){
        var paragraph = $(this);
        var cur_level = paragraph.data('_listLevel');
        if (cur_level == null) {
            // An ordinary paragraph ends any list that is being built.
            open_lists = [];
            return;
        }

        // Text that starts with "<word>." is treated as an ordered item.
        var txt = paragraph.text();
        var list_tag = '<ul></ul>';
        if (/^\s*\w+\./.test(txt)) {
            // Only a leading, possibly multi-digit, number sets the start.
            var matches = /^\s*([0-9]+)\./.exec(txt);
            var start = matches ? parseInt(matches[1], 10) : 1;
            list_tag = start > 1 ? '<ol start="' + start + '"></ol>' : '<ol></ol>';
        }

        // Close every list that is deeper than the current item.
        while (open_lists.length && open_lists[open_lists.length - 1].level > cur_level) {
            open_lists.pop();
        }

        var target = open_lists[open_lists.length - 1];
        if (!target) {
            // No open list: start a new top-level list right before the paragraph.
            paragraph.before(list_tag);
            target = {level: cur_level, list: paragraph.prev()};
            open_lists.push(target);
        } else if (target.level < cur_level) {
            // Deeper item: open a nested list inside the current one.
            target = {level: cur_level, list: $(list_tag).appendTo(target.list)};
            open_lists.push(target);
        }

        // The first span of a Word list paragraph holds the bullet/number marker.
        $('span:first', this).remove();
        target.list.append('<li>' + paragraph.html() + '</li>');
        paragraph.remove();
    });

    $('[style]', editor).removeAttr('style');
    $('[align]', editor).removeAttr('align');
    $('span', editor).replaceWith(function() {return $(this).contents();});
    $('span:empty', editor).remove();
    $("[class^='Mso']", editor).removeAttr('class');
    $('p:empty', editor).remove();
}


The full source code of the plugin is under the spoiler; save it as jquery.msword_html_filter.js

plugin source code

/**
 * jQuery plugin that cleans up HTML pasted from MS Word into a
 * contenteditable element.
 *
 * Usage: $('#editor').msword_html_filter();
 */
(function($) {
    /**
     * Installs a keyup handler on every matched element; whenever the
     * element's HTML looks like Word markup, it is cleaned in place.
     *
     * @param {Object} [options] Accepted for API compatibility; no option is
     *                           currently read.
     * @returns {jQuery} The matched set, for chaining.
     */
    $.fn.msword_html_filter = function(options) {
        // Case-insensitive signature of Word markup: an Mso* class, a style
        // attribute with an mso- property, or the w:WordDocument marker.
        var word_markup = /class="?Mso|style="[^"]*\bmso-|style='[^'']*\bmso-|w:WordDocument/i;

        /**
         * Cleans up Word HTML inside the editor in three passes:
         *   1. string-level removal of comments, scripts, Office-namespaced
         *      tags and a few other tags (img, meta, link, style), plus
         *      replacement of non-breaking spaces with plain spaces;
         *   2. conversion of Word "list paragraphs" (<p> elements whose
         *      style contains mso-list) into nested <ul>/<ol> lists;
         *   3. removal of style/align attributes, <span> wrappers, class
         *      attributes that start with "Mso", and empty paragraphs.
         *
         * @param {jQuery} editor jQuery object wrapping the editor element;
         *                        its inner HTML is rewritten in place.
         */
        function word_filter(editor){
            var content = editor.html();

            // Word comments like conditional comments etc
            content = content.replace(/<!--[\s\S]+?-->/gi, '');

            // Remove comments, scripts (e.g., msoShowComment), XML tag, VML content,
            // MS Office namespaced tags, and a few other tags.
            // [\s\S]*? (instead of .*?) lets a <script> body span several lines.
            content = content.replace(/<(!|script[^>]*>[\s\S]*?<\/script(?=[>\s])|\/?(\?xml(:\w+)?|img|meta|link|style|\w:\w+)(?=[\s\/>]))[^>]*>/gi, '');

            // Convert <s> into <strike> for line-though
            content = content.replace(/<(\/?)s>/gi, "<$1strike>");

            // Replace non-breaking spaces with plain spaces. Both the
            // serialized entity and the raw U+00A0 character are handled.
            content = content.replace(/&nbsp;|\u00a0/gi, ' ');

            // Convert <span style="mso-spacerun:yes">___</span> to string of alternating
            // breaking/non-breaking spaces of same length
            content = content.replace(/<span\s+style\s*=\s*"\s*mso-spacerun\s*:\s*yes\s*;?\s*"\s*>([\s\u00a0]*)<\/span>/gi, function(str, spaces) {
                return (spaces.length > 0) ? spaces.replace(/./, " ").slice(Math.floor(spaces.length/2)).split("").join("\u00a0") : '';
            });

            editor.html(content);

            // Parse out list indent level for lists. This must happen before
            // the style attributes are stripped at the end of the function.
            $('p', editor).each(function(){
                var str = $(this).attr('style') || '';
                // e.g. "mso-list:l0 level2 lfo1" -> 2. The lazy \w+? keeps
                // every trailing digit in the capture ("level10" -> 10).
                var matches = /mso-list:\s*\w+\s+\w+?([0-9]+)/i.exec(str);
                if (matches) {
                    $(this).data('_listLevel',  parseInt(matches[1], 10));
                }
            });

            // Parse Lists.
            // open_lists is a stack of {level, list} entries describing the
            // lists currently open, outermost first. Using a stack (rather
            // than walking up with parent()) keeps the target list correct
            // even when the indent level jumps by more than one step.
            var open_lists = [];
            $('p', editor).each(function(){
                var paragraph = $(this);
                var cur_level = paragraph.data('_listLevel');
                if (cur_level == null) {
                    // An ordinary paragraph ends any list that is being built.
                    open_lists = [];
                    return;
                }

                // Text that starts with "<word>." is treated as an ordered item.
                var txt = paragraph.text();
                var list_tag = '<ul></ul>';
                if (/^\s*\w+\./.test(txt)) {
                    // Only a leading, possibly multi-digit, number sets the start.
                    var matches = /^\s*([0-9]+)\./.exec(txt);
                    var start = matches ? parseInt(matches[1], 10) : 1;
                    list_tag = start > 1 ? '<ol start="' + start + '"></ol>' : '<ol></ol>';
                }

                // Close every list that is deeper than the current item.
                while (open_lists.length && open_lists[open_lists.length - 1].level > cur_level) {
                    open_lists.pop();
                }

                var target = open_lists[open_lists.length - 1];
                if (!target) {
                    // No open list: start a new top-level list right before the paragraph.
                    paragraph.before(list_tag);
                    target = {level: cur_level, list: paragraph.prev()};
                    open_lists.push(target);
                } else if (target.level < cur_level) {
                    // Deeper item: open a nested list inside the current one.
                    target = {level: cur_level, list: $(list_tag).appendTo(target.list)};
                    open_lists.push(target);
                }

                // The first span of a Word list paragraph holds the bullet/number marker.
                $('span:first', this).remove();
                target.list.append('<li>' + paragraph.html() + '</li>');
                paragraph.remove();
            });

            $('[style]', editor).removeAttr('style');
            $('[align]', editor).removeAttr('align');
            $('span', editor).replaceWith(function() {return $(this).contents();});
            $('span:empty', editor).remove();
            $("[class^='Mso']", editor).removeAttr('class');
            $('p:empty', editor).remove();
        }

        return this.each(function() {
            $(this).on('keyup', function(){
                var editor = $(this);
                if (word_markup.test( editor.html() )) {
                    word_filter( editor );
                }
            });
        });
    };
})( jQuery );


The plugin has only been tested in the latest Firefox.