ned Productions – RemoveBOM

by Niall Douglas. Last updated 2007-04-23.

In my recent move to Microsoft Expression Web, I came across a problem which, according to Google, has been plaguing users across the internet. If you use UTF-8 to encode your web pages in Expression Web so that they can be XHTML compliant, it inserts a hidden UTF-8 BOM (byte order mark) at the start of the file. This is very, very bad, because PHP and Perl don't understand BOMs (yet) and neither do web browsers, which then display three squiggly characters at the top of each page. In particular, these three BOM bytes cause PHP to think <body> has started and thus to refuse the use of the header() function to reset Content-Type to utf-8, thus causing your UTF-8 pages to appear as ISO-8859!

The following Python script fixes this problem. It takes your Microsoft Expression Web directory tree and copies it to another location, testing HTML files for the UTF-8 BOM and removing it if present. It knows not to copy files which are unchanged, so it is fast to run just before you upload your changes. You might also be interested in RemoveBOM v2.

No guarantees or support are given with this code. Enjoy!

# RemoveBOM
# Clones a directory structure, removing the BOM from UTF-8 files
# (C) 2007 Niall Douglas
# 23rd April 2007

import sys, os, shutil

def enumeratedir(path):
    """Map every entry beneath *path* to a ``(kind, stat)`` tuple.

    The tree is walked bottom-up with os.walk. Each key is the root joined
    with an entry name; ``kind`` is 1 for names os.walk reports as files and
    2 for names it reports as directories, and ``stat`` is the os.stat()
    result for that joined path. *path* itself is not included.
    """
    entries = {}
    for parent, subdirs, filenames in os.walk(path, topdown=False):
        # Files first, then directories, each tagged with its kind code.
        for kind, names in ((1, filenames), (2, subdirs)):
            for name in names:
                full = os.path.join(parent, name)
                entries[full] = (kind, os.stat(full))
    return entries
def replaceroot(root, path, pathroot):
    """Re-base *path* from the *pathroot* tree onto *root*.

    *path* is expected to start with *pathroot*; that prefix is removed and
    the remainder is joined onto *root*.

    The remainder is obtained by slicing off the prefix and then dropping any
    leading separators, rather than by assuming exactly one separator sits
    between *pathroot* and the rest. That keeps the mapping correct when
    *pathroot* was given with a trailing separator (``"out/"``), where the
    fixed-offset arithmetic would eat the first character of the relative
    part.
    """
    relative = path[len(pathroot):]
    # os.altsep is None on POSIX and '/' on Windows.
    separators = os.sep + (os.altsep or '')
    return os.path.join(root, relative.lstrip(separators))
def ensuredir(path):
    """Create directory *path* and any missing ancestors, announcing each.

    Returns immediately when *path* already exists. Ancestors are created
    first by recursing on os.path.dirname(path), and a 'Making directory'
    line is printed for every directory actually created.

    An empty *path* is treated as "nothing to create". os.path.dirname() of
    a bare relative name such as "out" is "", which never exists and whose
    own dirname is "" again, so without this guard the recursion would never
    terminate for a relative path whose first component is missing.
    """
    if not path or os.path.exists(path):
        return
    ensuredir(os.path.dirname(path))
    # A path with a trailing separator ("a/b/") has dirname "a/b", so the
    # recursive call above may already have created this very directory.
    if os.path.exists(path):
        return
    # Single-argument parenthesised print emits the same line whether it is
    # parsed as a Python 2 statement or a Python 3 function call.
    print('Making directory %s' % path)
    os.mkdir(path)
# The UTF-8 byte order mark as raw bytes, so that it compares equal to data
# read from a file opened in binary mode on both Python 2.6+ and Python 3.
UTF8BOM = b'\xef\xbb\xbf'


def _read_without_bom(path):
    """Return the contents of *path* with its leading UTF-8 BOM removed.

    Returns None when the file does not begin with the BOM, so the caller
    can fall back to a plain copy.
    """
    with open(path, 'rb') as f:
        if f.read(len(UTF8BOM)) != UTF8BOM:
            return None
        return f.read()


def main(argv):
    """Mirror the input tree into the output tree, stripping UTF-8 BOMs.

    argv[1] and argv[2] name the input and output directories. If either is
    missing, both fall back to "public_html" and "public_html_bomfixed".

    Steps:
      1. Remove entries from the output tree that no longer exist in the
         input tree.
      2. For every input file whose output counterpart is missing or has a
         different modification time, write it to the output tree: '.html'
         files that start with the UTF-8 BOM are written without it, and
         everything else is copied verbatim. Timestamps are carried over so
         the next run can recognise the file as unchanged.

    Raises Exception when either directory name is empty.
    """
    try:
        indir = argv[1]
        outdir = argv[2]
    except IndexError:
        # Too few arguments: use the defaults for both directories.
        indir = "public_html"
        outdir = "public_html_bomfixed"
    if not indir or not outdir:
        raise Exception("Missing input or output dirs")

    indircontents = enumeratedir(indir)
    outdircontents = enumeratedir(outdir)

    # Reverse-sorted order visits every path before any of its ancestors
    # (a child path is its parent plus a suffix, so it sorts after it).
    # A stale directory is therefore already empty when os.rmdir reaches it;
    # plain dict order gives no such guarantee.
    for path in sorted(outdircontents, reverse=True):
        ipath = replaceroot(indir, path, outdir)
        if ipath not in indircontents:
            print('Deleting %s' % path)
            if os.path.isfile(path) or os.path.islink(path):
                os.remove(path)
            elif os.path.isdir(path):
                os.rmdir(path)

    for path in sorted(indircontents):
        kind, srcstat = indircontents[path]
        if kind != 1:
            # Directories are created on demand when a file is written.
            continue
        f2path = replaceroot(outdir, path, indir)
        # Only the modification time decides whether a file is unchanged.
        # Access times move whenever either copy is merely read (previewed,
        # uploaded, backed up), which would force pointless re-copies.
        # The 2 second tolerance absorbs coarse filesystem timestamps.
        if f2path in outdircontents and \
                abs(srcstat.st_mtime - outdircontents[f2path][1].st_mtime) < 2:
            continue

        stripped = None
        if path[-5:] == '.html':
            stripped = _read_without_bom(path)

        if stripped is not None:
            print('File %s changed, removing UTF-8 BOM to %s' % (path, f2path))
            ensuredir(os.path.dirname(f2path))
            with open(f2path, 'wb') as f2:
                f2.write(stripped)
            # copystat carries over permission bits as well as timestamps.
            shutil.copystat(path, f2path)
        else:
            print('File %s changed, copying to %s' % (path, f2path))
            ensuredir(os.path.dirname(f2path))
            shutil.copy2(path, f2path)

    print("All up to date!")


if __name__ == '__main__':
    main(sys.argv)

Contact the webmaster: Niall Douglas @ webmaster2<at symbol>nedprod.com (Last updated: 2007-04-23 00:00:00 +0000 UTC)