Publish.py


Publish is a script that spiders a live site and downloads it into static pages with relative links (so the resulting files are "portable").

Example usage to publish / archive / staticify a Django project:

from publish import Spider
import settings
import urllib2, os
 
 
def publish ():
    # Django settings
    # MEDIA_URL = 'http://sample.net/media/'
    # MEDIA_ROOT = '/var/www/vhosts/sample.net/httpdocs/media/'
 
    # Create the spider with a "pub" path: where to save the output pages
    # NB: this folder should be somewhere in or next to MEDIA_ROOT
    s = Spider(os.path.join(settings.MEDIA_ROOT, "pages/project"))
 
    # Tell the spider what URLs are local (already static) files
    # refs to these files will not be downloaded, links will just get relativized to them
    s.urls_local(settings.MEDIA_URL, settings.MEDIA_ROOT)
 
    # Give patterns that match URLs to be DOWNLOADED
    s.urls_download("^http://sample.net/project/")
    s.urls_download("^http://sample.net/aa/")
    s.urls_exclude("") # exclude the rest
 
    # Optionally, tell it to strip certain URLs to simplify the local filenames
    s.url_strip_startswith("http://sample.net/project/")
 
    # Finally, give one (or more) URLs to start spidering
    s.add_url("http://sample.net/project/")
 
    # Optionally you can map specific URLs to a specific pathname (relative to pub)
    s.url_setpath("http://sample.net/project/", "index.html")
    # GO!
    s.spider()
 
if __name__ == "__main__":
    publish()
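
With the settings above, the spider would write something like the following under pages/project (illustrative only; "about" is a hypothetical page, and the actual filenames depend on the content types the server reports):

    index.html                  # http://sample.net/project/ via url_setpath
    about.html                  # http://sample.net/project/about/ after url_strip_startswith
    hosts/
        sample.net/
            aa/...              # downloaded URLs that were not stripped

Since all links get rewritten to relative paths, the result can be browsed straight from disk or copied to any static web server.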

Code

#!/usr/bin/env python
#-*- coding:utf-8 -*-
 
# portable archive script
#
 
"""
Tools to assist in translating a live / dynamic site into a "portable" (active)Archive.
 
NOTES:
* It's important that the server sends the right mimetype, especially for JSON, which needs to be application/json...
  (eventually could/should add special code to do further auto-detection on text/html documents)
 
"""
 
import sys, os, urllib2, re, urlparse, urllib
from urllib2 import HTTPError, URLError
import html5lib, lxml
from html5lib import treebuilders
from lxml import etree
from lxml.cssselect import CSSSelector
import codecs
 
DEBUG = True
 
CT_EXTENSIONS = {
    'text/html': 'html',
    'image/jpeg': 'jpg',
    'image/gif': 'gif',
    'image/png': 'png',
    'text/css': 'css',
    'application/javascript': 'js',
    'text/javascript': 'js',
    'application/x-javascript': 'js',
    'application/json': 'json',
    'application/xml': 'xml',
    # 'text/json': 'json',
}
 
def extensionForContentType (ct):
    """ map a content-type to a filename extension; unknown types get ".data" """
    return CT_EXTENSIONS.get(ct, "data")
 
# KWF: ('/home/murtaugh/public_html/kwf/htdocs/', 'http://192.168.2.10/~murtaugh/kwf/htdocs/')
 
def relativizePath (frompath, topath):
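    """
    Return a path that reaches topath relative to the file at frompath.
    e.g. relativizePath("/a/b/index.html", "/a/c/style.css") -> "../c/style.css"
    """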
    fpath = frompath.split("/")[:-1]
    tpath = topath.split("/")
 
    # strip common elements from start
    i = 0
    for (a, b) in zip(fpath, tpath):
        if a != b: break
        i += 1
    fpath = fpath[i:]
    tpath = tpath[i:]
 
    # froms become ".."
    p = [".." for x in fpath]
    # & append to's
    p.extend(tpath)
    ret = "/".join(p)
 
    return ret
 
def relativizeURL (fromURL, toURL):
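    """
    Relativize toURL against fromURL, preserving query and fragment;
    URLs on a different scheme or host are returned unchanged.
    e.g. relativizeURL("http://sample.net/a/index.html", "http://sample.net/b/page.html") -> "../b/page.html"
    """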
    f = urlparse.urlsplit(fromURL)
    t = urlparse.urlsplit(toURL)
    if (f.scheme != t.scheme) or (f.netloc != t.netloc):
        return toURL
    relpath = relativizePath(f.path, t.path)
    # urlparse.urlunsplit((t.scheme, t.netloc, relpath, t.query, t.fragment))
    if relpath == "":
        ret = "./"
    else:
        ret = relpath
    if t.query:
        ret += "?" + t.query
    if t.fragment:
        ret += "#" + t.fragment
    return ret
 
def urlToPath (url, base="/"):
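    """
    Map a URL to a filesystem path under base, keyed by host.
    e.g. urlToPath("http://sample.net/a/b/", base="/tmp") -> "/tmp/sample.net/a/b"
    """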
    url = url.rstrip("/")
    parts = urlparse.urlparse(url)
    fullpath = os.path.join(base, parts.netloc, parts.path.lstrip('/'))
    if parts.query: fullpath = os.path.join(fullpath, "?"+parts.query)
    return fullpath
 
protocol_pat = re.compile(r"^(?P<protocol>\w+)://")
 
def absolutize (href, base):
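    """
    Resolve href against base unless it already carries a scheme.
    e.g. absolutize("img/logo.png", "http://sample.net/project/") -> "http://sample.net/project/img/logo.png"
    """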
    if not protocol_pat.match(href):
        return urlparse.urljoin(base, href)
    return href
 
def is_relative_url (href):
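    """ True if href carries no scheme (javascript: links are excluded as well) """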
    if protocol_pat.match(href):
        return False
    if href.lower().startswith("javascript:"):
        return False
    return True
 
def correctContentTypeMeta (page, encoding="utf-8"):
    for tag in page.xpath("//meta"):
        if tag.get("http-equiv", "").lower() == "content-type":
            # LOCATED
            tag.set("content", "text/html; charset=%s" % encoding)
            return True
    # create a meta tag
    print "Inserting meta tag for content-type"
    meta = lxml.etree.Element("meta")
    meta.set("http-equiv", "Content-Type")
    meta.set("content", "text/html; charset=%s" % encoding)
    head = page.xpath("//head")
    if head:
        head[0].append(meta)
 
class Spider (object):
 
    default_user_agent="Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
 
    @classmethod
    def openURL (cls, url, user_agent=None):
        """
        Returns: tuple with (file, actualurl)
        sets user_agent & follows redirection if necessary
        realurl maybe different than url in the case of a redirect
        """
        # print "\t", url
        if user_agent == None:
            user_agent = cls.default_user_agent
 
        request = urllib2.Request(url)
        if user_agent:
            request.add_header("User-Agent", user_agent)
        pagefile=urllib2.urlopen(request)
        redurl = pagefile.geturl()
        info = pagefile.info()
 
        content_type, encoding = info.get('content-type', 'text/html'), None
        if ';' in content_type:
            (content_type, encoding) = [x.strip() for x in content_type.split(';', 1)]
            if encoding.lower().startswith("charset="):
                encoding = encoding[8:]
        # print "\t", content_type, encoding
        content_length = info.get('content-length')
 
        return {
            'file': pagefile,
            'url': url,
            'redurl' : redurl,
            'content_type': content_type,
            'encoding': encoding,
            'content_length': content_length
        }
 
 
    def __init__ (self, pub_folder, strip_query=False):
        """ pub_folder is the base of where to store files """
        # self.base_url = url
        self.pub = pub_folder
        self.nurls = {}
        self.spider_urls = []
        self.url_patterns = []
        self.url_paths = {}
        self.url_starts_to_strip = []
        self.strip_query = strip_query
        self.text_encoding = "utf-8"
 
    def set_text_encoding (self, value):
        self.text_encoding = value
 
    def add_url(self, url):
        self.getResource(url) # adds url to spider_urls
 
    def spider (self):
        while self.spider_urls:
            url = self.spider_urls.pop(0)
 
            print >> sys.stderr, url
            res = self.getResource(url)
            try:
                res.download()
            except (HTTPError, URLError), e:
                print >> sys.stderr, ("Error downloading %s, %s" % (res.url, e))
 
            # print res.url
            # print "-->\t", res.rel_path
 
    def unparse (self):
        for url in self.nurls:
            print url
            self.nurls[url].unparse()
            print
 
    def urls_download (self, pat):
        self.url_patterns.append((pat, True))
 
    def urls_exclude (self, pat):
        self.url_patterns.append((pat, False))
 
    def urls_local (self, pat, path):
        """ path should be absolute """
        self.url_patterns.append((pat, path))
 
    def url_setpath (self, url, path):
        """ path should be relative to the archive path """
        self.url_paths[url] = path
 
    def url_strip_startswith (self, urlstart):
        self.url_starts_to_strip.append(urlstart)
 
    def normalizeURL(self, url):
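        """ strip the #fragment so URLs that differ only by fragment map to the same resource """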
        parts = urlparse.urlsplit(url)
        return urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path, parts.query, ''))
 
    def getResource (self, url):
        """
        Returns a resource object for the given url
        depending on the spider settings... may be live or local
        may cause the resource to be added to the current spidering...
        """
        # url = urlparse.urljoin(self.base_url, url)
        nurl = self.normalizeURL(url)
        try:
            return self.nurls[nurl]
        except KeyError:
            # need to determine what kind of resource this is
            r = Resource(nurl, self)
            self.nurls[nurl] = r
 
            for (pat, opts) in self.url_patterns:
                # print "checking pat", pat
                m = re.search(pat, url, re.I)
                # print "\t",m
                if m:
                    if opts == True:
                        # print "PAT %s ARCHIVE" % url
                        self.spider_urls.append(nurl)
                    elif opts == False:
                        # print "PAT %s LIVE" % url
                        r.local = False
                    else:
                        # local (but outside of ...) 
                        # not added to spider_urls
                        # print "PAT %s LOCAL" % url
                        base_url = m.group(0)
                        #print "\tbase_url", base_url
                        # nurl = nurl[len(base_url):]
                        nurl = nurl[m.end():]
                        #print "\taddl_url", nurl
                        lpath = os.path.join(opts, nurl.lstrip('/'))
                        r.local_path = lpath
                    break
#            try:
#                ## allow explicit path here
#                lpath = self.url_paths[nurl]
#                if not lpath.startswith("/"):
#                    lpath = os.path.join(self.pub, lpath)
#                r.local_path = lpath
#            except KeyError:
#                pass
 
            return r
 
    def pathForURL(self, url, content_type = None):
        """
        Return a path for url, relative to pub (an explicit url_setpath entry is returned as-is).
        With url_strip_startswith("http://sample.net/project/"):
            ("http://sample.net/project/about/", "text/html") -> "about.html"
        Other URLs are filed by host:
            ("http://other.net/img/logo.png", "image/png") -> "hosts/other.net/img/logo.png"
        """
        # see if there's an explicit replacement
        try:
            return self.url_paths[url]
#            if not ret.startswith("/"):
#                lpath = os.path.join(self.pub, ret)
        except KeyError:
            pass
 
        if self.strip_query:
            if "?" in url:
                url = url[:url.rindex("?")]
 
        ret = None
        parts = urlparse.urlsplit(url)
        ext = extensionForContentType(content_type)
 
        # allow url_starts_to_strip to simplify the url
        for start in self.url_starts_to_strip:
            if url.startswith(start):
                ret = url[len(start):]
                ret = ret.strip('/')
 
                break
 
        if ret == None:
            # parts = urlparse.urlsplit(url)
            # COLONS not allowed on FAT32 ?!
            ret = "hosts/" + os.path.join(parts.netloc.replace(":", "_"), parts.path.lstrip('/'))
            ret = ret.rstrip("/")
        if parts.query and (not self.strip_query): ret = os.path.join(ret, "?"+parts.query)
 
        if not ret.endswith("."+ext):
            ret += "."+ext
        return ret
 
 
class Resource (object):
    """
    local = True (either inside or "local" relative archive), False (live)
    _local_path => abs path (when local == True)
    """
 
    def __init__ (self, url, spider):
        self.url = url
        self.spider = spider
        self._content_type = None
        self.local = True
        self._local_path = None
 
    @property
    def content_type (self):
        if (self._content_type == None):
            try:
                req = self.spider.openURL(self.url)
                req['file'].close()
                self._content_type = req['content_type'] 
            except (HTTPError, URLError):
                self._content_type = ''
 
        return self._content_type
 
    def get_local_path (self):
        if (self.local and self._local_path == None):
            rpath = self.spider.pathForURL(self.url, self.content_type)
            self._local_path = os.path.join(self.spider.pub, rpath)
        return self._local_path
 
    def set_local_path (self, val):
        # if DEBUG: print >> sys.stderr, "setting local path for '%s' to '%s'" % (self.url, val)
        self._local_path = val
 
    local_path = property(get_local_path, set_local_path)
 
    @property
    def rel_path (self):
        if self.local:
            lpath = self.local_path
            return relativizePath(self.spider.pub, lpath)
        else:
            return ""
 
 
    def href_to (self, toRes):
        """ requires: self.local """
        assert(self.local), "%s/%s" % (self.url, self.local)
        if not toRes.local:
            return toRes.url
        else:
            ret = relativizePath(self.local_path, toRes.local_path)
            # print "rel", self.local_path, toRes.local_path, ret
            # REMOVING TEMP... CAN DOUBLE QUOTE URLs
            # ret = urllib.quote(ret)
            return ret
 
    def unparse(self):
        msg = "<Resource"
        if self.local:
            msg += "LOCAL %s" % self._local_path
        else:
            msg += "LIVE %s" % self.url
        msg += ">"
        return msg
 
    def download (self):
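        """ fetch self.url; rewrite href/src/url() references in HTML and CSS to relative paths; save under local_path """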
        # if DEBUG: print >> sys.stderr, "downloading", self.url
        req = Spider.openURL(self.url)
        self._content_type = req['content_type'] # prevents extra open for getting content-type
 
        if req['content_type'] == 'text/html':
            parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(req['file'])
            req['file'].close()
 
            thelinks = CSSSelector('link[href]')
            for elt in thelinks(page):
#                if (elt.attrib.get("rel", "").lower() == "stylesheet") or \
#                    (elt.attrib.get("type", "").lower() == "text/css"):
                    # rewrite/import stylesheet
                href = urlparse.urljoin(self.url, elt.attrib['href'])
                res2 = self.spider.getResource(href)
                elt.attrib['href'] = self.href_to(res2)
 
            thelinks = CSSSelector('a[href]')
            for elt in thelinks(page):
                href = urlparse.urljoin(self.url, elt.attrib['href'])
                res2 = self.spider.getResource(href)
                new_href = self.href_to(res2)
                # patch with #fragment
                m = re.search(r"(#.*)$", href)
                if m:
                    new_href += m.group(1)
                elt.attrib['href'] = new_href
 
            extscripts = CSSSelector('script[src]')
            for elt in extscripts(page):
                src = urlparse.urljoin(self.url, elt.attrib['src'])
                res2 = self.spider.getResource(src)
                elt.attrib['src'] = self.href_to(res2)
 
            images = CSSSelector('img[src]')
            for elt in images(page):
                src = urlparse.urljoin(self.url, elt.attrib['src'])
                res2 = self.spider.getResource(src)
                elt.attrib['src'] = self.href_to(res2)
 
            # data = etree.tostring(page, encoding=unicode, method="html")
            # data = data.encode("utf-8")
            encoding = self.spider.text_encoding
            correctContentTypeMeta(page, encoding)
            fp = self.local_path
            print >> sys.stderr, "  =>", fp
            # return
            # fp = os.path.join(settings.MEDIA_ROOT, self.localpath)
            (path, _) = os.path.split(fp)
            try:
                os.makedirs(path)
            except OSError:
                pass
            out = codecs.open(fp, mode="w", encoding=encoding)
            data = lxml.etree.tostring(page, encoding=unicode, method="html")
            out.write(data)
            out.close()
            return
 
        elif req['content_type'] == 'text/css':
            data = req['file'].read()
            # data = data.decode(req['encoding'])
            url_pat = re.compile(r'url\("?(?P<url>.+?)"?\)', re.I)
            def urlsub(m):
                url = urlparse.urljoin(self.url, m.groupdict()['url'])
                res2 = self.spider.getResource(url)
                url = self.href_to(res2)
                return 'url("%s")' % url
 
            data = url_pat.sub(urlsub, data)
 
        else:
            data = req['file'].read()
 
        # SAVE DATA AS LOCAL FILE
        fp = self.local_path
 
        # ensure dirs
        (path, _) = os.path.split(fp)
        try:
            os.makedirs(path)
        except OSError:
            pass
 
        out = open(fp, 'wb')
        out.write(data)
        out.close()
 
 
"""
 
URLs can be:
    left live,
        all others...
    relativize (local media),
        192.168.2.10/~murtaugh (MEDIA_ROOT)
    downloaded.
        localhost:8000/
 
 
"""
