#!/usr/bin/env python
"""Download all TurboGears documentation pages from the wiki in ReST format.

WARNING: This is not complete. It is provided for anybody who wants to work
on offline doc generation for TurboGears and needs a starting point.

Things that are working:

- parsing the wiki TitleIndex
- downloading ReST sources
- converting ReST to HTML
- using templates for conversion
- saving generated pages

TODO:

- Correctly fix links
- Handle attachments
- PDF conversion?
- CSS handling (could be done in template)

I use this simple Jinja template for testing:

    {{ title | e }}
    {{ html_body }}
"""

import logging
import os
import re
import sys
import urllib
import urlparse
import warnings

from optparse import OptionParser
from os.path import abspath, basename, dirname, join, splitext

from BeautifulSoup import BeautifulSoup
from docutils.core import publish_parts
from jinja import Template, Context, FileSystemLoader

__program__ = "get_docs.py"
__author__ = "Christopher Arndt"
__version__ = "0.1"
__revision__ = "$Rev$"
__date__ = "$Date$"
__copyright__ = "MIT license"


class ReST2HTML(object):
    """Page filter converting ReST source text to HTML.

    Instances are callable as ``f(page, text)`` where ``page`` is the
    ``(page_id, page_info)`` tuple built by ``process_pages``.
    """

    # MoinMoin parser instruction that marks a page as ReST.
    _rest_marker = re.compile(r'^#format\s+rst', re.M | re.I)

    def __init__(self, template=None):
        """Create the converter, optionally rendering through ``template``."""
        self.set_template(template)

    def set_template(self, template):
        """Load the Jinja template at path ``template``.

        A false value disables templating, in which case only the HTML body
        is returned by the converter.
        """
        if template:
            # The loader gets the template's directory, the Template its
            # base name without extension.
            tmpl = splitext(basename(template))[0]
            tmpl_dir = abspath(dirname(template))
            self._template = Template(tmpl, FileSystemLoader(tmpl_dir))
        else:
            self._template = None

    def __call__(self, page, text):
        """Convert text (unicode) in ReST syntax to HTML (unicode).

        Text without a ``#format rst`` marker is not parsed as ReST; it is
        HTML-escaped and emitted verbatim instead. Returns ``u''`` for pages
        that are empty once the marker is removed.
        """
        text, nsubst = self._rest_marker.subn('', text)
        text = text.strip()
        if not text:
            return u''
        if nsubst:
            # we have ReST markup
            overrides = dict(input_encoding='unicode',
                             output_encoding='unicode')
            rst = publish_parts(text, settings_overrides=overrides,
                                writer_name='html')
        else:
            from cgi import escape
            # NOTE(review): the '<pre>' wrapper around the escaped text is
            # assumed for plain-text pages -- confirm the intended markup.
            rst = dict(html_body='<pre>\n%s\n</pre>\n' % escape(text))
        if self._template:
            # Fall back to the last path component of the page id as title
            # (indexing the id string directly would yield a single
            # character).
            rst.setdefault('title', page[0].split('/')[-1])
            return self._template.render(Context(rst))
        else:
            return rst.get('html', rst['html_body'])


class RegExFilter(object):
    """Page filter applying a list of regular expression substitutions.

    ``filters`` holds ``(pattern, replacement)`` pairs. Patterns are compiled
    with ``re.M`` when an instance is created.
    """

    filters = [
        # remove MoinMoin comments
        (r'^##.*$', ''),
        # remove MoinMoin processing instructions
        # but leave in parser pi ('format')
        (r'(?i)^#(acl|language|pragma|redirect|refresh)\s+.*$', ''),
        # remove macro directives
        (r'^\.\. macro:: .*$', ''),
    ]

    def __init__(self):
        # Build a per-instance list instead of compiling in place: mutating
        # the shared class-level list would make a second instantiation call
        # re.compile() with flags on already compiled patterns, which raises
        # ValueError.
        self.filters = [(re.compile(pattern, re.M), subst)
                        for pattern, subst in self.filters]

    def __call__(self, page, text):
        """Apply all regexes in self.filters to text and return the result."""
        for rx, subst in self.filters:
            text = rx.sub(subst, text)
        return text


class LinkFixer(RegExFilter):
    """Strip namespace prefix from links and append file extension.

    Reads its configuration (namespace, file extension, root URL) from the
    module-global ``options`` set up by ``main``.

    FIXME: This is far from complete and correct. Needs more testing
    """

    def __init__(self):
        self.ns = options.namespace
        self.ext = options.fileext
        self.rurl = options.root_url
        # Matches the inside of an explicit link: "Title <target>".
        self.url_rx = re.compile(r'(?P<title>.*?)\s+<(?P<url>.+?)>')
        self.filters = [
            # fix links of form `SomePage`_ and `Title <1.0/SomePage>`_
            (r'`(?P<url>.*?)`_', self.handle_shortlink),
        ]
        super(LinkFixer, self).__init__()

    def handle_shortlink(self, m):
        """Return the replacement ReST link for regex match ``m``.

        Targets inside the wiki namespace become local file names, other
        relative targets are made absolute against the wiki root URL, and
        targets containing ``://`` are left alone. An explicit link title is
        kept; otherwise a title is derived from the target.
        """
        # NOTE(review): the if/elif/else chain below is taken to apply to
        # every link, not only to explicit "Title <target>" links -- confirm.
        target = m.group('url')
        title = None
        explicit = self.url_rx.search(target)
        if explicit:
            target = explicit.group('url')
            title = explicit.group('title')
        if target.startswith('%s/' % self.ns):
            default_title = target.split('/')[-1]
            target = xlate_pagename(target) + self.ext
        elif '://' not in target:
            default_title = target
            target = '%s/%s' % (self.rurl, target)
        else:
            default_title = target
        # Keep an explicit title instead of overwriting it with the default.
        return '`%s <%s>`_' % (title or default_title, target)


def xlate_pagename(pagename, sep='__'):
    """Transform pagename by converting forward slashes into sep.

    The first path component (the namespace, or the empty string before a
    leading slash) is dropped. A name without any slash is returned
    unchanged, so that e.g. ``index`` does not collapse to an empty name.
    """
    parts = pagename.split('/')
    if len(parts) == 1:
        return pagename
    return sep.join(parts[1:])


def _response_ok(response):
    """Return True if the response from urllib.urlopen reports success.

    A ``Status`` header is honoured when the server sends one; otherwise the
    numeric status code is used where the response object provides it.
    """
    status = response.headers.get('Status')
    if status is not None:
        return status.startswith('200')
    getcode = getattr(response, 'getcode', None)
    code = getcode and getcode()
    # No code available (e.g. non-HTTP URL): nothing signals a failure.
    return code is None or code == 200


def _fetch_raw_page(url):
    """Download the raw wiki source of ``url`` and return it as unicode.

    Raises IOError if the page cannot be retrieved.
    """
    req = urllib.urlopen(url + '?action=raw')
    try:
        if not _response_ok(req):
            raise IOError('Could not download %s' % url)
        data = req.read()
        charset = req.headers.getparam('charset') or 'utf-8'
    finally:
        req.close()
    # Decode explicitly; a bare unicode(data) assumes ASCII and fails on any
    # page containing non-ASCII characters.
    try:
        return data.decode(charset, 'replace')
    except LookupError:
        # Unknown charset name announced by the server.
        return data.decode('utf-8', 'replace')


def get_page_urls():
    """Get TitleIndex page from TG wiki and parse it for list of doc URLs.

    Returns a list of ``(key, info)`` pairs where ``key`` is the page path
    below the namespace (``u'index'`` for the namespace root) and ``info`` is
    a dict with the page's ``url`` and, where applicable, an ``attachments``
    flag. Raises IOError if the TitleIndex cannot be retrieved.
    """
    logging.info("Dowloading wiki TitleIndex from '%s'", options.titleindex)
    req = urllib.urlopen(options.titleindex)
    try:
        if not _response_ok(req):
            raise IOError('Could not retrieve TitleINdex from %s'
                          % options.titleindex)
        html = req.read()
    finally:
        req.close()
    soup = BeautifulSoup(html)
    logging.info("Parsing TitleIndex to find documentation pages")
    urls = dict()
    for link in soup.findAll('a', href=True):
        if not (link.string and link.string.startswith(options.namespace)):
            continue
        parts = urlparse.urlparse(link['href'])
        rurl = parts[2]
        # remove '/' + namespace on left side of URL
        key = rurl.lstrip('/')[len(options.namespace):]
        if not key:
            key = u'index'
        info = urls.setdefault(key, dict())
        # flag pages with attachments
        if parts[4] == 'action=AttachFile':
            info['attachments'] = True
        if rurl.startswith('/'):
            info['url'] = options.root_url + rurl
        else:
            info['url'] = options.parent_url + rurl
    return urls.items()


def process_pages(pages, page_filters=None):
    """Get all pages in 'pages' in ReST format, fix links and save them.

    ``pages`` is a sequence of ``(page_id, info)`` pairs as returned by
    ``get_page_urls``. ``page_filters`` is a sequence of
    ``(description, callable)`` pairs; each callable is invoked as
    ``f((page_id, info), text)`` and returns the transformed text. Pages that
    cannot be downloaded are reported with a warning and skipped. In dry-run
    mode nothing is downloaded, filtered or saved.
    """
    if page_filters is None:
        page_filters = []
    for pid, page in pages:
        url = page['url']
        logging.info('Dowloading %s from %s', pid, url)
        rst = None
        if not options.dryrun:
            try:
                rst = _fetch_raw_page(url)
            except IOError:
                warnings.warn('Could not download %s' % url)
                continue
        for desc, page_filter in page_filters:
            logging.info("Applying filter '%s' to '%s'", desc, pid)
            if not options.dryrun:
                rst = page_filter((pid, page), rst)
        if rst:
            save_page(pid, pages, rst)
        else:
            logging.info("Empty page '%s' will not be saved", pid)


def save_page(pagename, pages, content):
    """Write ``content`` to the destination file for page ``pagename``.

    The file name is derived from the page name via ``xlate_pagename`` plus
    the configured file extension. ``pages`` is accepted for interface
    compatibility and not used. Nothing is written in dry-run mode.
    """
    path = join(options.destdir, xlate_pagename(pagename)) + options.fileext
    logging.info("Saving page to '%s'", path)
    if options.dryrun:
        return
    destdir = dirname(path)
    try:
        os.makedirs(destdir)
    except OSError:
        # An already existing directory is fine; anything else is an error.
        if not os.path.isdir(destdir):
            raise
    if isinstance(content, unicode):
        # The file is opened in binary mode, so encode explicitly instead of
        # relying on the implicit ASCII encoding.
        content = content.encode('utf-8')
    fo = open(path, 'wb')
    try:
        fo.write(content)
    finally:
        fo.close()


def main(args):
    """Parse command line ``args``, then download and convert all pages.

    Sets the module globals ``options`` and ``optparser``. Returns the
    process exit code (0).
    """
    global options, optparser

    page_filters = [
        ('Regular expression filter', RegExFilter()),
        ('Strip whitespace', lambda x, y: y.strip()),
    ]

    optparser = OptionParser(prog=__program__, version=__version__,
                             description=__doc__)
    optparser.add_option(
        "-v", "--verbose", action="store_true", dest="verbose",
        default=False, help="Print what's going on to stdout.")
    optparser.add_option(
        "-n", "--dry-run", action="store_true", dest="dryrun",
        default=False,
        help="Don't actually download or save pages (use with -v).")
    optparser.add_option(
        "-i", "--title-index", action="store", dest="titleindex",
        metavar='URL', default='http://docs.turbogears.org/TitleIndex',
        help="Give URL to TitleIndex page on the TurboGears docs wiki.")
    optparser.add_option(
        "-s", "--name-space", action="store", dest="namespace",
        default='1.0', metavar='PREFIX',
        help="Specify the wiki namespace which should be downloaded.")
    optparser.add_option(
        "-d", "--dest-dir", action="store", dest="destdir",
        default=os.curdir, metavar='DIR',
        help="Destination directory where pages are saved. "
             "Default: current directory")
    optparser.add_option(
        "-f", "--format", action="store", dest="format", default='html',
        help="Output format (rest, html). Default: html")
    optparser.add_option(
        "-t", "--template", action="store", dest="template",
        metavar='TEMPLATE',
        help="Jinja template for ReST to HTML conversion.")
    optparser.add_option(
        "-e", "--file-ext", action="store", dest="fileext",
        default='.html', metavar='EXT',
        help="File extension for saved pages. Default: '.html'")

    (options, args) = optparser.parse_args(args=args)

    # Derive the site root and the TitleIndex URL without query/fragment;
    # both are used to turn links from the index into absolute URLs.
    parts = urlparse.urlparse(options.titleindex)
    options.root_url = urlparse.urlunparse(parts[:2] + ('', '', '', ''))
    options.parent_url = \
        urlparse.urlunparse(parts[:2] + (parts[2] or '/', '', '', ''))

    if options.format == 'html':
        options.fileext = '.html'
    # NOTE(review): the link fixer is registered for every output format
    # here -- confirm it was not meant for HTML output only.
    page_filters.append(('Fix Links', LinkFixer()))

    if options.verbose:
        logging.basicConfig(level=logging.INFO)

    if options.format == 'html':
        rest2html = ReST2HTML()
        if options.template:
            rest2html.set_template(options.template)
        page_filters.append(('ReST to HTML', rest2html))

    pages = get_page_urls()
    if options.verbose:
        from pprint import pformat
        logging.debug(pformat(pages))
    process_pages(pages, page_filters)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))