#!/usr/bin/env python
"""Download all TurboGears documentation pages from the wiki in ReST format.
WARNING: This is not complete. It is provided for anybody who wants to work
on offline doc generation for TurboGears and needs a starting point.
Things that are working:
- parsing the wiki TitleIndex
- downloading ReST sources
- converting ReST to HTML
- using templates for conversion
- saving generated pages
TODO:
- Correctly fix links
- Handle attachments
- PDF conversion?
- CSS handling (could be done in template)
I use this simple Jinja template for testing:
{{ title | e }}
{{ html_body }}
"""
import logging
import os
import re
import sys
import urllib
import urlparse
import warnings
from optparse import OptionParser
from os.path import abspath, basename, dirname, join, splitext

from BeautifulSoup import BeautifulSoup
from docutils.core import publish_parts
from jinja import Template, Context, FileSystemLoader
# Script metadata. __program__ and __version__ are handed to OptionParser in
# main(); "$Rev$" and "$Date$" look like version-control keyword placeholders.
__program__ = "get_docs.py"
__author__ = "Christopher Arndt"
__version__ = "0.1"
__revision__ = "$Rev$"
__date__ = "$Date$"
__copyright__ = "MIT license"
class ReST2HTML(object):
    """Page filter that turns wiki page source into HTML.

    Instances are called as ``filter(page, text)`` where ``page`` is a
    ``(page_id, page_info)`` pair and ``text`` is the page source (unicode).
    Pages carrying a ``#format rst`` marker are rendered with docutils;
    all other pages are HTML-escaped and wrapped as preformatted text.
    If a Jinja template was given, the result is rendered through it.
    """

    # MoinMoin parser instruction that marks a page as reStructuredText.
    _rest_marker = re.compile(r'^#format\s+rst', re.M|re.I)

    def __init__(self, template=None):
        """Create the converter; ``template`` is an optional template path."""
        self.set_template(template)

    def set_template(self, template):
        """Load the Jinja template at path ``template`` (None disables it).

        The template name is the file name without its extension and it is
        looked up in the directory containing the file.
        """
        if template:
            tmpl = splitext(basename(template))[0]
            tmpl_dir = abspath(dirname(template))
            self._template = Template(tmpl, FileSystemLoader(tmpl_dir))
        else:
            self._template = None

    def __call__(self, page, text):
        """Convert text (unicode) in ReST syntax to HTML (unicode).

        Returns an empty unicode string when nothing but whitespace is left
        after removing the ``#format rst`` marker.
        """
        text, nsubst = self._rest_marker.subn('', text)
        text = text.strip()
        if not text:
            return u''
        if nsubst:
            # we have ReST markup
            overrides = dict(input_encoding='unicode',
                             output_encoding='unicode')
            rst = publish_parts(text, settings_overrides=overrides,
                                writer_name='html')
        else:
            try:
                from cgi import escape
            except ImportError:
                # cgi.escape is gone from newer Pythons; html.escape with
                # quote=False produces the same output.
                from html import escape
            # NOTE(review): the markup in this literal was lost in the copy
            # of the file under review; a <pre> wrapper is assumed here --
            # confirm against the upstream source.
            rst = dict(html_body='<pre>\n%s\n</pre>\n'
                       % escape(text, quote=False))
        if self._template:
            # Fall back to the last path component of the page id as title
            # (indexing the id with [-1] only yielded its last character).
            rst.setdefault('title', page[0].split('/')[-1])
            return self._template.render(Context(rst))
        else:
            return rst.get('html', rst['html_body'])
class RegExFilter(object):
    """Page filter that applies a list of regex substitutions to page text.

    ``filters`` holds ``(pattern, replacement)`` pairs. Patterns are
    compiled with ``re.M`` when an instance is created. Subclasses may
    assign their own ``self.filters`` list before calling ``__init__``.
    """

    filters = [
        # remove MoinMoin comments
        (r'^##.*$', ''),
        # remove MoinMoin processing instructions
        # but leave in parser pi ('format')
        (r'(?i)^#(acl|language|pragma|redirect|refresh)\s+.*$', ''),
        # remove macro directives
        (r'^\.\. macro:: .*$', ''),
    ]

    def __init__(self):
        # Build a per-instance list instead of overwriting the entries of
        # the (possibly class-level, shared) list in place: with in-place
        # replacement a second instance would try to re-compile already
        # compiled patterns with flags, which re.compile() rejects.
        self.filters = [(re.compile(pattern, re.M), subst)
                        for pattern, subst in self.filters]

    def __call__(self, page, text):
        """Apply all regexes in self.filters to text.

        ``page`` is accepted for interface compatibility with the other
        page filters and is not used. Returns the filtered text.
        """
        for rx, subst in self.filters:
            text = rx.sub(subst, text)
        return text
class LinkFixer(RegExFilter):
    """Strip namespace prefix from links and append file extension.

    Reads the namespace, file extension and root URL from the module-level
    ``options`` object, so it can only be instantiated after the command
    line has been parsed.

    FIXME: This is far from complete and correct. Needs more testing
    """

    def __init__(self):
        self.ns = options.namespace
        self.ext = options.fileext
        self.rurl = options.root_url
        # Splits the text of a titled link, "Title <target>", into its parts.
        self.url_rx = re.compile(r'(?P<title>.*?)\s+<(?P<url>.+?)>')
        self.filters = [
            # fix links of form `SomePage`_
            # (only if they don't start with the namespace prefix)
            #(r'`([^<`]*?)`_' , r'`\1 <%s/\1>`_' % rurl ),
            # fix links of form `SomePage`_
            (r'`(?P<url>.*?)`_', self.handle_shortlink),
            # fix links of form `Title <1.0/SomePage>`_
            #(r'`(.*?)\s+<%s/(.*?)>`_' % ns, r'`\1 <\2%s>`_' % ext),
            # fix links of form .. _Title: 1.0/SomePage
            #(r'^\.\. _(.*?): %s/(.*?)' % ns, r'.. _\1: \2%s' % ext),
        ]
        super(LinkFixer, self).__init__()

    def handle_shortlink(self, m):
        """Return the replacement for one ```...`_`` link match.

        ``m`` is a match of the link pattern; its ``url`` group holds the
        text between the backticks, either a bare target or
        ``Title <target>``. The target is rewritten as follows:

        - targets inside the namespace become local file names,
        - other targets without a scheme are prefixed with the root URL,
        - absolute URLs are left alone.

        An explicit title is kept; otherwise one is derived from the target.
        """
        # NOTE(review): the nesting of this method was reconstructed from an
        # unindented copy and follows the intent spelled out in the
        # commented-out filters in __init__ -- confirm against upstream.
        url = m.group('url')
        title = None
        titled = self.url_rx.search(url)
        if titled:
            url = titled.group('url')
            title = titled.group('title')
        if url.startswith('%s/' % self.ns):
            default_title = url.split('/')[-1]
            url = xlate_pagename(url) + self.ext
        elif '://' not in url:
            default_title = url
            url = '%s/%s' % (self.rurl, url)
        else:
            default_title = url
        if not title:
            title = default_title
        return '`%s <%s>`_' % (title, url)
def xlate_pagename(pagename, sep='__'):
    """Drop the first path component of pagename and join the rest with sep.

    Everything up to and including the first forward slash is discarded;
    the remaining slashes are replaced by ``sep``. A name without any
    slash yields an empty string.
    """
    remainder = pagename.partition('/')[2]
    return remainder.replace('/', sep)
def get_page_urls():
    """Get TitleIndex page from TG wiki and parse it for list of doc URLs.

    Reads ``titleindex``, ``namespace``, ``root_url`` and ``parent_url``
    from the module-level ``options`` object.

    Returns the items of a dict mapping a page key (the link path with the
    leading slash and namespace removed, or ``u'index'`` if nothing is
    left) to an info dict with the key ``'url'`` and, for pages linked
    with ``action=AttachFile``, ``'attachments'``.

    Raises IOError if the index page is not answered with status 200.
    """
    logging.info("Dowloading wiki TitleIndex from '%s'", options.titleindex)
    req = urllib.urlopen(options.titleindex)
    try:
        status = req.headers.get('Status')
        if status is None:
            # Not every server sends a "Status" header; fall back to the
            # status code of the response itself.
            status = str(req.getcode())
        if not status.startswith('200'):
            raise IOError('Could not retrieve TitleINdex from %s' %
                          options.titleindex)
        html = req.read()
    finally:
        req.close()
    soup = BeautifulSoup(html)
    logging.info("Parsing TitleIndex to find documentation pages")
    urls = {}
    for link in soup.findAll('a', href=True):
        if not (link.string and link.string.startswith(options.namespace)):
            continue
        parts = urlparse.urlparse(link['href'])
        rurl = parts[2]
        # remove '/' + namespace on left side of URL
        key = rurl.lstrip('/')[len(options.namespace):]
        if not key:
            key = u'index'
        info = urls.setdefault(key, {})
        # flag pages with attachments
        if parts[4] == 'action=AttachFile':
            info['attachments'] = True
        # Absolute paths hang off the server root, relative ones off the
        # URL of the index page.
        if rurl.startswith('/'):
            info['url'] = options.root_url + rurl
        else:
            info['url'] = options.parent_url + rurl
    return urls.items()
def process_pages(pages, page_filters=[]):
    """Get all pages in 'pages' in ReST format, fix links and save them.

    ``pages`` is an iterable of ``(page_id, info)`` pairs where ``info``
    has a ``'url'`` entry. ``page_filters`` is a sequence of
    ``(description, callable)`` pairs; each callable is invoked as
    ``callable((page_id, info), text)`` and returns the new text. The list
    is only iterated, never modified.

    Pages that cannot be downloaded produce a warning and are skipped.
    Pages that end up empty are not saved. With ``options.dryrun`` set
    nothing is downloaded, filtered or saved; the steps are only logged.
    """
    for pid, page in pages:
        url = page['url']
        logging.info('Dowloading %s from %s', pid, url)
        rst = None
        if not options.dryrun:
            try:
                req = urllib.urlopen(url + '?action=raw')
                try:
                    status = req.headers.get('status')
                    if status is None:
                        # No "Status" header sent; use the status code of
                        # the response instead of failing on None.
                        status = str(req.getcode())
                    if not status.startswith('200'):
                        raise IOError
                    raw = req.read()
                    charset = req.headers.getparam('charset') or 'utf-8'
                finally:
                    req.close()
            except IOError:
                warnings.warn('Could not download %s' % page['url'])
                continue
            # Decode with the charset announced by the server; the implicit
            # ASCII default fails on any non-ASCII page.
            rst = unicode(raw, charset, 'replace')
        for desc, f in page_filters:
            logging.info("Applying filter '%s' to '%s'", desc, pid)
            if not options.dryrun:
                rst = f((pid, page), rst)
        if rst:
            save_page(pid, pages, rst)
        else:
            logging.info("Empty page '%s' will not be saved", pid)
def save_page(pagename, pages, content):
    """Write content to the file for page 'pagename' below options.destdir.

    The file name is the translated page name plus ``options.fileext``.
    Missing directories are created. Unicode content is written UTF-8
    encoded. ``pages`` is not used. With ``options.dryrun`` set, the
    target path is only logged.
    """
    # Page ids without a slash (the 'index' page) translate to an empty
    # name, which would produce a file called just ".html".
    filename = xlate_pagename(pagename) or pagename
    path = join(options.destdir, filename) + options.fileext
    logging.info("Saving page to '%s'", path)
    if options.dryrun:
        return
    try:
        os.makedirs(dirname(path))
    except OSError:
        # Usually "directory exists"; any real problem shows up when the
        # file is opened below.
        pass
    if isinstance(content, unicode):
        # The file is opened in binary mode; encode explicitly instead of
        # relying on the implicit ASCII conversion.
        content = content.encode('utf-8')
    fo = open(path, 'wb')
    try:
        fo.write(content)
    finally:
        fo.close()
def main(args):
    """Parse the command line, then download, filter and save the wiki pages.

    ``args`` is the list of command-line arguments without the program
    name. The parsed options and the parser are stored in the module
    globals ``options`` and ``optparser``, where the other functions of
    this script read them. Returns 0.
    """
    global options, optparser
    # (description, callable) pairs; process_pages() applies them in list
    # order and calls each one with ((page_id, page_info), text).
    page_filters = [
        ('Regular expression filter', RegExFilter()),
        ('Strip whitespace', lambda x,y: y.strip()),
    ]
    optparser = OptionParser(prog=__program__,
        version=__version__, description=__doc__)
    optparser.add_option("-v", "--verbose",
        action="store_true", dest="verbose", default=False,
        help="Print what's going on to stdout.")
    optparser.add_option("-n", "--dry-run",
        action="store_true", dest="dryrun", default=False,
        help="Don't actually download or save pages (use with -v).")
    optparser.add_option("-i", "--title-index",
        action="store", dest="titleindex", metavar='URL',
        default='http://docs.turbogears.org/TitleIndex',
        help="Give URL to TitleIndex page on the TurboGears docs wiki.")
    optparser.add_option("-s", "--name-space",
        action="store", dest="namespace", default='1.0', metavar='PREFIX',
        help="Specify the wiki namespace which should be downloaded.")
    optparser.add_option("-d", "--dest-dir",
        action="store", dest="destdir", default=os.curdir, metavar='DIR',
        help="Destination directory where pages are saved. Default: current directory")
    optparser.add_option("-f", "--format",
        action="store", dest="format", default='html',
        help="Output format (rest, html). Default: html")
    optparser.add_option("-t", "--template",
        action="store", dest="template", metavar='TEMPLATE',
        help="Jinja template for ReST to HTML conversion.")
    optparser.add_option("-e", "--file-ext",
        action="store", dest="fileext", default='.html', metavar='EXT',
        help="File extension for saved pages. Default: '.html'")
    (options, args) = optparser.parse_args(args=args)
    # Derive two base URLs from the TitleIndex URL: scheme and host only
    # (root_url), and scheme, host and path (parent_url).
    parts = urlparse.urlparse(options.titleindex)
    options.root_url = urlparse.urlunparse(parts[:2] + ('', '', '', ''))
    options.parent_url = \
        urlparse.urlunparse(parts[:2] + (parts[2] or '/', '', '', ''))
    # NOTE(review): for HTML output this replaces any -e/--file-ext value
    # given on the command line -- confirm that this is intended.
    if options.format == 'html':
        options.fileext = '.html'
    # LinkFixer reads options.fileext and options.root_url, so it has to be
    # created after they are final.
    # NOTE(review): indentation was lost in the copy under review; whether
    # this append belongs inside the "html" branch above could not be
    # determined -- confirm against upstream.
    page_filters.append(('Fix Links', LinkFixer()))
    if options.verbose:
        logging.basicConfig(level=logging.INFO)
    if options.format == 'html':
        rest2html = ReST2HTML()
        if options.template:
            rest2html.set_template(options.template)
        # Added last so the conversion sees the already filtered source.
        page_filters.append(('ReST to HTML', rest2html))
    pages = get_page_urls()
    if options.verbose:
        from pprint import pformat
        # NOTE(review): logged at DEBUG level while -v only configures
        # INFO, so this dump presumably never shows up -- confirm.
        logging.debug(pformat(pages))
    process_pages(pages, page_filters)
    return 0
# Run as a script: pass the command-line arguments (without the program
# name) to main() and use its return value as the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))