| 1 |
|
| 2 |
"""Download all TurboGears documentation pages from the wiki in ReST format.
|
| 3 |
|
| 4 |
WARNING: This is not complete. It is provided for anybody who wants to work
|
| 5 |
on offline doc generation for TurboGears and needs a starting point.
|
| 6 |
|
| 7 |
Things that are working:
|
| 8 |
|
| 9 |
- parsing the wiki TitleIndex
|
| 10 |
- downloading ReST sources
|
| 11 |
- converting ReST to HTML
|
| 12 |
- using templates for conversion
|
| 13 |
- saving generated pages
|
| 14 |
|
| 15 |
TODO:
|
| 16 |
|
| 17 |
- Correctly fix links
|
| 18 |
- Handle attachments
|
| 19 |
- PDF conversion?
|
| 20 |
- CSS handling (could be done in template)
|
| 21 |
|
| 22 |
|
| 23 |
I use this simple Jinja template for testing:
|
| 24 |
|
| 25 |
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
| 26 |
"http://www.w3.org/TR/html4/loose.dtd">
|
| 27 |
<head>
|
| 28 |
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
| 29 |
|
| 30 |
<link rel="stylesheet" type="text/css" href="rest.css" />
|
| 31 |
|
| 32 |
<title>{{ title | e }}</title>
|
| 33 |
</head>
|
| 34 |
<body>
|
| 35 |
{{ html_body }}
|
| 36 |
</body>
|
| 37 |
|
| 38 |
"""
|
| 39 |
|
| 40 |
import logging
|
| 41 |
import os
|
| 42 |
import re
|
| 43 |
import sys
|
| 44 |
import urllib
|
| 45 |
import urlparse
|
| 46 |
from optparse import OptionParser
|
| 47 |
from os.path import abspath, basename, dirname, join, splitext
|
| 48 |
|
| 49 |
from BeautifulSoup import BeautifulSoup
|
| 50 |
from docutils.core import publish_parts
|
| 51 |
from jinja import Template, Context, FileSystemLoader
|
| 52 |
|
| 53 |
# Script metadata. __program__ and __version__ are handed to OptionParser
# in main() as the program name and version string.
__program__ = "get_docs.py"
__author__ = "Christopher Arndt"
__version__ = "0.1"
# NOTE(review): "$Rev$" / "$Date$" look like version-control keyword
# placeholders that are expanded on checkout -- confirm.
__revision__ = "$Rev$"
__date__ = "$Date$"
__copyright__ = "MIT license"
|
| 59 |
|
| 60 |
class ReST2HTML(object):
    """Page filter converting wiki page text in ReST syntax to HTML.

    Instances are called as ``filter(page, text)`` where ``page`` is a
    ``(page_id, info)`` tuple and ``text`` is the page source. Text that
    carries a ``#format rst`` marker line is rendered with docutils; any
    other text is HTML-escaped and wrapped in a ``<pre>`` element. If a
    template was configured, the result is rendered through it.
    """

    # Matches the "#format rst" marker line (case-insensitive, any line).
    _rest_marker = re.compile(r'^#format\s+rst', re.M|re.I)

    def __init__(self, template=None):
        """Create the filter; ``template`` is an optional template path."""
        self.set_template(template)

    def set_template(self, template):
        """Load the template at path ``template``, or disable templating.

        The file name without its extension is used as the template name
        and the containing directory as the loader's search path. A false
        value (``None``, ``''``) turns templating off.
        """
        if template:
            tmpl = splitext(basename(template))[0]
            tmpl_dir = abspath(dirname(template))
            self._template = Template(tmpl, FileSystemLoader(tmpl_dir))
        else:
            self._template = None

    @staticmethod
    def _page_title(page):
        """Return the last path component of the page id as a title.

        ``page`` is the ``(page_id, info)`` tuple passed to the filter,
        e.g. ``('/GettingStarted/Install', {...})`` gives ``'Install'``.
        """
        return page[0].rstrip('/').split('/')[-1]

    def __call__(self, page, text):
        """Convert text (unicode) in ReST syntax to HTML (unicode).

        Returns an empty unicode string if nothing is left after removing
        the format marker and surrounding whitespace.
        """
        text, nsubst = self._rest_marker.subn('', text)
        text = text.strip()
        if not text:
            return u''
        if nsubst:
            # The page declared itself as ReST: let docutils render it.
            overrides = dict(input_encoding='unicode',
                output_encoding='unicode')
            rst = publish_parts(text, settings_overrides=overrides,
                writer_name='html')
        else:
            # Not marked as ReST: show the source verbatim. cgi.escape is
            # not available on newer Pythons, html.escape not on older ones.
            try:
                from html import escape
            except ImportError:
                from cgi import escape
            # quote=False keeps the escaping identical for both functions.
            rst = dict(
                html_body='<pre>\n%s\n</pre>\n' % escape(text, quote=False))
        if self._template:
            # Fall back to the page name when no title was produced. The
            # previous code used page[0][-1], i.e. only the last character
            # of the page id.
            rst.setdefault('title', self._page_title(page))
            return self._template.render(Context(rst))
        return rst.get('html', rst['html_body'])
|
| 96 |
|
| 97 |
class RegExFilter(object):
    """Page filter applying a list of regular expression substitutions.

    ``filters`` is a list of ``(pattern, replacement)`` pairs. Patterns are
    compiled in multi-line mode when an instance is created; replacements
    may be strings or callables, as accepted by ``re.sub``. Subclasses may
    assign their own ``self.filters`` before calling this ``__init__``.
    """

    filters = [
        # Lines starting with "##".
        (r'^##.*$', ''),
        # "#acl", "#language", "#pragma", "#redirect" and "#refresh" lines
        # (matched case-insensitively).
        (r'(?i)^#(acl|language|pragma|redirect|refresh)\s+.*$', ''),
        # ReST ".. macro::" directive lines.
        (r'^\.\. macro:: .*$', ''),
    ]

    def __init__(self):
        # Build a per-instance list instead of compiling in place: writing
        # back into the shared class-level list made a second instantiation
        # call re.compile() on an already compiled pattern together with
        # flags, which raises ValueError.
        compiled = []
        for rx, subst in self.filters:
            if not hasattr(rx, 'sub'):
                rx = re.compile(rx, re.M)
            compiled.append((rx, subst))
        self.filters = compiled

    def __call__(self, page, text):
        """Apply all regexes in self.filters to text and return the result.

        ``page`` is accepted for interface compatibility with the other
        page filters and is not used.
        """
        for rx, subst in self.filters:
            text = rx.sub(subst, text)
        return text
|
| 118 |
|
| 119 |
class LinkFixer(RegExFilter):
    """Strip namespace prefix from links and append file extension.

    The namespace, file extension and root URL are read from the
    module-global ``options`` when the instance is created.

    FIXME: This is far from complete and correct. Needs more testing
    """

    def __init__(self):
        self.ns = options.namespace
        self.ext = options.fileext
        self.rurl = options.root_url
        # Splits "Title <target>" into its title and target parts.
        self.url_rx = re.compile(r'(?P<title>.*?)\s+<(?P<url>.+?)>')
        self.filters = [
            # ReST hyperlink references of the form `...`_
            (r'`(?P<url>.*?)`_', self.handle_shortlink),
        ]
        super(LinkFixer, self).__init__()

    def handle_shortlink(self, m):
        """Return the replacement text for one matched `...`_ reference.

        ``m`` is the match object of the pattern registered in
        ``self.filters``. The result is always of the form
        `` `title <url>`_ ``.
        """
        url = m.group('url')
        title = None
        explicit = self.url_rx.search(url)
        if explicit:
            url = explicit.group('url')
            # Keep the link text the author wrote. It used to be extracted
            # and then unconditionally overwritten by every branch below.
            title = explicit.group('title').strip() or None
        if url.startswith('%s/' % self.ns):
            # Page inside the downloaded namespace: point to the saved file.
            default_title = url.split('/')[-1]
            url = xlate_pagename(url) + self.ext
        elif '://' not in url:
            # Other wiki page: make the link absolute on the wiki host.
            default_title = url
            url = '%s/%s' % (self.rurl, url)
        else:
            # Already an absolute URL: leave the target untouched.
            default_title = url
        return '`%s <%s>`_' % (title or default_title, url)
|
| 161 |
|
| 162 |
def xlate_pagename(pagename, sep='__'):
    """Transform pagename into a flat name usable as a file name.

    The first slash-separated component (the namespace, or the empty string
    in front of a leading slash) is dropped and the remaining components
    are joined with ``sep``::

        '1.0/Foo/Bar' -> 'Foo__Bar'
        '/Foo'        -> 'Foo'

    A name without any slash (such as ``'index'``) is returned unchanged;
    dropping its only component used to yield an empty name.
    """
    components = pagename.split('/')
    if len(components) == 1:
        return pagename
    return sep.join(components[1:])
|
| 166 |
|
| 167 |
|
| 168 |
def get_page_urls():
    """Get TitleIndex page from TG wiki and parse it for list of doc URLs.

    Reads ``titleindex`` and ``namespace`` from the module-global
    ``options``.

    Returns a list of ``(key, info)`` pairs. ``key`` is the page path with
    the namespace prefix removed (``u'index'`` for the namespace root) and
    ``info`` is a dict holding the page ``'url'`` and, if an attachment
    link was seen for the page, ``'attachments': True``.

    Raises IOError if the TitleIndex page cannot be retrieved.
    """
    logging.info("Dowloading wiki TitleIndex from '%s'" % options.titleindex)
    req = urllib.urlopen(options.titleindex)
    try:
        # Not every server sends a "Status" header; fall back to the HTTP
        # status code of the response instead of failing on None.
        status = req.headers.get('Status')
        if status is None:
            code = req.getcode()
            if code is not None:
                status = str(code)
        # A status of None means no status information is available at all
        # (getcode() returns None for non-HTTP URLs); proceed in that case.
        if status is not None and not status.startswith('200'):
            raise IOError('Could not retrieve TitleINdex from %s' %
                options.titleindex)
        html = req.read()
    finally:
        req.close()
    soup = BeautifulSoup(html)

    logging.info("Parsing TitleIndex to find documentation pages")
    urls = dict()
    for link in soup.findAll('a', href=True):
        # Only links whose text starts with the namespace are doc pages.
        if not (link.string and link.string.startswith(options.namespace)):
            continue
        parts = urlparse.urlparse(link['href'])
        rurl = parts[2]
        # Page path without leading slashes and namespace prefix.
        key = rurl.lstrip('/')[len(options.namespace):]
        if not key:
            key = u'index'
        info = urls.setdefault(key, dict())
        if parts[4] == 'action=AttachFile':
            info['attachments'] = True
        # Resolve the link path against the TitleIndex URL. This handles
        # absolute paths and also relative ones, which plain concatenation
        # with the TitleIndex URL glued onto the index page's own name.
        info['url'] = urlparse.urljoin(options.titleindex, rurl)
    return list(urls.items())
|
| 198 |
|
| 199 |
def _fetch_raw_page(url):
    """Return the raw wiki source of the page at url as unicode text.

    Raises IOError if the page cannot be retrieved.
    """
    req = urllib.urlopen(url + '?action=raw')
    try:
        # Not every server sends a "status" header; fall back to the HTTP
        # status code of the response instead of failing on None.
        status = req.headers.get('status')
        if status is None:
            code = req.getcode()
            if code is not None:
                status = str(code)
        if status is not None and not status.startswith('200'):
            raise IOError('Unexpected status %r for %s' % (status, url))
        # NOTE(review): assumes the wiki serves raw pages as UTF-8 --
        # confirm. Undecodable bytes are replaced rather than aborting the
        # whole run.
        return req.read().decode('utf-8', 'replace')
    finally:
        req.close()


def process_pages(pages, page_filters=None):
    """Get all pages in 'pages' in ReST format, fix links and save them.

    ``pages`` is an iterable of ``(page_id, info)`` pairs where ``info``
    has a ``'url'`` entry. ``page_filters`` is a sequence of
    ``(description, callable)`` pairs; each callable is invoked as
    ``f((page_id, info), text)`` and returns the transformed text.

    Pages that cannot be downloaded are reported with a warning and
    skipped. Pages that are empty after filtering are not saved. With
    ``options.dryrun`` set nothing is downloaded or filtered; only the
    steps that would be taken are logged.
    """
    # Imported here because the module does not import it at the top.
    import warnings

    if page_filters is None:
        page_filters = []

    for pid, page in pages:
        url = page['url']
        logging.info('Dowloading %s from %s' % (pid, url))
        rst = None
        if not options.dryrun:
            try:
                rst = _fetch_raw_page(url)
            except IOError:
                warnings.warn('Could not download %s' % url)
                continue
        for desc, f in page_filters:
            logging.info("Applying filter '%s' to '%s'" % (desc, pid))
            if not options.dryrun:
                rst = f((pid, page), rst)
        # In a dry run there is no content, but save_page() still logs the
        # destination path without writing anything.
        if rst or options.dryrun:
            save_page(pid, pages, rst)
        else:
            logging.info("Empty page '%s' will not be saved" % pid)
|
| 225 |
|
| 226 |
def save_page(pagename, pages, content):
    """Write content to the output file for pagename.

    The file path is built from ``options.destdir``, the translated page
    name and ``options.fileext``. With ``options.dryrun`` set the path is
    only logged. ``pages`` is not used.

    Text content is encoded as UTF-8 before writing.
    """
    path = join(options.destdir, xlate_pagename(pagename)) + options.fileext
    logging.info("Saving page to '%s'" % path)
    if options.dryrun:
        return

    target_dir = dirname(path)
    if target_dir:
        try:
            os.makedirs(target_dir)
        except OSError:
            # An already existing directory is fine; anything else (for
            # example missing permissions) is a real error.
            if not os.path.isdir(target_dir):
                raise

    # The file is opened in binary mode, so encode text explicitly instead
    # of relying on an implicit ASCII conversion.
    if not isinstance(content, bytes):
        content = content.encode('utf-8')

    fo = open(path, 'wb')
    try:
        fo.write(content)
    finally:
        fo.close()
|
| 237 |
|
| 238 |
def main(args):
    """Parse the command line in args, then download and convert the pages.

    Stores the parsed options and the option parser in the module globals
    ``options`` and ``optparser``. Returns 0.
    """
    global options, optparser

    page_filters = [
        ('Regular expression filter', RegExFilter()),
        ('Strip whitespace', lambda page, text: text.strip()),
    ]

    # Command line options as (flags, keyword settings), in help order.
    option_specs = (
        (("-v", "--verbose"), dict(
            action="store_true", dest="verbose", default=False,
            help="Print what's going on to stdout.")),
        (("-n", "--dry-run"), dict(
            action="store_true", dest="dryrun", default=False,
            help="Don't actually download or save pages (use with -v).")),
        (("-i", "--title-index"), dict(
            action="store", dest="titleindex", metavar='URL',
            default='http://docs.turbogears.org/TitleIndex',
            help="Give URL to TitleIndex page on the TurboGears docs wiki.")),
        (("-s", "--name-space"), dict(
            action="store", dest="namespace", default='1.0',
            metavar='PREFIX',
            help="Specify the wiki namespace which should be downloaded.")),
        (("-d", "--dest-dir"), dict(
            action="store", dest="destdir", default=os.curdir, metavar='DIR',
            help="Destination directory where pages are saved. Default: current directory")),
        (("-f", "--format"), dict(
            action="store", dest="format", default='html',
            help="Output format (rest, html). Default: html")),
        (("-t", "--template"), dict(
            action="store", dest="template", metavar='TEMPLATE',
            help="Jinja template for ReST to HTML conversion.")),
        (("-e", "--file-ext"), dict(
            action="store", dest="fileext", default='.html', metavar='EXT',
            help="File extension for saved pages. Default: '.html'")),
    )

    optparser = OptionParser(prog=__program__,
        version=__version__, description=__doc__)
    for flags, settings in option_specs:
        optparser.add_option(*flags, **settings)

    options, args = optparser.parse_args(args=args)

    # Derive the wiki's root URL (scheme and host only) and the URL of the
    # TitleIndex page without parameters, query and fragment.
    index_parts = urlparse.urlparse(options.titleindex)
    scheme_and_host = index_parts[:2]
    options.root_url = urlparse.urlunparse(
        scheme_and_host + ('', '', '', ''))
    options.parent_url = urlparse.urlunparse(
        scheme_and_host + (index_parts[2] or '/', '', '', ''))

    html_output = options.format == 'html'
    if html_output:
        options.fileext = '.html'
    # LinkFixer reads options.fileext, so it is created after the extension
    # has been settled.
    page_filters.append(('Fix Links', LinkFixer()))
    if options.verbose:
        logging.basicConfig(level=logging.INFO)
    if html_output:
        # Passing a false template is the same as not setting one.
        page_filters.append(('ReST to HTML', ReST2HTML(options.template)))

    pages = get_page_urls()
    if options.verbose:
        from pprint import pformat
        logging.debug(pformat(pages))
    process_pages(pages, page_filters)
    return 0
|
| 298 |
|
| 299 |
# Script entry point: run main() on the command line arguments and use its
# return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|