#!/usr/bin/env python
# Provenance: pasted on the "Stickum" pastebin ("Stickum strikes back.")
# as Python code @ 15:30 on Sat, 13 Jan 07.
"""Download all TurboGears documentation pages from the wiki in ReST format.

WARNING: This is not complete. It is provided for anybody who wants to work
on offline doc generation for TurboGears and needs a starting point.

Things that are working:

- parsing the wiki TitleIndex
- downloading ReST sources
- converting ReST to HTML
- using templates for conversion
- saving generated pages

TODO:

- Correctly fix links
- Handle attachments
- PDF conversion?
- CSS handling (could be done in template)


I use this simple Jinja template for testing:

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

  <link rel="stylesheet" type="text/css" href="rest.css" />

  <title>{{ title | e }}</title>
</head>
<body>
{{ html_body }}
</body>

"""

import logging
import os
import re
import sys
import urllib
import urlparse
import warnings
from optparse import OptionParser
from os.path import abspath, basename, dirname, join, splitext

from BeautifulSoup import BeautifulSoup
from docutils.core import publish_parts
from jinja import Template, Context, FileSystemLoader

# Module metadata; __program__ and __version__ are handed to OptionParser
# in main().
__program__ = "get_docs.py"
__author__ = "Christopher Arndt"
__version__ = "0.1"
__revision__ = "$Rev$"
__date__ = "$Date$"
__copyright__ = "MIT license"


class ReST2HTML(object):
    """Page filter that turns wiki page source into HTML.

    Instances are called as ``converter(page, text)``. Text that carries a
    ``#format rst`` line is rendered with docutils; any other text is
    HTML-escaped and wrapped in a ``<pre>`` element. When a Jinja template
    has been set, the result is rendered through that template.
    """

    # Marker line that flags a page as ReST; it is removed before conversion.
    _rest_marker = re.compile(r'^#format\s+rst', re.M|re.I)

    def __init__(self, template=None):
        """Create a converter; `template` is an optional template file path."""
        self.set_template(template)

    def set_template(self, template):
        """Set the Jinja template from a file path; a falsy value disables it.

        The template's base name (extension stripped) and a
        ``FileSystemLoader`` for its directory are handed to ``Template``.
        """
        if template:
            tmpl = splitext(basename(template))[0]
            tmpl_dir = abspath(dirname(template))
            self._template = Template(tmpl, FileSystemLoader(tmpl_dir))
        else:
            self._template = None

    def __call__(self, page, text):
        """Convert text (unicode) in ReST syntax to HTML (unicode).

        `page` is the ``(page id, info dict)`` pair supplied by
        ``process_pages``; only the page id is used, as a fallback title.
        Returns an empty unicode string when nothing but whitespace is left
        after the ReST marker has been removed.
        """
        text, nsubst = self._rest_marker.subn('', text)
        text = text.strip()
        if not text:
            return u''

        if nsubst:
            # we have ReST markup
            overrides = dict(input_encoding='unicode',
                             output_encoding='unicode')
            rst = publish_parts(text, settings_overrides=overrides,
                                writer_name='html')
        else:
            try:
                from html import escape
            except ImportError:
                from cgi import escape
            # quote=False: escape only &, < and >, in both implementations.
            rst = dict(html_body='<pre>\n%s\n</pre>\n' % escape(text, False))

        if self._template:
            # Fallback title is the last path component of the page id
            # (indexing the id with [-1] yielded only its final character).
            rst.setdefault('title', page[0].split('/')[-1])
            return self._template.render(Context(rst))
        # Prefer an 'html' entry if there is one, else the body fragment.
        return rst.get('html', rst['html_body'])


class RegExFilter(object):
    """Page filter that applies a list of regex substitutions to page text.

    ``filters`` holds ``(pattern, replacement)`` pairs. Subclasses may
    assign their own ``self.filters`` list before calling ``__init__``.
    """

    filters = [
        # remove MoinMoin comments
        (r'^##.*$', ''),
        # remove MoinMoin processing instructions
        # but leave in parser pi ('format')
        (r'(?i)^#(acl|language|pragma|redirect|refresh)\s+.*$', ''),
        # remove macro directives
        (r'^\.\. macro:: .*$', ''),
    ]

    def __init__(self):
        """Compile every pattern in ``self.filters`` with ``re.M``.

        The compiled pairs go into a new per-instance list. Overwriting the
        entries in place would modify the shared class-level list, and a
        second instantiation would then pass already-compiled patterns plus
        flags to ``re.compile``, which raises ValueError.
        """
        self.filters = [(re.compile(pattern, re.M), subst)
                        for pattern, subst in self.filters]

    def __call__(self, page, text):
        """Apply all regexes in self.filters to text and return the result.

        `page` is accepted for the common filter signature but is not used.
        """
        for rx, subst in self.filters:
            text = rx.sub(subst, text)
        return text


class LinkFixer(RegExFilter):
    """Strip namespace prefix from links and append file extension.

    FIXME: This is far from complete and correct. Needs more testing
    """

    def __init__(self):
        """Read namespace, file extension and root URL from global `options`."""
        self.ns = options.namespace
        self.ext = options.fileext
        self.rurl = options.root_url
        # splits "Title <target>" into its title and target parts
        self.url_rx = re.compile(r'(?P<title>.*?)\s+<(?P<url>.+?)>')
        self.filters = [
            # fix links of form `SomePage`_
            # (only if they don't start with the namespace prefix)
            #(r'`([^<`]*?)`_' , r'`\1 <%s/\1>`_' % rurl ),

            # fix links of form `SomePage`_
            (r'`(?P<url>.*?)`_', self.handle_shortlink),

            # fix links of form `Title <1.0/SomePage>`_
            #(r'`(.*?)\s+<%s/(.*?)>`_' % ns, r'`\1 <\2%s>`_' % ext),

            # fix links of form .. _Title: 1.0/SomePage
            #(r'^\.\. _(.*?): %s/(.*?)' % ns, r'.. _\1: \2%s' % ext),
        ]
        # the base class compiles the patterns in self.filters
        super(LinkFixer, self).__init__()

    def handle_shortlink(self, m):
        """Return the replacement for one matched `...`_ reference.

        `m` is the match object of the link pattern; its ``url`` group is
        the text between the backquotes, either a bare target or
        ``Title <target>``.

        - A target inside the namespace becomes a local file name
          (``xlate_pagename`` result plus file extension).
        - Any other target without ``://`` is prefixed with the root URL.
        - A target containing ``://`` is left unchanged.

        An explicit title is kept. Otherwise the title defaults to the last
        path component (namespace targets) or to the target itself.
        """
        target = m.group('url')
        title = None
        explicit = self.url_rx.search(target)
        if explicit:
            target = explicit.group('url')
            # Previously this title was overwritten in every branch below.
            title = explicit.group('title')

        if target.startswith('%s/' % self.ns):
            default_title = target.split('/')[-1]
            target = xlate_pagename(target) + self.ext
        elif '://' not in target:
            default_title = target
            target = '%s/%s' % (self.rurl, target)
        else:
            default_title = target
        return '`%s <%s>`_' % (title or default_title, target)


def xlate_pagename(pagename, sep='__'):
    """Transform pagename by converting forward slashes into sep.

    The part before the first slash (a namespace prefix, or the empty
    string for names starting with '/') is dropped. A name without any
    slash is returned unchanged; it used to collapse to the empty string,
    leaving only the file extension as the saved file name.
    """
    parts = pagename.split('/')
    if len(parts) == 1:
        return pagename
    return sep.join(parts[1:])


def get_page_urls():
    """Get TitleIndex page from TG wiki and parse it for list of doc URLs.

    Reads ``titleindex``, ``namespace``, ``root_url`` and ``parent_url``
    from the module-global ``options``.

    Returns a list of ``(key, info)`` pairs. ``key`` is the link path with
    leading slashes and the namespace prefix removed (``u'index'`` if
    nothing is left). ``info`` is a dict with the page ``'url'`` and, for
    links whose query string is ``action=AttachFile``,
    ``'attachments': True``.

    Raises IOError if the response status does not start with '200'.
    """
    logging.info("Dowloading wiki TitleIndex from '%s'" % options.titleindex)
    req = urllib.urlopen(options.titleindex)
    try:
        status = req.headers.get('Status')
        if status is None and hasattr(req, 'getcode'):
            # The Status header may be missing; fall back to the response
            # code instead of failing on None.startswith().
            status = str(req.getcode())
        if not (status or '').startswith('200'):
            raise IOError('Could not retrieve TitleINdex from %s' %
                          options.titleindex)
        html = req.read()
    finally:
        req.close()
    soup = BeautifulSoup(html)

    logging.info("Parsing TitleIndex to find documentation pages")
    urls = dict()
    for link in soup.findAll('a', href=True):
        if not (link.string and link.string.startswith(options.namespace)):
            continue
        parts = urlparse.urlparse(link['href'])
        rurl = parts[2]
        # remove '/' + namespace on left side of URL
        key = rurl.lstrip('/')[len(options.namespace):]
        if not key:
            key = u'index'
        info = urls.setdefault(key, dict())
        # flag pages with attachments
        if parts[4] == 'action=AttachFile':
            info['attachments'] = True
        if rurl.startswith('/'):
            info['url'] = options.root_url + rurl
        else:
            info['url'] = options.parent_url + rurl
    return list(urls.items())


def process_pages(pages, page_filters=None):
    """Get all pages in 'pages' in ReST format, fix links and save them.

    `pages` is an iterable of ``(page id, info dict)`` pairs as returned by
    ``get_page_urls``; each info dict must have a ``'url'`` entry.
    `page_filters` is a sequence of ``(description, callable)`` pairs. Each
    callable is called as ``f((pid, page), text)`` and returns the new text.

    With ``options.dryrun`` set nothing is downloaded, filtered or saved;
    the steps are only logged. A page that cannot be downloaded triggers a
    warning and is skipped.
    """
    if page_filters is None:
        page_filters = []

    for pid, page in pages:
        url = page['url']
        logging.info('Dowloading %s from %s' % (pid, url))
        rst = None
        if not options.dryrun:
            try:
                req = urllib.urlopen(url + '?action=raw')
                try:
                    status = req.headers.get('status')
                    if status is None and hasattr(req, 'getcode'):
                        # The status header may be missing; fall back to
                        # the response code.
                        status = str(req.getcode())
                    if not (status or '').startswith('200'):
                        raise IOError
                    raw = req.read()
                finally:
                    req.close()
            except IOError:
                warnings.warn('Could not download %s' % url)
                continue
            # NOTE(review): assumes the wiki serves raw pages as UTF-8 --
            # confirm. The implicit ASCII decode used before failed on any
            # non-ASCII page.
            rst = raw.decode('utf-8', 'replace')

        for desc, f in page_filters:
            logging.info("Applying filter '%s' to '%s'" % (desc, pid))
            if not options.dryrun:
                rst = f((pid, page), rst)

        if rst:
            save_page(pid, pages, rst)
        else:
            logging.info("Empty page '%s' will not be saved" % pid)


def save_page(pagename, pages, content):
    """Write `content` to the output file for `pagename`.

    The path is ``options.destdir`` joined with the translated page name
    plus ``options.fileext``. `pages` is accepted but not used. With
    ``options.dryrun`` set the target path is only logged.
    """
    path = join(options.destdir, xlate_pagename(pagename)) + options.fileext
    logging.info("Saving page to '%s'" % path)
    if options.dryrun:
        return

    try:
        os.makedirs(dirname(path))
    except OSError:
        # Best effort (e.g. the directory already exists); a directory that
        # is really missing makes open() below fail.
        pass

    if isinstance(content, type(u'')):
        # The file is opened in binary mode, so encode text explicitly.
        # UTF-8 matches the charset declared in the sample template.
        content = content.encode('utf-8')

    fo = open(path, 'wb')
    try:
        fo.write(content)
    finally:
        fo.close()


def main(args):
    """Parse the command line, download the wiki pages and save them.

    `args` is the argument list without the program name. The parsed
    options are stored in the module-global ``options``, which the other
    functions and ``LinkFixer`` read. Returns 0.
    """
    global options, optparser

    page_filters = [
        ('Regular expression filter', RegExFilter()),
        ('Strip whitespace', lambda page, text: text.strip()),
    ]

    optparser = OptionParser(prog=__program__,
                             version=__version__, description=__doc__)
    optparser.add_option(
        "-v", "--verbose",
        action="store_true", dest="verbose", default=False,
        help="Print what's going on to stdout.")
    optparser.add_option(
        "-n", "--dry-run",
        action="store_true", dest="dryrun", default=False,
        help="Don't actually download or save pages (use with -v).")
    optparser.add_option(
        "-i", "--title-index",
        action="store", dest="titleindex", metavar='URL',
        default='http://docs.turbogears.org/TitleIndex',
        help="Give URL to TitleIndex page on the TurboGears docs wiki.")
    optparser.add_option(
        "-s", "--name-space",
        action="store", dest="namespace", default='1.0', metavar='PREFIX',
        help="Specify the wiki namespace which should be downloaded.")
    optparser.add_option(
        "-d", "--dest-dir",
        action="store", dest="destdir", default=os.curdir, metavar='DIR',
        help="Destination directory where pages are saved. Default: current directory")
    optparser.add_option(
        "-f", "--format",
        action="store", dest="format", default='html',
        help="Output format (rest, html). Default: html")
    optparser.add_option(
        "-t", "--template",
        action="store", dest="template", metavar='TEMPLATE',
        help="Jinja template for ReST to HTML conversion.")
    optparser.add_option(
        "-e", "--file-ext",
        action="store", dest="fileext", default='.html', metavar='EXT',
        help="File extension for saved pages. Default: '.html'")

    (options, args) = optparser.parse_args(args=args)

    # Derive scheme://host (root_url) and scheme://host/path (parent_url)
    # from the TitleIndex URL.
    parts = urlparse.urlparse(options.titleindex)
    options.root_url = urlparse.urlunparse(parts[:2] + ('', '', '', ''))
    options.parent_url = \
        urlparse.urlunparse(parts[:2] + (parts[2] or '/', '', '', ''))

    if options.format == 'html':
        # NOTE(review): this overrides any -e/--file-ext value for HTML
        # output -- confirm that is intended.
        options.fileext = '.html'
    # LinkFixer reads options.fileext, so create it after the override above.
    page_filters.append(('Fix Links', LinkFixer()))
    if options.verbose:
        logging.basicConfig(level=logging.INFO)
    if options.format == 'html':
        rest2html = ReST2HTML()
        if options.template:
            rest2html.set_template(options.template)
        page_filters.append(('ReST to HTML', rest2html))

    pages = get_page_urls()
    if options.verbose:
        from pprint import pformat
        logging.debug(pformat(pages))
    process_pages(pages, page_filters)
    return 0


# Script entry point: run main() and use its return value as exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))