Source code for wrdrd.tools.stripsinglehtml

#!/usr/bin/env python
# encoding: utf-8
from __future__ import print_function
"""
wrdrd.tools.stripsinglehtml
"""
import codecs
import bs4


def stripsinglehtml(path='index.html', parser=None):
    """
    Strip navigation markup from a Sphinx singlehtml file
    (rather than writing a sphinx [...]-er).

    The file is read as UTF-8 and parsed with BeautifulSoup; the following
    elements are then removed from the document body:

    - every ``<a class="headerlink">``
    - the first ``<div class="sphinxsidebar">``
    - the first ``<div id="indices-and-tables">``
    - every ``<div class="related">``

    Args:
        path (str): path to a Sphinx singlehtml file
        parser (str): optional BeautifulSoup parser name, passed through as
            ``features`` (e.g. ``'html.parser'`` or ``'lxml'``). ``None``
            (the default) lets BeautifulSoup pick a parser itself.
    Returns:
        bs4.element.Tag: the stripped ``<body>`` element, or the whole
        stripped document when the parser produced no ``<body>``
    """
    with codecs.open(path, 'r', encoding='utf8') as f:
        contents = f.read()

    soup = bs4.BeautifulSoup(contents, features=parser)

    # Some parsers do not synthesize a <body> for fragments; work on the
    # whole document in that case instead of failing on None.
    root = soup.find('body')
    if root is None:
        root = soup

    # find_all() returns a materialized list, so extracting while iterating
    # is safe.
    for anchor in root.find_all('a', {'class': 'headerlink'}):
        anchor.extract()

    # These are expected at most once per page; only the first match is
    # removed.
    for attrs in ({'class': 'sphinxsidebar'}, {'id': 'indices-and-tables'}):
        elem = root.find('div', attrs)
        if elem is not None:
            elem.extract()

    for related in root.find_all('div', {'class': 'related'}):
        related.extract()

    return root
import unittest
class Test_stripsinglehtml(unittest.TestCase):
    """Tests for stripsinglehtml using the fixture under ``./test/data``."""

    def test_stripsinglehtml(self):
        """The stripped output exists and has no header links or related bars."""
        # NOTE: the fixture path is relative to the current working directory.
        filename = './test/data/singlehtml.html'
        output = stripsinglehtml(filename)
        # bs4 Tags are always truthy, so check for None explicitly and then
        # verify that the elements removed via find_all() are really gone.
        self.assertIsNotNone(output)
        self.assertFalse(output.find_all('a', {'class': 'headerlink'}))
        self.assertFalse(output.find_all('div', {'class': 'related'}))
def main(*args):
    """
    :py:mod:`wrdrd.tools.stripsinglehtml` main method: print unicode
    stripsinglehtml output to ``stdout``.

    Args:
        args (list): list of commandline arguments; when empty,
            ``sys.argv[1:]`` is used instead
    Returns:
        int: zero
    Raises:
        Exception: if the arguments do not contain exactly one file path
    """
    import logging
    import optparse
    import sys

    prs = optparse.OptionParser(
        usage="%prog <path>",
        description="Strip markup from a Sphinx singlehtml HTML page")

    prs.add_option('-v', '--verbose',
                   dest='verbose',
                   action='store_true',)
    prs.add_option('-q', '--quiet',
                   dest='quiet',
                   action='store_true',)
    prs.add_option('-t', '--test',
                   dest='run_tests',
                   action='store_true',)

    # Parse the arguments handed to main(); only fall back to the process
    # command line when none were given.
    argv = list(args) if args else sys.argv[1:]
    (opts, args) = prs.parse_args(args=argv)

    if not opts.quiet:
        logging.basicConfig()

        if opts.verbose:
            logging.getLogger().setLevel(logging.DEBUG)

    if opts.run_tests:
        import unittest
        # Hand the remaining arguments to unittest without mutating sys.argv;
        # unittest.main() exits the process when the run finishes.
        unittest.main(argv=[sys.argv[0]] + args)

    if len(args) != 1:
        raise Exception("Must specify a file path")
    path = args[0]

    bs = stripsinglehtml(path)

    # Encode to UTF-8 through a local writer instead of rebinding sys.stdout.
    # On Python 3 the bytes go to the underlying binary buffer; on Python 2
    # sys.stdout itself accepts bytes.
    raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    writer = codecs.getwriter('utf-8')(raw_stdout, errors='replace')
    # Tag.decode() returns the unicode text on both Python 2 and Python 3.
    writer.write(bs.decode())
    writer.write(u'\n')
    return 0
if __name__ == "__main__":
    # Script entry point: run main() on the process command line and use
    # its return value as the exit status.
    import sys
    exit_status = main()
    sys.exit(exit_status)