convert-sphinx.py

   1 import os, glob
   2 from bs4 import BeautifulSoup
   3 from subprocess import run, PIPE
   4 from urllib.parse import urlparse
   5 import dashtable
   6
   7 def html_to_rst(input):
   8     return run(['pandoc', '-f', 'html', '-t', 'rst'],
   9                input=input, stdout=PIPE, universal_newlines=True).stdout
  10
  11 def convert_toc(filename):
  12     with open(filename, encoding='utf8') as input:
  13         soup = BeautifulSoup(input, 'html5lib')
  14         body = soup.find('body')
  15         with open('./docs/contents.rst', 'w', encoding='utf-8') as output:
  16             for elm in body.contents:
  17                 if elm.name == 'h2':
  18                     output.write(""".. toctree::
  19    :maxdepth: 1
  20    :caption: {0}
  21    :hidden:\n""".format(elm.get_text()))
  22                 elif elm.name == 'ul':
  23                     output.write('\n')
  24                     for li in elm.contents:
  25                         if li.name == 'li':
  26                             a = li.find('a')
  27                             url = a['href']
  28                             if url == 'index.html':
  29                                 output.write('   self\n')
  30                             elif bool(urlparse(url).netloc):
  31                                 output.write('   {0} <{1}>\n'.format(a.get_text(), url))
  32                             else:
  33                                 output.write('   {0}\n'.format(url[:-5]))
  34                     output.write('\n')
  35                 elif elm.name == 'dl':
  36                     a = elm.find('a')
  37                     output.write('\n   {0} <{1}>\n'.format(a.get_text(), url))
  38                 elif hasattr(elm, 'contents'):
  39                     print('**** UNKNOWN: ' + str(elm))
  40                     exit(1)
  41     print("SUCCESS: " + filename)
  42
  43 def convert_article(filename):
  44     with open(filename, encoding='utf8') as input:
  45         soup = BeautifulSoup(input, 'html5lib')
  46
  47         table = None
  48         if filename == './docs/release-calendar.html':
  49             table = dashtable.html2rst(str(soup.table.extract()))
  50
  51         content = soup.find('div', 'content')
  52         content = ''.join(map(str, content.contents))
  53         content = html_to_rst(str(content))
  54
  55         if table:
  56             content = '\n'.join([content, table, ''])
  57
  58         with open(os.path.splitext(filename)[0]+'.rst', 'w', encoding='utf-8') as output:
  59             output.write(str(content))
  60             if filename == './docs/relnotes.html':
  61                 output.write("""\n.. toctree::
  62    :maxdepth: 1
  63    :hidden:\n""")
  64                 output.write('\n')
  65                 for li in soup.findAll('li'):
  66                     a = li.find('a')
  67                     url = a['href']
  68                     split = os.path.splitext(url)
  69                     if split[1] == '.html':
  70                         output.write('   {0}\n'.format(split[0]))
  71                 output.write('   Older Versions <versions>\n')
  72
  73     print("SUCCESS: " + filename)
  74
  75 for filename in glob.iglob('./docs/**/*.html', recursive=True):
  76     if filename == './docs/contents.html':
  77         convert_toc(filename)
  78     else:
  79         convert_article(filename)