#!/usr/bin/env python
# -*- coding: utf-8 -*-
# $Id: extract_onlinehelp_links.py 10982 2010-09-15 11:02:25Z pstorz $
import re
import pprint
from httplib import *
# Root URL of the online manual.  Page file names and link targets are
# appended to it to build absolute URLs.
baseurl = 'http://www.bacula.org/5.0.x-manuals/en/main/main/'

# Configuration type -> file name of the manual page that is fetched for it.
htmlpages = {
    'client': 'Client_File_daemon_Configur.html',
    'storagedaemon': 'Storage_Daemon_Configuratio.html',
    'autochanger': 'Autochanger_Resource.html',
    'director': 'Configuring_Director.html',
    'console': 'Console_Configuration.html',
    'monitor': 'Monitor_Configuration.html',
    'messages': 'Messages_Resource.html',
}
# Sample text only: no live statement visible in this file reads it.  Kept so
# the module-level name stays defined.
childtablelinks = '''
Subsections
'''

# Matches one whole "child links" table of a manual page, from its opening
# marker at the start of a line up to the nearest closing marker.  re.S lets
# '.' cross newlines (the table spans several lines); re.M lets '^' match at
# every line start instead of only at the start of the page.
# NOTE(review): the two HTML comment markers were reconstructed.  The pattern
# used to be '^' alone, which can only match empty strings, so no link was
# ever found.  The markers are the delimiters LaTeX2HTML writes around its
# "Subsections" list -- confirm against a live manual page.
RXP_TABLE_OF_CHILDLINKS = re.compile(
    r'^<!--Table of Child-Links-->.*?<!--End of Table of Child-Links-->',
    re.M | re.S)

# Picks the link target ('anchor') and the resource name ('title') out of an
# entry such as   HREF="page.html#SEC">The Director Resource .  An optional
# leading "The " and the trailing " Resource" are left out of the title.
# The group names are the ones the extraction loop reads through
# group('anchor') and group('title'); a nameless '(?P[' is rejected by
# re.compile.
RXP_HREF = re.compile(
    r'HREF="(?P<anchor>[^"]+)">(The )?(?P<title>.*) Resource')
# Result: configuration type -> {lower-cased resource title: absolute URL}.
htmlhelp = {}

# A single connection is reused for every page in htmlpages.
connection = HTTPConnection('www.bacula.org')
try:
    for config, page in htmlpages.items():
        connection.request('GET', baseurl + page)
        reply = connection.getresponse()
        # Each print gets one pre-formatted string, so the emitted text is the
        # same whether print is a statement or a function.
        print('%s %s' % (reply.status, reply.reason))
        # The body is read completely before the next request is issued on
        # the same connection.
        htmltext = reply.read()

        # Every configuration type gets an entry, even when no link is found.
        links = htmlhelp[config] = {}
        for childlinks in RXP_TABLE_OF_CHILDLINKS.finditer(htmltext):
            # Only links inside a child-links table are considered.
            for href in RXP_HREF.finditer(childlinks.group(0)):
                title = href.group('title')
                anchor = href.group('anchor')
                print('%s %s link: %s' % (config, title, anchor))
                links[title.lower()] = baseurl + anchor
finally:
    # Release the socket even if a request or read fails part-way through.
    connection.close()

# Dump the collected mapping in a readable, indented form.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(htmlhelp)