#!/usr/bin/env python
# -*- coding: utf-8 -*-
# $Id: extract_onlinehelp_links.py 10982 2010-09-15 11:02:25Z pstorz $
"""Collect links to the "... Resource" sections of the Bacula online manual.

For every configuration page listed in ``htmlpages`` the page is downloaded,
the links in its child-links ("Subsections") table whose text ends in
" Resource" are extracted, and the result is pretty-printed as a nested dict
of the form ``{config: {lowercased resource title: absolute URL}}``.
"""

import re
import pprint

try:  # Python 2
    from httplib import HTTPConnection
except ImportError:  # Python 3
    from http.client import HTTPConnection

baseurl = 'http://www.bacula.org/5.0.x-manuals/en/main/main/'

# Configuration type -> manual page (relative to baseurl) documenting it.
htmlpages = {
    'client': 'Client_File_daemon_Configur.html',
    'storagedaemon': 'Storage_Daemon_Configuratio.html',
    'autochanger': 'Autochanger_Resource.html',
    'director': 'Configuring_Director.html',
    'console': 'Console_Configuration.html',
    'monitor': 'Monitor_Configuration.html',
    'messages': 'Messages_Resource.html',
}

# Unused sample of a child-links table, kept for manual regex experiments.
# NOTE(review): the HTML markup of this sample appears to have been lost.
childtablelinks = ''' Subsections '''

# Matches the child-links table of a manual page.
# NOTE(review): the previous pattern was just '^', which only ever yields
# empty matches, so no link could be found inside it.  The delimiters below
# are the comment markers LaTeX2HTML writes around its "Subsections" table;
# this is a reconstruction -- confirm against a real manual page.
RXP_TABLE_OF_CHILDLINKS = re.compile(
    '<!--Table of Child-Links-->.*?<!--End of Table of Child-Links-->',
    re.S)

# Matches a link whose text is "[The ]<title> Resource".
RXP_HREF = re.compile(
    'HREF="(?P<anchor>[^"]+)">(The )?(?P<title>.*) Resource</A>')

# Result: {config: {lowercased resource title: absolute URL}}; filled by main().
htmlhelp = {}


def iter_resource_links(htmltext):
    """Yield ``(title, anchor)`` for each resource link in *htmltext*.

    Only links located inside a child-links table (as matched by
    ``RXP_TABLE_OF_CHILDLINKS``) are considered.  *title* is the link text
    without the optional leading "The " and the trailing " Resource";
    *anchor* is the raw ``HREF`` value.  Pairs are yielded in document order.
    """
    for childlinks in RXP_TABLE_OF_CHILDLINKS.finditer(htmltext):
        for href in RXP_HREF.finditer(childlinks.group(0)):
            yield href.group('title'), href.group('anchor')


def main():
    """Download every page in ``htmlpages`` and print the collected links."""
    connection = HTTPConnection('www.bacula.org')
    try:
        for config, page in htmlpages.items():
            connection.request('GET', baseurl + page)
            reply = connection.getresponse()
            print('%s %s' % (reply.status, reply.reason))
            # Always drain the body so the connection can be reused for the
            # next request.
            htmltext = reply.read()
            if not isinstance(htmltext, str):
                # Python 3 returns bytes; latin-1 decoding cannot fail.
                htmltext = htmltext.decode('latin-1')
            htmlhelp[config] = {}
            if reply.status != 200:
                # Error and redirect bodies are not manual pages.
                continue
            for title, anchor in iter_resource_links(htmltext):
                print('%s %s link: %s' % (config, title, anchor))
                htmlhelp[config][title.lower()] = baseurl + anchor
    finally:
        connection.close()

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(htmlhelp)


if __name__ == '__main__':
    main()