source: dassmodus/trunk/dassmodus/nosferatu/nosferatu/tools/extract_onlinehelp_links.py@ 953

Last change on this file since 953 was 953, checked in by pstorz, on Sep 28, 2011 at 11:32:32 AM

first checkin

File size: 4.4 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4# $Id: extract_onlinehelp_links.py 10982 2010-09-15 11:02:25Z pstorz $
5
6import re
7import pprint
8from httplib import *
9
# Common URL prefix of the Bacula 5.0.x English manual; page file names and
# link anchors are appended to it to form complete URLs.
baseurl = 'http://www.bacula.org/5.0.x-manuals/en/main/main/'

# Manual page to download for each configuration type.
htmlpages = dict(
    client='Client_File_daemon_Configur.html',
    storagedaemon='Storage_Daemon_Configuratio.html',
    autochanger='Autochanger_Resource.html',
    director='Configuring_Director.html',
    console='Console_Configuration.html',
    monitor='Monitor_Configuration.html',
    messages='Messages_Resource.html',
)
# Sample of the "Table of Child-Links" markup, here the sub-section list of
# Configuring_Director.html. No live statement visible in this script reads
# it; it is apparently kept as an offline fixture for trying out the regular
# expressions RXP_TABLE_OF_CHILDLINKS and RXP_HREF without network access.
childtablelinks = '''
<!--Table of Child-Links-->
<A NAME="CHILD_LINKS"><STRONG>Subsections</STRONG></A>

<UL CLASS="ChildLinks">

<LI><A NAME="tex2html1449"
 HREF="Configuring_Director.html#SECTION001810000000000000000">Director Resource Types</A>
<LI><A NAME="tex2html1450"
 HREF="Configuring_Director.html#SECTION001820000000000000000">The Director Resource</A>
<LI><A NAME="tex2html1451"
 HREF="Configuring_Director.html#SECTION001830000000000000000">The Job Resource</A>
<LI><A NAME="tex2html1452"
 HREF="Configuring_Director.html#SECTION001840000000000000000">The JobDefs Resource</A>
<LI><A NAME="tex2html1453"
 HREF="Configuring_Director.html#SECTION001850000000000000000">The Schedule Resource</A>
<LI><A NAME="tex2html1454"
 HREF="Configuring_Director.html#SECTION001860000000000000000">Technical Notes on Schedules</A>
<LI><A NAME="tex2html1455"
 HREF="Configuring_Director.html#SECTION001870000000000000000">The FileSet Resource</A>
<LI><A NAME="tex2html1456"
 HREF="Configuring_Director.html#SECTION001880000000000000000">FileSet Examples</A>
<LI><A NAME="tex2html1457"
 HREF="Configuring_Director.html#SECTION001890000000000000000">Backing up Raw Partitions</A>

<LI><A NAME="tex2html1458"
 HREF="Configuring_Director.html#SECTION0018100000000000000000">Excluding Files and Directories</A>
<LI><A NAME="tex2html1459"
 HREF="Configuring_Director.html#SECTION0018110000000000000000">Windows FileSets</A>
<UL>
<LI><A NAME="tex2html1460"
 HREF="Configuring_Director.html#SECTION0018110010000000000000">A Windows Example FileSet</A>
<LI><A NAME="tex2html1461"
 HREF="Configuring_Director.html#SECTION0018110020000000000000">Windows NTFS Naming Considerations</A>
</UL>
<BR>
<LI><A NAME="tex2html1462"
 HREF="Configuring_Director.html#SECTION0018120000000000000000">Testing Your FileSet</A>
<LI><A NAME="tex2html1463"
 HREF="Configuring_Director.html#SECTION0018130000000000000000">The Client Resource</A>
<LI><A NAME="tex2html1464"
 HREF="Configuring_Director.html#SECTION0018140000000000000000">The Storage Resource</A>

<LI><A NAME="tex2html1465"
 HREF="Configuring_Director.html#SECTION0018150000000000000000">The Pool Resource</A>
<UL>
<LI><A NAME="tex2html1466"
 HREF="Configuring_Director.html#SECTION0018151000000000000000">The Scratch Pool</A>
</UL>
<BR>
<LI><A NAME="tex2html1467"
 HREF="Configuring_Director.html#SECTION0018160000000000000000">The Catalog Resource</A>
<LI><A NAME="tex2html1468"
 HREF="Configuring_Director.html#SECTION0018170000000000000000">The Messages Resource</A>
<LI><A NAME="tex2html1469"
 HREF="Configuring_Director.html#SECTION0018180000000000000000">The Console Resource</A>
<LI><A NAME="tex2html1470"
 HREF="Configuring_Director.html#SECTION0018190000000000000000">The Counter Resource</A>
<LI><A NAME="tex2html1471"
 HREF="Configuring_Director.html#SECTION0018200000000000000000">Example Director Configuration File</A>

</UL>
<!--End of Table of Child-Links-->
'''
84
85
# Captures the child-link list of a manual page. MULTILINE pins the opening
# and the closing tag to the start of a line; DOTALL lets the greedy body
# span lines, so the match runs up to the LAST line-leading </UL> and nested
# sub-lists stay inside the match.
RXP_TABLE_OF_CHILDLINKS = re.compile(
    r'^<UL CLASS="ChildLinks">(.*)^</UL>',
    re.MULTILINE | re.DOTALL
)

# Matches links whose text ends in " Resource"; an optional leading "The " is
# consumed outside the 'title' group. 'anchor' is the HREF value, 'title' the
# bare resource name.
# A catch-all variant would be: HREF="(?P<anchor>[^"]+)">(?P<title>.*)</A>
RXP_HREF = re.compile(
    r'HREF="(?P<anchor>[^"]+)">(The )?(?P<title>.*) Resource</A>'
)

# Offline experiment against the embedded sample markup:
#   for href in RXP_TABLE_OF_CHILDLINKS.finditer(childtablelinks):
#       print href.groups()
93
94
95
# Result: {config name: {lower-cased resource title: absolute manual URL}}.
htmlhelp = {}

# A single connection is reused for all manual pages.
connection = HTTPConnection('www.bacula.org')
try:
    for config, page in htmlpages.items():
        connection.request('GET', baseurl + page)
        reply = connection.getresponse()
        print('%s %s' % (reply.status, reply.reason))
        # Read the body even for failed requests: the previous response has
        # to be consumed before the connection can send the next request.
        htmltext = reply.read()

        # Every config gets an entry, even when no link could be extracted.
        htmlhelp[config] = {}

        if reply.status != 200:
            # An error or redirect body is not a manual page; do not parse it.
            continue

        for childlinks in RXP_TABLE_OF_CHILDLINKS.finditer(htmltext):
            for href in RXP_HREF.finditer(childlinks.group(0)):
                title = href.group('title')
                anchor = href.group('anchor')
                print('%s %s link: %s' % (config, title, anchor))
                # Anchors are relative to the manual directory.
                htmlhelp[config][title.lower()] = baseurl + anchor
finally:
    # Release the socket even if a request or read raised.
    connection.close()

pp = pprint.PrettyPrinter(indent=4)

pp.pprint(htmlhelp)
Note: See TracBrowser for help on using the repository browser.