Codebase list archmage / 64c9807
New upstream version 0.4.0 Mikhail Gusarov 4 years ago
128 changed file(s) with 1391 addition(s) and 1714 deletion(s). Raw diff Collapse all Expand all
00 Copyright (c) 2003 Eugeny Korekin <az@ftc.ru>
11 Copyright (c) 2005-2009 Basil Shubin <basil.shubin@gmail.com>
2 Copyright (c) 2015 Mikhail Gusarov <dottedmag@dottedmag.net>
2 Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
0 include archmod/arch.conf
1 recursive-include archmod/templates *.html *.css *.gif
2 include AUTHORS COPYING INSTALL NEWS README archmage.1
3 include RELEASE-VERSION version.py
0 include archmage/arch.conf
1 recursive-include archmage/templates *.html *.css *.gif
2 include AUTHORS COPYING NEWS README.md archmage.1
0 arCHMage 0.4
1 ============
2 Changes:
3
4 * Works with Python 3.5+ (#10).
5
6 Bugfixes:
7
8 * Fix HTML conversion under Windows (#6).
9
10 Removals:
11
12 * mod_chm and option -p were removed. Extract CHM files to the filesystem and
13 use a real HTTP server to serve them.
14
015 arCHMage 0.3.1
116 ==============
217 Bug fixes:
0 Metadata-Version: 1.1
0 Metadata-Version: 1.2
11 Name: archmage
2 Version: 0.3.1
2 Version: 0.4.0
33 Summary: CHM decompressor
44 Home-page: https://github.com/dottedmag/archmage
5 Author: Mikhail Gusarov
6 Author-email: dottedmag@dottedmag.net
5 Maintainer: Mikhail Gusarov
6 Maintainer-email: dottedmag@dottedmag.net
77 License: GPLv2+
88 Description: arCHMage is a reader and decompressor for CHM format
99 Keywords: chm,HTML Help,Compiled HTML,Compressed HTML
0 arCHMage
1 ========
2
3 arCHMage converts CHM files to HTML, plain text and PDF. CHM is the format used
4 by Microsoft HTML Help, also known as Compiled HTML.
5
6 [![Latest Version](https://img.shields.io/pypi/v/archmage.svg)](https://pypi.python.org/pypi/archmage/)
7 [![Downloads](https://img.shields.io/pypi/dm/archmage.svg)](https://pypi.python.org/pypi/archmage/)
8 [![License](https://img.shields.io/github/license/dottedmag/archmage.svg)](https://pypi.python.org/pypi/archmage/)
9
10 Usage
11 =====
12
13 Extract CHM content into directory
14 ----------------------------------
15
16 archmage -x <chmfile> [output directory]
17
18 Extraction does not overwrite existing directories.
19
20 Dump HTML data from CHM
21 -----------------------
22
23 archmage -d <chmfile>
24
25 Convert CHM file into another format
26 ------------------------------------
27
28 archmage -c (html|text|pdf) <chmfile> [output file]
29
30 This feature requires `htmldoc(1)` (for HTML and PDF output) and `lynx(1)` or `elinks(1)` (for text output) to be installed.
31
32 Installation
33 ============
34
35 pip install archmage
36
37 Requirements
38 ============
39
40 arCHMage has the following dependencies:
41
42 * Python 3.5+
43 * PyCHM
44 * BeautifulSoup4
45
46 Optional dependencies:
47
48 * htmldoc - converting to single HTML and PDF formats
49 (Debian/Ubuntu: `htmldoc`)
50 * Lynx or ELinks - converting to plain text
51 (Debian/Ubuntu: `lynx`)
+0
-1
RELEASE-VERSION less more
0 0.3.1
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
6 #
7 # This program is free software; you can redistribute it and/or modify it under
8 # the terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
10 # version.
11 #
12 # This program is distributed in the hope that it will be useful, but WITHOUT
13 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
15 # details.
16 #
17 # You should have received a copy of the GNU General Public License along with
18 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
19 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 #
21
22 import os
23 import sys
24 import re
25 import shutil
26 import errno
27 import string
28 import tempfile
29 from enum import Enum, auto
30
31 import archmage
32
33 from archmage.CHMParser import SitemapFile, PageLister, ImageCatcher, TOCCounter#, HeadersCounter
34
35 # import PyCHM bindings
36 try:
37 from chm import chmlib
38 except ImportError as msg:
39 sys.exit('ImportError: %s\nPlease check README file for system requirements.' % msg)
40
41 # External file converters
42 from archmage.chmtotext import chmtotext
43 from archmage.htmldoc import htmldoc
44
class Action(Enum):
    """Operations that can be requested for a CHM file."""

    # The numbers are written out; they are the same ones auto() hands out
    # (1, 2, 3, ... in declaration order).
    EXTRACT = 1
    DUMPHTML = 2
    CHM2TXT = 3
    CHM2HTML = 4
    CHM2PDF = 5
51
# Matches a '..' path component: '..' preceded by the start of the string, '/'
# or '\', and followed by '/', '\' or the end of the string. extract_entries()
# refuses to extract any entry whose name matches.
PARENT_RE = re.compile(r'(^|/|\\)\.\.(/|\\|$)')
53
class CHMFile:
    """A CHM file opened through chmlib, with extraction/conversion helpers.

    The archmage config file (``archmage.config``) is executed with this
    instance's ``__dict__`` as its namespace, so its settings (``auxes``,
    ``title``, ``bcolor``, ``fcolor``, ``templates_dir``, ``icons_dir``,
    ``maxtoclvl``, ``chmtotext``, ``chmtohtml``, ``chmtopdf``,
    ``htmldoc_exec``, ...) are read below as plain instance attributes.

    Derived data (entry list, page list, image map, ...) is computed lazily
    and memoised in ``self.cache``.
    """

    def __init__(self, name):
        """Open the CHM file *name*, load the config and parse the ToC."""
        self.cache = {}
        # Path of the source CHM file; the config file reads it as 'sourcename'
        self.sourcename = name
        self._chm = chmlib.chm_open(name)
        # Import variables from config file into namespace.
        # NOTE: the config file is executed as Python code, so it must be
        # trusted. The file is closed as soon as it has been read.
        with open(archmage.config, "rb") as config_file:
            config_source = config_file.read()
        exec(compile(config_source, archmage.config, 'exec'), self.__dict__)

        # build regexp from the list of auxiliary files
        self.aux_re = '|'.join([re.escape(s) for s in self.auxes])

        # Get and parse 'Table of Contents'
        try:
            self.topicstree = self.topics()
        except AttributeError:
            self.topicstree = None
        self.contents = SitemapFile(self.topicstree).parse()

    def close(self):
        """Close the underlying chmlib handle."""
        chmlib.chm_close(self._chm)

    def _cached(self, key, compute):
        """Return ``self.cache[key]``, calling *compute* once to fill it."""
        if key not in self.cache:
            self.cache[key] = compute()
        return self.cache[key]

    def entries(self):
        """Return the (cached) list of entry paths stored in the CHM file."""
        return self._cached('entries', self._entries)

    def _entries(self):
        def get_name(chmfile, ui, out):
            # Enumeration callback: collect every path except the root '/'
            path = ui.path.decode('utf-8')
            if path != '/':
                out.append(path)
            return chmlib.CHM_ENUMERATOR_CONTINUE

        out = []
        if chmlib.chm_enumerate(self._chm, chmlib.CHM_ENUMERATE_ALL, get_name, out) == 0:
            sys.exit('UnknownError: CHMLIB or PyCHM bug?')
        return out

    # retrieves the list of HTML files contained into the CHM file, **in order**
    # (that's the important bit).
    # (actually performed by the PageLister class)
    def html_files(self):
        """Return the (cached) ordered list of HTML pages named by the ToC."""
        return self._cached('html_files', self._html_files)

    def _html_files(self):
        lister = PageLister()
        lister.feed(self.topicstree)
        return lister.pages

    # retrieves the list of images urls contained into the CHM file.
    # (actually performed by the ImageCatcher class)
    def image_urls(self):
        """Return the (cached) list of image URLs found in the HTML pages."""
        return self._cached('image_urls', self._image_urls)

    def _image_urls(self):
        out = []
        image_catcher = ImageCatcher()
        for file in self.html_files():
            image_catcher.feed(CHMEntry(self, file).correct())
            for image_url in image_catcher.imgurls:
                if image_url not in out:
                    out.append(image_url)
        return out

    # retrieves a dictionary of actual file entries and corresponding urls into the CHM file
    def image_files(self):
        """Return the (cached) mapping {CHM entry path: image URL}."""
        return self._cached('image_files', self._image_files)

    def _image_files(self):
        out = {}
        entries = self.entries()
        for image_url in self.image_urls():
            for entry in entries:
                lowered = entry.lower()
                # Plain substring test: the URL is text, not a regular
                # expression, so characters such as '(' or '+' in a file name
                # must not be interpreted as regex syntax.
                # NOTE(review): the duplicate check compares the lower-cased
                # name against keys stored in their original case - confirm
                # that this is the intended de-duplication.
                if image_url in lowered and lowered not in out:
                    out[entry] = image_url
        return out

    # Get topics file
    def topics(self):
        """Return the (cached) content of the first '.hhc' entry, if any."""
        return self._cached('topics', self._topics)

    def _topics(self):
        for e in self.entries():
            if e.lower().endswith('.hhc'):
                return CHMEntry(self, e, frontpage=self.frontpage()).get()

    # use first page as deftopic. Note: without heading slash
    def deftopic(self):
        """Return the (cached) lower-cased first page, without leading '/'."""
        return self._cached('deftopic', self._deftopic)

    def _deftopic(self):
        first_page = self.html_files()[0]
        if first_page.startswith('/'):
            return first_page.replace('/', '', 1).lower()
        return first_page.lower()

    # Get frontpage name
    def frontpage(self):
        """Return the (cached) name chosen for the generated front page."""
        return self._cached('frontpage', self._frontpage)

    def _frontpage(self):
        frontpage = os.path.join('/', 'index.html')
        index = 2  # index2.html and etc.
        for filename in self.entries():
            if frontpage == filename:
                frontpage = os.path.join('/', ('index%s.html' % index))
                index += 1
        return frontpage

    # Get all templates files
    def templates(self):
        """Return the (cached) template names that do not clash with entries."""
        return self._cached('templates', self._templates)

    def _templates(self):
        out = []
        existing = set(self.entries())
        for file in os.listdir(self.templates_dir):
            if os.path.isfile(os.path.join(self.templates_dir, file)):
                if os.path.join('/', file) not in existing:
                    out.append(os.path.join('/', file))
        return out

    # Get ToC levels
    def toclevels(self):
        """Return the (cached) ToC depth, capped at the configured maxtoclvl."""
        return self._cached('toclevels', self._toclevels)

    def _toclevels(self):
        counter = TOCCounter()
        counter.feed(self.topicstree)
        return min(counter.count, self.maxtoclvl)

    def get_template(self, name):
        """Get template file by its name, with config values substituted."""
        if name == self.frontpage():
            template_name = 'index.html'
        else:
            template_name = os.path.basename(name)
        with open(os.path.join(self.templates_dir, template_name)) as template_file:
            tpl = template_file.read()
        params = {
            'title': self.title,
            'contents': self.contents,
            'deftopic': self.deftopic(),
            'bcolor': self.bcolor,
            'fcolor': self.fcolor,
        }
        return string.Template(tpl).substitute(params)

    def process_templates(self, destdir="."):
        """Render every template into *destdir* and copy the icons there."""
        for template in self.templates():
            target = os.path.join(destdir, os.path.basename(template))
            with open(target, 'w') as out:
                out.write(self.get_template(template))
        if self.frontpage() not in self.templates():
            target = os.path.join(destdir, os.path.basename(self.frontpage()))
            with open(target, 'w') as out:
                out.write(self.get_template('index.html'))
        if not os.path.exists(os.path.join(destdir, 'icons/')):
            shutil.copytree(os.path.join(self.icons_dir), os.path.join(destdir, 'icons/'))

    def extract_entry(self, entry, output_file, destdir=".", correct=False):
        """Write CHM entry *entry* to *destdir*/*output_file* (lower-cased).

        With ``correct=True`` the cleaned-up content (CHMEntry.correct()) is
        written, otherwise the content from CHMEntry.get().

        Raises RuntimeError when *output_file* contains a '..' component.
        """
        # process output entry, remove leading '/' in entry name; stripping
        # every leading slash keeps os.path.join() from treating the rest as
        # an absolute path and ignoring destdir
        fname = output_file.lower().lstrip('/')
        if PARENT_RE.search(fname):
            raise RuntimeError('Giving up on malicious name: %s' % output_file)
        target = os.path.join(destdir, fname)
        # get directory name for file fname if any
        dname = os.path.dirname(target)
        # if dname is a directory and it does not exist, then create it
        if dname and not os.path.exists(dname):
            os.makedirs(dname)
        # directory entries need nothing beyond the directory itself
        if os.path.isdir(target):
            return
        # write CHM entry content into the file, corrected or as is
        chm_entry = CHMEntry(self, entry)
        content = chm_entry.correct() if correct else chm_entry.get()
        if isinstance(content, str):
            # CHMEntry hands back '' for entries it could not read; the file
            # is opened in binary mode, so convert instead of crashing
            content = content.encode('utf-8')
        with open(target, 'wb') as out:
            out.write(content)

    def extract_entries(self, entries=None, destdir=".", correct=False):
        """Extract raw CHM entries into the files.

        Auxiliary entries (matching ``aux_re``) are skipped. Raises
        RuntimeError for an entry name containing a '..' component.
        """
        for e in entries or ():
            # if entry is auxiliary file, than skip it
            if re.match(self.aux_re, e):
                continue
            if PARENT_RE.search(e):
                raise RuntimeError('Giving up on malicious name: %s' % e)
            self.extract_entry(e, output_file=e, destdir=destdir, correct=correct)

    def extract(self, destdir):
        """Extract CHM file content into FS.

        Exits with a message when *destdir* already exists; any other OSError
        is propagated instead of being silently dropped.
        """
        try:
            # Create destination directory
            os.mkdir(destdir)
        except OSError as error:
            if error.errno == errno.EEXIST:
                sys.exit('%s is already exists' % destdir)
            raise
        # make raw content extraction
        self.extract_entries(entries=self.entries(), destdir=destdir)
        # process templates
        self.process_templates(destdir=destdir)

    def dump_html(self, output=sys.stdout):
        """Dump HTML data from CHM file into standard output"""
        for e in self.html_files():
            # if entry is auxiliary file, than skip it
            if re.match(self.aux_re, e):
                continue
            # NOTE(review): CHMEntry.get() appears to return bytes for
            # readable entries, so print() would emit their repr - confirm
            # whether the data should be decoded first.
            print(CHMEntry(self, e).get(), file=output)

    def chm2text(self, output=sys.stdout):
        """Convert CHM into Single Text file"""
        for e in self.html_files():
            # if entry is auxiliary file, than skip it
            if re.match(self.aux_re, e):
                continue
            # to use this function you should have 'lynx' or 'elinks' installed
            chmtotext(input=CHMEntry(self, e).get(), cmd=self.chmtotext, output=output)

    def htmldoc(self, output, format=Action.CHM2HTML):
        """CHM to other file formats converter using htmldoc.

        *output* is the destination file name (spaces become '_'); *format*
        is Action.CHM2HTML or Action.CHM2PDF. The pages are extracted into a
        temporary directory that is removed again even when conversion fails.

        Raises ValueError for any other *format*.
        """
        output = output.replace(' ', '_')
        # Extract CHM content into temporary directory
        tempdir = tempfile.mkdtemp(prefix=output.rsplit('.', 1)[0])
        try:
            self.extract_entries(entries=self.html_files(), destdir=tempdir, correct=True)
            # List of temporary files
            files = [os.path.abspath(tempdir + file.lower()) for file in self.html_files()]
            if format == Action.CHM2HTML:
                options = self.chmtohtml
                # change output from single html file to a directory with html file and images
                if self.image_files():
                    dirname = archmage.file2dir(output)
                    if os.path.exists(dirname):
                        sys.exit('%s is already exists' % dirname)
                    os.mkdir(dirname)
                    # Extract all images
                    for key, value in list(self.image_files().items()):
                        self.extract_entry(entry=key, output_file=value, destdir=dirname)
                    # Fix output file name
                    output = os.path.join(dirname, output)
            elif format == Action.CHM2PDF:
                options = self.chmtopdf
                if self.image_files():
                    # Extract all images
                    for key, value in list(self.image_files().items()):
                        self.extract_entry(entry=key, output_file=key.lower(), destdir=tempdir)
            else:
                raise ValueError('Unsupported htmldoc output format: %r' % (format,))
            # toclevels is a method: pass the computed level count, not the
            # bound method object
            htmldoc(files, self.htmldoc_exec, options, self.toclevels(), output)
        finally:
            # Remove temporary files
            shutil.rmtree(path=tempdir)
315
class CHMEntry(object):
    """Class for CHM file entry"""

    # (pattern, replacement) pairs applied in order by correct() to strip
    # unwanted navigation markup. They are bytes patterns because the entry
    # content is handled as bytes (see lower_links / add_restoreframing_js).
    _UNWANTED = (
        (rb'<div .*teamlib\.gif.*\/div>', b''),
        (rb'<a href.*>\[ Team LiB \]<\/a>', b''),
        (rb'<table.*larrow\.gif.*rarrow\.gif.*<\/table>', b''),
        (rb'<a href.*next\.gif[^>]*><\/a>', b''),
        (rb'<a href.*previous\.gif[^>]*><\/a>', b''),
        (rb'<a href.*prev\.gif[^>]*><\/a>', b''),
        (rb'"[^"]*previous\.gif"', b'""'),
        (rb'"[^"]*prev\.gif"', b'""'),
        (rb'"[^"]*next\.gif"', b'""'),
    )

    def __init__(self, parent, name, frontpage='index.html'):
        # parent CHM file
        self.parent = parent
        # object inside CHM file
        self.name = name
        # frontpage name to substitute
        self.frontpage = os.path.basename(frontpage)

    def read(self):
        """Read CHM entry content.

        Returns None when the object cannot be resolved or is empty.
        """
        result, ui = chmlib.chm_resolve_object(self.parent._chm, self.name.encode('utf-8'))
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return None

        size, content = chmlib.chm_retrieve_object(self.parent._chm, ui, 0, ui.length)
        if size == 0:
            return None
        return content

    def _is_html(self):
        """Tell whether the entry name ends in .htm/.html (any case)."""
        return re.search(r'(?i)\.html?$', self.name) is not None

    def lower_links(self, text):
        """Links to lower case"""
        return re.sub(rb'(?i)(href|src)\s*=\s*([^\s|>]+)', lambda m: m.group(0).lower(), text)

    def add_restoreframing_js(self, name, text):
        """Replace every <body> tag in *text* with one followed by a script
        that writes a 'show framing' link back to the front page."""
        name = re.sub('/+', '/', name)
        depth = name.count('/')

        js = b"""<body><script language="javascript">
if (window.name != "content")
    document.write("<center><a href='%s%s?page=%s'>show framing</a></center>")
</script>""" % (b'../' * depth, self.frontpage.encode('utf8'), name.encode('utf8'))

        # A callable replacement inserts js literally; passing js itself would
        # make re.sub interpret backslashes in the entry name as escapes.
        return re.sub(rb'(?i)<\s*body\s*>', lambda m: js, text)

    def correct(self):
        """Get correct CHM entry content.

        For HTML pages, links are lower-cased (when the config enables
        filename_case) and unwanted navigation elements are removed. Returns
        '' when the entry could not be read.
        """
        data = self.read()
        # If entry is a html page?
        if data is not None and self._is_html():
            # lower-casing links if needed
            if self.parent.filename_case:
                data = self.lower_links(data)

            # Delete unwanted HTML elements.
            for pattern, replacement in self._UNWANTED:
                data = re.sub(pattern, replacement, data)
        if data is not None:
            return data
        # NOTE(review): this placeholder is str while real content is bytes;
        # kept unchanged because callers may rely on it.
        return ''

    def get(self):
        """Get CHM entry content.

        For HTML pages, links are lower-cased and the restore-framing script
        is added when the corresponding config options are set. Returns ''
        when the entry could not be read.
        """
        # read entry content
        data = self.read()
        # If entry is a html page?
        if data is not None and self._is_html():
            # lower-casing links if needed
            if self.parent.filename_case:
                data = self.lower_links(data)
            # restore framing if that option is set in config file
            if self.parent.restore_framing:
                data = self.add_restoreframing_js(self.name[1:], data)
        if data is not None:
            return data
        return ''
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
4 # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
5 #
6 # This program is free software; you can redistribute it and/or modify it under
7 # the terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 # details.
15 #
16 # You should have received a copy of the GNU General Public License along with
17 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 #
20
21 import re
22 import mimetypes
23 import sgmllib, urllib.request, urllib.error, urllib.parse
24
25 from bs4 import BeautifulSoup, UnicodeDammit
26 from html.parser import HTMLParser
27 from urllib.parse import urlparse
28
# Brackets that open and close each node in the text built by SitemapParser
# (SitemapFile.parse() appends the final END_TAG).
START_TAG = '['
END_TAG = ']'
31
32
class SitemapFile(object):
    """Sitemap (table of contents) markup, normalised and ready to parse."""

    def __init__(self, lines):
        # Nothing to prepare for empty/missing input
        if not lines:
            self.lines = None
            return
        # Let BeautifulSoup tidy the markup first ...
        markup = BeautifulSoup(lines, 'html.parser').prettify()
        # ... then drop the empty list containers and items
        for empty_tag in (r'<ul>\s*</ul>', r'<li>\s*</li>'):
            markup = re.sub(re.compile(empty_tag, re.I | re.M), '', markup)
        self.lines = markup

    def parse(self):
        """Run SitemapParser over the markup and return its text plus the
        closing bracket."""
        parser = SitemapParser()
        if self.lines:
            parser.feed(self.lines)
        # parsed text + last bracket
        return parser.parsed + '\n' + END_TAG
54
55
class TagStack(list):
    """Stack of open tag names.

    From the book of David Mertz, 'Text Processing in Python'.
    """

    _PARAGRAPH_TAGS = ('p', 'blockquote')

    def append(self, tag):
        """Push *tag*; a paragraph-level tag first removes every
        paragraph-level tag already on the stack."""
        if tag.lower() in self._PARAGRAPH_TAGS:
            # Mutate in place: rebinding 'self' would not change the stack
            self[:] = [t for t in self if t.lower() not in self._PARAGRAPH_TAGS]
        super(TagStack, self).append(tag)

    def pop(self, tag):
        """Remove the nearest (topmost) *tag* together with everything above it.

        Raises ValueError, leaving the stack untouched, when *tag* is not on
        the stack.
        """
        # 'Pop' by tag from nearest position, not only last item
        for pos in range(len(self) - 1, -1, -1):
            if self[pos] == tag:
                del self[pos:]
                return
        raise ValueError('Tag not on stack')
74
75
class SitemapParser(sgmllib.SGMLParser):
    """Class for parsing files in SiteMap format, such as .hhc

    The result accumulates in ``self.parsed`` as nested bracketed lists
    (START_TAG/END_TAG) whose leaves are quoted "name", "url", "icon" triples.
    """

    def __init__(self):
        # Open tags seen since the first <ul>
        self.tagstack = TagStack()
        # True while inside an <object type="text/sitemap"> element
        self.in_obj = False
        # Values collected from the <param> tags of the current object
        self.name = self.local = self.param = ""
        self.imagenumber = 1
        # Output text built so far
        self.parsed = ""
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attrs):
        # first ul, start processing from here
        if tag == 'ul' and not self.tagstack:
            self.tagstack.append(tag)
            # First bracket
            self.parsed += '\n' + START_TAG

        # if inside ul
        elif self.tagstack:
            if tag == 'li':
                # append closing bracket if needed
                if self.tagstack[-1] != 'ul':
                    self.parsed += END_TAG
                    self.tagstack.pop('li')
                indent = ' ' * len(self.tagstack)

                # separate this item from the previous one, unless it is the
                # very first item after the opening bracket
                if self.parsed != '\n' + START_TAG:
                    self.parsed += ', '

                self.parsed += '\n' + indent + START_TAG

            if tag == 'object':
                for x, y in attrs:
                    if x.lower() == 'type' and y.lower() == 'text/sitemap':
                        self.in_obj = True

            if tag.lower() == 'param' and self.in_obj:
                for x, y in attrs:
                    if x.lower() == 'name':
                        # remember which parameter the following value is for
                        self.param = y.lower()
                    elif x.lower() == 'value':
                        # only the first 'name' parameter of an object is kept
                        if self.param == 'name' and not len(self.name):
                            # XXX: Remove LF and/or CR signs from name
                            self.name = y.replace('\n', '').replace('\r', '')
                            # XXX: Escape double quotes with a backslash
                            self.name = self.name.replace('"', '\\"')
                        elif self.param == 'local':
                            # XXX: Change incorrect slashes in url
                            # NOTE(review): the second replace() can never
                            # match, because every backslash has already been
                            # turned into '/' - confirm the intended order.
                            self.local = y.lower().replace('\\', '/').replace('..\\', '')
                        elif self.param == 'imagenumber':
                            self.imagenumber = y
            self.tagstack.append(tag)

    def unknown_endtag(self, tag):
        # if inside ul
        if self.tagstack:
            if tag == 'ul':
                self.parsed += END_TAG
            if tag == 'object' and self.in_obj:
                # "Link Name", "URL", "Icon"
                self.parsed += "\"%s\", \"%s\", \"%s\"" % (self.name, self.local, self.imagenumber)
                # Set to default values
                self.in_obj = False
                self.name = self.local = ""
                self.imagenumber = 1
            # <li> stays on the stack here; unknown_starttag pops it when the
            # next <li> opens
            if tag != 'li':
                self.tagstack.pop(tag)
144
145
class PageLister(sgmllib.SGMLParser):
    """
    Parser of the topics tree that collects, in document order, the URL of
    every HTML page referenced by a 'Local' <param> tag.
    """

    def reset(self):
        sgmllib.SGMLParser.reset(self)
        # Ordered list of unique page URLs, each with a leading '/'
        self.pages = []

    def feed(self, data):
        # Convert the input to unicode text before handing it to the parser
        text = UnicodeDammit(data).unicode_markup
        sgmllib.SGMLParser.feed(self, text)

    def start_param(self, attrs):
        seen_local = False
        for attr_name, attr_value in attrs:
            if attr_name == 'name' and attr_value.lower() == 'local':
                seen_local = True
            if not (seen_local and attr_name == 'value'):
                continue
            # Sometime url has incorrect slashes
            normalized = urlparse(attr_value.replace('\\', '/')).geturl()
            # Drop the fragment and anchor the path at the CHM root
            page = '/' + re.sub("#.*$", '', urllib.parse.unquote(normalized))
            # Avoid duplicates
            if page not in self.pages:
                self.pages.append(page)
171
172
class ImageCatcher(sgmllib.SGMLParser):
    """
    Finds image urls in the current html page, so to take them out from the chm file.

    URLs are collected in ``imgurls``, each with a leading '/', without
    duplicates, across every page fed to the same instance.
    """

    def reset(self):
        sgmllib.SGMLParser.reset(self)
        self.imgurls = []

    def feed(self, data):
        # Same normalisation as PageLister.feed: the handlers below build
        # URLs with str concatenation, so page content must be converted to
        # unicode text before it is parsed.
        sgmllib.SGMLParser.feed(self, UnicodeDammit(data).unicode_markup)

    def start_img(self, attrs):
        for key, value in attrs:
            if key.lower() == 'src':
                image_url = '/' + value
                # Avoid duplicates in the list of image URLs.
                if image_url not in self.imgurls:
                    self.imgurls.append(image_url)

    def start_a(self, attrs):
        for key, value in attrs:
            if key.lower() == 'href':
                url = urlparse(value)
                value = urllib.parse.unquote(url.geturl())
                # Remove the fragment part
                value = '/' + re.sub("#.*$", '', value)
                # Check file's mimetype
                mime_type = mimetypes.guess_type(value)[0]
                # Keep only local (scheme-less) links to images, and avoid
                # duplicates in the list of image URLs.
                if not url.scheme and value not in self.imgurls and \
                        mime_type and re.search('image/.*', mime_type):
                    self.imgurls.append(value)
202
203
class TOCCounter(HTMLParser):
    """Track the largest number of 'param' tags open when an object closes.

    The result is available as ``count`` after feeding the topics tree.
    """

    count = 0

    def __init__(self):
        self.tagstack = TagStack()
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        # Nothing is open, so there is nothing to count or to close
        if not self.tagstack:
            return
        closing = tag.lower()
        if closing == 'object':
            open_params = self.tagstack.count('param')
            if self.count < open_params:
                self.count = open_params
        if closing != 'li':
            self.tagstack.pop(tag)
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
6 #
7 # This program is free software; you can redistribute it and/or modify it under
8 # the terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
10 # version.
11 #
12 # This program is distributed in the hope that it will be useful, but WITHOUT
13 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
15 # details.
16 #
17 # You should have received a copy of the GNU General Public License along with
18 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
19 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 #
# Names exported by ``from archmage import *``.
__all__ = ['CHM']
# Package version string.
__version__ = '0.4.0'
23
24 import sys, os, pkg_resources
25
# Pick the config file: the per-user ~/.arch.conf wins when it exists,
# otherwise fall back to the arch.conf shipped inside the package.
user_config = os.path.join(os.path.expanduser('~'), '.arch.conf')
config = (
    user_config
    if os.path.exists(user_config)
    else pkg_resources.resource_filename('archmage', 'arch.conf')
)
32
def file2dir(filename):
    """Return the directory name for *filename*: everything before its last
    '.', followed by '_html' (e.g. 'book.chm' -> 'book_html')."""
    head, dot, _ = filename.rpartition('.')
    # Without any '.', the whole name is the stem
    stem = head if dot else filename
    return stem + '_' + 'html'
0 from os.path import basename, join
1 import pkg_resources
2
# Directory for templates
templates_dir = pkg_resources.resource_filename('archmage', 'templates/')

# Directory with icons
icons_dir = join(templates_dir, 'icons')

# List of auxiliary files stored inside the CHM file.
# Entries whose names start with one of these are not extracted.
auxes = ('/#IDXHDR', '/#ITBITS', '/#STRINGS', '/#SYSTEM', '/#TOPICS',
         '/#URLSTR', '/#URLTBL', '/#WINDOWS', '/$FIftiMain', '/$OBJINST',
         '/$WWAssociativeLinks', '/$WWKeywordLinks', ':')

# Title: the value you want to see in the browser title.
# 'sourcename' is the name of the source file; it is provided by the code
# that executes this config file.
title = basename(sourcename)

# Background and foreground colors for header.
bcolor = '#63baff'
fcolor = 'white'

# Filenames inside a CHM file are stored in utf-8, but links can be in some
# national codepage. If you set fs_encoding, such links would be
# converted to it.
#
# Default: fs_encoding = 'utf-8'
fs_encoding = 'utf-8'

# If your filesystem is case-sensitive, links in the HTML can point to
# files whose names differ only in case. Set filename_case to 1 to
# lower-case the links.
#
# Default: filename_case=1
filename_case = 1

# If you want to add javascript code that restores framing to every
# page, set restore_framing to 1.
#
# Default: restore_framing=1
restore_framing = 1

# Path to htmldoc executable
#
htmldoc_exec = '/usr/bin/htmldoc'

# CHM2TEXT converting. Use following command to convert CHM content to plain
# text file. Make sure that below apps are available on your system.
#chmtotext = 'lynx -dump -stdin'
chmtotext = '/usr/bin/elinks -dump'

# CHM2HTML converting. Use following command to convert CHM content to a single
# HTML file. Make sure that htmldoc is available on your system.
chmtohtml = '-t html -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --linkstyle underline --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --browserwidth 680 --no-strict --no-overflow --quiet'

# CHM2PDF converting. Use following command to convert CHM content to a single
# PDF file. Make sure that htmldoc is available on your system.
chmtopdf = '-t pdf14 -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --textcolor "#000000" --linkcolor "#0000ff" --linkstyle plain --size Universal --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --links --embedfonts --pagemode outline --pagelayout single --firstpage c1 --pageeffect none --pageduration 10 --effectduration 1.0 --no-encryption --permissions all --owner-password "" --user-password "" --browserwidth 680 --no-strict --no-overflow --quiet'

# Maximum number of Table of Contents levels for the htmldoc utility.
#
# Default: maxtoclvl = 4
maxtoclvl = 4
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # details.
14 #
15 # You should have received a copy of the GNU General Public License along with
16 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #
19
20 """CHM to Text converter (using external tool: lynx or elinks)"""
21
22 import sys
23 import signal
24 from subprocess import Popen, PIPE
25
# Restore the default SIGPIPE action on every platform except Windows
# (presumably so that a closed output pipe ends the program quietly - confirm).
if sys.platform != "win32":
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
28
def chmtotext(input, cmd, output=sys.stdout):
    """CHM to Text converter.

    Pipes *input* (the page content) through the shell command *cmd* and
    prints the command's standard output, as text, to *output*.
    """
    if isinstance(input, str):
        # The pipe is binary; accept text content as well
        input = input.encode('utf-8')
    proc = Popen(cmd, stdin=PIPE, stdout=PIPE, shell=True)
    # communicate() writes stdin and reads stdout at the same time; writing
    # all of stdin first can block forever once the converter's output pipe
    # fills up.
    converted = proc.communicate(input)[0]
    # Decode before printing: print() on bytes would emit their repr
    # (b'...') instead of the converted text.
    encoding = getattr(output, 'encoding', None) or 'utf-8'
    print(converted.decode(encoding, errors='replace'), file=output)
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net>
6 #
7 # This program is free software; you can redistribute it and/or modify it under
8 # the terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
10 # version.
11 #
12 # This program is distributed in the hope that it will be useful, but WITHOUT
13 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
15 # details.
16 #
17 # You should have received a copy of the GNU General Public License along with
18 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
19 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 #
21
22 """arCHMage -- extensible reader and decompiler for files in the CHM format.
23
24 Usage: %(program)s [options] <chmfile> [destdir|destfile]
25 Where:
26
27 -x / --extract
28 Extracts CHM file into specified directory. If destination
29 directory is omitted, than the new one will be created based
30 on name of CHM file. This options is by default.
31
32 -c format
33 --convert=format
34 Convert CHM file into specified file format. If destination
35 file is omitted, than the new one will be created based
36 on name of CHM file. Available formats:
37
38 html - Single HTML file
39 text - Plain Text file
40 pdf - Adobe PDF file format
41
42 -d / --dump
43 Dump HTML data from CHM file into standard output.
44
45 -V / --version
46 Print version number and exit.
47
48 -h / --help
49 Print this text and exit.
50 """
51
52 import os, sys
53 import getopt
54
55 import archmage
56 from archmage.CHM import CHMFile, Action
57
# Return codes
OK = 0
ERROR = 1

# Program name as invoked; the usage text refers to it as %(program)s.
program = sys.argv[0]
63
64 # Miscellaneous auxiliary functions
def message(code=OK, msg=''):
    """Print *msg* (if any) to stderr when *code* is ERROR, else to stdout."""
    if not msg:
        return
    stream = sys.stderr if code == ERROR else sys.stdout
    print(msg, file=stream)
71
def usage(code=OK, msg=''):
    """Show application usage and quit"""
    # The module docstring is the help text; %(program)s and friends are
    # filled in from the module globals.
    help_text = __doc__ % globals()
    for text in (help_text, msg):
        message(code, text)
    sys.exit(code)
77
def output_format(mode):
    """Map the -c/--convert argument to the matching Action member.

    Accepts 'text', 'html' or 'pdf'; exits with an error message for
    anything else.
    """
    # The Action members must be qualified: only the Action class itself is
    # imported into this module.
    if mode == 'text':
        return Action.CHM2TXT
    elif mode == 'html':
        return Action.CHM2HTML
    elif mode == 'pdf':
        return Action.CHM2PDF
    else:
        sys.exit('Invalid output file format: %s' % mode)
87
def output_file(filename, mode):
    """Convert filename.chm to filename.<ext> for the given conversion mode.

    filename - name of the source CHM file
    mode     - Action.CHM2TXT, Action.CHM2HTML or Action.CHM2PDF; any other
               value yields the generic 'output' extension
    """
    # Compare against the Action members: bare CHM2* names are undefined here.
    if mode == Action.CHM2TXT:
        file_ext = 'txt'
    elif mode == Action.CHM2HTML:
        file_ext = 'html'
    elif mode == Action.CHM2PDF:
        file_ext = 'pdf'
    else:
        file_ext = 'output'
    # splitext only strips an extension from the last path component, so a
    # dot in a directory name (e.g. 'docs.d/manual') no longer truncates the path.
    return os.path.splitext(filename)[0] + '.' + file_ext
100
def parseargs():
    """Parse ``sys.argv`` and return an object describing what to do.

    The returned object has three attributes:
      mode    - an Action member (Action.EXTRACT when no mode option is given)
      chmfile - path of the CHM file to process
      output  - output directory or file; stays None in dump mode

    Invalid options print the usage text and exit; other invalid input exits
    with an error message.
    """
    try:
        # -p/--port is no longer declared: nothing below handles it, so it
        # used to fall through to the assertion instead of a usage error.
        opts, args = getopt.getopt(
            sys.argv[1:], 'xc:dVh',
            ['extract', 'convert=', 'dump', 'version', 'help'])
    except getopt.error as msg:
        usage(ERROR, str(msg))

    class Options:
        mode = None     # EXTRACT or other
        chmfile = None  # CHM File to view/extract
        output = None   # Output file or directory

    options = Options()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
        elif opt in ('-V', '--version'):
            message(OK, archmage.__version__)
            sys.exit(OK)
        elif opt in ('-c', '--convert'):
            if options.mode is not None:
                sys.exit('-x and -c are mutually exclusive')
            options.mode = output_format(str(arg))
        elif opt in ('-x', '--extract'):
            if options.mode is not None:
                sys.exit('-x and -c are mutually exclusive')
            options.mode = Action.EXTRACT
        elif opt in ('-d', '--dump'):
            if options.mode is not None:
                sys.exit('-d should be used without any other options')
            options.mode = Action.DUMPHTML
        else:
            # Every option declared to getopt is handled above.
            assert False, (opt, arg)

    # Sanity checks
    if options.mode is None:
        # Set default option
        options.mode = Action.EXTRACT

    if not args:
        sys.exit('No CHM file was specified!')
    # Get CHM file name from command line
    options.chmfile = args.pop(0)

    if options.mode == Action.EXTRACT:
        # Output directory: from the command line, else derived from the CHM name
        if args:
            options.output = args.pop(0)
        else:
            options.output = archmage.file2dir(options.chmfile)
    elif options.mode in (Action.CHM2TXT, Action.CHM2HTML, Action.CHM2PDF):
        # Output file: from the command line, else derived from the CHM name
        if args:
            options.output = args.pop(0)
        else:
            options.output = output_file(options.chmfile, options.mode)

    # Any other arguments are invalid
    if args:
        sys.exit('Invalid arguments: ' + ', '.join(args))

    return options
167
168
def main():
    """Command-line entry point: run the action selected by the arguments."""
    options = parseargs()
    if not os.path.exists(options.chmfile):
        sys.exit('No such file: %s' % options.chmfile)

    if os.path.isdir(options.chmfile):
        sys.exit('A regular files is expected, got directory: %s' % options.chmfile)

    source = CHMFile(options.chmfile)
    try:
        if options.mode == Action.DUMPHTML:
            source.dump_html()
        elif options.mode == Action.CHM2TXT:
            if os.path.exists(options.output):
                sys.exit('%s is already exists' % options.output)
            # Close (and flush) the text output deterministically.
            with open(options.output, 'w') as output:
                source.chm2text(output)
        elif options.mode in (Action.CHM2HTML, Action.CHM2PDF):
            source.htmldoc(options.output, options.mode)
        elif options.mode == Action.EXTRACT:
            source.extract(options.output)
    finally:
        # Runs on sys.exit() and on errors too, so the CHM source is always closed.
        source.close()
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # details.
14 #
15 # You should have received a copy of the GNU General Public License along with
16 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #
19
20 """Generic converter function"""
21
22 import os
23 import string
24 import tempfile
25 import subprocess
26 import archmage
27
28
def htmldoc(input, cmd, options, toclevels, output):
    """CHM to other format converter

    input - list of input html files
    cmd - full path to htmldoc command
    options - htmldoc options from arch.conf; a %-format template that
              receives the 'output' and 'toc' keys
    toclevels - number of ToC levels as htmldoc option
    output - output file (single html, ps, pdf and etc)

    Nothing is executed when input is empty.
    """
    if toclevels:
        toc = '--toclevels %s' % toclevels
    else:
        toc = '--no-toc'
    options = options % {'output': output, 'toc': toc}
    if not input:
        return

    # Create a htmldoc file for batch processing. It is opened in text mode
    # because str is written to it (the default mode is binary), and kept on
    # disk after closing so the external command can read it.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write('#HTMLDOC 1.8.27\n')
        f.write(options + '\n')
        f.write('\n'.join(input))
    try:
        # NOTE(review): the command is run through the shell; cmd comes from
        # the configuration file and must be trusted.
        command = '%s --batch %s' % (cmd, f.name)
        subprocess.call(command, shell=True)
    finally:
        # Unlink temporary htmldoc file even if running the command failed
        os.unlink(f.name)
0 <html>
1 <head>
2 <title>$title</title>
3 <LINK rel="Stylesheet" type="text/css" href="arch_css.css">
4 </head>
5
6 <body onload="setInterval('getLoc()', 500);">
7 <script>
8 var lastDoc;
9 var contents = $contents;
10
11 var w=window,d=document
12 var icons={'0' : 'icons/0.gif','1' : 'icons/90.gif',
13 '2' : 'icons/91.gif', '3' : 'icons/92.gif', '4' : 'icons/99.gif',
14 '18' : 'icons/93.gif', '19' : 'icons/94.gif', '20' : 'icons/97.gif',
15 '26' : 'icons/95.gif', '27' : 'icons/96.gif', '28' : 'icons/98.gif'}
16
17 var dhtml=true
18 try{if(d.body.innerHTML.length<=0)dhtml=false}
19 catch(e){dhtml=false;}
20 var tree=[];
21
22 get_element=d.all ?
23 function(id){return d.all[id]}
24 :
25 function(id){return d.getElementById(id)}
26
function get_img1(){
  // Pick the connector icon for this node by summing flag values into a key
  // of the `icons` table: has children, is open, is last, is first.
  var hasKids=this.childs.length
  var key=2
  if(hasKids)key+=16
  if(hasKids && this.opened)key+=8
  if(this.is_last())key+=1
  if(this.is_first())key+=2
  return icons[key]
}
function get_img2(){
  // Return the path of this node's own icon. Icon numbers below 9 are moved
  // to the even number when the node is open and to the odd number when it
  // is closed.
  // `n` is now a local: it used to be assigned without `var` and leaked into
  // the global scope.
  var n=this.cnt[2]
  if(n<9){
    // Explicit radix so the number is always parsed as decimal.
    n=(this.opened ? ( n%2 ? parseInt(n,10)+1 : n ) : ( n%2 ? n : parseInt(n,10)-1 ))
  }
  return 'icons/'+n+'.gif'
}
// Constructor for one entry of the contents tree.
//   tree - the parent node, or the root object built by draw_contents
//   n    - index of this entry among the parent's children
function node(tree,n){
  // Depth of this node: the root has ind -1, so top-level nodes get 0.
  this.ind=tree.ind+1
  // Top-level entries are read directly from the root array; nested entries
  // start at offset 3 of the parent's array (init/get_img2 use slots 0-2 as
  // title, href and icon number).
  this.cnt=tree.cnt[n+(this.ind ? 3 : 0)]
  // No data at that slot: leave the node unregistered.
  if(!this.cnt)return
  this.tree=tree.tree
  this.parent=tree
  // Without DHTML support the whole tree is rendered expanded.
  this.opened=!dhtml
  // Register in the flat node list; nind is the index used in element ids
  // and by tree.select/tree.toggle.
  this.nind=this.tree.nodes.length
  this.tree.nodes[this.nind]=this
  tree.childs[n]=this
  this.childs=[]
  // Build the children recursively.
  for(var i=0;i < this.cnt.length - 2;i++)
    new node(this,i)
  this.get_img1=get_img1
  this.get_img2=get_img2
  this.open=open
  this.select=select
  this.init=init
  // True when this node is the last child of its parent.
  this.is_last=function(){
    return n==this.parent.childs.length - 1
  }
  // True only for the first top-level node when it is not also the last.
  this.is_first=function(){
    return(this.ind==0)&&(n==0)&&(!this.is_last())
  }
}
62
function open(){
  // Toggle this node's child container. With DHTML the children are rendered
  // into the container on first use and the container is shown or hidden;
  // without DHTML the children are written straight into the document.
  var container=get_element('divCont'+this.nind)
  if(!container)return
  var k
  if(dhtml){
    if(!container.innerHTML){
      var markup=[]
      for(k=0;k < this.childs.length;k++)
        markup.push(this.childs[k].init())
      container.innerHTML=markup.join('')
    }
    container.style.display=(this.opened ? 'none' : 'block')
    this.opened=!this.opened
    var icon1=d.images['img1_'+this.nind],icon2=d.images['img2_'+this.nind]
    if(icon1)icon1.src=this.get_img1()
    if(icon2)icon2.src=this.get_img2()
    return
  }
  // Non-DHTML path: the original writes an empty joined list first.
  d.write('')
  for(k=0;k < this.childs.length;k++){
    d.write(this.childs[k].init())
    this.childs[k].open()
  }
}
87
88
function select(nind){
  // Mark this node as selected, or as deselected when a truthy argument is
  // passed. Selecting first deselects the previously selected node.
  // Returns whether the node has a link target.
  var deselecting=Boolean(nind)
  if(!deselecting){
    var previous=this.tree.sel
    this.tree.sel=this
    if(previous)previous.select(true)
  }
  var icon=d.images['img2_'+this.nind]
  if(icon)icon.src=this.get_img2()
  var label=get_element('el'+this.nind)
  label.style.fontWeight=deselecting ? 'normal' : 'bold'
  return !!this.cnt[1]
}
100
function init(){
  // Build and return the HTML for this node: one connector image per
  // ancestor level, the expand/collapse icon, the link with its own icon,
  // and an empty hidden container for the children, if there are any.
  // `r` is now a local: it used to be assigned without `var` and leaked into
  // the global scope.
  var temp=[],par=this.parent,r
  for(var i=this.ind;i>0;i--){
    temp[i]='<img src="'+icons[par.is_last()? 0 : 1]+'" border="0" align="absbottom">'
    par=par.parent
  }
  r='<table cellpadding="0" cellspacing="0" border="0">'
  r+='<tr><td nowrap>'
  r+=temp.join('')
  r+=(this.childs.length ?(!dhtml ? '' : '<a href="javascript: tree.toggle('+this.nind+')" >')+'<img src="'+this.get_img1()+'" border="0" align="absbottom" name="img1_'+this.nind+'">'+(!dhtml ? '' : '</a>'): '<img src="'+this.get_img1()+'" border="0" align="absbottom">')
  r+='<a href="'+this.cnt[1]+'" target="'+'content'+'"'+' title="'+this.cnt[0]+'" onclick="return tree.select('+this.nind+')" '+(!dhtml ? '' : ' ondblclick="tree.toggle('+this.nind+')"')+' class="small" id="el'+this.nind+'"><img src="'+this.get_img2()+'" border="0" align="absbottom" name="img2_'+this.nind+'">&nbsp;'+this.cnt[0]+'</a>'
  r+='</td></tr></table>'
  r+=(this.childs.length ? '<div id="divCont'+this.nind+'" style="display:none"></div>' : '')
  return r
}
116
// Constructor (invoked with `new`) that builds the whole contents tree from
// the nested `cnt` array and writes the top-level nodes into the document.
function draw_contents(cnt){
  // The new object becomes the global `tree`, which generated links call
  // through tree.select(...) and tree.toggle(...).
  tree=this;
  tree.cnt=cnt;
  tree.tree=this;
  tree.nodes=[];   // flat list of all nodes, indexed by their nind
  tree.sel=null;   // currently selected node
  tree.ind=-1;     // root depth, so top-level nodes get depth 0

  tree.select=function(i){
    return tree.nodes[i].select();
  };
  tree.toggle=function(i){
    tree.nodes[i].open()
  };
  tree.childs=[]
  // node() registers itself in tree.childs and tree.nodes.
  for(var i=0;i<cnt.length;i++){
    new node(tree,i)
  }
  tree.nind=0;

  for(var i=0;i < tree.childs.length;i++){
    d.write(tree.childs[i].init());
    // Without DHTML nothing can be expanded later, so expand everything now.
    if(!dhtml)tree.childs[i].open();
  }
}
142
143
function getLoc(){
  // Polled from the body onload timer: when the second frame of the parent
  // shows a new page, select the matching contents entry and open its parent
  // node if that is closed.
  var doc = ""+parent.frames[1].location;
  if(doc == lastDoc)return;

  var parts = doc.split("/");
  var targetPage = ""+parts[parts.length-1];
  var hashPos = targetPage.indexOf("#");
  if(hashPos > 0){
    targetPage = targetPage.substr(0,hashPos);
  }

  // Bounded search: the previous unbounded loop ran past the end of
  // tree.nodes and threw on every tick when no entry matched the page.
  var found = -1;
  for(var i=0;i < tree.nodes.length;i++){
    if((""+tree.nodes[i].cnt[1]).lastIndexOf(targetPage) >= 0){
      found = i;
      break;
    }
  }
  if(found < 0){
    // Page without a contents entry: remember it so it is not rescanned.
    lastDoc = doc;
    return;
  }

  var current = tree.nodes[found];
  var parentNode = current.parent;
  if(parentNode != tree && parentNode.opened == false){
    parentNode.open();
  }
  current.select();
  lastDoc = doc;
}
167 new draw_contents(contents);
168 </script>
169 </body>
170 </html>
0 .small { font-size: x-small; }
1 .htable { margin: 0; border: none; padding: 0 }
0 <html>
1 <head>
2 <title>$title</title>
3
4 <script>
5 var qs=location.search.substr(1)
6 var A=qs.split("&")
7 var B=null
8 var F="$deftopic"
9 for(var i=0;i<A.length;i++){B=A[i].split("=");A[i]=[B[0],B[1]]}
10 for(var j=0;j<A.length;j++){if(A[j][0]=='page'){ F=A[j][1];break}}
11 </script >
12 </head>
13 <script>
14 document.write('<frameset cols="200,*" bordercolor="$bcolor" frameborder="yes" framespacing="2" >')
15 document.write('<frame name="toc" src="arch_contents.html">')
16 document.write('<frame name="content" src="'+F+'" >')
17 document.write('</frameset>');
18 </script>
19 <noscript>
20 <frameset cols="200,*" bordercolor="$bcolor" frameborder="yes" framespacing="2" >
21 <frame name="toc" src="arch_contents.html" >
22 <frame name="content" src="$deftopic">
23 </frameset>
24 </noscript>
25 </html>
0 <html>
1 <head>
2 <title>$title</title>
3 <LINK rel="Stylesheet" type="text/css" href="arch_css.css">
4 </head>
5 <body bgcolor="$bcolor">
6 <table class='htable' cellpadding="0" cellspacing="0" width="100%"><td>
7 <td align="center" width="100%">
8 <b><font size="large" color="$fcolor">$title</font></b>
9 </table>
10 </body>
11 </html>
0 <html>
1 <head>
2 <script>var pageid="";</script>
3
4 <title>$title</title>
5
6 <script>
7 var qs=location.search.substr(1);
8 var A=qs.split("&")
9 var B=null
10 var F="$deftopic";
11 for(var i=0;i<A.length;i++){
12 B=A[i].split("=")
13 A[i]=[B[0],B[1]]
14 }
15 for(var j=0;j<A.length;j++){
16 if(A[j][0]=='page'){
17 F=A[j][1]
18 break
19 }
20 }
21 </script>
22 </head>
23
24 <script>
25 document.write('<frameset rows="30,*" frameborder="no" framespacing="0" border="0" >')
26 document.write('<frame name="header" src="arch_header.html" frameborder="no" noresize="yes" scrolling="no" >')
27 if(F!='')F='?page='+F
28 document.write('<frame name="main" src="arch_frameset.html'+F+'">')
29 document.write('</frameset>')
30 </script>
31 <noscript>
32 <frameset rows="30,*" frameborder="no" framespacing="0" border="0" >
33 <frame name="header" src="arch_header.html" frameborder="no" noresize="yes" scrolling="no">
34 <frame name="main" src="arch_frameset.html" >
35 </frameset>
36 </noscript>
37 </html>
2020 .B archmage
2121 .I chmfile directory
2222 .br
23 .B archmage
24 \-p port
25 .I chmfile
2623 .SH DESCRIPTION
2724 This manual page documents briefly the
2825 .B archmage
3835 chmlib from GnoCHM project.
3936 .SH USAGE
4037 .PP
41 There is three ways to use arCHMage package now:
42 .PP
43 1) Extract .chm to directory (directory will be created):
38 Extract .chm to directory (directory will be created):
4439
4540 archmage <chmfile> <directory>
4641 .PP
47 2) Run as http-server, which will publish chm file contents on
48 specified port:
49
50 archmage \-p <port> <chmfile>
51 .PP
52 3) Tune your apache to publish chm file contents if there is trailing
53 slash in request to that file (you will need working mod_python for
54 that):
55
56 Add that lines to your httpd.conf:
57
58 AddHandler python-program .chm
59 .br
60 PythonHandler archmod.mod_chm
61
62 Restart apache.
63 .PP
64 Let's suppose, you have file sample.chm in DocumentRoot of your
65 apache. After that tuning you can receive raw chm file, if you point
66 your browser to
67
68 http://yourserver/sample.chm
69 .PP
70 or you can view chm file on the fly if you point your browser to
71
72 http://yourserver/sample.chm/ (note trailing slash)
7342 .SH SEE ALSO
7443 .PP
7544 arCHMage Home Page: http://archmage.sf.net
0 Metadata-Version: 1.1
0 Metadata-Version: 1.2
11 Name: archmage
2 Version: 0.3.1
2 Version: 0.4.0
33 Summary: CHM decompressor
44 Home-page: https://github.com/dottedmag/archmage
5 Author: Mikhail Gusarov
6 Author-email: dottedmag@dottedmag.net
5 Maintainer: Mikhail Gusarov
6 Maintainer-email: dottedmag@dottedmag.net
77 License: GPLv2+
88 Description: arCHMage is a reader and decompressor for CHM format
99 Keywords: chm,HTML Help,Compiled HTML,Compressed HTML
11 COPYING
22 MANIFEST.in
33 NEWS
4 RELEASE-VERSION
4 README.md
55 archmage.1
66 setup.py
7 version.py
7 archmage/CHM.py
8 archmage/CHMParser.py
9 archmage/__init__.py
10 archmage/arch.conf
11 archmage/chmtotext.py
12 archmage/cli.py
13 archmage/htmldoc.py
814 archmage.egg-info/PKG-INFO
915 archmage.egg-info/SOURCES.txt
1016 archmage.egg-info/dependency_links.txt
1117 archmage.egg-info/entry_points.txt
1218 archmage.egg-info/requires.txt
1319 archmage.egg-info/top_level.txt
14 archmod/CHM.py
15 archmod/CHMParser.py
16 archmod/CHMServer.py
17 archmod/Cached.py
18 archmod/__init__.py
19 archmod/arch.conf
20 archmod/chmtotext.py
21 archmod/cli.py
22 archmod/htmldoc.py
23 archmod/mod_chm.py
24 archmod/templates/arch_contents.html
25 archmod/templates/arch_css.css
26 archmod/templates/arch_frameset.html
27 archmod/templates/arch_header.html
28 archmod/templates/index.html
29 archmod/templates/icons/0.gif
30 archmod/templates/icons/1.gif
31 archmod/templates/icons/10.gif
32 archmod/templates/icons/11.gif
33 archmod/templates/icons/12.gif
34 archmod/templates/icons/13.gif
35 archmod/templates/icons/14.gif
36 archmod/templates/icons/15.gif
37 archmod/templates/icons/16.gif
38 archmod/templates/icons/17.gif
39 archmod/templates/icons/18.gif
40 archmod/templates/icons/19.gif
41 archmod/templates/icons/2.gif
42 archmod/templates/icons/20.gif
43 archmod/templates/icons/21.gif
44 archmod/templates/icons/22.gif
45 archmod/templates/icons/23.gif
46 archmod/templates/icons/24.gif
47 archmod/templates/icons/25.gif
48 archmod/templates/icons/26.gif
49 archmod/templates/icons/27.gif
50 archmod/templates/icons/3.gif
51 archmod/templates/icons/35.gif
52 archmod/templates/icons/37.gif
53 archmod/templates/icons/39.gif
54 archmod/templates/icons/4.gif
55 archmod/templates/icons/5.gif
56 archmod/templates/icons/6.gif
57 archmod/templates/icons/7.gif
58 archmod/templates/icons/8.gif
59 archmod/templates/icons/9.gif
60 archmod/templates/icons/90.gif
61 archmod/templates/icons/91.gif
62 archmod/templates/icons/92.gif
63 archmod/templates/icons/93.gif
64 archmod/templates/icons/94.gif
65 archmod/templates/icons/95.gif
66 archmod/templates/icons/96.gif
67 archmod/templates/icons/97.gif
68 archmod/templates/icons/98.gif
69 archmod/templates/icons/99.gif
70 archmod/templates/icons/next.gif
71 archmod/templates/icons/prev.gif
20 archmage/templates/arch_contents.html
21 archmage/templates/arch_css.css
22 archmage/templates/arch_frameset.html
23 archmage/templates/arch_header.html
24 archmage/templates/index.html
25 archmage/templates/icons/0.gif
26 archmage/templates/icons/1.gif
27 archmage/templates/icons/10.gif
28 archmage/templates/icons/11.gif
29 archmage/templates/icons/12.gif
30 archmage/templates/icons/13.gif
31 archmage/templates/icons/14.gif
32 archmage/templates/icons/15.gif
33 archmage/templates/icons/16.gif
34 archmage/templates/icons/17.gif
35 archmage/templates/icons/18.gif
36 archmage/templates/icons/19.gif
37 archmage/templates/icons/2.gif
38 archmage/templates/icons/20.gif
39 archmage/templates/icons/21.gif
40 archmage/templates/icons/22.gif
41 archmage/templates/icons/23.gif
42 archmage/templates/icons/24.gif
43 archmage/templates/icons/25.gif
44 archmage/templates/icons/26.gif
45 archmage/templates/icons/27.gif
46 archmage/templates/icons/3.gif
47 archmage/templates/icons/35.gif
48 archmage/templates/icons/37.gif
49 archmage/templates/icons/39.gif
50 archmage/templates/icons/4.gif
51 archmage/templates/icons/5.gif
52 archmage/templates/icons/6.gif
53 archmage/templates/icons/7.gif
54 archmage/templates/icons/8.gif
55 archmage/templates/icons/9.gif
56 archmage/templates/icons/90.gif
57 archmage/templates/icons/91.gif
58 archmage/templates/icons/92.gif
59 archmage/templates/icons/93.gif
60 archmage/templates/icons/94.gif
61 archmage/templates/icons/95.gif
62 archmage/templates/icons/96.gif
63 archmage/templates/icons/97.gif
64 archmage/templates/icons/98.gif
65 archmage/templates/icons/99.gif
66 archmage/templates/icons/next.gif
67 archmage/templates/icons/prev.gif
00 [console_scripts]
1 archmage = archmod.cli:main
1 archmage = archmage.cli:main
22
00 pychm
1 BeautifulSoup
1 beautifulsoup4
2 sgmllib3k
+0
-393
archmod/CHM.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 #
6 # This program is free software; you can redistribute it and/or modify it under
7 # the terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 # details.
15 #
16 # You should have received a copy of the GNU General Public License along with
17 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 #
20
21 import os
22 import sys
23 import re
24 import shutil
25 import errno
26 import string
27 import tempfile
28
29 import archmod
30
31 from archmod.CHMParser import SitemapFile, PageLister, ImageCatcher, TOCCounter#, HeadersCounter
32 from archmod.Cached import Cached
33
34 # import PyCHM bindings
35 try:
36 from chm import chmlib
37 except ImportError, msg:
38 sys.exit('ImportError: %s\nPlease check README file for system requirements.' % msg)
39
40 # External file converters
41 from archmod.chmtotext import chmtotext
42 from archmod.htmldoc import htmldoc
43
44 PARENT_RE = re.compile(r'(^|/|\\)\.\.(/|\\|$)')
45
class CHMDir(Cached):
    """Class that represent CHM content from directory

    Settings read below but never assigned in this class (auxes,
    templates_dir, icons_dir, fs_encoding, maxtoclvl, chmtotext, chmtohtml,
    chmtopdf, htmldoc_exec) presumably come from the config file executed in
    __init__ -- confirm against arch.conf.

    Derived attributes (entries, html_files, image_urls, image_files, topics,
    deftopic, index, frontpage, templates, toclevels) are computed by
    _getitem; presumably the Cached base class calls it for attributes that
    are not set yet -- confirm against archmod/Cached.py.
    """

    def __init__(self, name):
        # Name of source directory with CHM content
        self.sourcename = name
        # Import variables from config file into namespace
        execfile(archmod.config, self.__dict__)

        # build regexp from the list of auxiliary files
        self.aux_re = '|'.join([re.escape(s) for s in self.auxes])

        # Get and parse 'Table of Contents'
        try:
            self.topicstree = self.get_entry(self.topics)
        except AttributeError:
            self.topicstree = None
        self.contents = SitemapFile(self.topicstree).parse()

    def _getitem(self, name):
        """Compute the derived attribute called name.

        Raises AttributeError for unknown names, and for 'topics'/'index'
        when no matching entry exists.
        """
        # Get all entries
        if name == 'entries':
            entries = []
            for fname in archmod.listdir(self.sourcename):
                entry = '/' + fname
                if os.path.isdir(self.sourcename + entry):
                    entry += '/'
                entries.append(entry)
            return entries
        # retrieves the list of HTML files contained into the CHM file, **in order** (that's the important bit).
        # (actually performed by the PageLister class)
        if name == 'html_files':
            lister = PageLister()
            lister.feed(self.topicstree)
            return lister.pages
        # retrieves the list of images urls contained into the CHM file.
        # (actually performed by the ImageCatcher class)
        if name == 'image_urls':
            image_urls = []
            image_catcher = ImageCatcher()
            for html_file in self.html_files:
                image_catcher.feed(CHMEntry(self, html_file).correct())
                for image_url in image_catcher.imgurls:
                    if image_url not in image_urls:
                        image_urls.append(image_url)
            return image_urls
        # retrieves a dictionary of actual file entries and corresponding urls into the CHM file
        if name == 'image_files':
            image_files = {}
            for image_url in self.image_urls:
                # The URL is literal text, not a pattern: escape it so that
                # '.', '(' and similar characters cannot mismatch or raise.
                url_re = re.compile(re.escape(image_url))
                for entry in self.entries:
                    if url_re.search(entry.lower()) and entry.lower() not in image_files:
                        image_files[entry] = image_url
            return image_files
        # Get topics file
        if name == 'topics':
            for e in self.entries:
                if e.lower().endswith('.hhc'):
                    return e
        if name == 'deftopic':
            # use first page as deftopic. Note: without heading slash
            if self.html_files[0].startswith('/'):
                return self.html_files[0].replace('/', '', 1).lower()
            return self.html_files[0].lower()
        # Get index file
        if name == 'index':
            for e in self.entries:
                if e.lower().endswith('.hhk'):
                    return e
        # Get frontpage name
        if name == 'frontpage':
            entries = self.entries
            frontpage = os.path.join('/', 'index.html')
            index = 2  # index2.html and etc.
            # Keep bumping until the name is free. A single pass over the
            # entries missed clashes that depended on the entry order.
            while frontpage in entries:
                frontpage = os.path.join('/', ('index%s.html' % index))
                index += 1
            return frontpage
        # Get all templates files
        if name == 'templates':
            templates = []
            for fname in os.listdir(self.templates_dir):
                if os.path.isfile(os.path.join(self.templates_dir, fname)):
                    if os.path.join('/', fname) not in self.entries:
                        templates.append(os.path.join('/', fname))
            return templates
        # Get ToC levels
        if name == 'toclevels':
            counter = TOCCounter()
            counter.feed(self.topicstree)
            if counter.count > self.maxtoclvl:
                return self.maxtoclvl
            else:
                return counter.count
        raise AttributeError(name)

    def get_entry(self, name):
        """Get CHM entry by name

        Returns the processed template, the icon data or the entry content.
        Reports an error through archmod.message and returns None when
        nothing matches.
        """
        # show index page or any other substitute
        if name == '/':
            name = self.frontpage
        if name in self.templates or name == self.frontpage:
            return self.get_template(name)
        icon_names = [os.path.join('/icons', icon.lower()) for icon in os.listdir(self.icons_dir)]
        if name.lower() in icon_names:
            # Icons are images: read them in binary mode and close the file.
            with open(os.path.join(self.icons_dir, os.path.basename(name)), 'rb') as icon_file:
                return icon_file.read()
        for e in self.entries:
            if e.lower() == name.lower():
                return CHMEntry(self, e, frontpage=self.frontpage).get()
        archmod.message(archmod.ERROR, 'NameError: There is no %s' % name)

    def sub_mytag(self, re):
        """Replacing tagname with attribute

        re - match object for one <%...%> template tag (the parameter name
             shadows the re module inside this method)

        Returns the value of self.<tag>, else of the bare expression, else ''.
        """
        # NOTE(review): eval() runs template text as Python code; this is only
        # acceptable while the templates are trusted files.
        try:
            res = eval('self.' + re.group(1))
        except Exception:
            try:
                res = eval(re.group(1))
            except Exception:
                res = ''
        return res

    def get_template(self, name):
        """Get template file by it's name"""
        if name == self.frontpage:
            # The front page is always rendered from the index.html template.
            template_path = os.path.join(self.templates_dir, os.path.basename('index.html'))
        else:
            template_path = os.path.join(self.templates_dir, os.path.basename(name))
        with open(template_path) as template_file:
            tpl = template_file.read()
        return re.sub(r'\<%(.+?)%\>', self.sub_mytag, tpl)

    def process_templates(self, destdir="."):
        """Process templates"""
        for template in self.templates:
            with open(os.path.join(destdir, os.path.basename(template)), 'w') as out:
                out.write(self.get_template(template))
        if self.frontpage not in self.templates:
            with open(os.path.join(destdir, os.path.basename(self.frontpage)), 'w') as out:
                out.write(self.get_template('index.html'))
        if not os.path.exists(os.path.join(destdir, 'icons/')):
            shutil.copytree(os.path.join(self.icons_dir), os.path.join(destdir, 'icons/'))

    def extract_entry(self, entry, output_file, destdir=".", correct=False):
        """Write one CHM entry below destdir.

        entry       - name of the entry inside the CHM content
        output_file - name to store it under (lower-cased, first '/' removed)
        correct     - write the cleaned-up content instead of the plain one
        """
        # process output entry, remove first '/' in entry name
        fname = output_file.lower().replace('/', '', 1)
        # get directory name for file fname if any
        dname = os.path.dirname(os.path.join(destdir, fname))
        # if dname is a directory and it's not exist, than create it
        if dname and not os.path.exists(dname):
            os.makedirs(dname)
        # otherwise write a file from CHM entry
        if not os.path.isdir(os.path.join(destdir, fname)):
            # filename encoding conversion
            if self.fs_encoding:
                fname = fname.decode('utf-8').encode(self.fs_encoding)
            # write CHM entry content into the file, corrected or as is
            chm_entry = CHMEntry(self, entry)
            if correct:
                data = chm_entry.correct()
            else:
                data = chm_entry.get()
            # Binary mode keeps images and other non-text entries intact on
            # platforms that translate line endings in text mode.
            with open(os.path.join(destdir, fname), 'wb') as out:
                out.write(data)

    def extract_entries(self, entries=(), destdir=".", correct=False):
        """Extract raw CHM entries into the files

        Raises RuntimeError for an entry name containing a '..' component.
        """
        for e in entries:
            # if entry is auxiliary file, than skip it
            if re.match(self.aux_re, e):
                continue
            if PARENT_RE.search(e):
                raise RuntimeError('Giving up on malicious name: %s' % e)
            self.extract_entry(e, output_file=e, destdir=destdir, correct=correct)

    def extract(self, destdir):
        """Extract CHM file content into FS"""
        try:
            # Create destination directory
            os.mkdir(destdir)
            # make raw content extraction
            self.extract_entries(entries=self.entries, destdir=destdir)
            # process templates
            self.process_templates(destdir=destdir)
        except OSError as error:
            if error.errno == errno.EEXIST:
                sys.exit('%s is already exists' % destdir)
            # Any other failure (permissions, disk full, ...) used to be
            # swallowed silently; let it propagate.
            raise

    def dump_html(self, output=sys.stdout):
        """Dump HTML data from CHM file into standard output"""
        for e in self.html_files:
            # if entry is auxiliary file, than skip it
            if re.match(self.aux_re, e):
                continue
            output.write(CHMEntry(self, e).get() + '\n')

    def chm2text(self, output=sys.stdout):
        """Convert CHM into Single Text file"""
        for e in self.html_files:
            # if entry is auxiliary file, than skip it
            if re.match(self.aux_re, e):
                continue
            # to use this function you should have 'lynx' or 'elinks' installed
            chmtotext(input=CHMEntry(self, e).get(), cmd=self.chmtotext, output=output)

    def htmldoc(self, output, format=archmod.CHM2HTML):
        """CHM to other file formats converter using htmldoc"""
        # Extract CHM content into temporary directory
        output = output.replace(' ', '_')
        tempdir = tempfile.mkdtemp(prefix=output.rsplit('.', 1)[0])
        try:
            self.extract_entries(entries=self.html_files, destdir=tempdir, correct=True)
            # List of temporary files
            files = [os.path.abspath(tempdir + html_file.lower()) for html_file in self.html_files]
            if format == archmod.CHM2HTML:
                options = self.chmtohtml
                # change output from single html file to a directory with html file and images
                if self.image_files:
                    dirname = archmod.file2dir(output)
                    if os.path.exists(dirname):
                        sys.exit('%s is already exists' % dirname)
                    # Extract image files
                    os.mkdir(dirname)
                    # Extract all images
                    for key, value in self.image_files.items():
                        self.extract_entry(entry=key, output_file=value, destdir=dirname)
                    # Fix output file name
                    output = os.path.join(dirname, output)
            elif format == archmod.CHM2PDF:
                options = self.chmtopdf
                if self.image_files:
                    # Extract all images
                    for key in self.image_files:
                        self.extract_entry(entry=key, output_file=key.lower(), destdir=tempdir)
            htmldoc(files, self.htmldoc_exec, options, self.toclevels, output)
        finally:
            # Remove temporary files, also when the conversion fails or exits
            shutil.rmtree(path=tempdir)
275
276
class CHMFile(CHMDir):
    """CHMDir variant that reads its content from a CHM file through chmlib."""

    def _getitem(self, name):
        """Compute 'entries' and '_handler' here; defer other names to CHMDir."""
        if name == '_handler':
            return chmlib.chm_open(self.sourcename)
        if name != 'entries':
            return super(CHMFile, self)._getitem(name)
        # Every object name in the CHM file except the root '/'.
        return [path for path in self._get_names(self._handler) if path != '/']

    def __delattr__(self, name):
        """Close the chmlib handle before the '_handler' attribute is deleted."""
        if name == '_handler':
            chmlib.chm_close(self._handler)
        return super(CHMFile, self).__delattr__(name)

    def _get_names(self, chmfile):
        """Return the paths of all objects that chmlib enumerates in chmfile."""
        def collect(chmfile, ui, content):
            # Enumeration callback: record the path and keep going.
            content.append(ui.path)
            return chmlib.CHM_ENUMERATOR_CONTINUE

        names = []
        status = chmlib.chm_enumerate(chmfile, chmlib.CHM_ENUMERATE_ALL, collect, names)
        if status == 0:
            sys.exit('UnknownError: CHMLIB or PyCHM bug?')
        return names
310
311
312 class CHMEntry(object):
313 """Class for CHM file entry"""
314
315 def __init__(self, parent, name, frontpage='index.html'):
316 # parent CHM file
317 self.parent = parent
318 # object inside CHM file
319 self.name = name
320 # frontpage name to substitute
321 self.frontpage = os.path.basename(frontpage)
322
323 def read(self):
324 """Read CHM entry content"""
325 # Check where parent instance is CHMFile or CHMDir
326 if isinstance(self.parent, CHMFile):
327 result, ui = chmlib.chm_resolve_object(self.parent._handler, self.name)
328 if (result != chmlib.CHM_RESOLVE_SUCCESS):
329 return None
330
331 size, content = chmlib.chm_retrieve_object(self.parent._handler, ui, 0l, ui.length)
332 if (size == 0):
333 return None
334 return content
335 else:
336 return open(self.parent.sourcename + self.name).read()
337
338 def lower_links(self, text):
339 """Links to lower case"""
340 return re.sub('(?i)(href|src)\s*=\s*([^\s|>]+)', lambda m:m.group(0).lower(), text)
341
342 def add_restoreframing_js(self, name, text):
343 name = re.sub('/+', '/', name)
344 depth = name.count('/')
345
346 js = """<body><script language="javascript">
347 if ((window.name != "content") && (navigator.userAgent.indexOf("Opera") <= -1) )
348 document.write("<center><a href='%s%s?page=%s'>show framing</a></center>")
349 </script>""" % ( '../' * depth, self.frontpage, name )
350
351 return re.sub('(?i)<\s*body\s*>', js, text)
352
def correct(self):
    """Return the entry content cleaned up for conversion.

    For HTML pages the links are lower-cased when the parent requests it
    (``filename_case``) and a set of unwanted navigation elements is
    removed.  Other entries are returned unchanged.  An unreadable entry
    yields an empty string.
    """
    data = self.read()
    if data is None:
        return ''

    # Only HTML pages are post-processed.
    if re.search(r'(?i)\.html?$', self.name):
        # lower-casing links if needed
        if self.parent.filename_case:
            data = self.lower_links(data)

        # Delete unwanted HTML elements, in this exact order.
        cleanups = (
            (r'<div .*teamlib\.gif.*\/div>', ''),
            (r'<a href.*>\[ Team LiB \]<\/a>', ''),
            (r'<table.*larrow\.gif.*rarrow\.gif.*<\/table>', ''),
            (r'<a href.*next\.gif[^>]*><\/a>', ''),
            (r'<a href.*previous\.gif[^>]*><\/a>', ''),
            (r'<a href.*prev\.gif[^>]*><\/a>', ''),
            (r'"[^"]*previous\.gif"', '""'),
            (r'"[^"]*prev\.gif"', '""'),
            (r'"[^"]*next\.gif"', '""'),
        )
        for pattern, replacement in cleanups:
            data = re.sub(pattern, replacement, data)
    return data
376
def get(self):
    """Return the entry content, ready to be served or extracted.

    For HTML pages the links are lower-cased when ``parent.filename_case``
    is set and the "restore framing" script is added when
    ``parent.restore_framing`` is set.  An unreadable entry yields an
    empty string.
    """
    # read entry content
    data = self.read()
    if data is None:
        return ''

    # Only HTML pages are post-processed.
    if re.search(r'(?i)\.html?$', self.name):
        # lower-casing links if needed
        if self.parent.filename_case:
            data = self.lower_links(data)
        # restore framing if that option is set in config file;
        # the first character of the entry name is dropped before use
        if self.parent.restore_framing:
            data = self.add_restoreframing_js(self.name[1:], data)
    return data
+0
-221
archmod/CHMParser.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # details.
14 #
15 # You should have received a copy of the GNU General Public License along with
16 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #
19
20 import re
21 import mimetypes
22 import sgmllib, urllib2
23
24 from BeautifulSoup import BeautifulSoup
25 from HTMLParser import HTMLParser, HTMLParseError
26 from urlparse import urlparse
27
28 from archmod import COMMASPACE, LF, CR
29
30 START_TAG = '['
31 END_TAG = ']'
32
33
class SitemapFile(object):
    """Wrapper around the text of a sitemap file that can be parsed."""

    def __init__(self, lines):
        """Normalize the sitemap markup; falsy input is stored as None."""
        if not lines:
            self.lines = None
            return
        # Let BeautifulSoup tidy the markup first.
        cleaned = BeautifulSoup(lines).prettify()
        # Then drop <ul> and <li> elements that contain only whitespace.
        for empty_element in (r'<ul>\s*</ul>', r'<li>\s*</li>'):
            cleaned = re.sub(re.compile(empty_element, re.I | re.M), '', cleaned)
        self.lines = cleaned

    def parse(self):
        """Feed the stored text to a SitemapParser and return its output."""
        parser = SitemapParser()
        if self.lines:
            parser.feed(self.lines)
        # parsed text + last bracket
        return parser.parsed + LF + END_TAG
55
56
class TagStack(list):
    """Stack of open tags (from David Mertz's 'Text Processing in Python')."""

    def append(self, tag):
        """Push tag; a paragraph-level tag first evicts all such tags."""
        # Remove every paragraph-level tag if this is one.  The list is
        # modified in place: rebinding ``self`` would have no effect.
        if tag.lower() in ('p', 'blockquote'):
            self[:] = [t for t in self if t not in ('p', 'blockquote')]
        super(TagStack, self).append(tag)

    def pop(self, tag):
        """Remove the nearest (last) occurrence of tag and all tags above it.

        Raises HTMLParseError when tag is not on the stack; the stack is
        left untouched in that case.
        """
        # 'Pop' by tag from nearest position, not only last item
        for pos in range(len(self) - 1, -1, -1):
            if self[pos] == tag:
                del self[pos:]
                return
        raise HTMLParseError('Tag not on stack')
75
76
class SitemapParser(sgmllib.SGMLParser):
    """Class for parsing files in SiteMap format, such as .hhc

    The result is accumulated in ``self.parsed`` as nested bracketed
    lists of ``"Link Name", "URL", "Icon"`` triples.
    """

    def __init__(self):
        self.tagstack = TagStack()
        self.in_obj = False
        self.name = self.local = self.param = ""
        self.imagenumber = 1
        self.parsed = ""
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attrs):
        """Track open tags and collect sitemap object parameters."""
        # first ul, start processing from here
        if tag == 'ul' and not self.tagstack:
            self.tagstack.append(tag)
            # First bracket
            self.parsed += LF + START_TAG

        # if inside ul
        elif self.tagstack:
            if tag == 'li':
                # append closing bracket if needed
                if self.tagstack[-1] != 'ul':
                    self.parsed += END_TAG
                    self.tagstack.pop('li')
                indent = ' ' * len(self.tagstack)

                if self.parsed != LF + START_TAG:
                    self.parsed += COMMASPACE

                self.parsed += LF + indent + START_TAG

            if tag == 'object':
                for x, y in attrs:
                    if x.lower() == 'type' and y.lower() == 'text/sitemap':
                        self.in_obj = True

            if tag.lower() == 'param' and self.in_obj:
                for x, y in attrs:
                    if x.lower() == 'name':
                        self.param = y.lower()
                    elif x.lower() == 'value':
                        # Only the first name of an object is kept.
                        if self.param == 'name' and not self.name:
                            # XXX: Remove LF and/or CR signs from name
                            self.name = y.replace(LF, '').replace(CR, '')
                            # XXX: Escape double quotes, the name is
                            # emitted inside a double-quoted string
                            self.name = self.name.replace('"', '\\"')
                        elif self.param == 'local':
                            # XXX: Change incorrect slashes in url.
                            # NOTE(review): the original chained a second
                            # .replace('..\\', '') here, which could never
                            # match once all backslashes were converted;
                            # it was dropped without changing the result.
                            # Confirm whether '..\' prefixes were meant
                            # to be stripped.
                            self.local = y.lower().replace('\\', '/')
                        elif self.param == 'imagenumber':
                            self.imagenumber = y
            self.tagstack.append(tag)

    def unknown_endtag(self, tag):
        """Emit the collected triple on </object> and unwind the stack."""
        # if inside ul
        if self.tagstack:
            if tag == 'ul':
                self.parsed += END_TAG
            if tag == 'object' and self.in_obj:
                # "Link Name", "URL", "Icon"
                self.parsed += "\"%s\", \"%s\", \"%s\"" % (self.name, self.local, self.imagenumber)
                # Set to default values
                self.in_obj = False
                self.name = self.local = ""
                self.imagenumber = 1
            # 'li' entries are popped when the next 'li' starts instead.
            if tag != 'li':
                self.tagstack.pop(tag)
145
146
class PageLister(sgmllib.SGMLParser):
    """
    Collects, in ``self.pages``, the URLs of the HTML pages referenced by
    ``<param name="Local" value="...">`` tags of a topics tree.
    """

    def reset(self):
        sgmllib.SGMLParser.reset(self)
        self.pages = []

    def start_param(self, attrs):
        """Record the page URL of a 'Local' param tag."""
        # Look the attributes up by name so the result does not depend on
        # whether 'name' happens to be written before 'value' in the tag.
        params = dict(attrs)
        if (params.get('name') or '').lower() != 'local' or 'value' not in params:
            return
        # Sometime url has incorrect slashes
        value = urllib2.unquote(urlparse(params['value'].replace('\\', '/')).geturl())
        # Drop the fragment part and make the path absolute
        value = '/' + re.sub("#.*$", '', value)
        # Avoid duplicates
        if value not in self.pages:
            self.pages.append(value)
169
170
class ImageCatcher(sgmllib.SGMLParser):
    """
    Gathers, in ``self.imgurls``, the image URLs used by an HTML page so
    that they can be taken out of the chm file.
    """

    def reset(self):
        sgmllib.SGMLParser.reset(self)
        self.imgurls = []

    def start_img(self, attrs):
        """Remember the source of every <img> tag once."""
        for attr_name, attr_value in attrs:
            if attr_name.lower() != 'src':
                continue
            candidate = '/' + attr_value
            # Avoid duplicates in the list of image URLs.
            if not self.imgurls.count(candidate):
                self.imgurls.append(candidate)

    def start_a(self, attrs):
        """Remember links that point at local image files."""
        for attr_name, attr_value in attrs:
            if attr_name.lower() != 'href':
                continue
            url = urlparse(attr_value)
            # Decode the URL, drop the fragment and make the path absolute
            target = '/' + re.sub("#.*$", '', urllib2.unquote(url.geturl()))
            # Check file's mimetype
            mime = mimetypes.guess_type(target)[0]
            # Only scheme-less links that are not known yet qualify.
            if url.scheme or self.imgurls.count(target):
                continue
            if mime and re.search('image/.*', mime):
                self.imgurls.append(target)
200
201
class TOCCounter(HTMLParser):
    """Count Table of Contents levels"""

    # Highest number of 'param' tags seen on the stack at an </object>.
    count = 0

    def __init__(self):
        self.tagstack = TagStack()
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Push every opened tag onto the stack."""
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        """Update the counter on </object> and unwind the stack."""
        if not self.tagstack:
            return
        if tag.lower() == 'object':
            open_params = self.tagstack.count('param')
            if self.count < open_params:
                self.count = self.tagstack.count('param')
        # Closing 'li' tags never remove anything from the stack.
        if tag.lower() != 'li':
            self.tagstack.pop(tag)
+0
-61
archmod/CHMServer.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 #
6 # This program is free software; you can redistribute it and/or modify it under
7 # the terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 # details.
15 #
16 # You should have received a copy of the GNU General Public License along with
17 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 #
20
21 import urllib
22 import mimetypes
23
24 from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
25
26 import archmod
27
28
class CHMServer(HTTPServer):
    """HTTP Server that handle Compressed HTML

    The actual serving is delegated to an inner HTTPServer instance stored
    in ``self.httpd``; the CHM source is attached to it so that request
    handlers can reach it through ``self.server.CHM``.
    """

    def __init__(self, CHM, name='', port=8000):
        """Bind an HTTP server for CHM to the (name, port) address."""
        self.address = (name, port)
        self.httpd = HTTPServer(self.address, CHMRequestHandler)
        self.httpd.CHM = CHM

    def run(self):
        """Serve requests until interrupted, then release the socket."""
        try:
            self.httpd.serve_forever()
        finally:
            self.httpd.server_close()
40
41
class CHMRequestHandler(BaseHTTPRequestHandler):
    """This class handle HTTP request for CHMServer"""

    def do_GET(self):
        """Answer a GET request with the matching CHM entry."""
        # The query string is not part of the entry name.
        pagename = urllib.unquote(self.path.split('?')[0])
        if pagename == '/':
            mimetype = 'text/html'
        else:
            # guess_type() yields None for unknown extensions; never send
            # None as a header value.
            mimetype = mimetypes.guess_type(pagename)[0] or 'application/octet-stream'

        self.send_response(200)
        self.send_header('Content-type', mimetype)
        self.end_headers()

        # get html data from CHM instance and write it into output
        try:
            self.wfile.write(self.server.CHM.get_entry(pagename))
        except NameError as msg:
            archmod.message(archmod.ERROR, 'NameError: %s' % msg)
+0
-48
archmod/Cached.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # details.
14 #
15 # You should have received a copy of the GNU General Public License along with
16 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #
19
class Cached(object):
    """Provides caching storage for data access decoration.
    Usage:
    class CachedClass(Cached):
        def _getitem(self, name):
            # implement data getting routine, such as db access

    CachedClass().attribute1 # returns value as if _getitem('attribute1') was called
    CachedClass().attribute2 # returns value as if _getitem('attribute2') was called
    CachedClass().__doc__ # returns real docstring
    """

    def __new__(classtype, *args, **kwargs):
        # object.__new__ must not be handed the constructor arguments:
        # doing so raises TypeError once a subclass takes any.
        instance = object.__new__(classtype)
        instance.cache = {}
        return instance

    # to be implemented by contract in the descendant classes
    def _getitem(self, name):
        """Compute the value for attribute ``name`` (subclass hook)."""
        raise NotImplementedError('_getitem')

    def __getattribute__(self, name):
        """Return a real attribute, else the cached _getitem(name) value."""
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            # Only a genuinely missing attribute falls back to the cache;
            # other errors raised during lookup must stay visible.
            cache = object.__getattribute__(self, 'cache')
            if name not in cache:
                cache[name] = self._getitem(name)
            return cache[name]
+0
-100
archmod/__init__.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 #
6 # This program is free software; you can redistribute it and/or modify it under
7 # the terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 # details.
15 #
16 # You should have received a copy of the GNU General Public License along with
17 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 #
20 __all__ = ['CHM', 'CHMServer', 'mod_chm']
21 __version__ = '0.2.4'
22
23 import sys, os, pkg_resources
24
25 # Return codes
26 OK = 0
27 ERROR = 1
28
29 # Global variables
30 EXTRACT = 1 # Extract CHM content
31 HTTPSERVER = 2 # Act as standalone HTTP server
32 DUMPHTML = 3 # Dump CHM file as plain text
33 CHM2TXT = 4 # Convert CHM file into Single Text file
34 CHM2HTML = 5 # Convert CHM file into Single HTML file
35 CHM2PDF = 6 # Convert CHM file into PDF Document
36 #CHM2PS = 7 # Convert CHM file into PostScript Document
37
38 # Special characters
39 COMMASPACE = ', '
40 LF = '\n'
41 CR = '\r'
42
# Pick the configuration file: a per-user ~/.arch.conf wins over the
# arch.conf shipped inside the package.
user_config = os.path.join(os.path.expanduser('~'), '.arch.conf')
config = (user_config if os.path.exists(user_config)
          else pkg_resources.resource_filename('archmod', 'arch.conf'))
49
# Miscellaneous auxiliary functions
def message(code=OK, msg=''):
    """Write msg followed by a newline to stdout, or to stderr on ERROR.

    Nothing is written when msg is empty.
    """
    outfp = sys.stderr if code == ERROR else sys.stdout
    if msg:
        # file.write() replaces the Python 2-only ``print >>`` statement;
        # %s formatting keeps accepting non-string messages (exceptions).
        outfp.write('%s\n' % (msg,))
57
def file2dir(filename):
    """Convert file filename.chm to filename_html directory"""
    # splitext() only strips an extension of the last path component, so
    # dots in directory names ('../docs/manual') are left alone.
    return os.path.splitext(filename)[0] + '_' + 'html'
62
def output_format(mode):
    """Map a format name given on the command line to its mode constant.

    Exits the program with an error message for an unknown format.
    """
    # The PostScript format (CHM2PS) is currently disabled.
    known_formats = (
        ('text', CHM2TXT),
        ('html', CHM2HTML),
        ('pdf', CHM2PDF),
    )
    for label, constant in known_formats:
        if mode == label:
            return constant
    sys.exit('Invalid output file format: %s' % mode)
74
def output_file(filename, mode):
    """Convert filename.chm to filename.output

    The new extension is 'txt', 'html' or 'pdf' for the matching
    conversion mode and 'output' for any other mode.
    """
    # The PostScript format (CHM2PS) is currently disabled.
    if mode == CHM2TXT:
        file_ext = 'txt'
    elif mode == CHM2HTML:
        file_ext = 'html'
    elif mode == CHM2PDF:
        file_ext = 'pdf'
    else:
        file_ext = 'output'
    # splitext() only strips an extension of the last path component, so
    # dots in directory names ('../docs/manual') are left alone.
    return os.path.splitext(filename)[0] + '.' + file_ext
89
# Our own listdir method :)
def listdir(dir):
    """Recursively list the content of directory dir.

    Returns the files and sub-directories found below dir as paths
    relative to dir, always using '/' as separator.
    """
    res = []
    # os.walk replaces os.path.walk, which no longer exists in Python 3.
    for dirpath, dirnames, filenames in os.walk(dir):
        # Make the prefix relative to the listed directory itself, so the
        # result is also right for absolute or multi-component paths.
        rel = os.path.relpath(dirpath, dir)
        prefix = '' if rel == os.curdir else rel.replace(os.sep, '/') + '/'
        for entry in dirnames + filenames:
            res.append(prefix + entry)
    return res
+0
-74
archmod/arch.conf less more
0 # Directory for templates. All files in that directory will be parsed
1 # and <%.+%> occurrences will be replaced with values from this
2 # file. For example, <%title%> will be substituted by the value of the
3 # title variable.
4 # There are also some special variables, which have default values:
5 # contents - a list which represents the chm file contents, and deftopic -
6 # the name of the default page.
7 from os.path import basename, join
8 import pkg_resources
9
10 templates_dir = pkg_resources.resource_filename('archmod', 'templates/')
11
12 # Directory with icons
13 icons_dir = join(templates_dir, 'icons')
14
15 # List of auxiliary files, stored inside CHM file.
16 # Those files would not be extracted.
17 auxes = ('/#IDXHDR', '/#ITBITS', '/#STRINGS', '/#SYSTEM', '/#TOPICS',
18 '/#URLSTR', '/#URLTBL', '/#WINDOWS', '/$FIftiMain', '/$OBJINST',
19 '/$WWAssociativeLinks', '/$WWKeywordLinks', ':')
20
21 # Title. That is value, which you want to see in browser title.
22 # 'sourcename' is the name of source file.
23 title = basename(sourcename)
24
25 # Background and foreground colors for header.
26 bcolor = '#63baff'
27 fcolor = 'white'
28
29 # Filenames inside chm stored in utf-8, but links can be in some
30 # national codepage. If you set fs_encoding such links would be
31 # converted to it.
32 #
33 # Default: fs_encoding = 'utf-8'
34 fs_encoding = 'utf-8'
35
36 # If your filesystem is case-sensitive, links in the HTML can point to
37 # files whose names differ from the link only in letter case. Set
38 # filename_case to 1 to have such links converted to lower case.
39 #
40 # Default: filename_case=1
41 filename_case = 1
42
43 # If you want to add javascript code for restoring the framing to every
44 # page, set restore_framing.
45 #
46 # Default: restore_framing=1
47 restore_framing = 1
48
49 # Path to htmldoc executable
50 #
51 htmldoc_exec = '/usr/bin/htmldoc'
52
53 # CHM2TEXT converting. Use following command to convert CHM content to plain
54 # text file. Make sure that below apps are available on your system.
55 #chmtotext = 'lynx -dump -stdin'
56 chmtotext = '/usr/bin/elinks -dump'
57
58 # CHM2HTML converting. Use following command to convert CHM content to a single
59 # HTML file. Make sure that htmldoc is available on your system.
60 chmtohtml = '-t html -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --linkstyle underline --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --browserwidth 680 --no-strict --no-overflow --quiet'
61
62 # CHM2PDF converting. Use following command to convert CHM content to a single
63 # PDF file. Make sure that htmldoc is available on your system.
64 chmtopdf = '-t pdf14 -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --textcolor "#000000" --linkcolor "#0000ff" --linkstyle plain --size Universal --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --links --embedfonts --pagemode outline --pagelayout single --firstpage c1 --pageeffect none --pageduration 10 --effectduration 1.0 --no-encryption --permissions all --owner-password "" --user-password "" --browserwidth 680 --no-strict --no-overflow --quiet'
65
66 # CHM2PS converting. Use following command to convert CHM content to a single
67 # PostScript file. Make sure that htmldoc is available on your system.
68 #chmtops = '-t ps2 -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --textcolor "#000000" --linkcolor "#0000ff" --linkstyle underline --size A4 --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --browserwidth 680 --no-strict --no-overflow --quiet'
69
70 # Maximum Table of Content levels for htmldoc utility.
71 #
72 # Default: maxtoclvl = 4
73 maxtoclvl = 4
+0
-34
archmod/chmtotext.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # details.
14 #
15 # You should have received a copy of the GNU General Public License along with
16 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #
19
20 """CHM to Text converter (using external tool: lynx or elinks)"""
21
22 import sys
23 import signal
24 from subprocess import Popen, PIPE
25
# Restore the default SIGPIPE behaviour where the signal exists; some
# platforms (e.g. Windows) do not define it at all.
if hasattr(signal, 'SIGPIPE'):
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)


def chmtotext(input, cmd, output=sys.stdout):
    """CHM to Text converter

    input  -- HTML data fed to the converter's standard input
    cmd    -- shell command line of the external converter
    output -- file-like object receiving the converted text
    """
    proc = Popen(cmd, stdin=PIPE, stdout=PIPE, shell=True)
    # communicate() feeds stdin and drains stdout together and closes
    # stdin afterwards; writing all input first can deadlock once the
    # pipe buffers are full.
    converted = proc.communicate(input)[0]
    output.write(converted)
    output.write('\n')
+0
-174
archmod/cli.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 #
6 # This program is free software; you can redistribute it and/or modify it under
7 # the terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 # details.
15 #
16 # You should have received a copy of the GNU General Public License along with
17 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 #
20
21 """arCHMage -- extensible reader and decompiler for files in the CHM format.
22
23 Usage: %(program)s [options] <chmfile> [destdir|destfile]
24 Where:
25
26 -x / --extract
27 Extracts CHM file into specified directory. If destination
28 directory is omitted, than the new one will be created based
29 on name of CHM file. This options is by default.
30
31 -c format
32 --convert=format
33 Convert CHM file into specified file format. If destination
34 file is omitted, than the new one will be created based
35 on name of CHM file. Available formats:
36
37 html - Single HTML file
38 text - Plain Text file
39 pdf - Adobe PDF file format
40
41 -p number
42 --port=number
43 Acts as HTTP server on specified port number, so you can read
44 CHM file with your favorite browser. You can specify a directory
45 with decompressed content.
46
47 -d / --dump
48 Dump HTML data from CHM file into standard output.
49
50 -V / --version
51 Print version number and exit.
52
53 -h / --help
54 Print this text and exit.
55 """
56
57 import os, sys
58 import getopt
59
60 import archmod
61 from archmod.CHM import CHMFile, CHMDir
62 from archmod.CHMServer import CHMServer
63
64
# Name the program was started with; substituted into the usage text.
program = sys.argv[0]


def usage(code=archmod.OK, msg=''):
    """Print the usage text followed by msg, then exit with code."""
    help_text = __doc__ % globals()
    for text in (help_text, msg):
        archmod.message(code, text)
    sys.exit(code)
72
73
def parseargs():
    """Parse the command line and return the resulting options object.

    The returned object carries ``mode``, ``port``, ``chmfile`` and
    ``output``.  Invalid input terminates the program through usage() or
    sys.exit().
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'xc:dp:Vh',
                                   ['extract', 'convert=', 'dump', 'port=', 'version', 'help'])
    except getopt.error as msg:
        usage(archmod.ERROR, msg)

    class Options:
        mode = None  # EXTRACT or HTTPSERVER or other
        port = None  # HTTP port number
        chmfile = None  # CHM File to view/extract
        output = None  # Output file or directory

    options = Options()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
        elif opt in ('-V', '--version'):
            archmod.message(archmod.OK, archmod.__version__)
            sys.exit(archmod.OK)
        elif opt in ('-p', '--port'):
            if options.mode is not None:
                sys.exit('-x and -p or -c are mutually exclusive')
            options.mode = archmod.HTTPSERVER
            try:
                options.port = int(arg)
            except ValueError as msg:
                sys.exit('Invalid port number: %s' % msg)
        elif opt in ('-c', '--convert'):
            if options.mode is not None:
                sys.exit('-x and -p or -c are mutually exclusive')
            options.mode = archmod.output_format(str(arg))
        elif opt in ('-x', '--extract'):
            if options.mode is not None:
                sys.exit('-x and -p or -c are mutually exclusive')
            options.mode = archmod.EXTRACT
        elif opt in ('-d', '--dump'):
            if options.mode is not None:
                sys.exit('-d should be used without any other options')
            options.mode = archmod.DUMPHTML
        else:
            assert False, (opt, arg)

    # Sanity checks
    if options.mode is None:
        # Set default option
        options.mode = archmod.EXTRACT

    if not args:
        sys.exit('No CHM file was specified!')
    else:
        # Get CHM file name from command line
        options.chmfile = args.pop(0)

    # if CHM content should be extracted
    if options.mode == archmod.EXTRACT:
        if not args:
            options.output = archmod.file2dir(options.chmfile)
        else:
            # get output directory from command line
            options.output = args.pop(0)
    # or converted into another file format
    elif options.mode in (archmod.CHM2TXT, archmod.CHM2HTML, archmod.CHM2PDF):
        if not args:
            options.output = archmod.output_file(options.chmfile, options.mode)
        else:
            # get output filename from command line
            options.output = args.pop(0)

    # Any other arguments are invalid
    if args:
        sys.exit('Invalid arguments: ' + archmod.COMMASPACE.join(args))

    return options
149
150
def main():
    """Entry point: run the action selected on the command line."""
    options = parseargs()
    if not os.path.exists(options.chmfile):
        sys.exit('No such file: %s' % options.chmfile)

    # Check where is argument a CHM file or directory with decompressed
    # content. Depending on results make 'source' instance of CHMFile or
    # CHMDir class.  An explicit if/else avoids the 'and/or' idiom, which
    # picks the wrong branch whenever the first object is falsy.
    if os.path.isfile(options.chmfile):
        source = CHMFile(options.chmfile)
    else:
        source = CHMDir(options.chmfile)

    if options.mode == archmod.HTTPSERVER:
        CHMServer(source, port=options.port).run()
    elif options.mode == archmod.DUMPHTML:
        source.dump_html()
    elif options.mode == archmod.CHM2TXT:
        if os.path.exists(options.output):
            sys.exit('%s is already exists' % options.output)
        # Make sure the output file is flushed and closed.
        with open(options.output, 'w') as text_file:
            source.chm2text(text_file)
    elif options.mode in (archmod.CHM2HTML, archmod.CHM2PDF):
        source.htmldoc(options.output, options.mode)
    elif options.mode == archmod.EXTRACT:
        source.extract(options.output)
+0
-55
archmod/htmldoc.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
4 #
5 # This program is free software; you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free Software
7 # Foundation; either version 2 of the License, or (at your option) any later
8 # version.
9 #
10 # This program is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 # details.
14 #
15 # You should have received a copy of the GNU General Public License along with
16 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
17 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #
19
20 """Generic converter function"""
21
22 import os
23 import string
24 import tempfile
25 import subprocess
26 import archmod
27
28
def htmldoc(input, cmd, options, toclevels, output):
    """CHM to other format converter

    input - list of input html files
    cmd - full path to htmldoc command
    options - htmldoc options from arch.conf
    toclevels - number of ToC levels as htmldoc option
    output - output file (single html, ps, pdf and etc)

    Nothing is executed when input is empty.
    """
    if toclevels:
        toc = ('--toclevels %s' % (toclevels))
    else:
        toc = ('--no-toc')
    options = options % {'output': output, 'toc': toc}
    if input:
        # Create a htmldoc file for batch processing
        f = tempfile.NamedTemporaryFile(delete=False)
        try:
            try:
                f.write('#HTMLDOC 1.8.27' + archmod.LF)
                f.write(options + archmod.LF)
                # str.join replaces the Python 2-only string.join()
                f.write(archmod.LF.join(input))
            finally:
                f.close()
            # Prepare command line to execute
            command = '%s --batch %s' % (cmd, f.name)
            subprocess.call(command, shell=True)
        finally:
            # Unlink temporary htmldoc file, even when the run failed
            os.unlink(f.name)
+0
-66
archmod/mod_chm.py less more
0 # -*- coding: utf-8 -*-
1 #
2 # archmage -- CHM decompressor
3 # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
4 # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>
5 #
6 # This program is free software; you can redistribute it and/or modify it under
7 # the terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
10 #
11 # This program is distributed in the hope that it will be useful, but WITHOUT
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 # details.
15 #
16 # You should have received a copy of the GNU General Public License along with
17 # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
18 # Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 #
20
21 from mod_python import apache
22 from mimetypes import guess_type
23 from archmod.CHM import CHMFile
24
# Most recently opened CHM file and the name it was opened from; kept in
# module globals so they are reused across handler() calls.
chmfile = None
chmname = None


def handler(req):
    """Serve a page out of a CHM file, or the CHM file itself.

    With extra path info the matching CHM entry is sent (HTTP_NOT_FOUND
    when it cannot be fetched); without it the raw CHM file is streamed.
    """
    source = req.filename
    pagename = req.path_info

    global chmfile, chmname

    # Reopen only when a different CHM file is requested.
    if chmname != source:
        chmfile = CHMFile(source)

    chmname = source

    if pagename:
        try:
            page = chmfile.get_entry(pagename)
        except Exception:
            return apache.HTTP_NOT_FOUND

        if pagename == '/':
            mimetype = 'text/html'
        else:
            mimetype = guess_type(pagename)[0] or 'application/octet-stream'

        req.content_type = mimetype
        req.send_http_header()

        req.write(page)
    else:
        mimetype = 'application/chm'
        req.content_type = mimetype
        req.send_http_header()
        # Stream the file in chunks and always close it afterwards.
        with open(source, 'rb') as chm:
            while True:
                tmp = chm.read(4096)
                if len(tmp) == 0:
                    break
                req.write(tmp)
    return apache.OK
+0
-171
archmod/templates/arch_contents.html less more
0 <html>
1 <head>
2 <title><%title%></title>
3 <LINK rel="Stylesheet" type="text/css" href="arch_css.css">
4 </head>
5
6 <body onload="setInterval('getLoc()', 500);">
7 <script>
8 var lastDoc;
9 var contents = <%contents%>;
10
11 var w=window,d=document
12 var icons={'0' : 'icons/0.gif','1' : 'icons/90.gif',
13 '2' : 'icons/91.gif', '3' : 'icons/92.gif', '4' : 'icons/99.gif',
14 '18' : 'icons/93.gif', '19' : 'icons/94.gif', '20' : 'icons/97.gif',
15 '26' : 'icons/95.gif', '27' : 'icons/96.gif', '28' : 'icons/98.gif'}
16
17 var dhtml=true
18 try{if(d.body.innerHTML.length<=0)dhtml=false}
19 catch(e){dhtml=false;}
20 var tree=[];
21
22 get_element=d.all ?
23 function(id){return d.all[id]}
24 :
25 function(id){return d.getElementById(id)}
26
// Pick the tree-line icon of a node: the key into `icons` is built from
// whether the node has children, is opened, is last and is first.
function get_img1(){
    var has_childs=this.childs.length
    var key=2
    if(has_childs)key+=16
    if(has_childs && this.opened)key+=8
    if(this.is_last())key+=1
    if(this.is_first())key+=2
    return icons[key]
}
// Return the path of the topic icon of a node. this.cnt[2] holds the icon
// number; numbers below 9 are switched between an odd and the following
// even value depending on whether the node is opened.
function get_img2(){
    // `var` keeps n local instead of leaking an implicit global;
    // parseInt gets an explicit radix.
    var n=this.cnt[2]
    if(n<9){
        n=(this.opened ? ( n%2 ? parseInt(n,10)+1 : n ) : ( n%2 ? n : parseInt(n,10)-1 ))
    }
    return 'icons/'+n+'.gif'
}
// Constructor of a tree node. `tree` is the parent (the tree itself for
// top-level nodes) and `n` the position among the parent's children.
function node(tree,n){
    // Nesting depth: the tree root uses -1, so top-level nodes get 0.
    this.ind=tree.ind+1
    // Below the top level the child entries are looked up 3 slots further
    // in the parent's array (slots 0-2 are read elsewhere as title, URL and
    // icon number).
    this.cnt=tree.cnt[n+(this.ind ? 3 : 0)]
    // No such entry: leave the object unregistered.
    if(!this.cnt)return
    this.tree=tree.tree
    this.parent=tree
    // Without DHTML support everything is rendered expanded.
    this.opened=!dhtml
    // Register the node in the flat node list and in its parent.
    this.nind=this.tree.nodes.length
    this.tree.nodes[this.nind]=this
    tree.childs[n]=this
    this.childs=[]
    // Build the child nodes; surplus indexes end in the early return above.
    for(var i=0;i < this.cnt.length - 2;i++)
        new node(this,i)
    this.get_img1=get_img1
    this.get_img2=get_img2
    this.open=open
    this.select=select
    this.init=init
    this.is_last=function(){
        return n==this.parent.childs.length - 1
    }
    this.is_first=function(){
        return(this.ind==0)&&(n==0)&&(!this.is_last())
    }
}
62
// Node method: render the children of the node into its 'divCont'
// container and, with DHTML, toggle between expanded and collapsed.
function open(){
    var childs=[]
    var el=get_element('divCont'+this.nind)
    // Nodes without a container (no children) have nothing to open.
    if(!el)return
    if(!dhtml){
        // Fallback without DHTML: write all descendants straight into the
        // document.
        d.write(childs.join(''))
        for(var i=0;i < this.childs.length;i++){
            d.write(this.childs[i].init())
            this.childs[i].open()
        }
    }
    else{
        // Render the children only once, on first use.
        if(!el.innerHTML){
            for(var i=0;i < this.childs.length;i++)
                childs[i]=this.childs[i].init()
            el.innerHTML=childs.join('')
        }
        el.style.display=(this.opened ? 'none' : 'block')
        this.opened=!this.opened
        // Refresh both icons to reflect the new state.
        var img1=d.images['img1_'+this.nind],img2=d.images['img2_'+this.nind]
        if(img1)img1.src=this.get_img1()
        if(img2)img2.src=this.get_img2()
    }
}
87
88
// Node method: without an argument, make this node the selected one (bold)
// and deselect the previous selection; with a truthy argument, only reset
// this node to normal weight. Returns whether the node has a URL
// (this.cnt[1]).
function select(nind){
    if(!nind){
        var sel=this.tree.sel
        this.tree.sel=this
        // Deselect the previously selected node.
        if(sel)sel.select(true)
    }
    var img2=d.images['img2_'+this.nind]
    if(img2)img2.src=this.get_img2()
    get_element('el'+this.nind).style.fontWeight=nind ? 'normal' : 'bold'
    return Boolean(this.cnt[1])
}
100
/**
 * Build and return the HTML markup for this node's row.
 *
 * The row consists of one guide image per ancestor level, the branch image
 * (wrapped in a toggle link when the node has children and DHTML is on),
 * and the item link carrying the icon and the title (this.cnt[0]) that
 * points at this.cnt[1] in the 'content' frame.  Nodes with children also
 * get an initially hidden, empty 'divCont<nind>' container.
 *
 * NOTE(review): title and URL are inserted without HTML escaping; this
 * relies on the contents data already being safe for markup.
 */
function init() {
    // One vertical-guide image per ancestor; slot `depth` belongs to the
    // direct parent and slot 1 to the outermost ancestor.
    var guides = [];
    var ancestor = this.parent;
    for (var depth = this.ind; depth > 0; depth--) {
        guides[depth] = '<img src="' + icons[ancestor.is_last() ? 0 : 1] + '" border="0" align="absbottom">';
        ancestor = ancestor.parent;
    }

    var id = this.nind;
    var hasKids = this.childs.length;
    // Collected in a local array; nothing is leaked into the global scope.
    var html = [
        '<table cellpadding="0" cellspacing="0" border="0">',
        '<tr><td nowrap>',
        guides.join('')
    ];

    if (hasKids) {
        if (dhtml) {
            html.push('<a href="javascript: tree.toggle(' + id + ')" >');
        }
        html.push('<img src="' + this.get_img1() + '" border="0" align="absbottom" name="img1_' + id + '">');
        if (dhtml) {
            html.push('</a>');
        }
    } else {
        html.push('<img src="' + this.get_img1() + '" border="0" align="absbottom">');
    }

    html.push('<a href="' + this.cnt[1] + '" target="content" title="' + this.cnt[0] + '" onclick="return tree.select(' + id + ')" ');
    if (dhtml) {
        html.push(' ondblclick="tree.toggle(' + id + ')"');
    }
    html.push(' class="small" id="el' + id + '"><img src="' + this.get_img2() + '" border="0" align="absbottom" name="img2_' + id + '">&nbsp;' + this.cnt[0] + '</a>');
    html.push('</td></tr></table>');

    if (hasKids) {
        html.push('<div id="divCont' + id + '" style="display:none"></div>');
    }
    return html.join('');
}
116
/**
 * Constructor (invoked with `new`) that builds the whole contents tree from
 * the nested `cnt` array and writes the top-level rows into the document.
 *
 * The new object becomes the global `tree`; it acts as the root "node"
 * (ind = -1) and keeps a flat list of all nodes in `nodes`, which the
 * select(i) / toggle(i) entry points used by the generated markup index.
 */
function draw_contents(cnt) {
    var root = this;
    tree = root;

    root.cnt = cnt;
    root.tree = root;
    root.nodes = [];
    root.sel = null;
    root.ind = -1;

    // Entry points referenced from the generated onclick/href handlers.
    root.select = function (i) {
        return tree.nodes[i].select();
    };
    root.toggle = function (i) {
        tree.nodes[i].open();
    };

    root.childs = [];
    for (var n = 0; n < cnt.length; n++) {
        new node(root, n);
    }
    root.nind = 0;

    for (var k = 0; k < root.childs.length; k++) {
        var topNode = root.childs[k];
        d.write(topNode.init());
        // Without DHTML nothing can be expanded later, so expand right away.
        if (!dhtml) {
            topNode.open();
        }
    }
}
142
143
/**
 * Synchronise the contents tree with the page shown in the second frame of
 * the parent frameset.
 *
 * When that frame's location differs from the last one seen (`lastDoc`),
 * the first node whose URL (cnt[1]) contains the page's file name is
 * selected, opening its parent first if it is a closed node.  If no node
 * matches, the tree is left untouched.  In both cases the location is
 * remembered so the tree is not scanned again until it changes.
 */
function getLoc() {
    var doc = "" + parent.frames[1].location;
    if (doc == lastDoc) {
        return;
    }

    // Last path segment of the location, without a trailing "#fragment".
    var segments = doc.split("/");
    var targetPage = "" + segments[segments.length - 1];
    var hashAt = targetPage.indexOf("#");
    if (hashAt > 0) {
        targetPage = targetPage.substr(0, hashAt);
    }

    // Bounded search: stop at the end of the node list instead of running
    // past it when the page is not part of the contents.
    var match = null;
    for (var i = 0; i < tree.nodes.length; i++) {
        if (("" + tree.nodes[i].cnt[1]).lastIndexOf(targetPage) >= 0) {
            match = tree.nodes[i];
            break;
        }
    }

    if (match) {
        var owner = match.parent;
        if (owner != tree && owner.opened == false) {
            owner.open();
        }
        match.select();
    }
    lastDoc = doc;
}
167 new draw_contents(contents);
168 </script>
169 </body>
170 </html>
+0
-2
archmod/templates/arch_css.css less more
0 .small { font-size: x-small; }
1 .htable { margin: 0; border: none; padding: 0 }
+0
-26
archmod/templates/arch_frameset.html less more
0 <html>
1 <head>
2 <title><%title%></title>
3
4 <script>
// Work out which topic the content frame should show: the template default,
// unless the query string carries a usable "page" parameter.
var qs = location.search.substr(1);
var A = qs.split("&");
var B = null;
var F = "<%deftopic%>";
for (var i = 0; i < A.length; i++) {
    B = A[i].split("=");
    A[i] = [B[0], B[1]];
}
for (var j = 0; j < A.length; j++) {
    if (A[j][0] == 'page') {
        // The value ends up inside a src="..." attribute emitted through
        // document.write, so it is untrusted input: ignore it when it is
        // missing/empty, contains characters that could break out of the
        // attribute, or uses a script-capable URL scheme.
        if (A[j][1] &&
                !/["'<>]/.test(A[j][1]) &&
                !/^\s*(javascript|vbscript|data):/i.test(A[j][1])) {
            F = A[j][1];
        }
        break;
    }
}
11 </script >
12 </head>
13 <script>
14 document.write('<frameset cols="200,*" bordercolor="<%bcolor%>" frameborder="yes" framespacing="2" >')
15 document.write('<frame name="toc" src="arch_contents.html">')
16 document.write('<frame name="content" src="'+F+'" >')
17 document.write('</frameset>');
18 </script>
19 <noscript>
20 <frameset cols="200,*" bordercolor="<%bcolor%>" frameborder="yes" framespacing="2" >
21 <frame name="toc" src="arch_contents.html" >
22 <frame name="content" src="<%deftopic%>">
23 </frameset>
24 </noscript>
25 </html>
+0
-12
archmod/templates/arch_header.html less more
0 <html>
1 <head>
2 <title><%title%></title>
3 <LINK rel="Stylesheet" type="text/css" href="arch_css.css">
4 </head>
5 <body bgcolor="<%bcolor%>">
6 <table class='htable' cellpadding="0" cellspacing="0" width="100%"><td>
7 <td align="center" width="100%">
8 <b><font size="large" color="<%fcolor%>"><%title%></font></b>
9 </table>
10 </body>
11 </html>
archmod/templates/icons/0.gif less more
Binary diff not shown
archmod/templates/icons/1.gif less more
Binary diff not shown
archmod/templates/icons/10.gif less more
Binary diff not shown
archmod/templates/icons/11.gif less more
Binary diff not shown
archmod/templates/icons/12.gif less more
Binary diff not shown
archmod/templates/icons/13.gif less more
Binary diff not shown
archmod/templates/icons/14.gif less more
Binary diff not shown
archmod/templates/icons/15.gif less more
Binary diff not shown
archmod/templates/icons/16.gif less more
Binary diff not shown
archmod/templates/icons/17.gif less more
Binary diff not shown
archmod/templates/icons/18.gif less more
Binary diff not shown
archmod/templates/icons/19.gif less more
Binary diff not shown
archmod/templates/icons/2.gif less more
Binary diff not shown
archmod/templates/icons/20.gif less more
Binary diff not shown
archmod/templates/icons/21.gif less more
Binary diff not shown
archmod/templates/icons/22.gif less more
Binary diff not shown
archmod/templates/icons/23.gif less more
Binary diff not shown
archmod/templates/icons/24.gif less more
Binary diff not shown
archmod/templates/icons/25.gif less more
Binary diff not shown
archmod/templates/icons/26.gif less more
Binary diff not shown
archmod/templates/icons/27.gif less more
Binary diff not shown
archmod/templates/icons/3.gif less more
Binary diff not shown
archmod/templates/icons/35.gif less more
Binary diff not shown
archmod/templates/icons/37.gif less more
Binary diff not shown
archmod/templates/icons/39.gif less more
Binary diff not shown
archmod/templates/icons/4.gif less more
Binary diff not shown
archmod/templates/icons/5.gif less more
Binary diff not shown
archmod/templates/icons/6.gif less more
Binary diff not shown
archmod/templates/icons/7.gif less more
Binary diff not shown
archmod/templates/icons/8.gif less more
Binary diff not shown
archmod/templates/icons/9.gif less more
Binary diff not shown
archmod/templates/icons/90.gif less more
Binary diff not shown
archmod/templates/icons/91.gif less more
Binary diff not shown
archmod/templates/icons/92.gif less more
Binary diff not shown
archmod/templates/icons/93.gif less more
Binary diff not shown
archmod/templates/icons/94.gif less more
Binary diff not shown
archmod/templates/icons/95.gif less more
Binary diff not shown
archmod/templates/icons/96.gif less more
Binary diff not shown
archmod/templates/icons/97.gif less more
Binary diff not shown
archmod/templates/icons/98.gif less more
Binary diff not shown
archmod/templates/icons/99.gif less more
Binary diff not shown
archmod/templates/icons/next.gif less more
Binary diff not shown
archmod/templates/icons/prev.gif less more
Binary diff not shown
+0
-39
archmod/templates/index.html less more
0 <html>
1 <head>
2 <script>var pageid="";</script>
3
4 <title><%title%></title>
5
6 <script>
// True when the user agent string contains "Opera".
function IsOpera() {
    return navigator.userAgent.indexOf("Opera") > -1;
}

// Work out which topic to forward to the inner frameset: the template
// default, unless the query string carries a usable "page" parameter.
var qs = location.search.substr(1);
var A = qs.split("&");
var B = null;
var F = "<%deftopic%>";
for (var i = 0; i < A.length; i++) {
    B = A[i].split("=");
    A[i] = [B[0], B[1]];
}
for (var j = 0; j < A.length; j++) {
    if (A[j][0] == 'page') {
        // The value is later embedded in a src="..." attribute emitted
        // through document.write, so it is untrusted input: ignore it when
        // it is missing/empty, contains characters that could break out of
        // the attribute, or uses a script-capable URL scheme.
        if (A[j][1] &&
                !/["'<>]/.test(A[j][1]) &&
                !/^\s*(javascript|vbscript|data):/i.test(A[j][1])) {
            F = A[j][1];
        }
        break;
    }
}
22 if (IsOpera()) F = '';</script>
23 </head>
24
25 <script>
26 document.write('<frameset rows="30,*" frameborder="no" framespacing="0" border="0" >')
27 document.write('<frame name="header" src="arch_header.html" frameborder="no" noresize="yes" scrolling="no" >')
28 if(F!='')F='?page='+F
29 document.write('<frame name="main" src="arch_frameset.html'+F+'">')
30 document.write('</frameset>')
31 </script>
32 <noscript>
33 <frameset rows="30,*" frameborder="no" framespacing="0" border="0" >
34 <frame name="header" src="arch_header.html" frameborder="no" noresize="yes" scrolling="no">
35 <frame name="main" src="arch_frameset.html" >
36 </frameset>
37 </noscript>
38 </html>
00 [egg_info]
11 tag_build =
22 tag_date = 0
3 tag_svn_revision = 0
43
00 #!/usr/bin/env python
11
22 from setuptools import setup, find_packages
3 import version
43
54 long_desc='''arCHMage is a reader and decompressor for CHM format'''
65
1514
1615 setup(
1716 name='archmage',
18 version=version.getVersion(),
17 version='0.4.0',
1918 description='CHM decompressor',
2019 maintainer='Mikhail Gusarov',
2120 maintainer_email='dottedmag@dottedmag.net',
2726 packages=find_packages(),
2827 install_requires=[
2928 'pychm',
30 'BeautifulSoup',
29 'beautifulsoup4',
30 'sgmllib3k',
3131 ],
3232 entry_points={
33 'console_scripts': ['archmage = archmod.cli:main'],
33 'console_scripts': ['archmage = archmage.cli:main'],
3434 },
3535 package_data={
36 'archmod': ['*.conf', 'templates/*.html', 'templates/*.css',
36 'archmage': ['*.conf', 'templates/*.html', 'templates/*.css',
3737 'templates/icons/*.gif'],
3838 }
3939 )
+0
-123
version.py less more
0 # -*- coding: utf-8 -*-
1
2 """Calculates the current version number.
3
4 If possible, uses output of “git describe” modified to conform to the
5 versioning scheme that setuptools uses (see PEP 386). Releases must be
6 labelled with annotated tags (signed tags are annotated) of the following
7 format:
8
9 v<num>(.<num>)+ [ {a|b|c|rc} <num> (.<num>)* ]
10
11 If “git describe” returns an error (likely because we're in an unpacked copy
12 of a release tarball, rather than a git working copy), or returns a tag that
13 does not match the above format, version is read from RELEASE-VERSION file.
14
15 To use this script, simply import it in your setup.py file, and use the results
16 of getVersion() as your package version:
17
18 import version
19 setup(
20 version=version.getVersion(),
21 .
22 .
23 .
24 )
25
26 This will automatically update the RELEASE-VERSION file. The RELEASE-VERSION
27 file should *not* be checked into git but it *should* be included in sdist
28 tarballs (as should version.py file). To do this, run:
29
30 echo include RELEASE-VERSION version.py >>MANIFEST.in
31 echo RELEASE-VERSION >>.gitignore
32
33 With that setup, a new release can be labelled by simply invoking:
34
35 git tag -s v1.0
36 """
37
# Module metadata.
__author__ = ('Douglas Creager <dcreager@dcreager.net>',
              'Michal Nazarewicz <mina86@mina86.com>')
__license__ = 'This file is placed into the public domain.'
__maintainer__ = 'Michal Nazarewicz'
__email__ = 'mina86@mina86.com'

# Must be a sequence of names: the trailing comma makes this a one-element
# tuple.  Without it the parentheses are just grouping and __all__ would be
# the plain string 'getVersion', which star-imports iterate character by
# character.
__all__ = ('getVersion',)
45
46
47 import re
48 import subprocess
49 import sys
50
51
# Name of the file (relative to the current directory) that caches the
# version for trees without git metadata, e.g. unpacked release tarballs.
RELEASE_VERSION_FILE = 'RELEASE-VERSION'

# http://www.python.org/dev/peps/pep-0386/
# Core "N.N[.N...]" version with an optional a/b/c/rc pre-release suffix.
_PEP386_SHORT_VERSION_RE = r'\d+(?:\.\d+)+(?:(?:[abc]|rc)\d+(?:\.\d+)*)?'

# Whole-string match of a short version plus optional .postN / .devN parts.
_PEP386_VERSION_RE = (
    '^' + _PEP386_SHORT_VERSION_RE + r'(?:\.post\d+)?(?:\.dev\d+)?$')

# Output of "git describe --long": <version>-<commits since tag>-g<short sha>.
_GIT_DESCRIPTION_RE = (
    '^(?P<ver>' + _PEP386_SHORT_VERSION_RE
    + r')-(?P<commits>\d+)-g(?P<sha>[\da-f]+)$')
60
61
def readGitVersion():
    """Return a version string derived from ``git describe``, or None.

    Runs ``git describe --long --match '[0-9]*.*'`` and matches the first
    line of its output against ``_GIT_DESCRIPTION_RE``.  If the description
    reports zero commits since the tag, the tag's version is returned as
    is; otherwise the result is ``<ver>.post<commits>.dev<sha>``, where
    ``<sha>`` is the abbreviated commit hash converted to a decimal integer.

    Returns None when git cannot be started, exits with a non-zero status,
    or prints nothing.  A description that does not match the expected
    format also yields None, after a warning is written to stderr.
    """
    try:
        # universal_newlines=True makes communicate() return text instead of
        # bytes on Python 3; the str pattern used below cannot be applied to
        # bytes.
        proc = subprocess.Popen(
            ('git', 'describe', '--long', '--match', '[0-9]*.*'),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            universal_newlines=True)
        data, _ = proc.communicate()
    except (OSError, ValueError):
        # git is missing or could not be run, or its output was undecodable.
        return None
    if proc.returncode:
        return None

    lines = data.splitlines()
    ver = lines[0].strip() if lines else ''
    if not ver:
        return None

    m = re.search(_GIT_DESCRIPTION_RE, ver)
    if not m:
        sys.stderr.write('version: git description (%s) is invalid, '
                         'ignoring\n' % ver)
        return None

    commits = int(m.group('commits'))
    if not commits:
        # Exactly on a tag: the tag itself is the version.
        return m.group('ver')
    return '%s.post%d.dev%d' % (
        m.group('ver'), commits, int(m.group('sha'), 16))
88
89
def readReleaseVersion():
    """Return the version stored in ``RELEASE_VERSION_FILE``, or None.

    Only the first line of the file is used, stripped of surrounding
    whitespace.  If it does not look like a PEP 386 version a warning is
    written to stderr, but the value is returned anyway.  None is returned
    when the file cannot be opened or read.
    """
    try:
        with open(RELEASE_VERSION_FILE) as fd:
            ver = fd.readline().strip()
    except (IOError, OSError, UnicodeError):
        # Missing or unreadable file: there is simply no recorded version.
        return None

    if not re.search(_PEP386_VERSION_RE, ver):
        sys.stderr.write('version: release version (%s) is invalid, '
                         'will use it anyway\n' % ver)
    return ver
103
104
def writeReleaseVersion(version):
    """Overwrite ``RELEASE_VERSION_FILE`` with *version* and a newline.

    The file is closed even if the write fails; I/O errors propagate to the
    caller.
    """
    with open(RELEASE_VERSION_FILE, 'w') as fd:
        fd.write('%s\n' % version)
109
110
def getVersion():
    """Return the current version, refreshing the release-version file.

    The version reported by git takes precedence; the one recorded in the
    release-version file is the fallback.  Whenever the resulting version
    differs from the recorded one, the file is rewritten with it.

    Raises:
        ValueError: if neither source provides a version.
    """
    recorded = readReleaseVersion()
    resolved = readGitVersion()
    if not resolved:
        resolved = recorded
    if not resolved:
        raise ValueError('Cannot find the version number')
    if resolved != recorded:
        writeReleaseVersion(resolved)
    return resolved
119
120
121 if __name__ == '__main__':
122 print(getVersion())