Import upstream version 0.4.2.1+git20210913.1.a8f632d
Debian Janitor
2 years ago
0 | 0 | Copyright (c) 2003 Eugeny Korekin <az@ftc.ru> |
1 | 1 | Copyright (c) 2005-2009 Basil Shubin <basil.shubin@gmail.com> |
2 | Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net> | |
2 | Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net> |
0 | Metadata-Version: 1.2 | |
0 | Metadata-Version: 2.1 | |
1 | 1 | Name: archmage |
2 | 2 | Version: 0.4.2.1 |
3 | 3 | Summary: CHM decompressor |
4 | 4 | Home-page: https://github.com/dottedmag/archmage |
5 | Maintainer: Mikhail Gusarov | |
5 | Maintainer: Misha Gusarov | |
6 | 6 | Maintainer-email: dottedmag@dottedmag.net |
7 | 7 | License: GPLv2+ |
8 | Description: arCHMage is a reader and decompressor for CHM format | |
9 | 8 | Keywords: chm,HTML Help,Compiled HTML,Compressed HTML |
10 | 9 | Platform: UNKNOWN |
11 | 10 | Classifier: Development Status :: 5 - Production/Stable |
14 | 13 | Classifier: Intended Audience :: End Users/Desktop |
15 | 14 | Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) |
16 | 15 | Classifier: Topic :: Text Processing :: Filters |
16 | License-File: COPYING | |
17 | License-File: AUTHORS | |
18 | ||
19 | arCHMage is a reader and decompressor for CHM format | |
20 |
32 | 32 | Installation |
33 | 33 | ============ |
34 | 34 | |
35 | Archmage uses PyCHM that depends on (C library) CHMlib. After CHMlib is installed, do | |
36 | ||
35 | 37 | pip install archmage |
36 | 38 | |
37 | 39 | Requirements |
39 | 41 | |
40 | 42 | arCHMage has the following dependencies: |
41 | 43 | |
42 | * Python 3.5+ | |
44 | * Python 3.6+ | |
43 | 45 | * PyCHM |
44 | 46 | * BeautifulSoup4 |
45 | 47 |
2 | 2 | # archmage -- CHM decompressor |
3 | 3 | # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net> |
4 | 4 | # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net> |
5 | # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net> | |
5 | # Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net> | |
6 | 6 | # |
7 | 7 | # This program is free software; you can redistribute it and/or modify it under |
8 | 8 | # the terms of the GNU General Public License as published by the Free Software |
28 | 28 | import tempfile |
29 | 29 | import os.path |
30 | 30 | from enum import Enum |
31 | from typing import List, Union | |
31 | 32 | |
32 | 33 | import archmage |
33 | 34 | |
35 | 36 | |
36 | 37 | # import PyCHM bindings |
37 | 38 | try: |
38 | from chm import chmlib | |
39 | from chm import chmlib # type: ignore | |
39 | 40 | except ImportError as msg: |
40 | 41 | sys.exit( |
41 | 42 | "ImportError: %s\nPlease check README file for system requirements." |
69 | 70 | out.append(path) |
70 | 71 | return chmlib.CHM_ENUMERATOR_CONTINUE |
71 | 72 | |
72 | out = [] | |
73 | out: List[str] = [] | |
73 | 74 | if ( |
74 | 75 | chmlib.chm_enumerate( |
75 | 76 | self._chm, chmlib.CHM_ENUMERATE_ALL, get_name, out |
122 | 123 | self.cache = {} |
123 | 124 | # Name of source directory with CHM content |
124 | 125 | if os.path.isdir(name): |
125 | self.source = DirSource(name) | |
126 | self.source: Union[DirSource, FileSource] = DirSource(name) | |
126 | 127 | else: |
127 | 128 | self.source = FileSource(name) |
128 | 129 | self.sourcename = name |
176 | 177 | return self.cache["image_urls"] |
177 | 178 | |
178 | 179 | def _image_urls(self): |
179 | out = [] | |
180 | out: List[str] = [] | |
180 | 181 | image_catcher = ImageCatcher() |
181 | 182 | for file in self.html_files(): |
183 | # Use latin-1, as it will accept any byte sequences | |
182 | 184 | image_catcher.feed( |
183 | 185 | Entry( |
184 | 186 | self.source, file, self.filename_case, self.restore_framing |
185 | ).correct() | |
187 | ).correct().decode("latin-1") | |
186 | 188 | ) |
187 | 189 | for image_url in image_catcher.imgurls: |
188 | 190 | if not out.count(image_url): |
272 | 274 | |
273 | 275 | def _toclevels(self): |
274 | 276 | counter = TOCCounter() |
275 | counter.feed(self.topicstree) | |
277 | # Use latin-1, as it will accept any byte sequences | |
278 | counter.feed(self.topicstree.decode("latin-1")) | |
276 | 279 | if counter.count > self.maxtoclvl: |
277 | 280 | return self.maxtoclvl |
278 | 281 | else: |
431 | 434 | self.extract_entry( |
432 | 435 | entry=key, output_file=key.lower(), destdir=tempdir |
433 | 436 | ) |
434 | htmldoc(files, self.htmldoc_exec, options, self.toclevels, output) | |
437 | htmldoc(files, self.htmldoc_exec, options, self.toclevels(), output) | |
435 | 438 | # Remove temporary files |
436 | 439 | shutil.rmtree(path=tempdir) |
437 | 440 | |
492 | 495 | data = self.lower_links(data) |
493 | 496 | |
494 | 497 | # Delete unwanted HTML elements. |
495 | data = re.sub("<div .*teamlib\\.gif.*\\/div>", "", data) | |
496 | data = re.sub("<a href.*>\\[ Team LiB \\]<\\/a>", "", data) | |
498 | data = re.sub(b"<div .*teamlib\\.gif.*\\/div>", b"", data) | |
499 | data = re.sub(b"<a href.*>\\[ Team LiB \\]<\\/a>", b"", data) | |
497 | 500 | data = re.sub( |
498 | "<table.*larrow\\.gif.*rarrow\\.gif.*<\\/table>", "", data | |
499 | ) | |
500 | data = re.sub("<a href.*next\\.gif[^>]*><\\/a>", "", data) | |
501 | data = re.sub("<a href.*previous\\.gif[^>]*><\\/a>", "", data) | |
502 | data = re.sub("<a href.*prev\\.gif[^>]*><\\/a>", "", data) | |
503 | data = re.sub('"[^"]*previous\\.gif"', '""', data) | |
504 | data = re.sub('"[^"]*prev\\.gif"', '""', data) | |
505 | data = re.sub('"[^"]*next\\.gif"', '""', data) | |
501 | b"<table.*larrow\\.gif.*rarrow\\.gif.*<\\/table>", b"", data | |
502 | ) | |
503 | data = re.sub(b"<a href.*next\\.gif[^>]*><\\/a>", b"", data) | |
504 | data = re.sub(b"<a href.*previous\\.gif[^>]*><\\/a>", b"", data) | |
505 | data = re.sub(b"<a href.*prev\\.gif[^>]*><\\/a>", b"", data) | |
506 | data = re.sub(b'"[^"]*previous\\.gif"', b'""', data) | |
507 | data = re.sub(b'"[^"]*prev\\.gif"', b'""', data) | |
508 | data = re.sub(b'"[^"]*next\\.gif"', b'""', data) | |
506 | 509 | if data is not None: |
507 | 510 | return data |
508 | 511 | else: |
509 | return "" | |
512 | return b"" | |
510 | 513 | |
511 | 514 | def get(self): |
512 | 515 | """Get CHM entry content""" |
523 | 526 | if data is not None: |
524 | 527 | return data |
525 | 528 | else: |
526 | return "" | |
529 | return b"" |
1 | 1 | # |
2 | 2 | # archmage -- CHM decompressor |
3 | 3 | # Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net> |
4 | # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net> | |
4 | # Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net> | |
5 | 5 | # |
6 | 6 | # This program is free software; you can redistribute it and/or modify it under |
7 | 7 | # the terms of the GNU General Public License as published by the Free Software |
20 | 20 | |
21 | 21 | import re |
22 | 22 | import mimetypes |
23 | import sgmllib, urllib.request, urllib.error, urllib.parse | |
24 | ||
25 | from bs4 import BeautifulSoup, UnicodeDammit | |
23 | import sgmllib # type: ignore | |
24 | import urllib.request, urllib.error, urllib.parse | |
25 | ||
26 | from bs4 import BeautifulSoup, UnicodeDammit # type: ignore | |
26 | 27 | from html.parser import HTMLParser |
27 | 28 | from urllib.parse import urlparse |
28 | 29 |
2 | 2 | # archmage -- CHM decompressor |
3 | 3 | # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net> |
4 | 4 | # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net> |
5 | # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net> | |
5 | # Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net> | |
6 | 6 | # |
7 | 7 | # This program is free software; you can redistribute it and/or modify it under |
8 | 8 | # the terms of the GNU General Public License as published by the Free Software |
55 | 55 | |
56 | 56 | # CHM2PDF converting. Use following command to convert CHM content to a single |
57 | 57 | # PDF file. Make sure that htmldoc is available on your system. |
58 | chmtopdf = '-t pdf14 -f "%(output)s" --book %(toc)s --no-numbered --toctitle "Table of Contents" --title --textcolor "#000000" --linkcolor "#0000ff" --linkstyle plain --size Universal --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --links --embedfonts --pagemode outline --pagelayout single --firstpage c1 --pageeffect none --pageduration 10 --effectduration 1.0 --no-encryption --permissions all --owner-password "" --user-password "" --browserwidth 680 --no-strict --no-overflow --quiet' | |
58 | chmtopdf = '-t pdf14 -f "%(output)s" --webpage %(toc)s --no-title --no-numbered --toctitle "Table of Contents" --textcolor "#000000" --linkcolor "#0000ff" --linkstyle plain --size Universal --left 1.00in --right 0.50in --top 0.50in --bottom 0.50in --header .t. --header1 ... --footer h.1 --nup 1 --tocheader .t. --tocfooter ..i --portrait --color --no-pscommands --no-xrxcomments --compression=1 --jpeg=0 --fontsize 11.0 --fontspacing 1.2 --headingfont Helvetica --bodyfont Times --headfootsize 11.0 --headfootfont Helvetica --charset iso-8859-1 --links --embedfonts --pagemode outline --pagelayout single --firstpage c1 --pageeffect none --pageduration 10 --effectduration 1.0 --no-encryption --permissions all --owner-password "" --user-password "" --browserwidth 680 --no-strict --no-overflow --quiet' | |
59 | 59 | |
60 | 60 | # Maximum Table of Content levels for htmldoc utility. |
61 | 61 | # |
2 | 2 | # archmage -- CHM decompressor |
3 | 3 | # Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net> |
4 | 4 | # Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net> |
5 | # Copyright (c) 2015,2019 Mikhail Gusarov <dottedmag@dottedmag.net> | |
5 | # Copyright (c) 2015-2020 Misha Gusarov <dottedmag@dottedmag.net> | |
6 | 6 | # |
7 | 7 | # This program is free software; you can redistribute it and/or modify it under |
8 | 8 | # the terms of the GNU General Public License as published by the Free Software |
20 | 20 | """Generic converter function""" |
21 | 21 | |
22 | 22 | import os |
23 | import string | |
24 | 23 | import tempfile |
25 | 24 | import subprocess |
26 | 25 | |
41 | 40 | options = options % {"output": output, "toc": toc} |
42 | 41 | if input: |
43 | 42 | # Create a htmldoc file for batch processing |
44 | f = tempfile.NamedTemporaryFile(delete=False) | |
45 | f.write("#HTMLDOC 1.8.27\n") | |
46 | f.write(options + "\n") | |
47 | f.write(string.join(input, "\n")) | |
43 | f = tempfile.NamedTemporaryFile(mode="wb", delete=False) | |
44 | f.write(b"#HTMLDOC 1.8.27\n") | |
45 | f.write(options.encode("utf-8") + b"\n") | |
46 | f.write(b'\n'.join(f.encode('utf-8') for f in input)) | |
48 | 47 | f.close() |
49 | 48 | # Prepare command line to execute |
50 | 49 | command = "%s --batch %s" % (cmd, f.name) |
0 | Metadata-Version: 1.2 | |
0 | Metadata-Version: 2.1 | |
1 | 1 | Name: archmage |
2 | 2 | Version: 0.4.2.1 |
3 | 3 | Summary: CHM decompressor |
4 | 4 | Home-page: https://github.com/dottedmag/archmage |
5 | Maintainer: Mikhail Gusarov | |
5 | Maintainer: Misha Gusarov | |
6 | 6 | Maintainer-email: dottedmag@dottedmag.net |
7 | 7 | License: GPLv2+ |
8 | Description: arCHMage is a reader and decompressor for CHM format | |
9 | 8 | Keywords: chm,HTML Help,Compiled HTML,Compressed HTML |
10 | 9 | Platform: UNKNOWN |
11 | 10 | Classifier: Development Status :: 5 - Production/Stable |
14 | 13 | Classifier: Intended Audience :: End Users/Desktop |
15 | 14 | Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) |
16 | 15 | Classifier: Topic :: Text Processing :: Filters |
16 | License-File: COPYING | |
17 | License-File: AUTHORS | |
18 | ||
19 | arCHMage is a reader and decompressor for CHM format | |
20 |
3 | 3 | NEWS |
4 | 4 | README.md |
5 | 5 | archmage.1 |
6 | pyproject.toml | |
6 | 7 | setup.py |
7 | 8 | archmage/CHM.py |
8 | 9 | archmage/CHMParser.py |