New upstream version 0.6.2
tony mancill
6 years ago
8 | 8 | - "3.6-dev" |
9 | 9 | - "nightly" |
10 | 10 | - "pypy" |
11 | - "pypy3" | |
12 | 11 | |
13 | 12 | install: |
14 | 13 | - pip install -r requirements-test.txt |
19 | 18 | |
20 | 19 | after_script: |
21 | 20 | - coveralls |
21 | ||
22 | deploy: | |
23 | provider: pypi | |
24 | user: "gpodder" | |
25 | password: | |
26 | secure: "S0LtFa2Oz/srn78bQahypyMrp46jOHgucFYyGDGS+i9/fSar5zqyo+COX9vUsd3Lo7RRS00RPJDG5n/HSbH3x2LWUQRlltu51Qr8srC66lwjgkARjJbcbcyBBB+b0U8UYt5zns3CgJTWbrZ5tiN0Gtoq72ojfhiYuf54V4hRdlQ=" | |
27 | on: | |
28 | tags: true |
3 | 3 | The podcast parser project is a library from the gPodder project to provide an |
4 | 4 | easy and reliable way of parsing RSS- and Atom-based podcast feeds in Python. |
5 | 5 | |
6 | * Web: http://gpodder.org/podcastparser/ | |
6 | ||
7 | ## Automated Tests | |
8 | ||
9 | To run the unit tests you need [`nose`](http://nose.readthedocs.io/en/latest/). If you have `nose` installed, use the `nosetests` command in the repository's root directory to run the tests. |
0 | environment: | |
1 | ||
2 | matrix: | |
3 | ||
4 | # For Python versions available on Appveyor, see | |
5 | # http://www.appveyor.com/docs/installed-software#python | |
6 | # The list here is complete (excluding Python 2.6, which | |
7 | # isn't covered by this document) at the time of writing. | |
8 | ||
9 | - PYTHON: "C:\\Python27" | |
10 | - PYTHON: "C:\\Python33" | |
11 | - PYTHON: "C:\\Python34" | |
12 | - PYTHON: "C:\\Python35" | |
13 | - PYTHON: "C:\\Python27-x64" | |
14 | - PYTHON: "C:\\Python33-x64" | |
15 | DISTUTILS_USE_SDK: "1" | |
16 | - PYTHON: "C:\\Python34-x64" | |
17 | DISTUTILS_USE_SDK: "1" | |
18 | - PYTHON: "C:\\Python35-x64" | |
19 | ||
20 | install: | |
21 | # Install dependencies | |
22 | - "%PYTHON%\\python.exe -m pip install -r requirements-test.txt" | |
23 | - "%PYTHON%\\python.exe -m pip install nose coverage" | |
24 | ||
25 | build: off | |
26 | ||
27 | test_script: | |
28 | # Put your test command here. | |
29 | # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, | |
30 | # you can remove "build.cmd" from the front of the command, as it's | |
31 | # only needed to support those cases. | |
32 | # Note that you must use the environment variable %PYTHON% to refer to | |
33 | # the interpreter you're using - Appveyor does not do anything special | |
34 | # to put the Python version you want to use on PATH. | |
35 | - "%PYTHON%\\python.exe -m nose" | |
36 | ||
37 | after_test: | |
38 | # Do coverage report. | |
39 | # - "coveralls" | |
40 |
191 | 191 | .. automodule:: podcastparser |
192 | 192 | :members: |
193 | 193 | |
194 | Unsupported Namespaces | |
195 | ====================== | |
196 | ||
197 | This is a list of podcast-related XML namespaces that are not yet | |
198 | supported by podcastparser, but might be in the future. | |
199 | ||
200 | Chapter Marks | |
201 | ------------- | |
202 | ||
203 | - `rawvoice RSS`_: Rating, Frequency, Poster, WebM, MP4, Metamark (kind of chapter-like markers) | |
204 | - `IGOR`_: Chapter Marks | |
205 | ||
206 | .. _rawvoice RSS: http://www.rawvoice.com/rawvoiceRssModule/ | |
207 | .. _IGOR: http://emonk.net/IGOR | |
208 | ||
209 | Others | |
210 | ------ | |
211 | ||
212 | - `libSYN RSS Extensions`_: contactPhone, contactEmail, contactTwitter, contactWebsite, wallpaper, pdf, background | |
213 | - `Comment API`_: Comments to a given item (readable via RSS) | |
214 | - `MVCB`_: Error Reports To Field (usually a mailto: link) | |
215 | - `Syndication Module`_: Update period, frequency and base (for skipping updates) | |
216 | - `Creative Commons RSS`_: Creative commons license for the content | |
217 | - `Pheedo`_: Original link to website and original link to enclosure (without going through pheedo redirect) | |
218 | - `WGS84`_: Geo-Coordinates per item | |
219 | - `Conversations Network`_: Intro duration in milliseconds (for skipping the intro), ratings | |
220 | - `purl DC Elements`_: dc:creator (author / creator of the podcast, possibly with e-mail address) | |
221 | - `Tristana`_: tristana:self (canonical URL to feed) | |
222 | - `Blip`_: Show name, show page, picture, username, language, rating, thumbnail_src, license | |
223 | ||
224 | .. _libSYN RSS Extensions: http://libsyn.com/rss-extension | |
225 | .. _Comment API: http://www.wellformedweb.org/CommentAPI/ | |
226 | .. _MVCB: http://webns.net/mvcb/ | |
227 | .. _Syndication Module: http://web.resource.org/rss/1.0/modules/syndication/ | |
228 | .. _Creative Commons RSS: http://backend.userland.com/creativeCommonsRssModule | |
229 | .. _Pheedo: http://www.pheedo.com/namespace/pheedo | |
230 | .. _WGS84: http://www.w3.org/2003/01/geo/wgs84_pos# | |
231 | .. _Conversations Network: http://conversationsnetwork.org/rssNamespace-1.0/ | |
232 | .. _purl DC Elements: http://purl.org/dc/elements/1.1/ | |
233 | .. _Tristana: http://www.tristana.org | |
234 | .. _Blip: http://blip.tv/dtd/blip/1.0 | |
235 | ||
236 | ||
194 | 237 | Indices and tables |
195 | 238 | ================== |
196 | 239 |
19 | 19 | |
20 | 20 | # Will be parsed by setup.py to determine package metadata |
21 | 21 | __author__ = 'Thomas Perl <m@thp.io>' |
22 | __version__ = '0.6.1' | |
22 | __version__ = '0.6.2' | |
23 | 23 | __website__ = 'http://gpodder.org/podcastparser/' |
24 | 24 | __license__ = 'ISC License' |
25 | 25 | |
47 | 47 | |
48 | 48 | try: |
49 | 49 | # Python 2 |
50 | from rfc822 import mktime_tz, parsedate_tz | |
50 | from rfc822 import parsedate_tz | |
51 | import calendar | |
52 | # This is taken from Python 3's email._parseaddr, since it handles | |
53 | # pre-epoch dates better than what Python 2 does (time.mktime()) | |
54 | def mktime_tz(data): | |
55 | if data[9] is None: | |
56 | # No zone info, so localtime is better assumption than GMT | |
57 | return time.mktime(data[:8] + (-1,)) | |
58 | else: | |
59 | t = calendar.timegm(data) | |
60 | return t - data[9] | |
51 | 61 | except ImportError: |
52 | 62 | # Python 3 |
53 | 63 | from email.utils import mktime_tz, parsedate_tz |
516 | 526 | >>> parse_pubdate('Fri, 21 Nov 1997 09:55:06 -0600') |
517 | 527 | 880127706 |
518 | 528 | |
529 | >>> parse_pubdate('2003-12-13T00:00:00+02:00') | |
530 | 1071266400 | |
531 | ||
532 | >>> parse_pubdate('2003-12-13T18:30:02Z') | |
533 | 1071340202 | |
534 | ||
535 | >>> parse_pubdate('Mon, 02 May 1960 09:05:01 +0100') | |
536 | -305049299 | |
537 | ||
519 | 538 | >>> parse_pubdate('') |
520 | 539 | 0 |
521 | 540 | |
527 | 546 | |
528 | 547 | parsed = parsedate_tz(text) |
529 | 548 | if parsed is not None: |
530 | return int(mktime_tz(parsed)) | |
531 | ||
532 | # TODO: Fully RFC 3339-compliant parsing (w/ timezone) | |
549 | try: | |
550 | pubtimeseconds = int(mktime_tz(parsed)) | |
551 | return pubtimeseconds | |
552 | except(OverflowError,ValueError): | |
553 | logger.warn('bad pubdate %s is before epoch or after end of time (2038)',parsed) | |
554 | return 0 | |
555 | ||
533 | 556 | try: |
534 | 557 | parsed = time.strptime(text[:19], '%Y-%m-%dT%H:%M:%S') |
535 | 558 | if parsed is not None: |
536 | return int(time.mktime(parsed)) | |
559 | m = re.match(r'^(?:Z|([+-])([0-9]{2})[:]([0-9]{2}))$', text[19:]) | |
560 | if m: | |
561 | parsed = list(iter(parsed)) | |
562 | if m.group(1): | |
563 | offset = 3600 * int(m.group(2)) + 60 * int(m.group(3)) | |
564 | if m.group(1) == '-': | |
565 | offset = 0 - offset | |
566 | else: | |
567 | offset = 0 | |
568 | parsed.append(offset) | |
569 | return int(mktime_tz(tuple(parsed))) | |
570 | else: | |
571 | return int(time.mktime(parsed)) | |
537 | 572 | except Exception: |
538 | 573 | pass |
539 | 574 | |
559 | 594 | 'rss/channel/item/link': EpisodeAttrRelativeLink('link'), |
560 | 595 | 'rss/channel/item/description': EpisodeAttr('description', squash_whitespace), |
561 | 596 | 'rss/channel/item/itunes:summary': EpisodeAttr('description', squash_whitespace), |
597 | 'rss/channel/item/media:description': EpisodeAttr('description', squash_whitespace), | |
562 | 598 | 'rss/channel/item/itunes:subtitle': EpisodeAttr('subtitle', squash_whitespace), |
563 | 599 | 'rss/channel/item/content:encoded': EpisodeAttr('description_html'), |
564 | 600 | 'rss/channel/item/itunes:duration': EpisodeAttr('total_time', parse_time), |
581 | 617 | 'atom:feed/atom:entry/atom:title': EpisodeAttr('title', squash_whitespace), |
582 | 618 | 'atom:feed/atom:entry/atom:link': AtomLink(), |
583 | 619 | 'atom:feed/atom:entry/atom:content': AtomContent(), |
620 | 'atom:feed/atom:entry/content:encoded': EpisodeAttr('description_html'), | |
584 | 621 | 'atom:feed/atom:entry/atom:published': EpisodeAttr('published', parse_pubdate), |
622 | 'atom:feed/atom:entry/atom:updated': EpisodeAttr('published', parse_pubdate), | |
623 | 'atom:feed/atom:entry/media:group/media:description': EpisodeAttr('description', squash_whitespace), | |
585 | 624 | 'atom:feed/atom:entry/psc:chapters': PodloveChapters(), |
586 | 625 | 'atom:feed/atom:entry/psc:chapters/psc:chapter': PodloveChapter(), |
587 | 626 | } |
627 | ||
628 | # Derive valid root elements from the supported MAPPINGs | |
629 | VALID_ROOTS = set(path.split('/')[0] for path in MAPPING.keys()) | |
630 | ||
631 | ||
632 | class FeedParseError(sax.SAXParseException, ValueError): | |
633 | """ | |
634 | Exception raised when asked to parse an invalid feed | |
635 | ||
636 | This exception allows users of this library to catch exceptions | |
637 | without having to import the XML parsing library themselves. | |
638 | """ | |
639 | pass | |
588 | 640 | |
589 | 641 | |
590 | 642 | class PodcastHandler(sax.handler.ContentHandler): |
634 | 686 | if len(entry['chapters']) == 0: |
635 | 687 | del entry['chapters'] |
636 | 688 | |
689 | # Ensures `description` does not contain HTML | |
690 | if 'description' in entry and is_html(entry['description']): | |
691 | if 'description_html' not in entry: | |
692 | entry['description_html'] = entry['description'] | |
693 | entry['description'] = '' | |
694 | ||
695 | # Sets `description` to stripped `description_html` when absent | |
637 | 696 | if 'description_html' in entry and not entry['description']: |
638 | 697 | entry['description'] = remove_html_tags(entry['description_html']) |
639 | 698 | |
672 | 731 | |
673 | 732 | def startElement(self, name, attrs): |
674 | 733 | self.namespace = Namespace(attrs, self.namespace) |
675 | self.path_stack.append(self.namespace.map(name)) | |
734 | name = self.namespace.map(name) | |
735 | if not self.path_stack and name not in VALID_ROOTS: | |
736 | raise FeedParseError( | |
737 | msg='Unsupported feed type: {}'.format(name), | |
738 | exception=None, | |
739 | locator=self._locator, | |
740 | ) | |
741 | self.path_stack.append(name) | |
676 | 742 | |
677 | 743 | target = MAPPING.get('/'.join(self.path_stack)) |
678 | 744 | if target is not None: |
706 | 772 | :returns: a dict with the parsed contents of the feed |
707 | 773 | """ |
708 | 774 | handler = PodcastHandler(url, max_episodes) |
709 | sax.parse(stream, handler) | |
775 | try: | |
776 | sax.parse(stream, handler) | |
777 | except sax.SAXParseException as e: | |
778 | raise FeedParseError(e.getMessage(), e.getException(), e._locator) | |
710 | 779 | return handler.data |
711 | 780 | |
712 | 781 | |
780 | 849 | # urlunsplit might return "a slighty different, but equivalent URL" |
781 | 850 | return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) |
782 | 851 | |
852 | def is_html(text): | |
853 | """ | |
854 | Tests whether the given string contains HTML encoded data | |
855 | """ | |
856 | html_test = re.compile(r'<[a-z][\s\S]*>', re.IGNORECASE) | |
857 | return bool(html_test.search(text)) | |
783 | 858 | |
784 | 859 | def remove_html_tags(html): |
785 | 860 | """ |
20 | 20 | import os |
21 | 21 | import glob |
22 | 22 | import json |
23 | try: | |
24 | # Python 2 | |
25 | from StringIO import StringIO | |
26 | except ImportError: | |
27 | # Python 3 | |
28 | from io import StringIO | |
29 | ||
23 | 30 | |
24 | 31 | from nose.tools import assert_equal |
32 | from nose.tools import assert_raises | |
25 | 33 | |
26 | 34 | import podcastparser |
27 | 35 | |
38 | 46 | params = json.load(open(param_filename)) |
39 | 47 | |
40 | 48 | expected = json.load(open(json_filename)) |
41 | parsed = podcastparser.parse('file://' + rss_filename, | |
49 | normalized_rss_filename = rss_filename | |
50 | if os.sep == '\\': | |
51 | normalized_rss_filename = normalized_rss_filename.replace(os.sep, '/') | |
52 | parsed = podcastparser.parse('file://' + normalized_rss_filename, | |
42 | 53 | open(rss_filename), **params) |
43 | 54 | |
44 | 55 | assert_equal.__self__.maxDiff = None |
46 | 57 | |
47 | 58 | for rss_filename in glob.glob(os.path.join('tests', 'data', '*.rss')): |
48 | 59 | yield test_parse_rss, rss_filename |
60 | ||
61 | def test_invalid_roots(): | |
62 | def test_fail_parse(feed): | |
63 | with assert_raises(podcastparser.FeedParseError): | |
64 | podcastparser.parse('file://example.com/feed.xml', StringIO(feed)) | |
65 | ||
66 | feeds = [ | |
67 | '<html><body/></html>', | |
68 | '<foo xmlns="http://example.com/foo.xml"><bar/></foo>', | |
69 | '<baz:foo xmlns:baz="http://example.com/baz.xml"><baz:bar/></baz:foo>', | |
70 | ] | |
71 | for feed in feeds: | |
72 | yield test_fail_parse, feed |
0 | {"episodes": [{"description": "Hello", | |
1 | "description_html": "<h1>Hello</h1>", | |
2 | "enclosures": [], | |
3 | "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", | |
4 | "link": "http://example.org/2003/12/13/atom03", | |
5 | "payment_url": null, | |
6 | "published": 1071340202, | |
7 | "title": "Atom-Powered Robots Run Amok", | |
8 | "total_time": 0}], | |
9 | "title": "Example Feed"} |
0 | <?xml version="1.0" encoding="utf-8"?> | |
1 | <feed xmlns="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"> | |
2 | ||
3 | <title>Example Feed</title> | |
4 | <link href="http://example.org/"/> | |
5 | <updated>2003-12-13T18:30:02Z</updated> | |
6 | <author> | |
7 | <name>John Doe</name> | |
8 | </author> | |
9 | <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> | |
10 | ||
11 | <entry> | |
12 | <title>Atom-Powered Robots Run Amok</title> | |
13 | <link href="http://example.org/2003/12/13/atom03"/> | |
14 | <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> | |
15 | <updated>2003-12-13T18:30:02Z</updated> | |
16 | <summary>Some text.</summary> | |
17 | <content:encoded><![CDATA[<h1>Hello</h1>]]></content:encoded> | |
18 | </entry> | |
19 | ||
20 | </feed> |
0 | {"episodes": [{"description": "", | |
1 | "enclosures": [], | |
2 | "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", | |
3 | "link": "http://example.org/2003/12/13/atom03", | |
4 | "payment_url": null, | |
5 | "published": 1071340202, | |
6 | "title": "Atom-Powered Robots Run Amok", | |
7 | "total_time": 0}], | |
8 | "title": "Example Feed"} |
0 | <?xml version="1.0" encoding="utf-8"?> | |
1 | <feed xmlns="http://www.w3.org/2005/Atom"> | |
2 | ||
3 | <title>Example Feed</title> | |
4 | <link href="http://example.org/"/> | |
5 | <updated>2003-12-13T18:30:02Z</updated> | |
6 | <author> | |
7 | <name>John Doe</name> | |
8 | </author> | |
9 | <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> | |
10 | ||
11 | <entry> | |
12 | <title>Atom-Powered Robots Run Amok</title> | |
13 | <link href="http://example.org/2003/12/13/atom03"/> | |
14 | <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> | |
15 | <updated>2003-12-13T18:30:02Z</updated> | |
16 | <summary>Some text.</summary> | |
17 | </entry> | |
18 | ||
19 | </feed> |
0 | { | |
1 | "title": "HTML Podcast", | |
2 | "episodes": [ | |
3 | { | |
4 | "title": "Ep 1", | |
5 | "description": "This is a test", | |
6 | "description_html": "<h1>This is a <em>test</em></h1>", | |
7 | "published": 0, | |
8 | "guid": "http://example.org/example.opus", | |
9 | "link": "", | |
10 | "total_time": 0, | |
11 | "payment_url": null, | |
12 | "enclosures": [ | |
13 | { | |
14 | "file_size": -1, | |
15 | "url": "http://example.org/example.opus", | |
16 | "mime_type": "application/octet-stream" | |
17 | } | |
18 | ] | |
19 | } | |
20 | ] | |
21 | } |
0 | <rss> | |
1 | <channel> | |
2 | <title>HTML Podcast</title> | |
3 | <item> | |
4 | <title>Ep 1</title> | |
5 | <enclosure url="http://example.org/example.opus"/> | |
6 | <description> | |
7 | <![CDATA[ <h1>This is a <em>test</em></h1> ]]> | |
8 | </description> | |
9 | </item> | |
10 | </channel> | |
11 | </rss> |
0 | { | |
1 | "title": "HTML Podcast with Text Description", | |
2 | "episodes": [ | |
3 | { | |
4 | "title": "Ep 1", | |
5 | "description": "This is also a test", | |
6 | "description_html": "<h1>This is also a <em>test</em></h1>", | |
7 | "published": 0, | |
8 | "guid": "http://example.org/example.opus", | |
9 | "link": "", | |
10 | "total_time": 0, | |
11 | "payment_url": null, | |
12 | "enclosures": [ | |
13 | { | |
14 | "file_size": -1, | |
15 | "url": "http://example.org/example.opus", | |
16 | "mime_type": "application/octet-stream" | |
17 | } | |
18 | ] | |
19 | } | |
20 | ] | |
21 | } |
0 | <rss> | |
1 | <channel> | |
2 | <title>HTML Podcast with Text Description</title> | |
3 | <item> | |
4 | <title>Ep 1</title> | |
5 | <enclosure url="http://example.org/example.opus"/> | |
6 | <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"> | |
7 | <![CDATA[ <h1>This is also a <em>test</em></h1> ]]> | |
8 | </content:encoded> | |
9 | <description><![CDATA[ <h1>This is also a <em>test</em></h1> ]]></description> | |
10 | </item> | |
11 | </channel> | |
12 | </rss> |
0 | { | |
1 | "title": "HTML Podcast with Text Description", | |
2 | "episodes": [ | |
3 | { | |
4 | "title": "Ep 1", | |
5 | "description": "This is also a test", | |
6 | "description_html": "<h1>This is also a <em>test</em></h1>", | |
7 | "published": 0, | |
8 | "guid": "http://example.org/example.opus", | |
9 | "link": "", | |
10 | "total_time": 0, | |
11 | "payment_url": null, | |
12 | "enclosures": [ | |
13 | { | |
14 | "file_size": -1, | |
15 | "url": "http://example.org/example.opus", | |
16 | "mime_type": "application/octet-stream" | |
17 | } | |
18 | ] | |
19 | } | |
20 | ] | |
21 | } |
0 | <rss> | |
1 | <channel> | |
2 | <title>HTML Podcast with Text Description</title> | |
3 | <item> | |
4 | <title>Ep 1</title> | |
5 | <enclosure url="http://example.org/example.opus"/> | |
6 | <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"> | |
7 | <![CDATA[ <h1>This is also a <em>test</em></h1> ]]> | |
8 | </content:encoded> | |
9 | <description><![CDATA[ <h1>This text will be discarded</h1> ]]></description> | |
10 | </item> | |
11 | </channel> | |
12 | </rss> |