New upstream version 0.6.2
tony mancill
6 years ago
8 | 8 | - "3.6-dev" |
9 | 9 | - "nightly" |
10 | 10 | - "pypy" |
11 | - "pypy3" | |
12 | 11 | |
13 | 12 | install: |
14 | 13 | - pip install -r requirements-test.txt |
19 | 18 | |
20 | 19 | after_script: |
21 | 20 | - coveralls |
21 | ||
22 | deploy: | |
23 | provider: pypi | |
24 | user: "gpodder" | |
25 | password: | |
26 | secure: "S0LtFa2Oz/srn78bQahypyMrp46jOHgucFYyGDGS+i9/fSar5zqyo+COX9vUsd3Lo7RRS00RPJDG5n/HSbH3x2LWUQRlltu51Qr8srC66lwjgkARjJbcbcyBBB+b0U8UYt5zns3CgJTWbrZ5tiN0Gtoq72ojfhiYuf54V4hRdlQ=" | |
27 | on: | |
28 | tags: true |
3 | 3 | The podcast parser project is a library from the gPodder project to provide an |
4 | 4 | easy and reliable way of parsing RSS- and Atom-based podcast feeds in Python. |
5 | 5 | |
6 | * Web: http://gpodder.org/podcastparser/ | |
6 | ||
7 | ## Automated Tests | |
8 | ||
9 | To run the unit tests you need [`nose`](http://nose.readthedocs.io/en/latest/). If you have `nose` installed, use the `nosetests` command in the repository's root directory to run the tests. |
0 | environment: | |
1 | ||
2 | matrix: | |
3 | ||
4 | # For Python versions available on Appveyor, see | |
5 | # http://www.appveyor.com/docs/installed-software#python | |
6 | # The list here is complete (excluding Python 2.6, which | |
7 | # isn't covered by this document) at the time of writing. | |
8 | ||
9 | - PYTHON: "C:\\Python27" | |
10 | - PYTHON: "C:\\Python33" | |
11 | - PYTHON: "C:\\Python34" | |
12 | - PYTHON: "C:\\Python35" | |
13 | - PYTHON: "C:\\Python27-x64" | |
14 | - PYTHON: "C:\\Python33-x64" | |
15 | DISTUTILS_USE_SDK: "1" | |
16 | - PYTHON: "C:\\Python34-x64" | |
17 | DISTUTILS_USE_SDK: "1" | |
18 | - PYTHON: "C:\\Python35-x64" | |
19 | ||
20 | install: | |
21 | # Install dependencies | |
22 | - "%PYTHON%\\python.exe -m pip install -r requirements-test.txt" | |
23 | - "%PYTHON%\\python.exe -m pip install nose coverage" | |
24 | ||
25 | build: off | |
26 | ||
27 | test_script: | |
28 | # Put your test command here. | |
29 | # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, | |
30 | # you can remove "build.cmd" from the front of the command, as it's | |
31 | # only needed to support those cases. | |
32 | # Note that you must use the environment variable %PYTHON% to refer to | |
33 | # the interpreter you're using - Appveyor does not do anything special | |
34 | # to put the Python version you want to use on PATH. | |
35 | - "%PYTHON%\\python.exe -m nose" | |
36 | ||
37 | after_test: | |
38 | # Do coverage report. | |
39 | # - "coveralls" | |
40 |
191 | 191 | .. automodule:: podcastparser |
192 | 192 | :members: |
193 | 193 | |
194 | Unsupported Namespaces | |
195 | ====================== | |
196 | ||
197 | This is a list of podcast-related XML namespaces that are not yet | |
198 | supported by podcastparser, but might be in the future. | |
199 | ||
200 | Chapter Marks | |
201 | ------------- | |
202 | ||
203 | - `rawvoice RSS`_: Rating, Frequency, Poster, WebM, MP4, Metamark (kind of chapter-like markers) | |
204 | - `IGOR`_: Chapter Marks | |
205 | ||
206 | .. _rawvoice RSS: http://www.rawvoice.com/rawvoiceRssModule/ | |
207 | .. _IGOR: http://emonk.net/IGOR | |
208 | ||
209 | Others | |
210 | ------ | |
211 | ||
212 | - `libSYN RSS Extensions`_: contactPhone, contactEmail, contactTwitter, contactWebsite, wallpaper, pdf, background | |
213 | - `Comment API`_: Comments to a given item (readable via RSS) | |
214 | - `MVCB`_: Error Reports To Field (usually a mailto: link) | |
215 | - `Syndication Module`_: Update period, frequency and base (for skipping updates) | |
216 | - `Creative Commons RSS`_: Creative commons license for the content | |
217 | - `Pheedo`_: Original link to website and original link to enclosure (without going through pheedo redirect) | |
218 | - `WGS84`_: Geo-Coordinates per item | |
219 | - `Conversations Network`_: Intro duration in milliseconds (for skipping the intro), ratings | |
220 | - `purl DC Elements`_: dc:creator (author / creator of the podcast, possibly with e-mail address) | |
221 | - `Tristana`_: tristana:self (canonical URL to feed) | |
222 | - `Blip`_: Show name, show page, picture, username, language, rating, thumbnail_src, license | |
223 | ||
224 | .. _libSYN RSS Extensions: http://libsyn.com/rss-extension | |
225 | .. _Comment API: http://www.wellformedweb.org/CommentAPI/ | |
226 | .. _MVCB: http://webns.net/mvcb/ | |
227 | .. _Syndication Module: http://web.resource.org/rss/1.0/modules/syndication/ | |
228 | .. _Creative Commons RSS: http://backend.userland.com/creativeCommonsRssModule | |
229 | .. _Pheedo: http://www.pheedo.com/namespace/pheedo | |
230 | .. _WGS84: http://www.w3.org/2003/01/geo/wgs84_pos# | |
231 | .. _Conversations Network: http://conversationsnetwork.org/rssNamespace-1.0/ | |
232 | .. _purl DC Elements: http://purl.org/dc/elements/1.1/ | |
233 | .. _Tristana: http://www.tristana.org | |
234 | .. _Blip: http://blip.tv/dtd/blip/1.0 | |
235 | ||
236 | ||
194 | 237 | Indices and tables |
195 | 238 | ================== |
196 | 239 |
19 | 19 | |
20 | 20 | # Will be parsed by setup.py to determine package metadata |
21 | 21 | __author__ = 'Thomas Perl <m@thp.io>' |
22 | __version__ = '0.6.1' | |
22 | __version__ = '0.6.2' | |
23 | 23 | __website__ = 'http://gpodder.org/podcastparser/' |
24 | 24 | __license__ = 'ISC License' |
25 | 25 | |
47 | 47 | |
48 | 48 | try: |
49 | 49 | # Python 2 |
50 | from rfc822 import mktime_tz, parsedate_tz | |
50 | from rfc822 import parsedate_tz | |
51 | import calendar | |
52 | # This is taken from Python 3's email._parseaddr, since it handles | |
53 | # pre-epoch dates better than what Python 2 does (time.mktime()) | |
54 | def mktime_tz(data): | |
55 | if data[9] is None: | |
56 | # No zone info, so localtime is better assumption than GMT | |
57 | return time.mktime(data[:8] + (-1,)) | |
58 | else: | |
59 | t = calendar.timegm(data) | |
60 | return t - data[9] | |
51 | 61 | except ImportError: |
52 | 62 | # Python 3 |
53 | 63 | from email.utils import mktime_tz, parsedate_tz |
516 | 526 | >>> parse_pubdate('Fri, 21 Nov 1997 09:55:06 -0600') |
517 | 527 | 880127706 |
518 | 528 | |
529 | >>> parse_pubdate('2003-12-13T00:00:00+02:00') | |
530 | 1071266400 | |
531 | ||
532 | >>> parse_pubdate('2003-12-13T18:30:02Z') | |
533 | 1071340202 | |
534 | ||
535 | >>> parse_pubdate('Mon, 02 May 1960 09:05:01 +0100') | |
536 | -305049299 | |
537 | ||
519 | 538 | >>> parse_pubdate('') |
520 | 539 | 0 |
521 | 540 | |
527 | 546 | |
528 | 547 | parsed = parsedate_tz(text) |
529 | 548 | if parsed is not None: |
530 | return int(mktime_tz(parsed)) | |
531 | ||
532 | # TODO: Fully RFC 3339-compliant parsing (w/ timezone) | |
549 | try: | |
550 | pubtimeseconds = int(mktime_tz(parsed)) | |
551 | return pubtimeseconds | |
552 | except(OverflowError,ValueError): | |
553 | logger.warn('bad pubdate %s is before epoch or after end of time (2038)',parsed) | |
554 | return 0 | |
555 | ||
533 | 556 | try: |
534 | 557 | parsed = time.strptime(text[:19], '%Y-%m-%dT%H:%M:%S') |
535 | 558 | if parsed is not None: |
536 | return int(time.mktime(parsed)) | |
559 | m = re.match(r'^(?:Z|([+-])([0-9]{2})[:]([0-9]{2}))$', text[19:]) | |
560 | if m: | |
561 | parsed = list(iter(parsed)) | |
562 | if m.group(1): | |
563 | offset = 3600 * int(m.group(2)) + 60 * int(m.group(3)) | |
564 | if m.group(1) == '-': | |
565 | offset = 0 - offset | |
566 | else: | |
567 | offset = 0 | |
568 | parsed.append(offset) | |
569 | return int(mktime_tz(tuple(parsed))) | |
570 | else: | |
571 | return int(time.mktime(parsed)) | |
537 | 572 | except Exception: |
538 | 573 | pass |
539 | 574 | |
559 | 594 | 'rss/channel/item/link': EpisodeAttrRelativeLink('link'), |
560 | 595 | 'rss/channel/item/description': EpisodeAttr('description', squash_whitespace), |
561 | 596 | 'rss/channel/item/itunes:summary': EpisodeAttr('description', squash_whitespace), |
597 | 'rss/channel/item/media:description': EpisodeAttr('description', squash_whitespace), | |
562 | 598 | 'rss/channel/item/itunes:subtitle': EpisodeAttr('subtitle', squash_whitespace), |
563 | 599 | 'rss/channel/item/content:encoded': EpisodeAttr('description_html'), |
564 | 600 | 'rss/channel/item/itunes:duration': EpisodeAttr('total_time', parse_time), |
581 | 617 | 'atom:feed/atom:entry/atom:title': EpisodeAttr('title', squash_whitespace), |
582 | 618 | 'atom:feed/atom:entry/atom:link': AtomLink(), |
583 | 619 | 'atom:feed/atom:entry/atom:content': AtomContent(), |
620 | 'atom:feed/atom:entry/content:encoded': EpisodeAttr('description_html'), | |
584 | 621 | 'atom:feed/atom:entry/atom:published': EpisodeAttr('published', parse_pubdate), |
622 | 'atom:feed/atom:entry/atom:updated': EpisodeAttr('published', parse_pubdate), | |
623 | 'atom:feed/atom:entry/media:group/media:description': EpisodeAttr('description', squash_whitespace), | |
585 | 624 | 'atom:feed/atom:entry/psc:chapters': PodloveChapters(), |
586 | 625 | 'atom:feed/atom:entry/psc:chapters/psc:chapter': PodloveChapter(), |
587 | 626 | } |
627 | ||
628 | # Derive valid root elements from the supported MAPPINGs | |
629 | VALID_ROOTS = set(path.split('/')[0] for path in MAPPING.keys()) | |
630 | ||
631 | ||
632 | class FeedParseError(sax.SAXParseException, ValueError): | |
633 | """ | |
634 | Exception raised when asked to parse an invalid feed | |
635 | ||
636 | This exception allows users of this library to catch exceptions | |
637 | without having to import the XML parsing library themselves. | |
638 | """ | |
639 | pass | |
588 | 640 | |
589 | 641 | |
590 | 642 | class PodcastHandler(sax.handler.ContentHandler): |
634 | 686 | if len(entry['chapters']) == 0: |
635 | 687 | del entry['chapters'] |
636 | 688 | |
689 | # Ensures `description` does not contain HTML | |
690 | if 'description' in entry and is_html(entry['description']): | |
691 | if 'description_html' not in entry: | |
692 | entry['description_html'] = entry['description'] | |
693 | entry['description'] = '' | |
694 | ||
695 | # Sets `description` to stripped `description_html` when absent | |
637 | 696 | if 'description_html' in entry and not entry['description']: |
638 | 697 | entry['description'] = remove_html_tags(entry['description_html']) |
639 | 698 | |
672 | 731 | |
673 | 732 | def startElement(self, name, attrs): |
674 | 733 | self.namespace = Namespace(attrs, self.namespace) |
675 | self.path_stack.append(self.namespace.map(name)) | |
734 | name = self.namespace.map(name) | |
735 | if not self.path_stack and name not in VALID_ROOTS: | |
736 | raise FeedParseError( | |
737 | msg='Unsupported feed type: {}'.format(name), | |
738 | exception=None, | |
739 | locator=self._locator, | |
740 | ) | |
741 | self.path_stack.append(name) | |
676 | 742 | |
677 | 743 | target = MAPPING.get('/'.join(self.path_stack)) |
678 | 744 | if target is not None: |
706 | 772 | :returns: a dict with the parsed contents of the feed |
707 | 773 | """ |
708 | 774 | handler = PodcastHandler(url, max_episodes) |
709 | sax.parse(stream, handler) | |
775 | try: | |
776 | sax.parse(stream, handler) | |
777 | except sax.SAXParseException as e: | |
778 | raise FeedParseError(e.getMessage(), e.getException(), e._locator) | |
710 | 779 | return handler.data |
711 | 780 | |
712 | 781 | |
780 | 849 | # urlunsplit might return "a slighty different, but equivalent URL" |
781 | 850 | return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) |
782 | 851 | |
852 | def is_html(text): | |
853 | """ | |
854 | Tests whether the given string contains HTML encoded data | |
855 | """ | |
856 | html_test = re.compile(r'<[a-z][\s\S]*>', re.IGNORECASE) | |
857 | return bool(html_test.search(text)) | |
783 | 858 | |
784 | 859 | def remove_html_tags(html): |
785 | 860 | """ |
20 | 20 | import os |
21 | 21 | import glob |
22 | 22 | import json |
23 | try: | |
24 | # Python 2 | |
25 | from StringIO import StringIO | |
26 | except ImportError: | |
27 | # Python 3 | |
28 | from io import StringIO | |
29 | ||
23 | 30 | |
24 | 31 | from nose.tools import assert_equal |
32 | from nose.tools import assert_raises | |
25 | 33 | |
26 | 34 | import podcastparser |
27 | 35 | |
38 | 46 | params = json.load(open(param_filename)) |
39 | 47 | |
40 | 48 | expected = json.load(open(json_filename)) |
41 | parsed = podcastparser.parse('file://' + rss_filename, | |
49 | normalized_rss_filename = rss_filename | |
50 | if os.sep == '\\': | |
51 | normalized_rss_filename = normalized_rss_filename.replace(os.sep, '/') | |
52 | parsed = podcastparser.parse('file://' + normalized_rss_filename, | |
42 | 53 | open(rss_filename), **params) |
43 | 54 | |
44 | 55 | assert_equal.__self__.maxDiff = None |
46 | 57 | |
47 | 58 | for rss_filename in glob.glob(os.path.join('tests', 'data', '*.rss')): |
48 | 59 | yield test_parse_rss, rss_filename |
60 | ||
61 | def test_invalid_roots(): | |
62 | def test_fail_parse(feed): | |
63 | with assert_raises(podcastparser.FeedParseError): | |
64 | podcastparser.parse('file://example.com/feed.xml', StringIO(feed)) | |
65 | ||
66 | feeds = [ | |
67 | '<html><body/></html>', | |
68 | '<foo xmlns="http://example.com/foo.xml"><bar/></foo>', | |
69 | '<baz:foo xmlns:baz="http://example.com/baz.xml"><baz:bar/></baz:foo>', | |
70 | ] | |
71 | for feed in feeds: | |
72 | yield test_fail_parse, feed |
0 | {"episodes": [{"description": "Hello", | |
1 | "description_html": "<h1>Hello</h1>", | |
2 | "enclosures": [], | |
3 | "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", | |
4 | "link": "http://example.org/2003/12/13/atom03", | |
5 | "payment_url": null, | |
6 | "published": 1071340202, | |
7 | "title": "Atom-Powered Robots Run Amok", | |
8 | "total_time": 0}], | |
9 | "title": "Example Feed"} |
0 | <?xml version="1.0" encoding="utf-8"?> | |
1 | <feed xmlns="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"> | |
2 | ||
3 | <title>Example Feed</title> | |
4 | <link href="http://example.org/"/> | |
5 | <updated>2003-12-13T18:30:02Z</updated> | |
6 | <author> | |
7 | <name>John Doe</name> | |
8 | </author> | |
9 | <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> | |
10 | ||
11 | <entry> | |
12 | <title>Atom-Powered Robots Run Amok</title> | |
13 | <link href="http://example.org/2003/12/13/atom03"/> | |
14 | <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> | |
15 | <updated>2003-12-13T18:30:02Z</updated> | |
16 | <summary>Some text.</summary> | |
17 | <content:encoded><![CDATA[<h1>Hello</h1>]]></content:encoded> | |
18 | </entry> | |
19 | ||
20 | </feed> |
0 | {"episodes": [{"description": "", | |
1 | "enclosures": [], | |
2 | "guid": "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", | |
3 | "link": "http://example.org/2003/12/13/atom03", | |
4 | "payment_url": null, | |
5 | "published": 1071340202, | |
6 | "title": "Atom-Powered Robots Run Amok", | |
7 | "total_time": 0}], | |
8 | "title": "Example Feed"} |
0 | <?xml version="1.0" encoding="utf-8"?> | |
1 | <feed xmlns="http://www.w3.org/2005/Atom"> | |
2 | ||
3 | <title>Example Feed</title> | |
4 | <link href="http://example.org/"/> | |
5 | <updated>2003-12-13T18:30:02Z</updated> | |
6 | <author> | |
7 | <name>John Doe</name> | |
8 | </author> | |
9 | <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> | |
10 | ||
11 | <entry> | |
12 | <title>Atom-Powered Robots Run Amok</title> | |
13 | <link href="http://example.org/2003/12/13/atom03"/> | |
14 | <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> | |
15 | <updated>2003-12-13T18:30:02Z</updated> | |
16 | <summary>Some text.</summary> | |
17 | </entry> | |
18 | ||
19 | </feed> |
0 | { | |
1 | "title": "HTML Podcast", | |
2 | "episodes": [ | |
3 | { | |
4 | "title": "Ep 1", | |
5 | "description": "This is a test", | |
6 | "description_html": "<h1>This is a <em>test</em></h1>", | |
7 | "published": 0, | |
8 | "guid": "http://example.org/example.opus", | |
9 | "link": "", | |
10 | "total_time": 0, | |
11 | "payment_url": null, | |
12 | "enclosures": [ | |
13 | { | |
14 | "file_size": -1, | |
15 | "url": "http://example.org/example.opus", | |
16 | "mime_type": "application/octet-stream" | |
17 | } | |
18 | ] | |
19 | } | |
20 | ] | |
21 | } |
0 | <rss> | |
1 | <channel> | |
2 | <title>HTML Podcast</title> | |
3 | <item> | |
4 | <title>Ep 1</title> | |
5 | <enclosure url="http://example.org/example.opus"/> | |
6 | <description> | |
7 | <![CDATA[ <h1>This is a <em>test</em></h1> ]]> | |
8 | </description> | |
9 | </item> | |
10 | </channel> | |
11 | </rss> |
0 | { | |
1 | "title": "HTML Podcast with Text Description", | |
2 | "episodes": [ | |
3 | { | |
4 | "title": "Ep 1", | |
5 | "description": "This is also a test", | |
6 | "description_html": "<h1>This is also a <em>test</em></h1>", | |
7 | "published": 0, | |
8 | "guid": "http://example.org/example.opus", | |
9 | "link": "", | |
10 | "total_time": 0, | |
11 | "payment_url": null, | |
12 | "enclosures": [ | |
13 | { | |
14 | "file_size": -1, | |
15 | "url": "http://example.org/example.opus", | |
16 | "mime_type": "application/octet-stream" | |
17 | } | |
18 | ] | |
19 | } | |
20 | ] | |
21 | } |
0 | <rss> | |
1 | <channel> | |
2 | <title>HTML Podcast with Text Description</title> | |
3 | <item> | |
4 | <title>Ep 1</title> | |
5 | <enclosure url="http://example.org/example.opus"/> | |
6 | <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"> | |
7 | <![CDATA[ <h1>This is also a <em>test</em></h1> ]]> | |
8 | </content:encoded> | |
9 | <description><![CDATA[ <h1>This is also a <em>test</em></h1> ]]></description> | |
10 | </item> | |
11 | </channel> | |
12 | </rss> |
0 | { | |
1 | "title": "HTML Podcast with Text Description", | |
2 | "episodes": [ | |
3 | { | |
4 | "title": "Ep 1", | |
5 | "description": "This is also a test", | |
6 | "description_html": "<h1>This is also a <em>test</em></h1>", | |
7 | "published": 0, | |
8 | "guid": "http://example.org/example.opus", | |
9 | "link": "", | |
10 | "total_time": 0, | |
11 | "payment_url": null, | |
12 | "enclosures": [ | |
13 | { | |
14 | "file_size": -1, | |
15 | "url": "http://example.org/example.opus", | |
16 | "mime_type": "application/octet-stream" | |
17 | } | |
18 | ] | |
19 | } | |
20 | ] | |
21 | } |
0 | <rss> | |
1 | <channel> | |
2 | <title>HTML Podcast with Text Description</title> | |
3 | <item> | |
4 | <title>Ep 1</title> | |
5 | <enclosure url="http://example.org/example.opus"/> | |
6 | <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"> | |
7 | <![CDATA[ <h1>This is also a <em>test</em></h1> ]]> | |
8 | </content:encoded> | |
9 | <description><![CDATA[ <h1>This text will be discarded</h1> ]]></description> | |
10 | </item> | |
11 | </channel> | |
12 | </rss> |