Merge pull request #196 from OriHoch/support-for-timeouts
web scraping improvements: default timeout and session support
Gael Pasgrimaud authored 5 years ago
GitHub committed 5 years ago
18 | 18 | >>> pq(your_url, {'q': 'foo'}, method='post', verify=True) |
19 | 19 | [<html>] |
20 | 20 | |
21 | ||
22 | Timeout | |
23 | ------- | |
24 | ||
25 | The default timeout is 60 seconds; you can change it by setting the ``timeout`` parameter, which is forwarded to the underlying urllib or requests library. | |
26 | ||
27 | Session | |
28 | ------- | |
29 | ||
30 | When using the requests library, you can instantiate a ``Session`` object, which keeps state between HTTP calls (for example, to keep cookies). You can set the ``session`` parameter to use this session object. | |
31 | ||
21 | 32 | .. _requests: http://docs.python-requests.org/en/latest/ |
18 | 18 | except ImportError: |
19 | 19 | HAS_REQUEST = False |
20 | 20 | |
21 | DEFAULT_TIMEOUT = 60 | |
21 | 22 | |
22 | 23 | allowed_args = ( |
23 | 24 | 'auth', 'data', 'headers', 'verify', |
47 | 48 | |
48 | 49 | |
49 | 50 | def _requests(url, kwargs): |
51 | ||
50 | 52 | encoding = kwargs.get('encoding') |
51 | 53 | method = kwargs.get('method', 'get').lower() |
52 | meth = getattr(requests, str(method)) | |
54 | session = kwargs.get('session') | |
55 | if session: | |
56 | meth = getattr(session, str(method)) | |
57 | else: | |
58 | meth = getattr(requests, str(method)) | |
53 | 59 | if method == 'get': |
54 | 60 | url, data = _query(url, method, kwargs) |
55 | 61 | kw = {} |
56 | 62 | for k in allowed_args: |
57 | 63 | if k in kwargs: |
58 | 64 | kw[k] = kwargs[k] |
59 | resp = meth(url=url, **kw) | |
65 | resp = meth(url=url, timeout=kwargs.get('timeout', DEFAULT_TIMEOUT), **kw) | |
60 | 66 | if not (200 <= resp.status_code < 300): |
61 | 67 | raise HTTPError(resp.url, resp.status_code, |
62 | 68 | resp.reason, resp.headers, None) |
def _urllib(url, kwargs):
    """Fetch *url* with :func:`urlopen`, forwarding any query data.

    The optional ``timeout`` entry in *kwargs* bounds the request;
    it falls back to ``DEFAULT_TIMEOUT`` when absent.
    """
    http_method = kwargs.get('method')
    url, data = _query(url, http_method, kwargs)
    timeout = kwargs.get('timeout', DEFAULT_TIMEOUT)
    return urlopen(url, data, timeout=timeout)
73 | 79 | |
74 | 80 | |
75 | 81 | def url_opener(url, kwargs): |
4 | 4 | # Distributed under the BSD license, see LICENSE.txt |
5 | 5 | import os |
6 | 6 | import sys |
7 | import time | |
7 | 8 | from lxml import etree |
8 | 9 | from pyquery.pyquery import PyQuery as pq, no_default |
10 | from pyquery.openers import HAS_REQUEST | |
9 | 11 | from webtest import http |
10 | 12 | from webtest.debugapp import debug_app |
11 | 13 | from .compat import PY3k |
869 | 871 | self.assertIn('REQUEST_METHOD: POST', d('p').text()) |
870 | 872 | self.assertIn('q=foo', d('p').text()) |
871 | 873 | |
874 | def test_session(self): | |
875 | if HAS_REQUEST: | |
876 | import requests | |
877 | session = requests.Session() | |
878 | session.headers.update({'X-FOO': 'bar'}) | |
879 | d = pq(self.application_url, {'q': 'foo'}, | |
880 | method='get', session=session) | |
881 | self.assertIn('HTTP_X_FOO: bar', d('p').text()) | |
882 | else: | |
883 | self.skipTest('no requests library') | |
884 | ||
872 | 885 | def tearDown(self): |
873 | 886 | self.s.shutdown() |
874 | 887 | |
880 | 893 | method='get') |
881 | 894 | print(d) |
882 | 895 | self.assertEqual(d('#pt-login').text(), u'Войти') |
896 | ||
897 | ||
class TestWebScrappingTimeouts(TestCase):
    """Verify that the ``timeout`` keyword aborts slow HTTP fetches."""

    def setUp(self):
        def slow_app(environ, start_response):
            # Delay the body for 2 seconds so that a 1 second client
            # timeout is guaranteed to fire before the response lands.
            start_response('200 OK', [('Content-Type', 'text/plain')])
            time.sleep(2)
            return [b'foobar\n']
        self.s = http.StopableWSGIServer.create(slow_app)
        self.s.wait()
        self.application_url = self.s.application_url.rstrip('/')

    def test_get(self):
        # With the default timeout the slow response still succeeds.
        pq(self.application_url)
        # An explicit 1 second timeout must raise before the app answers.
        with self.assertRaises(Exception):
            pq(self.application_url, timeout=1)

    def tearDown(self):
        self.s.shutdown()