pyquery / 1dd000c
Merge pull request #196 from OriHoch/support-for-timeouts

web scraping improvements: default timeout and session support

Gael Pasgrimaud authored 5 years ago; GitHub committed 5 years ago
3 changed files with 53 additions and 3 deletions.
@@ -18,4 +18,15 @@
 >>> pq(your_url, {'q': 'foo'}, method='post', verify=True)
 [<html>]
 
+
+Timeout
+-------
+
+The default timeout is 60 seconds. You can change it by setting the ``timeout`` parameter, which is forwarded to the underlying urllib or requests library.
+
+Session
+-------
+
+When using the requests library, you can instantiate a ``Session`` object to keep state between HTTP calls (for example, to keep cookies). Pass it via the ``session`` parameter (see the usage sketch below).
+
 .. _requests: http://docs.python-requests.org/en/latest/
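A minimal usage sketch of the two new parameters, assuming ``your_url`` points at a reachable page and the requests library is installed (``session`` only applies to the requests backend):

    from pyquery import PyQuery as pq
    import requests

    your_url = 'http://example.com'  # placeholder URL

    # Override the 60-second default timeout; the value is forwarded to
    # whichever backend (urllib or requests) ends up making the call.
    d = pq(your_url, timeout=10)

    # Reuse one Session across calls to keep cookies and default headers.
    session = requests.Session()
    session.headers.update({'X-FOO': 'bar'})
    d = pq(your_url, {'q': 'foo'}, method='get', session=session)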
@@ -18,6 +18,7 @@
 except ImportError:
     HAS_REQUEST = False
 
+DEFAULT_TIMEOUT = 60
 
 allowed_args = (
     'auth', 'data', 'headers', 'verify',
@@ -47,16 +48,21 @@
 
 
 def _requests(url, kwargs):
+
     encoding = kwargs.get('encoding')
     method = kwargs.get('method', 'get').lower()
-    meth = getattr(requests, str(method))
+    session = kwargs.get('session')
+    if session:
+        meth = getattr(session, str(method))
+    else:
+        meth = getattr(requests, str(method))
     if method == 'get':
         url, data = _query(url, method, kwargs)
     kw = {}
     for k in allowed_args:
         if k in kwargs:
             kw[k] = kwargs[k]
-    resp = meth(url=url, **kw)
+    resp = meth(url=url, timeout=kwargs.get('timeout', DEFAULT_TIMEOUT), **kw)
     if not (200 <= resp.status_code < 300):
         raise HTTPError(resp.url, resp.status_code,
                         resp.reason, resp.headers, None)
@@ -69,7 +75,7 @@
 def _urllib(url, kwargs):
     method = kwargs.get('method')
     url, data = _query(url, method, kwargs)
-    return urlopen(url, data)
+    return urlopen(url, data, timeout=kwargs.get('timeout', DEFAULT_TIMEOUT))
 
 
 def url_opener(url, kwargs):
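For context on the dispatch above: ``_requests`` resolves the HTTP verb either to a bound method on the caller's ``Session`` or to the module-level requests function. A standalone sketch of the same lookup (``resolve`` is a hypothetical helper, not part of pyquery):

    import requests

    def resolve(method, session=None):
        # Prefer the session's bound method so cookies and headers persist
        # between calls; otherwise fall back to the module-level function.
        target = session if session is not None else requests
        return getattr(target, str(method).lower())

    print(resolve('get'))                       # module-level requests.get
    print(resolve('POST', requests.Session()))  # bound Session.post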
@@ -4,8 +4,10 @@
 # Distributed under the BSD license, see LICENSE.txt
 import os
 import sys
+import time
 from lxml import etree
 from pyquery.pyquery import PyQuery as pq, no_default
+from pyquery.openers import HAS_REQUEST
 from webtest import http
 from webtest.debugapp import debug_app
 from .compat import PY3k
@@ -869,6 +871,17 @@
         self.assertIn('REQUEST_METHOD: POST', d('p').text())
         self.assertIn('q=foo', d('p').text())
 
+    def test_session(self):
+        if HAS_REQUEST:
+            import requests
+            session = requests.Session()
+            session.headers.update({'X-FOO': 'bar'})
+            d = pq(self.application_url, {'q': 'foo'},
+                   method='get', session=session)
+            self.assertIn('HTTP_X_FOO: bar', d('p').text())
+        else:
+            self.skipTest('no requests library')
+
     def tearDown(self):
         self.s.shutdown()
 
@@ -880,3 +893,23 @@
                method='get')
         print(d)
         self.assertEqual(d('#pt-login').text(), u'Войти')
+
+
+class TestWebScrappingTimeouts(TestCase):
+
+    def setUp(self):
+        def app(environ, start_response):
+            start_response('200 OK', [('Content-Type', 'text/plain')])
+            time.sleep(2)
+            return [b'foobar\n']
+        self.s = http.StopableWSGIServer.create(app)
+        self.s.wait()
+        self.application_url = self.s.application_url.rstrip('/')
+
+    def test_get(self):
+        pq(self.application_url)
+        with self.assertRaises(Exception):
+            pq(self.application_url, timeout=1)
+
+    def tearDown(self):
+        self.s.shutdown()
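The timeout test asserts a bare ``Exception`` because the two backends fail differently. A hedged illustration of catching each backend's timeout error explicitly (``slow_url`` is a placeholder for an endpoint that takes longer than one second to answer):

    import socket
    import requests
    from urllib.error import URLError
    from urllib.request import urlopen

    slow_url = 'http://example.com/slow'  # placeholder

    try:
        requests.get(slow_url, timeout=1)
    except requests.exceptions.Timeout:
        pass  # requests backend: Timeout, a RequestException subclass

    try:
        urlopen(slow_url, timeout=1)
    except (URLError, socket.timeout):
        pass  # urllib backend: URLError on connect, socket.timeout mid-read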