Merge pull request #196 from OriHoch/support-for-timeouts
web scraping improvements: default timeout and session support
Gael Pasgrimaud authored 5 years ago
GitHub committed 5 years ago
18 | 18 | >>> pq(your_url, {'q': 'foo'}, method='post', verify=True) |
19 | 19 | [<html>] |
20 | 20 | |
21 | ||
22 | Timeout | |
23 | ------- | |
24 | ||
25 | The default timeout is 60 seconds; you can change it by setting the ``timeout`` parameter, which is forwarded to the underlying urllib or requests library. | |
26 | ||
27 | Session | |
28 | ------- | |
29 | ||
30 | When using the requests library, you can instantiate a ``Session`` object, which keeps state between HTTP calls (for example, to keep cookies). You can set the ``session`` parameter to use this session object. | |
31 | ||
21 | 32 | .. _requests: http://docs.python-requests.org/en/latest/ |
18 | 18 | except ImportError: |
19 | 19 | HAS_REQUEST = False |
20 | 20 | |
21 | DEFAULT_TIMEOUT = 60 | |
21 | 22 | |
22 | 23 | allowed_args = ( |
23 | 24 | 'auth', 'data', 'headers', 'verify', |
47 | 48 | |
48 | 49 | |
49 | 50 | def _requests(url, kwargs): |
51 | ||
50 | 52 | encoding = kwargs.get('encoding') |
51 | 53 | method = kwargs.get('method', 'get').lower() |
52 | meth = getattr(requests, str(method)) | |
54 | session = kwargs.get('session') | |
55 | if session: | |
56 | meth = getattr(session, str(method)) | |
57 | else: | |
58 | meth = getattr(requests, str(method)) | |
53 | 59 | if method == 'get': |
54 | 60 | url, data = _query(url, method, kwargs) |
55 | 61 | kw = {} |
56 | 62 | for k in allowed_args: |
57 | 63 | if k in kwargs: |
58 | 64 | kw[k] = kwargs[k] |
59 | resp = meth(url=url, **kw) | |
65 | resp = meth(url=url, timeout=kwargs.get('timeout', DEFAULT_TIMEOUT), **kw) | |
60 | 66 | if not (200 <= resp.status_code < 300): |
61 | 67 | raise HTTPError(resp.url, resp.status_code, |
62 | 68 | resp.reason, resp.headers, None) |
def _urllib(url, kwargs):
    """Fetch *url* with :func:`urlopen`, forwarding any query data.

    The optional ``timeout`` entry in *kwargs* bounds the request;
    it falls back to ``DEFAULT_TIMEOUT`` when absent.
    """
    http_method = kwargs.get('method')
    url, data = _query(url, http_method, kwargs)
    timeout = kwargs.get('timeout', DEFAULT_TIMEOUT)
    return urlopen(url, data, timeout=timeout)
73 | 79 | |
74 | 80 | |
75 | 81 | def url_opener(url, kwargs): |
4 | 4 | # Distributed under the BSD license, see LICENSE.txt |
5 | 5 | import os |
6 | 6 | import sys |
7 | import time | |
7 | 8 | from lxml import etree |
8 | 9 | from pyquery.pyquery import PyQuery as pq, no_default |
10 | from pyquery.openers import HAS_REQUEST | |
9 | 11 | from webtest import http |
10 | 12 | from webtest.debugapp import debug_app |
11 | 13 | from .compat import PY3k |
869 | 871 | self.assertIn('REQUEST_METHOD: POST', d('p').text()) |
870 | 872 | self.assertIn('q=foo', d('p').text()) |
871 | 873 | |
874 | def test_session(self): | |
875 | if HAS_REQUEST: | |
876 | import requests | |
877 | session = requests.Session() | |
878 | session.headers.update({'X-FOO': 'bar'}) | |
879 | d = pq(self.application_url, {'q': 'foo'}, | |
880 | method='get', session=session) | |
881 | self.assertIn('HTTP_X_FOO: bar', d('p').text()) | |
882 | else: | |
883 | self.skipTest('no requests library') | |
884 | ||
872 | 885 | def tearDown(self): |
873 | 886 | self.s.shutdown() |
874 | 887 | |
880 | 893 | method='get') |
881 | 894 | print(d) |
882 | 895 | self.assertEqual(d('#pt-login').text(), u'Войти') |
896 | ||
897 | ||
class TestWebScrappingTimeouts(TestCase):
    """Verify that the ``timeout`` keyword aborts slow HTTP fetches."""

    def setUp(self):
        def slow_app(environ, start_response):
            # Delay the body for 2 seconds so that a 1 second client
            # timeout is guaranteed to fire before the response lands.
            start_response('200 OK', [('Content-Type', 'text/plain')])
            time.sleep(2)
            return [b'foobar\n']
        self.s = http.StopableWSGIServer.create(slow_app)
        self.s.wait()
        self.application_url = self.s.application_url.rstrip('/')

    def test_get(self):
        # With the default timeout the slow response still succeeds.
        pq(self.application_url)
        # An explicit 1 second timeout must raise before the app answers.
        with self.assertRaises(Exception):
            pq(self.application_url, timeout=1)

    def tearDown(self):
        self.s.shutdown()