Codebase list pyct / daf8303
Support --use-test-data (#63) * Now allows substitution of 'test data', reading from matching file in .data_stubs * Added cleanup command to remove the substituted files * Added pytest-based tests Chris B authored 5 years ago James A. Bednar committed 5 years ago
8 changed file(s) with 389 addition(s) and 30 deletion(s). Raw diff Collapse all Expand all
2626
2727 ```
2828 $ datashader examples --help
29 usage: datashader examples [-h] [--path PATH] [-v] [--force]
29 usage: datashader examples [-h] [--path PATH] [-v] [--force] [--use-test-data]
3030
3131 optional arguments:
32 -h, --help show this help message and exit
33 --path PATH location to place examples and data
32 -h, --help show this help message and exit
33 --path PATH location to place examples and data
3434 -v, --verbose
35 --force if PATH already exists, force overwrite existing examples if older than source examples
35 --force if PATH already exists, force overwrite existing examples
36 if older than source examples. ALSO force any existing data
37 files to be replaced
38 --use-test-data Use data's test files, if any, instead of fetching full
39 data. If test file not in '.data_stubs', fall back to
40 fetching full data.
3641 ```
3742
3843 To copy the examples of e.g. datashader but not download the data,
4550 -h, --help show this help message and exit
4651 --path PATH where to copy examples
4752 -v, --verbose
48 --force if PATH already exists, force overwrite existing examples if older than source examples
53 --force if PATH already exists, force overwrite existing files if
54 older than source files
4955 ```
5056
5157 And to download the data only, the `fetch-data` command:
5258
5359 ```
5460 usage: datashader fetch-data [-h] [--path PATH] [--datasets DATASETS] [-v]
61 [--force] [--use-test-data]
5562
5663 optional arguments:
5764 -h, --help show this help message and exit
5865 --path PATH where to put data
59 --datasets DATASETS *name* of datasets file; must exist either in path specified by --path or in package/examples/
66 --datasets DATASETS *name* of datasets file; must exist either in path
67 specified by --path or in package/examples/
6068 -v, --verbose
69 --force Force any existing data files to be replaced
70 --use-test-data Use data's test files, if any, instead of fetching full
71 data. If test file not in '.data_stubs', fall back to
72 fetching full data.
6173 ```
6274
6375 You can specify a different 'datasets' file:
7890 Skipping Depth data for the Chesapeake and Delaware Bay region of the USA
7991 ```
8092
93 You can use smaller files instead of large ones by passing the `--use-test-data` flag
94 and placing a small file with the same name in `examples/data/.data_stubs`:
95
96 ```
97 $ tree examples/data -a
98 examples/data
99 ├── .data_stubs
100 │   └── nyc_taxi_wide.parq
101 └── diamonds.csv
102
103 $ cat examples/datasets.yml
104 data:
105
106 - url: http://s3.amazonaws.com/datashader-data/nyc_taxi_wide.parq
107 title: 'NYC Taxi Data'
108 files:
109 - nyc_taxi_wide.parq
110
111 - url: http://s3.amazonaws.com/datashader-data/maccdc2012_graph.zip
112 title: 'National CyberWatch Mid-Atlantic Collegiate Cyber Defense Competition'
113 files:
114 - maccdc2012_nodes.parq
115 - maccdc2012_edges.parq
116 - maccdc2012_full_nodes.parq
117 - maccdc2012_full_edges.parq
118
119 $ pyviz fetch-data --path=examples --use-test-data
120 Fetching data defined in /tmp/pyviz/examples/datasets.yml and placing in /tmp/pyviz/examples/data
121 Copying test data file '/tmp/pyviz/examples/data/.data_stubs/nyc_taxi_wide.parq' to '/tmp/pyviz/examples/data/nyc_taxi_wide.parq'
122 No test file found for: /tmp/pyviz/examples/data/.data_stubs/maccdc2012_nodes.parq. Using regular file instead
123 Downloading National CyberWatch Mid-Atlantic Collegiate Cyber Defense Competition 1 of 1
124 [################################] 59/59 - 00:00:00
125 ```
126
127 To clean up any potential test files masquerading as real data, use `clean-data`:
128
129 ```
130 usage: pyviz clean-data [-h] [--path PATH]
131
132 optional arguments:
133 -h, --help show this help message and exit
134 --path PATH where to clean data
135 ```
81136
82137 ## pyct.build
83138
0 {
1 "cells": [
2 {
3 "cell_type": "markdown",
4 "metadata": {},
5 "source": [
6 "**NOTE:** This notebook is used in the tests and should not be deleted."
7 ]
8 },
9 {
10 "cell_type": "code",
11 "execution_count": null,
12 "metadata": {},
13 "outputs": [],
14 "source": [
15 "import pandas as pd"
16 ]
17 },
18 {
19 "cell_type": "code",
20 "execution_count": null,
21 "metadata": {},
22 "outputs": [],
23 "source": [
24 "df = pd.read_csv('../data/test_data.csv')\n",
25 "df.head()"
26 ]
27 }
28 ],
29 "metadata": {
30 "language_info": {
31 "name": "python",
32 "pygments_lexer": "ipython3"
33 }
34 },
35 "nbformat": 4,
36 "nbformat_minor": 2
37 }
0 name,score,rank
1 Alice,100.5,1
2 Bob,50.3,2
3 Charlie,25,3
0 data:
1 - url: this_should_never_be_used
2 title: 'Test Data'
3 files:
4 - test_data.csv
88 import inspect
99 import argparse
1010 import distutils.dir_util
11 import shutil
1112
1213 def _find_examples(name):
1314 module_path = os.path.dirname(inspect.getfile(importlib.import_module(name)))
2324
2425 raise ValueError("Could not find examples for %s at any of %s"%(name,candidates))
2526
26 def examples(name,path,verbose=False,force=False):
27 """Copy examples and fetch data (if any) to the supplied path. See copy-examples and fetch-data for more flexibility."""
27 def examples(name,path,verbose=False,use_test_data=False,force=False):
28 """
29 Copy examples and fetch data (if any) to the supplied path.
30 See copy-examples and fetch-data for more flexibility.
31
32 NOTE: force operates both on example and data over-writing
33 pre-existing files.
34 """
2835 copy_examples(name, path, verbose, force)
29 fetch_data(name,path,require_datasets=False)
36 fetch_data(name,path,require_datasets=False,use_test_data=use_test_data,force=force)
3037
3138
3239 def copy_examples(name,path,verbose=False,force=False):
7986 # print('this download script requires the requests module: conda install requests')
8087 # sys.exit(1)
8188
82
89
8390 STREAM = sys.stderr
8491
8592 BAR_TEMPLATE = '%s[%s%s] %i/%i - %s\r'
95102 # How many intervals (excluding the current one) to calculate the simple moving
96103 # average
97104 ETA_SMA_WINDOW = 9
105 DATA_DIR = 'data'
106 DATA_STUBS_DIR = '.data_stubs'
98107
99108
100109 class Bar(object):
253262 os.remove(output_path)
254263
255264
256 def _process_dataset(dataset, output_dir, here):
265 def _process_dataset(dataset, output_dir, here, use_test_data=False, force=False):
257266 '''Process each download spec in datasets.yml
258267
259268 Typically each dataset list entry in the yml has
275284 requires_download = True
276285 break
277286
278 if not requires_download:
287 if force is False and not requires_download:
279288 print('Skipping {0}'.format(dataset['title']))
280289 return
281290 url = dataset['url']
282291 title_fmt = dataset['title'] + ' {} of {}'
283292 if url.endswith('/'):
284293 urls = [url + f for f in dataset['files']]
285 output_paths = [os.path.join(here, 'data', fname)
294 output_paths = [os.path.join(here, DATA_DIR, fname)
286295 for fname in dataset['files']]
287296
288297 unpacked = ['.'.join(output_path.split('.')[:(-2 if output_path.endswith('gz') else -1)]) + '*'
296305 zipped = zip(urls, output_paths, unpacked)
297306 for idx, (url, output_path, unpack) in enumerate(zipped):
298307 running_title = title_fmt.format(idx + 1, len(urls))
299 if glob.glob(unpack) or os.path.exists(unpack.replace('*','')):
308 if force is False and (glob.glob(unpack) or os.path.exists(unpack.replace('*',''))):
300309 # Skip a file if a similar one is downloaded:
301310 # i.e. one that has same name but dif't extension
302311 print('Skipping {0}'.format(running_title))
303312 continue
313 test = os.path.join(output_dir, DATA_STUBS_DIR, unpack)
314 if use_test_data and os.path.exists(test):
315 target = os.path.join(output_dir, unpack)
316 print("Copying test data file '{0}' to '{1}'".format(test, target))
317 shutil.copyfile(test, target)
318 continue
319 elif use_test_data and not os.path.exists(test):
320 print("No test file found for: {}. Using regular file instead".format(test))
304321 _url_to_binary_write(url, output_path, running_title)
305322 _extract_downloaded_archive(output_path)
306323
309326 print('this download script requires the requests module: conda install requests')
310327 sys.exit(1)
311328
312 def fetch_data(name,path,datasets="datasets.yml",require_datasets=True):
329 def fetch_data(name,path,datasets="datasets.yml",require_datasets=True,use_test_data=False,force=False):
313330 '''Fetch sample datasets as defined by path/datasets if it exists or else module's own examples/datasets otherwise.
314331
315332 Datasets are placed in path/data
322339 if not os.path.exists(info_file) and require_datasets is False:
323340 print("No datasets to download")
324341 return
325
326 print("Fetching data defined in %s and placing in %s"%(info_file,os.path.join(path,"data"))) # data is added later...
342
343 print("Fetching data defined in %s and placing in %s"%(info_file,os.path.join(path,DATA_DIR))) # data is added later...
327344
328345 with open(info_file) as f:
329346 info = ordered_load(f.read())
330347 for topic, downloads in info.items():
331348 output_dir = os.path.join(path, topic)
332349 for d in downloads:
333 _process_dataset(d, output_dir, path)
350 _process_dataset(d, output_dir, path, use_test_data=use_test_data, force=force)
351
def clean_data(name, path):
    '''Remove any data files that are copies of test files.

    A file in path/data is removed only when a file of the same name
    exists in path/data/.data_stubs and both have identical contents.
    '''
    # NOTE: this docstring is also used as the help text of the 'clean-data'
    # command (via inspect.getdoc), so it is deliberately kept short.

    # Imported locally so this function does not depend on a module-level
    # import being added elsewhere in the file.
    import filecmp

    path = os.path.abspath(path)
    if not os.path.exists(path):
        # The supplied path does not exist: fall back to whatever examples
        # directory _find_examples locates for `name`.
        path = _find_examples(name)

    data_dir = os.path.join(path, DATA_DIR)
    test_dir = os.path.join(data_dir, DATA_STUBS_DIR)

    # List the stubs directory once; a missing directory (or a plain file
    # sitting where the directory should be) is treated as "no test files".
    stubs = os.listdir(test_dir) if os.path.isdir(test_dir) else []
    if not stubs:
        print("No test files found")
        return

    for f in stubs:
        data_file = os.path.join(data_dir, f)
        if not os.path.isfile(data_file):
            print("Test file was not copied to data:", f)
            continue

        test_file = os.path.join(test_dir, f)
        if not os.path.isfile(test_file):
            # Not a regular file (e.g. a sub-directory): nothing to compare.
            continue

        data_s = os.path.getsize(data_file)
        test_s = os.path.getsize(test_file)
        if data_s != test_s:
            print("Size of test file {:.2e} did not match "
                  "size of data file {:.2e}".format(test_s, data_s))
        elif filecmp.cmp(test_file, data_file, shallow=False):
            # Same size AND same bytes: this really is a copied stub.
            print("Removing copied test file:", f)
            os.remove(data_file)
        else:
            # Equal size alone is not proof of a copy; never delete a file
            # whose contents differ from the stub.
            print("Contents of test file and data file differ; keeping:", f)
334381
335382
336383 # TODO: cmds=None defaults to 'all', basically, which is a bit confusing
343390
344391 if cmds is None:
345392 # again a reg (duplicated in substitute_main)
346 cmds = ['examples','copy-examples','fetch-data']
393 cmds = ['examples','copy-examples','fetch-data','clean-data']
347394
348395 # use dict/reg instead
349396 if 'copy-examples' in cmds:
355402
356403 if 'fetch-data' in cmds:
357404 d_parser = parser.add_parser('fetch-data', help=inspect.getdoc(fetch_data))
358 d_parser.set_defaults(func=lambda args: fetch_data(name,args.path,args.datasets))
405 d_parser.set_defaults(func=lambda args: fetch_data(name,args.path,args.datasets,use_test_data=args.use_test_data,force=args.force))
359406 d_parser.add_argument('--path',type=str,help='where to put data',default='%s-examples'%name)
360407 d_parser.add_argument('--datasets',type=str,help='*name* of datasets file; must exist either in path specified by --path or in package/examples/',default='datasets.yml')
361408 d_parser.add_argument('-v', '--verbose', action='count', default=0)
409 d_parser.add_argument('--force',action='store_true', help='Force any existing data files to be replaced')
410 d_parser.add_argument('--use-test-data',action='store_true',
411 help=("Use data's test files, if any, instead of fetching full data. "
412 "If test file not in '.data_stubs', fall back to fetching full data."))
362413
363414 if 'examples' in cmds:
364415 egd_parser = parser.add_parser('examples', help=inspect.getdoc(examples))
365 egd_parser.set_defaults(func=lambda args: examples(name, args.path, args.verbose, args.force))
416 egd_parser.set_defaults(func=lambda args: examples(name, args.path, args.verbose, args.use_test_data, args.force))
366417 egd_parser.add_argument('--path',type=str,help='location to place examples and data',default='%s-examples'%name)
367418 egd_parser.add_argument('-v', '--verbose', action='count', default=0)
368 egd_parser.add_argument('--force', action='store_true', help='if PATH already exists, force overwrite existing examples if older than source examples')
369
419 egd_parser.add_argument('--force', action='store_true',
420 help=('if PATH already exists, force overwrite existing examples if older '
421 'than source examples. ALSO force any existing data files to be replaced'))
422 egd_parser.add_argument('--use-test-data',action='store_true',
423 help=("Use data's test files, if any, instead of fetching full data. "
424 "If test file not in '.data_stubs', fall back to fetching full data."))
425
426 if 'clean-data' in cmds:
427 cd_parser = parser.add_parser('clean-data', help=inspect.getdoc(clean_data))
428 cd_parser.set_defaults(func=lambda args: clean_data(name,args.path))
429 cd_parser.add_argument('--path',type=str,help='where to clean data',default='%s-examples'%name)
370430
371431 def substitute_main(name,cmds=None,args=None):
372432 # can use if your module has no other commands
373433
374434 if cmds is None:
375435 # again a reg
376 cmds = ['examples','copy-examples','fetch-data']
377
436 cmds = ['examples','copy-examples','fetch-data', 'clean-data']
437
378438 mod = importlib.import_module(name)
379439 parser = argparse.ArgumentParser(description="%s commands"%name)
380440 parser.add_argument('--version', action='version', version='%(prog)s '+mod.__version__)
381441 subparsers = parser.add_subparsers(title='available commands')
382442 add_commands(subparsers,name,cmds,args)
383443 args = parser.parse_args()
384 args.func(args) if hasattr(args,'func') else parser.error("must supply command to run")
385
444 args.func(args) if hasattr(args,'func') else parser.error("must supply command to run")
445
0 from pyct.cmd import fetch_data, clean_data, copy_examples, examples
1 import pytest
2
3 # Same as in pyct/examples/datasets.yml
4 DATASETS_CONTENT = """
5 data:
6 - url: this_should_never_be_used
7 title: 'Test Data'
8 files:
9 - test_data.csv
10 """
11
12 # Same as in pyct/examples/data/.data_stubs/test_data.csv
13 TEST_FILE_CONTENT = """
14 name,score,rank
15 Alice,100.5,1
16 Bob,50.3,2
17 Charlie,25,3
18 """
19
20 REAL_FILE_CONTENT = """
21 name,score,rank
22 Alice,100.5,1
23 Bob,50.3,2
24 Charlie,25,3
25 Dave,28,4
26 Eve,25,3
27 Frank,75,9
28 """
29
30 FAKE_EXAMPLE_CONTENT = """
31 import numpy as np
32
33 a = np.arange(10)
34 """
35
36
@pytest.fixture(scope='function')
def tmp_project(tmp_path):
    """Provide a freshly created, empty 'test_project' directory under tmp_path."""
    root = tmp_path / "test_project"
    root.mkdir()
    return root
42
@pytest.fixture(scope='function')
def tmp_project_with_examples(tmp_path):
    """Provide tmp_path pre-populated with an 'examples' directory.

    The directory holds a datasets.yml, an empty 'data' sub-directory and a
    placeholder notebook file whose content is FAKE_EXAMPLE_CONTENT.
    """
    examples_dir = tmp_path / "examples"
    examples_dir.mkdir()
    (examples_dir / "datasets.yml").write_text(DATASETS_CONTENT)
    (examples_dir / "data").mkdir()
    (examples_dir / "Test_Example_Notebook.ipynb").write_text(FAKE_EXAMPLE_CONTENT)
    return tmp_path
54
@pytest.fixture(scope='function')
def tmp_project_with_stubs(tmp_project_with_examples):
    """Extend the examples project with an empty 'data/.data_stubs' directory."""
    root = tmp_project_with_examples
    (root / "examples" / "data" / ".data_stubs").mkdir()
    return root
61
@pytest.fixture(scope='function')
def tmp_project_with_test_file(tmp_project_with_stubs):
    """Extend the stubs project with a 'test_data.csv' stub file."""
    root = tmp_project_with_stubs
    stub_file = root / "examples" / "data" / ".data_stubs" / "test_data.csv"
    stub_file.write_text(TEST_FILE_CONTENT)
    return root
68
69
def test_examples_with_use_test_data(tmp_project):
    """`examples` with use_test_data=True produces the data file and the notebook."""
    target = tmp_project / "examples"
    examples(name="pyct", path=str(target), use_test_data=True)
    assert (target / "data" / "test_data.csv").is_file()
    assert (target / "Test_Example_Notebook.ipynb").is_file()
76
def test_examples_with_prexisting_content_in_target_raises_error(tmp_project_with_examples):
    """Without force, `examples` raises ValueError and pre-existing files are unchanged."""
    target = tmp_project_with_examples / "examples"
    notebook = target / "Test_Example_Notebook.ipynb"
    data_file = target / "data" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    with pytest.raises(ValueError):
        examples(name="pyct", path=str(target), use_test_data=True)

    # Evaluated after the failed call: both files still hold their old content.
    assert notebook.is_file()
    assert notebook.read_text() == FAKE_EXAMPLE_CONTENT
    assert data_file.is_file()
    assert data_file.read_text() == REAL_FILE_CONTENT
88
def test_examples_using_test_data_and_force_with_prexisting_content_in_target(tmp_project_with_examples):
    """With force=True, `examples` replaces both the existing notebook and data file."""
    target = tmp_project_with_examples / "examples"
    notebook = target / "Test_Example_Notebook.ipynb"
    data_file = target / "data" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    examples(name="pyct", path=str(target), use_test_data=True, force=True)

    assert notebook.is_file()
    assert notebook.read_text() != FAKE_EXAMPLE_CONTENT
    assert data_file.is_file()
    assert data_file.read_text() != REAL_FILE_CONTENT
99
def test_copy_examples(tmp_project):
    """`copy_examples` into a fresh target creates the example notebook."""
    target = tmp_project / "examples"
    copy_examples(name="pyct", path=str(target))
    assert (target / "Test_Example_Notebook.ipynb").is_file()
105
def test_copy_examples_with_prexisting_content_in_target_raises_error(tmp_project_with_examples):
    """Without force, `copy_examples` raises ValueError and keeps the existing notebook."""
    target = tmp_project_with_examples / "examples"
    notebook = target / "Test_Example_Notebook.ipynb"

    with pytest.raises(ValueError):
        copy_examples(name="pyct", path=str(target))

    # Evaluated after the failed call: the placeholder notebook is untouched.
    assert notebook.is_file()
    assert notebook.read_text() == FAKE_EXAMPLE_CONTENT
113
def test_copy_examples_using_force_with_prexisting_content_in_target(tmp_project_with_examples):
    """With force=True, `copy_examples` replaces the existing notebook."""
    target = tmp_project_with_examples / "examples"
    notebook = target / "Test_Example_Notebook.ipynb"

    copy_examples(name="pyct", path=str(target), force=True)

    assert notebook.is_file()
    assert notebook.read_text() != FAKE_EXAMPLE_CONTENT
120
def test_fetch_data_using_test_data_with_no_file_in_data_copies_from_stubs(tmp_project_with_test_file):
    """With no data file present, `fetch_data` copies the stub into 'data'."""
    target = tmp_project_with_test_file / "examples"
    data_file = target / "data" / "test_data.csv"

    fetch_data(name='pyct', path=str(target), use_test_data=True)

    assert data_file.is_file()
    assert data_file.read_text() == TEST_FILE_CONTENT
128
def test_fetch_data_using_test_data_with_file_in_data_skips(tmp_project_with_test_file):
    """An existing data file is left alone by `fetch_data` when force is not set."""
    target = tmp_project_with_test_file / "examples"
    data_file = target / "data" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    fetch_data(name='pyct', path=str(target), use_test_data=True)

    assert data_file.is_file()
    assert data_file.read_text() == REAL_FILE_CONTENT
138
def test_fetch_data_using_test_data_and_force_with_file_in_data_over_writes(tmp_project_with_test_file):
    """With force=True, `fetch_data` overwrites an existing data file with the stub."""
    target = tmp_project_with_test_file / "examples"
    data_file = target / "data" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    fetch_data(name='pyct', path=str(target), use_test_data=True, force=True)

    assert data_file.is_file()
    assert data_file.read_text() == TEST_FILE_CONTENT
148
def test_clean_data_when_data_file_is_real_does_nothing(tmp_project_with_test_file):
    """`clean_data` keeps a data file whose content differs from the stub."""
    target = tmp_project_with_test_file / "examples"
    data_file = target / "data" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    clean_data(name='pyct', path=str(target))

    assert data_file.is_file()
    assert data_file.read_text() == REAL_FILE_CONTENT
158
def test_clean_data_when_data_file_is_from_stubs_removes_file_from_data(tmp_project_with_test_file):
    """`clean_data` removes a data file matching the stub and keeps the stub itself."""
    target = tmp_project_with_test_file / "examples"
    data_file = target / "data" / "test_data.csv"
    stub_file = target / "data" / ".data_stubs" / "test_data.csv"
    data_file.write_text(TEST_FILE_CONTENT)

    clean_data(name='pyct', path=str(target))

    assert not data_file.is_file()
    assert stub_file.is_file()
    assert stub_file.read_text() == TEST_FILE_CONTENT
169
def test_clean_data_when_file_not_in_data_does_nothing(tmp_project_with_test_file):
    """With no copied data file, `clean_data` creates nothing and keeps the stub."""
    target = tmp_project_with_test_file / "examples"
    data_file = target / "data" / "test_data.csv"
    stub_file = target / "data" / ".data_stubs" / "test_data.csv"

    clean_data(name='pyct', path=str(target))

    assert not data_file.is_file()
    assert stub_file.is_file()
    assert stub_file.read_text() == TEST_FILE_CONTENT
178
def test_clean_data_when_stubs_is_empty_does_nothing(tmp_project_with_stubs):
    """With an empty stubs directory, `clean_data` leaves the data file in place."""
    target = tmp_project_with_stubs / "examples"
    data_file = target / "data" / "test_data.csv"
    stub_file = target / "data" / ".data_stubs" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    clean_data(name='pyct', path=str(target))

    assert data_file.is_file()
    assert not stub_file.is_file()
188
def test_clean_data_when_no_stubs_dir_does_nothing(tmp_project_with_examples):
    """With no stubs directory at all, `clean_data` leaves the data file in place."""
    target = tmp_project_with_examples / "examples"
    data_file = target / "data" / "test_data.csv"
    data_file.write_text(REAL_FILE_CONTENT)

    clean_data(name='pyct', path=str(target))

    assert data_file.is_file()
3737
3838 tests =
3939 flake8
40 pytest
4041 doc =
4142 nbsite
4243 sphinx_ioam_theme
66 deps = .[tests]
77
88 [_cmd_examples]
9 # TODO: not much of a test yet...
10 commands = python -c "from pyct.cmd import examples, fetch_data, copy_examples"
9 commands = pytest --verbose
1110 deps = .[tests,cmd]
1211
1312 [_build_examples]
1817 [_all]
1918 commands = {[_flakes]commands}
2019 {[_cmd_examples]commands}
21 {[_build_examples]commands}
20 {[_build_examples]commands}
2221 deps = .[examples, tests]
2322
2423 [testenv]