Import upstream version 0.9
Debian Janitor
1 year, 5 months ago
29 | 29 | pip3 install . |
30 | 30 | coverage run --source=rdata/ --omit=rdata/tests/ setup.py test; |
31 | 31 | |
32 | - name: Generate coverage XML | |
33 | run: | | |
34 | coverage xml | |
35 | ||
32 | 36 | - name: Upload coverage to Codecov |
33 | uses: codecov/codecov-action@v1 | |
37 | uses: codecov/codecov-action@v2 |
0 | # This workflow will upload a Python Package using Twine when a release is created | |
1 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries | |
2 | ||
3 | # This workflow uses actions that are not certified by GitHub. | |
4 | # They are provided by a third-party and are governed by | |
5 | # separate terms of service, privacy policy, and support | |
6 | # documentation. | |
7 | ||
8 | name: Upload Python Package | |
9 | ||
10 | on: | |
11 | release: | |
12 | types: [published] | |
13 | ||
14 | permissions: | |
15 | contents: read | |
16 | ||
17 | jobs: | |
18 | deploy: | |
19 | ||
20 | runs-on: ubuntu-latest | |
21 | ||
22 | steps: | |
23 | - uses: actions/checkout@v3 | |
24 | - name: Set up Python | |
25 | uses: actions/setup-python@v3 | |
26 | with: | |
27 | python-version: '3.x' | |
28 | - name: Install dependencies | |
29 | run: | | |
30 | python -m pip install --upgrade pip | |
31 | pip install build | |
32 | - name: Build package | |
33 | run: python -m build | |
34 | - name: Publish package | |
35 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 | |
36 | with: | |
37 | user: __token__ | |
38 | password: ${{ secrets.PYPI_API_TOKEN }} |
0 | cff-version: 1.2.0 | |
1 | message: "If you use this software, please cite it as below." | |
2 | authors: | |
3 | - family-names: "Ramos-Carreño" | |
4 | given-names: "Carlos" | |
5 | orcid: "https://orcid.org/0000-0003-2566-7058" | |
6 | affiliation: "Universidad Autónoma de Madrid" | |
7 | email: vnmabus@gmail.com | |
8 | title: "rdata: Read R datasets from Python" | |
9 | date-released: 2022-03-24 | |
10 | doi: 10.5281/zenodo.6382237 | |
11 | url: "https://github.com/vnmabus/rdata" | |
12 | license: MIT | |
13 | keywords: | |
14 | - rdata | |
15 | - Python | |
16 | - R | |
17 | - parser | |
18 | - conversion | |
19 | identifiers: | |
20 | - description: "This is the collection of archived snapshots of all versions of rdata" | |
21 | type: doi | |
22 | value: 10.5281/zenodo.6382237 | |
23 | - description: "This is the archived snapshot of version 0.7 of rdata" | |
24 | type: doi | |
25 | value: 10.5281/zenodo.6382238⏎ |
0 | 0 | include MANIFEST.in |
1 | include VERSION | |
1 | include rdata/VERSION | |
2 | 2 | include LICENSE |
3 | 3 | include rdata/py.typed |
4 | 4 | include *.txt⏎ |
0 | 0 | rdata |
1 | 1 | ===== |
2 | 2 | |
3 | |build-status| |docs| |coverage| |landscape| |pypi| | |
3 | |build-status| |docs| |coverage| |landscape| |pypi| |zenodo| | |
4 | 4 | |
5 | 5 | Read R datasets from Python. |
6 | 6 | |
102 | 102 | >>> converted = rdata.conversion.convert(parsed, new_dict) |
103 | 103 | >>> converted |
104 | 104 | {'test_dataframe': class value |
105 | 0 b'a' 1 | |
106 | 1 b'b' 2 | |
107 | 2 b'b' 3} | |
105 | 1 b'a' 1 | |
106 | 2 b'b' 2 | |
107 | 3 b'b' 3} | |
108 | 108 | |
109 | 109 | |
110 | 110 | .. |build-status| image:: https://github.com/vnmabus/rdata/actions/workflows/main.yml/badge.svg?branch=master |
129 | 129 | .. |pypi| image:: https://badge.fury.io/py/rdata.svg |
130 | 130 | :alt: Pypi version |
131 | 131 | :scale: 100% |
132 | :target: https://pypi.python.org/pypi/rdata/⏎ | |
132 | :target: https://pypi.python.org/pypi/rdata/ | |
133 | ||
134 | .. |zenodo| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.6382237.svg | |
135 | :alt: Zenodo DOI | |
136 | :scale: 100% | |
137 | :target: https://doi.org/10.5281/zenodo.6382237⏎ |
21 | 21 | # sys.path.insert(0, '/home/carlos/git/rdata/rdata') |
22 | 22 | |
23 | 23 | import sys |
24 | ||
24 | 25 | import pkg_resources |
26 | ||
25 | 27 | try: |
26 | 28 | release = pkg_resources.get_distribution('rdata').version |
27 | 29 | except pkg_resources.DistributionNotFound: |
207 | 209 | |
208 | 210 | intersphinx_mapping = {'python': ('https://docs.python.org/3', None), |
209 | 211 | 'pandas': ('http://pandas.pydata.org/pandas-docs/dev', None)} |
212 | ||
213 | autodoc_preserve_defaults = True | |
214 | autodoc_typehints = "description" |
69 | 69 | >>> converted = rdata.conversion.convert(parsed, new_dict) |
70 | 70 | >>> converted |
71 | 71 | {'test_dataframe': class value |
72 | 0 b'a' 1 | |
73 | 1 b'b' 2 | |
74 | 2 b'b' 3} | |
72 | 1 b'a' 1 | |
73 | 2 b'b' 2 | |
74 | 3 b'b' 3} |
0 | 0.9⏎ |
0 | """rdata: Read R datasets from Python.""" | |
1 | import errno as _errno | |
0 | 2 | import os as _os |
1 | 3 | import pathlib as _pathlib |
2 | 4 | |
12 | 14 | Path of the test data. |
13 | 15 | |
14 | 16 | """ |
17 | ||
18 | try: | |
19 | with open( | |
20 | _pathlib.Path(_os.path.dirname(__file__)) / 'VERSION', | |
21 | 'r', | |
22 | ) as version_file: | |
23 | __version__ = version_file.read().strip() | |
24 | except IOError as e: | |
25 | if e.errno != _errno.ENOENT: | |
26 | raise | |
27 | ||
28 | __version__ = "0.0" |
0 | from ._conversion import (RExpression, RLanguage, | |
1 | convert_list, convert_attrs, convert_vector, | |
2 | convert_char, convert_symbol, convert_array, | |
3 | Converter, SimpleConverter, | |
4 | dataframe_constructor, | |
5 | factor_constructor, | |
6 | ts_constructor, | |
7 | DEFAULT_CLASS_MAP, convert) | |
0 | from ._conversion import ( | |
1 | DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP, | |
2 | Converter as Converter, | |
3 | RBuiltin as RBuiltin, | |
4 | RBytecode as RBytecode, | |
5 | RExpression as RExpression, | |
6 | RFunction as RFunction, | |
7 | RLanguage as RLanguage, | |
8 | SimpleConverter as SimpleConverter, | |
9 | convert as convert, | |
10 | convert_array as convert_array, | |
11 | convert_attrs as convert_attrs, | |
12 | convert_char as convert_char, | |
13 | convert_list as convert_list, | |
14 | convert_symbol as convert_symbol, | |
15 | convert_vector as convert_vector, | |
16 | dataframe_constructor as dataframe_constructor, | |
17 | factor_constructor as factor_constructor, | |
18 | ts_constructor as ts_constructor, | |
19 | ) |
0 | from __future__ import annotations | |
1 | ||
0 | 2 | import abc |
1 | 3 | import warnings |
4 | from dataclasses import dataclass | |
2 | 5 | from fractions import Fraction |
3 | 6 | from types import MappingProxyType, SimpleNamespace |
4 | 7 | from typing import ( |
5 | 8 | Any, |
6 | 9 | Callable, |
7 | 10 | ChainMap, |
8 | Hashable, | |
9 | 11 | List, |
10 | 12 | Mapping, |
11 | 13 | MutableMapping, |
12 | 14 | NamedTuple, |
13 | 15 | Optional, |
16 | Sequence, | |
14 | 17 | Union, |
15 | 18 | cast, |
16 | 19 | ) |
22 | 25 | from .. import parser |
23 | 26 | from ..parser import RObject |
24 | 27 | |
28 | ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any] | |
29 | StrMap = Mapping[Union[str, bytes], Any] | |
30 | ||
25 | 31 | |
26 | 32 | class RLanguage(NamedTuple): |
27 | """ | |
28 | R language construct. | |
29 | """ | |
33 | """R language construct.""" | |
34 | ||
30 | 35 | elements: List[Any] |
36 | attributes: Mapping[str, Any] | |
31 | 37 | |
32 | 38 | |
33 | 39 | class RExpression(NamedTuple): |
34 | """ | |
35 | R expression. | |
36 | """ | |
40 | """R expression.""" | |
41 | ||
37 | 42 | elements: List[RLanguage] |
43 | ||
44 | ||
45 | @dataclass | |
46 | class RBuiltin: | |
47 | """R builtin.""" | |
48 | ||
49 | name: str | |
50 | ||
51 | ||
52 | @dataclass | |
53 | class RFunction: | |
54 | """R function.""" | |
55 | ||
56 | environment: Mapping[str, Any] | |
57 | formals: Optional[Mapping[str, Any]] | |
58 | body: RLanguage | |
59 | attributes: StrMap | |
60 | ||
61 | @property | |
62 | def source(self) -> str: | |
63 | return "\n".join(self.attributes["srcref"].srcfile.lines) | |
64 | ||
65 | ||
66 | @dataclass | |
67 | class RExternalPointer: | |
68 | """R external pointer.""" |
69 | ||
70 | protected: Any | |
71 | tag: Any | |
72 | ||
73 | ||
74 | @dataclass | |
75 | class RBytecode: | |
76 | """R bytecode.""" | |
77 | ||
78 | code: xarray.DataArray | |
79 | constants: Sequence[Any] | |
80 | attributes: StrMap | |
81 | ||
82 | ||
83 | class REnvironment(ChainMap[Union[str, bytes], Any]): | |
84 | """R environment.""" | |
85 | ||
86 | def __init__( | |
87 | self, | |
88 | *maps: MutableMapping[str | bytes, Any], | |
89 | frame: StrMap | None = None, | |
90 | ) -> None: | |
91 | super().__init__(*maps) | |
92 | self.frame = frame | |
38 | 93 | |
39 | 94 | |
40 | 95 | def convert_list( |
41 | 96 | r_list: parser.RObject, |
42 | conversion_function: Callable[ | |
43 | [Union[parser.RData, parser.RObject] | |
44 | ], Any]=lambda x: x | |
45 | ) -> Union[Mapping[Union[str, bytes], Any], List[Any]]: | |
97 | conversion_function: ConversionFunction, | |
98 | ) -> Union[StrMap, List[Any]]: | |
46 | 99 | """ |
47 | 100 | Expand a tagged R pairlist to a Python dictionary. |
48 | 101 | |
67 | 120 | """ |
68 | 121 | if r_list.info.type is parser.RObjectType.NILVALUE: |
69 | 122 | return {} |
70 | elif r_list.info.type not in [parser.RObjectType.LIST, | |
71 | parser.RObjectType.LANG]: | |
123 | elif r_list.info.type not in { | |
124 | parser.RObjectType.LIST, | |
125 | parser.RObjectType.LANG, | |
126 | }: | |
72 | 127 | raise TypeError("Must receive a LIST, LANG or NILVALUE object") |
73 | 128 | |
74 | 129 | if r_list.tag is None: |
83 | 138 | cdr = {} |
84 | 139 | |
85 | 140 | return {tag: conversion_function(r_list.value[0]), **cdr} |
86 | else: | |
87 | if cdr is None: | |
88 | cdr = [] | |
89 | ||
90 | return [conversion_function(r_list.value[0]), *cdr] | |
141 | ||
142 | if cdr is None: | |
143 | cdr = [] | |
144 | ||
145 | return [conversion_function(r_list.value[0]), *cdr] | |
91 | 146 | |
92 | 147 | |
93 | 148 | def convert_env( |
94 | 149 | r_env: parser.RObject, |
95 | conversion_function: Callable[ | |
96 | [Union[parser.RData, parser.RObject] | |
97 | ], Any]=lambda x: x | |
98 | ) -> ChainMap[Union[str, bytes], Any]: | |
99 | ||
150 | conversion_function: ConversionFunction, | |
151 | ) -> REnvironment: | |
152 | """Convert environment objects.""" | |
100 | 153 | if r_env.info.type is not parser.RObjectType.ENV: |
101 | 154 | raise TypeError("Must receive a ENV object") |
102 | 155 | |
105 | 158 | hash_table = conversion_function(r_env.value.hash_table) |
106 | 159 | |
107 | 160 | dictionary = {} |
108 | for d in hash_table: | |
109 | if d is not None: | |
110 | dictionary.update(d) | |
111 | ||
112 | return ChainMap(dictionary, enclosure) | |
161 | if hash_table is not None: | |
162 | for d in hash_table: | |
163 | if d is not None: | |
164 | dictionary.update(d) | |
165 | ||
166 | return REnvironment(dictionary, enclosure, frame=frame) | |
113 | 167 | |
114 | 168 | |
115 | 169 | def convert_attrs( |
116 | 170 | r_obj: parser.RObject, |
117 | conversion_function: Callable[ | |
118 | [Union[parser.RData, parser.RObject] | |
119 | ], Any]=lambda x: x | |
120 | ) -> Mapping[Union[str, bytes], Any]: | |
171 | conversion_function: ConversionFunction, | |
172 | ) -> StrMap: | |
121 | 173 | """ |
122 | 174 | Return the attributes of an object as a Python dictionary. |
123 | 175 | |
142 | 194 | """ |
143 | 195 | if r_obj.attributes: |
144 | 196 | attrs = cast( |
145 | Mapping[Union[str, bytes], Any], | |
197 | StrMap, | |
146 | 198 | conversion_function(r_obj.attributes), |
147 | 199 | ) |
148 | 200 | else: |
152 | 204 | |
153 | 205 | def convert_vector( |
154 | 206 | r_vec: parser.RObject, |
155 | conversion_function: Callable[ | |
156 | [Union[parser.RData, parser.RObject]], Any]=lambda x: x, | |
157 | attrs: Optional[Mapping[Union[str, bytes], Any]] = None, | |
158 | ) -> Union[List[Any], Mapping[Union[str, bytes], Any]]: | |
207 | conversion_function: ConversionFunction, | |
208 | attrs: Optional[StrMap] = None, | |
209 | ) -> Union[List[Any], StrMap]: | |
159 | 210 | """ |
160 | 211 | Convert a R vector to a Python list or dictionary. |
161 | 212 | |
185 | 236 | if attrs is None: |
186 | 237 | attrs = {} |
187 | 238 | |
188 | if r_vec.info.type not in [parser.RObjectType.VEC, | |
189 | parser.RObjectType.EXPR]: | |
239 | if r_vec.info.type not in { | |
240 | parser.RObjectType.VEC, | |
241 | parser.RObjectType.EXPR, | |
242 | }: | |
190 | 243 | raise TypeError("Must receive a VEC or EXPR object") |
191 | 244 | |
192 | value: Union[List[Any], Mapping[Union[str, bytes], Any]] = [ | |
245 | value: Union[List[Any], StrMap] = [ | |
193 | 246 | conversion_function(o) for o in r_vec.value |
194 | 247 | ] |
195 | 248 | |
202 | 255 | |
203 | 256 | |
204 | 257 | def safe_decode(byte_str: bytes, encoding: str) -> Union[str, bytes]: |
205 | """ | |
206 | Decode a (possibly malformed) string. | |
207 | """ | |
258 | """Decode a (possibly malformed) string.""" | |
208 | 259 | try: |
209 | 260 | return byte_str.decode(encoding) |
210 | 261 | except UnicodeDecodeError as e: |
249 | 300 | |
250 | 301 | assert isinstance(r_char.value, bytes) |
251 | 302 | |
303 | encoding = None | |
304 | ||
252 | 305 | if not force_default_encoding: |
253 | 306 | if r_char.info.gp & parser.CharFlags.UTF8: |
254 | return safe_decode(r_char.value, "utf_8") | |
307 | encoding = "utf_8" | |
255 | 308 | elif r_char.info.gp & parser.CharFlags.LATIN1: |
256 | return safe_decode(r_char.value, "latin_1") | |
309 | encoding = "latin_1" | |
257 | 310 | elif r_char.info.gp & parser.CharFlags.ASCII: |
258 | return safe_decode(r_char.value, "ascii") | |
311 | encoding = "ascii" | |
259 | 312 | elif r_char.info.gp & parser.CharFlags.BYTES: |
260 | return r_char.value | |
261 | ||
262 | if default_encoding: | |
263 | return safe_decode(r_char.value, default_encoding) | |
264 | else: | |
265 | # Assume ASCII if no encoding is marked | |
266 | warnings.warn(f"Unknown encoding. Assumed ASCII.") | |
267 | return safe_decode(r_char.value, "ascii") | |
268 | ||
269 | ||
270 | def convert_symbol(r_symbol: parser.RObject, | |
271 | conversion_function: Callable[ | |
272 | [Union[parser.RData, parser.RObject]], | |
273 | Any]=lambda x: x | |
274 | ) -> Union[str, bytes]: | |
313 | encoding = "bytes" | |
314 | ||
315 | if encoding is None: | |
316 | if default_encoding: | |
317 | encoding = default_encoding | |
318 | else: | |
319 | # Assume ASCII if no encoding is marked | |
320 | warnings.warn("Unknown encoding. Assumed ASCII.") | |
321 | encoding = "ascii" | |
322 | ||
323 | return ( | |
324 | r_char.value | |
325 | if encoding == "bytes" | |
326 | else safe_decode(r_char.value, encoding) | |
327 | ) | |
328 | ||
329 | ||
330 | def convert_symbol( | |
331 | r_symbol: parser.RObject, | |
332 | conversion_function: ConversionFunction, | |
333 | ) -> Union[str, bytes]: | |
275 | 334 | """ |
276 | 335 | Decode a R symbol to a Python string or bytes. |
277 | 336 | |
297 | 356 | symbol = conversion_function(r_symbol.value) |
298 | 357 | assert isinstance(symbol, (str, bytes)) |
299 | 358 | return symbol |
300 | else: | |
301 | raise TypeError("Must receive a SYM object") | |
359 | ||
360 | raise TypeError("Must receive a SYM object") | |
302 | 361 | |
303 | 362 | |
304 | 363 | def convert_array( |
305 | 364 | r_array: RObject, |
306 | conversion_function: Callable[ | |
307 | [Union[parser.RData, parser.RObject] | |
308 | ], Any]=lambda x: x, | |
309 | attrs: Optional[Mapping[Union[str, bytes], Any]] = None, | |
365 | conversion_function: ConversionFunction, | |
366 | attrs: Optional[StrMap] = None, | |
310 | 367 | ) -> Union[np.ndarray, xarray.DataArray]: |
311 | 368 | """ |
312 | 369 | Convert a R array to a Numpy ndarray or a Xarray DataArray. |
335 | 392 | if attrs is None: |
336 | 393 | attrs = {} |
337 | 394 | |
338 | if r_array.info.type not in {parser.RObjectType.LGL, | |
339 | parser.RObjectType.INT, | |
340 | parser.RObjectType.REAL, | |
341 | parser.RObjectType.CPLX}: | |
395 | if r_array.info.type not in { | |
396 | parser.RObjectType.LGL, | |
397 | parser.RObjectType.INT, | |
398 | parser.RObjectType.REAL, | |
399 | parser.RObjectType.CPLX, | |
400 | }: | |
342 | 401 | raise TypeError("Must receive an array object") |
343 | 402 | |
344 | 403 | value = r_array.value |
348 | 407 | # R matrix order is like FORTRAN |
349 | 408 | value = np.reshape(value, shape, order='F') |
350 | 409 | |
410 | dimension_names = None | |
411 | coords = None | |
412 | ||
351 | 413 | dimnames = attrs.get('dimnames') |
352 | 414 | if dimnames: |
353 | dimension_names = ["dim_" + str(i) for i, _ in enumerate(dimnames)] | |
354 | coords: Mapping[Hashable, Any] = { | |
355 | dimension_names[i]: d | |
356 | for i, d in enumerate(dimnames) if d is not None} | |
357 | ||
358 | value = xarray.DataArray(value, dims=dimension_names, coords=coords) | |
415 | if isinstance(dimnames, Mapping): | |
416 | dimension_names = list(dimnames.keys()) | |
417 | coords = dimnames | |
418 | else: | |
419 | dimension_names = [f"dim_{i}" for i, _ in enumerate(dimnames)] | |
420 | coords = { | |
421 | dimension_names[i]: d | |
422 | for i, d in enumerate(dimnames) | |
423 | if d is not None | |
424 | } | |
425 | ||
426 | value = xarray.DataArray( | |
427 | value, | |
428 | dims=dimension_names, | |
429 | coords=coords, | |
430 | ) | |
359 | 431 | |
360 | 432 | return value |
361 | 433 | |
362 | 434 | |
363 | 435 | def dataframe_constructor( |
364 | 436 | obj: Any, |
365 | attrs: Mapping[Union[str, bytes], Any], | |
437 | attrs: StrMap, | |
366 | 438 | ) -> pandas.DataFrame: |
367 | return pandas.DataFrame(obj, columns=obj) | |
439 | ||
440 | row_names = attrs["row.names"] | |
441 | ||
442 | # Default row names are stored as [INT_MIN, -len] | |
443 | INT_MIN = -2**31 # noqa: WPS432 | |
444 | index = ( | |
445 | pandas.RangeIndex(1, abs(row_names[1]) + 1) | |
446 | if len(row_names) == 2 and row_names[0] == INT_MIN | |
447 | else tuple(row_names) | |
448 | ) | |
449 | ||
450 | return pandas.DataFrame(obj, columns=obj, index=index) | |
368 | 451 | |
369 | 452 | |
370 | 453 | def _factor_constructor_internal( |
371 | 454 | obj: Any, |
372 | attrs: Mapping[Union[str, bytes], Any], | |
455 | attrs: StrMap, | |
373 | 456 | ordered: bool, |
374 | 457 | ) -> pandas.Categorical: |
375 | 458 | values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj] |
379 | 462 | |
380 | 463 | def factor_constructor( |
381 | 464 | obj: Any, |
382 | attrs: Mapping[Union[str, bytes], Any], | |
465 | attrs: StrMap, | |
383 | 466 | ) -> pandas.Categorical: |
467 | """Construct a factor object.""" |
384 | 468 | return _factor_constructor_internal(obj, attrs, ordered=False) |
385 | 469 | |
386 | 470 | |
387 | 471 | def ordered_constructor( |
388 | 472 | obj: Any, |
389 | attrs: Mapping[Union[str, bytes], Any], | |
473 | attrs: StrMap, | |
390 | 474 | ) -> pandas.Categorical: |
475 | """Construct an ordered factor.""" |
391 | 476 | return _factor_constructor_internal(obj, attrs, ordered=True) |
392 | 477 | |
393 | 478 | |
394 | 479 | def ts_constructor( |
395 | 480 | obj: Any, |
396 | attrs: Mapping[Union[str, bytes], Any], | |
481 | attrs: StrMap, | |
397 | 482 | ) -> pandas.Series: |
398 | ||
483 | """Construct a time series object.""" | |
399 | 484 | start, end, frequency = attrs['tsp'] |
400 | 485 | |
401 | 486 | frequency = int(frequency) |
403 | 488 | real_start = Fraction(int(round(start * frequency)), frequency) |
404 | 489 | real_end = Fraction(int(round(end * frequency)), frequency) |
405 | 490 | |
406 | index = np.arange(real_start, real_end + Fraction(1, frequency), | |
407 | Fraction(1, frequency)) | |
491 | index = np.arange( | |
492 | real_start, | |
493 | real_end + Fraction(1, frequency), | |
494 | Fraction(1, frequency), | |
495 | ) | |
408 | 496 | |
409 | 497 | if frequency == 1: |
410 | 498 | index = index.astype(int) |
412 | 500 | return pandas.Series(obj, index=index) |
413 | 501 | |
414 | 502 | |
503 | @dataclass | |
504 | class SrcRef: | |
505 | first_line: int | |
506 | first_byte: int | |
507 | last_line: int | |
508 | last_byte: int | |
509 | first_column: int | |
510 | last_column: int | |
511 | first_parsed: int | |
512 | last_parsed: int | |
513 | srcfile: SrcFile | |
514 | ||
515 | ||
516 | def srcref_constructor( | |
517 | obj: Any, | |
518 | attrs: StrMap, | |
519 | ) -> SrcRef: | |
520 | return SrcRef(*obj, srcfile=attrs["srcfile"]) | |
521 | ||
522 | ||
523 | @dataclass | |
524 | class SrcFile: | |
525 | filename: str | |
526 | file_encoding: str | None | |
527 | string_encoding: str | None | |
528 | ||
529 | ||
530 | def srcfile_constructor( | |
531 | obj: Any, | |
532 | attrs: StrMap, | |
533 | ) -> SrcFile: | |
534 | ||
535 | filename = obj.frame["filename"][0] | |
536 | file_encoding = obj.frame.get("encoding") | |
537 | string_encoding = obj.frame.get("Enc") | |
538 | ||
539 | return SrcFile( | |
540 | filename=filename, | |
541 | file_encoding=file_encoding, | |
542 | string_encoding=string_encoding, | |
543 | ) | |
544 | ||
545 | ||
546 | @dataclass | |
547 | class SrcFileCopy(SrcFile): | |
548 | lines: Sequence[str] | |
549 | ||
550 | ||
551 | def srcfilecopy_constructor( | |
552 | obj: Any, | |
553 | attrs: StrMap, | |
554 | ) -> SrcFile: | |
555 | ||
556 | filename = obj.frame["filename"][0] | |
557 | file_encoding = obj.frame.get("encoding", (None,))[0] | |
558 | string_encoding = obj.frame.get("Enc", (None,))[0] | |
559 | lines = obj.frame["lines"] | |
560 | ||
561 | return SrcFileCopy( | |
562 | filename=filename, | |
563 | file_encoding=file_encoding, | |
564 | string_encoding=string_encoding, | |
565 | lines=lines, | |
566 | ) | |
567 | ||
568 | ||
415 | 569 | Constructor = Callable[[Any, Mapping], Any] |
570 | ConstructorDict = Mapping[ | |
571 | Union[str, bytes], | |
572 | Constructor, | |
573 | ] | |
416 | 574 | |
417 | 575 | default_class_map_dict: Mapping[Union[str, bytes], Constructor] = { |
418 | 576 | "data.frame": dataframe_constructor, |
419 | 577 | "factor": factor_constructor, |
420 | 578 | "ordered": ordered_constructor, |
421 | 579 | "ts": ts_constructor, |
580 | "srcref": srcref_constructor, | |
581 | "srcfile": srcfile_constructor, | |
582 | "srcfilecopy": srcfilecopy_constructor, | |
422 | 583 | } |
423 | 584 | |
424 | 585 | DEFAULT_CLASS_MAP = MappingProxyType(default_class_map_dict) |
439 | 600 | |
440 | 601 | |
441 | 602 | class Converter(abc.ABC): |
442 | """ | |
443 | Interface of a class converting R objects in Python objects. | |
444 | """ | |
603 | """Interface of a class converting R objects into Python objects.""" |
445 | 604 | |
446 | 605 | @abc.abstractmethod |
447 | 606 | def convert(self, data: Union[parser.RData, parser.RObject]) -> Any: |
448 | """ | |
449 | Convert a R object to a Python one. | |
450 | """ | |
607 | """Convert a R object to a Python one.""" | |
451 | 608 | pass |
452 | 609 | |
453 | 610 | |
479 | 636 | |
480 | 637 | def __init__( |
481 | 638 | self, |
482 | constructor_dict: Mapping[ | |
483 | Union[str, bytes], | |
484 | Constructor, | |
485 | ] = DEFAULT_CLASS_MAP, | |
639 | constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, | |
486 | 640 | default_encoding: Optional[str] = None, |
487 | 641 | force_default_encoding: bool = False, |
488 | global_environment: Optional[Mapping[Union[str, bytes], Any]] = None, | |
642 | global_environment: MutableMapping[str | bytes, Any] | None = None, | |
489 | 643 | ) -> None: |
490 | 644 | |
491 | 645 | self.constructor_dict = constructor_dict |
492 | 646 | self.default_encoding = default_encoding |
493 | 647 | self.force_default_encoding = force_default_encoding |
494 | self.global_environment = ChainMap( | |
648 | self.global_environment = REnvironment( | |
495 | 649 | {} if global_environment is None |
496 | else global_environment | |
650 | else global_environment, | |
497 | 651 | ) |
498 | self.empty_environment: Mapping[Union[str, bytes], Any] = ChainMap({}) | |
652 | self.empty_environment: StrMap = REnvironment({}) | |
499 | 653 | |
500 | 654 | self._reset() |
501 | 655 | |
503 | 657 | self.references: MutableMapping[int, Any] = {} |
504 | 658 | self.default_encoding_used = self.default_encoding |
505 | 659 | |
506 | def convert(self, data: Union[parser.RData, parser.RObject]) -> Any: | |
660 | def convert( # noqa: D102 | |
661 | self, | |
662 | data: Union[parser.RData, parser.RObject], | |
663 | ) -> Any: | |
507 | 664 | self._reset() |
508 | 665 | return self._convert_next(data) |
509 | 666 | |
510 | 667 | def _convert_next(self, data: Union[parser.RData, parser.RObject]) -> Any: |
511 | """ | |
512 | Convert a R object to a Python one. | |
513 | """ | |
514 | ||
668 | """Convert a R object to a Python one.""" | |
515 | 669 | obj: RObject |
516 | 670 | if isinstance(data, parser.RData): |
517 | 671 | obj = data.object |
539 | 693 | # Expand the list and process the elements |
540 | 694 | value = convert_list(obj, self._convert_next) |
541 | 695 | |
696 | elif obj.info.type == parser.RObjectType.CLO: | |
697 | assert obj.tag is not None | |
698 | environment = self._convert_next(obj.tag) | |
699 | formals = self._convert_next(obj.value[0]) | |
700 | body = self._convert_next(obj.value[1]) | |
701 | attributes = self._convert_next(obj.attributes) | |
702 | ||
703 | value = RFunction( | |
704 | environment=environment, | |
705 | formals=formals, | |
706 | body=body, | |
707 | attributes=attributes, | |
708 | ) | |
709 | ||
542 | 710 | elif obj.info.type == parser.RObjectType.ENV: |
543 | 711 | |
544 | 712 | # Return a ChainMap of the environments |
550 | 718 | # special object |
551 | 719 | rlanguage_list = convert_list(obj, self._convert_next) |
552 | 720 | assert isinstance(rlanguage_list, list) |
553 | ||
554 | value = RLanguage(rlanguage_list) | |
721 | attributes = self._convert_next( | |
722 | obj.attributes, | |
723 | ) if obj.attributes else {} | |
724 | ||
725 | value = RLanguage(rlanguage_list, attributes) | |
726 | ||
727 | elif obj.info.type in {parser.RObjectType.SPECIAL, parser.RObjectType.BUILTIN}: | |
728 | ||
729 | value = RBuiltin(name=obj.value.decode("ascii")) | |
555 | 730 | |
556 | 731 | elif obj.info.type == parser.RObjectType.CHAR: |
557 | 732 | |
562 | 737 | force_default_encoding=self.force_default_encoding, |
563 | 738 | ) |
564 | 739 | |
565 | elif obj.info.type in {parser.RObjectType.LGL, | |
566 | parser.RObjectType.INT, | |
567 | parser.RObjectType.REAL, | |
568 | parser.RObjectType.CPLX}: | |
740 | elif obj.info.type in { | |
741 | parser.RObjectType.LGL, | |
742 | parser.RObjectType.INT, | |
743 | parser.RObjectType.REAL, | |
744 | parser.RObjectType.CPLX, | |
745 | }: | |
569 | 746 | |
570 | 747 | # Return the internal array |
571 | 748 | value = convert_array(obj, self._convert_next, attrs=attrs) |
582 | 759 | |
583 | 760 | elif obj.info.type == parser.RObjectType.EXPR: |
584 | 761 | rexpression_list = convert_vector( |
585 | obj, self._convert_next, attrs=attrs) | |
762 | obj, | |
763 | self._convert_next, | |
764 | attrs=attrs, | |
765 | ) | |
586 | 766 | assert isinstance(rexpression_list, list) |
587 | 767 | |
588 | 768 | # Convert the internal objects returning a special object |
589 | 769 | value = RExpression(rexpression_list) |
590 | 770 | |
771 | elif obj.info.type == parser.RObjectType.BCODE: | |
772 | ||
773 | value = RBytecode( | |
774 | code=self._convert_next(obj.value[0]), | |
775 | constants=[self._convert_next(c) for c in obj.value[1]], | |
776 | attributes=attrs, | |
777 | ) | |
778 | ||
779 | elif obj.info.type == parser.RObjectType.EXTPTR: | |
780 | ||
781 | value = RExternalPointer( | |
782 | protected=self._convert_next(obj.value[0]), | |
783 | tag=self._convert_next(obj.value[1]), | |
784 | ) | |
785 | ||
591 | 786 | elif obj.info.type == parser.RObjectType.S4: |
592 | 787 | value = SimpleNamespace(**attrs) |
593 | 788 | |
594 | 789 | elif obj.info.type == parser.RObjectType.EMPTYENV: |
595 | 790 | value = self.empty_environment |
596 | 791 | |
792 | elif obj.info.type == parser.RObjectType.MISSINGARG: | |
793 | value = NotImplemented | |
794 | ||
597 | 795 | elif obj.info.type == parser.RObjectType.GLOBALENV: |
598 | 796 | value = self.global_environment |
599 | 797 | |
601 | 799 | |
602 | 800 | # Return the referenced value |
603 | 801 | value = self.references.get(id(obj.referenced_object)) |
604 | # value = self.references[id(obj.referenced_object)] | |
605 | 802 | if value is None: |
606 | 803 | reference_id = id(obj.referenced_object) |
607 | 804 | assert obj.referenced_object is not None |
614 | 811 | else: |
615 | 812 | raise NotImplementedError(f"Type {obj.info.type} not implemented") |
616 | 813 | |
617 | if obj.info.object: | |
618 | classname = attrs["class"] | |
814 | if obj.info.object and attrs is not None: | |
815 | classname = attrs.get("class", ()) | |
619 | 816 | for i, c in enumerate(classname): |
620 | 817 | |
621 | 818 | constructor = self.constructor_dict.get(c, None) |
626 | 823 | new_value = NotImplemented |
627 | 824 | |
628 | 825 | if new_value is NotImplemented: |
629 | missing_msg = (f"Missing constructor for R class " | |
630 | f"\"{c}\". ") | |
826 | missing_msg = ( | |
827 | f"Missing constructor for R class \"{c}\". " | |
828 | ) | |
631 | 829 | |
632 | 830 | if len(classname) > (i + 1): |
633 | solution_msg = (f"The constructor for class " | |
634 | f"\"{classname[i+1]}\" will be " | |
635 | f"used instead." | |
636 | ) | |
831 | solution_msg = ( | |
832 | f"The constructor for class " | |
833 | f"\"{classname[i+1]}\" will be " | |
834 | f"used instead." | |
835 | ) | |
637 | 836 | else: |
638 | solution_msg = ("The underlying R object is " | |
639 | "returned instead.") | |
640 | ||
641 | warnings.warn(missing_msg + solution_msg, | |
642 | stacklevel=1) | |
837 | solution_msg = ( | |
838 | "The underlying R object is " | |
839 | "returned instead." | |
840 | ) | |
841 | ||
842 | warnings.warn( | |
843 | missing_msg + solution_msg, | |
844 | stacklevel=1, | |
845 | ) | |
643 | 846 | else: |
644 | 847 | value = new_value |
645 | 848 | break |
655 | 858 | **kwargs: Any, |
656 | 859 | ) -> Any: |
657 | 860 | """ |
658 | Uses the default converter (:func:`SimpleConverter`) to convert the data. | |
861 | Use the default converter (:func:`SimpleConverter`) to convert the data. | |
659 | 862 | |
660 | 863 | Examples: |
661 | ||
662 | 864 | Parse one of the included examples, containing a vector |
663 | 865 | |
664 | 866 | >>> import rdata |
678 | 880 | >>> converted = rdata.conversion.convert(parsed) |
679 | 881 | >>> converted |
680 | 882 | {'test_dataframe': class value |
681 | 0 a 1 | |
682 | 1 b 2 | |
683 | 2 b 3} | |
883 | 1 a 1 | |
884 | 2 b 2 | |
885 | 3 b 3} | |
684 | 886 | |
685 | 887 | """ |
686 | 888 | return SimpleConverter(*args, **kwargs).convert(data) |
0 | """Utilities for parsing a rdata file.""" | |
1 | ||
0 | 2 | from ._parser import ( |
1 | DEFAULT_ALTREP_MAP, | |
2 | CharFlags, | |
3 | RData, | |
4 | RObject, | |
5 | RObjectInfo, | |
6 | RObjectType, | |
7 | parse_data, | |
8 | parse_file, | |
3 | DEFAULT_ALTREP_MAP as DEFAULT_ALTREP_MAP, | |
4 | CharFlags as CharFlags, | |
5 | RData as RData, | |
6 | RObject as RObject, | |
7 | RObjectInfo as RObjectInfo, | |
8 | RObjectType as RObjectType, | |
9 | parse_data as parse_data, | |
10 | parse_file as parse_file, | |
9 | 11 | ) |
11 | 11 | from dataclasses import dataclass |
12 | 12 | from types import MappingProxyType |
13 | 13 | from typing import ( |
14 | TYPE_CHECKING, | |
14 | 15 | Any, |
15 | 16 | BinaryIO, |
16 | 17 | Callable, |
17 | 18 | List, |
18 | 19 | Mapping, |
19 | 20 | Optional, |
21 | Sequence, | |
20 | 22 | Set, |
21 | 23 | TextIO, |
22 | 24 | Tuple, |
27 | 29 | |
28 | 30 | |
29 | 31 | class FileTypes(enum.Enum): |
30 | """ | |
31 | Type of file containing a R file. | |
32 | """ | |
32 | """Type of file containing a R file.""" | |
33 | ||
33 | 34 | bzip2 = "bz2" |
34 | 35 | gzip = "gzip" |
35 | 36 | xz = "xz" |
42 | 43 | FileTypes.gzip: b"\x1f\x8b", |
43 | 44 | FileTypes.xz: b"\xFD7zXZ\x00", |
44 | 45 | FileTypes.rdata_binary_v2: b"RDX2\n", |
45 | FileTypes.rdata_binary_v3: b"RDX3\n" | |
46 | FileTypes.rdata_binary_v3: b"RDX3\n", | |
46 | 47 | } |
47 | 48 | |
48 | 49 | |
49 | 50 | def file_type(data: memoryview) -> Optional[FileTypes]: |
50 | """ | |
51 | Returns the type of the file. | |
52 | """ | |
53 | ||
51 | """Return the type of the file.""" | |
54 | 52 | for filetype, magic in magic_dict.items(): |
55 | 53 | if data[:len(magic)] == magic: |
56 | 54 | return filetype |
58 | 56 | |
59 | 57 | |
60 | 58 | class RdataFormats(enum.Enum): |
61 | """ | |
62 | Format of a R file. | |
63 | """ | |
59 | """Format of a R file.""" | |
60 | ||
64 | 61 | XDR = "XDR" |
65 | 62 | ASCII = "ASCII" |
66 | 63 | binary = "binary" |
74 | 71 | |
75 | 72 | |
76 | 73 | def rdata_format(data: memoryview) -> Optional[RdataFormats]: |
77 | """ | |
78 | Returns the format of the data. | |
79 | """ | |
80 | ||
74 | """Return the format of the data.""" | |
81 | 75 | for format_type, magic in format_dict.items(): |
82 | 76 | if data[:len(magic)] == magic: |
83 | 77 | return format_type |
85 | 79 | |
86 | 80 | |
87 | 81 | class RObjectType(enum.Enum): |
88 | """ | |
89 | Type of a R object. | |
90 | """ | |
82 | """Type of a R object.""" | |
83 | ||
91 | 84 | NIL = 0 # NULL |
92 | 85 | SYM = 1 # symbols |
93 | 86 | LIST = 2 # pairlists |
113 | 106 | RAW = 24 # raw vector |
114 | 107 | S4 = 25 # S4 classes not of simple type |
115 | 108 | ALTREP = 238 # Alternative representations |
109 | ATTRLIST = 239 # Bytecode attribute | |
110 | ATTRLANG = 240 # Bytecode attribute | |
116 | 111 | EMPTYENV = 242 # Empty environment |
112 | BCREPREF = 243 # Bytecode repetition reference | |
113 | BCREPDEF = 244 # Bytecode repetition definition | |
114 | MISSINGARG = 251 # Missing argument | |
117 | 115 | GLOBALENV = 253 # Global environment |
118 | 116 | NILVALUE = 254 # NIL value |
119 | 117 | REF = 255 # Reference |
120 | 118 | |
121 | 119 | |
120 | BYTECODE_SPECIAL_SET = { | |
121 | RObjectType.BCODE, | |
122 | RObjectType.BCREPREF, | |
123 | RObjectType.BCREPDEF, | |
124 | RObjectType.LANG, | |
125 | RObjectType.LIST, | |
126 | RObjectType.ATTRLANG, | |
127 | RObjectType.ATTRLIST, | |
128 | } | |
129 | ||
130 | ||
122 | 131 | class CharFlags(enum.IntFlag): |
132 | """Flags for R objects of type char.""" | |
133 | ||
123 | 134 | HAS_HASH = 1 |
124 | 135 | BYTES = 1 << 1 |
125 | 136 | LATIN1 = 1 << 2 |
130 | 141 | |
131 | 142 | @dataclass |
132 | 143 | class RVersions(): |
133 | """ | |
134 | R versions. | |
135 | """ | |
136 | format: int | |
144 | """R versions.""" | |
145 | ||
146 | format: int # noqa: E701 | |
137 | 147 | serialized: int |
138 | 148 | minimum: int |
139 | 149 | |
144 | 154 | Extra information. |
145 | 155 | |
146 | 156 | Contains the default encoding (only in version 3). |
157 | ||
147 | 158 | """ |
159 | ||
148 | 160 | encoding: Optional[str] = None |
149 | 161 | |
150 | 162 | |
151 | 163 | @dataclass |
152 | 164 | class RObjectInfo(): |
153 | """ | |
154 | Internal attributes of a R object. | |
155 | """ | |
165 | """Internal attributes of a R object.""" | |
166 | ||
156 | 167 | type: RObjectType |
157 | 168 | object: bool |
158 | 169 | attributes: bool |
161 | 172 | reference: int |
162 | 173 | |
163 | 174 | |
175 | def _str_internal( | |
176 | obj: RObject | Sequence[RObject], | |
177 | indent: int = 0, | |
178 | used_references: Optional[Set[int]] = None, | |
179 | ) -> str: | |
180 | ||
181 | if used_references is None: | |
182 | used_references = set() | |
183 | ||
184 | small_indent = indent + 2 | |
185 | big_indent = indent + 4 | |
186 | ||
187 | indent_spaces = ' ' * indent | |
188 | small_indent_spaces = ' ' * small_indent | |
189 | big_indent_spaces = ' ' * big_indent | |
190 | ||
191 | string = "" | |
192 | ||
193 | if isinstance(obj, Sequence): | |
194 | string += f"{indent_spaces}[\n" | |
195 | for elem in obj: | |
196 | string += _str_internal( | |
197 | elem, | |
198 | big_indent, | |
199 | used_references.copy(), | |
200 | ) | |
201 | string += f"{indent_spaces}]\n" | |
202 | ||
203 | return string | |
204 | ||
205 | string += f"{indent_spaces}{obj.info.type}\n" | |
206 | ||
207 | if obj.tag: | |
208 | tag_string = _str_internal( | |
209 | obj.tag, | |
210 | big_indent, | |
211 | used_references.copy(), | |
212 | ) | |
213 | string += f"{small_indent_spaces}tag:\n{tag_string}\n" | |
214 | ||
215 | if obj.info.reference: | |
216 | assert obj.referenced_object | |
217 | reference_string = ( | |
218 | f"{big_indent_spaces}..." | |
219 | if obj.info.reference in used_references | |
220 | else _str_internal( | |
221 | obj.referenced_object, | |
222 | indent + 4, used_references.copy()) | |
223 | ) | |
224 | string += ( | |
225 | f"{small_indent_spaces}reference: " | |
226 | f"{obj.info.reference}\n{reference_string}\n" | |
227 | ) | |
228 | ||
229 | string += f"{small_indent_spaces}value:\n" | |
230 | ||
231 | if isinstance(obj.value, RObject): | |
232 | string += _str_internal( | |
233 | obj.value, | |
234 | big_indent, | |
235 | used_references.copy(), | |
236 | ) | |
237 | elif isinstance(obj.value, (tuple, list)): | |
238 | for elem in obj.value: | |
239 | string += _str_internal( | |
240 | elem, | |
241 | big_indent, | |
242 | used_references.copy(), | |
243 | ) | |
244 | elif isinstance(obj.value, np.ndarray): | |
245 | string += big_indent_spaces | |
246 | if len(obj.value) > 4: | |
247 | string += ( | |
248 | f"[{obj.value[0]}, {obj.value[1]} ... " | |
249 | f"{obj.value[-2]}, {obj.value[-1]}]\n" | |
250 | ) | |
251 | else: | |
252 | string += f"{obj.value}\n" | |
253 | else: | |
254 | string += f"{big_indent_spaces}{obj.value}\n" | |
255 | ||
256 | if obj.attributes: | |
257 | attr_string = _str_internal( | |
258 | obj.attributes, | |
259 | big_indent, | |
260 | used_references.copy(), | |
261 | ) | |
262 | string += f"{small_indent_spaces}attributes:\n{attr_string}\n" | |
263 | ||
264 | return string | |
265 | ||
266 | ||
164 | 267 | @dataclass |
165 | 268 | class RObject(): |
166 | """ | |
167 | Representation of a R object. | |
168 | """ | |
269 | """Representation of a R object.""" | |
270 | ||
169 | 271 | info: RObjectInfo |
170 | 272 | value: Any |
171 | 273 | attributes: Optional[RObject] |
172 | 274 | tag: Optional[RObject] = None |
173 | 275 | referenced_object: Optional[RObject] = None |
174 | 276 | |
175 | def _str_internal( | |
176 | self, | |
177 | indent: int = 0, | |
178 | used_references: Optional[Set[int]] = None | |
179 | ) -> str: | |
180 | ||
181 | if used_references is None: | |
182 | used_references = set() | |
183 | ||
184 | string = "" | |
185 | ||
186 | string += f"{' ' * indent}{self.info.type}\n" | |
187 | ||
188 | if self.tag: | |
189 | tag_string = self.tag._str_internal(indent + 4, | |
190 | used_references.copy()) | |
191 | string += f"{' ' * (indent + 2)}tag:\n{tag_string}\n" | |
192 | ||
193 | if self.info.reference: | |
194 | assert self.referenced_object | |
195 | reference_string = (f"{' ' * (indent + 4)}..." | |
196 | if self.info.reference in used_references | |
197 | else self.referenced_object._str_internal( | |
198 | indent + 4, used_references.copy())) | |
199 | string += (f"{' ' * (indent + 2)}reference: " | |
200 | f"{self.info.reference}\n{reference_string}\n") | |
201 | ||
202 | string += f"{' ' * (indent + 2)}value:\n" | |
203 | ||
204 | if isinstance(self.value, RObject): | |
205 | string += self.value._str_internal(indent + 4, | |
206 | used_references.copy()) | |
207 | elif isinstance(self.value, tuple) or isinstance(self.value, list): | |
208 | for elem in self.value: | |
209 | string += elem._str_internal(indent + 4, | |
210 | used_references.copy()) | |
211 | elif isinstance(self.value, np.ndarray): | |
212 | string += " " * (indent + 4) | |
213 | if len(self.value) > 4: | |
214 | string += (f"[{self.value[0]}, {self.value[1]} ... " | |
215 | f"{self.value[-2]}, {self.value[-1]}]\n") | |
216 | else: | |
217 | string += f"{self.value}\n" | |
218 | else: | |
219 | string += f"{' ' * (indent + 4)}{self.value}\n" | |
220 | ||
221 | if(self.attributes): | |
222 | attr_string = self.attributes._str_internal( | |
223 | indent + 4, | |
224 | used_references.copy()) | |
225 | string += f"{' ' * (indent + 2)}attributes:\n{attr_string}\n" | |
226 | ||
227 | return string | |
228 | ||
229 | 277 | def __str__(self) -> str: |
230 | return self._str_internal() | |
278 | return _str_internal(self) | |
231 | 279 | |
232 | 280 | |
233 | 281 | @dataclass |
234 | 282 | class RData(): |
235 | """ | |
236 | Data contained in a R file. | |
237 | """ | |
283 | """Data contained in a R file.""" | |
284 | ||
238 | 285 | versions: RVersions |
239 | 286 | extra: RExtraInfo |
240 | 287 | object: RObject |
241 | 288 | |
289 | def __str__(self) -> str: | |
290 | return ( | |
291 | "RData(\n" | |
292 | f" versions: {self.versions}\n" | |
293 | f" extra: {self.extra}\n" | |
294 | f" object: \n{_str_internal(self.object, indent=4)}\n" | |
295 | ")\n" | |
296 | ) | |
297 | ||
242 | 298 | |
243 | 299 | @dataclass |
244 | 300 | class EnvironmentValue(): |
245 | """ | |
246 | Value of an environment. | |
247 | """ | |
301 | """Value of an environment.""" | |
302 | ||
248 | 303 | locked: bool |
249 | 304 | enclosure: RObject |
250 | 305 | frame: RObject |
259 | 314 | |
260 | 315 | |
261 | 316 | def format_float_with_scipen(number: float, scipen: int) -> bytes: |
317 | """Format a floating point value as in R.""" | |
262 | 318 | fixed = np.format_float_positional(number, trim="-") |
263 | 319 | scientific = np.format_float_scientific(number, trim="-") |
264 | 320 | |
265 | assert(isinstance(fixed, str)) | |
266 | assert(isinstance(scientific, str)) | |
321 | assert isinstance(fixed, str) | |
322 | assert isinstance(scientific, str) | |
267 | 323 | |
268 | 324 | return ( |
269 | 325 | scientific if len(fixed) - len(scientific) > scipen |
274 | 330 | def deferred_string_constructor( |
275 | 331 | state: RObject, |
276 | 332 | ) -> Tuple[RObjectInfo, Any]: |
277 | ||
333 | """Expand a deferred string ALTREP.""" | |
278 | 334 | new_info = RObjectInfo( |
279 | 335 | type=RObjectType.STR, |
280 | 336 | object=False, |
311 | 367 | def compact_seq_constructor( |
312 | 368 | state: RObject, |
313 | 369 | *, |
314 | is_int: bool = False | |
370 | is_int: bool = False, | |
315 | 371 | ) -> Tuple[RObjectInfo, Any]: |
316 | ||
372 | """Expand a compact_seq ALTREP.""" | |
317 | 373 | new_info = RObjectInfo( |
318 | 374 | type=RObjectType.INT if is_int else RObjectType.REAL, |
319 | 375 | object=False, |
340 | 396 | def compact_intseq_constructor( |
341 | 397 | state: RObject, |
342 | 398 | ) -> Tuple[RObjectInfo, Any]: |
399 | """Expand a compact_intseq ALTREP.""" | |
343 | 400 | return compact_seq_constructor(state, is_int=True) |
344 | 401 | |
345 | 402 | |
346 | 403 | def compact_realseq_constructor( |
347 | 404 | state: RObject, |
348 | 405 | ) -> Tuple[RObjectInfo, Any]: |
406 | """Expand a compact_realseq ALTREP.""" | |
349 | 407 | return compact_seq_constructor(state, is_int=False) |
350 | 408 | |
351 | 409 | |
352 | 410 | def wrap_constructor( |
353 | 411 | state: RObject, |
354 | 412 | ) -> Tuple[RObjectInfo, Any]: |
355 | ||
413 | """Expand any wrap_* ALTREP.""" | |
356 | 414 | new_info = RObjectInfo( |
357 | 415 | type=state.value[0].info.type, |
358 | 416 | object=False, |
383 | 441 | |
384 | 442 | |
385 | 443 | class Parser(abc.ABC): |
386 | """ | |
387 | Parser interface for a R file. | |
388 | """ | |
444 | """Parser interface for a R file.""" | |
389 | 445 | |
390 | 446 | def __init__( |
391 | 447 | self, |
397 | 453 | self.altrep_constructor_dict = altrep_constructor_dict |
398 | 454 | |
399 | 455 | def parse_bool(self) -> bool: |
400 | """ | |
401 | Parse a boolean. | |
402 | """ | |
456 | """Parse a boolean.""" | |
403 | 457 | return bool(self.parse_int()) |
404 | 458 | |
405 | 459 | @abc.abstractmethod |
406 | 460 | def parse_int(self) -> int: |
407 | """ | |
408 | Parse an integer. | |
409 | """ | |
461 | """Parse an integer.""" | |
410 | 462 | pass |
411 | 463 | |
412 | 464 | @abc.abstractmethod |
413 | 465 | def parse_double(self) -> float: |
414 | """ | |
415 | Parse a double. | |
416 | """ | |
466 | """Parse a double.""" | |
417 | 467 | pass |
418 | 468 | |
419 | 469 | def parse_complex(self) -> complex: |
420 | """ | |
421 | Parse a complex number. | |
422 | """ | |
470 | """Parse a complex number.""" | |
423 | 471 | return complex(self.parse_double(), self.parse_double()) |
424 | 472 | |
425 | 473 | @abc.abstractmethod |
426 | 474 | def parse_string(self, length: int) -> bytes: |
427 | """ | |
428 | Parse a string. | |
429 | """ | |
475 | """Parse a string.""" | |
430 | 476 | pass |
431 | 477 | |
432 | 478 | def parse_all(self) -> RData: |
433 | """ | |
434 | Parse all the file. | |
435 | """ | |
436 | ||
479 | """Parse all the file.""" | |
437 | 480 | versions = self.parse_versions() |
438 | 481 | extra_info = self.parse_extra_info(versions) |
439 | 482 | obj = self.parse_R_object() |
441 | 484 | return RData(versions, extra_info, obj) |
442 | 485 | |
443 | 486 | def parse_versions(self) -> RVersions: |
444 | """ | |
445 | Parse the versions header. | |
446 | """ | |
447 | ||
487 | """Parse the versions header.""" | |
448 | 488 | format_version = self.parse_int() |
449 | 489 | r_version = self.parse_int() |
450 | 490 | minimum_r_version = self.parse_int() |
451 | 491 | |
452 | if format_version not in [2, 3]: | |
492 | if format_version not in {2, 3}: | |
453 | 493 | raise NotImplementedError( |
454 | 494 | f"Format version {format_version} unsupported", |
455 | 495 | ) |
458 | 498 | |
459 | 499 | def parse_extra_info(self, versions: RVersions) -> RExtraInfo: |
460 | 500 | """ |
461 | Parse the versions header. | |
501 | Parse the extra info. | |
502 | ||
503 | Parses the encoding in version 3 format. | |
504 | ||
462 | 505 | """ |
463 | ||
464 | 506 | encoding = None |
465 | 507 | |
466 | 508 | if versions.format >= 3: |
467 | 509 | encoding_len = self.parse_int() |
468 | 510 | encoding = self.parse_string(encoding_len).decode("ASCII") |
469 | 511 | |
470 | extra_info = RExtraInfo(encoding) | |
471 | ||
472 | return extra_info | |
512 | return RExtraInfo(encoding) | |
473 | 513 | |
474 | 514 | def expand_altrep_to_object( |
475 | 515 | self, |
477 | 517 | state: RObject, |
478 | 518 | ) -> Tuple[RObjectInfo, Any]: |
479 | 519 | """Expand alternative representation to normal object.""" |
480 | ||
481 | 520 | assert info.info.type == RObjectType.LIST |
482 | 521 | |
483 | 522 | class_sym = info.value[0] |
493 | 532 | constructor = self.altrep_constructor_dict[altrep_name] |
494 | 533 | return constructor(state) |
495 | 534 | |
535 | def _parse_bytecode_constant( | |
536 | self, | |
537 | reference_list: Optional[List[RObject]], | |
538 | bytecode_rep_list: List[RObject | None] | None = None, | |
539 | ) -> RObject: | |
540 | ||
541 | obj_type = self.parse_int() | |
542 | ||
543 | return self.parse_R_object( | |
544 | reference_list, | |
545 | bytecode_rep_list, | |
546 | info_int=obj_type, | |
547 | ) | |
548 | ||
549 | def _parse_bytecode( | |
550 | self, | |
551 | reference_list: Optional[List[RObject]], | |
552 | bytecode_rep_list: List[RObject | None] | None = None, | |
553 | ) -> Tuple[RObject, Sequence[RObject]]: | |
554 | """Parse R bytecode.""" | |
555 | if bytecode_rep_list is None: | |
556 | n_repeated = self.parse_int() | |
557 | ||
558 | code = self.parse_R_object(reference_list, bytecode_rep_list) | |
559 | ||
560 | if bytecode_rep_list is None: | |
561 | bytecode_rep_list = [None] * n_repeated | |
562 | ||
563 | n_constants = self.parse_int() | |
564 | constants = [ | |
565 | self._parse_bytecode_constant( | |
566 | reference_list, | |
567 | bytecode_rep_list, | |
568 | ) | |
569 | for _ in range(n_constants) | |
570 | ] | |
571 | ||
572 | return (code, constants) | |
573 | ||
496 | 574 | def parse_R_object( |
497 | 575 | self, |
498 | reference_list: Optional[List[RObject]] = None | |
576 | reference_list: List[RObject] | None = None, | |
577 | bytecode_rep_list: List[RObject | None] | None = None, | |
578 | info_int: int | None = None, | |
499 | 579 | ) -> RObject: |
500 | """ | |
501 | Parse a R object. | |
502 | """ | |
503 | ||
580 | """Parse a R object.""" | |
504 | 581 | if reference_list is None: |
505 | 582 | # Index is 1-based, so we insert a dummy object |
506 | 583 | reference_list = [] |
507 | 584 | |
508 | info_int = self.parse_int() | |
509 | ||
510 | info = parse_r_object_info(info_int) | |
585 | original_info_int = info_int | |
586 | if ( | |
587 | info_int is not None | |
588 | and RObjectType(info_int) in BYTECODE_SPECIAL_SET | |
589 | ): | |
590 | info = parse_r_object_info(info_int) | |
591 | info.tag = info.type not in { | |
592 | RObjectType.BCREPREF, | |
593 | RObjectType.BCODE, | |
594 | } | |
595 | else: | |
596 | info_int = self.parse_int() | |
597 | info = parse_r_object_info(info_int) | |
511 | 598 | |
512 | 599 | tag = None |
513 | 600 | attributes = None |
514 | 601 | referenced_object = None |
515 | 602 | |
603 | bytecode_rep_position = -1 | |
516 | 604 | tag_read = False |
517 | 605 | attributes_read = False |
518 | 606 | add_reference = False |
521 | 609 | |
522 | 610 | value: Any |
523 | 611 | |
612 | if info.type == RObjectType.BCREPDEF: | |
613 | assert bytecode_rep_list | |
614 | bytecode_rep_position = self.parse_int() | |
615 | info.type = RObjectType(self.parse_int()) | |
616 | ||
524 | 617 | if info.type == RObjectType.NIL: |
525 | 618 | value = None |
526 | 619 | |
527 | 620 | elif info.type == RObjectType.SYM: |
528 | 621 | # Read Char |
529 | value = self.parse_R_object(reference_list) | |
622 | value = self.parse_R_object(reference_list, bytecode_rep_list) | |
530 | 623 | # Symbols can be referenced |
531 | 624 | add_reference = True |
532 | 625 | |
533 | elif info.type in [RObjectType.LIST, RObjectType.LANG]: | |
626 | elif info.type in { | |
627 | RObjectType.LIST, | |
628 | RObjectType.LANG, | |
629 | RObjectType.CLO, | |
630 | RObjectType.PROM, | |
631 | RObjectType.DOT, | |
632 | RObjectType.ATTRLANG, | |
633 | }: | |
634 | if info.type is RObjectType.ATTRLANG: | |
635 | info.type = RObjectType.LANG | |
636 | info.attributes = True | |
637 | ||
534 | 638 | tag = None |
535 | 639 | if info.attributes: |
536 | attributes = self.parse_R_object(reference_list) | |
640 | attributes = self.parse_R_object( | |
641 | reference_list, | |
642 | bytecode_rep_list, | |
643 | ) | |
537 | 644 | attributes_read = True |
538 | elif info.tag: | |
539 | tag = self.parse_R_object(reference_list) | |
645 | ||
646 | if info.tag: | |
647 | tag = self.parse_R_object(reference_list, bytecode_rep_list) | |
540 | 648 | tag_read = True |
541 | 649 | |
542 | 650 | # Read CAR and CDR |
543 | car = self.parse_R_object(reference_list) | |
544 | cdr = self.parse_R_object(reference_list) | |
651 | car = self.parse_R_object( | |
652 | reference_list, | |
653 | bytecode_rep_list, | |
654 | info_int=( | |
655 | None if original_info_int is None | |
656 | else self.parse_int() | |
657 | ), | |
658 | ) | |
659 | cdr = self.parse_R_object( | |
660 | reference_list, | |
661 | bytecode_rep_list, | |
662 | info_int=( | |
663 | None if original_info_int is None | |
664 | else self.parse_int() | |
665 | ), | |
666 | ) | |
545 | 667 | value = (car, cdr) |
546 | 668 | |
547 | 669 | elif info.type == RObjectType.ENV: |
670 | info.object = True | |
671 | ||
548 | 672 | result = RObject( |
549 | 673 | info=info, |
550 | 674 | tag=tag, |
556 | 680 | reference_list.append(result) |
557 | 681 | |
558 | 682 | locked = self.parse_bool() |
559 | enclosure = self.parse_R_object(reference_list) | |
560 | frame = self.parse_R_object(reference_list) | |
561 | hash_table = self.parse_R_object(reference_list) | |
562 | attributes = self.parse_R_object(reference_list) | |
683 | enclosure = self.parse_R_object(reference_list, bytecode_rep_list) | |
684 | frame = self.parse_R_object(reference_list, bytecode_rep_list) | |
685 | hash_table = self.parse_R_object(reference_list, bytecode_rep_list) | |
686 | attributes = self.parse_R_object(reference_list, bytecode_rep_list) | |
563 | 687 | |
564 | 688 | value = EnvironmentValue( |
565 | 689 | locked=locked, |
567 | 691 | frame=frame, |
568 | 692 | hash_table=hash_table, |
569 | 693 | ) |
694 | ||
695 | elif info.type in {RObjectType.SPECIAL, RObjectType.BUILTIN}: | |
696 | length = self.parse_int() | |
697 | if length > 0: | |
698 | value = self.parse_string(length=length) | |
570 | 699 | |
571 | 700 | elif info.type == RObjectType.CHAR: |
572 | 701 | length = self.parse_int() |
578 | 707 | value = None |
579 | 708 | else: |
580 | 709 | raise NotImplementedError( |
581 | f"Length of CHAR cannot be {length}") | |
710 | f"Length of CHAR cannot be {length}", | |
711 | ) | |
582 | 712 | |
583 | 713 | elif info.type == RObjectType.LGL: |
584 | 714 | length = self.parse_int() |
612 | 742 | for i in range(length): |
613 | 743 | value[i] = self.parse_complex() |
614 | 744 | |
615 | elif info.type in [RObjectType.STR, | |
616 | RObjectType.VEC, RObjectType.EXPR]: | |
745 | elif info.type in { | |
746 | RObjectType.STR, | |
747 | RObjectType.VEC, | |
748 | RObjectType.EXPR, | |
749 | }: | |
617 | 750 | length = self.parse_int() |
618 | 751 | |
619 | 752 | value = [None] * length |
620 | 753 | |
621 | 754 | for i in range(length): |
622 | value[i] = self.parse_R_object(reference_list) | |
755 | value[i] = self.parse_R_object( | |
756 | reference_list, bytecode_rep_list) | |
757 | ||
758 | elif info.type == RObjectType.BCODE: | |
759 | value = self._parse_bytecode(reference_list, bytecode_rep_list) | |
760 | tag_read = True | |
761 | ||
762 | elif info.type == RObjectType.EXTPTR: | |
763 | ||
764 | result = RObject( | |
765 | info=info, | |
766 | tag=tag, | |
767 | attributes=attributes, | |
768 | value=None, | |
769 | referenced_object=referenced_object, | |
770 | ) | |
771 | ||
772 | reference_list.append(result) | |
773 | protected = self.parse_R_object( | |
774 | reference_list, | |
775 | bytecode_rep_list, | |
776 | ) | |
777 | extptr_tag = self.parse_R_object( | |
778 | reference_list, | |
779 | bytecode_rep_list, | |
780 | ) | |
781 | ||
782 | value = (protected, extptr_tag) | |
623 | 783 | |
624 | 784 | elif info.type == RObjectType.S4: |
625 | 785 | value = None |
626 | 786 | |
627 | 787 | elif info.type == RObjectType.ALTREP: |
628 | altrep_info = self.parse_R_object(reference_list) | |
629 | altrep_state = self.parse_R_object(reference_list) | |
630 | altrep_attr = self.parse_R_object(reference_list) | |
788 | altrep_info = self.parse_R_object( | |
789 | reference_list, | |
790 | bytecode_rep_list, | |
791 | ) | |
792 | altrep_state = self.parse_R_object( | |
793 | reference_list, | |
794 | bytecode_rep_list, | |
795 | ) | |
796 | altrep_attr = self.parse_R_object( | |
797 | reference_list, | |
798 | bytecode_rep_list, | |
799 | ) | |
631 | 800 | |
632 | 801 | if self.expand_altrep: |
633 | 802 | info, value = self.expand_altrep_to_object( |
641 | 810 | elif info.type == RObjectType.EMPTYENV: |
642 | 811 | value = None |
643 | 812 | |
813 | elif info.type == RObjectType.BCREPREF: | |
814 | assert bytecode_rep_list | |
815 | position = self.parse_int() | |
816 | result = bytecode_rep_list[position] | |
817 | assert result | |
818 | return result | |
819 | ||
820 | elif info.type == RObjectType.MISSINGARG: | |
821 | value = None | |
822 | ||
644 | 823 | elif info.type == RObjectType.GLOBALENV: |
645 | 824 | value = None |
646 | 825 | |
656 | 835 | raise NotImplementedError(f"Type {info.type} not implemented") |
657 | 836 | |
658 | 837 | if info.tag and not tag_read: |
659 | warnings.warn(f"Tag not implemented for type {info.type} " | |
660 | "and ignored") | |
838 | warnings.warn( | |
839 | f"Tag not implemented for type {info.type} " | |
840 | "and ignored", | |
841 | ) | |
661 | 842 | if info.attributes and not attributes_read: |
662 | attributes = self.parse_R_object(reference_list) | |
843 | attributes = self.parse_R_object(reference_list, bytecode_rep_list) | |
663 | 844 | |
664 | 845 | if result is None: |
665 | 846 | result = RObject( |
678 | 859 | if add_reference: |
679 | 860 | reference_list.append(result) |
680 | 861 | |
862 | if bytecode_rep_position >= 0: | |
863 | assert bytecode_rep_list | |
864 | bytecode_rep_list[bytecode_rep_position] = result | |
865 | ||
681 | 866 | return result |
682 | 867 | |
683 | 868 | |
684 | 869 | class ParserXDR(Parser): |
685 | """ | |
686 | Parser used when the integers and doubles are in XDR format. | |
687 | """ | |
870 | """Parser used when the integers and doubles are in XDR format.""" | |
688 | 871 | |
689 | 872 | def __init__( |
690 | 873 | self, |
702 | 885 | self.position = position |
703 | 886 | self.xdr_parser = xdrlib.Unpacker(data) |
704 | 887 | |
705 | def parse_int(self) -> int: | |
888 | def parse_int(self) -> int: # noqa: D102 | |
706 | 889 | self.xdr_parser.set_position(self.position) |
707 | 890 | result = self.xdr_parser.unpack_int() |
708 | 891 | self.position = self.xdr_parser.get_position() |
709 | 892 | |
710 | 893 | return result |
711 | 894 | |
712 | def parse_double(self) -> float: | |
895 | def parse_double(self) -> float: # noqa: D102 | |
713 | 896 | self.xdr_parser.set_position(self.position) |
714 | 897 | result = self.xdr_parser.unpack_double() |
715 | 898 | self.position = self.xdr_parser.get_position() |
716 | 899 | |
717 | 900 | return result |
718 | 901 | |
719 | def parse_string(self, length: int) -> bytes: | |
902 | def parse_string(self, length: int) -> bytes: # noqa: D102 | |
720 | 903 | result = self.data[self.position:(self.position + length)] |
721 | 904 | self.position += length |
722 | 905 | return bytes(result) |
906 | ||
907 | def parse_all(self) -> RData: | |
908 | rdata = super().parse_all() | |
909 | assert self.position == len(self.data) | |
910 | return rdata | |
723 | 911 | |
724 | 912 | |
725 | 913 | def parse_file( |
727 | 915 | *, |
728 | 916 | expand_altrep: bool = True, |
729 | 917 | altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, |
918 | extension: str | None = None, | |
730 | 919 | ) -> RData: |
731 | 920 | """ |
732 | 921 | Parse a R file (.rda or .rdata). |
733 | 922 | |
734 | 923 | Parameters: |
735 | file_or_path (file-like, str, bytes or path-like): File | |
736 | in the R serialization format. | |
737 | expand_altrep (bool): Wether to translate ALTREPs to normal objects. | |
924 | file_or_path: File in the R serialization format. | |
925 | expand_altrep: Whether to translate ALTREPs to normal objects. | |
738 | 926 | altrep_constructor_dict: Dictionary mapping each ALTREP to |
739 | 927 | its constructor. |
928 | extension: Extension of the file. | |
740 | 929 | |
741 | 930 | Returns: |
742 | RData: Data contained in the file (versions and object). | |
931 | Data contained in the file (versions and object). | |
743 | 932 | |
744 | 933 | See Also: |
745 | 934 | :func:`parse_data`: Similar function that receives the data directly. |
746 | 935 | |
747 | 936 | Examples: |
748 | ||
749 | 937 | Parse one of the included examples, containing a vector |
750 | 938 | |
751 | 939 | >>> import rdata |
808 | 996 | """ |
809 | 997 | if isinstance(file_or_path, (os.PathLike, str)): |
810 | 998 | path = pathlib.Path(file_or_path) |
999 | if extension is None: | |
1000 | extension = path.suffix | |
811 | 1001 | data = path.read_bytes() |
812 | 1002 | else: |
813 | 1003 | # file is a pre-opened file |
822 | 1012 | data, |
823 | 1013 | expand_altrep=expand_altrep, |
824 | 1014 | altrep_constructor_dict=altrep_constructor_dict, |
1015 | extension=extension, | |
825 | 1016 | ) |
826 | 1017 | |
827 | 1018 | |
830 | 1021 | *, |
831 | 1022 | expand_altrep: bool = True, |
832 | 1023 | altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, |
1024 | extension: str | None = None, | |
833 | 1025 | ) -> RData: |
834 | 1026 | """ |
835 | 1027 | Parse the data of a R file, received as a sequence of bytes. |
836 | 1028 | |
837 | 1029 | Parameters: |
838 | data (bytes): Data extracted of a R file. | |
839 | expand_altrep (bool): Wether to translate ALTREPs to normal objects. | |
1030 | data: Data extracted of a R file. | |
1031 | expand_altrep: Whether to translate ALTREPs to normal objects. | |
840 | 1032 | altrep_constructor_dict: Dictionary mapping each ALTREP to |
841 | 1033 | its constructor. |
1034 | extension: Extension of the file. | |
842 | 1035 | |
843 | 1036 | Returns: |
844 | RData: Data contained in the file (versions and object). | |
1037 | Data contained in the file (versions and object). | |
845 | 1038 | |
846 | 1039 | See Also: |
847 | 1040 | :func:`parse_file`: Similar function that parses a file directly. |
848 | 1041 | |
849 | 1042 | Examples: |
850 | ||
851 | 1043 | Parse one of the included examples, containing a vector |
852 | 1044 | |
853 | 1045 | >>> import rdata |
918 | 1110 | if filetype in { |
919 | 1111 | FileTypes.rdata_binary_v2, |
920 | 1112 | FileTypes.rdata_binary_v3, |
1113 | None, | |
921 | 1114 | } else parse_data |
922 | 1115 | ) |
923 | 1116 | |
928 | 1121 | elif filetype is FileTypes.xz: |
929 | 1122 | new_data = lzma.decompress(data) |
930 | 1123 | elif filetype in {FileTypes.rdata_binary_v2, FileTypes.rdata_binary_v3}: |
1124 | if extension == ".rds": | |
1125 | warnings.warn( | |
1126 | f"Wrong extension {extension} for file in RDATA format", | |
1127 | ) | |
1128 | ||
931 | 1129 | view = view[len(magic_dict[filetype]):] |
932 | 1130 | new_data = view |
933 | 1131 | else: |
934 | raise NotImplementedError("Unknown file type") | |
1132 | new_data = view | |
1133 | if extension != ".rds": | |
1134 | warnings.warn("Unknown file type: assumed RDS") | |
935 | 1135 | |
936 | 1136 | return parse_function( |
937 | 1137 | new_data, # type: ignore |
938 | 1138 | expand_altrep=expand_altrep, |
939 | 1139 | altrep_constructor_dict=altrep_constructor_dict, |
1140 | extension=extension, | |
940 | 1141 | ) |
941 | 1142 | |
942 | 1143 | |
944 | 1145 | data: memoryview, |
945 | 1146 | expand_altrep: bool = True, |
946 | 1147 | altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, |
1148 | extension: str | None = None, | |
947 | 1149 | ) -> RData: |
948 | """ | |
949 | Select the appropiate parser and parse all the info. | |
950 | """ | |
1150 | """Select the appropriate parser and parse all the info.""" | |
951 | 1151 | format_type = rdata_format(data) |
952 | 1152 | |
953 | 1153 | if format_type: |
960 | 1160 | altrep_constructor_dict=altrep_constructor_dict, |
961 | 1161 | ) |
962 | 1162 | return parser.parse_all() |
963 | else: | |
964 | raise NotImplementedError("Unknown file format") | |
1163 | ||
1164 | raise NotImplementedError("Unknown file format") | |
965 | 1165 | |
966 | 1166 | |
967 | 1167 | def bits(data: int, start: int, stop: int) -> int: |
968 | """ | |
969 | Read bits [start, stop) of an integer. | |
970 | """ | |
1168 | """Read bits [start, stop) of an integer.""" | |
971 | 1169 | count = stop - start |
972 | 1170 | mask = ((1 << count) - 1) << start |
973 | 1171 | |
976 | 1174 | |
977 | 1175 | |
978 | 1176 | def is_special_r_object_type(r_object_type: RObjectType) -> bool: |
979 | """ | |
980 | Check if a R type has a different serialization than the usual one. | |
981 | """ | |
982 | return (r_object_type is RObjectType.NILVALUE | |
983 | or r_object_type is RObjectType.REF) | |
1177 | """Check if a R type has a different serialization than the usual one.""" | |
1178 | return ( | |
1179 | r_object_type is RObjectType.NILVALUE | |
1180 | or r_object_type is RObjectType.REF | |
1181 | ) | |
984 | 1182 | |
985 | 1183 | |
986 | 1184 | def parse_r_object_info(info_int: int) -> RObjectInfo: |
987 | """ | |
988 | Parse the internal information of an object. | |
989 | """ | |
1185 | """Parse the internal information of an object.""" | |
990 | 1186 | type_exp = RObjectType(bits(info_int, 0, 8)) |
991 | 1187 | |
992 | 1188 | reference = 0 |
999 | 1195 | else: |
1000 | 1196 | object_flag = bool(bits(info_int, 8, 9)) |
1001 | 1197 | attributes = bool(bits(info_int, 9, 10)) |
1002 | tag = bool(bits(info_int, 10, 11)) | |
1003 | gp = bits(info_int, 12, 28) | |
1198 | tag = bool(bits(info_int, 10, 11)) # noqa: WPS432 | |
1199 | gp = bits(info_int, 12, 28) # noqa: WPS432 | |
1004 | 1200 | |
1005 | 1201 | if type_exp == RObjectType.REF: |
1006 | reference = bits(info_int, 8, 32) | |
1202 | reference = bits(info_int, 8, 32) # noqa: WPS432 | |
1007 | 1203 | |
1008 | 1204 | return RObjectInfo( |
1009 | 1205 | type=type_exp, |
1011 | 1207 | attributes=attributes, |
1012 | 1208 | tag=tag, |
1013 | 1209 | gp=gp, |
1014 | reference=reference | |
1210 | reference=reference, | |
1015 | 1211 | ) |
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
0 | """Tests of parsing and conversion.""" | |
1 | ||
0 | 2 | import unittest |
1 | 3 | from collections import ChainMap |
2 | 4 | from fractions import Fraction |
5 | 7 | |
6 | 8 | import numpy as np |
7 | 9 | import pandas as pd |
10 | import xarray | |
8 | 11 | |
9 | 12 | import rdata |
10 | 13 | |
12 | 15 | |
13 | 16 | |
14 | 17 | class SimpleTests(unittest.TestCase): |
18 | """Collection of simple test cases.""" | |
15 | 19 | |
16 | 20 | def test_opened_file(self) -> None: |
17 | parsed = rdata.parser.parse_file(open(TESTDATA_PATH / | |
18 | "test_vector.rda")) | |
21 | """Test that an opened file can be passed to parse_file.""" | |
22 | with open(TESTDATA_PATH / "test_vector.rda") as f: | |
23 | parsed = rdata.parser.parse_file(f) | |
24 | converted = rdata.conversion.convert(parsed) | |
25 | ||
26 | self.assertIsInstance(converted, dict) | |
27 | ||
28 | def test_opened_string(self) -> None: | |
29 | """Test that a string can be passed to parse_file.""" | |
30 | parsed = rdata.parser.parse_file( | |
31 | str(TESTDATA_PATH / "test_vector.rda"), | |
32 | ) | |
19 | 33 | converted = rdata.conversion.convert(parsed) |
20 | 34 | |
21 | 35 | self.assertIsInstance(converted, dict) |
22 | 36 | |
23 | def test_opened_string(self) -> None: | |
24 | parsed = rdata.parser.parse_file(str(TESTDATA_PATH / | |
25 | "test_vector.rda")) | |
26 | converted = rdata.conversion.convert(parsed) | |
27 | ||
28 | self.assertIsInstance(converted, dict) | |
29 | ||
30 | 37 | def test_logical(self) -> None: |
38 | """Test parsing of logical vectors.""" | |
31 | 39 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_logical.rda") |
32 | 40 | converted = rdata.conversion.convert(parsed) |
33 | 41 | |
34 | 42 | np.testing.assert_equal(converted, { |
35 | "test_logical": np.array([True, True, False, True, False]) | |
43 | "test_logical": np.array([True, True, False, True, False]), | |
36 | 44 | }) |
37 | 45 | |
38 | 46 | def test_vector(self) -> None: |
47 | """Test parsing of numerical vectors.""" | |
39 | 48 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_vector.rda") |
40 | 49 | converted = rdata.conversion.convert(parsed) |
41 | 50 | |
42 | 51 | np.testing.assert_equal(converted, { |
43 | "test_vector": np.array([1., 2., 3.]) | |
52 | "test_vector": np.array([1.0, 2.0, 3.0]), | |
44 | 53 | }) |
45 | 54 | |
46 | 55 | def test_empty_string(self) -> None: |
56 | """Test that the empty string is parsed correctly.""" | |
47 | 57 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_empty_str.rda") |
48 | 58 | converted = rdata.conversion.convert(parsed) |
49 | 59 | |
50 | 60 | np.testing.assert_equal(converted, { |
51 | "test_empty_str": [""] | |
61 | "test_empty_str": [""], | |
52 | 62 | }) |
53 | 63 | |
54 | 64 | def test_na_string(self) -> None: |
55 | parsed = rdata.parser.parse_file( | |
56 | TESTDATA_PATH / "test_na_string.rda") | |
57 | converted = rdata.conversion.convert(parsed) | |
58 | ||
59 | np.testing.assert_equal(converted, { | |
60 | "test_na_string": [None] | |
65 | """Test that the NA string is parsed correctly.""" | |
66 | parsed = rdata.parser.parse_file( | |
67 | TESTDATA_PATH / "test_na_string.rda", | |
68 | ) | |
69 | converted = rdata.conversion.convert(parsed) | |
70 | ||
71 | np.testing.assert_equal(converted, { | |
72 | "test_na_string": [None], | |
61 | 73 | }) |
62 | 74 | |
63 | 75 | def test_complex(self) -> None: |
76 | """Test that complex numbers can be parsed.""" | |
64 | 77 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_complex.rda") |
65 | 78 | converted = rdata.conversion.convert(parsed) |
66 | 79 | |
67 | 80 | np.testing.assert_equal(converted, { |
68 | "test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j]) | |
81 | "test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j]), | |
69 | 82 | }) |
70 | 83 | |
71 | 84 | def test_matrix(self) -> None: |
85 | """Test that a matrix can be parsed.""" | |
72 | 86 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_matrix.rda") |
73 | 87 | converted = rdata.conversion.convert(parsed) |
74 | 88 | |
75 | 89 | np.testing.assert_equal(converted, { |
76 | "test_matrix": np.array([[1., 2., 3.], | |
77 | [4., 5., 6.]]) | |
78 | }) | |
90 | "test_matrix": np.array([ | |
91 | [1.0, 2.0, 3.0], | |
92 | [4.0, 5.0, 6.0], | |
93 | ]), | |
94 | }) | |
95 | ||
96 | def test_named_matrix(self) -> None: | |
97 | """Test that a named matrix can be parsed.""" | |
98 | parsed = rdata.parser.parse_file( | |
99 | TESTDATA_PATH / "test_named_matrix.rda", | |
100 | ) | |
101 | converted = rdata.conversion.convert(parsed) | |
102 | reference = xarray.DataArray( | |
103 | [ | |
104 | [1.0, 2.0, 3.0], | |
105 | [4.0, 5.0, 6.0], | |
106 | ], | |
107 | dims=["dim_0", "dim_1"], | |
108 | coords={ | |
109 | "dim_0": ["dim0_0", "dim0_1"], | |
110 | "dim_1": ["dim1_0", "dim1_1", "dim1_2"], | |
111 | }, | |
112 | ) | |
113 | ||
114 | xarray.testing.assert_identical( | |
115 | converted["test_named_matrix"], | |
116 | reference, | |
117 | ) | |
118 | ||
119 | def test_half_named_matrix(self) -> None: | |
120 | """Test that a named matrix with no name for a dim can be parsed.""" | |
121 | parsed = rdata.parser.parse_file( | |
122 | TESTDATA_PATH / "test_half_named_matrix.rda", | |
123 | ) | |
124 | converted = rdata.conversion.convert(parsed) | |
125 | reference = xarray.DataArray( | |
126 | [ | |
127 | [1.0, 2.0, 3.0], | |
128 | [4.0, 5.0, 6.0], | |
129 | ], | |
130 | dims=["dim_0", "dim_1"], | |
131 | coords={ | |
132 | "dim_0": ["dim0_0", "dim0_1"], | |
133 | }, | |
134 | ) | |
135 | ||
136 | xarray.testing.assert_identical( | |
137 | converted["test_half_named_matrix"], | |
138 | reference, | |
139 | ) | |
140 | ||
141 | def test_full_named_matrix(self) -> None: | |
142 | """Test that a named matrix with dim names can be parsed.""" | |
143 | parsed = rdata.parser.parse_file( | |
144 | TESTDATA_PATH / "test_full_named_matrix.rda", | |
145 | ) | |
146 | converted = rdata.conversion.convert(parsed) | |
147 | reference = xarray.DataArray( | |
148 | [ | |
149 | [1.0, 2.0, 3.0], | |
150 | [4.0, 5.0, 6.0], | |
151 | ], | |
152 | dims=["my_dim_0", "my_dim_1"], | |
153 | coords={ | |
154 | "my_dim_0": ["dim0_0", "dim0_1"], | |
155 | "my_dim_1": ["dim1_0", "dim1_1", "dim1_2"], | |
156 | }, | |
157 | ) | |
158 | ||
159 | xarray.testing.assert_identical( | |
160 | converted["test_full_named_matrix"], | |
161 | reference, | |
162 | ) | |
163 | ||
164 | def test_full_named_matrix_rds(self) -> None: | |
165 | """Test that a named matrix with dim names can be parsed.""" | |
166 | parsed = rdata.parser.parse_file( | |
167 | TESTDATA_PATH / "test_full_named_matrix.rds", | |
168 | ) | |
169 | converted = rdata.conversion.convert(parsed) | |
170 | reference = xarray.DataArray( | |
171 | [ | |
172 | [1.0, 2.0, 3.0], | |
173 | [4.0, 5.0, 6.0], | |
174 | ], | |
175 | dims=["my_dim_0", "my_dim_1"], | |
176 | coords={ | |
177 | "my_dim_0": ["dim0_0", "dim0_1"], | |
178 | "my_dim_1": ["dim1_0", "dim1_1", "dim1_2"], | |
179 | }, | |
180 | ) | |
181 | ||
182 | xarray.testing.assert_identical( | |
183 | converted, | |
184 | reference, | |
185 | ) | |
79 | 186 | |
80 | 187 | def test_list(self) -> None: |
188 | """Test that list can be parsed.""" | |
81 | 189 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list.rda") |
82 | 190 | converted = rdata.conversion.convert(parsed) |
83 | 191 | |
84 | 192 | np.testing.assert_equal(converted, { |
85 | 193 | "test_list": |
86 | 194 | [ |
87 | np.array([1.]), | |
195 | np.array([1.0]), | |
88 | 196 | ['a', 'b', 'c'], |
89 | np.array([2., 3.]), | |
90 | ['hi'] | |
91 | ] | |
197 | np.array([2.0, 3.0]), | |
198 | ['hi'], | |
199 | ], | |
200 | }) | |
201 | ||
202 | def test_file(self) -> None: | |
203 | """Test that external pointers can be parsed.""" | |
204 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_file.rda") | |
205 | converted = rdata.conversion.convert(parsed) | |
206 | ||
207 | np.testing.assert_equal(converted, { | |
208 | "test_file": [5], | |
92 | 209 | }) |
93 | 210 | |
94 | 211 | def test_expression(self) -> None: |
212 | """Test that expressions can be parsed.""" | |
95 | 213 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_expression.rda") |
96 | 214 | converted = rdata.conversion.convert(parsed) |
97 | 215 | |
98 | 216 | np.testing.assert_equal(converted, { |
99 | 217 | "test_expression": rdata.conversion.RExpression([ |
100 | rdata.conversion.RLanguage(['^', 'base', 'exponent'])]) | |
101 | }) | |
218 | rdata.conversion.RLanguage( | |
219 | ['^', 'base', 'exponent'], | |
220 | attributes={}, | |
221 | ), | |
222 | ]), | |
223 | }) | |
224 | ||
225 | def test_builtin(self) -> None: | |
226 | """Test that builtin functions can be parsed.""" | |
227 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_builtin.rda") | |
228 | converted = rdata.conversion.convert(parsed) | |
229 | ||
230 | np.testing.assert_equal(converted, { | |
231 | "test_builtin": rdata.conversion.RBuiltin(name="abs"), | |
232 | }) | |
233 | ||
234 | def test_minimal_function_uncompiled(self) -> None: | |
235 | """Test that a minimal function can be parsed.""" | |
236 | parsed = rdata.parser.parse_file( | |
237 | TESTDATA_PATH / "test_minimal_function_uncompiled.rda") | |
238 | converted = rdata.conversion.convert(parsed) | |
239 | ||
240 | converted_fun = converted["test_minimal_function_uncompiled"] | |
241 | ||
242 | self.assertIsInstance( | |
243 | converted_fun, | |
244 | rdata.conversion.RFunction, | |
245 | ) | |
246 | ||
247 | np.testing.assert_equal(converted_fun.environment, ChainMap({})) | |
248 | np.testing.assert_equal(converted_fun.formals, None) | |
249 | np.testing.assert_equal(converted_fun.body, None) | |
250 | np.testing.assert_equal( | |
251 | converted_fun.source, | |
252 | "test_minimal_function_uncompiled <- function() NULL\n", | |
253 | ) | |
254 | ||
255 | def test_minimal_function(self) -> None: | |
256 | """Test that a minimal function (compiled) can be parsed.""" | |
257 | parsed = rdata.parser.parse_file( | |
258 | TESTDATA_PATH / "test_minimal_function.rda") | |
259 | converted = rdata.conversion.convert(parsed) | |
260 | ||
261 | converted_fun = converted["test_minimal_function"] | |
262 | ||
263 | self.assertIsInstance( | |
264 | converted_fun, | |
265 | rdata.conversion.RFunction, | |
266 | ) | |
267 | ||
268 | np.testing.assert_equal(converted_fun.environment, ChainMap({})) | |
269 | np.testing.assert_equal(converted_fun.formals, None) | |
270 | ||
271 | converted_body = converted_fun.body | |
272 | ||
273 | self.assertIsInstance( | |
274 | converted_body, | |
275 | rdata.conversion.RBytecode, | |
276 | ) | |
277 | ||
278 | np.testing.assert_equal(converted_body.code, np.array([12, 17, 1])) | |
279 | np.testing.assert_equal(converted_body.attributes, {}) | |
280 | ||
281 | np.testing.assert_equal( | |
282 | converted_fun.source, | |
283 | "test_minimal_function <- function() NULL\n", | |
284 | ) | |
285 | ||
286 | def test_empty_function_uncompiled(self) -> None: | |
287 | """Test that a simple function can be parsed.""" | |
288 | parsed = rdata.parser.parse_file( | |
289 | TESTDATA_PATH / "test_empty_function_uncompiled.rda") | |
290 | converted = rdata.conversion.convert(parsed) | |
291 | ||
292 | converted_fun = converted["test_empty_function_uncompiled"] | |
293 | ||
294 | self.assertIsInstance( | |
295 | converted_fun, | |
296 | rdata.conversion.RFunction, | |
297 | ) | |
298 | ||
299 | np.testing.assert_equal(converted_fun.environment, ChainMap({})) | |
300 | np.testing.assert_equal(converted_fun.formals, None) | |
301 | self.assertIsInstance(converted_fun.body, rdata.conversion.RLanguage) | |
302 | np.testing.assert_equal( | |
303 | converted_fun.source, | |
304 | "test_empty_function_uncompiled <- function() {}\n", | |
305 | ) | |
306 | ||
307 | def test_empty_function(self) -> None: | |
308 | """Test that a simple function (compiled) can be parsed.""" | |
309 | parsed = rdata.parser.parse_file( | |
310 | TESTDATA_PATH / "test_empty_function.rda") | |
311 | converted = rdata.conversion.convert(parsed) | |
312 | ||
313 | converted_fun = converted["test_empty_function"] | |
314 | ||
315 | self.assertIsInstance( | |
316 | converted_fun, | |
317 | rdata.conversion.RFunction, | |
318 | ) | |
319 | ||
320 | np.testing.assert_equal(converted_fun.environment, ChainMap({})) | |
321 | np.testing.assert_equal(converted_fun.formals, None) | |
322 | ||
323 | converted_body = converted_fun.body | |
324 | ||
325 | self.assertIsInstance( | |
326 | converted_body, | |
327 | rdata.conversion.RBytecode, | |
328 | ) | |
329 | ||
330 | np.testing.assert_equal(converted_body.code, np.array([12, 17, 1])) | |
331 | np.testing.assert_equal(converted_body.attributes, {}) | |
332 | ||
333 | np.testing.assert_equal( | |
334 | converted_fun.source, | |
335 | "test_empty_function <- function() {}\n", | |
336 | ) | |
337 | ||
338 | def test_function(self) -> None: | |
339 | """Test that functions can be parsed.""" | |
340 | parsed = rdata.parser.parse_file( | |
341 | TESTDATA_PATH / "test_function.rda") | |
342 | converted = rdata.conversion.convert(parsed) | |
343 | ||
344 | converted_fun = converted["test_function"] | |
345 | ||
346 | self.assertIsInstance( | |
347 | converted_fun, | |
348 | rdata.conversion.RFunction, | |
349 | ) | |
350 | ||
351 | np.testing.assert_equal(converted_fun.environment, ChainMap({})) | |
352 | np.testing.assert_equal(converted_fun.formals, None) | |
353 | ||
354 | converted_body = converted_fun.body | |
355 | ||
356 | self.assertIsInstance( | |
357 | converted_body, | |
358 | rdata.conversion.RBytecode, | |
359 | ) | |
360 | ||
361 | np.testing.assert_equal( | |
362 | converted_body.code, | |
363 | np.array([12, 23, 1, 34, 4, 38, 2, 1]), | |
364 | ) | |
365 | np.testing.assert_equal(converted_body.attributes, {}) | |
366 | ||
367 | np.testing.assert_equal( | |
368 | converted_fun.source, | |
369 | "test_function <- function() {print(\"Hello\")}\n", | |
370 | ) | |
371 | ||
372 | def test_function_arg(self) -> None: | |
373 | """Test that functions can be parsed.""" | |
374 | parsed = rdata.parser.parse_file( | |
375 | TESTDATA_PATH / "test_function_arg.rda") | |
376 | converted = rdata.conversion.convert(parsed) | |
377 | ||
378 | converted_fun = converted["test_function_arg"] | |
379 | ||
380 | self.assertIsInstance( | |
381 | converted_fun, | |
382 | rdata.conversion.RFunction, | |
383 | ) | |
384 | ||
385 | np.testing.assert_equal(converted_fun.environment, ChainMap({})) | |
386 | np.testing.assert_equal(converted_fun.formals, {"a": NotImplemented}) | |
387 | ||
388 | converted_body = converted_fun.body | |
389 | ||
390 | self.assertIsInstance( | |
391 | converted_body, | |
392 | rdata.conversion.RBytecode, | |
393 | ) | |
394 | ||
395 | np.testing.assert_equal( | |
396 | converted_body.code, | |
397 | np.array([12, 23, 1, 29, 4, 38, 2, 1]), | |
398 | ) | |
399 | np.testing.assert_equal(converted_body.attributes, {}) | |
400 | ||
401 | np.testing.assert_equal( | |
402 | converted_fun.source, | |
403 | "test_function_arg <- function(a) {print(a)}\n", | |
404 | ) | |
102 | 405 | |
103 | 406 | def test_encodings(self) -> None: |
104 | ||
407 | """Test of different encodings.""" | |
105 | 408 | with self.assertWarns( |
106 | 409 | UserWarning, |
107 | msg="Unknown encoding. Assumed ASCII." | |
410 | msg="Unknown encoding. Assumed ASCII.", | |
108 | 411 | ): |
109 | 412 | parsed = rdata.parser.parse_file( |
110 | 413 | TESTDATA_PATH / "test_encodings.rda", |
119 | 422 | }) |
120 | 423 | |
121 | 424 | def test_encodings_v3(self) -> None: |
122 | ||
425 | """Test encodings in version 3 format.""" | |
123 | 426 | parsed = rdata.parser.parse_file( |
124 | 427 | TESTDATA_PATH / "test_encodings_v3.rda", |
125 | 428 | ) |
133 | 436 | }) |
134 | 437 | |
135 | 438 | def test_dataframe(self) -> None: |
136 | ||
137 | for f in {"test_dataframe.rda", "test_dataframe_v3.rda"}: | |
439 | """Test dataframe conversion.""" | |
440 | for f in ("test_dataframe.rda", "test_dataframe_v3.rda"): | |
138 | 441 | with self.subTest(file=f): |
139 | 442 | parsed = rdata.parser.parse_file( |
140 | 443 | TESTDATA_PATH / f, |
143 | 446 | |
144 | 447 | pd.testing.assert_frame_equal( |
145 | 448 | converted["test_dataframe"], |
146 | pd.DataFrame({ | |
147 | "class": pd.Categorical( | |
148 | ["a", "b", "b"]), | |
149 | "value": [1, 2, 3], | |
150 | }) | |
449 | pd.DataFrame( | |
450 | { | |
451 | "class": pd.Categorical( | |
452 | ["a", "b", "b"], | |
453 | ), | |
454 | "value": [1, 2, 3], | |
455 | }, | |
456 | index=pd.RangeIndex(start=1, stop=4), | |
457 | ), | |
151 | 458 | ) |
152 | 459 | |
460 | def test_dataframe_rds(self) -> None: | |
461 | """Test dataframe conversion.""" | |
462 | for f in ("test_dataframe.rds", "test_dataframe_v3.rds"): | |
463 | with self.subTest(file=f): | |
464 | parsed = rdata.parser.parse_file( | |
465 | TESTDATA_PATH / f, | |
466 | ) | |
467 | converted = rdata.conversion.convert(parsed) | |
468 | ||
469 | pd.testing.assert_frame_equal( | |
470 | converted, | |
471 | pd.DataFrame( | |
472 | { | |
473 | "class": pd.Categorical( | |
474 | ["a", "b", "b"], | |
475 | ), | |
476 | "value": [1, 2, 3], | |
477 | }, | |
478 | index=pd.RangeIndex(start=1, stop=4), | |
479 | ), | |
480 | ) | |
481 | ||
482 | def test_dataframe_rownames(self) -> None: | |
483 | """Test dataframe conversion.""" | |
484 | parsed = rdata.parser.parse_file( | |
485 | TESTDATA_PATH / "test_dataframe_rownames.rda", | |
486 | ) | |
487 | converted = rdata.conversion.convert(parsed) | |
488 | ||
489 | pd.testing.assert_frame_equal( | |
490 | converted["test_dataframe_rownames"], | |
491 | pd.DataFrame( | |
492 | { | |
493 | "class": pd.Categorical( | |
494 | ["a", "b", "b"], | |
495 | ), | |
496 | "value": [1, 2, 3], | |
497 | }, | |
498 | index=('Madrid', 'Frankfurt', 'Herzberg am Harz'), | |
499 | ), | |
500 | ) | |
501 | ||
153 | 502 | def test_ts(self) -> None: |
503 | """Test time series conversion.""" | |
154 | 504 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_ts.rda") |
155 | 505 | converted = rdata.conversion.convert(parsed) |
156 | 506 | |
157 | pd.testing.assert_series_equal(converted["test_ts"], | |
158 | pd.Series({ | |
159 | 2000 + Fraction(2, 12): 1., | |
160 | 2000 + Fraction(3, 12): 2., | |
161 | 2000 + Fraction(4, 12): 3., | |
162 | })) | |
507 | pd.testing.assert_series_equal( | |
508 | converted["test_ts"], | |
509 | pd.Series({ | |
510 | 2000 + Fraction(2, 12): 1.0, | |
511 | 2000 + Fraction(3, 12): 2.0, | |
512 | 2000 + Fraction(4, 12): 3.0, | |
513 | }), | |
514 | ) | |
163 | 515 | |
164 | 516 | def test_s4(self) -> None: |
517 | """Test parsing of S4 classes.""" | |
165 | 518 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_s4.rda") |
166 | 519 | converted = rdata.conversion.convert(parsed) |
167 | 520 | |
169 | 522 | "test_s4": SimpleNamespace( |
170 | 523 | age=np.array(28), |
171 | 524 | name=["Carlos"], |
172 | **{'class': ["Person"]} | |
173 | ) | |
525 | **{'class': ["Person"]}, # noqa: WPS517 | |
526 | ), | |
174 | 527 | }) |
175 | 528 | |
176 | 529 | def test_environment(self) -> None: |
177 | parsed = rdata.parser.parse_file( | |
178 | TESTDATA_PATH / "test_environment.rda") | |
530 | """Test parsing of environments.""" | |
531 | parsed = rdata.parser.parse_file( | |
532 | TESTDATA_PATH / "test_environment.rda", | |
533 | ) | |
179 | 534 | converted = rdata.conversion.convert(parsed) |
180 | 535 | |
181 | 536 | dict_env = {'string': ['test']} |
182 | 537 | empty_global_env: Dict[str, Any] = {} |
183 | 538 | |
184 | 539 | np.testing.assert_equal(converted, { |
185 | "test_environment": ChainMap(dict_env, ChainMap(empty_global_env)) | |
540 | "test_environment": ChainMap(dict_env, ChainMap(empty_global_env)), | |
186 | 541 | }) |
187 | 542 | |
188 | 543 | global_env = {"global": "test"} |
193 | 548 | ) |
194 | 549 | |
195 | 550 | np.testing.assert_equal(converted_global, { |
196 | "test_environment": ChainMap(dict_env, ChainMap(global_env)) | |
551 | "test_environment": ChainMap(dict_env, ChainMap(global_env)), | |
197 | 552 | }) |
198 | 553 | |
199 | 554 | def test_emptyenv(self) -> None: |
200 | parsed = rdata.parser.parse_file( | |
201 | TESTDATA_PATH / "test_emptyenv.rda") | |
202 | converted = rdata.conversion.convert(parsed) | |
203 | ||
204 | np.testing.assert_equal(converted, { | |
205 | "test_emptyenv": ChainMap({}) | |
555 | """Test parsing the empty environment.""" | |
556 | parsed = rdata.parser.parse_file( | |
557 | TESTDATA_PATH / "test_emptyenv.rda", | |
558 | ) | |
559 | converted = rdata.conversion.convert(parsed) | |
560 | ||
561 | self.assertEqual(converted, { | |
562 | "test_emptyenv": ChainMap({}), | |
206 | 563 | }) |
207 | 564 | |
208 | 565 | def test_list_attrs(self) -> None: |
566 | """Test that lists accept attributes.""" | |
209 | 567 | parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list_attrs.rda") |
210 | 568 | converted = rdata.conversion.convert(parsed) |
211 | 569 | |
212 | 570 | np.testing.assert_equal(converted, { |
213 | "test_list_attrs": [['list'], [5]] | |
571 | "test_list_attrs": [['list'], [5]], | |
214 | 572 | }) |
215 | 573 | |
216 | 574 | def test_altrep_compact_intseq(self) -> None: |
243 | 601 | converted = rdata.conversion.convert(parsed) |
244 | 602 | |
245 | 603 | np.testing.assert_equal(converted, { |
246 | "test_altrep_deferred_string": [ | |
604 | "test_altrep_deferred_string": [ # noqa: WPS317 | |
247 | 605 | "1", "2.3", "10000", |
248 | 606 | "1e+05", "-10000", "-1e+05", |
249 | 607 | "0.001", "1e-04", "1e-05", |
285 | 643 | |
286 | 644 | |
287 | 645 | if __name__ == "__main__": |
288 | # import sys;sys.argv = ['', 'Test.testName'] | |
289 | 646 | unittest.main() |
9 | 9 | include_trailing_comma = true |
10 | 10 | use_parentheses = true |
11 | 11 | combine_as_imports = 1 |
12 | ||
13 | [flake8] | |
14 | ignore = | |
15 | # No docstring for magic methods | |
16 | D105, | |
17 | # No docstrings in __init__ | |
18 | D107, | |
19 | # Ignore until https://github.com/terrencepreilly/darglint/issues/54 is closed | |
20 | DAR202, | |
21 | # Ignore until https://github.com/terrencepreilly/darglint/issues/144 is closed | |
22 | DAR401, | |
23 | # Non-explicit exceptions may be documented in raises | |
24 | DAR402, | |
25 | # Uppercase arguments like X are common in scikit-learn | |
26 | N803, | |
27 | # Uppercase variables like X are common in scikit-learn | |
28 | N806, | |
29 | # There are no bad quotes | |
30 | Q000, | |
31 | # Google Python style is not RST until after processed by Napoleon | |
32 | # See https://github.com/peterjc/flake8-rst-docstrings/issues/17 | |
33 | RST201, RST203, RST301, | |
34 | # assert is used by pytest tests | |
35 | S101, | |
36 | # Line break occurred before a binary operator (antipattern) | |
37 | W503, | |
38 | # Utils is used as a module name | |
39 | WPS100, | |
40 | # Short names like X or y are common in scikit-learn | |
41 | WPS111, | |
42 | # We do not like this underscored numbers convention | |
43 | WPS114, | |
44 | # Attributes in uppercase are used in enums | |
45 | WPS115, | |
46 | # Trailing underscores are a scikit-learn convention | |
47 | WPS120, | |
48 | # Cognitive complexity cannot be avoided at some modules | |
49 | WPS232, | |
50 | # The number of imported things may be large, especially for typing | |
51 | WPS235, | |
52 | # We like local imports, thanks | |
53 | WPS300, | |
54 | # Dotted imports are ok | |
55 | WPS301, | |
56 | # We love f-strings | |
57 | WPS305, | |
58 | # Implicit string concatenation is useful for exception messages | |
59 | WPS306, | |
60 | # No base class needed | |
61 | WPS326, | |
62 | # We allow multiline conditions | |
63 | WPS337, | |
64 | # We order methods differently | |
65 | WPS338, | |
66 | # We need multine loops | |
67 | WPS352, | |
68 | # Assign to a subcript slice is normal behaviour in numpy | |
69 | WPS362, | |
70 | # All keywords are beautiful | |
71 | WPS420, | |
72 | # We use nested imports sometimes, and it is not THAT bad | |
73 | WPS433, | |
74 | # We use list multiplication to allocate list with immutable values (None or numbers) | |
75 | WPS435, | |
76 | # Our private modules are fine to import | |
77 | # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441) | |
78 | WPS436, | |
79 | # Our private objects are fine to import | |
80 | WPS450, | |
81 | # Numpy mixes bitwise and comparison operators | |
82 | WPS465, | |
83 | # Explicit len compare is better than implicit | |
84 | WPS507, | |
85 | # Comparison with not is not the same as with equality | |
86 | WPS520, | |
87 | ||
88 | per-file-ignores = | |
89 | __init__.py: | |
90 | # Unused modules are allowed in `__init__.py`, to reduce imports | |
91 | F401, | |
92 | # Explicit re-exports allowed in __init__ | |
93 | WPS113, | |
94 | # Import multiple names is allowed in `__init__.py` | |
95 | WPS235, | |
96 | # Logic is allowed in `__init__.py` | |
97 | WPS412 | |
98 | ||
99 | # Tests benefit from overused expressions, magic numbers and fixtures | |
100 | test_*.py: WPS204, WPS432, WPS442 | |
101 | ||
102 | rst-directives = | |
103 | # These are sorted alphabetically - but that does not matter | |
104 | autosummary,data,currentmodule,deprecated, | |
105 | glossary,moduleauthor,plot,testcode, | |
106 | versionadded,versionchanged, | |
107 | ||
108 | rst-roles = | |
109 | attr,class,func,meth,mod,obj,ref,term, | |
110 | ||
111 | allowed-domain-names = data, info, obj, result, results, val, value, values, var | |
112 | ||
113 | # Needs to be tuned | |
114 | max-arguments = 10 | |
115 | max-attributes = 10 | |
116 | max-cognitive-score = 30 | |
117 | max-expressions = 15 | |
118 | max-imports = 20 | |
119 | max-line-complexity = 30 | |
120 | max-local-variables = 15 | |
121 | max-methods = 30 | |
122 | max-module-expressions = 15 | |
123 | max-module-members = 15 | |
124 | max-string-usages = 10 | |
125 | ||
126 | ignore-decorators = (property)|(overload) | |
127 | ||
128 | strictness = long | |
129 | ||
130 | # Beautify output and make it more informative | |
131 | format = wemake | |
132 | show-source = true | |
12 | 133 | |
13 | 134 | [mypy] |
14 | 135 | strict = True |
6 | 6 | language or its libraries, and thus it is released under a MIT license. |
7 | 7 | """ |
8 | 8 | import os |
9 | import pathlib | |
9 | 10 | import sys |
10 | 11 | |
11 | 12 | from setuptools import find_packages, setup |
15 | 16 | |
16 | 17 | DOCLINES = (__doc__ or '').split("\n") |
17 | 18 | |
18 | with open(os.path.join(os.path.dirname(__file__), | |
19 | 'VERSION'), 'r') as version_file: | |
19 | with open( | |
20 | pathlib.Path(os.path.dirname(__file__)) / 'rdata' / 'VERSION', | |
21 | 'r', | |
22 | ) as version_file: | |
20 | 23 | version = version_file.read().strip() |
21 | 24 | |
22 | setup(name='rdata', | |
23 | version=version, | |
24 | description=DOCLINES[1], | |
25 | long_description="\n".join(DOCLINES[3:]), | |
26 | url='https://github.com/vnmabus/rdata', | |
27 | author='Carlos Ramos Carreño', | |
28 | author_email='vnmabus@gmail.com', | |
29 | include_package_data=True, | |
30 | platforms=['any'], | |
31 | license='MIT', | |
32 | packages=find_packages(), | |
33 | python_requires='>=3.7, <4', | |
34 | classifiers=[ | |
35 | 'Development Status :: 4 - Beta', | |
36 | 'Intended Audience :: Developers', | |
37 | 'Intended Audience :: Science/Research', | |
38 | 'License :: OSI Approved :: MIT License', | |
39 | 'Natural Language :: English', | |
40 | 'Operating System :: OS Independent', | |
41 | 'Programming Language :: Python :: 3', | |
42 | 'Programming Language :: Python :: 3.6', | |
43 | 'Programming Language :: Python :: 3.7', | |
44 | 'Programming Language :: Python :: 3.8', | |
45 | 'Topic :: Scientific/Engineering :: Mathematics', | |
46 | 'Topic :: Software Development :: Libraries :: Python Modules', | |
47 | 'Typing :: Typed', | |
48 | ], | |
49 | keywords=['rdata', 'r', 'dataset'], | |
50 | install_requires=['numpy', | |
51 | 'xarray', | |
52 | 'pandas'], | |
53 | setup_requires=pytest_runner, | |
54 | tests_require=['pytest-cov', | |
55 | 'numpy>=1.14' # The printing format for numpy changes | |
56 | ], | |
57 | test_suite='rdata.tests', | |
58 | zip_safe=False) | |
25 | setup( | |
26 | name='rdata', | |
27 | version=version, | |
28 | description=DOCLINES[1], | |
29 | long_description="\n".join(DOCLINES[3:]), | |
30 | url='https://github.com/vnmabus/rdata', | |
31 | author='Carlos Ramos Carreño', | |
32 | author_email='vnmabus@gmail.com', | |
33 | include_package_data=True, | |
34 | platforms=['any'], | |
35 | license='MIT', | |
36 | packages=find_packages(), | |
37 | python_requires='>=3.7, <4', | |
38 | classifiers=[ | |
39 | 'Development Status :: 4 - Beta', | |
40 | 'Intended Audience :: Developers', | |
41 | 'Intended Audience :: Science/Research', | |
42 | 'License :: OSI Approved :: MIT License', | |
43 | 'Natural Language :: English', | |
44 | 'Operating System :: OS Independent', | |
45 | 'Programming Language :: Python :: 3', | |
46 | 'Programming Language :: Python :: 3.6', | |
47 | 'Programming Language :: Python :: 3.7', | |
48 | 'Programming Language :: Python :: 3.8', | |
49 | 'Topic :: Scientific/Engineering :: Mathematics', | |
50 | 'Topic :: Software Development :: Libraries :: Python Modules', | |
51 | 'Typing :: Typed', | |
52 | ], | |
53 | keywords=['rdata', 'r', 'dataset'], | |
54 | install_requires=[ | |
55 | 'numpy', | |
56 | 'xarray', | |
57 | 'pandas', | |
58 | ], | |
59 | setup_requires=pytest_runner, | |
60 | tests_require=[ | |
61 | 'pytest-cov', | |
62 | 'numpy>=1.14', # The printing format for numpy changes | |
63 | ], | |
64 | test_suite='rdata.tests', | |
65 | zip_safe=False, | |
66 | ) |