Codebase list python-rdata / fresh-releases/main
New upstream release. Debian Janitor 1 year, 5 months ago
33 changed file(s) with 1446 addition(s) and 449 deletion(s). Raw diff Collapse all Expand all
2929 pip3 install .
3030 coverage run --source=rdata/ --omit=rdata/tests/ setup.py test;
3131
32 - name: Generate coverage XML
33 run: |
34 coverage xml
35
3236 - name: Upload coverage to Codecov
33 uses: codecov/codecov-action@v1
37 uses: codecov/codecov-action@v2
0 # This workflow will upload a Python Package using Twine when a release is created
1 # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
2
3 # This workflow uses actions that are not certified by GitHub.
4 # They are provided by a third-party and are governed by
5 # separate terms of service, privacy policy, and support
6 # documentation.
7
8 name: Upload Python Package
9
10 on:
11 release:
12 types: [published]
13
14 permissions:
15 contents: read
16
17 jobs:
18 deploy:
19
20 runs-on: ubuntu-latest
21
22 steps:
23 - uses: actions/checkout@v3
24 - name: Set up Python
25 uses: actions/setup-python@v3
26 with:
27 python-version: '3.x'
28 - name: Install dependencies
29 run: |
30 python -m pip install --upgrade pip
31 pip install build
32 - name: Build package
33 run: python -m build
34 - name: Publish package
35 uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
36 with:
37 user: __token__
38 password: ${{ secrets.PYPI_API_TOKEN }}
0 cff-version: 1.2.0
1 message: "If you use this software, please cite it as below."
2 authors:
3 - family-names: "Ramos-Carreño"
4 given-names: "Carlos"
5 orcid: "https://orcid.org/0000-0003-2566-7058"
6 affiliation: "Universidad Autónoma de Madrid"
7 email: vnmabus@gmail.com
8 title: "rdata: Read R datasets from Python"
9 date-released: 2022-03-24
10 doi: 10.5281/zenodo.6382237
11 url: "https://github.com/vnmabus/rdata"
12 license: MIT
13 keywords:
14 - rdata
15 - Python
16 - R
17 - parser
18 - conversion
19 identifiers:
20 - description: "This is the collection of archived snapshots of all versions of rdata"
21 type: doi
22 value: 10.5281/zenodo.6382237
23 - description: "This is the archived snapshot of version 0.7 of rdata"
24 type: doi
25 value: 10.5281/zenodo.6382238
00 include MANIFEST.in
1 include VERSION
1 include rdata/VERSION
22 include LICENSE
33 include rdata/py.typed
44 include *.txt
00 rdata
11 =====
22
3 |build-status| |docs| |coverage| |landscape| |pypi|
3 |build-status| |docs| |coverage| |landscape| |pypi| |zenodo|
44
55 Read R datasets from Python.
66
102102 >>> converted = rdata.conversion.convert(parsed, new_dict)
103103 >>> converted
104104 {'test_dataframe': class value
105 0 b'a' 1
106 1 b'b' 2
107 2 b'b' 3}
105 1 b'a' 1
106 2 b'b' 2
107 3 b'b' 3}
108108
109109
110110 .. |build-status| image:: https://github.com/vnmabus/rdata/actions/workflows/main.yml/badge.svg?branch=master
129129 .. |pypi| image:: https://badge.fury.io/py/rdata.svg
130130 :alt: Pypi version
131131 :scale: 100%
132 :target: https://pypi.python.org/pypi/rdata/
132 :target: https://pypi.python.org/pypi/rdata/
133
134 .. |zenodo| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.6382237.svg
135 :alt: Zenodo DOI
136 :scale: 100%
137 :target: https://doi.org/10.5281/zenodo.6382237
+0
-1
VERSION less more
0 0.5
0 python-rdata (0.9-1) UNRELEASED; urgency=low
1
2 * New upstream release.
3
4 -- Debian Janitor <janitor@jelmer.uk> Sat, 22 Oct 2022 12:20:08 -0000
5
06 python-rdata (0.5-3) unstable; urgency=medium
17
28 [ Debian Janitor ]
2121 # sys.path.insert(0, '/home/carlos/git/rdata/rdata')
2222
2323 import sys
24
2425 import pkg_resources
26
2527 try:
2628 release = pkg_resources.get_distribution('rdata').version
2729 except pkg_resources.DistributionNotFound:
207209
208210 intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
209211 'pandas': ('http://pandas.pydata.org/pandas-docs/dev', None)}
212
213 autodoc_preserve_defaults = True
214 autodoc_typehints = "description"
6969 >>> converted = rdata.conversion.convert(parsed, new_dict)
7070 >>> converted
7171 {'test_dataframe': class value
72 0 b'a' 1
73 1 b'b' 2
74 2 b'b' 3}
72 1 b'a' 1
73 2 b'b' 2
74 3 b'b' 3}
0 0.9
0 """rdata: Read R datasets from Python."""
1 import errno as _errno
02 import os as _os
13 import pathlib as _pathlib
24
1214 Path of the test data.
1315
1416 """
17
18 try:
19 with open(
20 _pathlib.Path(_os.path.dirname(__file__)) / 'VERSION',
21 'r',
22 ) as version_file:
23 __version__ = version_file.read().strip()
24 except IOError as e:
25 if e.errno != _errno.ENOENT:
26 raise
27
28 __version__ = "0.0"
0 from ._conversion import (RExpression, RLanguage,
1 convert_list, convert_attrs, convert_vector,
2 convert_char, convert_symbol, convert_array,
3 Converter, SimpleConverter,
4 dataframe_constructor,
5 factor_constructor,
6 ts_constructor,
7 DEFAULT_CLASS_MAP, convert)
0 from ._conversion import (
1 DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP,
2 Converter as Converter,
3 RBuiltin as RBuiltin,
4 RBytecode as RBytecode,
5 RExpression as RExpression,
6 RFunction as RFunction,
7 RLanguage as RLanguage,
8 SimpleConverter as SimpleConverter,
9 convert as convert,
10 convert_array as convert_array,
11 convert_attrs as convert_attrs,
12 convert_char as convert_char,
13 convert_list as convert_list,
14 convert_symbol as convert_symbol,
15 convert_vector as convert_vector,
16 dataframe_constructor as dataframe_constructor,
17 factor_constructor as factor_constructor,
18 ts_constructor as ts_constructor,
19 )
0 from __future__ import annotations
1
02 import abc
13 import warnings
4 from dataclasses import dataclass
25 from fractions import Fraction
36 from types import MappingProxyType, SimpleNamespace
47 from typing import (
58 Any,
69 Callable,
710 ChainMap,
8 Hashable,
911 List,
1012 Mapping,
1113 MutableMapping,
1214 NamedTuple,
1315 Optional,
16 Sequence,
1417 Union,
1518 cast,
1619 )
2225 from .. import parser
2326 from ..parser import RObject
2427
28 ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any]
29 StrMap = Mapping[Union[str, bytes], Any]
30
2531
2632 class RLanguage(NamedTuple):
27 """
28 R language construct.
29 """
33 """R language construct."""
34
3035 elements: List[Any]
36 attributes: Mapping[str, Any]
3137
3238
3339 class RExpression(NamedTuple):
34 """
35 R expression.
36 """
40 """R expression."""
41
3742 elements: List[RLanguage]
43
44
45 @dataclass
46 class RBuiltin:
47 """R builtin."""
48
49 name: str
50
51
52 @dataclass
53 class RFunction:
54 """R function."""
55
56 environment: Mapping[str, Any]
57 formals: Optional[Mapping[str, Any]]
58 body: RLanguage
59 attributes: StrMap
60
61 @property
62 def source(self) -> str:
63 return "\n".join(self.attributes["srcref"].srcfile.lines)
64
65
66 @dataclass
67 class RExternalPointer:
68 """R external pointer."""
69
70 protected: Any
71 tag: Any
72
73
74 @dataclass
75 class RBytecode:
76 """R bytecode."""
77
78 code: xarray.DataArray
79 constants: Sequence[Any]
80 attributes: StrMap
81
82
83 class REnvironment(ChainMap[Union[str, bytes], Any]):
84 """R environment."""
85
86 def __init__(
87 self,
88 *maps: MutableMapping[str | bytes, Any],
89 frame: StrMap | None = None,
90 ) -> None:
91 super().__init__(*maps)
92 self.frame = frame
3893
3994
4095 def convert_list(
4196 r_list: parser.RObject,
42 conversion_function: Callable[
43 [Union[parser.RData, parser.RObject]
44 ], Any]=lambda x: x
45 ) -> Union[Mapping[Union[str, bytes], Any], List[Any]]:
97 conversion_function: ConversionFunction,
98 ) -> Union[StrMap, List[Any]]:
4699 """
47100 Expand a tagged R pairlist to a Python dictionary.
48101
67120 """
68121 if r_list.info.type is parser.RObjectType.NILVALUE:
69122 return {}
70 elif r_list.info.type not in [parser.RObjectType.LIST,
71 parser.RObjectType.LANG]:
123 elif r_list.info.type not in {
124 parser.RObjectType.LIST,
125 parser.RObjectType.LANG,
126 }:
72127 raise TypeError("Must receive a LIST, LANG or NILVALUE object")
73128
74129 if r_list.tag is None:
83138 cdr = {}
84139
85140 return {tag: conversion_function(r_list.value[0]), **cdr}
86 else:
87 if cdr is None:
88 cdr = []
89
90 return [conversion_function(r_list.value[0]), *cdr]
141
142 if cdr is None:
143 cdr = []
144
145 return [conversion_function(r_list.value[0]), *cdr]
91146
92147
93148 def convert_env(
94149 r_env: parser.RObject,
95 conversion_function: Callable[
96 [Union[parser.RData, parser.RObject]
97 ], Any]=lambda x: x
98 ) -> ChainMap[Union[str, bytes], Any]:
99
150 conversion_function: ConversionFunction,
151 ) -> REnvironment:
152 """Convert environment objects."""
100153 if r_env.info.type is not parser.RObjectType.ENV:
101154 raise TypeError("Must receive a ENV object")
102155
105158 hash_table = conversion_function(r_env.value.hash_table)
106159
107160 dictionary = {}
108 for d in hash_table:
109 if d is not None:
110 dictionary.update(d)
111
112 return ChainMap(dictionary, enclosure)
161 if hash_table is not None:
162 for d in hash_table:
163 if d is not None:
164 dictionary.update(d)
165
166 return REnvironment(dictionary, enclosure, frame=frame)
113167
114168
115169 def convert_attrs(
116170 r_obj: parser.RObject,
117 conversion_function: Callable[
118 [Union[parser.RData, parser.RObject]
119 ], Any]=lambda x: x
120 ) -> Mapping[Union[str, bytes], Any]:
171 conversion_function: ConversionFunction,
172 ) -> StrMap:
121173 """
122174 Return the attributes of an object as a Python dictionary.
123175
142194 """
143195 if r_obj.attributes:
144196 attrs = cast(
145 Mapping[Union[str, bytes], Any],
197 StrMap,
146198 conversion_function(r_obj.attributes),
147199 )
148200 else:
152204
153205 def convert_vector(
154206 r_vec: parser.RObject,
155 conversion_function: Callable[
156 [Union[parser.RData, parser.RObject]], Any]=lambda x: x,
157 attrs: Optional[Mapping[Union[str, bytes], Any]] = None,
158 ) -> Union[List[Any], Mapping[Union[str, bytes], Any]]:
207 conversion_function: ConversionFunction,
208 attrs: Optional[StrMap] = None,
209 ) -> Union[List[Any], StrMap]:
159210 """
160211 Convert a R vector to a Python list or dictionary.
161212
185236 if attrs is None:
186237 attrs = {}
187238
188 if r_vec.info.type not in [parser.RObjectType.VEC,
189 parser.RObjectType.EXPR]:
239 if r_vec.info.type not in {
240 parser.RObjectType.VEC,
241 parser.RObjectType.EXPR,
242 }:
190243 raise TypeError("Must receive a VEC or EXPR object")
191244
192 value: Union[List[Any], Mapping[Union[str, bytes], Any]] = [
245 value: Union[List[Any], StrMap] = [
193246 conversion_function(o) for o in r_vec.value
194247 ]
195248
202255
203256
204257 def safe_decode(byte_str: bytes, encoding: str) -> Union[str, bytes]:
205 """
206 Decode a (possibly malformed) string.
207 """
258 """Decode a (possibly malformed) string."""
208259 try:
209260 return byte_str.decode(encoding)
210261 except UnicodeDecodeError as e:
249300
250301 assert isinstance(r_char.value, bytes)
251302
303 encoding = None
304
252305 if not force_default_encoding:
253306 if r_char.info.gp & parser.CharFlags.UTF8:
254 return safe_decode(r_char.value, "utf_8")
307 encoding = "utf_8"
255308 elif r_char.info.gp & parser.CharFlags.LATIN1:
256 return safe_decode(r_char.value, "latin_1")
309 encoding = "latin_1"
257310 elif r_char.info.gp & parser.CharFlags.ASCII:
258 return safe_decode(r_char.value, "ascii")
311 encoding = "ascii"
259312 elif r_char.info.gp & parser.CharFlags.BYTES:
260 return r_char.value
261
262 if default_encoding:
263 return safe_decode(r_char.value, default_encoding)
264 else:
265 # Assume ASCII if no encoding is marked
266 warnings.warn(f"Unknown encoding. Assumed ASCII.")
267 return safe_decode(r_char.value, "ascii")
268
269
270 def convert_symbol(r_symbol: parser.RObject,
271 conversion_function: Callable[
272 [Union[parser.RData, parser.RObject]],
273 Any]=lambda x: x
274 ) -> Union[str, bytes]:
313 encoding = "bytes"
314
315 if encoding is None:
316 if default_encoding:
317 encoding = default_encoding
318 else:
319 # Assume ASCII if no encoding is marked
320 warnings.warn("Unknown encoding. Assumed ASCII.")
321 encoding = "ascii"
322
323 return (
324 r_char.value
325 if encoding == "bytes"
326 else safe_decode(r_char.value, encoding)
327 )
328
329
330 def convert_symbol(
331 r_symbol: parser.RObject,
332 conversion_function: ConversionFunction,
333 ) -> Union[str, bytes]:
275334 """
276335 Decode a R symbol to a Python string or bytes.
277336
297356 symbol = conversion_function(r_symbol.value)
298357 assert isinstance(symbol, (str, bytes))
299358 return symbol
300 else:
301 raise TypeError("Must receive a SYM object")
359
360 raise TypeError("Must receive a SYM object")
302361
303362
304363 def convert_array(
305364 r_array: RObject,
306 conversion_function: Callable[
307 [Union[parser.RData, parser.RObject]
308 ], Any]=lambda x: x,
309 attrs: Optional[Mapping[Union[str, bytes], Any]] = None,
365 conversion_function: ConversionFunction,
366 attrs: Optional[StrMap] = None,
310367 ) -> Union[np.ndarray, xarray.DataArray]:
311368 """
312369 Convert a R array to a Numpy ndarray or a Xarray DataArray.
335392 if attrs is None:
336393 attrs = {}
337394
338 if r_array.info.type not in {parser.RObjectType.LGL,
339 parser.RObjectType.INT,
340 parser.RObjectType.REAL,
341 parser.RObjectType.CPLX}:
395 if r_array.info.type not in {
396 parser.RObjectType.LGL,
397 parser.RObjectType.INT,
398 parser.RObjectType.REAL,
399 parser.RObjectType.CPLX,
400 }:
342401 raise TypeError("Must receive an array object")
343402
344403 value = r_array.value
348407 # R matrix order is like FORTRAN
349408 value = np.reshape(value, shape, order='F')
350409
410 dimension_names = None
411 coords = None
412
351413 dimnames = attrs.get('dimnames')
352414 if dimnames:
353 dimension_names = ["dim_" + str(i) for i, _ in enumerate(dimnames)]
354 coords: Mapping[Hashable, Any] = {
355 dimension_names[i]: d
356 for i, d in enumerate(dimnames) if d is not None}
357
358 value = xarray.DataArray(value, dims=dimension_names, coords=coords)
415 if isinstance(dimnames, Mapping):
416 dimension_names = list(dimnames.keys())
417 coords = dimnames
418 else:
419 dimension_names = [f"dim_{i}" for i, _ in enumerate(dimnames)]
420 coords = {
421 dimension_names[i]: d
422 for i, d in enumerate(dimnames)
423 if d is not None
424 }
425
426 value = xarray.DataArray(
427 value,
428 dims=dimension_names,
429 coords=coords,
430 )
359431
360432 return value
361433
362434
363435 def dataframe_constructor(
364436 obj: Any,
365 attrs: Mapping[Union[str, bytes], Any],
437 attrs: StrMap,
366438 ) -> pandas.DataFrame:
367 return pandas.DataFrame(obj, columns=obj)
439
440 row_names = attrs["row.names"]
441
442 # Default row names are stored as [INT_MIN, -len]
443 INT_MIN = -2**31 # noqa: WPS432
444 index = (
445 pandas.RangeIndex(1, abs(row_names[1]) + 1)
446 if len(row_names) == 2 and row_names[0] == INT_MIN
447 else tuple(row_names)
448 )
449
450 return pandas.DataFrame(obj, columns=obj, index=index)
368451
369452
370453 def _factor_constructor_internal(
371454 obj: Any,
372 attrs: Mapping[Union[str, bytes], Any],
455 attrs: StrMap,
373456 ordered: bool,
374457 ) -> pandas.Categorical:
375458 values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj]
379462
380463 def factor_constructor(
381464 obj: Any,
382 attrs: Mapping[Union[str, bytes], Any],
465 attrs: StrMap,
383466 ) -> pandas.Categorical:
467 """Construct a factor object."""
384468 return _factor_constructor_internal(obj, attrs, ordered=False)
385469
386470
387471 def ordered_constructor(
388472 obj: Any,
389 attrs: Mapping[Union[str, bytes], Any],
473 attrs: StrMap,
390474 ) -> pandas.Categorical:
475 """Construct an ordered factor."""
391476 return _factor_constructor_internal(obj, attrs, ordered=True)
392477
393478
394479 def ts_constructor(
395480 obj: Any,
396 attrs: Mapping[Union[str, bytes], Any],
481 attrs: StrMap,
397482 ) -> pandas.Series:
398
483 """Construct a time series object."""
399484 start, end, frequency = attrs['tsp']
400485
401486 frequency = int(frequency)
403488 real_start = Fraction(int(round(start * frequency)), frequency)
404489 real_end = Fraction(int(round(end * frequency)), frequency)
405490
406 index = np.arange(real_start, real_end + Fraction(1, frequency),
407 Fraction(1, frequency))
491 index = np.arange(
492 real_start,
493 real_end + Fraction(1, frequency),
494 Fraction(1, frequency),
495 )
408496
409497 if frequency == 1:
410498 index = index.astype(int)
412500 return pandas.Series(obj, index=index)
413501
414502
503 @dataclass
504 class SrcRef:
505 first_line: int
506 first_byte: int
507 last_line: int
508 last_byte: int
509 first_column: int
510 last_column: int
511 first_parsed: int
512 last_parsed: int
513 srcfile: SrcFile
514
515
516 def srcref_constructor(
517 obj: Any,
518 attrs: StrMap,
519 ) -> SrcRef:
520 return SrcRef(*obj, srcfile=attrs["srcfile"])
521
522
523 @dataclass
524 class SrcFile:
525 filename: str
526 file_encoding: str | None
527 string_encoding: str | None
528
529
530 def srcfile_constructor(
531 obj: Any,
532 attrs: StrMap,
533 ) -> SrcFile:
534
535 filename = obj.frame["filename"][0]
536 file_encoding = obj.frame.get("encoding")
537 string_encoding = obj.frame.get("Enc")
538
539 return SrcFile(
540 filename=filename,
541 file_encoding=file_encoding,
542 string_encoding=string_encoding,
543 )
544
545
546 @dataclass
547 class SrcFileCopy(SrcFile):
548 lines: Sequence[str]
549
550
551 def srcfilecopy_constructor(
552 obj: Any,
553 attrs: StrMap,
554 ) -> SrcFile:
555
556 filename = obj.frame["filename"][0]
557 file_encoding = obj.frame.get("encoding", (None,))[0]
558 string_encoding = obj.frame.get("Enc", (None,))[0]
559 lines = obj.frame["lines"]
560
561 return SrcFileCopy(
562 filename=filename,
563 file_encoding=file_encoding,
564 string_encoding=string_encoding,
565 lines=lines,
566 )
567
568
415569 Constructor = Callable[[Any, Mapping], Any]
570 ConstructorDict = Mapping[
571 Union[str, bytes],
572 Constructor,
573 ]
416574
417575 default_class_map_dict: Mapping[Union[str, bytes], Constructor] = {
418576 "data.frame": dataframe_constructor,
419577 "factor": factor_constructor,
420578 "ordered": ordered_constructor,
421579 "ts": ts_constructor,
580 "srcref": srcref_constructor,
581 "srcfile": srcfile_constructor,
582 "srcfilecopy": srcfilecopy_constructor,
422583 }
423584
424585 DEFAULT_CLASS_MAP = MappingProxyType(default_class_map_dict)
439600
440601
441602 class Converter(abc.ABC):
442 """
443 Interface of a class converting R objects in Python objects.
444 """
603 """Interface of a class converting R objects into Python objects."""
445604
446605 @abc.abstractmethod
447606 def convert(self, data: Union[parser.RData, parser.RObject]) -> Any:
448 """
449 Convert a R object to a Python one.
450 """
607 """Convert a R object to a Python one."""
451608 pass
452609
453610
479636
480637 def __init__(
481638 self,
482 constructor_dict: Mapping[
483 Union[str, bytes],
484 Constructor,
485 ] = DEFAULT_CLASS_MAP,
639 constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
486640 default_encoding: Optional[str] = None,
487641 force_default_encoding: bool = False,
488 global_environment: Optional[Mapping[Union[str, bytes], Any]] = None,
642 global_environment: MutableMapping[str | bytes, Any] | None = None,
489643 ) -> None:
490644
491645 self.constructor_dict = constructor_dict
492646 self.default_encoding = default_encoding
493647 self.force_default_encoding = force_default_encoding
494 self.global_environment = ChainMap(
648 self.global_environment = REnvironment(
495649 {} if global_environment is None
496 else global_environment
650 else global_environment,
497651 )
498 self.empty_environment: Mapping[Union[str, bytes], Any] = ChainMap({})
652 self.empty_environment: StrMap = REnvironment({})
499653
500654 self._reset()
501655
503657 self.references: MutableMapping[int, Any] = {}
504658 self.default_encoding_used = self.default_encoding
505659
506 def convert(self, data: Union[parser.RData, parser.RObject]) -> Any:
660 def convert( # noqa: D102
661 self,
662 data: Union[parser.RData, parser.RObject],
663 ) -> Any:
507664 self._reset()
508665 return self._convert_next(data)
509666
510667 def _convert_next(self, data: Union[parser.RData, parser.RObject]) -> Any:
511 """
512 Convert a R object to a Python one.
513 """
514
668 """Convert a R object to a Python one."""
515669 obj: RObject
516670 if isinstance(data, parser.RData):
517671 obj = data.object
539693 # Expand the list and process the elements
540694 value = convert_list(obj, self._convert_next)
541695
696 elif obj.info.type == parser.RObjectType.CLO:
697 assert obj.tag is not None
698 environment = self._convert_next(obj.tag)
699 formals = self._convert_next(obj.value[0])
700 body = self._convert_next(obj.value[1])
701 attributes = self._convert_next(obj.attributes)
702
703 value = RFunction(
704 environment=environment,
705 formals=formals,
706 body=body,
707 attributes=attributes,
708 )
709
542710 elif obj.info.type == parser.RObjectType.ENV:
543711
544712 # Return a ChainMap of the environments
550718 # special object
551719 rlanguage_list = convert_list(obj, self._convert_next)
552720 assert isinstance(rlanguage_list, list)
553
554 value = RLanguage(rlanguage_list)
721 attributes = self._convert_next(
722 obj.attributes,
723 ) if obj.attributes else {}
724
725 value = RLanguage(rlanguage_list, attributes)
726
727 elif obj.info.type in {parser.RObjectType.SPECIAL, parser.RObjectType.BUILTIN}:
728
729 value = RBuiltin(name=obj.value.decode("ascii"))
555730
556731 elif obj.info.type == parser.RObjectType.CHAR:
557732
562737 force_default_encoding=self.force_default_encoding,
563738 )
564739
565 elif obj.info.type in {parser.RObjectType.LGL,
566 parser.RObjectType.INT,
567 parser.RObjectType.REAL,
568 parser.RObjectType.CPLX}:
740 elif obj.info.type in {
741 parser.RObjectType.LGL,
742 parser.RObjectType.INT,
743 parser.RObjectType.REAL,
744 parser.RObjectType.CPLX,
745 }:
569746
570747 # Return the internal array
571748 value = convert_array(obj, self._convert_next, attrs=attrs)
582759
583760 elif obj.info.type == parser.RObjectType.EXPR:
584761 rexpression_list = convert_vector(
585 obj, self._convert_next, attrs=attrs)
762 obj,
763 self._convert_next,
764 attrs=attrs,
765 )
586766 assert isinstance(rexpression_list, list)
587767
588768 # Convert the internal objects returning a special object
589769 value = RExpression(rexpression_list)
590770
771 elif obj.info.type == parser.RObjectType.BCODE:
772
773 value = RBytecode(
774 code=self._convert_next(obj.value[0]),
775 constants=[self._convert_next(c) for c in obj.value[1]],
776 attributes=attrs,
777 )
778
779 elif obj.info.type == parser.RObjectType.EXTPTR:
780
781 value = RExternalPointer(
782 protected=self._convert_next(obj.value[0]),
783 tag=self._convert_next(obj.value[1]),
784 )
785
591786 elif obj.info.type == parser.RObjectType.S4:
592787 value = SimpleNamespace(**attrs)
593788
594789 elif obj.info.type == parser.RObjectType.EMPTYENV:
595790 value = self.empty_environment
596791
792 elif obj.info.type == parser.RObjectType.MISSINGARG:
793 value = NotImplemented
794
597795 elif obj.info.type == parser.RObjectType.GLOBALENV:
598796 value = self.global_environment
599797
601799
602800 # Return the referenced value
603801 value = self.references.get(id(obj.referenced_object))
604 # value = self.references[id(obj.referenced_object)]
605802 if value is None:
606803 reference_id = id(obj.referenced_object)
607804 assert obj.referenced_object is not None
614811 else:
615812 raise NotImplementedError(f"Type {obj.info.type} not implemented")
616813
617 if obj.info.object:
618 classname = attrs["class"]
814 if obj.info.object and attrs is not None:
815 classname = attrs.get("class", ())
619816 for i, c in enumerate(classname):
620817
621818 constructor = self.constructor_dict.get(c, None)
626823 new_value = NotImplemented
627824
628825 if new_value is NotImplemented:
629 missing_msg = (f"Missing constructor for R class "
630 f"\"{c}\". ")
826 missing_msg = (
827 f"Missing constructor for R class \"{c}\". "
828 )
631829
632830 if len(classname) > (i + 1):
633 solution_msg = (f"The constructor for class "
634 f"\"{classname[i+1]}\" will be "
635 f"used instead."
636 )
831 solution_msg = (
832 f"The constructor for class "
833 f"\"{classname[i+1]}\" will be "
834 f"used instead."
835 )
637836 else:
638 solution_msg = ("The underlying R object is "
639 "returned instead.")
640
641 warnings.warn(missing_msg + solution_msg,
642 stacklevel=1)
837 solution_msg = (
838 "The underlying R object is "
839 "returned instead."
840 )
841
842 warnings.warn(
843 missing_msg + solution_msg,
844 stacklevel=1,
845 )
643846 else:
644847 value = new_value
645848 break
655858 **kwargs: Any,
656859 ) -> Any:
657860 """
658 Uses the default converter (:func:`SimpleConverter`) to convert the data.
861 Use the default converter (:func:`SimpleConverter`) to convert the data.
659862
660863 Examples:
661
662864 Parse one of the included examples, containing a vector
663865
664866 >>> import rdata
678880 >>> converted = rdata.conversion.convert(parsed)
679881 >>> converted
680882 {'test_dataframe': class value
681 0 a 1
682 1 b 2
683 2 b 3}
883 1 a 1
884 2 b 2
885 3 b 3}
684886
685887 """
686888 return SimpleConverter(*args, **kwargs).convert(data)
0 """Utilities for parsing a rdata file."""
1
02 from ._parser import (
1 DEFAULT_ALTREP_MAP,
2 CharFlags,
3 RData,
4 RObject,
5 RObjectInfo,
6 RObjectType,
7 parse_data,
8 parse_file,
3 DEFAULT_ALTREP_MAP as DEFAULT_ALTREP_MAP,
4 CharFlags as CharFlags,
5 RData as RData,
6 RObject as RObject,
7 RObjectInfo as RObjectInfo,
8 RObjectType as RObjectType,
9 parse_data as parse_data,
10 parse_file as parse_file,
911 )
1111 from dataclasses import dataclass
1212 from types import MappingProxyType
1313 from typing import (
14 TYPE_CHECKING,
1415 Any,
1516 BinaryIO,
1617 Callable,
1718 List,
1819 Mapping,
1920 Optional,
21 Sequence,
2022 Set,
2123 TextIO,
2224 Tuple,
2729
2830
2931 class FileTypes(enum.Enum):
30 """
31 Type of file containing a R file.
32 """
32 """Type of file containing a R file."""
33
3334 bzip2 = "bz2"
3435 gzip = "gzip"
3536 xz = "xz"
4243 FileTypes.gzip: b"\x1f\x8b",
4344 FileTypes.xz: b"\xFD7zXZ\x00",
4445 FileTypes.rdata_binary_v2: b"RDX2\n",
45 FileTypes.rdata_binary_v3: b"RDX3\n"
46 FileTypes.rdata_binary_v3: b"RDX3\n",
4647 }
4748
4849
4950 def file_type(data: memoryview) -> Optional[FileTypes]:
50 """
51 Returns the type of the file.
52 """
53
51 """Return the type of the file."""
5452 for filetype, magic in magic_dict.items():
5553 if data[:len(magic)] == magic:
5654 return filetype
5856
5957
6058 class RdataFormats(enum.Enum):
61 """
62 Format of a R file.
63 """
59 """Format of a R file."""
60
6461 XDR = "XDR"
6562 ASCII = "ASCII"
6663 binary = "binary"
7471
7572
7673 def rdata_format(data: memoryview) -> Optional[RdataFormats]:
77 """
78 Returns the format of the data.
79 """
80
74 """Return the format of the data."""
8175 for format_type, magic in format_dict.items():
8276 if data[:len(magic)] == magic:
8377 return format_type
8579
8680
8781 class RObjectType(enum.Enum):
88 """
89 Type of a R object.
90 """
82 """Type of a R object."""
83
9184 NIL = 0 # NULL
9285 SYM = 1 # symbols
9386 LIST = 2 # pairlists
113106 RAW = 24 # raw vector
114107 S4 = 25 # S4 classes not of simple type
115108 ALTREP = 238 # Alternative representations
109 ATTRLIST = 239 # Bytecode attribute
110 ATTRLANG = 240 # Bytecode attribute
116111 EMPTYENV = 242 # Empty environment
112 BCREPREF = 243 # Bytecode repetition reference
113 BCREPDEF = 244 # Bytecode repetition definition
114 MISSINGARG = 251 # Missing argument
117115 GLOBALENV = 253 # Global environment
118116 NILVALUE = 254 # NIL value
119117 REF = 255 # Reference
120118
121119
120 BYTECODE_SPECIAL_SET = {
121 RObjectType.BCODE,
122 RObjectType.BCREPREF,
123 RObjectType.BCREPDEF,
124 RObjectType.LANG,
125 RObjectType.LIST,
126 RObjectType.ATTRLANG,
127 RObjectType.ATTRLIST,
128 }
129
130
122131 class CharFlags(enum.IntFlag):
132 """Flags for R objects of type char."""
133
123134 HAS_HASH = 1
124135 BYTES = 1 << 1
125136 LATIN1 = 1 << 2
130141
131142 @dataclass
132143 class RVersions():
133 """
134 R versions.
135 """
136 format: int
144 """R versions."""
145
146 format: int # noqa: E701
137147 serialized: int
138148 minimum: int
139149
144154 Extra information.
145155
146156 Contains the default encoding (only in version 3).
157
147158 """
159
148160 encoding: Optional[str] = None
149161
150162
151163 @dataclass
152164 class RObjectInfo():
153 """
154 Internal attributes of a R object.
155 """
165 """Internal attributes of a R object."""
166
156167 type: RObjectType
157168 object: bool
158169 attributes: bool
161172 reference: int
162173
163174
175 def _str_internal(
176 obj: RObject | Sequence[RObject],
177 indent: int = 0,
178 used_references: Optional[Set[int]] = None,
179 ) -> str:
180
181 if used_references is None:
182 used_references = set()
183
184 small_indent = indent + 2
185 big_indent = indent + 4
186
187 indent_spaces = ' ' * indent
188 small_indent_spaces = ' ' * small_indent
189 big_indent_spaces = ' ' * big_indent
190
191 string = ""
192
193 if isinstance(obj, Sequence):
194 string += f"{indent_spaces}[\n"
195 for elem in obj:
196 string += _str_internal(
197 elem,
198 big_indent,
199 used_references.copy(),
200 )
201 string += f"{indent_spaces}]\n"
202
203 return string
204
205 string += f"{indent_spaces}{obj.info.type}\n"
206
207 if obj.tag:
208 tag_string = _str_internal(
209 obj.tag,
210 big_indent,
211 used_references.copy(),
212 )
213 string += f"{small_indent_spaces}tag:\n{tag_string}\n"
214
215 if obj.info.reference:
216 assert obj.referenced_object
217 reference_string = (
218 f"{big_indent_spaces}..."
219 if obj.info.reference in used_references
220 else _str_internal(
221 obj.referenced_object,
222 indent + 4, used_references.copy())
223 )
224 string += (
225 f"{small_indent_spaces}reference: "
226 f"{obj.info.reference}\n{reference_string}\n"
227 )
228
229 string += f"{small_indent_spaces}value:\n"
230
231 if isinstance(obj.value, RObject):
232 string += _str_internal(
233 obj.value,
234 big_indent,
235 used_references.copy(),
236 )
237 elif isinstance(obj.value, (tuple, list)):
238 for elem in obj.value:
239 string += _str_internal(
240 elem,
241 big_indent,
242 used_references.copy(),
243 )
244 elif isinstance(obj.value, np.ndarray):
245 string += big_indent_spaces
246 if len(obj.value) > 4:
247 string += (
248 f"[{obj.value[0]}, {obj.value[1]} ... "
249 f"{obj.value[-2]}, {obj.value[-1]}]\n"
250 )
251 else:
252 string += f"{obj.value}\n"
253 else:
254 string += f"{big_indent_spaces}{obj.value}\n"
255
256 if obj.attributes:
257 attr_string = _str_internal(
258 obj.attributes,
259 big_indent,
260 used_references.copy(),
261 )
262 string += f"{small_indent_spaces}attributes:\n{attr_string}\n"
263
264 return string
265
266
164267 @dataclass
165268 class RObject():
166 """
167 Representation of a R object.
168 """
269 """Representation of a R object."""
270
169271 info: RObjectInfo
170272 value: Any
171273 attributes: Optional[RObject]
172274 tag: Optional[RObject] = None
173275 referenced_object: Optional[RObject] = None
174276
175 def _str_internal(
176 self,
177 indent: int = 0,
178 used_references: Optional[Set[int]] = None
179 ) -> str:
180
181 if used_references is None:
182 used_references = set()
183
184 string = ""
185
186 string += f"{' ' * indent}{self.info.type}\n"
187
188 if self.tag:
189 tag_string = self.tag._str_internal(indent + 4,
190 used_references.copy())
191 string += f"{' ' * (indent + 2)}tag:\n{tag_string}\n"
192
193 if self.info.reference:
194 assert self.referenced_object
195 reference_string = (f"{' ' * (indent + 4)}..."
196 if self.info.reference in used_references
197 else self.referenced_object._str_internal(
198 indent + 4, used_references.copy()))
199 string += (f"{' ' * (indent + 2)}reference: "
200 f"{self.info.reference}\n{reference_string}\n")
201
202 string += f"{' ' * (indent + 2)}value:\n"
203
204 if isinstance(self.value, RObject):
205 string += self.value._str_internal(indent + 4,
206 used_references.copy())
207 elif isinstance(self.value, tuple) or isinstance(self.value, list):
208 for elem in self.value:
209 string += elem._str_internal(indent + 4,
210 used_references.copy())
211 elif isinstance(self.value, np.ndarray):
212 string += " " * (indent + 4)
213 if len(self.value) > 4:
214 string += (f"[{self.value[0]}, {self.value[1]} ... "
215 f"{self.value[-2]}, {self.value[-1]}]\n")
216 else:
217 string += f"{self.value}\n"
218 else:
219 string += f"{' ' * (indent + 4)}{self.value}\n"
220
221 if(self.attributes):
222 attr_string = self.attributes._str_internal(
223 indent + 4,
224 used_references.copy())
225 string += f"{' ' * (indent + 2)}attributes:\n{attr_string}\n"
226
227 return string
228
229277 def __str__(self) -> str:
230 return self._str_internal()
278 return _str_internal(self)
231279
232280
233281 @dataclass
234282 class RData():
235 """
236 Data contained in a R file.
237 """
283 """Data contained in a R file."""
284
238285 versions: RVersions
239286 extra: RExtraInfo
240287 object: RObject
241288
289 def __str__(self) -> str:
290 return (
291 "RData(\n"
292 f" versions: {self.versions}\n"
293 f" extra: {self.extra}\n"
294 f" object: \n{_str_internal(self.object, indent=4)}\n"
295 ")\n"
296 )
297
242298
243299 @dataclass
244300 class EnvironmentValue():
245 """
246 Value of an environment.
247 """
301 """Value of an environment."""
302
248303 locked: bool
249304 enclosure: RObject
250305 frame: RObject
259314
260315
261316 def format_float_with_scipen(number: float, scipen: int) -> bytes:
317 """Format a floating point value as in R."""
262318 fixed = np.format_float_positional(number, trim="-")
263319 scientific = np.format_float_scientific(number, trim="-")
264320
265 assert(isinstance(fixed, str))
266 assert(isinstance(scientific, str))
321 assert isinstance(fixed, str)
322 assert isinstance(scientific, str)
267323
268324 return (
269325 scientific if len(fixed) - len(scientific) > scipen
274330 def deferred_string_constructor(
275331 state: RObject,
276332 ) -> Tuple[RObjectInfo, Any]:
277
333 """Expand a deferred string ALTREP."""
278334 new_info = RObjectInfo(
279335 type=RObjectType.STR,
280336 object=False,
311367 def compact_seq_constructor(
312368 state: RObject,
313369 *,
314 is_int: bool = False
370 is_int: bool = False,
315371 ) -> Tuple[RObjectInfo, Any]:
316
372 """Expand a compact_seq ALTREP."""
317373 new_info = RObjectInfo(
318374 type=RObjectType.INT if is_int else RObjectType.REAL,
319375 object=False,
340396 def compact_intseq_constructor(
341397 state: RObject,
342398 ) -> Tuple[RObjectInfo, Any]:
399 """Expand a compact_intseq ALTREP."""
343400 return compact_seq_constructor(state, is_int=True)
344401
345402
346403 def compact_realseq_constructor(
347404 state: RObject,
348405 ) -> Tuple[RObjectInfo, Any]:
406 """Expand a compact_realseq ALTREP."""
349407 return compact_seq_constructor(state, is_int=False)
350408
351409
352410 def wrap_constructor(
353411 state: RObject,
354412 ) -> Tuple[RObjectInfo, Any]:
355
413 """Expand any wrap_* ALTREP."""
356414 new_info = RObjectInfo(
357415 type=state.value[0].info.type,
358416 object=False,
383441
384442
385443 class Parser(abc.ABC):
386 """
387 Parser interface for a R file.
388 """
444 """Parser interface for a R file."""
389445
390446 def __init__(
391447 self,
397453 self.altrep_constructor_dict = altrep_constructor_dict
398454
399455 def parse_bool(self) -> bool:
400 """
401 Parse a boolean.
402 """
456 """Parse a boolean."""
403457 return bool(self.parse_int())
404458
405459 @abc.abstractmethod
406460 def parse_int(self) -> int:
407 """
408 Parse an integer.
409 """
461 """Parse an integer."""
410462 pass
411463
412464 @abc.abstractmethod
413465 def parse_double(self) -> float:
414 """
415 Parse a double.
416 """
466 """Parse a double."""
417467 pass
418468
419469 def parse_complex(self) -> complex:
420 """
421 Parse a complex number.
422 """
470 """Parse a complex number."""
423471 return complex(self.parse_double(), self.parse_double())
424472
425473 @abc.abstractmethod
426474 def parse_string(self, length: int) -> bytes:
427 """
428 Parse a string.
429 """
475 """Parse a string."""
430476 pass
431477
432478 def parse_all(self) -> RData:
433 """
434 Parse all the file.
435 """
436
479 """Parse all the file."""
437480 versions = self.parse_versions()
438481 extra_info = self.parse_extra_info(versions)
439482 obj = self.parse_R_object()
441484 return RData(versions, extra_info, obj)
442485
443486 def parse_versions(self) -> RVersions:
444 """
445 Parse the versions header.
446 """
447
487 """Parse the versions header."""
448488 format_version = self.parse_int()
449489 r_version = self.parse_int()
450490 minimum_r_version = self.parse_int()
451491
452 if format_version not in [2, 3]:
492 if format_version not in {2, 3}:
453493 raise NotImplementedError(
454494 f"Format version {format_version} unsupported",
455495 )
458498
459499 def parse_extra_info(self, versions: RVersions) -> RExtraInfo:
460500 """
461 Parse the versions header.
501 Parse the extra info.
502
503 Parses the encoding in version 3 format.
504
462505 """
463
464506 encoding = None
465507
466508 if versions.format >= 3:
467509 encoding_len = self.parse_int()
468510 encoding = self.parse_string(encoding_len).decode("ASCII")
469511
470 extra_info = RExtraInfo(encoding)
471
472 return extra_info
512 return RExtraInfo(encoding)
473513
474514 def expand_altrep_to_object(
475515 self,
477517 state: RObject,
478518 ) -> Tuple[RObjectInfo, Any]:
479519 """Expand alternative representation to normal object."""
480
481520 assert info.info.type == RObjectType.LIST
482521
483522 class_sym = info.value[0]
493532 constructor = self.altrep_constructor_dict[altrep_name]
494533 return constructor(state)
495534
535 def _parse_bytecode_constant(
536 self,
537 reference_list: Optional[List[RObject]],
538 bytecode_rep_list: List[RObject | None] | None = None,
539 ) -> RObject:
540
541 obj_type = self.parse_int()
542
543 return self.parse_R_object(
544 reference_list,
545 bytecode_rep_list,
546 info_int=obj_type,
547 )
548
549 def _parse_bytecode(
550 self,
551 reference_list: Optional[List[RObject]],
552 bytecode_rep_list: List[RObject | None] | None = None,
553 ) -> Tuple[RObject, Sequence[RObject]]:
554 """Parse R bytecode."""
555 if bytecode_rep_list is None:
556 n_repeated = self.parse_int()
557
558 code = self.parse_R_object(reference_list, bytecode_rep_list)
559
560 if bytecode_rep_list is None:
561 bytecode_rep_list = [None] * n_repeated
562
563 n_constants = self.parse_int()
564 constants = [
565 self._parse_bytecode_constant(
566 reference_list,
567 bytecode_rep_list,
568 )
569 for _ in range(n_constants)
570 ]
571
572 return (code, constants)
573
496574 def parse_R_object(
497575 self,
498 reference_list: Optional[List[RObject]] = None
576 reference_list: List[RObject] | None = None,
577 bytecode_rep_list: List[RObject | None] | None = None,
578 info_int: int | None = None,
499579 ) -> RObject:
500 """
501 Parse a R object.
502 """
503
580 """Parse a R object."""
504581 if reference_list is None:
505582 # Index is 1-based, so we insert a dummy object
506583 reference_list = []
507584
508 info_int = self.parse_int()
509
510 info = parse_r_object_info(info_int)
585 original_info_int = info_int
586 if (
587 info_int is not None
588 and RObjectType(info_int) in BYTECODE_SPECIAL_SET
589 ):
590 info = parse_r_object_info(info_int)
591 info.tag = info.type not in {
592 RObjectType.BCREPREF,
593 RObjectType.BCODE,
594 }
595 else:
596 info_int = self.parse_int()
597 info = parse_r_object_info(info_int)
511598
512599 tag = None
513600 attributes = None
514601 referenced_object = None
515602
603 bytecode_rep_position = -1
516604 tag_read = False
517605 attributes_read = False
518606 add_reference = False
521609
522610 value: Any
523611
612 if info.type == RObjectType.BCREPDEF:
613 assert bytecode_rep_list
614 bytecode_rep_position = self.parse_int()
615 info.type = RObjectType(self.parse_int())
616
524617 if info.type == RObjectType.NIL:
525618 value = None
526619
527620 elif info.type == RObjectType.SYM:
528621 # Read Char
529 value = self.parse_R_object(reference_list)
622 value = self.parse_R_object(reference_list, bytecode_rep_list)
530623 # Symbols can be referenced
531624 add_reference = True
532625
533 elif info.type in [RObjectType.LIST, RObjectType.LANG]:
626 elif info.type in {
627 RObjectType.LIST,
628 RObjectType.LANG,
629 RObjectType.CLO,
630 RObjectType.PROM,
631 RObjectType.DOT,
632 RObjectType.ATTRLANG,
633 }:
634 if info.type is RObjectType.ATTRLANG:
635 info.type = RObjectType.LANG
636 info.attributes = True
637
534638 tag = None
535639 if info.attributes:
536 attributes = self.parse_R_object(reference_list)
640 attributes = self.parse_R_object(
641 reference_list,
642 bytecode_rep_list,
643 )
537644 attributes_read = True
538 elif info.tag:
539 tag = self.parse_R_object(reference_list)
645
646 if info.tag:
647 tag = self.parse_R_object(reference_list, bytecode_rep_list)
540648 tag_read = True
541649
542650 # Read CAR and CDR
543 car = self.parse_R_object(reference_list)
544 cdr = self.parse_R_object(reference_list)
651 car = self.parse_R_object(
652 reference_list,
653 bytecode_rep_list,
654 info_int=(
655 None if original_info_int is None
656 else self.parse_int()
657 ),
658 )
659 cdr = self.parse_R_object(
660 reference_list,
661 bytecode_rep_list,
662 info_int=(
663 None if original_info_int is None
664 else self.parse_int()
665 ),
666 )
545667 value = (car, cdr)
546668
547669 elif info.type == RObjectType.ENV:
670 info.object = True
671
548672 result = RObject(
549673 info=info,
550674 tag=tag,
556680 reference_list.append(result)
557681
558682 locked = self.parse_bool()
559 enclosure = self.parse_R_object(reference_list)
560 frame = self.parse_R_object(reference_list)
561 hash_table = self.parse_R_object(reference_list)
562 attributes = self.parse_R_object(reference_list)
683 enclosure = self.parse_R_object(reference_list, bytecode_rep_list)
684 frame = self.parse_R_object(reference_list, bytecode_rep_list)
685 hash_table = self.parse_R_object(reference_list, bytecode_rep_list)
686 attributes = self.parse_R_object(reference_list, bytecode_rep_list)
563687
564688 value = EnvironmentValue(
565689 locked=locked,
567691 frame=frame,
568692 hash_table=hash_table,
569693 )
694
695 elif info.type in {RObjectType.SPECIAL, RObjectType.BUILTIN}:
696 length = self.parse_int()
697 if length > 0:
698 value = self.parse_string(length=length)
570699
571700 elif info.type == RObjectType.CHAR:
572701 length = self.parse_int()
578707 value = None
579708 else:
580709 raise NotImplementedError(
581 f"Length of CHAR cannot be {length}")
710 f"Length of CHAR cannot be {length}",
711 )
582712
583713 elif info.type == RObjectType.LGL:
584714 length = self.parse_int()
612742 for i in range(length):
613743 value[i] = self.parse_complex()
614744
615 elif info.type in [RObjectType.STR,
616 RObjectType.VEC, RObjectType.EXPR]:
745 elif info.type in {
746 RObjectType.STR,
747 RObjectType.VEC,
748 RObjectType.EXPR,
749 }:
617750 length = self.parse_int()
618751
619752 value = [None] * length
620753
621754 for i in range(length):
622 value[i] = self.parse_R_object(reference_list)
755 value[i] = self.parse_R_object(
756 reference_list, bytecode_rep_list)
757
758 elif info.type == RObjectType.BCODE:
759 value = self._parse_bytecode(reference_list, bytecode_rep_list)
760 tag_read = True
761
762 elif info.type == RObjectType.EXTPTR:
763
764 result = RObject(
765 info=info,
766 tag=tag,
767 attributes=attributes,
768 value=None,
769 referenced_object=referenced_object,
770 )
771
772 reference_list.append(result)
773 protected = self.parse_R_object(
774 reference_list,
775 bytecode_rep_list,
776 )
777 extptr_tag = self.parse_R_object(
778 reference_list,
779 bytecode_rep_list,
780 )
781
782 value = (protected, extptr_tag)
623783
624784 elif info.type == RObjectType.S4:
625785 value = None
626786
627787 elif info.type == RObjectType.ALTREP:
628 altrep_info = self.parse_R_object(reference_list)
629 altrep_state = self.parse_R_object(reference_list)
630 altrep_attr = self.parse_R_object(reference_list)
788 altrep_info = self.parse_R_object(
789 reference_list,
790 bytecode_rep_list,
791 )
792 altrep_state = self.parse_R_object(
793 reference_list,
794 bytecode_rep_list,
795 )
796 altrep_attr = self.parse_R_object(
797 reference_list,
798 bytecode_rep_list,
799 )
631800
632801 if self.expand_altrep:
633802 info, value = self.expand_altrep_to_object(
641810 elif info.type == RObjectType.EMPTYENV:
642811 value = None
643812
813 elif info.type == RObjectType.BCREPREF:
814 assert bytecode_rep_list
815 position = self.parse_int()
816 result = bytecode_rep_list[position]
817 assert result
818 return result
819
820 elif info.type == RObjectType.MISSINGARG:
821 value = None
822
644823 elif info.type == RObjectType.GLOBALENV:
645824 value = None
646825
656835 raise NotImplementedError(f"Type {info.type} not implemented")
657836
658837 if info.tag and not tag_read:
659 warnings.warn(f"Tag not implemented for type {info.type} "
660 "and ignored")
838 warnings.warn(
839 f"Tag not implemented for type {info.type} "
840 "and ignored",
841 )
661842 if info.attributes and not attributes_read:
662 attributes = self.parse_R_object(reference_list)
843 attributes = self.parse_R_object(reference_list, bytecode_rep_list)
663844
664845 if result is None:
665846 result = RObject(
678859 if add_reference:
679860 reference_list.append(result)
680861
862 if bytecode_rep_position >= 0:
863 assert bytecode_rep_list
864 bytecode_rep_list[bytecode_rep_position] = result
865
681866 return result
682867
683868
684869 class ParserXDR(Parser):
685 """
686 Parser used when the integers and doubles are in XDR format.
687 """
870 """Parser used when the integers and doubles are in XDR format."""
688871
689872 def __init__(
690873 self,
702885 self.position = position
703886 self.xdr_parser = xdrlib.Unpacker(data)
704887
705 def parse_int(self) -> int:
888 def parse_int(self) -> int: # noqa: D102
706889 self.xdr_parser.set_position(self.position)
707890 result = self.xdr_parser.unpack_int()
708891 self.position = self.xdr_parser.get_position()
709892
710893 return result
711894
712 def parse_double(self) -> float:
895 def parse_double(self) -> float: # noqa: D102
713896 self.xdr_parser.set_position(self.position)
714897 result = self.xdr_parser.unpack_double()
715898 self.position = self.xdr_parser.get_position()
716899
717900 return result
718901
719 def parse_string(self, length: int) -> bytes:
902 def parse_string(self, length: int) -> bytes: # noqa: D102
720903 result = self.data[self.position:(self.position + length)]
721904 self.position += length
722905 return bytes(result)
906
907 def parse_all(self) -> RData:
908 rdata = super().parse_all()
909 assert self.position == len(self.data)
910 return rdata
723911
724912
725913 def parse_file(
727915 *,
728916 expand_altrep: bool = True,
729917 altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
918 extension: str | None = None,
730919 ) -> RData:
731920 """
732921 Parse a R file (.rda or .rdata).
733922
734923 Parameters:
735 file_or_path (file-like, str, bytes or path-like): File
736 in the R serialization format.
737 expand_altrep (bool): Wether to translate ALTREPs to normal objects.
924 file_or_path: File in the R serialization format.
925 expand_altrep: Whether to translate ALTREPs to normal objects.
738926 altrep_constructor_dict: Dictionary mapping each ALTREP to
739927 its constructor.
928 extension: Extension of the file.
740929
741930 Returns:
742 RData: Data contained in the file (versions and object).
931 Data contained in the file (versions and object).
743932
744933 See Also:
745934 :func:`parse_data`: Similar function that receives the data directly.
746935
747936 Examples:
748
749937 Parse one of the included examples, containing a vector
750938
751939 >>> import rdata
808996 """
809997 if isinstance(file_or_path, (os.PathLike, str)):
810998 path = pathlib.Path(file_or_path)
999 if extension is None:
1000 extension = path.suffix
8111001 data = path.read_bytes()
8121002 else:
8131003 # file is a pre-opened file
8221012 data,
8231013 expand_altrep=expand_altrep,
8241014 altrep_constructor_dict=altrep_constructor_dict,
1015 extension=extension,
8251016 )
8261017
8271018
8301021 *,
8311022 expand_altrep: bool = True,
8321023 altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
1024 extension: str | None = None,
8331025 ) -> RData:
8341026 """
8351027 Parse the data of a R file, received as a sequence of bytes.
8361028
8371029 Parameters:
838 data (bytes): Data extracted of a R file.
839 expand_altrep (bool): Wether to translate ALTREPs to normal objects.
1030 data: Data extracted of a R file.
1031 expand_altrep: Whether to translate ALTREPs to normal objects.
8401032 altrep_constructor_dict: Dictionary mapping each ALTREP to
8411033 its constructor.
1034 extension: Extension of the file.
8421035
8431036 Returns:
844 RData: Data contained in the file (versions and object).
1037 Data contained in the file (versions and object).
8451038
8461039 See Also:
8471040 :func:`parse_file`: Similar function that parses a file directly.
8481041
8491042 Examples:
850
8511043 Parse one of the included examples, containing a vector
8521044
8531045 >>> import rdata
9181110 if filetype in {
9191111 FileTypes.rdata_binary_v2,
9201112 FileTypes.rdata_binary_v3,
1113 None,
9211114 } else parse_data
9221115 )
9231116
9281121 elif filetype is FileTypes.xz:
9291122 new_data = lzma.decompress(data)
9301123 elif filetype in {FileTypes.rdata_binary_v2, FileTypes.rdata_binary_v3}:
1124 if extension == ".rds":
1125 warnings.warn(
1126 f"Wrong extension {extension} for file in RDATA format",
1127 )
1128
9311129 view = view[len(magic_dict[filetype]):]
9321130 new_data = view
9331131 else:
934 raise NotImplementedError("Unknown file type")
1132 new_data = view
1133 if extension != ".rds":
1134 warnings.warn("Unknown file type: assumed RDS")
9351135
9361136 return parse_function(
9371137 new_data, # type: ignore
9381138 expand_altrep=expand_altrep,
9391139 altrep_constructor_dict=altrep_constructor_dict,
1140 extension=extension,
9401141 )
9411142
9421143
9441145 data: memoryview,
9451146 expand_altrep: bool = True,
9461147 altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
1148 extension: str | None = None,
9471149 ) -> RData:
948 """
949 Select the appropiate parser and parse all the info.
950 """
1150 """Select the appropriate parser and parse all the info."""
9511151 format_type = rdata_format(data)
9521152
9531153 if format_type:
9601160 altrep_constructor_dict=altrep_constructor_dict,
9611161 )
9621162 return parser.parse_all()
963 else:
964 raise NotImplementedError("Unknown file format")
1163
1164 raise NotImplementedError("Unknown file format")
9651165
9661166
9671167 def bits(data: int, start: int, stop: int) -> int:
968 """
969 Read bits [start, stop) of an integer.
970 """
1168 """Read bits [start, stop) of an integer."""
9711169 count = stop - start
9721170 mask = ((1 << count) - 1) << start
9731171
9761174
9771175
9781176 def is_special_r_object_type(r_object_type: RObjectType) -> bool:
979 """
980 Check if a R type has a different serialization than the usual one.
981 """
982 return (r_object_type is RObjectType.NILVALUE
983 or r_object_type is RObjectType.REF)
1177 """Check if a R type has a different serialization than the usual one."""
1178 return (
1179 r_object_type is RObjectType.NILVALUE
1180 or r_object_type is RObjectType.REF
1181 )
9841182
9851183
9861184 def parse_r_object_info(info_int: int) -> RObjectInfo:
987 """
988 Parse the internal information of an object.
989 """
1185 """Parse the internal information of an object."""
9901186 type_exp = RObjectType(bits(info_int, 0, 8))
9911187
9921188 reference = 0
9991195 else:
10001196 object_flag = bool(bits(info_int, 8, 9))
10011197 attributes = bool(bits(info_int, 9, 10))
1002 tag = bool(bits(info_int, 10, 11))
1003 gp = bits(info_int, 12, 28)
1198 tag = bool(bits(info_int, 10, 11)) # noqa: WPS432
1199 gp = bits(info_int, 12, 28) # noqa: WPS432
10041200
10051201 if type_exp == RObjectType.REF:
1006 reference = bits(info_int, 8, 32)
1202 reference = bits(info_int, 8, 32) # noqa: WPS432
10071203
10081204 return RObjectInfo(
10091205 type=type_exp,
10111207 attributes=attributes,
10121208 tag=tag,
10131209 gp=gp,
1014 reference=reference
1210 reference=reference,
10151211 )
0 """Tests of parsing and conversion."""
1
02 import unittest
13 from collections import ChainMap
24 from fractions import Fraction
57
68 import numpy as np
79 import pandas as pd
10 import xarray
811
912 import rdata
1013
1215
1316
1417 class SimpleTests(unittest.TestCase):
18 """Collection of simple test cases."""
1519
1620 def test_opened_file(self) -> None:
17 parsed = rdata.parser.parse_file(open(TESTDATA_PATH /
18 "test_vector.rda"))
21 """Test that an opened file can be passed to parse_file."""
22 with open(TESTDATA_PATH / "test_vector.rda") as f:
23 parsed = rdata.parser.parse_file(f)
24 converted = rdata.conversion.convert(parsed)
25
26 self.assertIsInstance(converted, dict)
27
28 def test_opened_string(self) -> None:
29 """Test that a string can be passed to parse_file."""
30 parsed = rdata.parser.parse_file(
31 str(TESTDATA_PATH / "test_vector.rda"),
32 )
1933 converted = rdata.conversion.convert(parsed)
2034
2135 self.assertIsInstance(converted, dict)
2236
23 def test_opened_string(self) -> None:
24 parsed = rdata.parser.parse_file(str(TESTDATA_PATH /
25 "test_vector.rda"))
26 converted = rdata.conversion.convert(parsed)
27
28 self.assertIsInstance(converted, dict)
29
3037 def test_logical(self) -> None:
38 """Test parsing of logical vectors."""
3139 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_logical.rda")
3240 converted = rdata.conversion.convert(parsed)
3341
3442 np.testing.assert_equal(converted, {
35 "test_logical": np.array([True, True, False, True, False])
43 "test_logical": np.array([True, True, False, True, False]),
3644 })
3745
3846 def test_vector(self) -> None:
47 """Test parsing of numerical vectors."""
3948 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_vector.rda")
4049 converted = rdata.conversion.convert(parsed)
4150
4251 np.testing.assert_equal(converted, {
43 "test_vector": np.array([1., 2., 3.])
52 "test_vector": np.array([1.0, 2.0, 3.0]),
4453 })
4554
4655 def test_empty_string(self) -> None:
56 """Test that the empty string is parsed correctly."""
4757 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_empty_str.rda")
4858 converted = rdata.conversion.convert(parsed)
4959
5060 np.testing.assert_equal(converted, {
51 "test_empty_str": [""]
61 "test_empty_str": [""],
5262 })
5363
5464 def test_na_string(self) -> None:
55 parsed = rdata.parser.parse_file(
56 TESTDATA_PATH / "test_na_string.rda")
57 converted = rdata.conversion.convert(parsed)
58
59 np.testing.assert_equal(converted, {
60 "test_na_string": [None]
65 """Test that the NA string is parsed correctly."""
66 parsed = rdata.parser.parse_file(
67 TESTDATA_PATH / "test_na_string.rda",
68 )
69 converted = rdata.conversion.convert(parsed)
70
71 np.testing.assert_equal(converted, {
72 "test_na_string": [None],
6173 })
6274
6375 def test_complex(self) -> None:
76 """Test that complex numbers can be parsed."""
6477 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_complex.rda")
6578 converted = rdata.conversion.convert(parsed)
6679
6780 np.testing.assert_equal(converted, {
68 "test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j])
81 "test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j]),
6982 })
7083
7184 def test_matrix(self) -> None:
85 """Test that a matrix can be parsed."""
7286 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_matrix.rda")
7387 converted = rdata.conversion.convert(parsed)
7488
7589 np.testing.assert_equal(converted, {
76 "test_matrix": np.array([[1., 2., 3.],
77 [4., 5., 6.]])
78 })
90 "test_matrix": np.array([
91 [1.0, 2.0, 3.0],
92 [4.0, 5.0, 6.0],
93 ]),
94 })
95
96 def test_named_matrix(self) -> None:
97 """Test that a named matrix can be parsed."""
98 parsed = rdata.parser.parse_file(
99 TESTDATA_PATH / "test_named_matrix.rda",
100 )
101 converted = rdata.conversion.convert(parsed)
102 reference = xarray.DataArray(
103 [
104 [1.0, 2.0, 3.0],
105 [4.0, 5.0, 6.0],
106 ],
107 dims=["dim_0", "dim_1"],
108 coords={
109 "dim_0": ["dim0_0", "dim0_1"],
110 "dim_1": ["dim1_0", "dim1_1", "dim1_2"],
111 },
112 )
113
114 xarray.testing.assert_identical(
115 converted["test_named_matrix"],
116 reference,
117 )
118
119 def test_half_named_matrix(self) -> None:
120 """Test that a named matrix with no name for a dim can be parsed."""
121 parsed = rdata.parser.parse_file(
122 TESTDATA_PATH / "test_half_named_matrix.rda",
123 )
124 converted = rdata.conversion.convert(parsed)
125 reference = xarray.DataArray(
126 [
127 [1.0, 2.0, 3.0],
128 [4.0, 5.0, 6.0],
129 ],
130 dims=["dim_0", "dim_1"],
131 coords={
132 "dim_0": ["dim0_0", "dim0_1"],
133 },
134 )
135
136 xarray.testing.assert_identical(
137 converted["test_half_named_matrix"],
138 reference,
139 )
140
141 def test_full_named_matrix(self) -> None:
142 """Test that a named matrix with dim names can be parsed."""
143 parsed = rdata.parser.parse_file(
144 TESTDATA_PATH / "test_full_named_matrix.rda",
145 )
146 converted = rdata.conversion.convert(parsed)
147 reference = xarray.DataArray(
148 [
149 [1.0, 2.0, 3.0],
150 [4.0, 5.0, 6.0],
151 ],
152 dims=["my_dim_0", "my_dim_1"],
153 coords={
154 "my_dim_0": ["dim0_0", "dim0_1"],
155 "my_dim_1": ["dim1_0", "dim1_1", "dim1_2"],
156 },
157 )
158
159 xarray.testing.assert_identical(
160 converted["test_full_named_matrix"],
161 reference,
162 )
163
164 def test_full_named_matrix_rds(self) -> None:
165 """Test that a matrix with dim names can be parsed from an RDS file."""
166 parsed = rdata.parser.parse_file(
167 TESTDATA_PATH / "test_full_named_matrix.rds",
168 )
169 converted = rdata.conversion.convert(parsed)
170 reference = xarray.DataArray(
171 [
172 [1.0, 2.0, 3.0],
173 [4.0, 5.0, 6.0],
174 ],
175 dims=["my_dim_0", "my_dim_1"],
176 coords={
177 "my_dim_0": ["dim0_0", "dim0_1"],
178 "my_dim_1": ["dim1_0", "dim1_1", "dim1_2"],
179 },
180 )
181
182 xarray.testing.assert_identical(
183 converted,
184 reference,
185 )
79186
80187 def test_list(self) -> None:
188 """Test that list can be parsed."""
81189 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list.rda")
82190 converted = rdata.conversion.convert(parsed)
83191
84192 np.testing.assert_equal(converted, {
85193 "test_list":
86194 [
87 np.array([1.]),
195 np.array([1.0]),
88196 ['a', 'b', 'c'],
89 np.array([2., 3.]),
90 ['hi']
91 ]
197 np.array([2.0, 3.0]),
198 ['hi'],
199 ],
200 })
201
202 def test_file(self) -> None:
203 """Test that external pointers can be parsed."""
204 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_file.rda")
205 converted = rdata.conversion.convert(parsed)
206
207 np.testing.assert_equal(converted, {
208 "test_file": [5],
92209 })
93210
94211 def test_expression(self) -> None:
212 """Test that expressions can be parsed."""
95213 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_expression.rda")
96214 converted = rdata.conversion.convert(parsed)
97215
98216 np.testing.assert_equal(converted, {
99217 "test_expression": rdata.conversion.RExpression([
100 rdata.conversion.RLanguage(['^', 'base', 'exponent'])])
101 })
218 rdata.conversion.RLanguage(
219 ['^', 'base', 'exponent'],
220 attributes={},
221 ),
222 ]),
223 })
224
225 def test_builtin(self) -> None:
226 """Test that builtin functions can be parsed."""
227 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_builtin.rda")
228 converted = rdata.conversion.convert(parsed)
229
230 np.testing.assert_equal(converted, {
231 "test_builtin": rdata.conversion.RBuiltin(name="abs"),
232 })
233
234 def test_minimal_function_uncompiled(self) -> None:
235 """Test that a minimal function can be parsed."""
236 parsed = rdata.parser.parse_file(
237 TESTDATA_PATH / "test_minimal_function_uncompiled.rda")
238 converted = rdata.conversion.convert(parsed)
239
240 converted_fun = converted["test_minimal_function_uncompiled"]
241
242 self.assertIsInstance(
243 converted_fun,
244 rdata.conversion.RFunction,
245 )
246
247 np.testing.assert_equal(converted_fun.environment, ChainMap({}))
248 np.testing.assert_equal(converted_fun.formals, None)
249 np.testing.assert_equal(converted_fun.body, None)
250 np.testing.assert_equal(
251 converted_fun.source,
252 "test_minimal_function_uncompiled <- function() NULL\n",
253 )
254
255 def test_minimal_function(self) -> None:
256 """Test that a minimal function (compiled) can be parsed."""
257 parsed = rdata.parser.parse_file(
258 TESTDATA_PATH / "test_minimal_function.rda")
259 converted = rdata.conversion.convert(parsed)
260
261 converted_fun = converted["test_minimal_function"]
262
263 self.assertIsInstance(
264 converted_fun,
265 rdata.conversion.RFunction,
266 )
267
268 np.testing.assert_equal(converted_fun.environment, ChainMap({}))
269 np.testing.assert_equal(converted_fun.formals, None)
270
271 converted_body = converted_fun.body
272
273 self.assertIsInstance(
274 converted_body,
275 rdata.conversion.RBytecode,
276 )
277
278 np.testing.assert_equal(converted_body.code, np.array([12, 17, 1]))
279 np.testing.assert_equal(converted_body.attributes, {})
280
281 np.testing.assert_equal(
282 converted_fun.source,
283 "test_minimal_function <- function() NULL\n",
284 )
285
286 def test_empty_function_uncompiled(self) -> None:
287 """Test that a simple function can be parsed."""
288 parsed = rdata.parser.parse_file(
289 TESTDATA_PATH / "test_empty_function_uncompiled.rda")
290 converted = rdata.conversion.convert(parsed)
291
292 converted_fun = converted["test_empty_function_uncompiled"]
293
294 self.assertIsInstance(
295 converted_fun,
296 rdata.conversion.RFunction,
297 )
298
299 np.testing.assert_equal(converted_fun.environment, ChainMap({}))
300 np.testing.assert_equal(converted_fun.formals, None)
301 self.assertIsInstance(converted_fun.body, rdata.conversion.RLanguage)
302 np.testing.assert_equal(
303 converted_fun.source,
304 "test_empty_function_uncompiled <- function() {}\n",
305 )
306
307 def test_empty_function(self) -> None:
308 """Test that a simple function (compiled) can be parsed."""
309 parsed = rdata.parser.parse_file(
310 TESTDATA_PATH / "test_empty_function.rda")
311 converted = rdata.conversion.convert(parsed)
312
313 converted_fun = converted["test_empty_function"]
314
315 self.assertIsInstance(
316 converted_fun,
317 rdata.conversion.RFunction,
318 )
319
320 np.testing.assert_equal(converted_fun.environment, ChainMap({}))
321 np.testing.assert_equal(converted_fun.formals, None)
322
323 converted_body = converted_fun.body
324
325 self.assertIsInstance(
326 converted_body,
327 rdata.conversion.RBytecode,
328 )
329
330 np.testing.assert_equal(converted_body.code, np.array([12, 17, 1]))
331 np.testing.assert_equal(converted_body.attributes, {})
332
333 np.testing.assert_equal(
334 converted_fun.source,
335 "test_empty_function <- function() {}\n",
336 )
337
338 def test_function(self) -> None:
339 """Test that functions can be parsed."""
340 parsed = rdata.parser.parse_file(
341 TESTDATA_PATH / "test_function.rda")
342 converted = rdata.conversion.convert(parsed)
343
344 converted_fun = converted["test_function"]
345
346 self.assertIsInstance(
347 converted_fun,
348 rdata.conversion.RFunction,
349 )
350
351 np.testing.assert_equal(converted_fun.environment, ChainMap({}))
352 np.testing.assert_equal(converted_fun.formals, None)
353
354 converted_body = converted_fun.body
355
356 self.assertIsInstance(
357 converted_body,
358 rdata.conversion.RBytecode,
359 )
360
361 np.testing.assert_equal(
362 converted_body.code,
363 np.array([12, 23, 1, 34, 4, 38, 2, 1]),
364 )
365 np.testing.assert_equal(converted_body.attributes, {})
366
367 np.testing.assert_equal(
368 converted_fun.source,
369 "test_function <- function() {print(\"Hello\")}\n",
370 )
371
372 def test_function_arg(self) -> None:
373 """Test that functions with arguments can be parsed."""
374 parsed = rdata.parser.parse_file(
375 TESTDATA_PATH / "test_function_arg.rda")
376 converted = rdata.conversion.convert(parsed)
377
378 converted_fun = converted["test_function_arg"]
379
380 self.assertIsInstance(
381 converted_fun,
382 rdata.conversion.RFunction,
383 )
384
385 np.testing.assert_equal(converted_fun.environment, ChainMap({}))
386 np.testing.assert_equal(converted_fun.formals, {"a": NotImplemented})
387
388 converted_body = converted_fun.body
389
390 self.assertIsInstance(
391 converted_body,
392 rdata.conversion.RBytecode,
393 )
394
395 np.testing.assert_equal(
396 converted_body.code,
397 np.array([12, 23, 1, 29, 4, 38, 2, 1]),
398 )
399 np.testing.assert_equal(converted_body.attributes, {})
400
401 np.testing.assert_equal(
402 converted_fun.source,
403 "test_function_arg <- function(a) {print(a)}\n",
404 )
102405
103406 def test_encodings(self) -> None:
104
407 """Test of different encodings."""
105408 with self.assertWarns(
106409 UserWarning,
107 msg="Unknown encoding. Assumed ASCII."
410 msg="Unknown encoding. Assumed ASCII.",
108411 ):
109412 parsed = rdata.parser.parse_file(
110413 TESTDATA_PATH / "test_encodings.rda",
119422 })
120423
121424 def test_encodings_v3(self) -> None:
122
425 """Test encodings in version 3 format."""
123426 parsed = rdata.parser.parse_file(
124427 TESTDATA_PATH / "test_encodings_v3.rda",
125428 )
133436 })
134437
135438 def test_dataframe(self) -> None:
136
137 for f in {"test_dataframe.rda", "test_dataframe_v3.rda"}:
439 """Test dataframe conversion."""
440 for f in ("test_dataframe.rda", "test_dataframe_v3.rda"):
138441 with self.subTest(file=f):
139442 parsed = rdata.parser.parse_file(
140443 TESTDATA_PATH / f,
143446
144447 pd.testing.assert_frame_equal(
145448 converted["test_dataframe"],
146 pd.DataFrame({
147 "class": pd.Categorical(
148 ["a", "b", "b"]),
149 "value": [1, 2, 3],
150 })
449 pd.DataFrame(
450 {
451 "class": pd.Categorical(
452 ["a", "b", "b"],
453 ),
454 "value": [1, 2, 3],
455 },
456 index=pd.RangeIndex(start=1, stop=4),
457 ),
151458 )
152459
460 def test_dataframe_rds(self) -> None:
461 """Test dataframe conversion from RDS files."""
462 for f in ("test_dataframe.rds", "test_dataframe_v3.rds"):
463 with self.subTest(file=f):
464 parsed = rdata.parser.parse_file(
465 TESTDATA_PATH / f,
466 )
467 converted = rdata.conversion.convert(parsed)
468
469 pd.testing.assert_frame_equal(
470 converted,
471 pd.DataFrame(
472 {
473 "class": pd.Categorical(
474 ["a", "b", "b"],
475 ),
476 "value": [1, 2, 3],
477 },
478 index=pd.RangeIndex(start=1, stop=4),
479 ),
480 )
481
482 def test_dataframe_rownames(self) -> None:
483 """Test dataframe conversion with row names."""
484 parsed = rdata.parser.parse_file(
485 TESTDATA_PATH / "test_dataframe_rownames.rda",
486 )
487 converted = rdata.conversion.convert(parsed)
488
489 pd.testing.assert_frame_equal(
490 converted["test_dataframe_rownames"],
491 pd.DataFrame(
492 {
493 "class": pd.Categorical(
494 ["a", "b", "b"],
495 ),
496 "value": [1, 2, 3],
497 },
498 index=('Madrid', 'Frankfurt', 'Herzberg am Harz'),
499 ),
500 )
501
153502 def test_ts(self) -> None:
503 """Test time series conversion."""
154504 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_ts.rda")
155505 converted = rdata.conversion.convert(parsed)
156506
157 pd.testing.assert_series_equal(converted["test_ts"],
158 pd.Series({
159 2000 + Fraction(2, 12): 1.,
160 2000 + Fraction(3, 12): 2.,
161 2000 + Fraction(4, 12): 3.,
162 }))
507 pd.testing.assert_series_equal(
508 converted["test_ts"],
509 pd.Series({
510 2000 + Fraction(2, 12): 1.0,
511 2000 + Fraction(3, 12): 2.0,
512 2000 + Fraction(4, 12): 3.0,
513 }),
514 )
163515
164516 def test_s4(self) -> None:
517 """Test parsing of S4 classes."""
165518 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_s4.rda")
166519 converted = rdata.conversion.convert(parsed)
167520
169522 "test_s4": SimpleNamespace(
170523 age=np.array(28),
171524 name=["Carlos"],
172 **{'class': ["Person"]}
173 )
525 **{'class': ["Person"]}, # noqa: WPS517
526 ),
174527 })
175528
176529 def test_environment(self) -> None:
177 parsed = rdata.parser.parse_file(
178 TESTDATA_PATH / "test_environment.rda")
530 """Test parsing of environments."""
531 parsed = rdata.parser.parse_file(
532 TESTDATA_PATH / "test_environment.rda",
533 )
179534 converted = rdata.conversion.convert(parsed)
180535
181536 dict_env = {'string': ['test']}
182537 empty_global_env: Dict[str, Any] = {}
183538
184539 np.testing.assert_equal(converted, {
185 "test_environment": ChainMap(dict_env, ChainMap(empty_global_env))
540 "test_environment": ChainMap(dict_env, ChainMap(empty_global_env)),
186541 })
187542
188543 global_env = {"global": "test"}
193548 )
194549
195550 np.testing.assert_equal(converted_global, {
196 "test_environment": ChainMap(dict_env, ChainMap(global_env))
551 "test_environment": ChainMap(dict_env, ChainMap(global_env)),
197552 })
198553
199554 def test_emptyenv(self) -> None:
200 parsed = rdata.parser.parse_file(
201 TESTDATA_PATH / "test_emptyenv.rda")
202 converted = rdata.conversion.convert(parsed)
203
204 np.testing.assert_equal(converted, {
205 "test_emptyenv": ChainMap({})
555 """Test parsing the empty environment."""
556 parsed = rdata.parser.parse_file(
557 TESTDATA_PATH / "test_emptyenv.rda",
558 )
559 converted = rdata.conversion.convert(parsed)
560
561 self.assertEqual(converted, {
562 "test_emptyenv": ChainMap({}),
206563 })
207564
208565 def test_list_attrs(self) -> None:
566 """Test that lists accept attributes."""
209567 parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list_attrs.rda")
210568 converted = rdata.conversion.convert(parsed)
211569
212570 np.testing.assert_equal(converted, {
213 "test_list_attrs": [['list'], [5]]
571 "test_list_attrs": [['list'], [5]],
214572 })
215573
216574 def test_altrep_compact_intseq(self) -> None:
243601 converted = rdata.conversion.convert(parsed)
244602
245603 np.testing.assert_equal(converted, {
246 "test_altrep_deferred_string": [
604 "test_altrep_deferred_string": [ # noqa: WPS317
247605 "1", "2.3", "10000",
248606 "1e+05", "-10000", "-1e+05",
249607 "0.001", "1e-04", "1e-05",
285643
286644
287645 if __name__ == "__main__":
288 # import sys;sys.argv = ['', 'Test.testName']
289646 unittest.main()
99 include_trailing_comma = true
1010 use_parentheses = true
1111 combine_as_imports = 1
12
13 [flake8]
14 ignore =
15 # No docstring for magic methods
16 D105,
17 # No docstrings in __init__
18 D107,
19 # Ignore until https://github.com/terrencepreilly/darglint/issues/54 is closed
20 DAR202,
21 # Ignore until https://github.com/terrencepreilly/darglint/issues/144 is closed
22 DAR401,
23 # Non-explicit exceptions may be documented in raises
24 DAR402,
25 # Uppercase arguments like X are common in scikit-learn
26 N803,
27 # Uppercase variables like X are common in scikit-learn
28 N806,
29 # There are no bad quotes
30 Q000,
31 # Google Python style is not RST until after processed by Napoleon
32 # See https://github.com/peterjc/flake8-rst-docstrings/issues/17
33 RST201, RST203, RST301,
34 # assert is used by pytest tests
35 S101,
36 # Line break occurred before a binary operator (antipattern)
37 W503,
38 # Utils is used as a module name
39 WPS100,
40 # Short names like X or y are common in scikit-learn
41 WPS111,
42 # We do not like this underscored numbers convention
43 WPS114,
44 # Attributes in uppercase are used in enums
45 WPS115,
46 # Trailing underscores are a scikit-learn convention
47 WPS120,
48 # Cognitive complexity cannot be avoided at some modules
49 WPS232,
50 # The number of imported things may be large, especially for typing
51 WPS235,
52 # We like local imports, thanks
53 WPS300,
54 # Dotted imports are ok
55 WPS301,
56 # We love f-strings
57 WPS305,
58 # Implicit string concatenation is useful for exception messages
59 WPS306,
60 # No base class needed
61 WPS326,
62 # We allow multiline conditions
63 WPS337,
64 # We order methods differently
65 WPS338,
66 # We need multiline loops
67 WPS352,
68 # Assigning to a subscript slice is normal behaviour in numpy
69 WPS362,
70 # All keywords are beautiful
71 WPS420,
72 # We use nested imports sometimes, and it is not THAT bad
73 WPS433,
74 # We use list multiplication to allocate lists with immutable values (None or numbers)
75 WPS435,
76 # Our private modules are fine to import
77 # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441)
78 WPS436,
79 # Our private objects are fine to import
80 WPS450,
81 # Numpy mixes bitwise and comparison operators
82 WPS465,
83 # Explicit len compare is better than implicit
84 WPS507,
85 # Comparison with not is not the same as with equality
86 WPS520,
87
88 per-file-ignores =
89 __init__.py:
90 # Unused modules are allowed in `__init__.py`, to reduce imports
91 F401,
92 # Explicit re-exports allowed in __init__
93 WPS113,
94 # Import multiple names is allowed in `__init__.py`
95 WPS235,
96 # Logic is allowed in `__init__.py`
97 WPS412
98
99 # Tests benefit from overused expressions, magic numbers and fixtures
100 test_*.py: WPS204, WPS432, WPS442
101
102 rst-directives =
103 # These are sorted alphabetically - but that does not matter
104 autosummary,data,currentmodule,deprecated,
105 glossary,moduleauthor,plot,testcode,
106 versionadded,versionchanged,
107
108 rst-roles =
109 attr,class,func,meth,mod,obj,ref,term,
110
111 allowed-domain-names = data, info, obj, result, results, val, value, values, var
112
113 # Needs to be tuned
114 max-arguments = 10
115 max-attributes = 10
116 max-cognitive-score = 30
117 max-expressions = 15
118 max-imports = 20
119 max-line-complexity = 30
120 max-local-variables = 15
121 max-methods = 30
122 max-module-expressions = 15
123 max-module-members = 15
124 max-string-usages = 10
125
126 ignore-decorators = (property)|(overload)
127
128 strictness = long
129
130 # Beautify output and make it more informative
131 format = wemake
132 show-source = true
12133
13134 [mypy]
14135 strict = True
66 language or its libraries, and thus it is released under a MIT license.
77 """
88 import os
9 import pathlib
910 import sys
1011
1112 from setuptools import find_packages, setup
1516
1617 DOCLINES = (__doc__ or '').split("\n")
1718
18 with open(os.path.join(os.path.dirname(__file__),
19 'VERSION'), 'r') as version_file:
19 with open(
20 pathlib.Path(os.path.dirname(__file__)) / 'rdata' / 'VERSION',
21 'r',
22 ) as version_file:
2023 version = version_file.read().strip()
2124
22 setup(name='rdata',
23 version=version,
24 description=DOCLINES[1],
25 long_description="\n".join(DOCLINES[3:]),
26 url='https://github.com/vnmabus/rdata',
27 author='Carlos Ramos Carreño',
28 author_email='vnmabus@gmail.com',
29 include_package_data=True,
30 platforms=['any'],
31 license='MIT',
32 packages=find_packages(),
33 python_requires='>=3.7, <4',
34 classifiers=[
35 'Development Status :: 4 - Beta',
36 'Intended Audience :: Developers',
37 'Intended Audience :: Science/Research',
38 'License :: OSI Approved :: MIT License',
39 'Natural Language :: English',
40 'Operating System :: OS Independent',
41 'Programming Language :: Python :: 3',
42 'Programming Language :: Python :: 3.6',
43 'Programming Language :: Python :: 3.7',
44 'Programming Language :: Python :: 3.8',
45 'Topic :: Scientific/Engineering :: Mathematics',
46 'Topic :: Software Development :: Libraries :: Python Modules',
47 'Typing :: Typed',
48 ],
49 keywords=['rdata', 'r', 'dataset'],
50 install_requires=['numpy',
51 'xarray',
52 'pandas'],
53 setup_requires=pytest_runner,
54 tests_require=['pytest-cov',
55 'numpy>=1.14' # The printing format for numpy changes
56 ],
57 test_suite='rdata.tests',
58 zip_safe=False)
25 setup(
26 name='rdata',
27 version=version,
28 description=DOCLINES[1],
29 long_description="\n".join(DOCLINES[3:]),
30 url='https://github.com/vnmabus/rdata',
31 author='Carlos Ramos Carreño',
32 author_email='vnmabus@gmail.com',
33 include_package_data=True,
34 platforms=['any'],
35 license='MIT',
36 packages=find_packages(),
37 python_requires='>=3.7, <4',
38 classifiers=[
39 'Development Status :: 4 - Beta',
40 'Intended Audience :: Developers',
41 'Intended Audience :: Science/Research',
42 'License :: OSI Approved :: MIT License',
43 'Natural Language :: English',
44 'Operating System :: OS Independent',
45 'Programming Language :: Python :: 3',
46 'Programming Language :: Python :: 3.6',
47 'Programming Language :: Python :: 3.7',
48 'Programming Language :: Python :: 3.8',
49 'Topic :: Scientific/Engineering :: Mathematics',
50 'Topic :: Software Development :: Libraries :: Python Modules',
51 'Typing :: Typed',
52 ],
53 keywords=['rdata', 'r', 'dataset'],
54 install_requires=[
55 'numpy',
56 'xarray',
57 'pandas',
58 ],
59 setup_requires=pytest_runner,
60 tests_require=[
61 'pytest-cov',
62 'numpy>=1.14', # The printing format for numpy changes
63 ],
64 test_suite='rdata.tests',
65 zip_safe=False,
66 )